Index: generic/tclEncoding.c ================================================================== --- generic/tclEncoding.c +++ generic/tclEncoding.c @@ -232,16 +232,21 @@ static int TableToUtfProc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); static size_t unilen(const char *src); -static int UniCharToUtfProc(ClientData clientData, +static int Utf16ToUtfProc(ClientData clientData, + const char *src, int srcLen, int flags, + Tcl_EncodingState *statePtr, char *dst, int dstLen, + int *srcReadPtr, int *dstWrotePtr, + int *dstCharsPtr); +static int UtfToUtf16Proc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); -static int UtfToUniCharProc(ClientData clientData, +static int UtfToUcs2Proc(ClientData clientData, const char *src, int srcLen, int flags, Tcl_EncodingState *statePtr, char *dst, int dstLen, int *srcReadPtr, int *dstWrotePtr, int *dstCharsPtr); static int UtfToUtfProc(ClientData clientData, @@ -562,15 +567,20 @@ { Tcl_EncodingType type; TableEncodingData *dataPtr; unsigned size; unsigned short i; + union { + char c; + short s; + } isLe; if (encodingsInitialized) { return; } + isLe.s = 1; Tcl_MutexLock(&encodingMutex); Tcl_InitHashTable(&encodingTable, TCL_STRING_KEYS); Tcl_MutexUnlock(&encodingMutex); /* @@ -593,17 +603,42 @@ type.freeProc = NULL; type.nullSize = 1; type.clientData = NULL; Tcl_CreateEncoding(&type); - type.encodingName = "unicode"; - type.toUtfProc = UniCharToUtfProc; - type.fromUtfProc = UtfToUniCharProc; + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUcs2Proc; + type.freeProc = NULL; + type.nullSize = 2; + type.encodingName = "ucs-2le"; + type.clientData = INT2PTR(1); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "ucs-2"; + type.clientData = INT2PTR(isLe.c); + Tcl_CreateEncoding(&type); + + type.toUtfProc = Utf16ToUtfProc; + type.fromUtfProc = UtfToUtf16Proc; type.freeProc = NULL; type.nullSize = 2; - type.clientData = NULL; + type.encodingName = "utf-16le"; + type.clientData = INT2PTR(1);; + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16be"; + type.clientData = INT2PTR(0); + Tcl_CreateEncoding(&type); + type.encodingName = "utf-16"; + type.clientData = INT2PTR(isLe.c);; + Tcl_CreateEncoding(&type); + +#ifndef TCL_NO_DEPRECATED + type.encodingName = "unicode"; Tcl_CreateEncoding(&type); +#endif /* * Need the iso8859-1 encoding in order to process binary data, so force * it to always be embedded. Note that this encoding *must* be a proper * table encoding or some of the escape encodings crash! Hence the ugly @@ -1277,11 +1312,11 @@ flags, statePtr, dst, dstLen, srcReadPtr, dstWrotePtr, dstCharsPtr); if (*dstCharsPtr <= maxChars) { break; } - dstLen = Tcl_UtfAtIndex(dst, maxChars) - 1 - dst + TCL_UTF_MAX; + dstLen = Tcl_UtfAtIndex(dst, maxChars) - dst + (TCL_UTF_MAX - 1); flags = savedFlags; *statePtr = savedState; } while (1); if (!noTerminate) { /* ...and then append it */ @@ -2399,13 +2434,13 @@ } /* *------------------------------------------------------------------------- * - * UniCharToUtfProc -- + * Utf16ToUtfProc -- * - * Convert from Unicode to UTF-8. + * Convert from UTF-16 to UTF-8. * * Results: * Returns TCL_OK if conversion was successful. * * Side effects: @@ -2413,12 +2448,12 @@ * *------------------------------------------------------------------------- */ static int -UniCharToUtfProc( - ClientData clientData, /* Not used. */ +Utf16ToUtfProc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in Unicode. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state * information used during a piecewise @@ -2466,16 +2501,19 @@ if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } + if (clientData) { + ch = (src[1] & 0xFF) << 8 | (src[0] & 0xFF); + } else { + ch = (src[0] & 0xFF) << 8 | (src[1] & 0xFF); + } /* * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ - - ch = *(unsigned short *)src; if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch, dst); } @@ -2489,13 +2527,13 @@ } /* *------------------------------------------------------------------------- * - * UtfToUniCharProc -- + * UtfToUtf16Proc -- * - * Convert from UTF-8 to Unicode. + * Convert from UTF-8 to UTF-16. * * Results: * Returns TCL_OK if conversion was successful. * * Side effects: @@ -2503,13 +2541,12 @@ * *------------------------------------------------------------------------- */ static int -UtfToUniCharProc( - ClientData clientData, /* TableEncodingData that specifies - * encoding. */ +UtfToUtf16Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ Tcl_EncodingState *statePtr,/* Place for conversion routine to store state * information used during a piecewise @@ -2569,48 +2606,155 @@ /* * Need to handle this in a way that won't cause misalignment by * casting dst to a Tcl_UniChar. [Bug 1122671] */ -#ifdef WORDS_BIGENDIAN -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); - } else { - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; - *dst++ = (*chPtr & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - } -#else - *dst++ = (*chPtr >> 8); - *dst++ = (*chPtr & 0xFF); -#endif -#else -#if TCL_UTF_MAX > 4 - if (*chPtr <= 0xFFFF) { - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); - } else { - *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); - *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; - *dst++ = (*chPtr & 0xFF); - *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; - } -#else - *dst++ = (*chPtr & 0xFF); - *dst++ = (*chPtr >> 8); -#endif -#endif - } - *srcReadPtr = src - srcStart; - *dstWrotePtr = dst - dstStart; - *dstCharsPtr = numChars; - return result; -} - + if (clientData) { +#if TCL_UTF_MAX > 4 + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr & 0xFF); + *dst++ = (*chPtr >> 8); + } else { + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (*chPtr & 0xFF); + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + } +#else + *dst++ = (*chPtr & 0xFF); + *dst++ = (*chPtr >> 8); +#endif + } else { +#if TCL_UTF_MAX > 4 + if (*chPtr <= 0xFFFF) { + *dst++ = (*chPtr >> 8); + *dst++ = (*chPtr & 0xFF); + } else { + *dst++ = ((*chPtr & 0x3) >> 8) | 0xDC; + *dst++ = (*chPtr & 0xFF); + *dst++ = (((*chPtr - 0x10000) >> 18) & 0x3) | 0xD8; + *dst++ = (((*chPtr - 0x10000) >> 10) & 0xFF); + } +#else + *dst++ = (*chPtr >> 8); + *dst++ = (*chPtr & 0xFF); +#endif + } + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + +/* + *------------------------------------------------------------------------- + * + * UtfToUcs2Proc -- + * + * Convert from UTF-8 to UCS-2. + * + * Results: + * Returns TCL_OK if conversion was successful. + * + * Side effects: + * None. + * + *------------------------------------------------------------------------- + */ + +static int +UtfToUcs2Proc( + ClientData clientData, /* != NULL means LE, == NUL means BE */ + const char *src, /* Source string in UTF-8. */ + int srcLen, /* Source string length in bytes. */ + int flags, /* Conversion control flags. */ + Tcl_EncodingState *statePtr,/* Place for conversion routine to store state + * information used during a piecewise + * conversion. Contents of statePtr are + * initialized and/or reset by conversion + * routine under control of flags argument. */ + char *dst, /* Output buffer in which converted string is + * stored. */ + int dstLen, /* The maximum length of output buffer in + * bytes. */ + int *srcReadPtr, /* Filled with the number of bytes from the + * source string that were converted. This may + * be less than the original source length if + * there was a problem converting some source + * characters. */ + int *dstWrotePtr, /* Filled with the number of bytes that were + * stored in the output buffer as a result of + * the conversion. */ + int *dstCharsPtr) /* Filled with the number of characters that + * correspond to the bytes stored in the + * output buffer. */ +{ + const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; + int result, numChars; +#if TCL_UTF_MAX <= 4 + int len; +#endif + Tcl_UniChar ch = 0; + + srcStart = src; + srcEnd = src + srcLen; + srcClose = srcEnd; + if ((flags & TCL_ENCODING_END) == 0) { + srcClose -= TCL_UTF_MAX; + } + + dstStart = dst; + dstEnd = dst + dstLen - sizeof(Tcl_UniChar); + + result = TCL_OK; + for (numChars = 0; src < srcEnd; numChars++) { + if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { + /* + * If there is more string to follow, this will ensure that the + * last UTF-8 character in the source buffer hasn't been cut off. + */ + + result = TCL_CONVERT_MULTIBYTE; + break; + } + if (dst > dstEnd) { + result = TCL_CONVERT_NOSPACE; + break; + } +#if TCL_UTF_MAX <= 4 + src += (len = TclUtfToUniChar(src, &ch)); + if ((ch >= 0xD800) && (len < 3)) { + src += TclUtfToUniChar(src, &ch); + ch = 0xFFFD; + } +#else + src += TclUtfToUniChar(src, &ch); + if (ch > 0xFFFF) { + ch = 0xFFFD; + } +#endif + + /* + * Need to handle this in a way that won't cause misalignment by + * casting dst to a Tcl_UniChar. [Bug 1122671] + */ + + if (clientData) { + *dst++ = (ch & 0xFF); + *dst++ = (ch >> 8); + } else { + *dst++ = (ch >> 8); + *dst++ = (ch & 0xFF); + } + } + *srcReadPtr = src - srcStart; + *dstWrotePtr = dst - dstStart; + *dstCharsPtr = numChars; + return result; +} + /* *------------------------------------------------------------------------- * * TableToUtfProc -- * Index: tests/binary.test ================================================================== --- tests/binary.test +++ tests/binary.test @@ -2910,11 +2910,11 @@ return $one[binary format H* $b] }} ab cd } [binary format H* abcd] test binary-78.1 {unicode (out of BMP) to byte-array conversion, bug-[bd94500678]} -body { - # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX <= 4): + # just test for BO-segfault (high surrogate w/o advance source pointer for out of BMP char if TCL_UTF_MAX == 3): binary encode hex \U0001f415 binary scan \U0001f415 a* v; set v set str {} } -result {} Index: tests/chanio.test ================================================================== --- tests/chanio.test +++ tests/chanio.test @@ -886,11 +886,11 @@ set x "" } -constraints {stdio testchannel openpipe fileevent} -body { # Tcl_ExternalToUtf() set f [openpipe w+ $path(cat)] chan configure $f -translation {auto lf} -buffering none - chan configure $f -encoding unicode + chan configure $f -encoding utf-16 chan puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r" chan configure $f -buffersize 16 chan gets $f chan configure $f -blocking 0 lappend x [chan gets $f line] $line [testchannel queuedcr $f] @@ -1127,11 +1127,11 @@ chan configure $f -translation lf -encoding ascii -buffering none chan puts -nonewline $f "123456789012345\r\nbcdefghijklmnopqrstuvwxyz" chan event $f read [namespace code { lappend x [chan gets $f line] $line [testchannel inputbuffered $f] }] - chan configure $f -encoding unicode -buffersize 16 -blocking 0 + chan configure $f -encoding utf-16 -buffersize 16 -blocking 0 vwait [namespace which -variable x] chan configure $f -translation auto -encoding ascii -blocking 1 # here vwait [namespace which -variable x] return $x Index: tests/encoding.test ================================================================== --- tests/encoding.test +++ tests/encoding.test @@ -320,22 +320,33 @@ set y [encoding convertfrom utf-8 [encoding convertto utf-8 \u0000]] binary scan [teststringbytes $y] H* z set z } c080 -test encoding-16.1 {UnicodeToUtfProc} -body { - set val [encoding convertfrom unicode NN] +test encoding-16.1 {Utf16ToUtfProc} -body { + set val [encoding convertfrom utf-16 NN] + list $val [format %x [scan $val %c]] +} -result "\u4e4e 4e4e" +test encoding-16.2 {Utf16ToUtfProc} -body { + set val [encoding convertfrom utf-16 "\xd8\xd8\xdc\xdc"] + list $val [format %x [scan $val %c]] +} -result "\U460dc 460dc" +test encoding-16.3 {Ucs2ToUtfProc} -body { + set val [encoding convertfrom ucs-2 NN] list $val [format %x [scan $val %c]] } -result "\u4e4e 4e4e" -test encoding-16.2 {UnicodeToUtfProc} -body { - set val [encoding convertfrom unicode "\xd8\xd8\xdc\xdc"] +test encoding-16.4 {Ucs2ToUtfProc} -body { + set val [encoding convertfrom ucs-2 "\xd8\xd8\xdc\xdc"] list $val [format %x [scan $val %c]] } -result "\U460dc 460dc" -test encoding-17.1 {UtfToUnicodeProc} -body { - encoding convertto unicode "\U460dc" +test encoding-17.1 {UtfToUtf16Proc} -body { + encoding convertto utf-16 "\U460dc" } -result "\xd8\xd8\xdc\xdc" +test encoding-17.2 {UtfToUcs2Proc} -body { + encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460dc"] +} -result "\ufffd" test encoding-18.1 {TableToUtfProc} { } {} test encoding-19.1 {TableFromUtfProc} { Index: tests/io.test ================================================================== --- tests/io.test +++ tests/io.test @@ -916,11 +916,11 @@ test io-6.45 {Tcl_GetsObj: input saw cr, skip right number of bytes} {stdio testchannel openpipe fileevent} { # Tcl_ExternalToUtf() set f [open "|[list [interpreter] $path(cat)]" w+] fconfigure $f -translation {auto lf} -buffering none - fconfigure $f -encoding unicode + fconfigure $f -encoding utf-16 puts -nonewline $f "bbbbbbbbbbbbbbb\n123456789abcdef\r" fconfigure $f -buffersize 16 gets $f fconfigure $f -blocking 0 set x [list [gets $f line] $line [testchannel queuedcr $f]] @@ -1160,11 +1160,11 @@ fileevent $f read [namespace code "ready $f"] proc ready {f} { variable x lappend x [gets $f line] $line [testchannel inputbuffered $f] } - fconfigure $f -encoding unicode -buffersize 16 -blocking 0 + fconfigure $f -encoding utf-16 -buffersize 16 -blocking 0 vwait [namespace which -variable x] fconfigure $f -translation auto -encoding ascii -blocking 1 # here vwait [namespace which -variable x] close $f Index: tests/ioCmd.test ================================================================== --- tests/ioCmd.test +++ tests/ioCmd.test @@ -239,27 +239,27 @@ } -result {bad value for -translation: must be one of auto, binary, cr, lf, crlf, or platform} test iocmd-8.7 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] - fconfigure $f1 -translation lf -eofchar {} -encoding unicode + fconfigure $f1 -translation lf -eofchar {} -encoding utf-16 fconfigure $f1 } -cleanup { catch {close $f1} -} -result {-blocking 1 -buffering full -buffersize 4096 -encoding unicode -eofchar {} -translation lf} +} -result {-blocking 1 -buffering full -buffersize 4096 -encoding utf-16 -eofchar {} -translation lf} test iocmd-8.8 {fconfigure command} -setup { file delete $path(test1) set x {} } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation lf -buffering line -buffersize 3030 \ - -eofchar {} -encoding unicode + -eofchar {} -encoding utf-16 lappend x [fconfigure $f1 -buffering] lappend x [fconfigure $f1] } -cleanup { catch {close $f1} -} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding unicode -eofchar {} -translation lf}} +} -result {line {-blocking 1 -buffering line -buffersize 3030 -encoding utf-16 -eofchar {} -translation lf}} test iocmd-8.9 {fconfigure command} -setup { file delete $path(test1) } -body { set f1 [open $path(test1) w] fconfigure $f1 -translation binary -buffering none -buffersize 4040 \ Index: tests/source.test ================================================================== --- tests/source.test +++ tests/source.test @@ -238,16 +238,16 @@ # file that contains the byte \x1A, although not the character \u001A in # the indicated encoding. set sourcefile [makeFile {} source.file] file delete $sourcefile set f [open $sourcefile w] - fconfigure $f -encoding unicode + fconfigure $f -encoding utf-16 puts $f "set symbol(square-root) \u221A; set x correct" close $f } -body { set x unset - source -encoding unicode $sourcefile + source -encoding utf-16 $sourcefile set x } -cleanup { removeFile source.file } -result correct test source-7.3 {source -encoding: syntax} -body {