Index: doc/FileSystem.3 ================================================================== --- doc/FileSystem.3 +++ doc/FileSystem.3 @@ -1743,11 +1743,11 @@ .TP \fIlinkProc\fR . If \fItoPtr\fR is NULL, this should return a value with reference count 1 that has just been allocated and passed to \fBTcl_IncrRefCount\fR. If \fItoPtr\fR -is not NULL, it should be returned on success. +is not NULL, it should be returned on success. .TP \fIlistVolumesProc\fR . The result value should be a list (if non-NULL); it will have its reference count decremented once (with \fBTcl_DecrRefCount\fR) by Tcl once done. Index: doc/NRE.3 ================================================================== --- doc/NRE.3 +++ doc/NRE.3 @@ -243,11 +243,11 @@ .PP Use \fBTcl_NRAddCallback\fR to schedule any required final decrementing of the reference counts of arguments to any of the other functions on this page, as with any other post-processing step in the non-recursive execution engine. .PP -The +The .SH "SEE ALSO" Tcl_CreateCommand(3), Tcl_CreateObjCommand(3), Tcl_EvalObjEx(3), Tcl_GetCommandFromObj(3), Tcl_ExprObj(3) .SH KEYWORDS stackless, nonrecursive, execute, command, global, value, result, script .SH COPYRIGHT Index: doc/UniCharIsAlpha.3 ================================================================== --- doc/UniCharIsAlpha.3 +++ doc/UniCharIsAlpha.3 @@ -6,11 +6,11 @@ '\" .TH Tcl_UniCharIsAlpha 3 "8.1" Tcl "Tcl Library Procedures" .so man.macros .BS .SH NAME -Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters +Tcl_UniCharIsAlnum, Tcl_UniCharIsAlpha, Tcl_UniCharIsControl, Tcl_UniCharIsDigit, Tcl_UniCharIsGraph, Tcl_UniCharIsLower, Tcl_UniCharIsPrint, Tcl_UniCharIsPunct, Tcl_UniCharIsSpace, Tcl_UniCharIsUpper, Tcl_UniCharIsUnicode, Tcl_UniCharIsWordChar \- routines for classification of Tcl_UniChar characters .SH SYNOPSIS .nf \fB#include \fR .sp int @@ -41,10 +41,13 @@ \fBTcl_UniCharIsSpace\fR(\fIch\fR) .sp int \fBTcl_UniCharIsUpper\fR(\fIch\fR) .sp +int +\fBTcl_UniCharIsUnicode\fR(\fIch\fR) +.sp int \fBTcl_UniCharIsWordChar\fR(\fIch\fR) .SH ARGUMENTS .AS int ch .AP int ch in @@ -78,11 +81,14 @@ \fBTcl_UniCharIsPunct\fR tests if the character is a Unicode punctuation character. .PP \fBTcl_UniCharIsSpace\fR tests if the character is a whitespace Unicode character. .PP \fBTcl_UniCharIsUpper\fR tests if the character is an uppercase Unicode character. +.PP +\fBTcl_UniCharIsUnicode\fR tests if the character is a Unicode character, not being +a surrogate or noncharacter. .PP \fBTcl_UniCharIsWordChar\fR tests if the character is alphanumeric or a connector punctuation mark. .SH KEYWORDS unicode, classification Index: doc/string.n ================================================================== --- doc/string.n +++ doc/string.n @@ -411,14 +411,14 @@ ever be used by Tcl's implementation, the number of bytes used to store the representation is of very low value (except to C extension code, which has direct access for the purpose of memory management, etc.) .PP -\fICompatibility note:\fR it is likely that this subcommand will be -withdrawn in a future version of Tcl. It is better to use the -\fBencoding convertto\fR command to convert a string to a known -encoding and then apply \fBstring length\fR to that. +\fICompatibility note:\fR This subcommand is deprecated and will +be removed in Tcl 9.0. It is better to use the \fBencoding convertto\fR +command to convert a string to a known encoding (e.g. "utf-8" or "cesu-8") +and then apply \fBstring length\fR to that. .PP .CS \fBstring length\fR [encoding convertto utf-8 $theString] .CE .RE Index: generic/tcl.decls ================================================================== --- generic/tcl.decls +++ generic/tcl.decls @@ -2421,10 +2421,13 @@ const char *Tcl_UtfNext(const char *src) } declare 656 { const char *Tcl_UtfPrev(const char *src, const char *start) } +declare 657 { + int Tcl_UniCharIsUnicode(int ch) +} # ----- BASELINE -- FOR -- 8.7.0 ----- # ############################################################################## Index: generic/tclCmdMZ.c ================================================================== --- generic/tclCmdMZ.c +++ generic/tclCmdMZ.c @@ -1531,20 +1531,20 @@ static const char *const isClasses[] = { "alnum", "alpha", "ascii", "control", "boolean", "dict", "digit", "double", "entier", "false", "graph", "integer", "list", "lower", "print", "punct", - "space", "true", "upper", "wideinteger", - "wordchar", "xdigit", NULL + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, - STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, - STR_IS_WORD, STR_IS_XDIGIT + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; static const char *const isOptions[] = { "-strict", "-failindex", NULL }; enum isOptionsEnum { @@ -1870,10 +1870,13 @@ chcomp = Tcl_UniCharIsSpace; break; case STR_IS_UPPER: chcomp = Tcl_UniCharIsUpper; break; + case STR_IS_UNICODE: + chcomp = Tcl_UniCharIsUnicode; + break; case STR_IS_WORD: chcomp = Tcl_UniCharIsWordChar; break; case STR_IS_XDIGIT: chcomp = UniCharIsHexDigit; @@ -2830,10 +2833,11 @@ * Side effects: * See the user documentation. * *---------------------------------------------------------------------- */ +#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED) static int StringBytesCmd( TCL_UNUSED(ClientData), Tcl_Interp *interp, /* Current interpreter. */ int objc, /* Number of arguments. */ @@ -2848,10 +2852,11 @@ (void) TclGetStringFromObj(objv[1], &length); Tcl_SetObjResult(interp, Tcl_NewWideIntObj(length)); return TCL_OK; } +#endif /* *---------------------------------------------------------------------- * * StringLenCmd -- @@ -3304,11 +3309,13 @@ Tcl_Command TclInitStringCmd( Tcl_Interp *interp) /* Current interpreter. */ { static const EnsembleImplMap stringImplMap[] = { +#if TCL_MAJOR_VERSION < 9 && !defined(TCL_NO_DEPRECATED) {"bytelength", StringBytesCmd, TclCompileBasic1ArgCmd, NULL, NULL, 0}, +#endif {"cat", StringCatCmd, TclCompileStringCatCmd, NULL, NULL, 0}, {"compare", StringCmpCmd, TclCompileStringCmpCmd, NULL, NULL, 0}, {"equal", StringEqualCmd, TclCompileStringEqualCmd, NULL, NULL, 0}, {"first", StringFirstCmd, TclCompileStringFirstCmd, NULL, NULL, 0}, {"index", StringIndexCmd, TclCompileStringIndexCmd, NULL, NULL, 0}, Index: generic/tclCompCmdsSZ.c ================================================================== --- generic/tclCompCmdsSZ.c +++ generic/tclCompCmdsSZ.c @@ -503,23 +503,23 @@ { DefineLineInformation; /* TIP #280 */ Tcl_Token *tokenPtr = TokenAfter(parsePtr->tokenPtr); static const char *const isClasses[] = { "alnum", "alpha", "ascii", "control", - "boolean", "dict", "digit", "double", "entier", - "false", "graph", "integer", "list", - "lower", "print", "punct", "space", - "true", "upper", "wideinteger", "wordchar", - "xdigit", NULL + "boolean", "dict", "digit", "double", + "entier", "false", "graph", "integer", + "list", "lower", "print", "punct", + "space", "true", "upper", "unicode", + "wideinteger", "wordchar", "xdigit", NULL }; enum isClassesEnum { STR_IS_ALNUM, STR_IS_ALPHA, STR_IS_ASCII, STR_IS_CONTROL, - STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, STR_IS_ENTIER, - STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, STR_IS_LIST, - STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, STR_IS_SPACE, - STR_IS_TRUE, STR_IS_UPPER, STR_IS_WIDE, STR_IS_WORD, - STR_IS_XDIGIT + STR_IS_BOOL, STR_IS_DICT, STR_IS_DIGIT, STR_IS_DOUBLE, + STR_IS_ENTIER, STR_IS_FALSE, STR_IS_GRAPH, STR_IS_INT, + STR_IS_LIST, STR_IS_LOWER, STR_IS_PRINT, STR_IS_PUNCT, + STR_IS_SPACE, STR_IS_TRUE, STR_IS_UPPER, STR_IS_UNICODE, + STR_IS_WIDE, STR_IS_WORD, STR_IS_XDIGIT }; int t, range, allowEmpty = 0, end; InstStringClassType strClassType; Tcl_Obj *isClass; @@ -607,10 +607,13 @@ strClassType = STR_CLASS_SPACE; goto compileStrClass; case STR_IS_UPPER: strClassType = STR_CLASS_UPPER; goto compileStrClass; + case STR_IS_UNICODE: + strClassType = STR_CLASS_UNICODE; + goto compileStrClass; case STR_IS_WORD: strClassType = STR_CLASS_WORD; goto compileStrClass; case STR_IS_XDIGIT: strClassType = STR_CLASS_XDIGIT; @@ -1413,10 +1416,11 @@ {"punct", Tcl_UniCharIsPunct}, {"space", Tcl_UniCharIsSpace}, {"upper", Tcl_UniCharIsUpper}, {"word", Tcl_UniCharIsWordChar}, {"xdigit", UniCharIsHexDigit}, + {"unicode", Tcl_UniCharIsUnicode}, {"", NULL} }; /* *---------------------------------------------------------------------- Index: generic/tclCompile.h ================================================================== --- generic/tclCompile.h +++ generic/tclCompile.h @@ -920,12 +920,13 @@ STR_CLASS_PUNCT, /* Unicode punctuation characters. */ STR_CLASS_SPACE, /* Unicode space characters. */ STR_CLASS_UPPER, /* Unicode upper-case alphabet characters. */ STR_CLASS_WORD, /* Unicode word (alphabetic, digit, connector * punctuation) characters. */ - STR_CLASS_XDIGIT /* Characters that can be used as digits in + STR_CLASS_XDIGIT, /* Characters that can be used as digits in * hexadecimal numbers ([0-9A-Fa-f]). */ + STR_CLASS_UNICODE /* Unicode characters. */ } InstStringClassType; typedef struct StringClassDesc { char name[8]; /* Name of the class. */ int (*comparator)(int); /* Function to test if a single unicode Index: generic/tclDecls.h ================================================================== --- generic/tclDecls.h +++ generic/tclDecls.h @@ -1935,10 +1935,12 @@ EXTERN int Tcl_UtfCharComplete(const char *src, int length); /* 655 */ EXTERN const char * Tcl_UtfNext(const char *src); /* 656 */ EXTERN const char * Tcl_UtfPrev(const char *src, const char *start); +/* 657 */ +EXTERN int Tcl_UniCharIsUnicode(int ch); typedef struct { const struct TclPlatStubs *tclPlatStubs; const struct TclIntStubs *tclIntStubs; const struct TclIntPlatStubs *tclIntPlatStubs; @@ -2627,10 +2629,11 @@ Tcl_UniChar * (*tclGetUnicodeFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 652 */ unsigned char * (*tclGetByteArrayFromObj) (Tcl_Obj *objPtr, size_t *lengthPtr); /* 653 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 654 */ const char * (*tcl_UtfNext) (const char *src); /* 655 */ const char * (*tcl_UtfPrev) (const char *src, const char *start); /* 656 */ + int (*tcl_UniCharIsUnicode) (int ch); /* 657 */ } TclStubs; extern const TclStubs *tclStubsPtr; #ifdef __cplusplus @@ -3969,10 +3972,12 @@ (tclStubsPtr->tcl_UtfCharComplete) /* 654 */ #define Tcl_UtfNext \ (tclStubsPtr->tcl_UtfNext) /* 655 */ #define Tcl_UtfPrev \ (tclStubsPtr->tcl_UtfPrev) /* 656 */ +#define Tcl_UniCharIsUnicode \ + (tclStubsPtr->tcl_UniCharIsUnicode) /* 657 */ #endif /* defined(USE_TCL_STUBS) */ /* !END!: Do not edit above this line. */ Index: generic/tclEncoding.c ================================================================== --- generic/tclEncoding.c +++ generic/tclEncoding.c @@ -508,15 +508,16 @@ * Depends on the memory, object, and IO subsystems. * *--------------------------------------------------------------------------- */ -/* This flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +/* Those flags must not conflict with other TCL_ENCODING_* flags in tcl.h */ +/* Since TCL_ENCODING_MODIFIED is only used for utf-8/cesu-8 and + * TCL_ENCODING_LE is only used for utf-16/ucs-2. re-use the same value */ #define TCL_ENCODING_MODIFIED 0x20 /* Converting NULL bytes to 0xC0 0x80 */ -/* Since TCL_ENCODING_MODIFIED is only used for utf-8 and - * TCL_ENCODING_LE is only used for utf-16/ucs-2, re-use the same value */ #define TCL_ENCODING_LE TCL_ENCODING_MODIFIED /* Little-endian encoding */ +#define TCL_ENCODING_UTF 0x200 /* For UTF-8 encoding, allow 4-byte output sequences */ void TclInitEncodingSubsystem(void) { Tcl_EncodingType type; @@ -554,11 +555,14 @@ type.encodingName = "utf-8"; type.toUtfProc = UtfToUtfProc; type.fromUtfProc = UtfToUtfProc; type.freeProc = NULL; type.nullSize = 1; - type.clientData = NULL; + type.clientData = INT2PTR(TCL_ENCODING_UTF); + Tcl_CreateEncoding(&type); + type.clientData = INT2PTR(0); + type.encodingName = "cesu-8"; Tcl_CreateEncoding(&type); type.toUtfProc = Utf16ToUtfProc; type.fromUtfProc = UtfToUcs2Proc; type.freeProc = NULL; @@ -1139,11 +1143,11 @@ srcLen = encodingPtr->lengthProc(src); } flags = TCL_ENCODING_START | TCL_ENCODING_END; if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } while (1) { result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); @@ -1256,11 +1260,11 @@ */ dstLen--; } if (encodingPtr->toUtfProc == UtfToUtfProc) { - flags |= TCL_ENCODING_MODIFIED; + flags |= TCL_ENCODING_MODIFIED | TCL_ENCODING_UTF; } do { Tcl_EncodingState savedState = *statePtr; result = encodingPtr->toUtfProc(encodingPtr->clientData, src, srcLen, @@ -1335,20 +1339,20 @@ result = encodingPtr->fromUtfProc(encodingPtr->clientData, src, srcLen, flags, &state, dst, dstLen, &srcRead, &dstWrote, &dstChars); soFar = dst + dstWrote - Tcl_DStringValue(dstPtr); + src += srcRead; if (result != TCL_CONVERT_NOSPACE) { if (encodingPtr->nullSize == 2) { Tcl_DStringSetLength(dstPtr, soFar + 1); } Tcl_DStringSetLength(dstPtr, soFar); return Tcl_DStringValue(dstPtr); } flags &= ~TCL_ENCODING_START; - src += srcRead; srcLen -= srcRead; if (Tcl_DStringLength(dstPtr) == 0) { Tcl_DStringSetLength(dstPtr, dstLen); } Tcl_DStringSetLength(dstPtr, 2 * Tcl_DStringLength(dstPtr) + 1); @@ -2213,11 +2217,11 @@ charLimit = *dstCharsPtr; } dstStart = dst; flags |= PTR2INT(clientData); - dstEnd = dst + dstLen - TCL_UTF_MAX; + dstEnd = dst + dstLen - ((flags & TCL_ENCODING_UTF) ? TCL_UTF_MAX : 6); for (numChars = 0; src < srcEnd && numChars <= charLimit; numChars++) { if ((src > srcClose) && (!Tcl_UtfCharComplete(src, srcEnd - src))) { /* * If there is more string to follow, this will ensure that the @@ -2266,34 +2270,63 @@ TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); } else { int low; + const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && (flags & TCL_ENCODING_STOPONERROR) && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; break; } src += len; - if ((ch | 0x7FF) == 0xDFFF) { + if (!(flags & TCL_ENCODING_UTF)) { + if (ch > 0xFFFF) { + /* CESU-8 6-byte sequence for chars > U+FFFF */ + ch -= 0x10000; + *dst++ = 0xED; + *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); + *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); + ch = (ch & 0x0CFF) | 0xDC00; + } + goto cesu8; + } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } + cesu8: *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); continue; } src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; + } else if (!Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + src = saveSrc; + break; + } + if (!(flags & TCL_ENCODING_MODIFIED)) { + ch = 0xFFFD; + } } dst += Tcl_UniCharToUtf(ch, dst); } } @@ -2448,11 +2481,11 @@ * correspond to the bytes stored in the * output buffer. */ { const char *srcStart, *srcEnd, *srcClose, *dstStart, *dstEnd; int result, numChars; - int ch; + int ch, len; srcStart = src; srcEnd = src + srcLen; srcClose = srcEnd; if ((flags & TCL_ENCODING_END) == 0) { @@ -2476,11 +2509,19 @@ } if (dst > dstEnd) { result = TCL_CONVERT_NOSPACE; break; } - src += TclUtfToUCS4(src, &ch); + len = TclUtfToUCS4(src, &ch); + if (!Tcl_UniCharIsUnicode(ch)) { + if (flags & TCL_ENCODING_STOPONERROR) { + result = TCL_CONVERT_UNKNOWN; + break; + } + ch = 0xFFFD; + } + src += len; if (flags & TCL_ENCODING_LE) { if (ch <= 0xFFFF) { *dst++ = (ch & 0xFF); *dst++ = (ch >> 8); } else { Index: generic/tclStubInit.c ================================================================== --- generic/tclStubInit.c +++ generic/tclStubInit.c @@ -1939,8 +1939,9 @@ TclGetUnicodeFromObj, /* 652 */ TclGetByteArrayFromObj, /* 653 */ Tcl_UtfCharComplete, /* 654 */ Tcl_UtfNext, /* 655 */ Tcl_UtfPrev, /* 656 */ + Tcl_UniCharIsUnicode, /* 657 */ }; /* !END!: Do not edit above this line. */ Index: generic/tclUtf.c ================================================================== --- generic/tclUtf.c +++ generic/tclUtf.c @@ -2177,10 +2177,40 @@ return 0; } return (GetCategory(ch) == UPPERCASE_LETTER); } +/* + *---------------------------------------------------------------------- + * + * Tcl_UniCharIsUnicode -- + * + * Test if a character is a Unicode character. + * + * Results: + * Returns non-zero if character belongs to the Unicode set. + * + * Excluded are: + * 1) All characters > U+10FFFF + * 2) Surrogates U+D800 - U+DFFF + * 3) Last 2 characters of each plane, so U+??FFFE and U+??FFFF + * 4) The characters in the range U+FDD0 - U+FDEF + * + * Side effects: + * None. + * + *---------------------------------------------------------------------- + */ + +int +Tcl_UniCharIsUnicode( + int ch) /* Unicode character to test. */ +{ + return ((unsigned int)ch <= 0x10FFFF) && ((ch & 0xFFF800) != 0xD800) + && ((ch & 0xFFFE) != 0xFFFE) && ((unsigned int)(ch - 0xFDD0) >= 32); +} + /* *---------------------------------------------------------------------- * * Tcl_UniCharIsWordChar -- * Index: library/init.tcl ================================================================== --- library/init.tcl +++ library/init.tcl @@ -212,13 +212,13 @@ # construct the stack trace. # set errInfo [dict get $opts -errorinfo] set errCode [dict get $opts -errorcode] set cinfo $args - if {[string bytelength $cinfo] > 150} { + if {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 150] - while {[string bytelength $cinfo] > 150} { + while {[string length [encoding convertto utf-8 $cinfo]] > 150} { set cinfo [string range $cinfo 0 end-1] } append cinfo ... } set tail "\n (\"uplevel\" body line 1)\n invoked\ Index: tests/encoding.test ================================================================== --- tests/encoding.test +++ tests/encoding.test @@ -341,65 +341,65 @@ test encoding-15.6 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z -} {10 edb882f09f9882eda0bd} +} {10 efbfbdf09f9882efbfbd} test encoding-15.7 {UtfToUtfProc emoji character output} { set x \uDE02\uD83D\uD83D set y [encoding convertto utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 9 edb882eda0bdeda0bd} +} {3 9 efbfbdefbfbdefbfbd} test encoding-15.8 {UtfToUtfProc emoji character output} { set x \uDE02\uD83Dé set y [encoding convertto utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 8 edb882eda0bdc3a9} +} {3 8 efbfbdefbfbdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} { set x \uDE02\uD83DX set y [encoding convertto utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z -} {3 7 edb882eda0bd58} +} {3 7 efbfbdefbfbd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} { set x \uDE02é set y [encoding convertto utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 5 edb882c3a9} +} {2 5 efbfbdc3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} { set x \uDA02é set y [encoding convertto utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 5 eda882c3a9} +} {2 5 efbfbdc3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} { set x \uDE02Y set y [encoding convertto utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 4 edb88259} +} {2 4 efbfbd59} test encoding-15.13 {UtfToUtfProc low surrogate character output} { set x \uDA02Y set y [encoding convertto utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z -} {2 4 eda88259} +} {2 4 efbfbd59} test encoding-15.14 {UtfToUtfProc high surrogate character output} { set x \uDE02 set y [encoding convertto utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z -} {1 3 edb882} +} {1 3 efbfbd} test encoding-15.15 {UtfToUtfProc low surrogate character output} { set x \uDA02 set y [encoding convertto utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z -} {1 3 eda882} +} {1 3 efbfbd} test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 set y [encoding convertfrom utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" @@ -407,10 +407,30 @@ set x 😂 set y [encoding convertto utf-8 😂] binary scan $y H* z list [string length $y] $z } {4 f09f9882} +test encoding-15.18 {UtfToUtfProc CESU-8 6-byte sequence} { + set y [encoding convertto cesu-8 \U10000] + binary scan $y H* z + list [string length $y] $z +} {6 eda080edb080} +test encoding-15.19 {UtfToUtfProc CESU-8 upper surrogate} { + set y [encoding convertto cesu-8 \uD800] + binary scan $y H* z + list [string length $y] $z +} {3 eda080} +test encoding-15.20 {UtfToUtfProc CESU-8 lower surrogate} { + set y [encoding convertto cesu-8 \uDC00] + binary scan $y H* z + list [string length $y] $z +} {3 edb080} +test encoding-15.21 {UtfToUtfProc CESU-8 noncharacter} { + set y [encoding convertto cesu-8 \uFFFF] + binary scan $y H* z + list [string length $y] $z +} {3 efbfbf} test encoding-16.1 {Utf16ToUtfProc} -body { set val [encoding convertfrom utf-16 NN] list $val [format %x [scan $val %c]] } -result "乎 4e4e" @@ -432,19 +452,19 @@ } -result "\U460DC 460dc" test encoding-17.1 {UtfToUtf16Proc} -body { encoding convertto utf-16 "\U460DC" } -result "\xD8\xD8\xDC\xDC" -test encoding-17.2 {UtfToUtf16Proc} -body { - encoding convertto utf-16 "\uDCDC" -} -result "\xDC\xDC" -test encoding-17.3 {UtfToUtf16Proc} -body { - encoding convertto utf-16 "\uD8D8" -} -result "\xD8\xD8" -test encoding-17.4 {UtfToUcs2Proc} -body { +test encoding-17.2 {UtfToUcs2Proc} -body { encoding convertfrom utf-16 [encoding convertto ucs-2 "\U460DC"] } -result "\uFFFD" +test encoding-17.3 {UtfToUtf16Proc} -body { + encoding convertto utf-16be "\uDCDC" +} -result "\xFF\xFD" +test encoding-17.4 {UtfToUtf16Proc} -body { + encoding convertto utf-16le "\uD8D8" +} -result "\xFD\xFF" test encoding-18.1 {TableToUtfProc} { } {} test encoding-19.1 {TableFromUtfProc} { @@ -740,11 +760,11 @@ # discard the cached internal representation of Tcl_Encoding # Unfortunately, without this, encoding 2-1 fails. llength $name } return $count -} -result [expr {[info exists ::tcl_precision] ? 86 : 85}] +} -result [expr {[info exists ::tcl_precision] ? 87 : 86}] runtests } Index: tests/info.test ================================================================== --- tests/info.test +++ tests/info.test @@ -20,11 +20,11 @@ namespace import -force ::tcltest::* } ::tcltest::loadTestedCommands catch [list package require -exact tcl::test [info patchlevel]] testConstraint zlib [llength [info commands zlib]] - +testConstraint nodep [info exists tcl_precision] # Set up namespaces needed to test operation of "info args", "info body", # "info default", and "info procs" with imported procedures. catch {namespace delete test_ns_info1 test_ns_info2} @@ -99,11 +99,11 @@ eval [info body foo] } -returnCodes error -result {can't read "args": no such variable} # Fix for problem tested for in info-2.5 caused problems when # procedure body had no string rep (i.e. was not yet bytecode) # causing an empty string to be returned [Bug #545644] -test info-2.6 {info body option, returning list bodies} { +test info-2.6 {info body option, returning list bodies} nodep { proc foo args [list subst bar] list [string bytelength [info body foo]] \ [foo; string bytelength [info body foo]] } {9 9} Index: tests/io.test ================================================================== --- tests/io.test +++ tests/io.test @@ -2414,11 +2414,11 @@ set chan [chan create r {apply {{cmd chan args} { switch $cmd { blocking - finalize { } watch { - chan postevent $chan read + chan postevent $chan read } initialize { list initialize finalize watch read write configure blocking } default { @@ -2435,11 +2435,11 @@ return } } [namespace current]]] vwait [namespace current]::done return success -} success +} success test io-29.1 {Tcl_WriteChars, channel not writable} { list [catch {puts stdin hello} msg] $msg Index: tests/regexp.test ================================================================== --- tests/regexp.test +++ tests/regexp.test @@ -17,10 +17,11 @@ } unset -nocomplain foo testConstraint exec [llength [info commands exec]] +testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] if {[testConstraint memory]} { proc memtest script { @@ -763,11 +764,11 @@ regsub -all {@} {@hel@lo@} "\0a\0" result set expected "\0a\0hel\0a\0lo\0a\0" string equal $result $expected } 1 -test regexp-20.1 {regsub shared object shimmering} -body { +test regexp-20.1 {regsub shared object shimmering} -constraints nodep -body { # Bug #461322 set a abcdefghijklmnopqurstuvwxyz set b $a set c abcdefghijklmnopqurstuvwxyz0123456789 regsub $a $c $b d Index: tests/regexpComp.test ================================================================== --- tests/regexpComp.test +++ tests/regexpComp.test @@ -13,10 +13,12 @@ if {"::tcltest" ni [namespace children]} { package require tcltest 2.5 namespace import -force ::tcltest::* } + +testConstraint nodep [info exists tcl_precision] # Procedure to evaluate a script within a proc, to test compilation # functionality proc evalInProc { script } { @@ -789,11 +791,11 @@ regsub -all {@} {@hel@lo@} "\0a\0" result list $result [string length $result] } } "\0a\0hel\0a\0lo\0a\0 14" -test regexpComp-20.1 {regsub shared object shimmering} { +test regexpComp-20.1 {regsub shared object shimmering} nodep { evalInProc { # Bug #461322 set a abcdefghijklmnopqurstuvwxyz set b $a set c abcdefghijklmnopqurstuvwxyz0123456789 Index: tests/string.test ================================================================== --- tests/string.test +++ tests/string.test @@ -31,10 +31,11 @@ testConstraint testobj [expr {[info commands testobj] ne {}}] testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf16 [expr {[string length \U010000] == 2}] testConstraint testbytestring [llength [info commands testbytestring]] +testConstraint nodep [info exists tcl_precision] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] if {[testConstraint memory]} { proc getbytes {} { @@ -70,13 +71,13 @@ interp alias {} run {} try set constraints {} } -test string-1.1.$noComp {error conditions} { +test string-1.1.$noComp {error conditions} -body { list [catch {run {string gorp a b}} msg] $msg -} {1 {unknown or ambiguous subcommand "gorp": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "gorp": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-1.2.$noComp {error conditions} { list [catch {run {string}} msg] $msg } {1 {wrong # args: should be "string subcommand ?arg ...?"}} test stringComp-1.3.$noComp {error condition - undefined method during compile} { # We don't want this to complain about 'never' because it may never @@ -523,14 +524,14 @@ test string-6.4.$noComp {string is, too many args} { list [catch {run {string is alpha -failin var -strict str more}} msg] $msg } {1 {wrong # args: should be "string is class ?-strict? ?-failindex var? str"}} test string-6.5.$noComp {string is, class check} { list [catch {run {string is bogus str}} msg] $msg -} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {bad class "bogus": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.6.$noComp {string is, ambiguous class} { list [catch {run {string is al str}} msg] $msg -} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, wideinteger, wordchar, or xdigit}} +} {1 {ambiguous class "al": must be alnum, alpha, ascii, control, boolean, dict, digit, double, entier, false, graph, integer, list, lower, print, punct, space, true, upper, unicode, wideinteger, wordchar, or xdigit}} test string-6.7.$noComp {string is alpha, all ok} { run {string is alpha -strict -failindex var abc} } 1 test string-6.8.$noComp {string is, error in var} { list [run {string is alpha -failindex var abc5def}] $var @@ -959,10 +960,32 @@ list [run {string is entier -fail var 0o1234561123412345612345656234561234561234561234561234561234561234561234561234561234536963}] $var } {0 87} test string-6.131.$noComp {string is entier, false on bad hex} { list [run {string is entier -fail var 0X12345611234123456123456562345612345612345612345612345612345612345612345612345612345345XYZ}] $var } {0 88} +test string-6.132.$noComp {string is unicode} { + run {string is unicode \U10FFFD\uD7FF\uE000\uFDCF\uFDF0} +} 1 +test string-6.133.$noComp {string is unicode, upper surrogate} { + run {string is unicode \uD800} +} 0 +test string-6.134.$noComp {string is unicode, lower surrogate} { + run {string is unicode \uDFFF} +} 0 +test string-6.135.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFE} +} 0 +test string-6.136.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFFFF} +} 0 +test string-6.137.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDD0} +} 0 +test string-6.138.$noComp {string is unicode, noncharacter} { + run {string is unicode \uFDEF} +} 0 + test string-7.1.$noComp {string last, not enough args} { list [catch {run {string last a}} msg] $msg } {1 {wrong # args: should be "string last needleString haystackString ?lastIndex?"}} test string-7.2.$noComp {string last, bad args} { @@ -1011,20 +1034,20 @@ } -1 test string-7.16.$noComp {string last, start index} { run {string last Üa ÜadÜad end-1} } 3 -test string-8.1.$noComp {string bytelength} { +test string-8.1.$noComp {string bytelength} nodep { list [catch {run {string bytelength}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.2.$noComp {string bytelength} { +test string-8.2.$noComp {string bytelength} nodep { list [catch {run {string bytelength a b}} msg] $msg } {1 {wrong # args: should be "string bytelength string"}} -test string-8.3.$noComp {string bytelength} { +test string-8.3.$noComp {string bytelength} nodep { run {string bytelength "\xC7"} } 2 -test string-8.4.$noComp {string bytelength} { +test string-8.4.$noComp {string bytelength} nodep { run {string b ""} } 0 test string-9.1.$noComp {string length} { list [catch {run {string length}} msg] $msg @@ -1815,13 +1838,13 @@ } \u1361ABC test string-20.1.$noComp {string trimright errors} { list [catch {run {string trimright}} msg] $msg } {1 {wrong # args: should be "string trimright string ?chars?"}} -test string-20.2.$noComp {string trimright errors} { +test string-20.2.$noComp {string trimright errors} -body { list [catch {run {string trimg a}} msg] $msg -} {1 {unknown or ambiguous subcommand "trimg": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "trimg": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-20.3.$noComp {string trimright} { run {string trimright " XYZ "} } { XYZ} test string-20.4.$noComp {string trimright} { run {string trimright " "} @@ -1937,11 +1960,11 @@ run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg -} -result {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} +} -match glob -result {1 {unknown or ambiguous subcommand "word": must be *cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-22.2.$noComp {string wordstart} -body { list [catch {run {string wordstart a}} msg] $msg } -result {1 {wrong # args: should be "string wordstart string index"}} test string-22.3.$noComp {string wordstart} -body { list [catch {run {string wordstart a b c}} msg] $msg