Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch tip-619 Excluding Merge-Ins
This is equivalent to a diff from ccbe007452 to 631cce8656
2022-07-04
| ||
11:21 | TIP #619: New TCL_COMBINE flag for Tcl_UniCharToUtf() check-in: 3bb6f6602c user: jan.nijtmans tags: trunk, main | |
2022-06-10
| ||
20:24 | Merge 8.7 check-in: 8c31c163af user: jan.nijtmans tags: trunk, main | |
13:45 | Provide help with solving merge conflicts ;-) check-in: 71597277a7 user: jan.nijtmans tags: dgp-refactor | |
2022-06-09
| ||
20:25 | Merge 9.0 Closed-Leaf check-in: 631cce8656 user: jan.nijtmans tags: tip-619 | |
16:31 | Merge 9.0 check-in: 475b98506a user: jan.nijtmans tags: tip-626 | |
13:45 | Internal bug in Tcl_GetIndexFromObjStruct re-definition (works, as long as sizeof(struct) == sizeof(... check-in: ccbe007452 user: jan.nijtmans tags: trunk, main | |
07:03 | More (internal) int -> size_t, allowing values > 2^31 check-in: 5d828b481d user: jan.nijtmans tags: trunk, main | |
2022-05-19
| ||
15:05 | Merge 9.0 check-in: 60d39427c4 user: jan.nijtmans tags: tip-619 | |
Changes to doc/Utf.3.
︙ | ︙ | |||
162 163 164 165 166 167 168 | consists of a lead byte followed by some number of trail bytes. .PP \fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR can consume in a single call. .PP \fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string in starting at \fIbuf\fR. The return value is the number of bytes stored | | > | | < | > | 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 | consists of a lead byte followed by some number of trail bytes. .PP \fBTCL_UTF_MAX\fR is the maximum number of bytes that \fBTcl_UtfToUniChar\fR can consume in a single call. .PP \fBTcl_UniCharToUtf\fR stores the character \fIch\fR as a UTF-8 string in starting at \fIbuf\fR. The return value is the number of bytes stored in \fIbuf\fR. The character \fIch\fR can be or'ed with the value TCL_COMBINE to enable special behavior, compatible with Tcl 8.x. Then, if ch is a high surrogate (range U+D800 - U+DBFF), the return value will be 1 and a single byte in the range 0xF0 - 0xF4 will be stored. If \fIch\fR is a low surrogate (range U+DC00 - U+DFFF), an attempt is made to combine the result with the earlier produced bytes, resulting in a 4-byte UTF-8 byte sequence. .PP \fBTcl_UtfToUniChar\fR reads one UTF-8 character starting at \fIsrc\fR and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is |
︙ | ︙ |
Changes to generic/tcl.h.
︙ | ︙ | |||
831 832 833 834 835 836 837 838 839 840 841 842 843 844 | * a table that will not live long enough to make it worthwhile. */ #define TCL_EXACT 1 #define TCL_INDEX_NULL_OK 32 #define TCL_INDEX_TEMP_TABLE 64 /* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for * evalFlag bits in tclInt.h! * * Meanings: | > > > > > > > | 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 | * a table that will not live long enough to make it worthwhile. */ #define TCL_EXACT 1 #define TCL_INDEX_NULL_OK 32 #define TCL_INDEX_TEMP_TABLE 64 /* * Flags that may be passed to Tcl_UniCharToUtf. * TCL_COMBINE Combine surrogates */ #define TCL_COMBINE 0x1000000 /* *---------------------------------------------------------------------------- * Flag values passed to Tcl_RecordAndEval, Tcl_EvalObj, Tcl_EvalObjv. * WARNING: these bit choices must not conflict with the bit choices for * evalFlag bits in tclInt.h! * * Meanings: |
︙ | ︙ |
Changes to generic/tclCmdMZ.c.
︙ | ︙ | |||
1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 | unsigned char uch = UCHAR(ch); Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { char buf[4] = ""; end = Tcl_UniCharToUtf(ch, buf); if ((ch >= 0xD800) && (end < 3)) { end += Tcl_UniCharToUtf(-1, buf + end); } Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end)); } } return TCL_OK; } /* | > > | 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 | unsigned char uch = UCHAR(ch); Tcl_SetObjResult(interp, Tcl_NewByteArrayObj(&uch, 1)); } else { char buf[4] = ""; end = Tcl_UniCharToUtf(ch, buf); #if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (end < 3)) { end += Tcl_UniCharToUtf(-1, buf + end); } #endif Tcl_SetObjResult(interp, Tcl_NewStringObj(buf, end)); } } return TCL_OK; } /* |
︙ | ︙ |
Changes to generic/tclDecls.h.
︙ | ︙ | |||
4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 | # define Tcl_UniCharToUtfDString Tcl_Char16ToUtfDString # undef Tcl_UtfToUniCharDString # define Tcl_UtfToUniCharDString Tcl_UtfToChar16DString # undef Tcl_UtfToUniChar # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len #if !defined(BUILD_tcl) # undef Tcl_NumUtfChars # define Tcl_NumUtfChars TclNumUtfChars # undef Tcl_GetCharLength # define Tcl_GetCharLength TclGetCharLength # undef Tcl_UtfAtIndex # define Tcl_UtfAtIndex TclUtfAtIndex | > > > > > > > > | 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 | # define Tcl_UniCharToUtfDString Tcl_Char16ToUtfDString # undef Tcl_UtfToUniCharDString # define Tcl_UtfToUniCharDString Tcl_UtfToChar16DString # undef Tcl_UtfToUniChar # define Tcl_UtfToUniChar Tcl_UtfToChar16 # undef Tcl_UniCharLen # define Tcl_UniCharLen Tcl_Char16Len # undef Tcl_UniCharToUtf # if defined(USE_TCL_STUBS) # define Tcl_UniCharToUtf(c, p) \ (tclStubsPtr->tcl_UniCharToUtf((c)|TCL_COMBINE, (p))) # else # define Tcl_UniCharToUtf(c, p) \ ((Tcl_UniCharToUtf)((c)|TCL_COMBINE, (p))) # endif #if !defined(BUILD_tcl) # undef Tcl_NumUtfChars # define Tcl_NumUtfChars TclNumUtfChars # undef Tcl_GetCharLength # define Tcl_GetCharLength TclGetCharLength # undef Tcl_UtfAtIndex # define Tcl_UtfAtIndex TclUtfAtIndex |
︙ | ︙ |
Changes to generic/tclEncoding.c.
︙ | ︙ | |||
2312 2313 2314 2315 2316 2317 2318 | } else { char chbuf[2]; chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); } else { | < > | > > > > > > | | < < < < > | 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 | } else { char chbuf[2]; chbuf[0] = UCHAR(*src++); chbuf[1] = 0; TclUtfToUCS4(chbuf, &ch); } dst += Tcl_UniCharToUtf(ch, dst); } else { const char *saveSrc = src; size_t len = TclUtfToUCS4(src, &ch); if ((len < 2) && (ch != 0) && !(flags & TCL_ENCODING_NOCOMPLAIN) && (flags & TCL_ENCODING_MODIFIED)) { result = TCL_CONVERT_SYNTAX; break; } src += len; if (!(flags & TCL_ENCODING_UTF) && (ch > 0x3FF)) { if (ch > 0xFFFF) { /* CESU-8 6-byte sequence for chars > U+FFFF */ ch -= 0x10000; *dst++ = 0xED; *dst++ = (char) (((ch >> 16) & 0x0F) | 0xA0); *dst++ = (char) (((ch >> 10) & 0x3F) | 0x80); ch = (ch & 0x0CFF) | 0xDC00; } #if TCL_UTF_MAX < 4 cesu8: #endif *dst++ = (char) (((ch >> 12) | 0xE0) & 0xEF); *dst++ = (char) (((ch >> 6) | 0x80) & 0xBF); *dst++ = (char) ((ch | 0x80) & 0xBF); continue; #if TCL_UTF_MAX < 4 } else if ((ch | 0x7FF) == 0xDFFF) { /* * A surrogate character is detected, handle especially. */ int low = ch; len = (src <= srcEnd-3) ? TclUtfToUCS4(src, &low) : 0; if (((low & ~0x3FF) != 0xDC00) || (ch & 0x400)) { if (!(flags & TCL_ENCODING_NOCOMPLAIN)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } goto cesu8; } src += len; dst += Tcl_UniCharToUtf(ch, dst); ch = low; #endif } else if (!Tcl_UniCharIsUnicode(ch)) { if (!(flags & TCL_ENCODING_NOCOMPLAIN)) { result = TCL_CONVERT_UNKNOWN; src = saveSrc; break; } if (!(flags & TCL_ENCODING_MODIFIED)) { |
︙ | ︙ | |||
2663 2664 2665 2666 2667 2668 2669 | * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { | | | 2666 2667 2668 2669 2670 2671 2672 2673 2674 2675 2676 2677 2678 2679 2680 | * Special case for 1-byte utf chars for speed. Make sure we work with * unsigned short-size data. */ if (ch && ch < 0x80) { *dst++ = (ch & 0xFF); } else { dst += Tcl_UniCharToUtf(ch | TCL_COMBINE, dst); } src += sizeof(unsigned short); } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; |
︙ | ︙ |
Changes to generic/tclExecute.c.
︙ | ︙ | |||
5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 | * but creating the object as a string seems to be faster in * practical use. */ if (ch == -1) { TclNewObj(objResultPtr); } else { slength = Tcl_UniCharToUtf(ch, buf); if ((ch >= 0xD800) && (slength < 3)) { slength += Tcl_UniCharToUtf(-1, buf + slength); } objResultPtr = Tcl_NewStringObj(buf, slength); } } TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr))); NEXT_INST_F(1, 2, 1); | > > | 5128 5129 5130 5131 5132 5133 5134 5135 5136 5137 5138 5139 5140 5141 5142 5143 5144 5145 5146 | * but creating the object as a string seems to be faster in * practical use. */ if (ch == -1) { TclNewObj(objResultPtr); } else { slength = Tcl_UniCharToUtf(ch, buf); #if TCL_UTF_MAX < 4 if ((ch >= 0xD800) && (slength < 3)) { slength += Tcl_UniCharToUtf(-1, buf + slength); } #endif objResultPtr = Tcl_NewStringObj(buf, slength); } } TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr))); NEXT_INST_F(1, 2, 1); |
︙ | ︙ |
Changes to generic/tclParse.c.
︙ | ︙ | |||
865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 | case 'u': count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result); if (count == 2) { /* * No hexdigits -> This is just "u". */ result = 'u'; } else if (((result & 0xFC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { /* If high surrogate is immediately followed by a low surrogate * escape, combine them into one character. */ int low; int count2 = ParseHex(p+7, 4, &low); if ((count2 == 4) && ((low & 0xFC00) == 0xDC00)) { result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; count += count2 + 2; } } break; case 'U': count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); if (count == 2) { /* * No hexdigits -> This is just "U". */ result = 'U'; | > > < < < | 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 | case 'u': count += ParseHex(p+1, (numBytes > 5) ? 4 : numBytes-2, &result); if (count == 2) { /* * No hexdigits -> This is just "u". */ result = 'u'; #if TCL_UTF_MAX < 4 } else if (((result & 0xFC00) == 0xD800) && (count == 6) && (p[5] == '\\') && (p[6] == 'u') && (numBytes >= 10)) { /* If high surrogate is immediately followed by a low surrogate * escape, combine them into one character. */ int low; int count2 = ParseHex(p+7, 4, &low); if ((count2 == 4) && ((low & 0xFC00) == 0xDC00)) { result = ((result & 0x3FF)<<10 | (low & 0x3FF)) + 0x10000; count += count2 + 2; } #endif } break; case 'U': count += ParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); if (count == 2) { /* * No hexdigits -> This is just "U". */ result = 'U'; } break; case '\n': count--; do { p++; count++; |
︙ | ︙ | |||
950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 | } done: if (readPtr != NULL) { *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); if ((result >= 0xD800) && (count < 3)) { /* Special case for handling high surrogates. */ count += Tcl_UniCharToUtf(-1, dst + count); } return count; } /* *---------------------------------------------------------------------- * * ParseComment -- | > > | 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 | } done: if (readPtr != NULL) { *readPtr = count; } count = Tcl_UniCharToUtf(result, dst); #if TCL_UTF_MAX < 4 if ((result >= 0xD800) && (count < 3)) { /* Special case for handling high surrogates. */ count += Tcl_UniCharToUtf(-1, dst + count); } #endif return count; } /* *---------------------------------------------------------------------- * * ParseComment -- |
︙ | ︙ |
Changes to generic/tclStringObj.c.
︙ | ︙ | |||
66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 | static void GrowUnicodeBuffer(Tcl_Obj *objPtr, size_t needed); static int SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, size_t numChars); static size_t UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); #define ISCONTINUATION(bytes) (\ ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80)))) /* * The structure below defines the string Tcl object type by means of * functions that can be invoked by generic object code. */ | > > > > > | 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 | static void GrowUnicodeBuffer(Tcl_Obj *objPtr, size_t needed); static int SetStringFromAny(Tcl_Interp *interp, Tcl_Obj *objPtr); static void SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, size_t numChars); static size_t UnicodeLength(const Tcl_UniChar *unicode); static void UpdateStringOfString(Tcl_Obj *objPtr); #if TCL_UTF_MAX > 3 #define ISCONTINUATION(bytes) (\ ((bytes)[0] & 0xC0) == 0x80) #else #define ISCONTINUATION(bytes) (\ ((((bytes)[0] & 0xC0) == 0x80) || (((bytes)[0] == '\xED') \ && (((bytes)[1] & 0xF0) == 0xB0) && (((bytes)[2] & 0xC0) == 0x80)))) #endif /* * The structure below defines the string Tcl object type by means of * functions that can be invoked by generic object code. */ |
︙ | ︙ | |||
2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 | char buf[4] = ""; int code, length; if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } length = Tcl_UniCharToUtf(code, buf); if ((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ length += Tcl_UniCharToUtf(-1, buf + length); } segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); allocSegment = 1; break; } case 'u': | > > | 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 | char buf[4] = ""; int code, length; if (TclGetIntFromObj(interp, segment, &code) != TCL_OK) { goto error; } length = Tcl_UniCharToUtf(code, buf); #if TCL_UTF_MAX < 4 if ((code >= 0xD800) && (length < 3)) { /* Special case for handling high surrogates. */ length += Tcl_UniCharToUtf(-1, buf + length); } #endif segment = Tcl_NewStringObj(buf, length); Tcl_IncrRefCount(segment); allocSegment = 1; break; } case 'u': |
︙ | ︙ | |||
4178 4179 4180 4181 4182 4183 4184 | stringPtr->numChars = needed; } else { numAppendChars = 0; } dst = stringPtr->unicode + numOrigChars; if (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); | < < < < < < < < | 4185 4186 4187 4188 4189 4190 4191 4192 4193 4194 4195 4196 4197 4198 | stringPtr->numChars = needed; } else { numAppendChars = 0; } dst = stringPtr->unicode + numOrigChars; if (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); *dst++ = unichar; while (numAppendChars-- > 0) { bytes += TclUtfToUniChar(bytes, &unichar); *dst++ = unichar; } } *dst = 0; |
︙ | ︙ |
Changes to generic/tclUtf.c.
︙ | ︙ | |||
204 205 206 207 208 209 210 211 212 213 | * * Side effects: * None. * *--------------------------------------------------------------------------- */ int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the | > | > > > > > > > > > > > | | 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 | * * Side effects: * None. * *--------------------------------------------------------------------------- */ #undef Tcl_UniCharToUtf int Tcl_UniCharToUtf( int ch, /* The Tcl_UniChar to be stored in the * buffer. Can be or'ed with flag TCL_COMBINE */ char *buf) /* Buffer in which the UTF-8 representation of * the Tcl_UniChar is stored. Buffer must be * large enough to hold the UTF-8 character * (at most 4 bytes). */ { #if TCL_UTF_MAX > 3 int flags = ch; #endif if (ch >= TCL_COMBINE) { ch &= (TCL_COMBINE - 1); } if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) { buf[0] = (char) ch; return 1; } if (ch >= 0) { if (ch <= 0x7FF) { buf[1] = (char) ((ch | 0x80) & 0xBF); buf[0] = (char) ((ch >> 6) | 0xC0); return 2; } if (ch <= 0xFFFF) { if ( #if TCL_UTF_MAX > 3 (flags & TCL_COMBINE) && #endif ((ch & 0xF800) == 0xD800)) { if (ch & 0x0400) { /* Low surrogate */ if (((buf[0] & 0xC0) == 0x80) && ((buf[1] & 0xCF) == 0)) { /* Previous Tcl_UniChar was a high surrogate, so combine */ buf[2] = (char) ((ch & 0x3F) | 0x80); buf[1] |= (char) (((ch >> 6) & 0x0F) | 0x80); return 3; |
︙ | ︙ | |||
373 374 375 376 377 378 379 | p = string; wEnd = uniStr + uniLength; for (w = uniStr; w < wEnd; ) { if (!len && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } | | | 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 | p = string; wEnd = uniStr + uniLength; for (w = uniStr; w < wEnd; ) { if (!len && ((*w & 0xFC00) != 0xDC00)) { /* Special case for handling high surrogates. */ p += Tcl_UniCharToUtf(-1, p); } len = Tcl_UniCharToUtf(*w | TCL_COMBINE, p); p += len; if ((*w >= 0xD800) && (len < 3)) { len = 0; /* Indication that high surrogate was found */ } w++; } if (!len) { |
︙ | ︙ | |||
1344 1345 1346 1347 1348 1349 1350 | /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. */ | | | 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 | /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. */ if (len < TclUtfCount(upChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(upChar, dst); } src += len; } |
︙ | ︙ | |||
1397 1398 1399 1400 1401 1402 1403 | /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. */ | | | 1409 1410 1411 1412 1413 1414 1415 1416 1417 1418 1419 1420 1421 1422 1423 | /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. */ if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } src += len; } |
︙ | ︙ | |||
1447 1448 1449 1450 1451 1452 1453 | src = dst = str; if (*src) { len = TclUtfToUCS4(src, &ch); titleChar = Tcl_UniCharToTitle(ch); | | | | 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 | src = dst = str; if (*src) { len = TclUtfToUCS4(src, &ch); titleChar = Tcl_UniCharToTitle(ch); if (len < TclUtfCount(titleChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(titleChar, dst); } src += len; } while (*src) { len = TclUtfToUCS4(src, &ch); lowChar = ch; /* Special exception for Georgian Asomtavruli chars, no titlecase. */ if ((unsigned)(lowChar - 0x1C90) >= 0x30) { lowChar = Tcl_UniCharToLower(lowChar); } if (len < TclUtfCount(lowChar)) { memmove(dst, src, len); dst += len; } else { dst += Tcl_UniCharToUtf(lowChar, dst); } src += len; } |
︙ | ︙ |
Changes to tests/encoding.test.
︙ | ︙ | |||
36 37 38 39 40 41 42 | # Some tests require the testencoding command testConstraint testencoding [llength [info commands testencoding]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint exec [llength [info commands exec]] testConstraint testgetencpath [llength [info commands testgetencpath]] | > > > | | 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 | # Some tests require the testencoding command testConstraint testencoding [llength [info commands testencoding]] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint teststringbytes [llength [info commands teststringbytes]] testConstraint exec [llength [info commands exec]] testConstraint testgetencpath [llength [info commands testgetencpath]] testConstraint fullutf [expr {[format %c 0x010000] ne "\uFFFD"}] testConstraint utf32 [expr {[testConstraint fullutf] && [string length [format %c 0x10000]] == 1}] # TclInitEncodingSubsystem is tested by the rest of this file # TclFinalizeEncodingSubsystem is not currently tested test encoding-1.1 {Tcl_GetEncoding: system encoding} -setup { set old [encoding system] } -constraints {testencoding} -body { testencoding create foo [namespace origin toutf] [namespace origin fromutf] |
︙ | ︙ | |||
332 333 334 335 336 337 338 | test encoding-15.3 {UtfToUtfProc null character input} teststringbytes { set y [encoding convertfrom utf-8 [encoding convertto utf-8 \x00]] binary scan [teststringbytes $y] H* z set z } c080 test encoding-15.4 {UtfToUtfProc emoji character input} -body { set x \xED\xA0\xBD\xED\xB8\x82 | | | | | | | | | | | | | | | | | | | | | | | | 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 | test encoding-15.3 {UtfToUtfProc null character input} teststringbytes { set y [encoding convertfrom utf-8 [encoding convertto utf-8 \x00]] binary scan [teststringbytes $y] H* z set z } c080 test encoding-15.4 {UtfToUtfProc emoji character input} -body { set x \xED\xA0\xBD\xED\xB8\x82 set y [encoding convertfrom -nocomplain utf-8 \xED\xA0\xBD\xED\xB8\x82] list [string length $x] $y } -result "6 \uD83D\uDE02" test encoding-15.5 {UtfToUtfProc emoji character input} { set x \xF0\x9F\x98\x82 set y [encoding convertfrom utf-8 \xF0\x9F\x98\x82] list [string length $x] $y } "4 😂" test encoding-15.6 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83D\uDE02\uD83D set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uDE02\uD83D] binary scan $y H* z list [string length $y] $z } {12 efbfbdefbfbdefbfbdefbfbd} test encoding-15.7 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83D\uD83D set y [encoding convertto -nocomplain utf-8 \uDE02\uD83D\uD83D] binary scan $y H* z list [string length $x] [string length $y] $z } {3 9 efbfbdefbfbdefbfbd} test encoding-15.8 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83Dé set y [encoding convertto -nocomplain utf-8 \uDE02\uD83Dé] binary scan $y H* z list [string length $x] [string length $y] $z } {3 8 efbfbdefbfbdc3a9} test encoding-15.9 {UtfToUtfProc emoji character output} utf32 { set x \uDE02\uD83DX set y [encoding convertto -nocomplain utf-8 \uDE02\uD83DX] binary scan $y H* z list [string length $x] [string length $y] $z } {3 7 efbfbdefbfbd58} test encoding-15.10 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02é set y [encoding convertto -nocomplain utf-8 \uDE02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} test encoding-15.11 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02é set y [encoding convertto -nocomplain utf-8 \uDA02é] binary scan $y H* z list [string length $x] [string length $y] $z } {2 5 efbfbdc3a9} test encoding-15.12 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02Y set y [encoding convertto -nocomplain utf-8 \uDE02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} test encoding-15.13 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02Y set y [encoding convertto -nocomplain utf-8 \uDA02Y] binary scan $y H* z list [string length $x] [string length $y] $z } {2 4 efbfbd59} test encoding-15.14 {UtfToUtfProc high surrogate character output} utf32 { set x \uDE02 set y [encoding convertto -nocomplain utf-8 \uDE02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} test encoding-15.15 {UtfToUtfProc low surrogate character output} utf32 { set x \uDA02 set y [encoding convertto -nocomplain utf-8 \uDA02] binary scan $y H* z list [string length $x] [string length $y] $z } {1 3 efbfbd} test encoding-15.16 {UtfToUtfProc: Invalid 4-byte UTF-8, see [ed29806ba]} { set x \xF0\xA0\xA1\xC2 set y [encoding convertfrom -nocomplain utf-8 \xF0\xA0\xA1\xC2] list [string length $x] $y } "4 \xF0\xA0\xA1\xC2" test encoding-15.17 {UtfToUtfProc emoji character output} { set x 😂 |
︙ | ︙ |
Changes to tests/string.test.
︙ | ︙ | |||
30 31 32 33 34 35 36 37 38 39 40 41 42 43 | # Some tests require the testobj command testConstraint testobj [expr {[info commands testobj] ne {}}] testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf32 [expr {[string length \U010000] == 1}] testConstraint testbytestring [llength [info commands testbytestring]] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] if {[testConstraint memory]} { proc getbytes {} { set lines [split [memory info] \n] return [lindex $lines 3 3] | > > | 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 | # Some tests require the testobj command testConstraint testobj [expr {[info commands testobj] ne {}}] testConstraint testindexobj [expr {[info commands testindexobj] ne {}}] testConstraint testevalex [expr {[info commands testevalex] ne {}}] testConstraint utf32 [expr {[string length \U010000] == 1}] testConstraint testbytestring [llength [info commands testbytestring]] testConstraint utf32 [expr {[testConstraint fullutf] && [string length [format %c 0x10000]] == 1}] # Used for constraining memory leak tests testConstraint memory [llength [info commands memory]] if {[testConstraint memory]} { proc getbytes {} { set lines [split [memory info] \n] return [lindex $lines 3 3] |
︙ | ︙ | |||
1942 1943 1944 1945 1946 1947 1948 | test string-21.21.$noComp {string trimleft, unicode} { run {string trimleft "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.22.$noComp {string trimright, unicode} { run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.23.$noComp {string trim, unicode} { | | | | 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 | test string-21.21.$noComp {string trimleft, unicode} { run {string trimleft "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.22.$noComp {string trimright, unicode} { run {string trimright "\uF602Hello world!\uF602" \uD83D\uDE02} } "\uF602Hello world!\uF602" test string-21.23.$noComp {string trim, unicode} { run {string trim "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.24.$noComp {string trimleft, unicode} { run {string trimleft "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D\uDE02} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-21.25.$noComp {string trimright, unicode} { run {string trimright "\uD83D\uDE02Hello world!\uD83D\uDE02" \uD93D} } "\uD83D\uDE02Hello world!\uD83D\uDE02" test string-22.1.$noComp {string wordstart} -body { list [catch {run {string word a}} msg] $msg } -result {1 {unknown or ambiguous subcommand "word": must be cat, compare, equal, first, index, insert, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}} test string-22.2.$noComp {string wordstart} -body { list [catch {run {string wordstart a}} msg] $msg |
︙ | ︙ | |||
2112 2113 2114 2115 2116 2117 2118 | binary scan [run {string reverse [binary format H* 010203]}] H* x set x } 030201 test string-24.15.$noComp {string reverse command - pure bytearray} { binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x set x } 030201 | | | | | | | | | | 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 | binary scan [run {string reverse [binary format H* 010203]}] H* x set x } 030201 test string-24.15.$noComp {string reverse command - pure bytearray} { binary scan [run {tcl::string::reverse [binary format H* 010203]}] H* x set x } 030201 test string-24.16.$noComp {string reverse command - surrogates} utf32 { run {string reverse \u0444bulb\uD83D\uDE02} } \uDE02\uD83Dblub\u0444 test string-24.17.$noComp {string reverse command - surrogates} utf32 { run {string reverse \uD83D\uDE02hello\uD83D\uDE02} } \uDE02\uD83Dolleh\uDE02\uD83D test string-24.18.$noComp {string reverse command - surrogates} utf32 { set s \u0444bulb\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} } \uDE02\uD83Dblub\u0444 test string-24.19.$noComp {string reverse command - surrogates} utf32 { set s \uD83D\uDE02hello\uD83D\uDE02 # shim shimmery ... string index $s 0 run {string reverse $s} } \uDE02\uD83Dolleh\uDE02\uD83D test string-25.1.$noComp {string is list} { run {string is list {a b c}} } 1 test string-25.2.$noComp {string is list} { run {string is list "a \{b c"} } 0 |
︙ | ︙ |
Changes to tests/utf.test.
︙ | ︙ | |||
74 75 76 77 78 79 80 | } 1 test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring { expr {[format %c 0xDC42] eq [testbytestring \xED\xB1\x82]} } 1 test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 | | | | 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 | } 1 test utf-1.11 {Tcl_UniCharToUtf: 3 byte sequence, low surrogate} testbytestring { expr {[format %c 0xDC42] eq [testbytestring \xED\xB1\x82]} } 1 test utf-1.12 {Tcl_UniCharToUtf: 4 byte sequence, high/low surrogate} {pairsTo4bytes testbytestring} { expr {"\uD842\uDC42" eq [testbytestring \xF0\xA0\xA1\x82]} } 1 test utf-1.13.0 {Tcl_UniCharToUtf: Invalid surrogate} Uesc { expr {"\UD842" eq "\uD842"} } 1 test utf-1.13.1 {Tcl_UniCharToUtf: Invalid surrogate} {fullutf testbytestring} { expr {"\UD842" eq [testbytestring \xED\xA1\x82]} } 1 test utf-1.14 {Tcl_UniCharToUtf: surrogate pairs from concat} { set lo \uDE02 return \uD83D$lo } \uD83D\uDE02 test utf-1.15 {Tcl_UniCharToUtf: surrogate pairs from concat} { set hi \uD83D |
︙ | ︙ | |||
1110 1111 1112 1113 1114 1115 1116 | } ᲐᲐ test utf-11.6 {Tcl_UtfToUpper beyond U+FFFF} fullutf { string toupper 𐐨 } 𐐀 test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} fullutf { string toupper 𐐨 } 𐐀 | | | | 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 | } ᲐᲐ test utf-11.6 {Tcl_UtfToUpper beyond U+FFFF} fullutf { string toupper 𐐨 } 𐐀 test utf-11.7 {Tcl_UtfToUpper beyond U+FFFF} fullutf { string toupper 𐐨 } 𐐀 test utf-11.8 {Tcl_UtfToUpper low/high surrogate)} utf32 { string toupper \uDC24\uD824 } \uDC24\uD824 test utf-12.1 {Tcl_UtfToLower} { string tolower {} } {} test utf-12.2 {Tcl_UtfToLower} { string tolower ABC } abc test utf-12.3 {Tcl_UtfToLower} { string tolower ÃGH } ãgh test utf-12.4 {Tcl_UtfToLower} { string tolower ǢGH } ǣgh test utf-12.5 {Tcl_UtfToLower Georgian (new in Unicode 11)} { string tolower აᲐ } აა test utf-12.6 {Tcl_UtfToLower low/high surrogate)} utf32 { string tolower \uDC24\uD824 } \uDC24\uD824 test utf-12.7 {Tcl_UtfToLower beyond U+FFFF} fullutf { string tolower 𐐀 } 𐐨 test utf-12.8 {Tcl_UtfToLower beyond U+FFFF} fullutf { string tolower 𐐀 |
︙ | ︙ | |||
1157 1158 1159 1160 1161 1162 1163 | } Dzab test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle აᲐ } აᲐ test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle Აა } Აა | | | 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 | } Dzab test utf-13.5 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle აᲐ } აᲐ test utf-13.6 {Tcl_UtfToTitle Georgian (new in Unicode 11)} { string totitle Აა } Აა test utf-13.7 {Tcl_UtfToTitle low/high surrogate)} utf32 { string totitle \uDC24\uD824 } \uDC24\uD824 test utf-13.8 {Tcl_UtfToTitle beyond U+FFFF} fullutf { string totitle 𐐨𐐀 } 𐐀𐐨 test utf-13.9 {Tcl_UtfToTitle beyond U+FFFF} fullutf { string totitle 𐐨𐐀 |
︙ | ︙ |