Index: doc/StringObj.3 ================================================================== --- doc/StringObj.3 +++ doc/StringObj.3 @@ -35,11 +35,11 @@ \fBTcl_GetUnicodeFromObj\fR(\fIobjPtr, lengthPtr\fR) .sp Tcl_UniChar * \fBTcl_GetUnicode\fR(\fIobjPtr\fR) .sp -Tcl_UniChar +int \fBTcl_GetUniChar\fR(\fIobjPtr, index\fR) .sp int \fBTcl_GetCharLength\fR(\fIobjPtr\fR) .sp @@ -202,11 +202,12 @@ the caller. The procedure \fBTcl_GetUnicode\fR is used in the common case where the caller does not need the length of the unicode string representation. .PP \fBTcl_GetUniChar\fR returns the \fIindex\fR'th character in the -value's Unicode representation. +value's Unicode representation. If the index is out of range or +it references a low surrogate preceded by a high surrogate, it returns -1; .PP \fBTcl_GetRange\fR returns a newly created value comprised of the characters between \fIfirst\fR and \fIlast\fR (inclusive) in the value's Unicode representation. If the value's Unicode representation is invalid, the Unicode representation is regenerated Index: doc/ToUpper.3 ================================================================== --- doc/ToUpper.3 +++ doc/ToUpper.3 @@ -11,17 +11,17 @@ Tcl_UniCharToUpper, Tcl_UniCharToLower, Tcl_UniCharToTitle, Tcl_UtfToUpper, Tcl_UtfToLower, Tcl_UtfToTitle \- routines for manipulating the case of Unicode characters and UTF-8 strings .SH SYNOPSIS .nf \fB#include \fR .sp -Tcl_UniChar +int \fBTcl_UniCharToUpper\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToLower\fR(\fIch\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharToTitle\fR(\fIch\fR) .sp int \fBTcl_UtfToUpper\fR(\fIstr\fR) .sp Index: doc/Utf.3 ================================================================== --- doc/Utf.3 +++ doc/Utf.3 @@ -61,11 +61,11 @@ \fBTcl_UtfNext\fR(\fIsrc\fR) .sp const char * \fBTcl_UtfPrev\fR(\fIsrc, start\fR) .sp -Tcl_UniChar +int \fBTcl_UniCharAtIndex\fR(\fIsrc, index\fR) .sp const char * \fBTcl_UtfAtIndex\fR(\fIsrc, index\fR) .sp @@ -138,10 +138,13 @@ and stores it as a Tcl_UniChar in \fI*chPtr\fR. The return value is the number of bytes read from \fIsrc\fR. The caller must ensure that the source buffer is long enough such that this routine does not run off the end and dereference non-existent or random memory; if the source buffer is known to be null-terminated, this will not happen. If the input is +a byte in the range 0x80 - 0x9F, \fBTcl_UtfToUniChar\fR assumes the +cp1252 encoding, stores the corresponding Tcl_UniChar in \fI*chPtr\fR +and returns 1. If the input is otherwise not in proper UTF-8 format, \fBTcl_UtfToUniChar\fR will store the first byte of \fIsrc\fR in \fI*chPtr\fR as a Tcl_UniChar between 0x0000 and 0x00ff and return 1. .PP \fBTcl_UniCharToUtfDString\fR converts the given Unicode string Index: generic/tcl.decls ================================================================== --- generic/tcl.decls +++ generic/tcl.decls @@ -1143,20 +1143,20 @@ declare 319 { void Tcl_ThreadQueueEvent(Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position) } declare 320 { - Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index) + int Tcl_UniCharAtIndex(const char *src, int index) } declare 321 { - Tcl_UniChar Tcl_UniCharToLower(int ch) + int Tcl_UniCharToLower(int ch) } declare 322 { - Tcl_UniChar Tcl_UniCharToTitle(int ch) + int Tcl_UniCharToTitle(int ch) } declare 323 { - Tcl_UniChar Tcl_UniCharToUpper(int ch) + int Tcl_UniCharToUpper(int ch) } declare 324 { int Tcl_UniCharToUtf(int ch, char *buf) } declare 325 { @@ -1346,11 +1346,11 @@ } declare 380 { int Tcl_GetCharLength(Tcl_Obj *objPtr) } declare 381 { - Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index) + int Tcl_GetUniChar(Tcl_Obj *objPtr, int index) } declare 382 { Tcl_UniChar *Tcl_GetUnicode(Tcl_Obj *objPtr) } declare 383 { Index: generic/tcl.h ================================================================== --- generic/tcl.h +++ generic/tcl.h @@ -2153,11 +2153,11 @@ * is the default and recommended mode. UCS-4 is experimental and not * recommended. It works for the core, but most extensions expect UCS-2. */ #ifndef TCL_UTF_MAX -#define TCL_UTF_MAX 3 +#define TCL_UTF_MAX 4 #endif /* * This represents a Unicode character. Any changes to this should also be * reflected in regcustom.h. @@ -2343,14 +2343,14 @@ int result); /* *---------------------------------------------------------------------------- * The following constant is used to test for older versions of Tcl in the - * stubs tables. + * stubs tables. If TCL_UTF_MAX>4 use a different value. */ -#define TCL_STUB_MAGIC ((int) 0xFCA3BACF) +#define TCL_STUB_MAGIC ((int) 0xFCA3BACF + (TCL_UTF_MAX>4)) /* * The following function is required to be defined in all stubs aware * extensions. The function is actually implemented in the stub library, not * the main Tcl library, although there is a trivial implementation in the Index: generic/tclCmdMZ.c ================================================================== --- generic/tclCmdMZ.c +++ generic/tclCmdMZ.c @@ -1417,20 +1417,24 @@ Tcl_WrongNumArgs(interp, 1, objv, "string charIndex"); return TCL_ERROR; } /* - * Get the char length to calulate what 'end' means. + * Get the char length to calculate what 'end' means. */ length = Tcl_GetCharLength(objv[1]); if (TclGetIntForIndexM(interp, objv[2], length-1, &index) != TCL_OK) { return TCL_ERROR; } if ((index >= 0) && (index < length)) { - Tcl_UniChar ch = Tcl_GetUniChar(objv[1], index); + int ch = Tcl_GetUniChar(objv[1], index); + + if (ch == -1) { + return TCL_OK; + } /* * If we have a ByteArray object, we're careful to generate a new * bytearray for a result. */ Index: generic/tclDecls.h ================================================================== --- generic/tclDecls.h +++ generic/tclDecls.h @@ -960,17 +960,17 @@ EXTERN void Tcl_ThreadAlert(Tcl_ThreadId threadId); /* 319 */ EXTERN void Tcl_ThreadQueueEvent(Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 320 */ -EXTERN Tcl_UniChar Tcl_UniCharAtIndex(const char *src, int index); +EXTERN int Tcl_UniCharAtIndex(const char *src, int index); /* 321 */ -EXTERN Tcl_UniChar Tcl_UniCharToLower(int ch); +EXTERN int Tcl_UniCharToLower(int ch); /* 322 */ -EXTERN Tcl_UniChar Tcl_UniCharToTitle(int ch); +EXTERN int Tcl_UniCharToTitle(int ch); /* 323 */ -EXTERN Tcl_UniChar Tcl_UniCharToUpper(int ch); +EXTERN int Tcl_UniCharToUpper(int ch); /* 324 */ EXTERN int Tcl_UniCharToUtf(int ch, char *buf); /* 325 */ EXTERN const char * Tcl_UtfAtIndex(const char *src, int index); /* 326 */ @@ -1121,11 +1121,11 @@ EXTERN void Tcl_SetUnicodeObj(Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 380 */ EXTERN int Tcl_GetCharLength(Tcl_Obj *objPtr); /* 381 */ -EXTERN Tcl_UniChar Tcl_GetUniChar(Tcl_Obj *objPtr, int index); +EXTERN int Tcl_GetUniChar(Tcl_Obj *objPtr, int index); /* 382 */ EXTERN Tcl_UniChar * Tcl_GetUnicode(Tcl_Obj *objPtr); /* 383 */ EXTERN Tcl_Obj * Tcl_GetRange(Tcl_Obj *objPtr, int first, int last); /* 384 */ @@ -2191,14 +2191,14 @@ void (*tcl_SaveResult) (Tcl_Interp *interp, Tcl_SavedResult *statePtr); /* 315 */ int (*tcl_SetSystemEncoding) (Tcl_Interp *interp, const char *name); /* 316 */ Tcl_Obj * (*tcl_SetVar2Ex) (Tcl_Interp *interp, const char *part1, const char *part2, Tcl_Obj *newValuePtr, int flags); /* 317 */ void (*tcl_ThreadAlert) (Tcl_ThreadId threadId); /* 318 */ void (*tcl_ThreadQueueEvent) (Tcl_ThreadId threadId, Tcl_Event *evPtr, Tcl_QueuePosition position); /* 319 */ - Tcl_UniChar (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ - Tcl_UniChar (*tcl_UniCharToLower) (int ch); /* 321 */ - Tcl_UniChar (*tcl_UniCharToTitle) (int ch); /* 322 */ - Tcl_UniChar (*tcl_UniCharToUpper) (int ch); /* 323 */ + int (*tcl_UniCharAtIndex) (const char *src, int index); /* 320 */ + int (*tcl_UniCharToLower) (int ch); /* 321 */ + int (*tcl_UniCharToTitle) (int ch); /* 322 */ + int (*tcl_UniCharToUpper) (int ch); /* 323 */ int (*tcl_UniCharToUtf) (int ch, char *buf); /* 324 */ const char * (*tcl_UtfAtIndex) (const char *src, int index); /* 325 */ int (*tcl_UtfCharComplete) (const char *src, int length); /* 326 */ int (*tcl_UtfBackslash) (const char *src, int *readPtr, char *dst); /* 327 */ const char * (*tcl_UtfFindFirst) (const char *src, int ch); /* 328 */ @@ -2252,11 +2252,11 @@ int (*tcl_RegExpExecObj) (Tcl_Interp *interp, Tcl_RegExp regexp, Tcl_Obj *textObj, int offset, int nmatches, int flags); /* 376 */ void (*tcl_RegExpGetInfo) (Tcl_RegExp regexp, Tcl_RegExpInfo *infoPtr); /* 377 */ Tcl_Obj * (*tcl_NewUnicodeObj) (const Tcl_UniChar *unicode, int numChars); /* 378 */ void (*tcl_SetUnicodeObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int numChars); /* 379 */ int (*tcl_GetCharLength) (Tcl_Obj *objPtr); /* 380 */ - Tcl_UniChar (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ + int (*tcl_GetUniChar) (Tcl_Obj *objPtr, int index); /* 381 */ Tcl_UniChar * (*tcl_GetUnicode) (Tcl_Obj *objPtr); /* 382 */ Tcl_Obj * (*tcl_GetRange) (Tcl_Obj *objPtr, int first, int last); /* 383 */ void (*tcl_AppendUnicodeToObj) (Tcl_Obj *objPtr, const Tcl_UniChar *unicode, int length); /* 384 */ int (*tcl_RegExpMatchObj) (Tcl_Interp *interp, Tcl_Obj *textObj, Tcl_Obj *patternObj); /* 385 */ void (*tcl_SetNotifier) (Tcl_NotifierProcs *notifierProcPtr); /* 386 */ Index: generic/tclExecute.c ================================================================== --- generic/tclExecute.c +++ generic/tclExecute.c @@ -5296,19 +5296,19 @@ } else if (valuePtr->bytes && length == valuePtr->length) { objResultPtr = Tcl_NewStringObj((const char *) valuePtr->bytes+index, 1); } else { char buf[TCL_UTF_MAX]; - Tcl_UniChar ch = Tcl_GetUniChar(valuePtr, index); + int ch = Tcl_GetUniChar(valuePtr, index); /* * This could be: Tcl_NewUnicodeObj((const Tcl_UniChar *)&ch, 1) * but creating the object as a string seems to be faster in * practical use. */ - length = Tcl_UniCharToUtf(ch, buf); + length = (ch != -1) ? Tcl_UniCharToUtf(ch, buf) : 0; objResultPtr = Tcl_NewStringObj(buf, length); } TRACE_APPEND(("\"%s\"\n", O2S(objResultPtr))); NEXT_INST_F(1, 2, 1); Index: generic/tclStringObj.c ================================================================== --- generic/tclStringObj.c +++ generic/tclStringObj.c @@ -458,12 +458,13 @@ /* *---------------------------------------------------------------------- * * Tcl_GetUniChar -- * - * Get the index'th Unicode character from the String object. The index - * is assumed to be in the appropriate range. + * Get the index'th Unicode character from the String object. If index + * is out of range or it references a low surrogate preceded by a high + * surrogate, the result = -1; * * Results: * Returns the index'th Unicode character in the Object. * * Side effects: @@ -470,27 +471,35 @@ * Fills unichar with the index'th Unicode character. * *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_GetUniChar( Tcl_Obj *objPtr, /* The object to get the Unicode charater * from. */ int index) /* Get the index'th Unicode character. */ { String *stringPtr; + int ch, length; + + if (index < 0) { + return -1; + } /* * Optimize the case where we're really dealing with a bytearray object * we don't need to convert to a string to perform the indexing operation. */ if (TclIsPureByteArray(objPtr)) { - unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (index >= length) { + return -1; + } - return (Tcl_UniChar) bytes[index]; + return (int) bytes[index]; } /* * OK, need to work with the object as a string. */ @@ -510,11 +519,30 @@ return (Tcl_UniChar) objPtr->bytes[index]; } FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - return stringPtr->unicode[index]; + + if (index >= stringPtr->numChars) { + return -1; + } + ch = stringPtr->unicode[index]; +#if TCL_UTF_MAX == 4 + /* See: bug [11ae2be95dac9417] */ + if ((ch&0xF800) == 0xD800) { + if (ch&0x400) { + if ((index > 0) && ((stringPtr->unicode[index-1]&0xFC00) == 0xD800)) { + ch = -1; /* low surrogate preceded by high surrogate */ + } + } else if ((++index < stringPtr->numChars) + && ((stringPtr->unicode[index]&0xFC00) == 0xDC00)) { + /* high surrogate followed by low surrogate */ + ch = (((ch & 0x3FF) << 10) | (stringPtr->unicode[index] & 0x3FF)) + 0x10000; + } + } +#endif + return ch; } /* *---------------------------------------------------------------------- * @@ -610,19 +638,28 @@ int first, /* First index of the range. */ int last) /* Last index of the range. */ { Tcl_Obj *newObjPtr; /* The Tcl object to find the range of. */ String *stringPtr; + int length; + if (first < 0) { + first = 0; + } /* * Optimize the case where we're really dealing with a bytearray object * we don't need to convert to a string to perform the substring operation. */ if (TclIsPureByteArray(objPtr)) { - unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, NULL); - + unsigned char *bytes = Tcl_GetByteArrayFromObj(objPtr, &length); + if (last >= length) { + last = length - 1; + } + if (last < first) { + return Tcl_NewObj(); + } return Tcl_NewByteArrayObj(bytes+first, last-first+1); } /* * OK, need to work with the object as a string. @@ -638,10 +675,16 @@ if (stringPtr->numChars == -1) { TclNumUtfChars(stringPtr->numChars, objPtr->bytes, objPtr->length); } if (stringPtr->numChars == objPtr->length) { + if (last >= stringPtr->numChars) { + last = stringPtr->numChars - 1; + } + if (last < first) { + return Tcl_NewObj(); + } newObjPtr = Tcl_NewStringObj(objPtr->bytes + first, last-first+1); /* * Since we know the char length of the result, store it. */ @@ -652,11 +695,16 @@ return newObjPtr; } FillUnicodeRep(objPtr); stringPtr = GET_STRING(objPtr); } - + if (last > stringPtr->numChars) { + last = stringPtr->numChars; + } + if (last < first) { + return Tcl_NewObj(); + } #if TCL_UTF_MAX == 4 /* See: bug [11ae2be95dac9417] */ if ((first>0) && ((stringPtr->unicode[first]&0xFC00) == 0xDC00) && ((stringPtr->unicode[first-1]&0xFC00) == 0xD800)) { ++first; Index: generic/tclUtf.c ================================================================== --- generic/tclUtf.c +++ generic/tclUtf.c @@ -146,19 +146,26 @@ if (ch <= 0xFFFF) { #if TCL_UTF_MAX == 4 if ((ch & 0xF800) == 0xD800) { if (ch & 0x0400) { /* Low surrogate */ - buf[3] = (char) ((ch | 0x80) & 0xBF); - buf[2] |= (char) (((ch >> 6) | 0x80) & 0x8F); - return 4; + if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) + && ((buf[2] & 0xCF) == 0)) { + /* Previous Tcl_UniChar was a High surrogate, so combine */ + buf[3] = (char) ((ch & 0x3F) | 0x80); + buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); + return 4; + } + /* Previous Tcl_UniChar was not a High surrogate, so just output */ } else { /* High surrogate */ ch += 0x40; - buf[2] = (char) (((ch << 4) | 0x80) & 0xB0); - buf[1] = (char) (((ch >> 2) | 0x80) & 0xBF); - buf[0] = (char) (((ch >> 8) | 0xF0) & 0xF7); + /* Fill buffer with specific 3-byte (invalid) byte combination, + so following Low surrogate can recognize it and combine */ + buf[2] = (char) ((ch << 4) & 0x30); + buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); + buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); return 0; } } #endif goto three; @@ -265,10 +272,17 @@ * Side effects: * None. * *--------------------------------------------------------------------------- */ + +static const unsigned short cp1252[32] = { + 0x20ac, 0x81, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021, + 0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x8D, 0x017D, 0x8F, + 0x90, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, + 0x2DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x9D, 0x017E, 0x0178 +}; int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by @@ -282,15 +296,21 @@ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. - * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid + * Treats naked trail bytes 0x80 to 0x9F as valid characters from + * the cp1252 table. See: + * Also treats \0 and other naked trail bytes 0xA0 to 0xBF as valid * characters representing themselves. */ - *chPtr = (Tcl_UniChar) byte; + if ((unsigned)(byte-0x80) < (unsigned) 0x20) { + *chPtr = (Tcl_UniChar) cp1252[byte-0x80]; + } else { + *chPtr = (Tcl_UniChar) byte; + } return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed by a trail-byte. @@ -715,21 +735,37 @@ * None. * *--------------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharAtIndex( register const char *src, /* The UTF-8 string to dereference. */ register int index) /* The position of the desired character. */ { Tcl_UniChar ch = 0; + int fullchar = 0; +#if TCL_UTF_MAX == 4 + int len = 1; +#endif while (index-- >= 0) { +#if TCL_UTF_MAX == 4 + src += (len = TclUtfToUniChar(src, &ch)); +#else src += TclUtfToUniChar(src, &ch); +#endif + } + fullchar = ch; +#if TCL_UTF_MAX == 4 + if (!len) { + /* If last Tcl_UniChar was an upper surrogate, combine with lower surrogate */ + (void)TclUtfToUniChar(src, &ch); + fullchar = (((fullchar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; } - return ch; +#endif + return fullchar; } /* *--------------------------------------------------------------------------- * @@ -833,11 +869,12 @@ int Tcl_UtfToUpper( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, upChar; + Tcl_UniChar ch = 0; + int upChar; char *src, *dst; int bytes; /* * Iterate over the string until we hit the terminating null. @@ -844,11 +881,18 @@ */ src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - upChar = Tcl_UniCharToUpper(ch); + upChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + upChar = (((upChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + upChar = Tcl_UniCharToUpper(upChar); /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the upper case * char to dst if its size is <= the original char. @@ -886,11 +930,12 @@ int Tcl_UtfToLower( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, lowChar; + Tcl_UniChar ch = 0; + int lowChar; char *src, *dst; int bytes; /* * Iterate over the string until we hit the terminating null. @@ -897,11 +942,18 @@ */ src = dst = str; while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); /* * To keep badly formed Utf strings from getting inflated by the * conversion (thereby causing a segfault), only copy the lower case * char to dst if its size is <= the original char. @@ -940,11 +992,12 @@ int Tcl_UtfToTitle( char *str) /* String to convert in place. */ { - Tcl_UniChar ch = 0, titleChar, lowChar; + Tcl_UniChar ch = 0; + int titleChar, lowChar; char *src, *dst; int bytes; /* * Capitalize the first character and then lowercase the rest of the @@ -953,11 +1006,18 @@ src = dst = str; if (*src) { bytes = TclUtfToUniChar(src, &ch); - titleChar = Tcl_UniCharToTitle(ch); + titleChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + titleChar = (((titleChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + titleChar = Tcl_UniCharToTitle(titleChar); if (bytes < TclUtfCount(titleChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -965,11 +1025,18 @@ } src += bytes; } while (*src) { bytes = TclUtfToUniChar(src, &ch); - lowChar = Tcl_UniCharToLower(ch); + lowChar = ch; + if (!bytes) { + /* TclUtfToUniChar only returns 0 for chars > 0xffff ! */ + bytes = TclUtfToUniChar(src, &ch); + /* Combine surrogates */ + lowChar = (((lowChar & 0x3ff) << 10) | (ch & 0x3ff)) + 0x10000; + } + lowChar = Tcl_UniCharToLower(lowChar); if (bytes < TclUtfCount(lowChar)) { memcpy(dst, src, (size_t) bytes); dst += bytes; } else { @@ -1250,20 +1317,22 @@ * None. * *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToUpper( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x04) { - ch -= GetDelta(info); + if (GetCaseType(info) & 0x04) { + ch -= GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* *---------------------------------------------------------------------- * @@ -1278,20 +1347,22 @@ * None. * *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToLower( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); - if (GetCaseType(info) & 0x02) { - ch += GetDelta(info); + if (GetCaseType(info) & 0x02) { + ch += GetDelta(info); + } } - return (Tcl_UniChar) ch; + return ch & 0x1FFFFF; } /* *---------------------------------------------------------------------- * @@ -1306,27 +1377,29 @@ * None. * *---------------------------------------------------------------------- */ -Tcl_UniChar +int Tcl_UniCharToTitle( int ch) /* Unicode character to convert. */ { - int info = GetUniCharInfo(ch); - int mode = GetCaseType(info); - - if (mode & 0x1) { - /* - * Subtract or add one depending on the original case. - */ - - ch += ((mode & 0x4) ? -1 : 1); - } else if (mode == 0x4) { - ch -= GetDelta(info); - } - return (Tcl_UniChar) ch; + if (!UNICODE_OUT_OF_RANGE(ch)) { + int info = GetUniCharInfo(ch); + int mode = GetCaseType(info); + + if (mode & 0x1) { + /* + * Subtract or add one depending on the original case. + */ + + ch += ((mode & 0x4) ? -1 : 1); + } else if (mode == 0x4) { + ch -= GetDelta(info); + } + } + return ch & 0x1FFFFF; } /* *---------------------------------------------------------------------- * Index: tests/string.test ================================================================== --- tests/string.test +++ tests/string.test @@ -485,10 +485,13 @@ run {string index [binary format I* {0x50515253 0x52}] -1} } {} test string-5.20.$noComp {string index, bytearray object out of bounds} { run {string index [binary format I* {0x50515253 0x52}] 20} } {} +test string-5.21 {string index, surrogates, bug [11ae2be95dac9417]} fullutf { + list [string index a\U100000b 1] [string index a\U100000b 2] [string index a\U100000b 3] +} [list \U100000 {} b] proc largest_int {} { # This will give us what the largest valid int on this machine is, # so we can test for overflow properly below on >32 bit systems