Tcl Source Code

Artifact [8b4255ad7d]
Login

Artifact 8b4255ad7d9d8cf13344a4835a83837ab724b2016c75deacc8f3a18fb703de6f:

Attachment "stringrev.diff" to ticket [22324bcbdf] added by chw 2021-02-13 05:17:43. (unpublished)
Index: generic/tclStringObj.c
==================================================================
--- generic/tclStringObj.c
+++ generic/tclStringObj.c
@@ -2897,10 +2897,13 @@
 TclStringReverse(
     Tcl_Obj *objPtr)
 {
     String *stringPtr;
     Tcl_UniChar ch = 0;
+#if TCL_UTF_MAX <= 4
+    int needFlip = 0;
+#endif
 
     if (TclIsPureByteArray(objPtr)) {
 	int numBytes;
 	unsigned char *from = Tcl_GetByteArrayFromObj(objPtr, &numBytes);
 
@@ -2915,36 +2918,70 @@
     stringPtr = GET_STRING(objPtr);
 
     if (stringPtr->hasUnicode) {
 	Tcl_UniChar *from = Tcl_GetUnicode(objPtr);
 	Tcl_UniChar *src = from + stringPtr->numChars;
+	Tcl_UniChar *to;
 
 	if (Tcl_IsShared(objPtr)) {
-	    Tcl_UniChar *to;
-
 	    /*
 	     * Create a non-empty, pure unicode value, so we can coax
 	     * Tcl_SetObjLength into growing the unicode rep buffer.
 	     */
 
 	    objPtr = Tcl_NewUnicodeObj(&ch, 1);
 	    Tcl_SetObjLength(objPtr, stringPtr->numChars);
 	    to = Tcl_GetUnicode(objPtr);
 	    while (--src >= from) {
+#if TCL_UTF_MAX <= 4
+		ch = *src;
+		if ((ch & 0xF800) == 0xD800) {
+		    needFlip = 1;
+		}
+		*to++ = ch;
+#else
 		*to++ = *src;
+#endif
 	    }
 	} else {
 	    /*
 	     * Reversing in place.
 	     */
 
+#if TCL_UTF_MAX <= 4
+	    to = src;
+#endif
 	    while (--src > from) {
 		ch = *src;
+#if TCL_UTF_MAX <= 4
+		if ((ch & 0xF800) == 0xD800) {
+		    needFlip = 1;
+		}
+#endif
 		*src = *from;
 		*from++ = ch;
 	    }
 	}
+#if TCL_UTF_MAX <= 4
+	if (needFlip) {
+	    /*
+	     * Flip back surrogate pairs. There might be a better way.
+	     */
+
+	    from = to - stringPtr->numChars;
+	    while (--to >= from) {
+		ch = *to;
+		if ((ch & 0xFC00) == 0xD800) {
+		    if ((to-1 >= from) && ((to[-1] & 0xFC00) == 0xDC00)) {
+			to[0] = to[-1];
+			to[-1] = ch;
+			--to;
+		    }
+		}
+	    }
+	}
+#endif
     }
 
     if (objPtr->bytes) {
 	int numChars = stringPtr->numChars;
 	int numBytes = objPtr->length;
@@ -2976,10 +3013,15 @@
 		 * skip calling Tcl_UtfCharComplete() here.
 		 */
 
 		int bytesInChar = TclUtfToUniChar(from, &ch);
 
+#if TCL_UTF_MAX <= 4
+		if ((ch & 0xF800) == 0xD800) {
+		    needFlip = 1;
+		}
+#endif
 		ReverseBytes((unsigned char *)to, (unsigned char *)from,
 			bytesInChar);
 		to += bytesInChar;
 		from += bytesInChar;
 		bytesLeft -= bytesInChar;
@@ -2989,10 +3031,35 @@
 	    from = to = objPtr->bytes;
 	    stringPtr->numChars = charCount;
 	}
 	/* Pass 2. Reverse all the bytes. */
 	ReverseBytes((unsigned char *)to, (unsigned char *)from, numBytes);
+
+#if TCL_UTF_MAX <= 4
+	if (needFlip) {
+	    /* Pass 3. Flip back surrogate pairs. Might be a better way. */
+	    numBytes = objPtr->length;
+	    from = to = objPtr->bytes;
+	    from += numBytes;	/* This is the end (The Doors, Jim Morrison) */
+	    while (to < from) {
+		Tcl_UniChar ch1, ch2;
+		int len, len2;
+
+		len = TclUtfToUniChar(to, &ch1);
+	 	if (((ch1 & 0xFC00) == 0xDC00) &&
+			(to + len + TCL_UTF_MAX <= from)) {
+		    len2 = TclUtfToUniChar(to + len, &ch2);
+		    if ((ch2 & 0xFC00) == 0xD800) {
+			Tcl_UniCharToUtf(ch2, to);
+			Tcl_UniCharToUtf(ch1, to + len);
+			len += len2;
+		    }
+		}
+		to += len;
+	     }
+	}
+#endif
     }
 
     return objPtr;
 }
 

Index: tests/string.test
==================================================================
--- tests/string.test
+++ tests/string.test
@@ -1799,10 +1799,28 @@
 } 030201
 test string-24.15 {string reverse command - pure bytearray} {
     binary scan [tcl::string::reverse [binary format H* 010203]] H* x
     set x
 } 030201
+test string-24.16 {string reverse command - surrogates} {
+    string reverse \u0444bulb\ud83d\ude02
+} \ud83d\ude02blub\u0444
+test string-24.17 {string reverse command - surrogates} {
+    string reverse \ud83d\ude02hello\ud83d\ude02
+} \ud83d\ude02olleh\ud83d\ude02
+test string-24.18 {string reverse command - surrogates} {
+    set s \u0444bulb\ud83d\ude02
+    # shim shimmery ...
+    string index $s 0
+    string reverse $s
+} \ud83d\ude02blub\u0444
+test string-24.19 {string reverse command - surrogates} {
+    set s \ud83d\ude02hello\ud83d\ude02
+    # shim shimmery ...
+    string index $s 0
+    string reverse $s
+} \ud83d\ude02olleh\ud83d\ude02
 
 test string-25.1 {string is list} {
     string is list {a b c}
 } 1
 test string-25.2 {string is list} {