Tcl Source Code

Artifact [a6980457b4]
Login

Artifact a6980457b4181b3957c742edb1e2843b34941425e8454e8da4604c3b023c3d3e:

Attachment "stringtrim.diff" to ticket [d43f96c1a8] added by chw 2021-02-14 10:03:05. (unpublished)
Index: generic/tclUtil.c
==================================================================
--- generic/tclUtil.c
+++ generic/tclUtil.c
@@ -1683,58 +1683,112 @@
     const char *trim,	/* String of trim characters... */
     int numTrim)	/* ...and its length in bytes */
 			/* Calls to TclUtfToUniChar() in this routine
 			 * rely on (trim[numTrim] == '\0'). */
 {
-    const char *pp, *p = bytes + numBytes;
+    const char *pp, *p = bytes + numBytes, *q;
+    int ch, i;
     Tcl_UniChar ch1 = 0;
+    Tcl_DString ds;
 
     /* Empty strings -> nothing to do */
     if ((numBytes == 0) || (numTrim == 0)) {
 	return 0;
     }
 
+    /*
+     * Fast path applies only if every trim byte is ASCII (< 0x80).
+     */
+
+    for (i = 0; i < numTrim; i++) {
+	if (UCHAR(trim[i]) >= 0x80) {
+	    goto slowPath;
+	}
+    }
+
+    /*
+     * ASCII trim set: compare trailing bytes directly, no UTF-8 decoding.
+     */
+
+    do {
+	for (i = 0; i < numTrim; i++) {
+	    if (p[-1] == trim[i]) {
+		break;
+	    }
+	}
+	if (i >= numTrim) {
+	    break;
+	}
+	p--;
+    } while (p > bytes);
+
+    goto done;
+
+    /*
+     * Decode the trim string once into an array of int values held in ds.
+     */
+
+slowPath:
+    Tcl_DStringInit(&ds);
+    q = trim;
+    do {
+	q += TclUtfToUCS4(q, &ch);
+	Tcl_DStringAppend(&ds, (char *) &ch, sizeof(int));
+    } while (q < trim + numTrim);
+    numTrim = Tcl_DStringLength(&ds) / sizeof(int);
+
     /*
      * Outer loop: iterate over string to be trimmed.
      */
 
     do {
-	const char *q = trim;
-	int pInc = 0, bytesLeft = numTrim;
-	Tcl_UniChar ch2 = 0;
+	int pInc = 0;
 
 	pp = TclUtfPrev(p, bytes);
 	do {
 	    pp += pInc;
  	    pInc = TclUtfToUniChar(pp, &ch1);
 	} while (pp + pInc < p);
+	ch = ch1;
+
+#if TCL_UTF_MAX <= 4
+	if ((ch & 0xFC00) == 0xDC00) {
+	    int ch2;
+	    const char *ppp;
+
+	    ppp = TclUtfPrev(pp, bytes);
+	    TclUtfToUCS4(ppp, &ch2);
+	    if (ch2 >= 0x10000) {	/* U+10000 itself is supplementary */
+		ch = ch2;
+		pp = ppp;
+	    }
+	}
+#endif
 
 	/*
 	 * Inner loop: scan trim string for match to current character.
 	 */
 
-	do {
-	    int qInc = TclUtfToUniChar(q, &ch2);
-
-	    if (ch1 == ch2) {
+	for (i = 0; i < numTrim; i++) {
+	    if (ch == ((int *)Tcl_DStringValue(&ds))[i]) {
 		break;
 	    }
+	}
 
-	    q += qInc;
-	    bytesLeft -= qInc;
-	} while (bytesLeft);
-
-	if (bytesLeft == 0) {
+	if (i >= numTrim) {
 	    /*
 	     * No match; trim task done; *p is last non-trimmed char.
 	     */
 
 	    break;
 	}
 	p = pp;
     } while (p > bytes);
 
+    Tcl_DStringFree(&ds);
+
+done:
     return numBytes - (p - bytes);
 }
 
 /*
  *----------------------------------------------------------------------
@@ -1763,44 +1817,79 @@
     const char *trim,	/* String of trim characters... */
     int numTrim)	/* ...and its length in bytes */
 			/* Calls to TclUtfToUniChar() in this routine
 			 * rely on (trim[numTrim] == '\0'). */
 {
-    const char *p = bytes;
-    Tcl_UniChar ch1 = 0;
+    const char *p = bytes, *q;
+    int ch, i;
+    Tcl_DString ds;
 
     /* Empty strings -> nothing to do */
     if ((numBytes == 0) || (numTrim == 0)) {
 	return 0;
     }
 
+    /*
+     * Fast path applies only if every trim byte is ASCII (< 0x80).
+     */
+
+    for (i = 0; i < numTrim; i++) {
+	if (UCHAR(trim[i]) >= 0x80) {
+	    goto slowPath;
+	}
+    }
+
+    /*
+     * ASCII trim set: compare leading bytes directly, no UTF-8 decoding.
+     */
+
+    do {
+	for (i = 0; i < numTrim; i++) {
+	    if (p[0] == trim[i]) {
+		break;
+	    }
+	}
+	if (i >= numTrim) {
+	    break;
+	}
+	p++;
+	numBytes--;
+    } while (numBytes > 0);
+
+    goto done;
+
+    /*
+     * Decode the trim string once into an array of int values held in ds.
+     */
+
+slowPath:
+    Tcl_DStringInit(&ds);
+    q = trim;
+    do {
+	q += TclUtfToUCS4(q, &ch);
+	Tcl_DStringAppend(&ds, (char *) &ch, sizeof(int));
+    } while (q < trim + numTrim);
+    numTrim = Tcl_DStringLength(&ds) / sizeof(int);
+
     /*
      * Outer loop: iterate over string to be trimmed.
      */
 
     do {
-	Tcl_UniChar ch2 = 0;
-	int pInc = TclUtfToUniChar(p, &ch1);
-	const char *q = trim;
-	int bytesLeft = numTrim;
+	int pInc = TclUtfToUCS4(p, &ch);
 
 	/*
 	 * Inner loop: scan trim string for match to current character.
 	 */
 
-	do {
-	    int qInc = TclUtfToUniChar(q, &ch2);
-
-	    if (ch1 == ch2) {
+	for (i = 0; i < numTrim; i++) {
+	    if (ch == ((int *)Tcl_DStringValue(&ds))[i]) {
 		break;
 	    }
+	}
 
-	    q += qInc;
-	    bytesLeft -= qInc;
-	} while (bytesLeft);
-
-	if (bytesLeft == 0) {
+	if (i >= numTrim) {
 	    /*
 	     * No match; trim task done; *p is first non-trimmed char.
 	     */
 
 	    break;
@@ -1808,10 +1897,13 @@
 
 	p += pInc;
 	numBytes -= pInc;
     } while (numBytes > 0);
 
+    Tcl_DStringFree(&ds);
+
+done:
     return p - bytes;
 }
 
 /*
  *----------------------------------------------------------------------

Index: tests/string.test
==================================================================
--- tests/string.test
+++ tests/string.test
@@ -1643,10 +1643,37 @@
     string wordend "xyz\u2045de fg" 0
 } 3
 test string-21.14 {string wordend, unicode} {
     string wordend "\uC700\uC700 abc" 8
 } 6
+test string-21.15 {string trim, unicode} {
+    string trim "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "Hello world!"
+test string-21.16 {string trimleft, unicode} {
+    string trimleft "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "Hello world!\ud83d\ude02"
+test string-21.17 {string trimright, unicode} {
+    string trimright "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "\ud83d\ude02Hello world!"
+test string-21.18 {string trim, unicode} {
+    string trim "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.19 {string trimleft, unicode} {
+    string trimleft "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.20 {string trimright, unicode} {
+    string trimright "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.21 {string trim, unicode} {
+    string trim "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
+test string-21.22 {string trimleft, unicode} {
+    string trimleft "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
+test string-21.23 {string trimright, unicode} {
+    string trimright "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
 
 test string-22.1 {string wordstart} {
     list [catch {string word a} msg] $msg
 } {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
 test string-22.2 {string wordstart} {