Attachment "stringtrim.diff" to
ticket [d43f96c1a8]
added by
chw
2021-02-14 10:03:05.
Index: generic/tclUtil.c
==================================================================
--- generic/tclUtil.c
+++ generic/tclUtil.c
@@ -1683,58 +1683,112 @@
const char *trim, /* String of trim characters... */
int numTrim) /* ...and its length in bytes */
/* Calls to TclUtfToUniChar() in this routine
* rely on (trim[numTrim] == '\0'). */
{
- const char *pp, *p = bytes + numBytes;
+ const char *pp, *p = bytes + numBytes, *q;
+ int ch, i;
Tcl_UniChar ch1 = 0;
+ Tcl_DString ds;
/* Empty strings -> nothing to do */
if ((numBytes == 0) || (numTrim == 0)) {
return 0;
}
+ /*
+ * Fast path check: if every byte of the trim string is ASCII, no UTF-8 decoding is needed.
+ */
+
+ for (i = 0; i < numTrim; i++) {
+ if (UCHAR(trim[i]) >= 0x80) {
+ goto slowPath;
+ }
+ }
+
+ /*
+ * Same logic as below, but comparing raw bytes: an ASCII byte never occurs inside a multi-byte UTF-8 sequence.
+ */
+
+ do {
+ for (i = 0; i < numTrim; i++) {
+ if (p[-1] == trim[i]) {
+ break;
+ }
+ }
+ if (i >= numTrim) {
+ break;
+ }
+ p--;
+ } while (p > bytes);
+
+ goto done;
+
+ /*
+ * Decode the UTF-8 trim string into an array of int code points stored in ds.
+ */
+
+slowPath:
+ Tcl_DStringInit(&ds);
+ q = trim;
+ do {
+ q += TclUtfToUCS4(q, &ch);
+ Tcl_DStringAppend(&ds, (char *) &ch, sizeof(int));
+ } while (q < trim + numTrim);
+ numTrim = Tcl_DStringLength(&ds) / sizeof(int);
+
/*
* Outer loop: iterate over string to be trimmed.
*/
do {
- const char *q = trim;
- int pInc = 0, bytesLeft = numTrim;
- Tcl_UniChar ch2 = 0;
+ int pInc = 0;
pp = TclUtfPrev(p, bytes);
do {
pp += pInc;
pInc = TclUtfToUniChar(pp, &ch1);
} while (pp + pInc < p);
+ ch = ch1;
+
+#if TCL_UTF_MAX <= 4
+ if ((ch & 0xFC00) == 0xDC00) {
+ int ch2;
+ const char *ppp = pp;
+
+ ppp = TclUtfPrev(pp, bytes);
+ TclUtfToUCS4(ppp, &ch2);
+ if (ch2 > 0x10000) {
+ ch = ch2;
+ pp = ppp;
+ }
+ }
+#endif
/*
* Inner loop: scan trim string for match to current character.
*/
- do {
- int qInc = TclUtfToUniChar(q, &ch2);
-
- if (ch1 == ch2) {
+ for (i = 0; i < numTrim; i++) {
+ if (ch == ((int *)Tcl_DStringValue(&ds))[i]) {
break;
}
+ }
- q += qInc;
- bytesLeft -= qInc;
- } while (bytesLeft);
-
- if (bytesLeft == 0) {
+ if (i >= numTrim) {
/*
* No match; trim task done; *p is last non-trimmed char.
*/
break;
}
p = pp;
} while (p > bytes);
+ Tcl_DStringFree(&ds);
+
+done:
return numBytes - (p - bytes);
}
/*
*----------------------------------------------------------------------
@@ -1763,44 +1817,79 @@
const char *trim, /* String of trim characters... */
int numTrim) /* ...and its length in bytes */
/* Calls to TclUtfToUniChar() in this routine
* rely on (trim[numTrim] == '\0'). */
{
- const char *p = bytes;
- Tcl_UniChar ch1 = 0;
+ const char *p = bytes, *q;
+ int ch, i;
+ Tcl_DString ds;
/* Empty strings -> nothing to do */
if ((numBytes == 0) || (numTrim == 0)) {
return 0;
}
+ /*
+ * Fast path check: if every byte of the trim string is ASCII, no UTF-8 decoding is needed.
+ */
+
+ for (i = 0; i < numTrim; i++) {
+ if (UCHAR(trim[i]) >= 0x80) {
+ goto slowPath;
+ }
+ }
+
+ /*
+ * Same logic as below, but comparing raw bytes: an ASCII byte never occurs inside a multi-byte UTF-8 sequence.
+ */
+
+ do {
+ for (i = 0; i < numTrim; i++) {
+ if (p[0] == trim[i]) {
+ break;
+ }
+ }
+ if (i >= numTrim) {
+ break;
+ }
+ p++;
+ numBytes--;
+ } while (numBytes > 0);
+
+ goto done;
+
+ /*
+ * Decode the UTF-8 trim string into an array of int code points stored in ds.
+ */
+
+slowPath:
+ Tcl_DStringInit(&ds);
+ q = trim;
+ do {
+ q += TclUtfToUCS4(q, &ch);
+ Tcl_DStringAppend(&ds, (char *) &ch, sizeof(int));
+ } while (q < trim + numTrim);
+ numTrim = Tcl_DStringLength(&ds) / sizeof(int);
+
/*
* Outer loop: iterate over string to be trimmed.
*/
do {
- Tcl_UniChar ch2 = 0;
- int pInc = TclUtfToUniChar(p, &ch1);
- const char *q = trim;
- int bytesLeft = numTrim;
+ int pInc = TclUtfToUCS4(p, &ch);
/*
* Inner loop: scan trim string for match to current character.
*/
- do {
- int qInc = TclUtfToUniChar(q, &ch2);
-
- if (ch1 == ch2) {
+ for (i = 0; i < numTrim; i++) {
+ if (ch == ((int *)Tcl_DStringValue(&ds))[i]) {
break;
}
+ }
- q += qInc;
- bytesLeft -= qInc;
- } while (bytesLeft);
-
- if (bytesLeft == 0) {
+ if (i >= numTrim) {
/*
* No match; trim task done; *p is first non-trimmed char.
*/
break;
@@ -1808,10 +1897,13 @@
p += pInc;
numBytes -= pInc;
} while (numBytes > 0);
+ Tcl_DStringFree(&ds);
+
+done:
return p - bytes;
}
/*
*----------------------------------------------------------------------
Index: tests/string.test
==================================================================
--- tests/string.test
+++ tests/string.test
@@ -1643,10 +1643,37 @@
string wordend "xyz\u2045de fg" 0
} 3
test string-21.14 {string wordend, unicode} {
string wordend "\uC700\uC700 abc" 8
} 6
+test string-21.15 {string trim, unicode} {
+ string trim "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "Hello world!"
+test string-21.16 {string trimleft, unicode} {
+ string trimleft "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "Hello world!\ud83d\ude02"
+test string-21.17 {string trimright, unicode} {
+ string trimright "\ud83d\ude02Hello world!\ud83d\ude02" \ud83d\ude02
+} "\ud83d\ude02Hello world!"
+test string-21.18 {string trim, unicode} {
+ string trim "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.19 {string trimleft, unicode} {
+ string trimleft "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.20 {string trimright, unicode} {
+ string trimright "\uf602Hello world!\uf602" \ud83d\ude02
+} "\uf602Hello world!\uf602"
+test string-21.21 {string trim, unicode} {
+ string trim "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
+test string-21.22 {string trimleft, unicode} {
+ string trimleft "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
+test string-21.23 {string trimright, unicode} {
+ string trimright "\ud83d\ude02Hello world!\ud83d\ude02" \ud93d\ude02
+} "\ud83d\ude02Hello world!\ud83d\ude02"
test string-22.1 {string wordstart} {
list [catch {string word a} msg] $msg
} {1 {unknown or ambiguous subcommand "word": must be bytelength, cat, compare, equal, first, index, is, last, length, map, match, range, repeat, replace, reverse, tolower, totitle, toupper, trim, trimleft, trimright, wordend, or wordstart}}
test string-22.2 {string wordstart} {