Attachment "TclUtfCasecmp-10.patch" to
ticket [1236896fff]
added by
rmax
2005-07-19 00:45:20.
Index: generic/tclCmdIL.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclCmdIL.c,v
retrieving revision 1.78
diff -u -r1.78 tclCmdIL.c
--- generic/tclCmdIL.c 17 Jul 2005 21:17:30 -0000 1.78
+++ generic/tclCmdIL.c 18 Jul 2005 17:43:18 -0000
@@ -3305,7 +3305,7 @@
dataType = INTEGER;
break;
case LSEARCH_NOCASE: /* -nocase */
- strCmpFn = strcasecmp;
+ strCmpFn = TclUtfCasecmp;
noCase = 1;
break;
case LSEARCH_NOT: /* -not */
@@ -3661,7 +3661,7 @@
*/
if (noCase) {
- match = (strcasecmp(bytes, patternBytes) == 0);
+ match = (TclUtfCasecmp(bytes, patternBytes) == 0);
} else {
match = (memcmp(bytes, patternBytes,
(size_t) length) == 0);
@@ -3883,6 +3883,7 @@
Tcl_SetObjResult(interp, listPtr);
return TCL_OK;
}
+
/*
*----------------------------------------------------------------------
@@ -4040,7 +4041,7 @@
sortInfo.sortMode = SORTMODE_INTEGER;
break;
case LSORT_NOCASE:
- sortInfo.strCmpFn = strcasecmp;
+ sortInfo.strCmpFn = TclUtfCasecmp;
break;
case LSORT_REAL:
sortInfo.sortMode = SORTMODE_REAL;
Index: generic/tclCmdMZ.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclCmdMZ.c,v
retrieving revision 1.127
diff -u -r1.127 tclCmdMZ.c
--- generic/tclCmdMZ.c 17 Jul 2005 21:17:37 -0000 1.127
+++ generic/tclCmdMZ.c 18 Jul 2005 17:43:18 -0000
@@ -2633,7 +2633,7 @@
matchVarObj = objv[i];
numMatchesSaved = -1;
} else if (index == OPT_NOCASE) {
- strCmpFn = strcasecmp;
+ strCmpFn = TclUtfCasecmp;
noCase = 1;
} else {
if (foundmode) {
Index: generic/tclInt.decls
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclInt.decls,v
retrieving revision 1.90
diff -u -r1.90 tclInt.decls
--- generic/tclInt.decls 5 Jul 2005 18:15:55 -0000 1.90
+++ generic/tclInt.decls 18 Jul 2005 17:43:18 -0000
@@ -897,6 +897,10 @@
TclPlatformType *TclGetPlatform(void)
}
+declare 225 generic {
+ int TclUtfCasecmp( CONST char *cs, CONST char *ct )
+}
+
##############################################################################
# Define the platform specific internal Tcl interface. These functions are
Index: generic/tclUtf.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclUtf.c,v
retrieving revision 1.34
diff -u -r1.34 tclUtf.c
--- generic/tclUtf.c 10 May 2005 18:34:51 -0000 1.34
+++ generic/tclUtf.c 18 Jul 2005 17:43:18 -0000
@@ -1042,15 +1042,27 @@
* representation of \u0001 (the byte 0x01.)
*/
while (numChars-- > 0) {
+ unsigned char s, t;
/*
* n must be interpreted as chars, not bytes.
* This should be called only when both strings are of
* at least n chars long (no need for \0 check)
*/
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- return (ch1 - ch2);
+ s = *cs;
+ t = *ct;
+ if ((s | t) < 0x80) {
+ /* We have 7bit ASCII characters in both strings */
+ if (s != t) {
+ return (s - t);
+ }
+ cs++;
+ ct++;
+ } else {
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+ return (ch1 - ch2);
+ }
}
}
return 0;
@@ -1080,20 +1092,40 @@
CONST char *ct; /* UTF string cs is compared to. */
unsigned long numChars; /* Number of UTF chars to compare. */
{
- Tcl_UniChar ch1, ch2;
while (numChars-- > 0) {
+ unsigned char s, t;
/*
* n must be interpreted as chars, not bytes.
* This should be called only when both strings are of
* at least n chars long (no need for \0 check)
*/
- cs += TclUtfToUniChar(cs, &ch1);
- ct += TclUtfToUniChar(ct, &ch2);
- if (ch1 != ch2) {
- ch1 = Tcl_UniCharToLower(ch1);
- ch2 = Tcl_UniCharToLower(ch2);
+ s = *cs;
+ t = *ct;
+ if ((s | t) < 0x80) {
+ /* We have 7bit ASCII characters in both strings */
+ if (s != t) {
+ if (s >= 'A' && s <= 'Z') {
+ s |= 0x20;
+ }
+ if (t >= 'A' && t <= 'Z') {
+ t |= 0x20;
+ }
+ if (s != t) {
+ return (s - t);
+ }
+ }
+ cs++;
+ ct++;
+ } else {
+ Tcl_UniChar ch1, ch2;
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
if (ch1 != ch2) {
- return (ch1 - ch2);
+ ch1 = Tcl_UniCharToLower(ch1);
+ ch2 = Tcl_UniCharToLower(ch2);
+ if (ch1 != ch2) {
+ return (ch1 - ch2);
+ }
}
}
}
@@ -1103,6 +1135,67 @@
/*
*----------------------------------------------------------------------
*
+ * TclUtfCasecmp --
+ *
+ * Compare string cs to string ct case insensitively.
+ *
+ * Results:
+ * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
+ *
+ * Side effects:
+ * None.
+ *
+ *----------------------------------------------------------------------
+ */
+
+int
+TclUtfCasecmp(cs, ct)
+ CONST char *cs; /* UTF string to compare to ct. */
+ CONST char *ct; /* UTF string cs is compared to. */
+{
+ unsigned char s, t;
+ do {
+ s = *cs;
+ t = *ct;
+ if ((s | t) < 0x80) {
+ /* We have 7bit ASCII characters in both strings */
+ if (s != t) {
+ if (s >= 'A' && s <= 'Z') {
+ s |= 0x20;
+ }
+ if (t >= 'A' && t <= 'Z') {
+ t |= 0x20;
+ }
+ if (s != t) {
+ return (s - t);
+ }
+ }
+ cs++; ct++;
+ } else {
+ Tcl_UniChar ch1, ch2;
+ cs += TclUtfToUniChar(cs, &ch1);
+ ct += TclUtfToUniChar(ct, &ch2);
+ if (ch1 != ch2) {
+ ch1 = Tcl_UniCharToLower(ch1);
+ ch2 = Tcl_UniCharToLower(ch2);
+ if (ch1 != ch2) {
+ return (ch1 - ch2);
+ }
+ }
+ }
+ } while (s != '\0');
+ /*
+ * At this point we can be sure that both strings have reached the
+ * end, because otherwise one of the returns above would alread
+ * have fired, that's also the reason why only one of the strings
+ * needs to be checked for a null character the while condition.
+ */
+ return 0;
+}
+
+/*
+ *----------------------------------------------------------------------
+ *
* Tcl_UniCharToUpper --
*
* Compute the uppercase equivalent of the given Unicode character.
Index: tests/cmdIL.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/cmdIL.test,v
retrieving revision 1.26
diff -u -r1.26 cmdIL.test
--- tests/cmdIL.test 1 Jun 2005 11:00:35 -0000 1.26
+++ tests/cmdIL.test 18 Jul 2005 17:43:18 -0000
@@ -431,6 +431,12 @@
} -result 0 -cleanup {
rename test_lsort ""
}
+test cmdIL-5.6 {lsort -nocase with non-ASCII characters} -body {
+ lsort -nocase {\u00c4a \u20aca \u00c4c \u00e4b Ac ab Aa}
+} -result "Aa ab Ac \u00c4a \u00e4b \u00c4c \u20aca"
+test cmdIL-5.7 {lsort -nocase with ASCII non-letter characters} {
+ list [lsort -nocase {@ `}] [lsort -nocase {` @}]
+} {{@ `} {@ `}}
# Compiled version
test cmdIL-6.1 {lassign command syntax} -body {