Tcl Source Code: Artifact [50ba8ceb47]

Artifact 50ba8ceb47ddca26d46c037e65d745147f79642c:

Attachment "None" to ticket [403932ffff] added by dkf 2001-02-21 23:39:48.
Index: ChangeLog
===================================================================
RCS file: /cvsroot/tcl/tcl/ChangeLog,v
retrieving revision 1.368
diff -u -r1.368 ChangeLog
--- ChangeLog	2001/02/16 09:26:30	1.368
+++ ChangeLog	2001/02/21 16:32:43
@@ -1,3 +1,27 @@
+2001-02-21  Donal K. Fellows  <[email protected]>
+
+	* tests/util.test (util-5.*): Rewrote so as to test
+	Tcl_StringMatch and not Tcl_UniCharCaseMatch.  The sense of
+	util-5.15 and util-5.45 have changed as well, due to different
+	interpretation of failed patterns.  Added util-5.52 to util-5.59
+
+	* tests/string.test (string-11.29,11.50-11.57): Test for new
+	matching behaviour which matches definition in manual page more
+	closely.
+
+	* generic/tclUtf.c (Tcl_UniCharCaseMatch): Reworked so that
+	after-* handling worked properly when matching case-insensitively
+	and tackled the handling of incomplete match sets.
+
+	* generic/tclUtil.c (Tcl_StringCaseMatch): Reworked to be properly
+	Unicode aware and handle case-insensitivity correctly, and tackled
+	the handling of incomplete match sets.
+
+2001-02-20  Donal K. Fellows  <[email protected]>
+
+	* tests/string.test (string-11.49): Added test for case in [string
+	match] where it fails to handle UTF-8 characters correctly.
+
 2001-02-15  Donal K. Fellows  <[email protected]>
 
 	* generic/tclCmdMZ.c (Tcl_SplitObjCmd): Improved efficiency of
Index: generic/tclUtf.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclUtf.c,v
retrieving revision 1.14
diff -u -r1.14 tclUtf.c
--- generic/tclUtf.c	2000/06/05 23:36:21	1.14
+++ generic/tclUtf.c	2001/02/21 16:32:43
@@ -1684,6 +1684,9 @@
 	    if (p == 0) {
 		return 1;
 	    }
+	    if (nocase) {
+		p = Tcl_UniCharToLower(p);
+	    }
 	    while (1) {
 		/*
 		 * Optimization for matching - cruise through the string
@@ -1729,20 +1732,43 @@
 	
 	if (p == '[') {
 	    Tcl_UniChar startChar, endChar;
+	    CONST Tcl_UniChar *savedPattern, *savedString;
+
+	    /*
+	     * Save these for later in case the pattern itself is malformed
+	     */
+	    savedPattern = pattern;
+	    savedString = string;
 
 	    pattern++;
 	    ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
 	    string++;
 	    while (1) {
-		if ((*pattern == ']') || (*pattern == 0)) {
+		if (*pattern == ']') {
 		    return 0;
+		} else if (*pattern == 0) {
+		    /*
+		     * Pattern malformed (no closing bracket) so fall
+		     * back to exact matching from the start of where
+		     * the pattern should have been.
+		     */
+		    pattern = savedPattern;
+		    string = savedString;
+		    goto matchLiteral;
 		}
 		startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
 		pattern++;
 		if (*pattern == '-') {
 		    pattern++;
 		    if (*pattern == 0) {
-			return 0;
+			/*
+			 * Pattern malformed (no closing bracket) so
+			 * fall back to exact matching from the start
+			 * of where the pattern should have been.
+			 */
+			pattern = savedPattern;
+			string = savedString;
+			goto matchLiteral;
 		    }
 		    endChar = (nocase ? Tcl_UniCharToLower(*pattern)
 			    : *pattern);
@@ -1758,10 +1784,31 @@
 		    break;
 		}
 	    }
+	    /*
+	     * At this point, we've found a match for the character
+	     * that was at the front of the string in the match set,
+	     * but we're looking for the end of the set itself, since
+	     * we should not match an ill-formed pattern.
+	     */
 	    while (*pattern != ']') {
+		if (*pattern == '-') {
+		    /*
+		     * Where you see a "-", you should have a
+		     * following character; the preceding character
+		     * will have already been swallowed correctly
+		     * though.
+		     */
+		    pattern++;
+		}
 		if (*pattern == 0) {
-		    pattern--;
-		    break;
+		    /*
+		     * Bracketed match set was not properly closed.
+		     * Fall back to exact matching from the start of
+		     * where the match set was.
+		     */
+		    pattern = savedPattern;
+		    string = savedString;
+		    goto matchLiteral;
 		}
 		pattern++;
 	    }
@@ -1785,6 +1832,7 @@
 	 * bytes of each string match.
 	 */
 
+    matchLiteral:
 	if (nocase) {
 	    if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
 		return 0;
Index: generic/tclUtil.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclUtil.c,v
retrieving revision 1.18
diff -u -r1.18 tclUtil.c
--- generic/tclUtil.c	2000/05/08 21:59:59	1.18
+++ generic/tclUtil.c	2001/02/21 16:32:43
@@ -1145,7 +1145,8 @@
  * Tcl_StringCaseMatch --
  *
  *	See if a particular string matches a particular pattern.
- *	Allows case insensitivity.
+ *	Allows case insensitivity.  This is the UTF-8 equivalent of
+ *	the Unicode Tcl_UniCharCaseMatch.
  *
  * Results:
  *	The return value is 1 if string matches pattern, and
@@ -1167,7 +1168,6 @@
     int nocase;			/* 0 for case sensitive, 1 for insensitive */
 {
     int p;
-    CONST char *pstart = pattern;
     Tcl_UniChar ch1, ch2;
     
     while (1) {
@@ -1209,17 +1209,28 @@
 		 * character
 		 */
 		if ((p != '[') && (p != '?') && (p != '\\')) {
+		    (void) Tcl_UtfToUniChar(pattern, &ch2);
 		    if (nocase) {
-			while (*string && (p != *string)) {
-			    ch2 = Tcl_UtfToUniChar(string, &ch1);
-			    if (p == Tcl_UniCharToLower(ch1)) {
+			ch2 = Tcl_UniCharToLower(ch2);
+			while (*string) {
+			    int charLen = Tcl_UtfToUniChar(string, &ch1);
+			    if (ch2 == Tcl_UniCharToLower(ch1)) {
 				break;
 			    }
-			    string += ch2;
+			    string += charLen;
 			}
 		    } else {
-			while (*string && (p != *string)) {
-			    string += Tcl_UtfToUniChar(string, &ch1);
+			/*
+			 * There's no point in trying to make this code
+			 * shorter, as the number of bytes you want to
+			 * compare each time is non-constant.
+			 */
+			while (*string) {
+			    int charLen = Tcl_UtfToUniChar(string, &ch1);
+			    if (ch2 == ch1) {
+				break;
+			    }
+			    string += charLen;
 			}
 		    }
 		}
@@ -1252,16 +1263,33 @@
 
 	if (p == '[') {
 	    Tcl_UniChar startChar, endChar;
+	    CONST char *savedPattern, *savedString;
 
+	    /*
+	     * Save these for later in case the pattern itself is malformed
+	     */
+	    savedPattern = pattern;
+	    savedString = string;
+
 	    pattern++;
 	    string += Tcl_UtfToUniChar(string, &ch1);
 	    if (nocase) {
 		ch1 = Tcl_UniCharToLower(ch1);
 	    }
 	    while (1) {
-		if ((*pattern == ']') || (*pattern == '\0')) {
+		if (*pattern == ']') {
 		    return 0;
 		}
+		if (*pattern == '\0') {
+		    /*
+		     * Pattern malformed (no closing bracket) so fall
+		     * back to exact matching from the start of where
+		     * the pattern should have been.
+		     */
+		    pattern = savedPattern;
+		    string = savedString;
+		    goto matchLiteral;
+		}
 		pattern += Tcl_UtfToUniChar(pattern, &startChar);
 		if (nocase) {
 		    startChar = Tcl_UniCharToLower(startChar);
@@ -1269,7 +1297,14 @@
 		if (*pattern == '-') {
 		    pattern++;
 		    if (*pattern == '\0') {
-			return 0;
+			/*
+			 * Pattern malformed (no closing bracket) so
+			 * fall back to exact matching from the start
+			 * of where the pattern should have been.
+			 */
+			pattern = savedPattern;
+			string = savedString;
+			goto matchLiteral;
 		    }
 		    pattern += Tcl_UtfToUniChar(pattern, &endChar);
 		    if (nocase) {
@@ -1287,12 +1322,33 @@
 		    break;
 		}
 	    }
+	    /*
+	     * At this point, we've found a match for the character
+	     * that was at the front of the string in the match set,
+	     * but we're looking for the end of the set itself, since
+	     * we should not match an ill-formed pattern.
+	     */
 	    while (*pattern != ']') {
+		if (*pattern == '-') {
+		    /*
+		     * Where you see a "-", you should have a
+		     * following character; the preceding character
+		     * will have already been swallowed correctly
+		     * though.
+		     */
+		    pattern++;
+		}
 		if (*pattern == '\0') {
-		    pattern = Tcl_UtfPrev(pattern, pstart);
-		    break;
+		    /*
+		     * Bracketed match set was not properly closed.
+		     * Fall back to exact matching from the start of
+		     * where the match set was.
+		     */
+		    pattern = savedPattern;
+		    string = savedString;
+		    goto matchLiteral;
 		}
-		pattern++;
+		pattern += Tcl_UtfToUniChar(pattern, /*DUMMY*/ &endChar);
 	    }
 	    pattern++;
 	    continue;
@@ -1315,6 +1371,7 @@
 	 * bytes of each string match.
 	 */
 
+    matchLiteral:
 	string  += Tcl_UtfToUniChar(string, &ch1);
 	pattern += Tcl_UtfToUniChar(pattern, &ch2);
 	if (nocase) {
Index: tests/string.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/string.test,v
retrieving revision 1.26
diff -u -r1.26 string.test
--- tests/string.test	2000/09/06 18:35:13	1.26
+++ tests/string.test	2001/02/21 16:32:43
@@ -777,8 +777,8 @@
 test string-11.28 {string match} {
     string match "" ""
 } 1
-test string-11.29 {string match} {
-    string match \[a a
+test string-11.29 {string match, incomplete sets are not sets} {
+    string match \[a \[a
 } 1
 test string-11.30 {string match, bad args} {
     list [catch {string match - b c} msg] $msg
@@ -843,6 +843,34 @@
 test string-11.48 {string match, *special case} {
     string match "?\\*" "a*"
 } 1
+test string-11.49 {string match; *, -nocase and UTF-8} {
+    string match -nocase [binary format I 717316707] \
+	    [binary format I 2028036707]
+} 1
+test string-11.50 {string match, incomplete sets are not sets} {
+    string match \[a- \[a-
+} 1
+test string-11.51 {string match, incomplete sets are not sets} {
+    string match \[a-c \[a-c
+} 1
+test string-11.52 {string match, incomplete sets are not sets} {
+    string match \[a a
+} 0
+test string-11.53 {string match, incomplete sets are not sets} {
+    string match \[a- a
+} 0
+test string-11.54 {string match, incomplete sets are not sets} {
+    string match \[a-c b
+} 0
+test string-11.55 {string match, incomplete sets are not sets} {
+    string match \[a-cf b
+} 0
+test string-11.56 {string match, incomplete sets are not sets} {
+    string match \[a-cf- b
+} 0
+test string-11.57 {string match, incomplete sets are not sets} {
+    string match \[a-cf-h b
+} 0
 
 test string-12.1 {string range} {
     list [catch {string range} msg] $msg
Index: tests/util.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/util.test,v
retrieving revision 1.7
diff -u -r1.7 util.test
--- tests/util.test	2000/04/10 17:19:06	1.7
+++ tests/util.test	2001/02/21 16:32:43
@@ -63,181 +63,209 @@
     concat a { } c
 } {a c}
 
+proc stringMatchUTF {pattern string} {
+    # Forces use of Tcl_StringMatch, not Tcl_UniCharCaseMatch
+    switch -glob -- $string $pattern {return 1} default {return 0}
+}
 test util-5.1 {Tcl_StringMatch} {
-    string match ab*c abc
+    stringMatchUTF ab*c abc
 } 1
 test util-5.2 {Tcl_StringMatch} {
-    string match ab**c abc
+    stringMatchUTF ab**c abc
 } 1
 test util-5.3 {Tcl_StringMatch} {
-    string match ab* abcdef
+    stringMatchUTF ab* abcdef
 } 1
 test util-5.4 {Tcl_StringMatch} {
-    string match *c abc
+    stringMatchUTF *c abc
 } 1
 test util-5.5 {Tcl_StringMatch} {
-    string match *3*6*9 0123456789
+    stringMatchUTF *3*6*9 0123456789
 } 1
 test util-5.6 {Tcl_StringMatch} {
-    string match *3*6*9 01234567890
+    stringMatchUTF *3*6*9 01234567890
 } 0
 test util-5.7 {Tcl_StringMatch: UTF-8} {
-    string match *u \u4e4fu
+    stringMatchUTF *u \u4e4fu
 } 1
 test util-5.8 {Tcl_StringMatch} {
-    string match a?c abc
+    stringMatchUTF a?c abc
 } 1
 test util-5.9 {Tcl_StringMatch: UTF-8} {
     # skip one character in string
 
-    string match a?c a\u4e4fc
+    stringMatchUTF a?c a\u4e4fc
 } 1
 test util-5.10 {Tcl_StringMatch} {
-    string match a??c abc
+    stringMatchUTF a??c abc
 } 0
 test util-5.11 {Tcl_StringMatch} {
-    string match ?1??4???8? 0123456789
+    stringMatchUTF ?1??4???8? 0123456789
 } 1
 test util-5.12 {Tcl_StringMatch} {
-    string match {[abc]bc} abc
+    stringMatchUTF {[abc]bc} abc
 } 1
 test util-5.13 {Tcl_StringMatch: UTF-8} {
     # string += Tcl_UtfToUniChar(string, &ch);
 
-    string match "\[\u4e4fxy\]bc" "\u4e4fbc"
+    stringMatchUTF "\[\u4e4fxy\]bc" "\u4e4fbc"
 } 1
 test util-5.14 {Tcl_StringMatch} {
     # if ((*pattern == ']') || (*pattern == '\0'))
     # badly formed pattern
 
-    string match {[]} {[]}
+    stringMatchUTF {[]} {[]}
 } 0
 test util-5.15 {Tcl_StringMatch} {
     # if ((*pattern == ']') || (*pattern == '\0'))
-    # badly formed pattern
+    # badly formed set in pattern is not a set
 
-    string match {[} {[}
-} 0
+    stringMatchUTF {[} {[}
+} 1
 test util-5.16 {Tcl_StringMatch} {
-    string match {a[abc]c} abc
+    stringMatchUTF {a[abc]c} abc
 } 1
 test util-5.17 {Tcl_StringMatch: UTF-8} {
     # pattern += Tcl_UtfToUniChar(pattern, &endChar);
     # get 1 UTF-8 character
 
-    string match "a\[a\u4e4fc]c" "a\u4e4fc"
+    stringMatchUTF "a\[a\u4e4fc]c" "a\u4e4fc"
 } 1
 test util-5.18 {Tcl_StringMatch: UTF-8} {
     # pattern += Tcl_UtfToUniChar(pattern, &endChar);
     # proper advance: wrong answer would match on UTF trail byte of \u4e4f
 
-    string match {a[a\u4e4fc]c} [bytestring a\u008fc]
+    stringMatchUTF {a[a\u4e4fc]c} [bytestring a\u008fc]
 } 0
 test util-5.19 {Tcl_StringMatch: UTF-8} {
     # pattern += Tcl_UtfToUniChar(pattern, &endChar);
     # proper advance.
 
-    string match {a[a\u4e4fc]c} "acc"
+    stringMatchUTF {a[a\u4e4fc]c} "acc"
 } 1
 test util-5.20 {Tcl_StringMatch} {
-    string match {a[xyz]c} abc
+    stringMatchUTF {a[xyz]c} abc
 } 0
 test util-5.21 {Tcl_StringMatch} {
-    string match {12[2-7]45} 12345
+    stringMatchUTF {12[2-7]45} 12345
 } 1
 test util-5.22 {Tcl_StringMatch: UTF-8 range} {
-    string match "\[\u4e00-\u4e4f]" "0"
+    stringMatchUTF "\[\u4e00-\u4e4f]" "0"
 } 0
 test util-5.23 {Tcl_StringMatch: UTF-8 range} {
-    string match "\[\u4e00-\u4e4f]" "\u4e33"
+    stringMatchUTF "\[\u4e00-\u4e4f]" "\u4e33"
 } 1
 test util-5.24 {Tcl_StringMatch: UTF-8 range} {
-    string match "\[\u4e00-\u4e4f]" "\uff08"
+    stringMatchUTF "\[\u4e00-\u4e4f]" "\uff08"
 } 0
 test util-5.25 {Tcl_StringMatch} {
-    string match {12[ab2-4cd]45} 12345
+    stringMatchUTF {12[ab2-4cd]45} 12345
 } 1
 test util-5.26 {Tcl_StringMatch} {
-    string match {12[ab2-4cd]45} 12b45
+    stringMatchUTF {12[ab2-4cd]45} 12b45
 } 1
 test util-5.27 {Tcl_StringMatch} {
-    string match {12[ab2-4cd]45} 12d45
+    stringMatchUTF {12[ab2-4cd]45} 12d45
 } 1
 test util-5.28 {Tcl_StringMatch} {
-    string match {12[ab2-4cd]45} 12145
+    stringMatchUTF {12[ab2-4cd]45} 12145
 } 0
 test util-5.29 {Tcl_StringMatch} {
-    string match {12[ab2-4cd]45} 12545
+    stringMatchUTF {12[ab2-4cd]45} 12545
 } 0
 test util-5.30 {Tcl_StringMatch: forwards range} {
-    string match {[k-w]} "z"
+    stringMatchUTF {[k-w]} "z"
 } 0
 test util-5.31 {Tcl_StringMatch: forwards range} {
-    string match {[k-w]} "w"
+    stringMatchUTF {[k-w]} "w"
 } 1
 test util-5.32 {Tcl_StringMatch: forwards range} {
-    string match {[k-w]} "r"
+    stringMatchUTF {[k-w]} "r"
 } 1
 test util-5.33 {Tcl_StringMatch: forwards range} {
-    string match {[k-w]} "k"
+    stringMatchUTF {[k-w]} "k"
 } 1
 test util-5.34 {Tcl_StringMatch: forwards range} {
-    string match {[k-w]} "a"
+    stringMatchUTF {[k-w]} "a"
 } 0
 test util-5.35 {Tcl_StringMatch: reverse range} {
-    string match {[w-k]} "z"
+    stringMatchUTF {[w-k]} "z"
 } 0
 test util-5.36 {Tcl_StringMatch: reverse range} {
-    string match {[w-k]} "w"
+    stringMatchUTF {[w-k]} "w"
 } 1
 test util-5.37 {Tcl_StringMatch: reverse range} {
-    string match {[w-k]} "r"
+    stringMatchUTF {[w-k]} "r"
 } 1
 test util-5.38 {Tcl_StringMatch: reverse range} {
-    string match {[w-k]} "k"
+    stringMatchUTF {[w-k]} "k"
 } 1
 test util-5.39 {Tcl_StringMatch: reverse range} {
-    string match {[w-k]} "a"
+    stringMatchUTF {[w-k]} "a"
 } 0
 test util-5.40 {Tcl_StringMatch: skip correct number of ']'} {
-    string match {[A-]x} Ax
+    stringMatchUTF {[A-]x} Ax
 } 0
 test util-5.41 {Tcl_StringMatch: skip correct number of ']'} {
-    string match {[A-]]x} Ax
+    stringMatchUTF {[A-]]x} Ax
 } 1
 test util-5.42 {Tcl_StringMatch: skip correct number of ']'} {
-    string match {[A-]]x} \ue1x
+    stringMatchUTF {[A-]]x} \ue1x
 } 0
 test util-5.43 {Tcl_StringMatch: skip correct number of ']'} {
-    string match \[A-]\ue1]x \ue1x
+    stringMatchUTF \[A-]\ue1]x \ue1x
 } 1
 test util-5.44 {Tcl_StringMatch: skip correct number of ']'} {
-    string match {[A-]h]x} hx
+    stringMatchUTF {[A-]h]x} hx
 } 1
 test util-5.45 {Tcl_StringMatch} {
     # if (*pattern == '\0')
-    # badly formed pattern, still treats as a set
+    # badly formed pattern, so not treated as a set
 
-    string match {[a} a
-} 1
+    stringMatchUTF {[a} a
+} 0
 test util-5.46 {Tcl_StringMatch} {
-    string match {a\*b} a*b
+    stringMatchUTF {a\*b} a*b
 } 1
 test util-5.47 {Tcl_StringMatch} {
-    string match {a\*b} ab
+    stringMatchUTF {a\*b} ab
 } 0
 test util-5.48 {Tcl_StringMatch} {
-    string match {a\*\?\[\]\\\x} "a*?\[\]\\x"
+    stringMatchUTF {a\*\?\[\]\\\x} "a*?\[\]\\x"
 } 1
 test util-5.49 {Tcl_StringMatch} {
-    string match ** ""
+    stringMatchUTF ** ""
 } 1
 test util-5.50 {Tcl_StringMatch} {
-    string match *. ""
+    stringMatchUTF *. ""
 } 0
 test util-5.51 {Tcl_StringMatch} {
-    string match "" ""
+    stringMatchUTF "" ""
+} 1
+test util-5.52 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a- \[a-
+} 1
+test util-5.53 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a-c \[a-c
 } 1
+test util-5.54 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a a
+} 0
+test util-5.55 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a- a
+} 0
+test util-5.56 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a-c b
+} 0
+test util-5.57 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a-cf b
+} 0
+test util-5.58 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a-cf- b
+} 0
+test util-5.59 {Tcl_StringMatch, incomplete sets are not sets} {
+    stringMatchUTF \[a-cf-h b
+} 0
 
 test util-6.1 {Tcl_PrintDouble - using tcl_precision} {
     concat x[expr 1.4]
@@ -293,15 +321,3 @@
 # cleanup
 ::tcltest::cleanupTests
 return
-
-
-
-
-
-
-
-
-
-
-
-