Attachment "None" to
ticket [403932ffff]
added by
dkf
2001-02-21 23:39:48.
Index: ChangeLog
===================================================================
RCS file: /cvsroot/tcl/tcl/ChangeLog,v
retrieving revision 1.368
diff -u -r1.368 ChangeLog
--- ChangeLog 2001/02/16 09:26:30 1.368
+++ ChangeLog 2001/02/21 16:32:43
@@ -1,3 +1,27 @@
+2001-02-21 Donal K. Fellows <[email protected]>
+
+ * tests/util.test (util-5.*): Rewrote so as to test
+ Tcl_StringMatch and not Tcl_UniCharCaseMatch. The sense of
+ util-5.15 and util-5.45 have changed as well, due to different
+ interpretation of failed patterns. Added util-5.52 to util-5.59
+
+ * tests/string.test (string-11.29,11.50-11.57): Test for new
+ matching behaviour which matches definition in manual page more
+ closely.
+
+ * generic/tclUtf.c (Tcl_UniCharCaseMatch): Reworked so that
+ after-* handling worked properly when matching case-insensitively
+ and tackled the handling of incomplete match sets.
+
+ * generic/tclUtil.c (Tcl_StringCaseMatch): Reworked to be properly
+ Unicode aware and handle case-insensitivity correctly, and tackled
+ the handling of incomplete match sets.
+
+2001-02-20 Donal K. Fellows <[email protected]>
+
+ * tests/string.test (string-11.49): Added test for case in [string
+ match] where it fails to handle UTF-8 characters correctly.
+
2001-02-15 Donal K. Fellows <[email protected]>
* generic/tclCmdMZ.c (Tcl_SplitObjCmd): Improved efficiency of
Index: generic/tclUtf.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclUtf.c,v
retrieving revision 1.14
diff -u -r1.14 tclUtf.c
--- generic/tclUtf.c 2000/06/05 23:36:21 1.14
+++ generic/tclUtf.c 2001/02/21 16:32:43
@@ -1684,6 +1684,9 @@
if (p == 0) {
return 1;
}
+ if (nocase) {
+ p = Tcl_UniCharToLower(p);
+ }
while (1) {
/*
* Optimization for matching - cruise through the string
@@ -1729,20 +1732,43 @@
if (p == '[') {
Tcl_UniChar startChar, endChar;
+ CONST Tcl_UniChar *savedPattern, *savedString;
+
+ /*
+ * Save these for later in case the pattern itself is malformed
+ */
+ savedPattern = pattern;
+ savedString = string;
pattern++;
ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string);
string++;
while (1) {
- if ((*pattern == ']') || (*pattern == 0)) {
+ if (*pattern == ']') {
return 0;
+ } else if (*pattern == 0) {
+ /*
+ * Pattern malformed (no closing bracket) so fall
+ * back to exact matching from the start of where
+ * the pattern should have been.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
}
startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern);
pattern++;
if (*pattern == '-') {
pattern++;
if (*pattern == 0) {
- return 0;
+ /*
+ * Pattern malformed (no closing bracket) so
+ * fall back to exact matching from the start
+ * of where the pattern should have been.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
}
endChar = (nocase ? Tcl_UniCharToLower(*pattern)
: *pattern);
@@ -1758,10 +1784,31 @@
break;
}
}
+ /*
+ * At this point, we've found a match for the character
+ * that was at the front of the string in the match set,
+ * but we're looking for the end of the set itself, since
+ * we should not match an ill-formed pattern.
+ */
while (*pattern != ']') {
+ if (*pattern == '-') {
+ /*
+ * Where you see a "-", you should have a
+ * following character; the preceding character
+ * will have already been swallowed correctly
+ * though.
+ */
+ pattern++;
+ }
if (*pattern == 0) {
- pattern--;
- break;
+ /*
+ * Bracketed match set was not properly closed.
+ * Fall back to exact matching from the start of
+ * where the match set was.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
}
pattern++;
}
@@ -1785,6 +1832,7 @@
* bytes of each string match.
*/
+ matchLiteral:
if (nocase) {
if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) {
return 0;
Index: generic/tclUtil.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclUtil.c,v
retrieving revision 1.18
diff -u -r1.18 tclUtil.c
--- generic/tclUtil.c 2000/05/08 21:59:59 1.18
+++ generic/tclUtil.c 2001/02/21 16:32:43
@@ -1145,7 +1145,8 @@
* Tcl_StringCaseMatch --
*
* See if a particular string matches a particular pattern.
- * Allows case insensitivity.
+ * Allows case insensitivity. This is the UTF-8 equivalent of
+ * the Unicode Tcl_UniCharCaseMatch.
*
* Results:
* The return value is 1 if string matches pattern, and
@@ -1167,7 +1168,6 @@
int nocase; /* 0 for case sensitive, 1 for insensitive */
{
int p;
- CONST char *pstart = pattern;
Tcl_UniChar ch1, ch2;
while (1) {
@@ -1209,17 +1209,28 @@
* character
*/
if ((p != '[') && (p != '?') && (p != '\\')) {
+ (void) Tcl_UtfToUniChar(pattern, &ch2);
if (nocase) {
- while (*string && (p != *string)) {
- ch2 = Tcl_UtfToUniChar(string, &ch1);
- if (p == Tcl_UniCharToLower(ch1)) {
+ ch2 = Tcl_UniCharToLower(ch2);
+ while (*string) {
+ int charLen = Tcl_UtfToUniChar(string, &ch1);
+ if (ch2 == Tcl_UniCharToLower(ch1)) {
break;
}
- string += ch2;
+ string += charLen;
}
} else {
- while (*string && (p != *string)) {
- string += Tcl_UtfToUniChar(string, &ch1);
+ /*
+ * There's no point in trying to make this code
+ * shorter, as the number of bytes you want to
+ * compare each time is non-constant.
+ */
+ while (*string) {
+ int charLen = Tcl_UtfToUniChar(string, &ch1);
+ if (ch2 == ch1) {
+ break;
+ }
+ string += charLen;
}
}
}
@@ -1252,16 +1263,33 @@
if (p == '[') {
Tcl_UniChar startChar, endChar;
+ CONST char *savedPattern, *savedString;
+ /*
+ * Save these for later in case the pattern itself is malformed
+ */
+ savedPattern = pattern;
+ savedString = string;
+
pattern++;
string += Tcl_UtfToUniChar(string, &ch1);
if (nocase) {
ch1 = Tcl_UniCharToLower(ch1);
}
while (1) {
- if ((*pattern == ']') || (*pattern == '\0')) {
+ if (*pattern == ']') {
return 0;
}
+ if (*pattern == '\0') {
+ /*
+ * Pattern malformed (no closing bracket) so fall
+ * back to exact matching from the start of where
+ * the pattern should have been.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
+ }
pattern += Tcl_UtfToUniChar(pattern, &startChar);
if (nocase) {
startChar = Tcl_UniCharToLower(startChar);
@@ -1269,7 +1297,14 @@
if (*pattern == '-') {
pattern++;
if (*pattern == '\0') {
- return 0;
+ /*
+ * Pattern malformed (no closing bracket) so
+ * fall back to exact matching from the start
+ * of where the pattern should have been.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
}
pattern += Tcl_UtfToUniChar(pattern, &endChar);
if (nocase) {
@@ -1287,12 +1322,33 @@
break;
}
}
+ /*
+ * At this point, we've found a match for the character
+ * that was at the front of the string in the match set,
+ * but we're looking for the end of the set itself, since
+ * we should not match an ill-formed pattern.
+ */
while (*pattern != ']') {
+ if (*pattern == '-') {
+ /*
+ * Where you see a "-", you should have a
+ * following character; the preceding character
+ * will have already been swallowed correctly
+ * though.
+ */
+ pattern++;
+ }
if (*pattern == '\0') {
- pattern = Tcl_UtfPrev(pattern, pstart);
- break;
+ /*
+ * Bracketed match set was not properly closed.
+ * Fall back to exact matching from the start of
+ * where the match set was.
+ */
+ pattern = savedPattern;
+ string = savedString;
+ goto matchLiteral;
}
- pattern++;
+ pattern += Tcl_UtfToUniChar(pattern, /*DUMMY*/ &endChar);
}
pattern++;
continue;
@@ -1315,6 +1371,7 @@
* bytes of each string match.
*/
+ matchLiteral:
string += Tcl_UtfToUniChar(string, &ch1);
pattern += Tcl_UtfToUniChar(pattern, &ch2);
if (nocase) {
Index: tests/string.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/string.test,v
retrieving revision 1.26
diff -u -r1.26 string.test
--- tests/string.test 2000/09/06 18:35:13 1.26
+++ tests/string.test 2001/02/21 16:32:43
@@ -777,8 +777,8 @@
test string-11.28 {string match} {
string match "" ""
} 1
-test string-11.29 {string match} {
- string match \[a a
+test string-11.29 {string match, incomplete sets are not sets} {
+ string match \[a \[a
} 1
test string-11.30 {string match, bad args} {
list [catch {string match - b c} msg] $msg
@@ -843,6 +843,34 @@
test string-11.48 {string match, *special case} {
string match "?\\*" "a*"
} 1
+test string-11.49 {string match; *, -nocase and UTF-8} {
+ string match -nocase [binary format I 717316707] \
+ [binary format I 2028036707]
+} 1
+test string-11.50 {string match, incomplete sets are not sets} {
+ string match \[a- \[a-
+} 1
+test string-11.51 {string match, incomplete sets are not sets} {
+ string match \[a-c \[a-c
+} 1
+test string-11.52 {string match, incomplete sets are not sets} {
+ string match \[a a
+} 0
+test string-11.53 {string match, incomplete sets are not sets} {
+ string match \[a- a
+} 0
+test string-11.54 {string match, incomplete sets are not sets} {
+ string match \[a-c b
+} 0
+test string-11.55 {string match, incomplete sets are not sets} {
+ string match \[a-cf b
+} 0
+test string-11.56 {string match, incomplete sets are not sets} {
+ string match \[a-cf- b
+} 0
+test string-11.57 {string match, incomplete sets are not sets} {
+ string match \[a-cf-h b
+} 0
test string-12.1 {string range} {
list [catch {string range} msg] $msg
Index: tests/util.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/util.test,v
retrieving revision 1.7
diff -u -r1.7 util.test
--- tests/util.test 2000/04/10 17:19:06 1.7
+++ tests/util.test 2001/02/21 16:32:43
@@ -63,181 +63,209 @@
concat a { } c
} {a c}
+proc stringMatchUTF {pattern string} {
+ # Forces use of Tcl_StringMatch, not Tcl_UniCharCaseMatch
+ switch -glob -- $string $pattern {return 1} default {return 0}
+}
test util-5.1 {Tcl_StringMatch} {
- string match ab*c abc
+ stringMatchUTF ab*c abc
} 1
test util-5.2 {Tcl_StringMatch} {
- string match ab**c abc
+ stringMatchUTF ab**c abc
} 1
test util-5.3 {Tcl_StringMatch} {
- string match ab* abcdef
+ stringMatchUTF ab* abcdef
} 1
test util-5.4 {Tcl_StringMatch} {
- string match *c abc
+ stringMatchUTF *c abc
} 1
test util-5.5 {Tcl_StringMatch} {
- string match *3*6*9 0123456789
+ stringMatchUTF *3*6*9 0123456789
} 1
test util-5.6 {Tcl_StringMatch} {
- string match *3*6*9 01234567890
+ stringMatchUTF *3*6*9 01234567890
} 0
test util-5.7 {Tcl_StringMatch: UTF-8} {
- string match *u \u4e4fu
+ stringMatchUTF *u \u4e4fu
} 1
test util-5.8 {Tcl_StringMatch} {
- string match a?c abc
+ stringMatchUTF a?c abc
} 1
test util-5.9 {Tcl_StringMatch: UTF-8} {
# skip one character in string
- string match a?c a\u4e4fc
+ stringMatchUTF a?c a\u4e4fc
} 1
test util-5.10 {Tcl_StringMatch} {
- string match a??c abc
+ stringMatchUTF a??c abc
} 0
test util-5.11 {Tcl_StringMatch} {
- string match ?1??4???8? 0123456789
+ stringMatchUTF ?1??4???8? 0123456789
} 1
test util-5.12 {Tcl_StringMatch} {
- string match {[abc]bc} abc
+ stringMatchUTF {[abc]bc} abc
} 1
test util-5.13 {Tcl_StringMatch: UTF-8} {
# string += Tcl_UtfToUniChar(string, &ch);
- string match "\[\u4e4fxy\]bc" "\u4e4fbc"
+ stringMatchUTF "\[\u4e4fxy\]bc" "\u4e4fbc"
} 1
test util-5.14 {Tcl_StringMatch} {
# if ((*pattern == ']') || (*pattern == '\0'))
# badly formed pattern
- string match {[]} {[]}
+ stringMatchUTF {[]} {[]}
} 0
test util-5.15 {Tcl_StringMatch} {
# if ((*pattern == ']') || (*pattern == '\0'))
- # badly formed pattern
+ # badly formed set in pattern is not a set
- string match {[} {[}
-} 0
+ stringMatchUTF {[} {[}
+} 1
test util-5.16 {Tcl_StringMatch} {
- string match {a[abc]c} abc
+ stringMatchUTF {a[abc]c} abc
} 1
test util-5.17 {Tcl_StringMatch: UTF-8} {
# pattern += Tcl_UtfToUniChar(pattern, &endChar);
# get 1 UTF-8 character
- string match "a\[a\u4e4fc]c" "a\u4e4fc"
+ stringMatchUTF "a\[a\u4e4fc]c" "a\u4e4fc"
} 1
test util-5.18 {Tcl_StringMatch: UTF-8} {
# pattern += Tcl_UtfToUniChar(pattern, &endChar);
# proper advance: wrong answer would match on UTF trail byte of \u4e4f
- string match {a[a\u4e4fc]c} [bytestring a\u008fc]
+ stringMatchUTF {a[a\u4e4fc]c} [bytestring a\u008fc]
} 0
test util-5.19 {Tcl_StringMatch: UTF-8} {
# pattern += Tcl_UtfToUniChar(pattern, &endChar);
# proper advance.
- string match {a[a\u4e4fc]c} "acc"
+ stringMatchUTF {a[a\u4e4fc]c} "acc"
} 1
test util-5.20 {Tcl_StringMatch} {
- string match {a[xyz]c} abc
+ stringMatchUTF {a[xyz]c} abc
} 0
test util-5.21 {Tcl_StringMatch} {
- string match {12[2-7]45} 12345
+ stringMatchUTF {12[2-7]45} 12345
} 1
test util-5.22 {Tcl_StringMatch: UTF-8 range} {
- string match "\[\u4e00-\u4e4f]" "0"
+ stringMatchUTF "\[\u4e00-\u4e4f]" "0"
} 0
test util-5.23 {Tcl_StringMatch: UTF-8 range} {
- string match "\[\u4e00-\u4e4f]" "\u4e33"
+ stringMatchUTF "\[\u4e00-\u4e4f]" "\u4e33"
} 1
test util-5.24 {Tcl_StringMatch: UTF-8 range} {
- string match "\[\u4e00-\u4e4f]" "\uff08"
+ stringMatchUTF "\[\u4e00-\u4e4f]" "\uff08"
} 0
test util-5.25 {Tcl_StringMatch} {
- string match {12[ab2-4cd]45} 12345
+ stringMatchUTF {12[ab2-4cd]45} 12345
} 1
test util-5.26 {Tcl_StringMatch} {
- string match {12[ab2-4cd]45} 12b45
+ stringMatchUTF {12[ab2-4cd]45} 12b45
} 1
test util-5.27 {Tcl_StringMatch} {
- string match {12[ab2-4cd]45} 12d45
+ stringMatchUTF {12[ab2-4cd]45} 12d45
} 1
test util-5.28 {Tcl_StringMatch} {
- string match {12[ab2-4cd]45} 12145
+ stringMatchUTF {12[ab2-4cd]45} 12145
} 0
test util-5.29 {Tcl_StringMatch} {
- string match {12[ab2-4cd]45} 12545
+ stringMatchUTF {12[ab2-4cd]45} 12545
} 0
test util-5.30 {Tcl_StringMatch: forwards range} {
- string match {[k-w]} "z"
+ stringMatchUTF {[k-w]} "z"
} 0
test util-5.31 {Tcl_StringMatch: forwards range} {
- string match {[k-w]} "w"
+ stringMatchUTF {[k-w]} "w"
} 1
test util-5.32 {Tcl_StringMatch: forwards range} {
- string match {[k-w]} "r"
+ stringMatchUTF {[k-w]} "r"
} 1
test util-5.33 {Tcl_StringMatch: forwards range} {
- string match {[k-w]} "k"
+ stringMatchUTF {[k-w]} "k"
} 1
test util-5.34 {Tcl_StringMatch: forwards range} {
- string match {[k-w]} "a"
+ stringMatchUTF {[k-w]} "a"
} 0
test util-5.35 {Tcl_StringMatch: reverse range} {
- string match {[w-k]} "z"
+ stringMatchUTF {[w-k]} "z"
} 0
test util-5.36 {Tcl_StringMatch: reverse range} {
- string match {[w-k]} "w"
+ stringMatchUTF {[w-k]} "w"
} 1
test util-5.37 {Tcl_StringMatch: reverse range} {
- string match {[w-k]} "r"
+ stringMatchUTF {[w-k]} "r"
} 1
test util-5.38 {Tcl_StringMatch: reverse range} {
- string match {[w-k]} "k"
+ stringMatchUTF {[w-k]} "k"
} 1
test util-5.39 {Tcl_StringMatch: reverse range} {
- string match {[w-k]} "a"
+ stringMatchUTF {[w-k]} "a"
} 0
test util-5.40 {Tcl_StringMatch: skip correct number of ']'} {
- string match {[A-]x} Ax
+ stringMatchUTF {[A-]x} Ax
} 0
test util-5.41 {Tcl_StringMatch: skip correct number of ']'} {
- string match {[A-]]x} Ax
+ stringMatchUTF {[A-]]x} Ax
} 1
test util-5.42 {Tcl_StringMatch: skip correct number of ']'} {
- string match {[A-]]x} \ue1x
+ stringMatchUTF {[A-]]x} \ue1x
} 0
test util-5.43 {Tcl_StringMatch: skip correct number of ']'} {
- string match \[A-]\ue1]x \ue1x
+ stringMatchUTF \[A-]\ue1]x \ue1x
} 1
test util-5.44 {Tcl_StringMatch: skip correct number of ']'} {
- string match {[A-]h]x} hx
+ stringMatchUTF {[A-]h]x} hx
} 1
test util-5.45 {Tcl_StringMatch} {
# if (*pattern == '\0')
- # badly formed pattern, still treats as a set
+ # badly formed pattern, so not treated as a set
- string match {[a} a
-} 1
+ stringMatchUTF {[a} a
+} 0
test util-5.46 {Tcl_StringMatch} {
- string match {a\*b} a*b
+ stringMatchUTF {a\*b} a*b
} 1
test util-5.47 {Tcl_StringMatch} {
- string match {a\*b} ab
+ stringMatchUTF {a\*b} ab
} 0
test util-5.48 {Tcl_StringMatch} {
- string match {a\*\?\[\]\\\x} "a*?\[\]\\x"
+ stringMatchUTF {a\*\?\[\]\\\x} "a*?\[\]\\x"
} 1
test util-5.49 {Tcl_StringMatch} {
- string match ** ""
+ stringMatchUTF ** ""
} 1
test util-5.50 {Tcl_StringMatch} {
- string match *. ""
+ stringMatchUTF *. ""
} 0
test util-5.51 {Tcl_StringMatch} {
- string match "" ""
+ stringMatchUTF "" ""
+} 1
+test util-5.52 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a- \[a-
+} 1
+test util-5.53 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a-c \[a-c
} 1
+test util-5.54 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a a
+} 0
+test util-5.55 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a- a
+} 0
+test util-5.56 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a-c b
+} 0
+test util-5.57 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a-cf b
+} 0
+test util-5.58 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a-cf- b
+} 0
+test util-5.59 {Tcl_StringMatch, incomplete sets are not sets} {
+ stringMatchUTF \[a-cf-h b
+} 0
test util-6.1 {Tcl_PrintDouble - using tcl_precision} {
concat x[expr 1.4]
@@ -293,15 +321,3 @@
# cleanup
::tcltest::cleanupTests
return
-
-
-
-
-
-
-
-
-
-
-
-