Tcl Source Code

Artifact [14b4c463fe]
Login

Artifact 14b4c463fe91807fa43a2f3f4b877888e6b21e31:

Attachment "tcl_bug_2826551.patch" to ticket [2948425fff] added by mdejong 2010-02-09 18:26:25.
2010-02-09  Mo DeJong  <[email protected]>

        Tcl Bug 2826551 regexp bugs related to
        -all -line and -start and newlines

	* generic/tclCmdMZ.c (Tcl_RegexpObjCmd):
        If -offset is given, treat it as the start
        of the line if the previous character was
        a newline. Fix nasty edge case where a zero
        length match would not advance the index.
        * tests/regexp.test: Add regression tests back
        ported from Jacl. Checks for a number of issues
        related to -line and newline handling. A few
        of tests were broken before the patch and
        continue to be broken, marked as knownBug.

Index: generic/tclCmdMZ.c
===================================================================
RCS file: /cvsroot/tcl/tcl/generic/tclCmdMZ.c,v
retrieving revision 1.198
diff -u -r1.198 tclCmdMZ.c
--- generic/tclCmdMZ.c	25 Dec 2009 22:45:05 -0000	1.198
+++ generic/tclCmdMZ.c	9 Feb 2010 11:14:20 -0000
@@ -108,7 +108,7 @@
     Tcl_Obj *const objv[])	/* Argument objects. */
 {
     int i, indices, match, about, offset, all, doinline, numMatchesSaved;
-    int cflags, eflags, stringLength;
+    int cflags, eflags, stringLength, matchLength;
     Tcl_RegExp regExpr;
     Tcl_Obj *objPtr, *startIndex = NULL, *resultPtr = NULL;
     Tcl_RegExpInfo info;
@@ -250,15 +250,6 @@
 	return TCL_ERROR;
     }
 
-    if (offset > 0) {
-	/*
-	 * Add flag if using offset (string is part of a larger string), so
-	 * that "^" won't match.
-	 */
-
-	eflags |= TCL_REG_NOTBOL;
-    }
-
     objc -= 2;
     objv += 2;
 
@@ -286,11 +277,25 @@
      */
 
     while (1) {
+        /*
+         * Pass either 0 or TCL_REG_NOTBOL in the eflags. Passing 
+         * TCL_REG_NOTBOL indicates that the character at offset
+         * should not be considered the start of the line. If
+         * for example the pattern {^} is passed and -start is positive,
+         * then the pattern will not match the start of the string
+         * unless the previous character is a newline.
+         */
+
+        if ((offset == 0) ||
+            ((offset > 0) &&
+                (Tcl_GetUniChar(objPtr,offset-1) == (Tcl_UniChar)'\n'))) {
+	    eflags = 0;
+        } else {
+	    eflags = TCL_REG_NOTBOL;
+        }
+
 	match = Tcl_RegExpExecObj(interp, regExpr, objPtr,
-		offset /* offset */, numMatchesSaved, eflags
-		| ((offset > 0 &&
-		(Tcl_GetUniChar(objPtr,offset-1) != (Tcl_UniChar)'\n'))
-		? TCL_REG_NOTBOL : 0));
+		offset, numMatchesSaved, eflags);
 
 	if (match < 0) {
 	    return TCL_ERROR;
@@ -408,12 +413,19 @@
 	 * offset never changes).
 	 */
 
-	if (info.matches[0].end == 0) {
+	matchLength = (info.matches[0].end - info.matches[0].start);
+
+	offset += info.matches[0].end;
+
+        /*
+         * A match of length zero could happen for {^} {$} or {.*}
+         * and in these cases we always want to bump the index up one.
+         */
+
+	if (matchLength == 0) {
 	    offset++;
 	}
-	offset += info.matches[0].end;
 	all++;
-	eflags |= TCL_REG_NOTBOL;
 	if (offset >= stringLength) {
 	    break;
 	}
Index: tests/regexp.test
===================================================================
RCS file: /cvsroot/tcl/tcl/tests/regexp.test,v
retrieving revision 1.34
diff -u -r1.34 regexp.test
--- tests/regexp.test	21 Sep 2009 21:30:41 -0000	1.34
+++ tests/regexp.test	9 Feb 2010 11:14:21 -0000
@@ -311,12 +311,39 @@
 test regexp-7.18 {basic regsub replacement} {
     list [regsub a+ aaa {&} foo] $foo
 } {1 aaa}
-test regexp-7.19 {basic regsub backslash replacement} {
+test regexp-7.19 {basic regsub replacement} {
+    list [regsub a+ aaa {\&} foo] $foo
+} {1 &}
+test regexp-7.20 {basic regsub replacement} {
+    list [regsub a+ aaa {\\&} foo] $foo
+} {1 {\aaa}}
+test regexp-7.21 {basic regsub replacement} {
+    list [regsub a+ aaa {\\\&} foo] $foo
+} {1 {\&}}
+test regexp-7.22 {basic regsub replacement} {
     list [regsub a+ aaa {\0} foo] $foo
 } {1 aaa}
-test regexp-7.20 {basic regsub backslash replacement} {
+test regexp-7.23 {basic regsub replacement} {
+    list [regsub a+ aaa {\\0} foo] $foo
+} {1 {\0}}
+test regexp-7.24 {basic regsub replacement} {
     list [regsub a+ aaa {\\\0} foo] $foo
 } {1 {\aaa}}
+test regexp-7.25 {basic regsub replacement} {
+    list [regsub a+ aaa {\\\\0} foo] $foo
+} {1 {\\0}}
+test regexp-7.26 {dollar zero is not a backslash replacement} {
+    list [regsub a+ aaa {$0} foo] $foo
+} {1 {$0}}
+test regexp-7.27 {dollar zero is not a backslash replacement} {
+    list [regsub a+ aaa {\0$0} foo] $foo
+} {1 {aaa$0}}
+test regexp-7.28 {dollar zero is not a backslash replacement} {
+    list [regsub a+ aaa {\$0} foo] $foo
+} {1 {\$0}}
+test regexp-7.29 {dollar zero is not a backslash replacement} {
+    list [regsub a+ aaa {\\} foo] $foo
+} {1 \\}
 
 test regexp-8.1 {case conversion in regsub} {
     list [regsub -nocase a(a+) xaAAaAAay & foo] $foo
@@ -700,6 +727,337 @@
     regexp -- {([bc])\1} bb
 } 1
 
+test regexp-23.1 {regexp -all and -line} {
+    set string ""
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1}] \
+    [list {0 -1}] \
+    [list {0 -1}] \
+  ]
+
+test regexp-23.2 {regexp -all and -line} {
+    set string "\n"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1}] \
+    [list {0 -1}] \
+    [list {0 -1}] \
+  ]
+
+test regexp-23.3 {regexp -all and -line} {
+    set string "\n\n"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {1 0}] \
+    [list {0 -1} {1 0}] \
+    [list {0 -1} {1 0}] \
+  ]
+
+test regexp-23.4 {regexp -all and -line} {
+    set string "a"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1}] \
+    [list {0 0}] \
+    [list {1 0}] \
+  ]
+
+test regexp-23.5 {regexp -all and -line} {knownBug} {
+    set string "a\n"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {2 1}] \
+    [list {0 0} {2 1}] \
+    [list {1 0} {2 1}] \
+  ]
+
+test regexp-23.6 {regexp -all and -line} {
+    set string "\na"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {1 0}] \
+    [list {0 -1} {1 1}] \
+    [list {0 -1} {2 1}] \
+  ]
+
+test regexp-23.7 {regexp -all and -line} {knownBug} {
+    set string "ab\n"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {3 2}] \
+    [list {0 1} {3 2}] \
+    [list {2 1} {3 2}] \
+  ]
+
+test regexp-23.8 {regexp -all and -line} {
+    set string "a\nb"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {2 1}] \
+    [list {0 0} {2 2}] \
+    [list {1 0} {3 2}] \
+  ]
+
+test regexp-23.9 {regexp -all and -line} {knownBug} {
+    set string "a\nb\n"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {2 1} {4 3}] \
+    [list {0 0} {2 2} {4 3}] \
+    [list {1 0} {3 2} {4 3}] \
+  ]
+
+test regexp-23.10 {regexp -all and -line} {
+    set string "a\nb\nc"
+    list \
+        [regexp -all -inline -indices -line -- {^} $string] \
+        [regexp -all -inline -indices -line -- {^.*$} $string] \
+        [regexp -all -inline -indices -line -- {$} $string]
+} [list \
+    [list {0 -1} {2 1} {4 3}] \
+    [list {0 0} {2 2} {4 4}] \
+    [list {1 0} {3 2} {5 4}] \
+  ]
+
+test regexp-23.11 {regexp -all and -line} {
+    regexp -all -inline -indices -line -- {b} "abb\nb"
+} {{1 1} {2 2} {4 4}}
+
+
+test regexp-24.1 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string ""
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} {1 <> 1 <> 1 <>}
+
+test regexp-24.2 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "\n"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 2 "<>\n<>" 2 "<>\n<>" 2 "<>\n<>"]
+
+test regexp-24.3 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "\n\n"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 3 "<>\n<>\n<>" 3 "<>\n<>\n<>" 3 "<>\n<>\n<>"]
+
+test regexp-24.4 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "a"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 1 "<>a" 1 "<a>" 1 "a<>"]
+
+test regexp-24.5 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "a\n"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 2 "<>a\n<>" 2 "<a>\n<>" 2 "a<>\n<>"]
+
+test regexp-24.6 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "\na"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 2 "<>\n<>a" 2 "<>\n<a>" 2 "<>\na<>"]
+
+test regexp-24.7 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "ab\n"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 2 "<>ab\n<>" 2 "<ab>\n<>" 2 "ab<>\n<>"]
+
+test regexp-24.8 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "a\nb"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 2 "<>a\n<>b" 2 "<a>\n<b>" 2 "a<>\nb<>"]
+
+test regexp-24.9 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "a\nb\n"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 3 "<>a\n<>b\n<>" 3 "<a>\n<b>\n<>" 3 "a<>\nb<>\n<>"]
+
+test regexp-24.10 {regsub -all and -line} {
+    foreach {v1 v2 v3} {{} {} {}} {}
+    set string "a\nb\nc"
+    list \
+        [regsub -line -all {^} $string {<&>} v1] $v1 \
+        [regsub -line -all {^.*$} $string {<&>} v2] $v2 \
+        [regsub -line -all {$} $string {<&>} v3] $v3
+} [list 3 "<>a\n<>b\n<>c" 3 "<a>\n<b>\n<c>" 3 "a<>\nb<>\nc<>"]
+
+test regexp-24.11 {regsub -all and -line} {
+    regsub -line -all {b} "abb\nb" {<&>}
+} "a<b><b>\n<b>"
+
+
+test regexp-25.1 {regexp without -line option} {
+    set foo ""
+    list [regexp {a.*b} "dabc\naxyb\n" foo] $foo
+} [list 1 abc\naxyb]
+
+test regexp-25.2 {regexp without -line option} {
+    set foo ""
+    list [regexp {^a.*b$} "dabc\naxyb\n" foo] $foo
+} {0 {}}
+
+test regexp-25.3 {regexp with -line option} {
+    set foo ""
+    list [regexp -line {^a.*b$} "dabc\naxyb\n" foo] $foo
+} {1 axyb}
+
+test regexp-25.4 {regexp with -line option} {
+    set foo ""
+    list [regexp -line {^a.*b$} "dabc\naxyb\nxb" foo] $foo
+} {1 axyb}
+
+test regexp-25.5 {regexp without -line option} {
+    set foo ""
+    list [regexp {^a.*b$} "dabc\naxyb\nxb" foo] $foo
+} {0 {}}
+
+test regexp-25.6 {regexp without -line option} {
+    set foo ""
+    list [regexp {a.*b$} "dabc\naxyb\nxb" foo] $foo
+} "1 {abc\naxyb\nxb}"
+
+test regexp-25.7 {regexp with -lineanchor option} {
+    set foo ""
+    list [regexp -lineanchor {^a.*b$} "dabc\naxyb\nxb" foo] $foo
+} "1 {axyb\nxb}"
+
+test regexp-25.8 {regexp with -lineanchor and -linestop option} {
+    set foo ""
+    list [regexp -lineanchor -linestop {^a.*b$} "dabc\naxyb\nxb" foo] $foo
+} {1 axyb}
+
+test regexp-25.9 {regexp with -linestop option} {
+    set foo ""
+    list [regexp -linestop {a.*b} "ab\naxyb\nxb" foo] $foo
+} {1 ab}
+
+test regexp-26.1 {matches start of line 1 time} {
+    regexp -all -inline -- {^a+} "aab\naaa"
+} {aa}
+
+test regexp-26.2 {matches start of line(s) 2 times} {
+    regexp -all -inline -line -- {^a+} "aab\naaa"
+} {aa aaa}
+
+test regexp-26.3 {effect of -line -all and -start} {
+    list \
+    [regexp -all -inline -line -start 0 -- {^a+} "aab\naaa"] \
+    [regexp -all -inline -line -start 1 -- {^a+} "aab\naaa"] \
+    [regexp -all -inline -line -start 3 -- {^a+} "aab\naaa"] \
+    [regexp -all -inline -line -start 4 -- {^a+} "aab\naaa"] \
+} {{aa aaa} aaa aaa aaa}
+
+test regexp-26.5 {match length 0, match length 1} {
+    regexp -all -inline -line -- {^b*} "a\nb"
+} {{} b}
+
+test regexp-26.6 {non reporting capture group} {
+    regexp -all -inline -line -- {^(?:a+|b)} "aab\naaa"
+} {aa aaa}
+
+test regexp-26.7 {Tcl bug 2826551: -line sensitive regexp and -start} {
+    set match1 {}
+    set match2 {}
+    list \
+      [regexp -start 0 -indices -line {^a} "\nab" match1] $match1 \
+      [regexp -start 1 -indices -line {^a} "\nab" match2] $match2
+} {1 {1 1} 1 {1 1}}
+
+test regexp-26.8 {Tcl bug 2826551: diff regexp with -line option} {
+    set data "@1\n2\n+3\n@4\n-5\n+6\n7\n@8\n9\n"
+    regexp -all -inline -line {^@.*\n(?:[^@].*\n?)*} $data
+} [list \
+    "@1\n2\n+3\n" \
+    "@4\n-5\n+6\n7\n" \
+    "@8\n9\n" \
+  ]
+
+test regexp-26.9 {Tcl bug 2826551: diff regexp with embedded -line option} {
+    set data "@1\n2\n+3\n@4\n-5\n+6\n7\n@8\n9\n"
+    regexp -all -inline {(?n)^@.*\n(?:[^@].*\n?)*} $data
+} [list \
+    "@1\n2\n+3\n" \
+    "@4\n-5\n+6\n7\n" \
+    "@8\n9\n" \
+  ]
+
+test regexp-26.10 {regexp with -line option} {
+    regexp -all -inline -line -- {a*} "a\n"
+} {a {}}
+
+test regexp-26.11 {regexp without -line option} {
+    regexp -all -inline -- {a*} "a\n"
+} {a {}}
+
+test regexp-26.12 {regexp with -line option} {
+    regexp -all -inline -line -- {a*} "b\n"
+} {{} {}}
+
+test regexp-26.13 {regexp without -line option} {
+    regexp -all -inline -- {a*} "b\n"
+} {{} {}}
+
 # cleanup
 ::tcltest::cleanupTests
 return