Tcl Source Code: Artifact [5db04169a8]

Artifact 5db04169a82f52c44dbaa9d3c62428d322d3b98e:

Attachment "zpatchdesc" to ticket [3487443fff] added by tgl 2012-02-14 08:55:46.
The attached patch fixes the case of "\n*", that is a backref with *
quantification directly attached to it.

The core of the problem is in regcomp.c, which first changes "\n*" to
"\n+|" (lines 1167ff as of 8.5.11) and then applies repeat() to the NFA
representing the backref atom (line 1198).  repeat() thinks that any arc
leading into its "rp" argument is part of the sub-NFA to be repeated ---
see moveins() call at line 1363.  Unfortunately, since line 1170 already
created the arc that was intended to represent the empty bypass around
"\n+", this arc gets moved too, so that it now leads into the state loop
created by repeat().  Thus, what was supposed to be an "empty" bypass gets
turned into something that represents zero or more repetitions of the NFA
representing the backref atom.  In the original example, in place of
	^([bc])\1*$
we now have something that acts like
	^([bc])(\1+|[bc]*)$
At runtime, the branch involving the actual backref fails, as it's supposed
to, but then the other branch succeeds anyway.

We could no doubt fix this by some rearrangement of the operations in
parseqatom(), but that code is plenty ugly already.  What I propose instead
is to allow the m == 0 case to be handled at runtime, so that there is no
need to install the empty-alternative branch at all for the backref case.
This makes the patch in regcomp.c a one-liner, at the cost of having to
tweak cbrdissect() a little.  In the attached patch I went a bit further
than that and rewrote cbrdissect() to check all the string-length-related
conditions before it starts comparing characters.  It seems a bit stupid to
possibly iterate through many many copies of an n-character backreference,
only to fail at the end because the target string's length isn't a multiple
of n --- we could have found that out before starting.  The existing coding
could only be a win if integer division is hugely expensive compared to
character comparison, but I don't know of any modern machine where that
might be true.

The patch also corrects an obviously bogus comment at regexec.c:802.
Perhaps at one time it was valid, but now this is the only caller of
cbrdissect() ...

This does not fix all the problems with quantified back-references.  In
particular, the code is completely broken for back-references that appear
within a larger expression that is quantified, so several of the cases
shown in the comments to this bug entry still don't work.  I do not have
a solution for that yet; it looks to me like it may require major surgery.
However, the changes I propose now seem appropriate to make anyway.  Doing
direct handling of quantification in a simple back-ref is a performance win
compared to anything we can possibly do at a higher level, so this code
path seems worth preserving even if we need another solution for the more
general case.


diff -cr tcl8.5.11.orig/generic/regcomp.c tcl8.5.11/generic/regcomp.c
*** tcl8.5.11.orig/generic/regcomp.c	Tue Nov  1 10:04:51 2011
--- tcl8.5.11/generic/regcomp.c	Mon Feb 13 16:03:05 2012
***************
*** 1161,1170 ****
      }
  
      /*
!      * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
       */
  
!     if (m == 0) {
  	EMPTYARC(s2, atom->end);/* the bypass */
  	assert(PREF(qprefer) != 0);
  	f = COMBINE(qprefer, atom->flags);
--- 1161,1172 ----
      }
  
      /*
!      * It's quantifier time.  If the atom is just a BACKREF, we'll let it deal
!      * with quantifiers internally.  Otherwise, the first step is to turn
!      * x{0,...} into x{1,...}|empty
       */
  
!     if (m == 0 && atomtype != BACKREF) {
  	EMPTYARC(s2, atom->end);/* the bypass */
  	assert(PREF(qprefer) != 0);
  	f = COMBINE(qprefer, atom->flags);
diff -cr tcl8.5.11.orig/generic/regexec.c tcl8.5.11/generic/regexec.c
*** tcl8.5.11.orig/generic/regexec.c	Tue Nov  1 10:04:51 2011
--- tcl8.5.11/generic/regexec.c	Mon Feb 13 16:03:11 2012
***************
*** 799,805 ****
      case '|':			/* alternation */
  	assert(t->left != NULL);
  	return caltdissect(v, t, begin, end);
!     case 'b':			/* back ref -- shouldn't be calling us! */
  	assert(t->left == NULL && t->right == NULL);
  	return cbrdissect(v, t, begin, end);
      case '.':			/* concatenation */
--- 799,805 ----
      case '|':			/* alternation */
  	assert(t->left != NULL);
  	return caltdissect(v, t, begin, end);
!     case 'b':			/* back reference */
  	assert(t->left == NULL && t->right == NULL);
  	return cbrdissect(v, t, begin, end);
      case '.':			/* concatenation */
***************
*** 1065,1076 ****
      chr *begin,			/* beginning of relevant substring */
      chr *end)			/* end of same */
  {
-     int i;
      int n = t->subno;
!     size_t len;
!     chr *paren;
      chr *p;
-     chr *stop;
      int min = t->min;
      int max = t->max;
  
--- 1065,1076 ----
      chr *begin,			/* beginning of relevant substring */
      chr *end)			/* end of same */
  {
      int n = t->subno;
!     size_t numreps;
!     size_t tlen;
!     size_t brlen;
!     chr *brstring;
      chr *p;
      int min = t->min;
      int max = t->max;
  
***************
*** 1081,1091 ****
  
      MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
  
      if (v->pmatch[n].rm_so == -1) {
  	return REG_NOMATCH;
      }
!     paren = v->start + v->pmatch[n].rm_so;
!     len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
  
      /*
       * No room to maneuver -- retries are pointless.
--- 1081,1092 ----
  
      MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
  
+     /* Get the backreferenced string */
      if (v->pmatch[n].rm_so == -1) {
  	return REG_NOMATCH;
      }
!     brstring = v->start + v->pmatch[n].rm_so;
!     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
  
      /*
       * No room to maneuver -- retries are pointless.
***************
*** 1096,1146 ****
      }
      v->mem[t->retry] = 1;
  
!     /*
!      * Special-case zero-length string.
!      */
! 
!     if (len == 0) {
! 	if (begin == end) {
  	    return REG_OKAY;
  	}
  	return REG_NOMATCH;
      }
! 
!     /*
!      * And too-short string.
!      */
! 
!     assert(end >= begin);
!     if ((size_t)(end - begin) < len) {
  	return REG_NOMATCH;
      }
-     stop = end - len;
  
      /*
!      * Count occurrences.
       */
  
!     i = 0;
!     for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
! 	if ((*v->g->compare)(paren, p, len) != 0) {
! 	    break;
! 	}
! 	i++;
      }
-     MDEBUG(("cbackref found %d\n", i));
- 
-     /*
-      * And sort it out.
-      */
  
!     if (p != end) {		/* didn't consume all of it */
! 	return REG_NOMATCH;
!     }
!     if (min <= i && (i <= max || max == INFINITY)) {
! 	return REG_OKAY;
!     }
!     return REG_NOMATCH;		/* out of range */
  }
  
  /*
--- 1097,1145 ----
      }
      v->mem[t->retry] = 1;
  
!     /* Special cases for zero-length strings */
!     if (brlen == 0) {
! 	/*
! 	 * Matches only if target is zero length, but any number of
! 	 * repetitions can be considered to be present
! 	 */
! 	if (begin == end && min <= max) {
! 	    MDEBUG(("cbackref matched trivially\n"));
  	    return REG_OKAY;
  	}
  	return REG_NOMATCH;
      }
!     if (begin == end) {
! 	/* Matches only if zero repetitions are okay */
! 	if (min == 0) {
! 	    MDEBUG(("cbackref matched trivially\n"));
! 	    return REG_OKAY;
! 	}
  	return REG_NOMATCH;
      }
  
      /*
!      * Check target length to see if it could possibly be an allowed number of
!      * repetitions of brstring
       */
+     assert(end > begin);
+     tlen = end - begin;
+     if (tlen % brlen != 0)
+ 	return REG_NOMATCH;
+     numreps = tlen / brlen;
+     if (numreps < min || (numreps > max && max != INFINITY))
+ 	return REG_NOMATCH;
  
!     /* Okay, compare the actual string contents */
!     p = begin;
!     while (numreps-- > 0) {
! 	if ((*v->g->compare) (brstring, p, brlen) != 0)
! 	    return REG_NOMATCH;
! 	p += brlen;
      }
  
!     MDEBUG(("cbackref matched\n"));
!     return REG_OKAY;
  }
  
  /*