Tcl Source Code: Artifact [f33d430f32]

Artifact f33d430f321c5e9ca2898f967c52a780a176ce6d:

Attachment "tcl-regex.patch" to ticket [3606683fff] added by tgl 2013-03-04 00:34:41.
diff -pcdr Tcl_head/generic/regc_nfa.c Tcl_regex_fix/generic/regc_nfa.c
*** Tcl_head/generic/regc_nfa.c	Thu Feb 28 12:09:26 2013
--- Tcl_regex_fix/generic/regc_nfa.c	Sun Mar  3 11:53:57 2013
*************** freearc(
*** 497,502 ****
--- 497,538 ----
  }
  
  /*
+  - nonemptyouts - count non-EMPTY out arcs of a state
+  ^ static int nonemptyouts(struct state *);
+  */
+ static int			/* number of out-arcs of s whose type is not EMPTY */
+ nonemptyouts(
+     struct state *s)		/* state whose out-arc chain is scanned */
+ {
+     int n = 0;			/* running count of non-EMPTY out-arcs */
+     struct arc *a;
+ 
+     for (a = s->outs; a != NULL; a = a->outchain) {	/* follow outchain links */
+ 	if (a->type != EMPTY)	/* EMPTY arcs are deliberately left uncounted */
+ 	    n++;
+     }
+     return n;
+ }
+ 
+ /*
+  - nonemptyins - count non-EMPTY in arcs of a state
+  ^ static int nonemptyins(struct state *);
+  */
+ static int			/* number of in-arcs of s whose type is not EMPTY */
+ nonemptyins(
+     struct state *s)		/* state whose in-arc chain is scanned */
+ {
+     int n = 0;			/* running count of non-EMPTY in-arcs */
+     struct arc *a;
+ 
+     for (a = s->ins; a != NULL; a = a->inchain) {	/* follow inchain links */
+ 	if (a->type != EMPTY)	/* EMPTY arcs are deliberately left uncounted */
+ 	    n++;
+     }
+     return n;
+ }
+ 
+ /*
   - findarc - find arc, if any, from given source with given type and color
   * If there is more than one such arc, the result is random.
   ^ static struct arc *findarc(struct state *, int, pcolor);
*************** copyins(
*** 578,583 ****
--- 614,639 ----
  }
  
  /*
+  - copynonemptyins - as above, but ignore empty arcs
+  ^ static void copynonemptyins(struct nfa *, struct state *, struct state *);
+  */
+ static void
+ copynonemptyins(
+     struct nfa *nfa,		/* passed through to cparc */
+     struct state *oldState,	/* state whose in-arcs are read */
+     struct state *newState)	/* state handed to cparc as the arc destination */
+ {
+     struct arc *a;
+ 
+     assert(oldState != newState);	/* caller must pass two distinct states */
+ 
+     for (a=oldState->ins ; a!=NULL ; a=a->inchain) {
+ 	if (a->type != EMPTY)	/* EMPTY in-arcs are skipped, never copied */
+ 	    cparc(nfa, a, a->from, newState);	/* same source, aimed at newState */
+     }
+ }
+ 
+ /*
   - moveouts - move all out arcs of a state to another state
   ^ static void moveouts(struct nfa *, struct state *, struct state *);
   */
*************** copyouts(
*** 617,622 ****
--- 673,698 ----
  }
  
  /*
+  - copynonemptyouts - as above, but ignore empty arcs
+  ^ static void copynonemptyouts(struct nfa *, struct state *, struct state *);
+  */
+ static void
+ copynonemptyouts(
+     struct nfa *nfa,		/* passed through to cparc */
+     struct state *oldState,	/* state whose out-arcs are read */
+     struct state *newState)	/* state handed to cparc as the arc source */
+ {
+     struct arc *a;
+ 
+     assert(oldState != newState);	/* caller must pass two distinct states */
+ 
+     for (a=oldState->outs ; a!=NULL ; a=a->outchain) {
+ 	if (a->type != EMPTY)	/* EMPTY out-arcs are skipped, never copied */
+ 	    cparc(nfa, a, newState, a->to);	/* same target, leaving newState */
+     }
+ }
+ 
+ /*
   - cloneouts - copy out arcs of a state to another state pair, modifying type
   ^ static void cloneouts(struct nfa *, struct state *, struct state *,
   ^ 	struct state *, int);
*************** fixempties(
*** 1247,1364 ****
      FILE *f)			/* for debug output; NULL none */
  {
      struct state *s;
      struct state *nexts;
-     struct state *to;
      struct arc *a;
      struct arc *nexta;
-     int progress;
  
      /*
!      * Find and eliminate empties until there are no more.
       */
  
!     do {
! 	progress = 0;
! 	for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
! 	    nexts = s->next;
! 	    for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) {
! 		if (a->type == EMPTY) {
! 
! 		    /*
! 		     *  Mark a for deletion; copy arcs to preserve graph
! 		     * connectivity after it is gone.
! 		     */
! 
! 		    unempty(nfa, a);
! 		}
  	    }
  
  	    /*
! 	     * Now pass through and delete the marked arcs.  Doing all the
! 	     * deletion after all the marking prevents arc copying from
! 	     * resurrecting deleted arcs which can cause failure to converge.
! 	     * [Tcl Bug 3604074]
  	     */
  
! 	    for (a = s->outs; a != NULL; a = nexta) {
! 		nexta = a->outchain;
! 		if (a->from == NULL) {
! 		    progress = 1;
! 		    to = a->to;
! 		    a->from = s;
! 		    freearc(nfa, a);
! 		    if (to->nins == 0) {
! 			while ((a = to->outs)) {
! 			    freearc(nfa, a);
! 			}
! 			if (nexts == to) {
! 			    nexts = to->next;
! 			}
! 			freestate(nfa, to);
! 		    }
! 		    if (s->nouts == 0) {
! 			while ((a = s->ins)) {
! 			    freearc(nfa, a);
! 			}
! 			freestate(nfa, s);
! 		    }
! 		}
! 	    }
  	}
! 	if (progress && f != NULL) {
! 	    dumpnfa(nfa, f);
  	}
!     } while (progress && !NISERR());
  }
  
  /*
!  - unempty - optimize out an EMPTY arc, if possible
!  * Actually, as it stands this function always succeeds, but the return value
!  * is kept with an eye on possible future changes.
!  ^ static int unempty(struct nfa *, struct arc *);
   */
! static int			/* 0 couldn't, 1 could */
! unempty(
!     struct nfa *nfa,
!     struct arc *a)
  {
!     struct state *from = a->from;
!     struct state *to = a->to;
! 
!     assert(a->type == EMPTY);
!     assert(from != nfa->pre && to != nfa->post);
  
!     if (from == to) {		/* vacuous loop */
! 	freearc(nfa, a);
! 	return 1;
      }
  
      /*
!      *  Mark arc for deletion.
       */
  
!     a->from = NULL;
! 
!     if (from->nouts > to->nins) {
! 	copyouts(nfa, to, from);
! 	return 1;
      }
!     if (from->nouts < to->nins) {
! 	copyins(nfa, from, to);
! 	return 1;
      }
  
      /*
!      * from->nouts == to->nins . decide on secondary issue:  copy fewest arcs
       */
- 
      if (from->nins > to->nouts) {
! 	copyouts(nfa, to, from);
! 	return 1;
      }
- 
-     copyins(nfa, from, to);
-     return 1;
  }
  
  /*
--- 1323,1513 ----
      FILE *f)			/* for debug output; NULL none */
  {
      struct state *s;
+     struct state *s2;
      struct state *nexts;
      struct arc *a;
      struct arc *nexta;
  
      /*
!      * First, get rid of any states whose sole out-arc is an EMPTY, since
!      * they're basically just aliases for their successor.  The parsing
!      * algorithm creates enough of these that it's worth special-casing this.
       */
+     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+ 	nexts = s->next;
+ 	if (s->nouts == 1 && !s->flag) {
+ 	    a = s->outs;
+ 	    assert(a != NULL && a->outchain == NULL);
+ 	    if (a->type == EMPTY) {
+ 		if (s != a->to)
+ 		    moveins(nfa, s, a->to);
+ 		dropstate(nfa, s);
+ 	    }
+ 	}
+     }
  
!     /*
!      * Similarly, get rid of any state with a single EMPTY in-arc, by folding
!      * it into its predecessor.
!      */
!     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
! 	nexts = s->next;
! 	/* while we're at it, ensure tmp fields are clear for next step */
! 	s->tmp = NULL;
! 	if (s->nins == 1 && !s->flag) {
! 	    a = s->ins;
! 	    assert(a != NULL && a->inchain == NULL);
! 	    if (a->type == EMPTY) {
! 		if (s != a->from)
! 		    moveouts(nfa, s, a->from);
! 		dropstate(nfa, s);
  	    }
+ 	}
+     }
  
+     /*
+      * For each remaining NFA state, find all other states that are reachable
+      * from it by a chain of one or more EMPTY arcs.  Then generate new arcs
+      * that eliminate the need for each such chain.
+      *
+      * If we just do this straightforwardly, the algorithm gets slow in
+      * complex graphs, because the same arcs get copied to all intermediate
+      * states of an EMPTY chain, and then uselessly pushed repeatedly to the
+      * chain's final state; we waste a lot of time in newarc's duplicate
+      * checking.  To improve matters, we decree that any state with only EMPTY
+      * out-arcs is "doomed" and will not be part of the final NFA. That can be
+      * ensured by not adding any new out-arcs to such a state. Having ensured
+      * that, we need not update the state's in-arcs list either; all arcs that
+      * might have gotten pushed forward to it will just get pushed directly to
+      * successor states.  This eliminates most of the useless duplicate arcs.
+      */
+     for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
+ 	for (s2 = emptyreachable(s, s); s2 != s && !NISERR(); s2 = nexts) {
  	    /*
! 	     * If s2 is doomed, we decide that (1) we will always push arcs
! 	     * forward to it, not pull them back to s; and (2) we can optimize
! 	     * away the push-forward, per comment above.  So do nothing.
  	     */
+ 	    if (s2->flag || nonemptyouts(s2) > 0)
+ 		replaceempty(nfa, s, s2);
  
! 	    /* Reset the tmp fields as we walk back */
! 	    nexts = s2->tmp;
! 	    s2->tmp = NULL;
  	}
! 	s->tmp = NULL;
!     }
! 
!     /*
!      * Now remove all the EMPTY arcs, since we don't need them anymore.
!      */
!     for (s = nfa->states; s != NULL && !NISERR(); s = s->next) {
! 	for (a = s->outs; a != NULL; a = nexta) {
! 	    nexta = a->outchain;
! 	    if (a->type == EMPTY)
! 		freearc(nfa, a);
  	}
!     }
! 
!     /*
!      * And remove any states that have become useless.  (This cleanup is not
!      * very thorough, and would be even less so if we tried to combine it with
!      * the previous step; but cleanup() will take care of anything we miss.)
!      */
!     for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
! 	nexts = s->next;
! 	if ((s->nins == 0 || s->nouts == 0) && !s->flag)
! 	    dropstate(nfa, s);
!     }
! 
!     if (f != NULL && !NISERR())
! 	dumpnfa(nfa, f);
  }
  
  /*
!  - emptyreachable - recursively find all states reachable from s by EMPTY arcs
!  * The return value is the last such state found.  Its tmp field links back
!  * to the next-to-last such state, and so on back to s, so that all these
!  * states can be located without searching the whole NFA.
!  * The maximum recursion depth here is equal to the length of the longest
!  * loop-free chain of EMPTY arcs, which is surely no more than the size of
!  * the NFA, and in practice will be a lot less than that.
!  ^ static struct state *emptyreachable(struct state *, struct state *);
   */
! static struct state *		/* last state marked; tmp links lead back to start */
! emptyreachable(
!     struct state *s,		/* state to mark and then explore */
!     struct state *lastfound)	/* most recently marked state (s itself on the initial call) */
  {
!     struct arc *a;
  
!     s->tmp = lastfound;		/* non-NULL tmp also serves as the visited mark */
!     lastfound = s;
!     for (a = s->outs; a != NULL; a = a->outchain) {
! 	if (a->type == EMPTY && a->to->tmp == NULL)	/* unmarked EMPTY successor */
! 	    lastfound = emptyreachable(a->to, lastfound);
      }
+     return lastfound;
+ }
+ 
+ /*
+  - replaceempty - replace an EMPTY arc chain with some non-empty arcs
+  * The EMPTY arc(s) should be deleted later, but we can't do it here because
+  * they may still be needed to identify other arc chains during fixempties().
+  ^ static void replaceempty(struct nfa *, struct state *, struct state *);
+  */
+ static void
+ replaceempty(
+     struct nfa *nfa,		/* passed through to the arc-copying helpers */
+     struct state *from,		/* start of the EMPTY chain */
+     struct state *to)		/* end of the EMPTY chain */
+ {
+     int fromouts;		/* non-EMPTY out-arcs of "from" */
+     int toins;			/* non-EMPTY in-arcs of "to", or 1 if "from" has none out */
+ 
+     assert(from != to);
  
      /*
!      * Create replacement arcs that bypass the need for the EMPTY chain.  We
!      * can do this either by pushing arcs forward (linking directly from
!      * "from"'s predecessors to "to") or by pulling them back (linking
!      * directly from "from" to "to"'s successors).  In general, we choose
!      * whichever way creates greater fan-out or fan-in, so as to improve the
!      * odds of reducing the other state to zero in-arcs or out-arcs and
!      * thereby being able to delete it.  However, if "from" is doomed (has no
!      * non-EMPTY out-arcs), we must keep it so, so always push forward in that
!      * case.
!      *
!      * The fan-out/fan-in comparison should count only non-EMPTY arcs.  If
!      * "from" is doomed, we can skip counting "to"'s arcs, since we want to
!      * force taking the copynonemptyins path in that case.
       */
+     fromouts = nonemptyouts(from);
+     toins = (fromouts == 0) ? 1 : nonemptyins(to);	/* 0 < 1 selects the copynonemptyins branch */
  
!     if (fromouts > toins) {
! 	copynonemptyouts(nfa, to, from);	/* pull back: "to"'s out-arcs onto "from" */
! 	return;
      }
!     if (fromouts < toins) {
! 	copynonemptyins(nfa, from, to);	/* push forward: "from"'s in-arcs onto "to" */
! 	return;
      }
  
      /*
!      * fromouts == toins.  Decide on secondary issue: copy fewest arcs.
!      *
!      * Doesn't seem to be worth the trouble to exclude empties from these
!      * comparisons; that takes extra time and doesn't seem to improve the
!      * resulting graph much.
       */
      if (from->nins > to->nouts) {
! 	copynonemptyouts(nfa, to, from);	/* "to" has fewer out-arcs to copy */
! 	return;
!     } else {
! 	copynonemptyins(nfa, from, to);	/* "from" has no more in-arcs than "to" has out-arcs */
! 	return;
      }
  }
  
  /*
diff -pcdr Tcl_head/generic/regcomp.c Tcl_regex_fix/generic/regcomp.c
*** Tcl_head/generic/regcomp.c	Thu Feb 28 12:09:26 2013
--- Tcl_regex_fix/generic/regcomp.c	Sun Mar  3 12:15:48 2013
*************** static void destroystate(struct nfa *, s
*** 121,132 ****
--- 121,136 ----
  static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
  static struct arc *allocarc(struct nfa *, struct state *);
  static void freearc(struct nfa *, struct arc *);
+ static int nonemptyouts(struct state *);
+ static int nonemptyins(struct state *);
  static struct arc *findarc(struct state *, int, pcolor);
  static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
  static void moveins(struct nfa *, struct state *, struct state *);
  static void copyins(struct nfa *, struct state *, struct state *);
+ static void copynonemptyins(struct nfa *, struct state *, struct state *);
  static void moveouts(struct nfa *, struct state *, struct state *);
  static void copyouts(struct nfa *, struct state *, struct state *);
+ static void copynonemptyouts(struct nfa *, struct state *, struct state *);
  static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
  static void delsub(struct nfa *, struct state *, struct state *);
  static void deltraverse(struct nfa *, struct state *, struct state *);
*************** static int push(struct nfa *, struct arc
*** 144,150 ****
  #define	COMPATIBLE	3	/* compatible but not satisfied yet */
  static int combine(struct arc *, struct arc *);
  static void fixempties(struct nfa *, FILE *);
! static int unempty(struct nfa *, struct arc *);
  static void cleanup(struct nfa *);
  static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
  static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
--- 148,155 ----
  #define	COMPATIBLE	3	/* compatible but not satisfied yet */
  static int combine(struct arc *, struct arc *);
  static void fixempties(struct nfa *, FILE *);
! static struct state *emptyreachable(struct state *, struct state *);
! static void replaceempty(struct nfa *, struct state *, struct state *);
  static void cleanup(struct nfa *);
  static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
  static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);