Attachment "lookbehind.patch" to
ticket [d351de9d7b]
added by
tgl
2015-11-06 19:41:41.
diff -pcdr src/doc/re_syntax.n lookbehind/doc/re_syntax.n
*** src/doc/re_syntax.n Mon Sep 21 18:12:24 2015
--- lookbehind/doc/re_syntax.n Fri Nov 6 12:55:30 2015
*************** substring matching \fIre\fR begins
*** 152,161 ****
.
\fInegative lookahead\fR (AREs only), matches at any point where no
substring matching \fIre\fR begins
.RE
.PP
! The lookahead constraints may not contain back references (see later),
! and all parentheses within them are considered non-capturing.
.PP
An RE may not end with
.QW \fB\e\fR .
--- 152,171 ----
.
\fInegative lookahead\fR (AREs only), matches at any point where no
substring matching \fIre\fR begins
+ .TP
+ \fB(?<=\fIre\fB)\fR
+ .
+ \fIpositive lookbehind\fR (AREs only), matches at any point where a
+ substring matching \fIre\fR ends
+ .TP
+ \fB(?<!\fIre\fB)\fR
+ .
+ \fInegative lookbehind\fR (AREs only), matches at any point where no
+ substring matching \fIre\fR ends
.RE
.PP
! Lookahead and lookbehind constraints may not contain back references
! (see later), and all parentheses within them are considered non-capturing.
.PP
An RE may not end with
.QW \fB\e\fR .
*************** Incompatibilities of note include
*** 784,790 ****
the lack of special treatment for a trailing newline, the addition of
complemented bracket expressions to the things affected by
newline-sensitive matching, the restrictions on parentheses and back
! references in lookahead constraints, and the longest/shortest-match
(rather than first-match) matching semantics.
.PP
The matching rules for REs containing both normal and non-greedy
--- 794,800 ----
the lack of special treatment for a trailing newline, the addition of
complemented bracket expressions to the things affected by
newline-sensitive matching, the restrictions on parentheses and back
! references in lookahead/lookbehind constraints, and the longest/shortest-match
(rather than first-match) matching semantics.
.PP
The matching rules for REs containing both normal and non-greedy
diff -pcdr src/generic/regc_lex.c lookbehind/generic/regc_lex.c
*** src/generic/regc_lex.c Mon Sep 21 18:12:24 2015
--- lookbehind/generic/regc_lex.c Fri Nov 6 13:00:17 2015
*************** next(
*** 613,618 ****
--- 613,621 ----
if ((v->cflags&REG_ADVF) && NEXT1('?')) {
NOTE(REG_UNONPOSIX);
v->now++;
+ if (ATEOS()) {
+ FAILW(REG_BADRPT);
+ }
switch (*v->now++) {
case CHR(':'): /* non-capturing paren */
RETV('(', 0);
*************** next(
*** 628,639 ****
return next(v);
break;
case CHR('='): /* positive lookahead */
! NOTE(REG_ULOOKAHEAD);
! RETV(LACON, 1);
break;
case CHR('!'): /* negative lookahead */
! NOTE(REG_ULOOKAHEAD);
! RETV(LACON, 0);
break;
default:
FAILW(REG_BADRPT);
--- 631,661 ----
return next(v);
break;
case CHR('='): /* positive lookahead */
! NOTE(REG_ULOOKAROUND);
! RETV(LACON, LATYPE_AHEAD_POS);
break;
case CHR('!'): /* negative lookahead */
! NOTE(REG_ULOOKAROUND);
! RETV(LACON, LATYPE_AHEAD_NEG);
! break;
! case CHR('<'):
! if (ATEOS()) {
! FAILW(REG_BADRPT);
! }
! switch (*v->now++) {
! case CHR('='): /* positive lookbehind */
! NOTE(REG_ULOOKAROUND);
! RETV(LACON, LATYPE_BEHIND_POS);
! break;
! case CHR('!'): /* negative lookbehind */
! NOTE(REG_ULOOKAROUND);
! RETV(LACON, LATYPE_BEHIND_NEG);
! break;
! default:
! FAILW(REG_BADRPT);
! break;
! }
! assert(NOTREACHED);
break;
default:
FAILW(REG_BADRPT);
diff -pcdr src/generic/regc_nfa.c lookbehind/generic/regc_nfa.c
*** src/generic/regc_nfa.c Tue Oct 27 14:39:17 2015
--- lookbehind/generic/regc_nfa.c Fri Nov 6 13:46:52 2015
*************** newarc(
*** 293,299 ****
}
}
}
!
/* no dup, so create the arc */
createarc(nfa, t, co, from, to);
}
--- 293,299 ----
}
}
}
!
/* no dup, so create the arc */
createarc(nfa, t, co, from, to);
}
*************** sortins_cmp(
*** 657,663 ****
}
return 0;
}
!
/*
* sortouts - sort the out arcs of a state by to/color/type
*/
--- 657,663 ----
}
return 0;
}
!
/*
* sortouts - sort the out arcs of a state by to/color/type
*/
*************** cleartraverse(
*** 1357,1362 ****
--- 1357,1407 ----
}
/*
+ - single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
+ * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
+ * of a set of colors), return a state whose outarc list contains only PLAIN
+ * arcs of those color(s). Otherwise return NULL.
+ * This is used before optimizing the NFA, so there may be EMPTY arcs, which
+ * we should ignore; the possibility of an EMPTY is why the result state could
+ * be different from s1.
+ * It's worth troubling to handle multiple parallel PLAIN arcs here because a
+ * bracket construct such as [abc] might yield either one or several parallel
+ * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
+ * that implementation detail not create user-visible performance differences.
+ */
+ static struct state *
+ single_color_transition(
+ struct state *s1,
+ struct state *s2)
+ {
+ struct arc *a;
+
+ /* Ignore leading EMPTY arc, if any */
+ if (s1->nouts == 1 && s1->outs->type == EMPTY) {
+ s1 = s1->outs->to;
+ }
+ /* Likewise for any trailing EMPTY arc */
+ if (s2->nins == 1 && s2->ins->type == EMPTY) {
+ s2 = s2->ins->from;
+ }
+ /* Perhaps we could have a single-state loop in between, if so reject */
+ if (s1 == s2) {
+ return NULL;
+ }
+ /* s1 must have at least one outarc... */
+ if (s1->outs == NULL) {
+ return NULL;
+ }
+ /* ... and they must all be PLAIN arcs to s2 */
+ for (a = s1->outs; a != NULL; a = a->outchain) {
+ if (a->type != PLAIN || a->to != s2)
+ return NULL;
+ }
+ /* OK, return s1 as the possessor of the relevant outarcs */
+ return s1;
+ }
+
+ /*
- specialcolors - fill in special colors for an NFA
^ static void specialcolors(struct nfa *);
*/
*************** fixempties(
*** 2020,2026 ****
arcarray[arccount++] = a;
}
}
!
/* Reset the tmp fields as we walk back */
nexts = s2->tmp;
s2->tmp = NULL;
--- 2065,2071 ----
arcarray[arccount++] = a;
}
}
!
/* Reset the tmp fields as we walk back */
nexts = s2->tmp;
s2->tmp = NULL;
*************** fixempties(
*** 2042,2048 ****
}
inarcsorig[s->no] = a;
}
!
FREE(arcarray);
FREE(inarcsorig);
--- 2087,2093 ----
}
inarcsorig[s->no] = a;
}
!
FREE(arcarray);
FREE(inarcsorig);
*************** fixconstraintloops(
*** 2193,2199 ****
dropstate(nfa, s);
}
}
!
/* Nothing to do if no remaining constraint arcs */
if (NISERR() || !hasconstraints) {
return;
--- 2238,2244 ----
dropstate(nfa, s);
}
}
!
/* Nothing to do if no remaining constraint arcs */
if (NISERR() || !hasconstraints) {
return;
*************** carc_cmp(
*** 2909,2915 ****
{
const struct carc *aa = (const struct carc *) a;
const struct carc *bb = (const struct carc *) b;
!
if (aa->co < bb->co) {
return -1;
}
--- 2954,2960 ----
{
const struct carc *aa = (const struct carc *) a;
const struct carc *bb = (const struct carc *) b;
!
if (aa->co < bb->co) {
return -1;
}
diff -pcdr src/generic/regcomp.c lookbehind/generic/regcomp.c
*** src/generic/regcomp.c Tue Oct 27 14:39:17 2015
--- lookbehind/generic/regcomp.c Fri Nov 6 13:45:32 2015
*************** static const chr *scanplain(struct vars
*** 56,61 ****
--- 56,63 ----
static void onechr(struct vars *, pchr, struct state *, struct state *);
static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
static void wordchrs(struct vars *);
+ static void processlacon(struct vars *, struct state *, struct state *, int,
+ struct state *, struct state *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
*************** static int numst(struct subre *, int);
*** 64,70 ****
static void markst(struct subre *);
static void cleanst(struct vars *);
static long nfatree(struct vars *, struct subre *, FILE *);
! static long nfanode(struct vars *, struct subre *, FILE *);
static int newlacon(struct vars *, struct state *, struct state *, int);
static void freelacons(struct subre *, int);
static void rfree(regex_t *);
--- 66,72 ----
static void markst(struct subre *);
static void cleanst(struct vars *);
static long nfatree(struct vars *, struct subre *, FILE *);
! static long nfanode(struct vars *, struct subre *, int, FILE *);
static int newlacon(struct vars *, struct state *, struct state *, int);
static void freelacons(struct subre *, int);
static void rfree(regex_t *);
*************** static void deltraverse(struct nfa *, st
*** 138,143 ****
--- 140,146 ----
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
static void duptraverse(struct nfa *, struct state *, struct state *, int);
static void cleartraverse(struct nfa *, struct state *);
+ static struct state *single_color_transition(struct state *, struct state *);
static void specialcolors(struct nfa *);
static long optimize(struct nfa *, FILE *);
static void pullback(struct nfa *, FILE *);
*************** struct vars {
*** 222,229 ****
int ntree; /* number of tree nodes, plus one */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
! struct subre *lacons; /* lookahead-constraint vector */
! int nlacons; /* size of lacons */
size_t spaceused; /* approx. space used for compilation */
};
--- 225,233 ----
int ntree; /* number of tree nodes, plus one */
struct cvec *cv; /* interface cvec */
struct cvec *cv2; /* utility cvec */
! struct subre *lacons; /* lookaround-constraint vector */
! int nlacons; /* size of lacons[]; note that only slots
! * numbered 1 .. nlacons-1 are used */
size_t spaceused; /* approx. space used for compilation */
};
*************** struct vars {
*** 254,260 ****
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
#define RANGE 'R' /* - within [] which might be range delim. */
! #define LACON 'L' /* lookahead constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
#define BEHIND 'r' /* color-lookbehind arc */
#define WBDRY 'w' /* word boundary constraint */
--- 258,264 ----
#define CCLASS 'C' /* start of [: */
#define END 'X' /* end of [. [= [: */
#define RANGE 'R' /* - within [] which might be range delim. */
! #define LACON 'L' /* lookaround constraint subRE */
#define AHEAD 'a' /* color-lookahead arc */
#define BEHIND 'r' /* color-lookbehind arc */
#define WBDRY 'w' /* word boundary constraint */
*************** compile(
*** 412,421 ****
CNOERR();
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++) {
if (debug != NULL) {
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
}
! nfanode(v, &v->lacons[i], debug);
}
CNOERR();
if (v->tree->flags&SHORTER) {
--- 416,428 ----
CNOERR();
assert(v->nlacons == 0 || v->lacons != NULL);
for (i = 1; i < v->nlacons; i++) {
+ struct subre *lasub = &v->lacons[i];
+
if (debug != NULL) {
fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
}
! /* Prepend .* to pattern if it's a lookbehind LACON */
! nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
}
CNOERR();
if (v->tree->flags&SHORTER) {
*************** static struct subre *
*** 653,659 ****
parse(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookahead subRE) or PLAIN */
struct state *init, /* initial state */
struct state *final) /* final state */
{
--- 660,666 ----
parse(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookaround subRE) or PLAIN */
struct state *init, /* initial state */
struct state *final) /* final state */
{
*************** static struct subre *
*** 735,741 ****
parsebranch(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookahead subRE) or PLAIN */
struct state *left, /* leftmost state */
struct state *right, /* rightmost state */
int partial) /* is this only part of a branch? */
--- 742,748 ----
parsebranch(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookaround subRE) or PLAIN */
struct state *left, /* leftmost state */
struct state *right, /* rightmost state */
int partial) /* is this only part of a branch? */
*************** static void
*** 784,790 ****
parseqatom(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookahead subRE) or PLAIN */
struct state *lp, /* left state to hang it on */
struct state *rp, /* right state to hang it on */
struct subre *top) /* subtree top */
--- 791,797 ----
parseqatom(
struct vars *v,
int stopper, /* EOS or ')' */
! int type, /* LACON (lookaround subRE) or PLAIN */
struct state *lp, /* left state to hang it on */
struct state *rp, /* right state to hang it on */
struct subre *top) /* subtree top */
*************** parseqatom(
*** 796,802 ****
struct subre *atom; /* atom's subtree */
struct subre *t;
int cap; /* capturing parens? */
! int pos; /* positive lookahead? */
int subno; /* capturing-parens or backref number */
int atomtype;
int qprefer; /* quantifier short/long preference */
--- 803,809 ----
struct subre *atom; /* atom's subtree */
struct subre *t;
int cap; /* capturing parens? */
! int latype; /* lookaround constraint type */
int subno; /* capturing-parens or backref number */
int atomtype;
int qprefer; /* quantifier short/long preference */
*************** parseqatom(
*** 879,897 ****
nonword(v, BEHIND, lp, s);
nonword(v, AHEAD, s, rp);
return;
! case LACON: /* lookahead constraint */
! pos = v->nextvalue;
NEXT();
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERR();
t = parse(v, ')', LACON, s, s2);
freesubre(v, t); /* internal structure irrelevant */
- assert(SEE(')') || ISERR());
- NEXT();
- n = newlacon(v, s, s2, pos);
NOERR();
! ARCV(LACON, n);
return;
/*
--- 886,903 ----
nonword(v, BEHIND, lp, s);
nonword(v, AHEAD, s, rp);
return;
! case LACON: /* lookaround constraint */
! latype = v->nextvalue;
NEXT();
s = newstate(v->nfa);
s2 = newstate(v->nfa);
NOERR();
t = parse(v, ')', LACON, s, s2);
freesubre(v, t); /* internal structure irrelevant */
NOERR();
! assert(SEE(')'));
! NEXT();
! processlacon(v, s, s2, latype, lp, rp);
return;
/*
*************** wordchrs(
*** 1719,1724 ****
--- 1725,1794 ----
}
/*
+ - processlacon - generate the NFA representation of a LACON
+ * In the general case this is just newlacon() + newarc(), but some cases
+ * can be optimized.
+ */
+ static void
+ processlacon(
+ struct vars *v,
+ struct state *begin, /* start of parsed LACON sub-re */
+ struct state *end, /* end of parsed LACON sub-re */
+ int latype,
+ struct state *lp, /* left state to hang it on */
+ struct state *rp) /* right state to hang it on */
+ {
+ struct state *s1;
+ int n;
+
+ /*
+ * Check for lookaround RE consisting of a single plain color arc (or set
+ * of arcs); this would typically be a simple chr or a bracket expression.
+ */
+ s1 = single_color_transition(begin, end);
+ switch (latype) {
+ case LATYPE_AHEAD_POS:
+ /* If lookahead RE is just colorset C, convert to AHEAD(C) */
+ if (s1 != NULL) {
+ cloneouts(v->nfa, s1, lp, rp, AHEAD);
+ return;
+ }
+ break;
+ case LATYPE_AHEAD_NEG:
+ /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
+ if (s1 != NULL) {
+ colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
+ newarc(v->nfa, '$', 1, lp, rp);
+ newarc(v->nfa, '$', 0, lp, rp);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_POS:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
+ if (s1 != NULL) {
+ cloneouts(v->nfa, s1, lp, rp, BEHIND);
+ return;
+ }
+ break;
+ case LATYPE_BEHIND_NEG:
+ /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
+ if (s1 != NULL) {
+ colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
+ newarc(v->nfa, '^', 1, lp, rp);
+ newarc(v->nfa, '^', 0, lp, rp);
+ return;
+ }
+ break;
+ default:
+ assert(NOTREACHED);
+ }
+
+ /* General case: we need a LACON subre and arc */
+ n = newlacon(v, begin, end, latype);
+ newarc(v->nfa, LACON, n, lp, rp);
+ }
+
+ /*
- subre - allocate a subre
^ static struct subre *subre(struct vars *, int, int, struct state *,
^ struct state *);
*************** nfatree(
*** 1926,1942 ****
(DISCARD) nfatree(v, t->right, f);
}
! return nfanode(v, t, f);
}
/*
! - nfanode - do one NFA for nfatree
! ^ static long nfanode(struct vars *, struct subre *, FILE *);
*/
static long /* optimize results */
nfanode(
struct vars *v,
struct subre *t,
FILE *f) /* for debug output */
{
struct nfa *nfa;
--- 1996,2014 ----
(DISCARD) nfatree(v, t->right, f);
}
! return nfanode(v, t, 0, f);
}
/*
! - nfanode - do one NFA for nfatree or lacons
! * If converttosearch is true, apply makesearch() to the NFA.
! ^ static long nfanode(struct vars *, struct subre *, int, FILE *);
*/
static long /* optimize results */
nfanode(
struct vars *v,
struct subre *t,
+ int converttosearch,
FILE *f) /* for debug output */
{
struct nfa *nfa;
*************** nfanode(
*** 1954,1961 ****
--- 2026,2038 ----
dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
if (!ISERR()) {
specialcolors(nfa);
+ }
+ if (!ISERR()) {
ret = optimize(nfa, f);
}
+ if (converttosearch && !ISERR()) {
+ makesearch(v, nfa);
+ }
if (!ISERR()) {
compact(nfa, &t->cnfa);
}
*************** nfanode(
*** 1965,1971 ****
}
/*
! - newlacon - allocate a lookahead-constraint subRE
^ static int newlacon(struct vars *, struct state *, struct state *, int);
*/
static int /* lacon number */
--- 2042,2048 ----
}
/*
! - newlacon - allocate a lookaround-constraint subRE
^ static int newlacon(struct vars *, struct state *, struct state *, int);
*/
static int /* lacon number */
*************** newlacon(
*** 1973,1979 ****
struct vars *v,
struct state *begin,
struct state *end,
! int pos)
{
int n;
struct subre *newlacons;
--- 2050,2056 ----
struct vars *v,
struct state *begin,
struct state *end,
! int latype)
{
int n;
struct subre *newlacons;
*************** newlacon(
*** 1998,2010 ****
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
! sub->subno = pos;
ZAPCNFA(sub->cnfa);
return n;
}
/*
! - freelacons - free lookahead-constraint subRE vector
^ static void freelacons(struct subre *, int);
*/
static void
--- 2075,2087 ----
sub = &v->lacons[n];
sub->begin = begin;
sub->end = end;
! sub->subno = latype;
ZAPCNFA(sub->cnfa);
return n;
}
/*
! - freelacons - free lookaround-constraint subRE vector
^ static void freelacons(struct subre *, int);
*/
static void
*************** dump(
*** 2095,2103 ****
dumpcnfa(&g->search, f);
}
for (i = 1; i < g->nlacons; i++) {
! fprintf(f, "\nla%d (%s):\n", i,
! (g->lacons[i].subno) ? "positive" : "negative");
! dumpcnfa(&g->lacons[i].cnfa, f);
}
fprintf(f, "\n");
dumpst(g->tree, f, 0);
--- 2172,2199 ----
dumpcnfa(&g->search, f);
}
for (i = 1; i < g->nlacons; i++) {
! struct subre *lasub = &g->lacons[i];
! const char *latype;
!
! switch (lasub->subno) {
! case LATYPE_AHEAD_POS:
! latype = "positive lookahead";
! break;
! case LATYPE_AHEAD_NEG:
! latype = "negative lookahead";
! break;
! case LATYPE_BEHIND_POS:
! latype = "positive lookbehind";
! break;
! case LATYPE_BEHIND_NEG:
! latype = "negative lookbehind";
! break;
! default:
! latype = "???";
! break;
! }
! fprintf(f, "\nla%d (%s):\n", i, latype);
! dumpcnfa(&lasub->cnfa, f);
}
fprintf(f, "\n");
dumpst(g->tree, f, 0);
diff -pcdr src/generic/rege_dfa.c lookbehind/generic/rege_dfa.c
*** src/generic/rege_dfa.c Mon Sep 21 18:12:24 2015
--- lookbehind/generic/rege_dfa.c Fri Nov 6 13:46:57 2015
*************** shortest(
*** 282,287 ****
--- 282,402 ----
}
/*
+ - matchuntil - incremental matching engine
+ * This is meant for use with a search-style NFA (that is, the pattern is
+ * known to act as though it had a leading .*). We determine whether a
+ * match exists starting at v->start and ending at probe. Multiple calls
+ * require only O(N) time not O(N^2) so long as the probe values are
+ * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
+ * starting a series of calls.
+ * Returns 1 if a match exists, 0 if not.
+ * Internal errors also return 0, with v->err set.
+ */
+ static int
+ matchuntil(
+ struct vars *v,
+ struct dfa *d,
+ chr *probe, /* we want to know if a match ends here */
+ struct sset **lastcss, /* state storage across calls */
+ chr **lastcp) /* state storage across calls */
+ {
+ chr *cp = *lastcp;
+ color co;
+ struct sset *css = *lastcss;
+ struct sset *ss;
+ struct colormap *cm = d->cm;
+
+ /* initialize and startup, or restart, if necessary */
+ if (cp == NULL || cp > probe) {
+ cp = v->start;
+ css = initialize(v, d, cp);
+ if (css == NULL)
+ return 0;
+
+ FDEBUG((">>> startup >>>\n"));
+ co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+
+ css = miss(v, d, css, co, cp, v->start);
+ if (css == NULL)
+ return 0;
+ css->lastseen = cp;
+ }
+ else if (css == NULL) {
+ /* we previously found that no match is possible beyond *lastcp */
+ return 0;
+ }
+ ss = css;
+
+ /*
+ * This is the main text-scanning loop. It seems worth having two copies
+ * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
+ * builds, when you're not actively tracing.
+ */
+ #ifdef REG_DEBUG
+ if (v->eflags & REG_FTRACE) {
+ while (cp < probe) {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+ else
+ #endif
+ {
+ while (cp < probe) {
+ co = GETCOLOR(cm, *cp);
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ if (ss == NULL)
+ break; /* NOTE BREAK OUT */
+ }
+ cp++;
+ ss->lastseen = cp;
+ css = ss;
+ }
+ }
+
+ *lastcss = ss;
+ *lastcp = cp;
+
+ if (ss == NULL) {
+ return 0; /* impossible match, or internal error */
+ }
+
+ /* We need to process one more chr, or the EOS symbol, to check match */
+ if (cp < v->stop) {
+ FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
+ co = GETCOLOR(cm, *cp);
+ FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
+ ss = css->outs[co];
+ if (ss == NULL) {
+ ss = miss(v, d, css, co, cp + 1, v->start);
+ }
+ } else {
+ assert(cp == v->stop);
+ co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
+ FDEBUG(("color %ld\n", (long) co));
+ ss = miss(v, d, css, co, cp, v->start);
+ }
+
+ if (ss == NULL || !(ss->flags & POSTSTATE)) {
+ return 0;
+ }
+ return 1;
+ }
+
+ /*
- lastCold - determine last point at which no progress had been made
^ static chr *lastCold(struct vars *, struct dfa *);
*/
*************** miss(
*** 593,599 ****
*/
}
! if (!sawLAConstraints) { /* lookahead conds. always cache miss */
FDEBUG(("c%d[%d]->c%d\n",
(int) (css - d->ssets), co, (int) (p - d->ssets)));
css->outs[co] = p;
--- 708,714 ----
*/
}
! if (!sawLAConstraints) { /* lookaround conds. always cache miss */
FDEBUG(("c%d[%d]->c%d\n",
(int) (css - d->ssets), co, (int) (p - d->ssets)));
css->outs[co] = p;
*************** miss(
*** 605,611 ****
}
/*
! - checkLAConstraint - lookahead-constraint checker for miss()
^ static int checkLAConstraint(struct vars *, struct cnfa *, chr *, pcolor);
*/
static int /* predicate: constraint satisfied? */
--- 720,726 ----
}
/*
! - checkLAConstraint - lookaround-constraint checker for miss()
^ static int checkLAConstraint(struct vars *, struct cnfa *, chr *, pcolor);
*/
static int /* predicate: constraint satisfied? */
*************** checkLAConstraint(
*** 613,639 ****
struct vars *const v,
struct cnfa *const pcnfa, /* parent cnfa */
chr *const cp,
! const pcolor co) /* "color" of the lookahead constraint */
{
int n;
struct subre *sub;
struct dfa *d;
- struct smalldfa sd;
chr *end;
n = co - pcnfa->ncolors;
! assert(n < v->g->nlacons && v->g->lacons != NULL);
FDEBUG(("=== testing lacon %d\n", n));
sub = &v->g->lacons[n];
! d = newDFA(v, &sub->cnfa, &v->g->cmap, &sd);
if (d == NULL) {
- ERR(REG_ESPACE);
return 0;
}
! end = longest(v, d, cp, v->stop, NULL);
! freeDFA(d);
! FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
! return (sub->subno) ? (end != NULL) : (end == NULL);
}
/*
--- 728,769 ----
struct vars *const v,
struct cnfa *const pcnfa, /* parent cnfa */
chr *const cp,
! const pcolor co) /* "color" of the lookaround constraint */
{
int n;
struct subre *sub;
struct dfa *d;
chr *end;
+ int satisfied;
n = co - pcnfa->ncolors;
! assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
FDEBUG(("=== testing lacon %d\n", n));
sub = &v->g->lacons[n];
! d = getladfa(v, n);
if (d == NULL) {
return 0;
}
! if (LATYPE_IS_AHEAD(sub->subno)) {
! /* used to use longest() here, but shortest() could be much cheaper */
! end = shortest(v, d, cp, cp, v->stop, NULL, NULL);
! satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
! } else {
! /*
! * To avoid doing O(N^2) work when repeatedly testing a lookbehind
! * constraint in an N-character string, we use matchuntil() which can
! * cache the DFA state across calls. We only need to restart if the
! * probe point decreases, which is not common. The NFA we're using is
! * a search NFA, so it doesn't mind scanning over stuff before the
! * nominal match.
! */
! satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
! if (!LATYPE_IS_POS(sub->subno)) {
! satisfied = !satisfied;
! }
! }
! FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
! return satisfied;
}
/*
diff -pcdr src/generic/regex.h lookbehind/generic/regex.h
*** src/generic/regex.h Tue Oct 27 14:39:17 2015
--- lookbehind/generic/regex.h Fri Nov 6 13:37:48 2015
*************** typedef struct {
*** 163,169 ****
size_t re_nsub; /* number of subexpressions */
long re_info; /* information about RE */
#define REG_UBACKREF 000001
! #define REG_ULOOKAHEAD 000002
#define REG_UBOUNDS 000004
#define REG_UBRACES 000010
#define REG_UBSALNUM 000020
--- 163,169 ----
size_t re_nsub; /* number of subexpressions */
long re_info; /* information about RE */
#define REG_UBACKREF 000001
! #define REG_ULOOKAROUND 000002
#define REG_UBOUNDS 000004
#define REG_UBRACES 000010
#define REG_UBSALNUM 000020
diff -pcdr src/generic/regexec.c lookbehind/generic/regexec.c
*** src/generic/regexec.c Tue Oct 20 18:38:00 2015
--- lookbehind/generic/regexec.c Fri Nov 6 13:46:36 2015
*************** struct vars {
*** 107,113 ****
chr *start; /* start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
! struct dfa **subdfas; /* per-subre DFAs */
struct smalldfa dfa1;
struct smalldfa dfa2;
};
--- 107,116 ----
chr *start; /* start of string */
chr *stop; /* just past end of string */
int err; /* error code if any (0 none) */
! struct dfa **subdfas; /* per-tree-subre DFAs */
! struct dfa **ladfas; /* per-lacon-subre DFAs */
! struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
! chr **lblastcp; /* per-lacon-subre lookbehind restart data */
struct smalldfa dfa1;
struct smalldfa dfa2;
};
*************** struct vars {
*** 127,132 ****
--- 130,136 ----
/* === regexec.c === */
int exec(regex_t *, const chr *, size_t, rm_detail_t *, size_t, regmatch_t [], int);
static struct dfa *getsubdfa(struct vars *, struct subre *);
+ static struct dfa *getladfa(struct vars *, int);
static int simpleFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFind(struct vars *const, struct cnfa *const, struct colormap *const);
static int complicatedFindLoop(struct vars *const, struct cnfa *const, struct colormap *const, struct dfa *const, struct dfa *const, chr **const);
*************** static int creviterdissect(struct vars *
*** 143,148 ****
--- 147,153 ----
/* === rege_dfa.c === */
static chr *longest(struct vars *const, struct dfa *const, chr *const, chr *const, int *const);
static chr *shortest(struct vars *const, struct dfa *const, chr *const, chr *const, chr *const, chr **const, int *const);
+ static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
static chr *lastCold(struct vars *const, struct dfa *const);
static struct dfa *newDFA(struct vars *const, struct cnfa *const, struct colormap *const, struct smalldfa *);
static void freeDFA(struct dfa *const);
*************** exec(
*** 235,255 ****
v->start = (chr *)string;
v->stop = (chr *)string + len;
v->err = 0;
assert(v->g->ntree >= 0);
n = (size_t) v->g->ntree;
! if (n <= LOCALDFAS)
v->subdfas = subdfas;
! else
v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
! if (v->subdfas == NULL) {
! if (v->pmatch != pmatch && v->pmatch != mat)
! FREE(v->pmatch);
! FreeVars(v);
! return REG_ESPACE;
}
for (i = 0; i < n; i++)
v->subdfas[i] = NULL;
/*
* Do it.
*/
--- 240,287 ----
v->start = (chr *)string;
v->stop = (chr *)string + len;
v->err = 0;
+ v->subdfas = NULL;
+ v->ladfas = NULL;
+ v->lblastcss = NULL;
+ v->lblastcp = NULL;
+ /* below this point, "goto cleanup" will behave sanely */
+
assert(v->g->ntree >= 0);
n = (size_t) v->g->ntree;
! if (n <= LOCALDFAS) {
v->subdfas = subdfas;
! } else {
v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
! if (v->subdfas == NULL) {
! st = REG_ESPACE;
! goto cleanup;
! }
}
for (i = 0; i < n; i++)
v->subdfas[i] = NULL;
+ assert(v->g->nlacons >= 0);
+ n = (size_t) v->g->nlacons;
+ if (n > 0) {
+ v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
+ if (v->ladfas == NULL) {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++)
+ v->ladfas[i] = NULL;
+ v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
+ v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
+ if (v->lblastcss == NULL || v->lblastcp == NULL) {
+ st = REG_ESPACE;
+ goto cleanup;
+ }
+ for (i = 0; i < n; i++) {
+ v->lblastcss[i] = NULL;
+ v->lblastcp[i] = NULL;
+ }
+ }
+
/*
* Do it.
*/
*************** exec(
*** 274,296 ****
/*
* Clean up.
*/
!
if (v->pmatch != pmatch && v->pmatch != mat) {
FREE(v->pmatch);
}
! n = (size_t) v->g->ntree;
! for (i = 0; i < n; i++) {
! if (v->subdfas[i] != NULL)
! freeDFA(v->subdfas[i]);
}
! if (v->subdfas != subdfas)
! FREE(v->subdfas);
FreeVars(v);
return st;
}
/*
! - getsubdfa - create or re-fetch the DFA for a subre node
* We only need to create the DFA once per overall regex execution.
* The DFA will be freed by the cleanup step in exec().
*/
--- 306,342 ----
/*
* Clean up.
*/
! cleanup:
if (v->pmatch != pmatch && v->pmatch != mat) {
FREE(v->pmatch);
}
! if (v->subdfas != NULL) {
! n = (size_t) v->g->ntree;
! for (i = 0; i < n; i++) {
! if (v->subdfas[i] != NULL)
! freeDFA(v->subdfas[i]);
! }
! if (v->subdfas != subdfas)
! FREE(v->subdfas);
}
! if (v->ladfas != NULL) {
! n = (size_t) v->g->nlacons;
! for (i = 0; i < n; i++) {
! if (v->ladfas[i] != NULL)
! freeDFA(v->ladfas[i]);
! }
! FREE(v->ladfas);
! }
! if (v->lblastcss != NULL)
! FREE(v->lblastcss);
! if (v->lblastcp != NULL)
! FREE(v->lblastcp);
FreeVars(v);
return st;
}
/*
! - getsubdfa - create or re-fetch the DFA for a tree subre node
* We only need to create the DFA once per overall regex execution.
* The DFA will be freed by the cleanup step in exec().
*/
*************** getsubdfa(struct vars * v,
*** 307,312 ****
--- 353,377 ----
}
/*
+ - getladfa - create or re-fetch the DFA for a LACON subre node
+ * Same as above, but for LACONs.
+ */
+ static struct dfa *
+ getladfa(struct vars *v,
+ int n)
+ {
+ assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
+ if (v->ladfas[n] == NULL) {
+ struct subre *sub = &v->g->lacons[n];
+
+ v->ladfas[n] = newDFA(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
+ if (ISERR())
+ return NULL;
+ }
+ return v->ladfas[n];
+ }
+
+ /*
- simpleFind - find a match for the main NFA (no-complications case)
^ static int simpleFind(struct vars *, struct cnfa *, struct colormap *);
*/
diff -pcdr src/generic/regguts.h lookbehind/generic/regguts.h
*** src/generic/regguts.h Tue Oct 27 14:39:17 2015
--- lookbehind/generic/regguts.h Fri Nov 6 13:40:57 2015
***************
*** 96,108 ****
*/
#define NOTREACHED 0
- #define xxx 1
#define DUPMAX _POSIX2_RE_DUP_MAX
#define DUPINF (DUPMAX+1)
#define REMAGIC 0xfed7 /* magic number for main struct */
/*
* debugging facilities
*/
--- 96,115 ----
*/
#define NOTREACHED 0
#define DUPMAX _POSIX2_RE_DUP_MAX
#define DUPINF (DUPMAX+1)
#define REMAGIC 0xfed7 /* magic number for main struct */
+ /* Type codes for lookaround constraints: bit 01 = positive, bit 02 = lookahead */
+ #define LATYPE_AHEAD_POS 03 /* positive lookahead */
+ #define LATYPE_AHEAD_NEG 02 /* negative lookahead */
+ #define LATYPE_BEHIND_POS 01 /* positive lookbehind */
+ #define LATYPE_BEHIND_NEG 00 /* negative lookbehind */
+ #define LATYPE_IS_POS(la) ((la) & 01) /* nonzero for positive constraints */
+ #define LATYPE_IS_AHEAD(la) ((la) & 02) /* nonzero for lookahead constraints */
+
/*
* debugging facilities
*/
*************** struct nfa {
*** 311,317 ****
*
* The non-dummy carc structs are of two types: plain arcs and LACON arcs.
* Plain arcs just store the transition color number as "co". LACON arcs
! * store the lookahead constraint number plus cnfa.ncolors as "co". LACON
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
*/
--- 318,324 ----
*
* The non-dummy carc structs are of two types: plain arcs and LACON arcs.
* Plain arcs just store the transition color number as "co". LACON arcs
! * store the lookaround constraint number plus cnfa.ncolors as "co". LACON
* arcs can be distinguished from plain by testing for co >= cnfa.ncolors.
*/
*************** struct cnfa {
*** 324,330 ****
int nstates; /* number of states */
int ncolors; /* number of colors */
int flags;
! #define HASLACONS 01 /* uses lookahead constraints */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
--- 331,337 ----
int nstates; /* number of states */
int ncolors; /* number of colors */
int flags;
! #define HASLACONS 01 /* uses lookaround constraints */
int pre; /* setup state number */
int post; /* teardown state number */
color bos[2]; /* colors, if any, assigned to BOS and BOL */
*************** struct subre {
*** 391,397 ****
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short id; /* ID of subre (1..ntree-1) */
! int subno; /* subexpression number (for 'b' and '(') */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
--- 398,405 ----
#define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
#define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2))
short id; /* ID of subre (1..ntree-1) */
! int subno; /* subexpression number for 'b' and '(', or
! * LATYPE code for lookaround constraint */
short min; /* min repetitions for iteration or backref */
short max; /* max repetitions for iteration or backref */
struct subre *left; /* left child, if any (also freelist chain) */
*************** struct guts {
*** 426,433 ****
int ntree; /* number of subre's, plus one */
struct colormap cmap;
int FUNCPTR(compare, (const chr *, const chr *, size_t));
! struct subre *lacons; /* lookahead-constraint vector */
! int nlacons; /* size of lacons */
};
/*
--- 434,442 ----
int ntree; /* number of subre's, plus one */
struct colormap cmap;
int FUNCPTR(compare, (const chr *, const chr *, size_t));
! struct subre *lacons; /* lookaround-constraint vector */
! int nlacons; /* size of lacons[]; note that only slots
! * numbered 1 .. nlacons-1 are used */
};
/*
diff -pcdr src/generic/tclRegexp.c lookbehind/generic/tclRegexp.c
*** src/generic/tclRegexp.c Mon Sep 21 18:12:24 2015
--- lookbehind/generic/tclRegexp.c Fri Nov 6 13:43:13 2015
*************** TclRegAbout(
*** 638,644 ****
};
static const struct infoname infonames[] = {
{REG_UBACKREF, "REG_UBACKREF"},
! {REG_ULOOKAHEAD, "REG_ULOOKAHEAD"},
{REG_UBOUNDS, "REG_UBOUNDS"},
{REG_UBRACES, "REG_UBRACES"},
{REG_UBSALNUM, "REG_UBSALNUM"},
--- 638,644 ----
};
static const struct infoname infonames[] = {
{REG_UBACKREF, "REG_UBACKREF"},
! {REG_ULOOKAROUND, "REG_ULOOKAROUND"},
{REG_UBOUNDS, "REG_UBOUNDS"},
{REG_UBRACES, "REG_UBRACES"},
{REG_UBSALNUM, "REG_UBSALNUM"},
diff -pcdr src/tests/reg.test lookbehind/tests/reg.test
*** src/tests/reg.test Tue Oct 27 14:39:17 2015
--- lookbehind/tests/reg.test Fri Nov 6 14:08:51 2015
*************** namespace eval RETest {
*** 111,117 ****
A REG_UBSALNUM
B REG_UBRACES
E REG_UBBS
! H REG_ULOOKAHEAD
I REG_UIMPOSSIBLE
L REG_ULOCALE
M REG_UUNPORT
--- 111,117 ----
A REG_UBSALNUM
B REG_UBRACES
E REG_UBBS
! H REG_ULOOKAROUND
I REG_UIMPOSSIBLE
L REG_ULOCALE
M REG_UUNPORT
*************** expectMatch 22.21 &+L {a[^b]} ach ach
*** 831,837 ****
expectNomatch 22.22 &+L {a[^b]} abe
! doing 23 "lookahead constraints"
expectMatch 23.1 HP a(?=b)b* ab ab
expectNomatch 23.2 HP a(?=b)b* a
expectMatch 23.3 HP a(?=b)b*(?=c)c* abc abc
--- 831,838 ----
expectNomatch 22.22 &+L {a[^b]} abe
! doing 23 "lookaround constraints"
! # lookahead
expectMatch 23.1 HP a(?=b)b* ab ab
expectNomatch 23.2 HP a(?=b)b* a
expectMatch 23.3 HP a(?=b)b*(?=c)c* abc abc
*************** expectNomatch 23.5 HP a(?!b)b* ab
*** 840,845 ****
--- 841,869 ----
expectMatch 23.6 HP a(?!b)b* a a
expectMatch 23.7 HP (?=b)b b b
expectNomatch 23.8 HP (?=b)b a
+ # lookbehind
+ expectMatch 23.9 HNP (?<=a)b* abb bb
+ expectMatch 23.10 HP a(?<=a)b* a a
+ expectMatch 23.11 HP a(?<=a)b*(?<=b)c* abc abc
+ expectMatch 23.12 HP a(?<=a)b*(?<=b)c* ab ab
+ expectMatch 23.13 HNP a*(?<!a)b* ab {}
+ expectNomatch 23.14 HP a*(?<!a)b+ ab
+ expectMatch 23.15 HP a*(?<!a)b+ b b
+ expectNomatch 23.16 HIP a(?<!a)b* a
+ expectNomatch 23.17 HP (?<=b)b b
+ expectNomatch 23.18 HP (?<=f)b+ foobar
+ expectMatch 23.19 HP (?<=foo)b+ foobar b
+ expectMatch 23.20 HP (?<=oo)b+ foobar b
+ # Test optimization of single-chr-or-bracket-expression lookaround constraints
+ expectNomatch 23.21 HP {x(?=[xy])} xz
+ expectMatch 23.22 HP {x(?=[xy])} xy x
+ expectMatch 23.23 HP {x(?![xy])} xz x
+ expectNomatch 23.24 HP {x(?![xy])} xy
+ expectMatch 23.25 HP {x(?![xy])} x x
+ expectMatch 23.26 HP {(?<=[xy])yy+} xyy yy
+ expectNomatch 23.27 HP {(?<=[xy])yy+} zyy
+ expectNomatch 23.28 HP {(?<![xy])yy+} xyy
+ expectMatch 23.29 HP {(?<![xy])yy+} zyy yy
doing 24 "non-greedy quantifiers"