Tcl Source Code

Check-in [3eca80b890]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to tclconference@googlegroups.com
or submit via the online form by Sep 9.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change regexp C API to handle indexes > 2G. New TIP upcoming.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | regexp-api-64bit
Files: files | file ages | folders
SHA3-256:3eca80b890089c518b6b25d52a7ad79ed6e403e490348d9c7fd1b62ad82c21ee
User & Date: jan.nijtmans 2019-03-29 09:41:05
Context
2019-03-29
12:54
More internal use of size_t check-in: 305e64b47b user: jan.nijtmans tags: regexp-api-64bit
09:41
Change regexp C API to handle indexes > 2G. New TIP upcoming. check-in: 3eca80b890 user: jan.nijtmans tags: regexp-api-64bit
2019-03-28
21:24
Merge 8.7 check-in: 81d2077e6a user: jan.nijtmans tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/RegExp.3.

333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
...
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
\fBTcl_RegExpGetInfo\fR retrieves information about the last match
performed with a given regular expression \fIregexp\fR.  The
\fIinfoPtr\fR argument contains a pointer to a structure that is
defined as follows:
.PP
.CS
typedef struct Tcl_RegExpInfo {
    int \fInsubs\fR;
    Tcl_RegExpIndices *\fImatches\fR;
    long \fIextendStart\fR;
} \fBTcl_RegExpInfo\fR;
.CE
.PP
The \fInsubs\fR field contains a count of the number of parenthesized
subexpressions within the regular expression.  If the \fBTCL_REG_NOSUB\fR
was used, then this value will be zero.  The \fImatches\fR field
points to an array of \fInsubs\fR+1 values that indicate the bounds of each
................................................................................
range matched by the entire regular expression, and subsequent elements
refer to the parenthesized subexpressions in the order that they
appear in the pattern.  Each element is a structure that is defined as
follows:
.PP
.CS
typedef struct Tcl_RegExpIndices {
    long \fIstart\fR;
    long \fIend\fR;
} \fBTcl_RegExpIndices\fR;
.CE
.PP
The \fIstart\fR and \fIend\fR values are Unicode character indices
relative to the offset location within \fIobjPtr\fR where matching began.
The \fIstart\fR index identifies the first character of the matched
subexpression.  The \fIend\fR index identifies the first character







|

|







 







|
|







333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
...
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
\fBTcl_RegExpGetInfo\fR retrieves information about the last match
performed with a given regular expression \fIregexp\fR.  The
\fIinfoPtr\fR argument contains a pointer to a structure that is
defined as follows:
.PP
.CS
typedef struct Tcl_RegExpInfo {
    size_t \fInsubs\fR;
    Tcl_RegExpIndices *\fImatches\fR;
    size_t \fIextendStart\fR;
} \fBTcl_RegExpInfo\fR;
.CE
.PP
The \fInsubs\fR field contains a count of the number of parenthesized
subexpressions within the regular expression.  If the \fBTCL_REG_NOSUB\fR
was used, then this value will be zero.  The \fImatches\fR field
points to an array of \fInsubs\fR+1 values that indicate the bounds of each
................................................................................
range matched by the entire regular expression, and subsequent elements
refer to the parenthesized subexpressions in the order that they
appear in the pattern.  Each element is a structure that is defined as
follows:
.PP
.CS
typedef struct Tcl_RegExpIndices {
    size_t \fIstart\fR;
    size_t \fIend\fR;
} \fBTcl_RegExpIndices\fR;
.CE
.PP
The \fIstart\fR and \fIend\fR values are Unicode character indices
relative to the offset location within \fIobjPtr\fR where matching began.
The \fIstart\fR index identifies the first character of the matched
subexpression.  The \fIend\fR index identifies the first character

Changes to generic/regcomp.c.

334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
....
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
    v->cv = NULL;
    v->cv2 = NULL;
    v->lacons = NULL;
    v->nlacons = 0;
    v->spaceused = 0;
    re->re_magic = REMAGIC;
    re->re_info = 0;		/* bits get set during parse */
    re->re_csize = sizeof(chr);
    re->re_guts = NULL;
    re->re_fns = (void*)(&functions);

    /*
     * More complex setup, malloced things.
     */

................................................................................
    g = (struct guts *) re->re_guts;
    if (g->magic != GUTSMAGIC) {
	fprintf(f, "bad guts magic number (0x%x not 0x%x)\n",
		g->magic, GUTSMAGIC);
    }

    fprintf(f, "\n\n\n========= DUMP ==========\n");
    fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
	    (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);

    dumpcolors(&g->cmap, f);
    if (!NULLCNFA(g->search)) {
	fprintf(f, "\nsearch:\n");
	dumpcnfa(&g->search, f);
    }
    for (i = 1; i < g->nlacons; i++) {







<







 







|
|







334
335
336
337
338
339
340

341
342
343
344
345
346
347
....
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
    v->cv = NULL;
    v->cv2 = NULL;
    v->lacons = NULL;
    v->nlacons = 0;
    v->spaceused = 0;
    re->re_magic = REMAGIC;
    re->re_info = 0;		/* bits get set during parse */

    re->re_guts = NULL;
    re->re_fns = (void*)(&functions);

    /*
     * More complex setup, malloced things.
     */

................................................................................
    g = (struct guts *) re->re_guts;
    if (g->magic != GUTSMAGIC) {
	fprintf(f, "bad guts magic number (0x%x not 0x%x)\n",
		g->magic, GUTSMAGIC);
    }

    fprintf(f, "\n\n\n========= DUMP ==========\n");
    fprintf(f, "nsub %" TCL_Z_MODIFIER "d, info 0%lo, ntree %d\n",
	    re->re_nsub, re->re_info, g->ntree);

    dumpcolors(&g->cmap, f);
    if (!NULLCNFA(g->search)) {
	fprintf(f, "\nsearch:\n");
	dumpcnfa(&g->search, f);
    }
    for (i = 1; i < g->nlacons; i++) {

Changes to generic/regex.h.

113
114
115
116
117
118
119
120
121

122
123
124
125
126
127
128
...
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
/*
 * other interface types
 */

/* the biggie, a compiled RE (or rather, a front end to same) */
typedef struct {
    int re_magic;		/* magic number */
    size_t re_nsub;		/* number of subexpressions */
    long re_info;		/* information about RE */

#define	REG_UBACKREF		000001
#define	REG_ULOOKAHEAD		000002
#define	REG_UBOUNDS		000004
#define	REG_UBRACES		000010
#define	REG_UBSALNUM		000020
#define	REG_UPBOTCH		000040
#define	REG_UBBS		000100
................................................................................
#define	REG_UNONPOSIX		000200
#define	REG_UUNSPEC		000400
#define	REG_UUNPORT		001000
#define	REG_ULOCALE		002000
#define	REG_UEMPTYMATCH		004000
#define	REG_UIMPOSSIBLE		010000
#define	REG_USHORTEST		020000
    int re_csize;		/* sizeof(character) */
    char *re_endp;		/* backward compatibility kludge */
    /* the rest is opaque pointers to hidden innards */
    char *re_guts;		/* `char *' is more portable than `void *' */
    char *re_fns;
} regex_t;

/* result reporting (may acquire more fields later) */
typedef struct {
    long rm_so;		/* start of substring */
    long rm_eo;		/* end of substring */
} regmatch_t;

/* supplementary control and reporting */
typedef struct {
    regmatch_t rm_extend;	/* see REG_EXPECT */
} rm_detail_t;








<

>







 







<








|
|







113
114
115
116
117
118
119

120
121
122
123
124
125
126
127
128
...
129
130
131
132
133
134
135

136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
/*
 * other interface types
 */

/* the biggie, a compiled RE (or rather, a front end to same) */
typedef struct {
    int re_magic;		/* magic number */

    long re_info;		/* information about RE */
    size_t re_nsub;		/* number of subexpressions */
#define	REG_UBACKREF		000001
#define	REG_ULOOKAHEAD		000002
#define	REG_UBOUNDS		000004
#define	REG_UBRACES		000010
#define	REG_UBSALNUM		000020
#define	REG_UPBOTCH		000040
#define	REG_UBBS		000100
................................................................................
#define	REG_UNONPOSIX		000200
#define	REG_UUNSPEC		000400
#define	REG_UUNPORT		001000
#define	REG_ULOCALE		002000
#define	REG_UEMPTYMATCH		004000
#define	REG_UIMPOSSIBLE		010000
#define	REG_USHORTEST		020000

    char *re_endp;		/* backward compatibility kludge */
    /* the rest is opaque pointers to hidden innards */
    char *re_guts;		/* `char *' is more portable than `void *' */
    char *re_fns;
} regex_t;

/* result reporting (may acquire more fields later) */
typedef struct {
    size_t rm_so;		/* start of substring */
    size_t rm_eo;		/* end of substring */
} regmatch_t;

/* supplementary control and reporting */
typedef struct {
    regmatch_t rm_extend;	/* see REG_EXPECT */
} rm_detail_t;

Changes to generic/regexec.c.

183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
...
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
     * Sanity checks.
     */

    if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
	FreeVars(v);
	return REG_INVARG;
    }
    if (re->re_csize != sizeof(chr)) {
	FreeVars(v);
	return REG_MIXED;
    }

    /*
     * Setup.
     */

    v->re = re;
    v->g = (struct guts *)re->re_guts;
................................................................................
    assert(t->op == 'b');
    assert(n >= 0);
    assert((size_t)n < v->nmatch);

    MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));

    /* get the backreferenced string */
    if (v->pmatch[n].rm_so == -1) {
	return REG_NOMATCH;
    }
    brstring = v->start + v->pmatch[n].rm_so;
    brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;

    /* special cases for zero-length strings */
    if (brlen == 0) {







<
<
<
<







 







|







183
184
185
186
187
188
189




190
191
192
193
194
195
196
...
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
     * Sanity checks.
     */

    if (re == NULL || string == NULL || re->re_magic != REMAGIC) {
	FreeVars(v);
	return REG_INVARG;
    }





    /*
     * Setup.
     */

    v->re = re;
    v->g = (struct guts *)re->re_guts;
................................................................................
    assert(t->op == 'b');
    assert(n >= 0);
    assert((size_t)n < v->nmatch);

    MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));

    /* get the backreferenced string */
    if (v->pmatch[n].rm_so == (size_t)-1) {
	return REG_NOMATCH;
    }
    brstring = v->start + v->pmatch[n].rm_so;
    brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;

    /* special cases for zero-length strings */
    if (brlen == 0) {

Changes to generic/tcl.h.

455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
/*
 * Structures filled in by Tcl_RegExpInfo. Note that all offset values are
 * relative to the start of the match string, not the beginning of the entire
 * string.
 */

typedef struct Tcl_RegExpIndices {
    long start;			/* Character offset of first character in
				 * match. */
    long end;			/* Character offset of first character after
				 * the match. */
} Tcl_RegExpIndices;

typedef struct Tcl_RegExpInfo {
    int nsubs;			/* Number of subexpressions in the compiled
				 * expression. */
    Tcl_RegExpIndices *matches;	/* Array of nsubs match offset pairs. */
    long extendStart;		/* The offset at which a subsequent match
				 * might begin. */
    long reserved;		/* Reserved for later use. */
} Tcl_RegExpInfo;

/*
 * Picky compilers complain if this typdef doesn't appear before the struct's
 * reference in tclDecls.h.
 */








|

|




|


|

<







455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473

474
475
476
477
478
479
480
/*
 * Structures filled in by Tcl_RegExpInfo. Note that all offset values are
 * relative to the start of the match string, not the beginning of the entire
 * string.
 */

typedef struct Tcl_RegExpIndices {
    size_t start;			/* Character offset of first character in
				 * match. */
    size_t end;			/* Character offset of first character after
				 * the match. */
} Tcl_RegExpIndices;

typedef struct Tcl_RegExpInfo {
    size_t nsubs;			/* Number of subexpressions in the compiled
				 * expression. */
    Tcl_RegExpIndices *matches;	/* Array of nsubs match offset pairs. */
    size_t extendStart;		/* The offset at which a subsequent match
				 * might begin. */

} Tcl_RegExpInfo;

/*
 * Picky compilers complain if this typdef doesn't appear before the struct's
 * reference in tclDecls.h.
 */

Changes to generic/tclCmdMZ.c.

123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
...
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
...
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
...
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
...
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
...
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
...
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
...
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
....
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
int
Tcl_RegexpObjCmd(
    ClientData dummy,		/* Not used. */
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    size_t offset;
    int i, indices, match, about, all, doinline, numMatchesSaved;
    int cflags, eflags, stringLength, matchLength;
    Tcl_RegExp regExpr;
    Tcl_Obj *objPtr, *startIndex = NULL, *resultPtr = NULL;
    Tcl_RegExpInfo info;
    static const char *const options[] = {
	"-all",		"-about",	"-indices",	"-inline",
	"-expanded",	"-line",	"-linestop",	"-lineanchor",
	"-nocase",	"-start",	"--",		NULL
................................................................................
	 * considered the start of the line. If for example the pattern {^} is
	 * passed and -start is positive, then the pattern will not match the
	 * start of the string unless the previous character is a newline.
	 */

	if (offset == TCL_INDEX_START) {
	    eflags = 0;
	} else if (offset + 1 > (size_t)stringLength + 1) {
	    eflags = TCL_REG_NOTBOL;
	} else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') {
	    eflags = 0;
	} else {
	    eflags = TCL_REG_NOTBOL;
	}

................................................................................
		Tcl_Obj *objs[2];

		/*
		 * Only adjust the match area if there was a match for that
		 * area. (Scriptics Bug 4391/SF Bug #219232)
		 */

		if (i <= info.nsubs && info.matches[i].start >= 0) {
		    start = offset + info.matches[i].start;
		    end = offset + info.matches[i].end;

		    /*
		     * Adjust index so it refers to the last character in the
		     * match instead of the first character after the match.
		     */
................................................................................
		}

		objs[0] = Tcl_NewWideIntObj(start);
		objs[1] = Tcl_NewWideIntObj(end);

		newPtr = Tcl_NewListObj(2, objs);
	    } else {
		if (i <= info.nsubs) {
		    newPtr = Tcl_GetRange(objPtr,
			    offset + info.matches[i].start,
			    offset + info.matches[i].end - 1);
		} else {
		    newPtr = Tcl_NewObj();
		}
	    }
................................................................................
	 * these cases we always want to bump the index up one.
	 */

	if (matchLength == 0) {
	    offset++;
	}
	all++;
	if (offset + 1 >= (size_t)stringLength + 1) {
	    break;
	}
    }

    /*
     * Set the interpreter's object result to an integer object with value 1
     * if -all wasn't specified, otherwise it's all-1 (the number of times
................................................................................
	    int numArgs;

	    Tcl_ListObjGetElements(interp, subPtr, &numParts, &parts);
	    numArgs = numParts + info.nsubs + 1;
	    args = Tcl_Alloc(sizeof(Tcl_Obj*) * numArgs);
	    memcpy(args, parts, sizeof(Tcl_Obj*) * numParts);

	    for (idx = 0 ; idx <= info.nsubs ; idx++) {
		subStart = info.matches[idx].start;
		subEnd = info.matches[idx].end;
		if ((subStart >= 0) && (subEnd >= 0)) {
		    args[idx + numParts] = Tcl_NewUnicodeObj(
			    wstring + offset + subStart, subEnd - subStart);
		} else {
		    args[idx + numParts] = Tcl_NewObj();
................................................................................
	     * because Tcl_EvalObjv is "hairy monster" in terms of refcount
	     * handling, being able to optionally add references to any of its
	     * argument words. We'll drop the local refs immediately
	     * afterwards; subPtr is handled in the main exit stanza.
	     */

	    result = Tcl_EvalObjv(interp, numArgs, args, 0);
	    for (idx = 0 ; idx <= info.nsubs ; idx++) {
		TclDecrRefCount(args[idx + numParts]);
	    }
	    Tcl_Free(args);
	    if (result != TCL_OK) {
		if (result == TCL_ERROR) {
		    Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf(
			    "\n    (%s substitution computation script)",
................................................................................
	    }

	    if (wfirstChar != wsrc) {
		Tcl_AppendUnicodeToObj(resultPtr, wfirstChar,
			wsrc - wfirstChar);
	    }

	    if (idx <= info.nsubs) {
		subStart = info.matches[idx].start;
		subEnd = info.matches[idx].end;
		if ((subStart >= 0) && (subEnd >= 0)) {
		    Tcl_AppendUnicodeToObj(resultPtr,
			    wstring + offset + subStart, subEnd - subStart);
		}
	    }
................................................................................
	} else {
	    matchesObj = NULL;
	}
	if (indexVarObj != NULL) {
	    TclNewObj(indicesObj);
	}

	for (j=0 ; j<=info.nsubs ; j++) {
	    if (indexVarObj != NULL) {
		Tcl_Obj *rangeObjAry[2];

		if (info.matches[j].end > 0) {
		    rangeObjAry[0] = Tcl_NewWideIntObj(info.matches[j].start);
		    rangeObjAry[1] = Tcl_NewWideIntObj(info.matches[j].end-1);
		} else {
		    rangeObjAry[0] = rangeObjAry[1] = Tcl_NewWideIntObj(-1);
		}

		/*
		 * Never fails; the object is always clean at this point.
		 */







|

|







 







|







 







|







 







|







 







|







 







|







 







|







 







|







 







|



|
|
|







123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
...
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
...
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
...
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
...
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
...
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
...
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
...
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
....
3730
3731
3732
3733
3734
3735
3736
3737
3738
3739
3740
3741
3742
3743
3744
3745
3746
3747
3748
3749
3750
int
Tcl_RegexpObjCmd(
    ClientData dummy,		/* Not used. */
    Tcl_Interp *interp,		/* Current interpreter. */
    int objc,			/* Number of arguments. */
    Tcl_Obj *const objv[])	/* Argument objects. */
{
    size_t offset, stringLength, matchLength;
    int i, indices, match, about, all, doinline, numMatchesSaved;
    int cflags, eflags;
    Tcl_RegExp regExpr;
    Tcl_Obj *objPtr, *startIndex = NULL, *resultPtr = NULL;
    Tcl_RegExpInfo info;
    static const char *const options[] = {
	"-all",		"-about",	"-indices",	"-inline",
	"-expanded",	"-line",	"-linestop",	"-lineanchor",
	"-nocase",	"-start",	"--",		NULL
................................................................................
	 * considered the start of the line. If for example the pattern {^} is
	 * passed and -start is positive, then the pattern will not match the
	 * start of the string unless the previous character is a newline.
	 */

	if (offset == TCL_INDEX_START) {
	    eflags = 0;
	} else if (offset + 1 > stringLength + 1) {
	    eflags = TCL_REG_NOTBOL;
	} else if (Tcl_GetUniChar(objPtr, offset-1) == '\n') {
	    eflags = 0;
	} else {
	    eflags = TCL_REG_NOTBOL;
	}

................................................................................
		Tcl_Obj *objs[2];

		/*
		 * Only adjust the match area if there was a match for that
		 * area. (Scriptics Bug 4391/SF Bug #219232)
		 */

		if (i <= (int)info.nsubs && info.matches[i].start != TCL_INDEX_NONE) {
		    start = offset + info.matches[i].start;
		    end = offset + info.matches[i].end;

		    /*
		     * Adjust index so it refers to the last character in the
		     * match instead of the first character after the match.
		     */
................................................................................
		}

		objs[0] = Tcl_NewWideIntObj(start);
		objs[1] = Tcl_NewWideIntObj(end);

		newPtr = Tcl_NewListObj(2, objs);
	    } else {
		if (i <= (int)info.nsubs) {
		    newPtr = Tcl_GetRange(objPtr,
			    offset + info.matches[i].start,
			    offset + info.matches[i].end - 1);
		} else {
		    newPtr = Tcl_NewObj();
		}
	    }
................................................................................
	 * these cases we always want to bump the index up one.
	 */

	if (matchLength == 0) {
	    offset++;
	}
	all++;
	if (offset + 1 >= stringLength + 1) {
	    break;
	}
    }

    /*
     * Set the interpreter's object result to an integer object with value 1
     * if -all wasn't specified, otherwise it's all-1 (the number of times
................................................................................
	    int numArgs;

	    Tcl_ListObjGetElements(interp, subPtr, &numParts, &parts);
	    numArgs = numParts + info.nsubs + 1;
	    args = Tcl_Alloc(sizeof(Tcl_Obj*) * numArgs);
	    memcpy(args, parts, sizeof(Tcl_Obj*) * numParts);

	    for (idx = 0 ; idx <= (int)info.nsubs ; idx++) {
		subStart = info.matches[idx].start;
		subEnd = info.matches[idx].end;
		if ((subStart >= 0) && (subEnd >= 0)) {
		    args[idx + numParts] = Tcl_NewUnicodeObj(
			    wstring + offset + subStart, subEnd - subStart);
		} else {
		    args[idx + numParts] = Tcl_NewObj();
................................................................................
	     * because Tcl_EvalObjv is "hairy monster" in terms of refcount
	     * handling, being able to optionally add references to any of its
	     * argument words. We'll drop the local refs immediately
	     * afterwards; subPtr is handled in the main exit stanza.
	     */

	    result = Tcl_EvalObjv(interp, numArgs, args, 0);
	    for (idx = 0 ; idx <= (int)info.nsubs ; idx++) {
		TclDecrRefCount(args[idx + numParts]);
	    }
	    Tcl_Free(args);
	    if (result != TCL_OK) {
		if (result == TCL_ERROR) {
		    Tcl_AppendObjToErrorInfo(interp, Tcl_ObjPrintf(
			    "\n    (%s substitution computation script)",
................................................................................
	    }

	    if (wfirstChar != wsrc) {
		Tcl_AppendUnicodeToObj(resultPtr, wfirstChar,
			wsrc - wfirstChar);
	    }

	    if (idx <= (int)info.nsubs) {
		subStart = info.matches[idx].start;
		subEnd = info.matches[idx].end;
		if ((subStart >= 0) && (subEnd >= 0)) {
		    Tcl_AppendUnicodeToObj(resultPtr,
			    wstring + offset + subStart, subEnd - subStart);
		}
	    }
................................................................................
	} else {
	    matchesObj = NULL;
	}
	if (indexVarObj != NULL) {
	    TclNewObj(indicesObj);
	}

	for (j=0 ; j<=(int)info.nsubs ; j++) {
	    if (indexVarObj != NULL) {
		Tcl_Obj *rangeObjAry[2];

		if (info.matches[j].end + 1 > 1) {
		    rangeObjAry[0] = TclNewWideIntObjFromSize(info.matches[j].start);
		    rangeObjAry[1] = TclNewWideIntObjFromSize(info.matches[j].end-1);
		} else {
		    rangeObjAry[0] = rangeObjAry[1] = Tcl_NewWideIntObj(-1);
		}

		/*
		 * Never fails; the object is always clean at this point.
		 */

Changes to generic/tclInt.decls.

610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625

# Added for Tcl 8.2

declare 150 {
    int TclRegAbout(Tcl_Interp *interp, Tcl_RegExp re)
}
declare 151 {
    void TclRegExpRangeUniChar(Tcl_RegExp re, size_t index, int *startPtr,
	    int *endPtr)
}
declare 152 {
    void TclSetLibraryPath(Tcl_Obj *pathPtr)
}
declare 153 {
    Tcl_Obj *TclGetLibraryPath(void)
}







|
|







610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625

# Added for Tcl 8.2

declare 150 {
    int TclRegAbout(Tcl_Interp *interp, Tcl_RegExp re)
}
declare 151 {
    void TclRegExpRangeUniChar(Tcl_RegExp re, size_t index, size_t *startPtr,
	    size_t *endPtr)
}
declare 152 {
    void TclSetLibraryPath(Tcl_Obj *pathPtr)
}
declare 153 {
    Tcl_Obj *TclGetLibraryPath(void)
}

Changes to generic/tclInt.h.

4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
 */

#if (!defined(TCL_WIDE_INT_IS_LONG) || (LONG_MAX > UINT_MAX)) && (SIZE_MAX <= UINT_MAX)
#   define TclWideIntFromSize(value)	(((Tcl_WideInt)(((size_t)(value))+1))-1)
#   define TclNewWideIntObjFromSize(value) \
	Tcl_NewWideIntObj(TclWideIntFromSize(value))
#else
#   define TclWideIntFromSize(value)	(value)
#   define TclNewWideIntObjFromSize Tcl_NewWideIntObj
#endif

/*
 * Support for Clang Static Analyzer <http://clang-analyzer.llvm.org>
 */








|







4932
4933
4934
4935
4936
4937
4938
4939
4940
4941
4942
4943
4944
4945
4946
 */

#if (!defined(TCL_WIDE_INT_IS_LONG) || (LONG_MAX > UINT_MAX)) && (SIZE_MAX <= UINT_MAX)
#   define TclWideIntFromSize(value)	(((Tcl_WideInt)(((size_t)(value))+1))-1)
#   define TclNewWideIntObjFromSize(value) \
	Tcl_NewWideIntObj(TclWideIntFromSize(value))
#else
#   define TclWideIntFromSize(value)	((Tcl_WideInt)(value))
#   define TclNewWideIntObjFromSize Tcl_NewWideIntObj
#endif

/*
 * Support for Clang Static Analyzer <http://clang-analyzer.llvm.org>
 */

Changes to generic/tclIntDecls.h.

333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
...
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
EXTERN TclHandle	TclHandlePreserve(TclHandle handle);
/* 149 */
EXTERN void		TclHandleRelease(TclHandle handle);
/* 150 */
EXTERN int		TclRegAbout(Tcl_Interp *interp, Tcl_RegExp re);
/* 151 */
EXTERN void		TclRegExpRangeUniChar(Tcl_RegExp re, size_t index,
				int *startPtr, int *endPtr);
/* 152 */
EXTERN void		TclSetLibraryPath(Tcl_Obj *pathPtr);
/* 153 */
EXTERN Tcl_Obj *	TclGetLibraryPath(void);
/* Slot 154 is reserved */
/* Slot 155 is reserved */
/* 156 */
................................................................................
    void (*tclHideLiteral) (Tcl_Interp *interp, struct CompileEnv *envPtr, int index); /* 144 */
    const struct AuxDataType * (*tclGetAuxDataType) (const char *typeName); /* 145 */
    TclHandle (*tclHandleCreate) (void *ptr); /* 146 */
    void (*tclHandleFree) (TclHandle handle); /* 147 */
    TclHandle (*tclHandlePreserve) (TclHandle handle); /* 148 */
    void (*tclHandleRelease) (TclHandle handle); /* 149 */
    int (*tclRegAbout) (Tcl_Interp *interp, Tcl_RegExp re); /* 150 */
    void (*tclRegExpRangeUniChar) (Tcl_RegExp re, size_t index, int *startPtr, int *endPtr); /* 151 */
    void (*tclSetLibraryPath) (Tcl_Obj *pathPtr); /* 152 */
    Tcl_Obj * (*tclGetLibraryPath) (void); /* 153 */
    void (*reserved154)(void);
    void (*reserved155)(void);
    void (*tclRegError) (Tcl_Interp *interp, const char *msg, int status); /* 156 */
    Var * (*tclVarTraceExists) (Tcl_Interp *interp, const char *varName); /* 157 */
    void (*reserved158)(void);







|







 







|







333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
...
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
EXTERN TclHandle	TclHandlePreserve(TclHandle handle);
/* 149 */
EXTERN void		TclHandleRelease(TclHandle handle);
/* 150 */
EXTERN int		TclRegAbout(Tcl_Interp *interp, Tcl_RegExp re);
/* 151 */
EXTERN void		TclRegExpRangeUniChar(Tcl_RegExp re, size_t index,
				size_t *startPtr, size_t *endPtr);
/* 152 */
EXTERN void		TclSetLibraryPath(Tcl_Obj *pathPtr);
/* 153 */
EXTERN Tcl_Obj *	TclGetLibraryPath(void);
/* Slot 154 is reserved */
/* Slot 155 is reserved */
/* 156 */
................................................................................
    void (*tclHideLiteral) (Tcl_Interp *interp, struct CompileEnv *envPtr, int index); /* 144 */
    const struct AuxDataType * (*tclGetAuxDataType) (const char *typeName); /* 145 */
    TclHandle (*tclHandleCreate) (void *ptr); /* 146 */
    void (*tclHandleFree) (TclHandle handle); /* 147 */
    TclHandle (*tclHandlePreserve) (TclHandle handle); /* 148 */
    void (*tclHandleRelease) (TclHandle handle); /* 149 */
    int (*tclRegAbout) (Tcl_Interp *interp, Tcl_RegExp re); /* 150 */
    void (*tclRegExpRangeUniChar) (Tcl_RegExp re, size_t index, size_t *startPtr, size_t *endPtr); /* 151 */
    void (*tclSetLibraryPath) (Tcl_Obj *pathPtr); /* 152 */
    Tcl_Obj * (*tclGetLibraryPath) (void); /* 153 */
    void (*reserved154)(void);
    void (*reserved155)(void);
    void (*tclRegError) (Tcl_Interp *interp, const char *msg, int status); /* 156 */
    Var * (*tclVarTraceExists) (Tcl_Interp *interp, const char *varName); /* 157 */
    void (*reserved158)(void);

Changes to generic/tclRegexp.c.

260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
...
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    const char *string;

    if (index > regexpPtr->re.re_nsub) {
	*startPtr = *endPtr = NULL;
    } else if (regexpPtr->matches[index].rm_so == -1) {
	*startPtr = *endPtr = NULL;
    } else {
	if (regexpPtr->objPtr) {
	    string = TclGetString(regexpPtr->objPtr);
	} else {
	    string = regexpPtr->string;
	}
................................................................................
TclRegExpRangeUniChar(
    Tcl_RegExp re,		/* Compiled regular expression that has been
				 * passed to Tcl_RegExpExec. */
    size_t index,			/* 0 means give the range of the entire match,
				 * > 0 means give the range of a matching
				 * subrange, -1 means the range of the
				 * rm_extend field. */
    int *startPtr,		/* Store address of first character in
				 * (sub-)range here. */
    int *endPtr)		/* Store address of character just after last
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;

    if ((regexpPtr->flags&REG_EXPECT) && index == TCL_AUTO_LENGTH) {
	*startPtr = regexpPtr->details.rm_extend.rm_so;
	*endPtr = regexpPtr->details.rm_extend.rm_eo;
    } else if (index > regexpPtr->re.re_nsub) {
	*startPtr = -1;
	*endPtr = -1;
    } else {
	*startPtr = regexpPtr->matches[index].rm_so;
	*endPtr = regexpPtr->matches[index].rm_eo;
    }
}
................................................................................
     * Assume that there will never be more than INT_MAX subexpressions. This
     * is a pretty reasonable assumption; the RE engine doesn't scale _that_
     * well and Tcl has other limits that constrain things as well...
     */

    resultObj = Tcl_NewObj();
    Tcl_ListObjAppendElement(NULL, resultObj,
	    Tcl_NewWideIntObj((Tcl_WideInt) regexpPtr->re.re_nsub));

    /*
     * Now append a list of all the bit-flags set for the RE.
     */

    TclNewObj(infoObj);
    for (inf=infonames ; inf->bit != 0 ; inf++) {







|







 







|

|




|


|







 







|







260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
...
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
...
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;
    const char *string;

    if (index > regexpPtr->re.re_nsub) {
	*startPtr = *endPtr = NULL;
    } else if (regexpPtr->matches[index].rm_so == (size_t)-1) {
	*startPtr = *endPtr = NULL;
    } else {
	if (regexpPtr->objPtr) {
	    string = TclGetString(regexpPtr->objPtr);
	} else {
	    string = regexpPtr->string;
	}
................................................................................
TclRegExpRangeUniChar(
    Tcl_RegExp re,		/* Compiled regular expression that has been
				 * passed to Tcl_RegExpExec. */
    size_t index,			/* 0 means give the range of the entire match,
				 * > 0 means give the range of a matching
				 * subrange, -1 means the range of the
				 * rm_extend field. */
    size_t *startPtr,		/* Store address of first character in
				 * (sub-)range here. */
    size_t *endPtr)		/* Store address of character just after last
				 * in (sub-)range here. */
{
    TclRegexp *regexpPtr = (TclRegexp *) re;

    if ((regexpPtr->flags&REG_EXPECT) && (index == TCL_INDEX_NONE)) {
	*startPtr = regexpPtr->details.rm_extend.rm_so;
	*endPtr = regexpPtr->details.rm_extend.rm_eo;
    } else if (index + 1 > regexpPtr->re.re_nsub + 1) {
	*startPtr = -1;
	*endPtr = -1;
    } else {
	*startPtr = regexpPtr->matches[index].rm_so;
	*endPtr = regexpPtr->matches[index].rm_eo;
    }
}
................................................................................
     * Assume that there will never be more than INT_MAX subexpressions. This
     * is a pretty reasonable assumption; the RE engine doesn't scale _that_
     * well and Tcl has other limits that constrain things as well...
     */

    resultObj = Tcl_NewObj();
    Tcl_ListObjAppendElement(NULL, resultObj,
	    TclNewWideIntObjFromSize(regexpPtr->re.re_nsub));

    /*
     * Now append a list of all the bit-flags set for the RE.
     */

    TclNewObj(infoObj);
    for (inf=infonames ; inf->bit != 0 ; inf++) {

Changes to generic/tclTest.c.

338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
....
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
....
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
....
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000

4001
4002
4003
4004
4005
4006
4007
4008
static int		TestregexpObjCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static int		TestreturnObjCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static void		TestregexpXflags(const char *string,
			    int length, int *cflagsPtr, int *eflagsPtr);
static int		TestsaveresultCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static void		TestsaveresultFree(void *blockPtr);
static int		TestsetassocdataCmd(void *dummy,
			    Tcl_Interp *interp, int argc, const char **argv);
static int		TestsetCmd(void *dummy,
................................................................................
	 * value 0.
	 */

	Tcl_SetIntObj(Tcl_GetObjResult(interp), 0);
	if (objc > 2 && (cflags&REG_EXPECT) && indices) {
	    const char *varName;
	    const char *value;
	    int start, end;
	    char resinfo[TCL_INTEGER_SPACE * 2];

	    varName = Tcl_GetString(objv[2]);
	    TclRegExpRangeUniChar(regExpr, -1, &start, &end);
	    sprintf(resinfo, "%d %d", start, end-1);
	    value = Tcl_SetVar2(interp, varName, NULL, resinfo, 0);
	    if (value == NULL) {
		Tcl_AppendResult(interp, "couldn't set variable \"",
			varName, "\"", NULL);
		return TCL_ERROR;
	    }
	} else if (cflags & TCL_REG_CANMATCH) {
	    const char *varName;
	    const char *value;
	    char resinfo[TCL_INTEGER_SPACE * 2];

	    Tcl_RegExpGetInfo(regExpr, &info);
	    varName = Tcl_GetString(objv[2]);
	    sprintf(resinfo, "%ld", info.extendStart);
	    value = Tcl_SetVar2(interp, varName, NULL, resinfo, 0);
	    if (value == NULL) {
		Tcl_AppendResult(interp, "couldn't set variable \"",
			varName, "\"", NULL);
		return TCL_ERROR;
	    }
	}
................................................................................
     */

    objc -= 2;
    objv += 2;

    Tcl_RegExpGetInfo(regExpr, &info);
    for (i = 0; i < objc; i++) {
	int start, end;
	Tcl_Obj *newPtr, *varPtr, *valuePtr;

	varPtr = objv[i];
	ii = ((cflags&REG_EXPECT) && i == objc-1) ? -1 : i;
	if (indices) {
	    Tcl_Obj *objs[2];

	    if (ii == -1) {
		TclRegExpRangeUniChar(regExpr, ii, &start, &end);
	    } else if (ii > info.nsubs) {
		start = -1;
		end = -1;
	    } else {
		start = info.matches[ii].start;
		end = info.matches[ii].end;
	    }

	    /*
	     * Adjust index so it refers to the last character in the match
	     * instead of the first character after the match.
	     */

	    if (end >= 0) {
		end--;
	    }

	    objs[0] = Tcl_NewWideIntObj(start);
	    objs[1] = Tcl_NewWideIntObj(end);

	    newPtr = Tcl_NewListObj(2, objs);
	} else {
	    if (ii == -1) {
		TclRegExpRangeUniChar(regExpr, ii, &start, &end);
		newPtr = Tcl_GetRange(objPtr, start, end);
	    } else if (ii > info.nsubs) {
		newPtr = Tcl_NewObj();
	    } else {
		newPtr = Tcl_GetRange(objPtr, info.matches[ii].start,
			info.matches[ii].end - 1);
	    }
	}
	valuePtr = Tcl_ObjSetVar2(interp, varPtr, NULL, newPtr, TCL_LEAVE_ERR_MSG);
................................................................................
 *
 *----------------------------------------------------------------------
 */

static void
TestregexpXflags(
    const char *string,	/* The string of flags. */
    int length,			/* The length of the string in bytes. */
    int *cflagsPtr,		/* compile flags word */
    int *eflagsPtr)		/* exec flags word */
{

    int i, cflags, eflags;

    cflags = *cflagsPtr;
    eflags = *eflagsPtr;
    for (i = 0; i < length; i++) {
	switch (string[i]) {
	case 'a':
	    cflags |= REG_ADVF;







|







 







|




|













|







 







|









|
|
|










|



|
|






|







 







|



>
|







338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
....
3874
3875
3876
3877
3878
3879
3880
3881
3882
3883
3884
3885
3886
3887
3888
3889
3890
3891
3892
3893
3894
3895
3896
3897
3898
3899
3900
3901
3902
3903
3904
3905
3906
3907
....
3914
3915
3916
3917
3918
3919
3920
3921
3922
3923
3924
3925
3926
3927
3928
3929
3930
3931
3932
3933
3934
3935
3936
3937
3938
3939
3940
3941
3942
3943
3944
3945
3946
3947
3948
3949
3950
3951
3952
3953
3954
3955
3956
3957
3958
3959
3960
3961
3962
3963
....
3990
3991
3992
3993
3994
3995
3996
3997
3998
3999
4000
4001
4002
4003
4004
4005
4006
4007
4008
4009
static int		TestregexpObjCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static int		TestreturnObjCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static void		TestregexpXflags(const char *string,
			    size_t length, int *cflagsPtr, int *eflagsPtr);
static int		TestsaveresultCmd(void *dummy,
			    Tcl_Interp *interp, int objc,
			    Tcl_Obj *const objv[]);
static void		TestsaveresultFree(void *blockPtr);
static int		TestsetassocdataCmd(void *dummy,
			    Tcl_Interp *interp, int argc, const char **argv);
static int		TestsetCmd(void *dummy,
................................................................................
	 * value 0.
	 */

	Tcl_SetIntObj(Tcl_GetObjResult(interp), 0);
	if (objc > 2 && (cflags&REG_EXPECT) && indices) {
	    const char *varName;
	    const char *value;
	    size_t start, end;
	    char resinfo[TCL_INTEGER_SPACE * 2];

	    varName = Tcl_GetString(objv[2]);
	    TclRegExpRangeUniChar(regExpr, -1, &start, &end);
	    sprintf(resinfo, "%" TCL_LL_MODIFIER "d %" TCL_LL_MODIFIER "d", TclWideIntFromSize(start), TclWideIntFromSize(end-1));
	    value = Tcl_SetVar2(interp, varName, NULL, resinfo, 0);
	    if (value == NULL) {
		Tcl_AppendResult(interp, "couldn't set variable \"",
			varName, "\"", NULL);
		return TCL_ERROR;
	    }
	} else if (cflags & TCL_REG_CANMATCH) {
	    const char *varName;
	    const char *value;
	    char resinfo[TCL_INTEGER_SPACE * 2];

	    Tcl_RegExpGetInfo(regExpr, &info);
	    varName = Tcl_GetString(objv[2]);
	    sprintf(resinfo, "%" TCL_LL_MODIFIER "d", TclWideIntFromSize(info.extendStart));
	    value = Tcl_SetVar2(interp, varName, NULL, resinfo, 0);
	    if (value == NULL) {
		Tcl_AppendResult(interp, "couldn't set variable \"",
			varName, "\"", NULL);
		return TCL_ERROR;
	    }
	}
................................................................................
     */

    objc -= 2;
    objv += 2;

    Tcl_RegExpGetInfo(regExpr, &info);
    for (i = 0; i < objc; i++) {
	size_t start, end;
	Tcl_Obj *newPtr, *varPtr, *valuePtr;

	varPtr = objv[i];
	ii = ((cflags&REG_EXPECT) && i == objc-1) ? -1 : i;
	if (indices) {
	    Tcl_Obj *objs[2];

	    if (ii == -1) {
		TclRegExpRangeUniChar(regExpr, ii, &start, &end);
	    } else if (ii > (int)info.nsubs) {
		start = TCL_INDEX_NONE;
		end = TCL_INDEX_NONE;
	    } else {
		start = info.matches[ii].start;
		end = info.matches[ii].end;
	    }

	    /*
	     * Adjust index so it refers to the last character in the match
	     * instead of the first character after the match.
	     */

	    if (end != TCL_INDEX_NONE) {
		end--;
	    }

	    objs[0] = TclNewWideIntObjFromSize(start);
	    objs[1] = TclNewWideIntObjFromSize(end);

	    newPtr = Tcl_NewListObj(2, objs);
	} else {
	    if (ii == -1) {
		TclRegExpRangeUniChar(regExpr, ii, &start, &end);
		newPtr = Tcl_GetRange(objPtr, start, end);
	    } else if (ii > (int)info.nsubs) {
		newPtr = Tcl_NewObj();
	    } else {
		newPtr = Tcl_GetRange(objPtr, info.matches[ii].start,
			info.matches[ii].end - 1);
	    }
	}
	valuePtr = Tcl_ObjSetVar2(interp, varPtr, NULL, newPtr, TCL_LEAVE_ERR_MSG);
................................................................................
 *
 *----------------------------------------------------------------------
 */

static void
TestregexpXflags(
    const char *string,	/* The string of flags. */
    size_t length,			/* The length of the string in bytes. */
    int *cflagsPtr,		/* compile flags word */
    int *eflagsPtr)		/* exec flags word */
{
    size_t i;
	int cflags, eflags;

    cflags = *cflagsPtr;
    eflags = *eflagsPtr;
    for (i = 0; i < length; i++) {
	switch (string[i]) {
	case 'a':
	    cflags |= REG_ADVF;