Index: doc/Tcl.n ================================================================== --- doc/Tcl.n +++ doc/Tcl.n @@ -4,11 +4,11 @@ '\" '\" See the file "license.terms" for information on usage and redistribution '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES. '\" .so man.macros -.TH Tcl n "8.5" Tcl "Tcl Built-In Commands" +.TH Tcl n "8.6" Tcl "Tcl Built-In Commands" .BS .SH NAME Tcl \- Tool Command Language .SH SYNOPSIS Summary of Tcl language syntax. @@ -191,27 +191,37 @@ Backslash .PQ \e "" . .TP 7 \e\fIooo\fR . -The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal -value for the Unicode character that will be inserted. The upper bits of the -Unicode character will be 0. +The digits \fIooo\fR (one, two, or three of them) give a eight-bit octal +value for the Unicode character that will be inserted, in the range \fI000\fR +- \fI377\fR. The parser will stop just before this range overflows, or when +the maximum of three digits is reached. The upper bits of the Unicode +character will be 0. .TP 7 \e\fBx\fIhh\fR . -The hexadecimal digits \fIhh\fR give an eight-bit hexadecimal value for the -Unicode character that will be inserted. Any number of hexadecimal digits -may be present; however, all but the last two are ignored (the result is -always a one-byte quantity). The upper bits of the Unicode character will -be 0. +The hexadecimal digits \fIhh\fR (one or two of them) give an eight-bit +hexadecimal value for the Unicode character that will be inserted. The upper +bits of the Unicode character will be 0. .TP 7 \e\fBu\fIhhhh\fR . The hexadecimal digits \fIhhhh\fR (one, two, three, or four of them) give a sixteen-bit hexadecimal value for the Unicode character that will be -inserted. +inserted. The upper bits of the Unicode character will be 0. +.TP 7 +\e\fBU\fIhhhhhhhh\fR +. +The hexadecimal digits \fIhhhhhhhh\fR (one up to eight of them) give a +twentiy-one-bit hexadecimal value for the Unicode character that will be +inserted, in the range U+0000..U+10FFFF. The parser will stop just +before this range overflows, or when the maximum of eight digits +is reached. The upper bits of the Unicode character will be 0. +.PP +The range U+010000..U+10FFFD is reserved for the future. .PP Backslash substitution is not performed on words enclosed in braces, except for backslash-newline as described above. .RE .IP "[10] \fBComments.\fR" Index: doc/re_syntax.n ================================================================== --- doc/re_syntax.n +++ doc/re_syntax.n @@ -357,43 +357,46 @@ . horizontal tab, as in C .TP \fB\eu\fIwxyz\fR . -(where \fIwxyz\fR is exactly four hexadecimal digits) the Unicode +(where \fIwxyz\fR is one up to four hexadecimal digits) the Unicode character \fBU+\fIwxyz\fR in the local byte ordering .TP \fB\eU\fIstuvwxyz\fR . -(where \fIstuvwxyz\fR is exactly eight hexadecimal digits) reserved -for a somewhat-hypothetical Unicode extension to 32 bits +(where \fIstuvwxyz\fR is one up to eight hexadecimal digits) reserved +for a Unicode extension up to 21 bits. The digits are parsed until the +first non-hexadecimal character is encountered, the maximun of eight +hexadecimal digits are reached, or an overflow would occur in the maximum +value of \fBU+\fI10ffff\fR. .TP \fB\ev\fR . vertical tab, as in C are all available. .TP -\fB\ex\fIhhh\fR +\fB\ex\fIhh\fR . -(where \fIhhh\fR is any sequence of hexadecimal digits) the character -whose hexadecimal value is \fB0x\fIhhh\fR (a single character no -matter how many hexadecimal digits are used). +(where \fIhh\fR is one or two hexadecimal digits) the character +whose hexadecimal value is \fB0x\fIhh\fR. .TP \fB\e0\fR . the character whose value is \fB0\fR .TP +\fB\e\fIxyz\fR +. +(where \fIxyz\fR is exactly three octal digits, and is not a \fIback +reference\fR (see below)) the character whose octal value is +\fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise +the two-digit form is assumed. +.TP \fB\e\fIxy\fR . (where \fIxy\fR is exactly two octal digits, and is not a \fIback reference\fR (see below)) the character whose octal value is \fB0\fIxy\fR -.TP -\fB\e\fIxyz\fR -. -(where \fIxyz\fR is exactly three octal digits, and is not a back -reference (see below)) the character whose octal value is -\fB0\fIxyz\fR .RE .PP Hexadecimal digits are .QR \fB0\fR \fB9\fR , .QR \fBa\fR \fBf\fR , Index: generic/regc_lex.c ================================================================== --- generic/regc_lex.c +++ generic/regc_lex.c @@ -740,10 +740,11 @@ static int /* not actually used, but convenient for RETV */ lexescape( struct vars *v) { chr c; + int i; static const chr alert[] = { CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t') }; static const chr esc[] = { CHR('E'), CHR('S'), CHR('C') @@ -816,22 +817,27 @@ break; case CHR('t'): RETV(PLAIN, CHR('\t')); break; case CHR('u'): - c = lexdigits(v, 16, 4, 4); + c = (uchr) lexdigits(v, 16, 1, 4); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; case CHR('U'): - c = lexdigits(v, 16, 8, 8); + i = lexdigits(v, 16, 1, 8); if (ISERR()) { FAILW(REG_EESCAPE); } - RETV(PLAIN, c); + if (i > 0xFFFF) { + /* TODO: output a Surrogate pair + */ + i = 0xFFFD; + } + RETV(PLAIN, (uchr) i); break; case CHR('v'): RETV(PLAIN, CHR('\v')); break; case CHR('w'): @@ -842,11 +848,11 @@ NOTE(REG_ULOCALE); RETV(CCLASS, 'W'); break; case CHR('x'): NOTE(REG_UUNPORT); - c = lexdigits(v, 16, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 16, 1, 2); if (ISERR()) { FAILW(REG_EESCAPE); } RETV(PLAIN, c); break; @@ -864,11 +870,11 @@ case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): save = v->now; v->now--; /* put first digit back */ - c = lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ + c = (uchr) lexdigits(v, 10, 1, 255); /* REs >255 long outside spec */ if (ISERR()) { FAILW(REG_EESCAPE); } /* @@ -891,13 +897,18 @@ */ case CHR('0'): NOTE(REG_UUNPORT); v->now--; /* put first digit back */ - c = lexdigits(v, 8, 1, 3); + c = (uchr) lexdigits(v, 8, 1, 3); if (ISERR()) { FAILW(REG_EESCAPE); + } + if (c > 0xff) { + /* out of range, so we handled one digit too much */ + v->now--; + c >>= 3; } RETV(PLAIN, c); break; default: assert(iscalpha(c)); @@ -907,27 +918,31 @@ assert(NOTREACHED); } /* - lexdigits - slurp up digits and return chr value - ^ static chr lexdigits(struct vars *, int, int, int); + ^ static int lexdigits(struct vars *, int, int, int); */ -static chr /* chr value; errors signalled via ERR */ +static int /* chr value; errors signalled via ERR */ lexdigits( struct vars *v, int base, int minlen, int maxlen) { - uchr n; /* unsigned to avoid overflow misbehavior */ + int n; int len; chr c; int d; const uchr ub = (uchr) base; n = 0; for (len = 0; len < maxlen && !ATEOS(); len++) { + if (n > 0x10fff) { + /* Stop when continuing would otherwise overflow */ + break; + } c = *v->now++; switch (c) { case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'): case CHR('9'): @@ -956,11 +971,11 @@ } if (len < minlen) { ERR(REG_EESCAPE); } - return (chr)n; + return n; } /* - brenext - get next BRE token * This is much like EREs except for all the stupid backslashes and the Index: generic/regcomp.c ================================================================== --- generic/regcomp.c +++ generic/regcomp.c @@ -77,11 +77,11 @@ static void prefixes(struct vars *); static void lexnest(struct vars *, const chr *, const chr *); static void lexword(struct vars *); static int next(struct vars *); static int lexescape(struct vars *); -static chr lexdigits(struct vars *, int, int, int); +static int lexdigits(struct vars *, int, int, int); static int brenext(struct vars *, pchr); static void skip(struct vars *); static chr newline(NOPARMS); #ifdef REG_DEBUG static const chr *ch(NOPARMS); Index: generic/regcustom.h ================================================================== --- generic/regcustom.h +++ generic/regcustom.h @@ -95,11 +95,11 @@ typedef unsigned uchr; /* Unsigned type that will hold a chr. */ typedef int celt; /* Type to hold chr, or NOCELT */ #define NOCELT (-1) /* Celt value which is not valid chr */ #define CHR(c) (UCHAR(c)) /* Turn char literal into chr literal */ #define DIGITVAL(c) ((c)-'0') /* Turn chr digit into its value */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 #define CHRBITS 32 /* Bits in a chr; must not use sizeof */ #define CHR_MIN 0x00000000 /* Smallest and largest chr; the value */ #define CHR_MAX 0xffffffff /* CHR_MAX-CHR_MIN+1 should fit in uchr */ #else #define CHRBITS 16 /* Bits in a chr; must not use sizeof */ Index: generic/tcl.h ================================================================== --- generic/tcl.h +++ generic/tcl.h @@ -2151,16 +2151,16 @@ #define TCL_CONVERT_UNKNOWN (-3) #define TCL_CONVERT_NOSPACE (-4) /* * The maximum number of bytes that are necessary to represent a single - * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1 - * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar - * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must - * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and - * recommended mode. UCS-4 is experimental and not recommended. It works for - * the core, but most extensions expect UCS-2. + * Unicode character in UTF-8. The valid values should be 3, 4 or 6 + * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or + * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6, + * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode + * is the default and recommended mode. UCS-4 is experimental and not + * recommended. It works for the core, but most extensions expect UCS-2. */ #ifndef TCL_UTF_MAX #define TCL_UTF_MAX 3 #endif @@ -2168,11 +2168,11 @@ /* * This represents a Unicode character. Any changes to this should also be * reflected in regcustom.h. */ -#if TCL_UTF_MAX > 3 +#if TCL_UTF_MAX > 4 /* * unsigned int isn't 100% accurate as it should be a strict 4-byte value * (perhaps wchar_t). 64-bit systems may have troubles. The size of this * value must be reflected correctly in regcustom.h and * in tclEncoding.c. Index: generic/tclParse.c ================================================================== --- generic/tclParse.c +++ generic/tclParse.c @@ -752,11 +752,11 @@ register const char *p = src; while (numBytes--) { unsigned char digit = UCHAR(*p); - if (!isxdigit(digit)) { + if (!isxdigit(digit) || (result > 0x10fff)) { break; } p++; result <<= 4; @@ -864,11 +864,11 @@ break; case 'v': result = 0xb; break; case 'x': - count += TclParseHex(p+1, numBytes-2, &result); + count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result); if (count == 2) { /* * No hexadigits -> This is just "x". */ @@ -886,10 +886,19 @@ /* * No hexadigits -> This is just "u". */ result = 'u'; } + break; + case 'U': + count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result); + if (count == 2) { + /* + * No hexadigits -> This is just "U". + */ + result = 'U'; + } break; case '\n': count--; do { p++; @@ -915,11 +924,11 @@ } count = 3; result = (result << 3) + (*p - '0'); p++; if ((numBytes == 3) || !isdigit(UCHAR(*p)) /* INTL: digit */ - || (UCHAR(*p) >= '8')) { + || (UCHAR(*p) >= '8') || (result >= 0x20)) { break; } count = 4; result = UCHAR((result << 3) + (*p - '0')); break; Index: tests/reg.test ================================================================== --- tests/reg.test +++ tests/reg.test @@ -624,20 +624,28 @@ expectMatch 13.12 P "a\\fb" "a\fb" "a\fb" expectMatch 13.13 P "a\\nb" "a\nb" "a\nb" expectMatch 13.14 P "a\\rb" "a\rb" "a\rb" expectMatch 13.15 P "a\\tb" "a\tb" "a\tb" expectMatch 13.16 P "a\\u0008x" "a\bx" "a\bx" -expectError 13.17 - {a\u008x} EESCAPE +expectMatch 13.17 P {a\u008x} "a\bx" "a\bx" expectMatch 13.18 P "a\\u00088x" "a\b8x" "a\b8x" expectMatch 13.19 P "a\\U00000008x" "a\bx" "a\bx" -expectError 13.20 - {a\U0000008x} EESCAPE +expectMatch 13.20 P {a\U0000008x} "a\bx" "a\bx" expectMatch 13.21 P "a\\vb" "a\vb" "a\vb" expectMatch 13.22 MP "a\\x08x" "a\bx" "a\bx" expectError 13.23 - {a\xq} EESCAPE -expectMatch 13.24 MP "a\\x0008x" "a\bx" "a\bx" +expectMatch 13.24 MP "a\\x08x" "a\bx" "a\bx" expectError 13.25 - {a\z} EESCAPE expectMatch 13.26 MP "a\\010b" "a\bb" "a\bb" +expectMatch 13.27 P "a\\U00001234x" "a\u1234x" "a\u1234x" +expectMatch 13.28 P {a\U00001234x} "a\u1234x" "a\u1234x" +expectMatch 13.29 P "a\\U0001234x" "a\u1234x" "a\u1234x" +expectMatch 13.30 P {a\U0001234x} "a\u1234x" "a\u1234x" +expectMatch 13.31 P "a\\U000012345x" "a\u12345x" "a\u12345x" +expectMatch 13.32 P {a\U000012345x} "a\u12345x" "a\u12345x" +expectMatch 13.33 P "a\\U1000000x" "a\ufffd0x" "a\ufffd0x" +expectMatch 13.34 P {a\U1000000x} "a\ufffd0x" "a\ufffd0x" doing 14 "back references" # ugh expectMatch 14.1 RP {a(b*)c\1} abbcbb abbcbb bb @@ -680,10 +688,11 @@ expectError 15.9 - {a((((((((((b\10))))))))))c} ESUBREG # BREs don't have octal, EREs don't have backrefs expectMatch 15.10 MP "a\\12b" "a\nb" "a\nb" expectError 15.11 b {a\12b} ESUBREG expectMatch 15.12 eAS {a\12b} a12b a12b +expectMatch 15.13 MP {a\701b} a\u00381b a\u00381b doing 16 "expanded syntax" expectMatch 16.1 xP "a b c" "abc" "abc" expectMatch 16.2 xP "a b #oops\nc\td" "abcd" "abcd" Index: tests/utf.test ================================================================== --- tests/utf.test +++ tests/utf.test @@ -169,19 +169,31 @@ bsCheck b\0 98 bsCheck \x 120 bsCheck \xa 10 bsCheck \xA 10 bsCheck \x41 65 -bsCheck \x541 65 +bsCheck \x541 84 bsCheck \u 117 bsCheck \uk 117 bsCheck \u41 65 bsCheck \ua 10 bsCheck \uA 10 bsCheck \340 224 bsCheck \ua1 161 bsCheck \u4e21 20001 +bsCheck \741 60 +bsCheck \U 85 +bsCheck \Uk 85 +bsCheck \U41 65 +bsCheck \Ua 10 +bsCheck \UA 10 +bsCheck \Ua1 161 +bsCheck \U4e21 20001 +bsCheck \U004e21 20001 +bsCheck \U00004e21 20001 +bsCheck \U00110000 65533 +bsCheck \Uffffffff 65533 test utf-11.1 {Tcl_UtfToUpper} { string toupper {} } {} test utf-11.2 {Tcl_UtfToUpper} {