Index: doc/Tcl.n
==================================================================
--- doc/Tcl.n
+++ doc/Tcl.n
@@ -4,11 +4,11 @@
 '\"
 '\" See the file "license.terms" for information on usage and redistribution
 '\" of this file, and for a DISCLAIMER OF ALL WARRANTIES.
 '\"
 .so man.macros
-.TH Tcl n "8.5" Tcl "Tcl Built-In Commands"
+.TH Tcl n "8.6" Tcl "Tcl Built-In Commands"
 .BS
 .SH NAME
 Tcl \- Tool Command Language
 .SH SYNOPSIS
 Summary of Tcl language syntax.
@@ -191,27 +191,37 @@
 Backslash
 .PQ \e "" .
 .TP 7
 \e\fIooo\fR 
 .
-The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal 
-value for the Unicode character that will be inserted.  The upper bits of the
-Unicode character will be 0.
+The digits \fIooo\fR (one, two, or three of them) give an eight-bit octal 
+value for the Unicode character that will be inserted, in the range \fI000\fR
+- \fI377\fR.  The parser will stop just before this range overflows, or when
+the maximum of three digits is reached.  The upper bits of the Unicode
+character will be 0.
 .TP 7
 \e\fBx\fIhh\fR 
 .
-The hexadecimal digits \fIhh\fR give an eight-bit hexadecimal value for the
-Unicode character that will be inserted.  Any number of hexadecimal digits
-may be present; however, all but the last two are ignored (the result is
-always a one-byte quantity).  The upper bits of the Unicode character will
-be 0.
+The hexadecimal digits \fIhh\fR (one or two of them) give an eight-bit
+hexadecimal value for the Unicode character that will be inserted.  The upper
+bits of the Unicode character will be 0.
 .TP 7
 \e\fBu\fIhhhh\fR 
 .
 The hexadecimal digits \fIhhhh\fR (one, two, three, or four of them) give a
 sixteen-bit hexadecimal value for the Unicode character that will be
-inserted.
+inserted.  The upper bits of the Unicode character will be 0.
+.TP 7
+\e\fBU\fIhhhhhhhh\fR 
+.
+The hexadecimal digits \fIhhhhhhhh\fR (between one and eight of them) give a
+twenty-one-bit hexadecimal value for the Unicode character that will be
+inserted, in the range U+0000..U+10FFFF.  The parser will stop just
+before this range overflows, or when the maximum of eight digits
+is reached.  The upper bits of the Unicode character will be 0.
+.PP
+The range U+010000..U+10FFFD is reserved for the future.
 .PP
 Backslash substitution is not performed on words enclosed in braces,
 except for backslash-newline as described above.
 .RE
 .IP "[10] \fBComments.\fR"

Index: doc/re_syntax.n
==================================================================
--- doc/re_syntax.n
+++ doc/re_syntax.n
@@ -357,43 +357,46 @@
 .
 horizontal tab, as in C
 .TP
 \fB\eu\fIwxyz\fR
 .
-(where \fIwxyz\fR is exactly four hexadecimal digits) the Unicode
+(where \fIwxyz\fR is one to four hexadecimal digits) the Unicode
 character \fBU+\fIwxyz\fR in the local byte ordering
 .TP
 \fB\eU\fIstuvwxyz\fR
 .
-(where \fIstuvwxyz\fR is exactly eight hexadecimal digits) reserved
-for a somewhat-hypothetical Unicode extension to 32 bits
+(where \fIstuvwxyz\fR is one to eight hexadecimal digits) reserved
+for a Unicode extension up to 21 bits. The digits are parsed until the
+first non-hexadecimal character is encountered, the maximum of eight
+hexadecimal digits is reached, or an overflow would occur in the maximum
+value of \fBU+\fI10ffff\fR.
 .TP
 \fB\ev\fR
 .
 vertical tab, as in C are all available.
 .TP
-\fB\ex\fIhhh\fR
+\fB\ex\fIhh\fR
 .
-(where \fIhhh\fR is any sequence of hexadecimal digits) the character
-whose hexadecimal value is \fB0x\fIhhh\fR (a single character no
-matter how many hexadecimal digits are used).
+(where \fIhh\fR is one or two hexadecimal digits) the character
+whose hexadecimal value is \fB0x\fIhh\fR.
 .TP
 \fB\e0\fR
 .
 the character whose value is \fB0\fR
 .TP
+\fB\e\fIxyz\fR
+.
+(where \fIxyz\fR is exactly three octal digits, and is not a \fIback
+reference\fR (see below)) the character whose octal value is
+\fB0\fIxyz\fR. The first digit must be in the range 0-3, otherwise
+the two-digit form is assumed.
+.TP
 \fB\e\fIxy\fR
 .
 (where \fIxy\fR is exactly two octal digits, and is not a \fIback
 reference\fR (see below)) the character whose octal value is
 \fB0\fIxy\fR
-.TP
-\fB\e\fIxyz\fR
-.
-(where \fIxyz\fR is exactly three octal digits, and is not a back
-reference (see below)) the character whose octal value is
-\fB0\fIxyz\fR
 .RE
 .PP
 Hexadecimal digits are
 .QR \fB0\fR \fB9\fR ,
 .QR \fBa\fR \fBf\fR ,

Index: generic/regc_lex.c
==================================================================
--- generic/regc_lex.c
+++ generic/regc_lex.c
@@ -740,10 +740,11 @@
 static int			/* not actually used, but convenient for RETV */
 lexescape(
     struct vars *v)
 {
     chr c;
+    int i;
     static const chr alert[] = {
 	CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
     };
     static const chr esc[] = {
 	CHR('E'), CHR('S'), CHR('C')
@@ -816,22 +817,27 @@
 	break;
     case CHR('t'):
 	RETV(PLAIN, CHR('\t'));
 	break;
     case CHR('u'):
-	c = lexdigits(v, 16, 4, 4);
+	c = (uchr) lexdigits(v, 16, 1, 4);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 	RETV(PLAIN, c);
 	break;
     case CHR('U'):
-	c = lexdigits(v, 16, 8, 8);
+	i = lexdigits(v, 16, 1, 8);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
-	RETV(PLAIN, c);
+	if (i > 0xFFFF) {
+	    /* TODO: output a Surrogate pair
+	     */
+	    i = 0xFFFD;
+	}
+	RETV(PLAIN, (uchr) i);
 	break;
     case CHR('v'):
 	RETV(PLAIN, CHR('\v'));
 	break;
     case CHR('w'):
@@ -842,11 +848,11 @@
 	NOTE(REG_ULOCALE);
 	RETV(CCLASS, 'W');
 	break;
     case CHR('x'):
 	NOTE(REG_UUNPORT);
-	c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 16, 1, 2);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 	RETV(PLAIN, c);
 	break;
@@ -864,11 +870,11 @@
     case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
     case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
     case CHR('9'):
 	save = v->now;
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+	c = (uchr) lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
 	}
 
 	/*
@@ -891,13 +897,18 @@
 	 */
 
     case CHR('0'):
 	NOTE(REG_UUNPORT);
 	v->now--;		/* put first digit back */
-	c = lexdigits(v, 8, 1, 3);
+	c = (uchr) lexdigits(v, 8, 1, 3);
 	if (ISERR()) {
 	    FAILW(REG_EESCAPE);
+	}
+	if (c > 0xff) {
+	    /* out of range, so we consumed one digit too many */
+	    v->now--;
+	    c >>= 3;
 	}
 	RETV(PLAIN, c);
 	break;
     default:
 	assert(iscalpha(c));
@@ -907,27 +918,31 @@
     assert(NOTREACHED);
 }
 
 /*
  - lexdigits - slurp up digits and return chr value
- ^ static chr lexdigits(struct vars *, int, int, int);
+ ^ static int lexdigits(struct vars *, int, int, int);
  */
-static chr			/* chr value; errors signalled via ERR */
+static int			/* chr value; errors signalled via ERR */
 lexdigits(
     struct vars *v,
     int base,
     int minlen,
     int maxlen)
 {
-    uchr n;			/* unsigned to avoid overflow misbehavior */
+    int n;
     int len;
     chr c;
     int d;
     const uchr ub = (uchr) base;
 
     n = 0;
     for (len = 0; len < maxlen && !ATEOS(); len++) {
+	if (n > 0x10fff) {
+	    /* Stop when continuing would otherwise overflow */
+	    break;
+	}
 	c = *v->now++;
 	switch (c) {
 	case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
 	case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
 	case CHR('8'): case CHR('9'):
@@ -956,11 +971,11 @@
     }
     if (len < minlen) {
 	ERR(REG_EESCAPE);
     }
 
-    return (chr)n;
+    return n;
 }
 
 /*
  - brenext - get next BRE token
  * This is much like EREs except for all the stupid backslashes and the

Index: generic/regcomp.c
==================================================================
--- generic/regcomp.c
+++ generic/regcomp.c
@@ -77,11 +77,11 @@
 static void prefixes(struct vars *);
 static void lexnest(struct vars *, const chr *, const chr *);
 static void lexword(struct vars *);
 static int next(struct vars *);
 static int lexescape(struct vars *);
-static chr lexdigits(struct vars *, int, int, int);
+static int lexdigits(struct vars *, int, int, int);
 static int brenext(struct vars *, pchr);
 static void skip(struct vars *);
 static chr newline(NOPARMS);
 #ifdef REG_DEBUG
 static const chr *ch(NOPARMS);

Index: generic/regcustom.h
==================================================================
--- generic/regcustom.h
+++ generic/regcustom.h
@@ -95,11 +95,11 @@
 typedef unsigned uchr;		/* Unsigned type that will hold a chr. */
 typedef int celt;		/* Type to hold chr, or NOCELT */
 #define	NOCELT (-1)		/* Celt value which is not valid chr */
 #define	CHR(c) (UCHAR(c))	/* Turn char literal into chr literal */
 #define	DIGITVAL(c) ((c)-'0')	/* Turn chr digit into its value */
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
 #define	CHRBITS	32		/* Bits in a chr; must not use sizeof */
 #define	CHR_MIN	0x00000000	/* Smallest and largest chr; the value */
 #define	CHR_MAX	0xffffffff	/* CHR_MAX-CHR_MIN+1 should fit in uchr */
 #else
 #define	CHRBITS	16		/* Bits in a chr; must not use sizeof */

Index: generic/tcl.h
==================================================================
--- generic/tcl.h
+++ generic/tcl.h
@@ -2151,16 +2151,16 @@
 #define TCL_CONVERT_UNKNOWN	(-3)
 #define TCL_CONVERT_NOSPACE	(-4)
 
 /*
  * The maximum number of bytes that are necessary to represent a single
- * Unicode character in UTF-8. The valid values should be 3 or 6 (or perhaps 1
- * if we want to support a non-unicode enabled core). If 3, then Tcl_UniChar
- * must be 2-bytes in size (UCS-2) (the default). If 6, then Tcl_UniChar must
- * be 4-bytes in size (UCS-4). At this time UCS-2 mode is the default and
- * recommended mode. UCS-4 is experimental and not recommended. It works for
- * the core, but most extensions expect UCS-2.
+ * Unicode character in UTF-8. The valid values should be 3, 4 or 6
+ * (or perhaps 1 if we want to support a non-unicode enabled core). If 3 or
+ * 4, then Tcl_UniChar must be 2-bytes in size (UCS-2) (the default). If 6,
+ * then Tcl_UniChar must be 4-bytes in size (UCS-4). At this time UCS-2 mode
+ * is the default and recommended mode. UCS-4 is experimental and not
+ * recommended. It works for the core, but most extensions expect UCS-2.
  */
 
 #ifndef TCL_UTF_MAX
 #define TCL_UTF_MAX		3
 #endif
@@ -2168,11 +2168,11 @@
 /*
  * This represents a Unicode character. Any changes to this should also be
  * reflected in regcustom.h.
  */
 
-#if TCL_UTF_MAX > 3
+#if TCL_UTF_MAX > 4
     /*
      * unsigned int isn't 100% accurate as it should be a strict 4-byte value
      * (perhaps wchar_t). 64-bit systems may have troubles. The size of this
      * value must be reflected correctly in regcustom.h and
      * in tclEncoding.c.

Index: generic/tclParse.c
==================================================================
--- generic/tclParse.c
+++ generic/tclParse.c
@@ -752,11 +752,11 @@
     register const char *p = src;
 
     while (numBytes--) {
 	unsigned char digit = UCHAR(*p);
 
-	if (!isxdigit(digit)) {
+	if (!isxdigit(digit) || (result > 0x10fff)) {
 	    break;
 	}
 
 	p++;
 	result <<= 4;
@@ -864,11 +864,11 @@
 	break;
     case 'v':
 	result = 0xb;
 	break;
     case 'x':
-	count += TclParseHex(p+1, numBytes-2, &result);
+	count += TclParseHex(p+1, (numBytes > 3) ? 2 : numBytes-2, &result);
 	if (count == 2) {
 	    /*
 	     * No hexadigits -> This is just "x".
 	     */
 
@@ -886,10 +886,19 @@
 	    /*
 	     * No hexadigits -> This is just "u".
 	     */
 	    result = 'u';
 	}
+	break;
+    case 'U':
+	count += TclParseHex(p+1, (numBytes > 9) ? 8 : numBytes-2, &result);
+	if (count == 2) {
+	    /*
+	     * No hexadigits -> This is just "U".
+	     */
+	    result = 'U';
+	}
 	break;
     case '\n':
 	count--;
 	do {
 	    p++;
@@ -915,11 +924,11 @@
 	    }
 	    count = 3;
 	    result = (result << 3) + (*p - '0');
 	    p++;
 	    if ((numBytes == 3) || !isdigit(UCHAR(*p))	/* INTL: digit */
-		    || (UCHAR(*p) >= '8')) {
+		    || (UCHAR(*p) >= '8') || (result >= 0x20)) {
 		break;
 	    }
 	    count = 4;
 	    result = UCHAR((result << 3) + (*p - '0'));
 	    break;

Index: tests/reg.test
==================================================================
--- tests/reg.test
+++ tests/reg.test
@@ -624,20 +624,28 @@
 expectMatch	13.12 P		"a\\fb"		"a\fb"	"a\fb"
 expectMatch	13.13 P		"a\\nb"		"a\nb"	"a\nb"
 expectMatch	13.14 P		"a\\rb"		"a\rb"	"a\rb"
 expectMatch	13.15 P		"a\\tb"		"a\tb"	"a\tb"
 expectMatch	13.16 P		"a\\u0008x"	"a\bx"	"a\bx"
-expectError	13.17 -		{a\u008x}	EESCAPE
+expectMatch	13.17 P		{a\u008x}	"a\bx"	"a\bx"
 expectMatch	13.18 P		"a\\u00088x"	"a\b8x"	"a\b8x"
 expectMatch	13.19 P		"a\\U00000008x"	"a\bx"	"a\bx"
-expectError	13.20 -		{a\U0000008x}	EESCAPE
+expectMatch	13.20 P		{a\U0000008x}	"a\bx"	"a\bx"
 expectMatch	13.21 P		"a\\vb"		"a\vb"	"a\vb"
 expectMatch	13.22 MP	"a\\x08x"	"a\bx"	"a\bx"
 expectError	13.23 -		{a\xq}		EESCAPE
-expectMatch	13.24 MP	"a\\x0008x"	"a\bx"	"a\bx"
+expectMatch	13.24 MP	"a\\x08x"	"a\bx"	"a\bx"
 expectError	13.25 -		{a\z}		EESCAPE
 expectMatch	13.26 MP	"a\\010b"	"a\bb"	"a\bb"
+expectMatch	13.27 P		"a\\U00001234x"	"a\u1234x"	"a\u1234x"
+expectMatch	13.28 P		{a\U00001234x}	"a\u1234x"	"a\u1234x"
+expectMatch	13.29 P		"a\\U0001234x"	"a\u1234x"	"a\u1234x"
+expectMatch	13.30 P		{a\U0001234x}	"a\u1234x"	"a\u1234x"
+expectMatch	13.31 P		"a\\U000012345x"	"a\u12345x"	"a\u12345x"
+expectMatch	13.32 P		{a\U000012345x}	"a\u12345x"	"a\u12345x"
+expectMatch	13.33 P		"a\\U1000000x"	"a\ufffd0x"	"a\ufffd0x"
+expectMatch	13.34 P		{a\U1000000x}	"a\ufffd0x"	"a\ufffd0x"
 
 
 doing 14 "back references"
 # ugh
 expectMatch	14.1  RP	{a(b*)c\1}	abbcbb	abbcbb	bb
@@ -680,10 +688,11 @@
 expectError	15.9  -	{a((((((((((b\10))))))))))c}	ESUBREG
 # BREs don't have octal, EREs don't have backrefs
 expectMatch	15.10 MP	"a\\12b"	"a\nb"	"a\nb"
 expectError	15.11 b		{a\12b}		ESUBREG
 expectMatch	15.12 eAS	{a\12b}		a12b	a12b
+expectMatch	15.13 MP	{a\701b}	a\u00381b	a\u00381b
 
 
 doing 16 "expanded syntax"
 expectMatch	16.1 xP		"a b c"		"abc"	"abc"
 expectMatch	16.2 xP		"a b #oops\nc\td"	"abcd"	"abcd"

Index: tests/utf.test
==================================================================
--- tests/utf.test
+++ tests/utf.test
@@ -169,19 +169,31 @@
 bsCheck b\0	98
 bsCheck \x	120
 bsCheck \xa	10
 bsCheck \xA	10
 bsCheck \x41	65
-bsCheck \x541	65
+bsCheck \x541	84
 bsCheck \u	117
 bsCheck \uk	117
 bsCheck \u41	65
 bsCheck \ua	10
 bsCheck \uA	10
 bsCheck \340	224
 bsCheck \ua1	161
 bsCheck \u4e21	20001
+bsCheck \741	60
+bsCheck \U	85
+bsCheck \Uk	85
+bsCheck \U41	65
+bsCheck \Ua	10
+bsCheck \UA	10
+bsCheck \Ua1	161
+bsCheck \U4e21	20001
+bsCheck \U004e21	20001
+bsCheck \U00004e21	20001
+bsCheck \U00110000	65533
+bsCheck \Uffffffff	65533
 
 test utf-11.1 {Tcl_UtfToUpper} {
     string toupper {}
 } {}
 test utf-11.2 {Tcl_UtfToUpper} {