Index: doc/string.n ================================================================== --- doc/string.n +++ doc/string.n @@ -147,11 +147,12 @@ .IP \fBprint\fR 12 Any Unicode printing character, including space. .IP \fBpunct\fR 12 Any Unicode punctuation character. .IP \fBspace\fR 12 -Any Unicode space character. +Any Unicode whitespace character, zero width space (U+200b), +word joiner (U+2060) and zero width no-break space (U+feff) (=BOM). .IP \fBtrue\fR 12 Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is true. .IP \fBupper\fR 12 Any upper case alphabet character in the Unicode character set. @@ -333,26 +334,26 @@ .TP \fBstring trim \fIstring\fR ?\fIchars\fR? . Returns a value equal to \fIstring\fR except that any leading or trailing characters present in the string given by \fIchars\fR are removed. If -\fIchars\fR is not specified then white space is removed (spaces, -tabs, newlines, and carriage returns). +\fIchars\fR is not specified then white space is removed (any character +for which \fBstring is space\fR returns 1, and "\0"). .TP \fBstring trimleft \fIstring\fR ?\fIchars\fR? . Returns a value equal to \fIstring\fR except that any leading characters present in the string given by \fIchars\fR are removed. If -\fIchars\fR is not specified then white space is removed (spaces, -tabs, newlines, and carriage returns). +\fIchars\fR is not specified then white space is removed (any character +for which \fBstring is space\fR returns 1, and "\0"). .TP \fBstring trimright \fIstring\fR ?\fIchars\fR? . Returns a value equal to \fIstring\fR except that any trailing characters present in the string given by \fIchars\fR are removed. If -\fIchars\fR is not specified then white space is removed (spaces, -tabs, newlines, and carriage returns). +\fIchars\fR is not specified then white space is removed (any character +for which \fBstring is space\fR returns 1, and "\0"). .TP \fBstring wordend \fIstring charIndex\fR . Returns the index of the character just after the last one in the word containing character \fIcharIndex\fR of \fIstring\fR. \fIcharIndex\fR Index: generic/regc_locale.c ================================================================== --- generic/regc_locale.c +++ generic/regc_locale.c @@ -352,17 +352,18 @@ /* * Unicode: white space characters. */ static const crange spaceRangeTable[] = { - {0x9, 0xd}, {0x2000, 0x200a} + {0x9, 0xd}, {0x2000, 0x200b} }; #define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange)) static const chr spaceCharTable[] = { - 0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x3000 + 0x20, 0x85, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, + 0x2060, 0x3000, 0xfeff }; #define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr)) /* Index: generic/tclCmdMZ.c ================================================================== --- generic/tclCmdMZ.c +++ generic/tclCmdMZ.c @@ -32,16 +32,39 @@ static int UniCharIsAscii(int character); static int UniCharIsHexDigit(int character); /* * Default set of characters to trim in [string trim] and friends. This is a - * UTF-8 literal string containing space, tab, newline, carriage return, - * ethiopic wordspace (U+1361), ogham space mark (U+1680), and ideographic - * space (U+3000). [TIP #318] + * UTF-8 literal string containing all Unicode space characters [TIP #413] */ -#define DEFAULT_TRIM_SET " \t\n\r\xe1\x8d\xa1\xe1\x9a\x80\xe3\x80\x80" +#define DEFAULT_TRIM_SET \ + "\x09\x0a\x0b\x0c\x0d " /* ASCII */\ + "\xc0\x80" /* nul (U+0000) */\ + "\xc2\x85" /* next line (U+0085) */\ + "\xc2\xa0" /* non-breaking space (U+00a0) */\ + "\xe1\x9a\x80" /* ogham space mark (U+1680) */ \ + "\xe1\xa0\x8e" /* mongolian vowel separator (U+180e) */\ + "\xe2\x80\x80" /* en quad (U+2000) */\ + "\xe2\x80\x81" /* em quad (U+2001) */\ + "\xe2\x80\x82" /* en space (U+2002) */\ + "\xe2\x80\x83" /* em space (U+2003) */\ + "\xe2\x80\x84" /* three-per-em space (U+2004) */\ + "\xe2\x80\x85" /* four-per-em space (U+2005) */\ + "\xe2\x80\x86" /* six-per-em space (U+2006) */\ + "\xe2\x80\x87" /* figure space (U+2007) */\ + "\xe2\x80\x88" /* punctuation space (U+2008) */\ + "\xe2\x80\x89" /* thin space (U+2009) */\ + "\xe2\x80\x8a" /* hair space (U+200a) */\ + "\xe2\x80\x8b" /* zero width space (U+200b) */\ + "\xe2\x80\xa8" /* line separator (U+2028) */\ + "\xe2\x80\xa9" /* paragraph separator (U+2029) */\ + "\xe2\x80\xaf" /* narrow no-break space (U+202f) */\ + "\xe2\x81\x9f" /* medium mathematical space (U+205f) */\ + "\xe2\x81\xa0" /* word joiner (U+2060) */\ + "\xe3\x80\x80" /* ideographic space (U+3000) */\ + "\xef\xbb\xbf" /* zero width no-break space (U+feff) */ /* *---------------------------------------------------------------------- * * Tcl_PwdObjCmd -- Index: generic/tclUtf.c ================================================================== --- generic/tclUtf.c +++ generic/tclUtf.c @@ -1514,10 +1514,13 @@ * standard C function, otherwise consult the Unicode table. */ if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) { return isspace(UCHAR(ch)); /* INTL: ISO space */ + } else if ((Tcl_UniChar) ch == 0x0085 || (Tcl_UniChar) ch == 0x200b + || (Tcl_UniChar) ch == 0x2060 || (Tcl_UniChar) ch == 0xfeff) { + return 1; } else { return ((SPACE_BITS >> GetCategory(ch)) & 1); } } Index: tests/string.test ================================================================== --- tests/string.test +++ tests/string.test @@ -1482,22 +1482,22 @@ } {ABC} test string-18.11 {string trim, unicode} { string trim "\xe7\xe8 AB\xe7C \xe8\xe7" \xe7\xe8 } " AB\xe7C " test string-18.12 {string trim, unicode default} { - string trim ABC\u1361\u1680\u3000 -} ABC + string trim \ufeff\x00\u0085\u00a0\u1680\u180eABC\u1361\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000 +} ABC\u1361 test string-19.1 {string trimleft} { list [catch {string trimleft} msg] $msg } {1 {wrong # args: should be "string trimleft string ?chars?"}} test string-19.2 {string trimleft} { string trimleft " XYZ " } {XYZ } test string-19.3 {string trimleft, unicode default} { - string trimleft \u1361\u1680\u3000ABC -} ABC + string trimleft \ufeff\u0085\u00a0\x00\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000\u1361ABC +} \u1361ABC test string-20.1 {string trimright errors} { list [catch {string trimright} msg] $msg } {1 {wrong # args: should be "string trimright string ?chars?"}} test string-20.2 {string trimright errors} { @@ -1511,12 +1511,12 @@ } {} test string-20.5 {string trimright} { string trimright "" } {} test string-20.6 {string trimright, unicode default} { - string trimright ABC\u1361\u1680\u3000 -} ABC + string trimright ABC\u1361\u0085\x00\u00a0\u1680\u180e\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200a\u200b\u2028\u2029\u202f\u205f\u3000 +} ABC\u1361 test string-21.1 {string wordend} { list [catch {string wordend a} msg] $msg } {1 {wrong # args: should be "string wordend string index"}} test string-21.2 {string wordend} {