Tcl Source Code

Check-in [f199c9d65c]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Starting with Unicode 6.3, the Mongolian vowel separator (U+180E) is no longer whitespace, but Tcl still treats it as such. "NEL/Next Line" (U+0085) should have been Unicode whitespace, but Tcl has never treated it that way. This is corrected in Tcl 8.6 but, for legacy reasons, not in Tcl 8.5. Update the documentation accordingly, and extend the test cases for Unicode 7 compliance.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | core-8-5-branch
Files: files | file ages | folders
SHA1: f199c9d65c292fe83e3f2722e6343e53bd192b84
User & Date: jan.nijtmans 2014-07-11 10:38:55
Context
2014-07-16
21:46
Assert an equality test, not an assignment. check-in: 72391d07b6 user: dgp tags: core-8-5-branch
2014-07-11
10:43
Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for... check-in: 8320e361b7 user: jan.nijtmans tags: trunk
10:38
Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for... check-in: f199c9d65c user: jan.nijtmans tags: core-8-5-branch
2014-07-10
18:00
dup test name check-in: 033a1af993 user: dgp tags: core-8-5-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/string.n.

157
158
159
160
161
162
163
164

165
166
167
168
169
170
171
.IP \fBlower\fR 12
Any Unicode lower case alphabet character.
.IP \fBprint\fR 12
Any Unicode printing character, including space.
.IP \fBpunct\fR 12
Any Unicode punctuation character.
.IP \fBspace\fR 12
Any Unicode space character.

.IP \fBtrue\fR 12
Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
true.
.IP \fBupper\fR 12
Any upper case alphabet character in the Unicode character set.
.VS 8.5
.IP \fBwideinteger\fR 12







|
>







157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
.IP \fBlower\fR 12
Any Unicode lower case alphabet character.
.IP \fBprint\fR 12
Any Unicode printing character, including space.
.IP \fBpunct\fR 12
Any Unicode punctuation character.
.IP \fBspace\fR 12
Any Unicode whitespace character or the Mongolian vowel separator (U+180E),
but not NEL/Next Line (U+0085).
.IP \fBtrue\fR 12
Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
true.
.IP \fBupper\fR 12
Any upper case alphabet character in the Unicode character set.
.VS 8.5
.IP \fBwideinteger\fR 12

Changes to generic/regc_locale.c.

377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/*
 * Code-point ranges belonging to the "space" class (going by the table
 * name).  Two ranges are listed: 0x9/0xd and 0x2000/0x200a.
 * NOTE(review): each pair is presumably an inclusive {low, high} bound --
 * confirm against the crange definition, which is not visible here.
 */
static const crange spaceRangeTable[] = {
    {0x9, 0xd}, {0x2000, 0x200a}
};

/* Number of entries in spaceRangeTable, derived from the array size. */
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))

/*
 * Individual code points belonging to the "space" class (going by the
 * table name), i.e. those not expressed as a range.  This list contains
 * U+180E, U+2060 and U+FEFF; U+0085 is not listed.
 */
static const chr spaceCharTable[] = {
    0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f, 0x2060,
    0x3000, 0xfeff
};

/* Number of entries in spaceCharTable, derived from the array size. */
#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))

/*
 * Unicode: lowercase characters.
 */







|
|







377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
/*
 * Code-point ranges belonging to the "space" class (going by the table
 * name).  Two ranges are listed: 0x9/0xd and 0x2000/0x200a.
 * NOTE(review): each pair is presumably an inclusive {low, high} bound --
 * confirm against the crange definition, which is not visible here.
 */
static const crange spaceRangeTable[] = {
    {0x9, 0xd}, {0x2000, 0x200a}
};

/* Number of entries in spaceRangeTable, derived from the array size. */
#define NUM_SPACE_RANGE (sizeof(spaceRangeTable)/sizeof(crange))

/*
 * Individual code points belonging to the "space" class (going by the
 * table name), i.e. those not expressed as a range.  U+180E is listed;
 * U+0085, U+2060 and U+FEFF are not.
 */
static const chr spaceCharTable[] = {
    0x20, 0xa0, 0x1680, 0x180e, 0x2028, 0x2029, 0x202f, 0x205f,
    0x3000
};

/* Number of entries in spaceCharTable, derived from the array size. */
#define NUM_SPACE_CHAR (sizeof(spaceCharTable)/sizeof(chr))

/*
 * Unicode: lowercase characters.
 */

Changes to tests/utf.test.

274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
    list [regexp \\d abc456def foo] $foo
} {1 4}

# Placeholder for TclUniCharNcmp: the body is empty and the expected result
# is the empty string, so no behavior is exercised.
test utf-20.1 {TclUniCharNcmp} {
} {}

# "string is alnum" must accept the three-character string
# U+1040 U+021F U+0220.
test utf-21.1 {TclUniCharIsAlnum} {
    # this returns 1 with Unicode 6 compliance
    string is alnum \u1040\u021f\u0220
} {1}
# Same three characters through the regexp engine: both the [[:alnum:]]
# class and the \w escape must match the whole string.
test utf-21.2 {unicode alnum char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220]
} {1 1}
# The regexp [[:print:]] class must match U+FBC1.
test utf-21.3 {unicode print char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    regexp {^[[:print:]]+$} \ufbc1
} 1
# "string is graph" must accept U+0120.
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {







|



|



|







274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
    list [regexp \\d abc456def foo] $foo
} {1 4}

# Placeholder for TclUniCharNcmp: the body is empty and the expected result
# is the empty string, so no behavior is exercised.
test utf-20.1 {TclUniCharNcmp} {
} {}

# "string is alnum" must accept the three-character string
# U+1040 U+021F U+0220.
test utf-21.1 {TclUniCharIsAlnum} {
    # this returns 1 with Unicode 7 compliance
    string is alnum \u1040\u021f\u0220
} {1}
# Same three characters through the regexp engine: both the [[:alnum:]]
# class and the \w escape must match the whole string.
test utf-21.2 {unicode alnum char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220]
} {1 1}
# The regexp [[:print:]] class must match U+FBC1.
test utf-21.3 {unicode print char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    regexp {^[[:print:]]+$} \ufbc1
} 1
# "string is graph" must accept U+0120.
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
} {0}
# The regexp [[:print:]] class must NOT match U+0009 (expected result 0).
test utf-21.10 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
# "string is control" must accept U+00AD.
test utf-21.11 {TclUniCharIsControl} {
    # [Bug 3464428]
    string is control \u00ad
} {1}
# The regexp [[:cntrl:]] class must match the whole string
# U+0000 U+001F U+00AD.
test utf-21.12 {unicode control char in regc_locale.c} {
    # [Bug 3464428], [Bug a876646efe]
    regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad
} {1}

# ASCII case: the word starting at index 0 is the ten characters
# "xyz123_bar", so "string wordend" must return 10 (the index of the
# following space).  Letters, digits and underscore all count as word
# characters here.
test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
# Non-ASCII case: U+5080 sits inside the word and U+203C directly after
# it at index 10.  The expected result 10 means U+5080 is treated as a
# word character and U+203C is not.
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0
} 10

# "string is alpha" must accept the two-character string U+021F U+0220.
test utf-23.1 {TclUniCharIsAlpha} {
    # this returns 1 with Unicode 6 compliance
    string is alpha \u021f\u0220
} {1}
# Same characters through the regexp engine: the [[:alpha:]] class must
# match the whole string.
test utf-23.2 {unicode alpha char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    regexp {^[[:alpha:]]+$} \u021f\u0220
} {1}

# "string is digit" must accept the two-character string U+1040 U+ABF0.
test utf-24.1 {TclUniCharIsDigit} {
    # this returns 1 with Unicode 6 compliance
    string is digit \u1040\uabf0
} {1}
# Same characters through the regexp engine: both the [[:digit:]] class
# and the \d escape must match the whole string.
test utf-24.2 {unicode digit char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0]
} {1 1}

# "string is space" must accept the two-character string U+1680 U+180E.
test utf-24.3 {TclUniCharIsSpace} {
    # this returns 1 with Unicode 6 compliance
    string is space \u1680\u180e
} {1}
# Same characters through the regexp engine: both the [[:space:]] class
# and the \s escape must match the whole string.
test utf-24.4 {unicode space char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:space:]]+$} \u1680\u180e] [regexp {^\s+$} \u1680\u180e]
} {1 1}

# Set the teststringobj constraint from the number of commands named
# teststringobj that "info commands" reports (0 when the command is
# absent), so tests carrying that constraint depend on its presence.
testConstraint teststringobj [llength [info commands teststringobj]]

test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \
    -setup {
	testobj freeallvars







|



|










|
|


|
|



|



|




|
|


|
|







315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
} {0}
# The regexp [[:print:]] class must NOT match U+0009 (expected result 0).
test utf-21.10 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
# "string is control" must accept the whole string U+0000 U+001F U+00AD
# U+0605 U+061C U+180E U+2066 U+FEFF.
test utf-21.11 {TclUniCharIsControl} {
    # [Bug 3464428]
    string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}
# Same characters through the regexp engine: the [[:cntrl:]] class must
# match the whole string.
test utf-21.12 {unicode control char in regc_locale.c} {
    # [Bug 3464428], [Bug a876646efe]
    regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}

# ASCII case: the word starting at index 0 is the ten characters
# "xyz123_bar", so "string wordend" must return 10 (the index of the
# following space).  Letters, digits and underscore all count as word
# characters here.
test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
# Non-ASCII case: U+5080 sits inside the word and U+203C directly after
# it at index 10.  The expected result 10 means U+5080 is treated as a
# word character and U+203C is not.
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0
} 10

# "string is alpha" must accept the four-character string
# U+021F U+0220 U+037F U+052F.
test utf-23.1 {TclUniCharIsAlpha} {
    # this returns 1 with Unicode 7 compliance
    string is alpha \u021f\u0220\u037f\u052f
} {1}
# Same characters through the regexp engine: the [[:alpha:]] class must
# match the whole string.
test utf-23.2 {unicode alpha char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f
} {1}

# "string is digit" must accept the two-character string U+1040 U+ABF0.
test utf-24.1 {TclUniCharIsDigit} {
    # this returns 1 with Unicode 7 compliance
    string is digit \u1040\uabf0
} {1}
# Same characters through the regexp engine: both the [[:digit:]] class
# and the \d escape must match the whole string.
test utf-24.2 {unicode digit char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0]
} {1 1}

# "string is space" must accept the three-character string
# U+1680 U+180E U+202F.
test utf-24.3 {TclUniCharIsSpace} {
    # this returns 1 with Unicode 7 compliance
    string is space \u1680\u180e\u202f
} {1}
# Same characters through the regexp engine: both the [[:space:]] class
# and the \s escape must match the whole string.
test utf-24.4 {unicode space char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    list [regexp {^[[:space:]]+$} \u1680\u180e\u202f] [regexp {^\s+$} \u1680\u180e\u202f]
} {1 1}

# Set the teststringobj constraint from the number of commands named
# teststringobj that "info commands" reports (0 when the command is
# absent), so tests carrying that constraint depend on its presence.
testConstraint teststringobj [llength [info commands teststringobj]]

test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \
    -setup {
	testobj freeallvars