Tcl Source Code

Check-in [8320e361b7]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for Tcl it still is. Update documentation accordingly, and extend test-cases for Unicode 7 compliance.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 8320e361b7bd18ae338aa6699b19862921772c7b
User & Date: jan.nijtmans 2014-07-11 10:43:15
Context
2014-07-11
12:56
Stop memleak in [info frame]. check-in: 1610c45009 user: dgp tags: trunk
10:43
Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for... check-in: 8320e361b7 user: jan.nijtmans tags: trunk
10:38
Starting with Unicode 6.3, the mongolian vowel separator (U+180e) is no longer a whitespace, but for... check-in: f199c9d65c user: jan.nijtmans tags: core-8-5-branch
04:49
[3479689] Plug memory leak due to incomplete bug fix. check-in: 2830730e99 user: dgp tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to doc/string.n.

127
128
129
130
131
132
133
134

135
136
137
138
139
140
141
142
.IP \fBlower\fR 12
Any Unicode lower case alphabet character.
.IP \fBprint\fR 12
Any Unicode printing character, including space.
.IP \fBpunct\fR 12
Any Unicode punctuation character.
.IP \fBspace\fR 12
Any Unicode whitespace character, zero width space (U+200b),

word joiner (U+2060) and zero width no-break space (U+feff) (=BOM).
.IP \fBtrue\fR 12
Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
true.
.IP \fBupper\fR 12
Any upper case alphabet character in the Unicode character set.
.IP \fBwideinteger\fR 12
Any of the valid forms for a wide integer in Tcl, with optional







|
>
|







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
.IP \fBlower\fR 12
Any Unicode lower case alphabet character.
.IP \fBprint\fR 12
Any Unicode printing character, including space.
.IP \fBpunct\fR 12
Any Unicode punctuation character.
.IP \fBspace\fR 12
Any Unicode whitespace character, mongolian vowel separator
(U+180e), zero width space (U+200b), word joiner (U+2060) or
zero width no-break space (U+feff) (=BOM).
.IP \fBtrue\fR 12
Any of the forms allowed to \fBTcl_GetBoolean\fR where the value is
true.
.IP \fBupper\fR 12
Any upper case alphabet character in the Unicode character set.
.IP \fBwideinteger\fR 12
Any of the valid forms for a wide integer in Tcl, with optional

Changes to tests/utf.test.

289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
    list [regexp \\d abc456def foo] $foo
} {1 4}

test utf-20.1 {TclUniCharNcmp} {
} {}

test utf-21.1 {TclUniCharIsAlnum} {
    # this returns 1 with Unicode 6 compliance
    string is alnum \u1040\u021f\u0220
} {1}
test utf-21.2 {unicode alnum char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220]
} {1 1}
test utf-21.3 {unicode print char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    regexp {^[[:print:]]+$} \ufbc1
} 1
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {







|



|



|







289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
    list [regexp \\d abc456def foo] $foo
} {1 4}

test utf-20.1 {TclUniCharNcmp} {
} {}

test utf-21.1 {TclUniCharIsAlnum} {
    # this returns 1 with Unicode 7 compliance
    string is alnum \u1040\u021f\u0220
} {1}
test utf-21.2 {unicode alnum char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    list [regexp {^[[:alnum:]]+$} \u1040\u021f\u0220] [regexp {^\w+$} \u1040\u021f\u0220]
} {1 1}
test utf-21.3 {unicode print char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    regexp {^[[:print:]]+$} \ufbc1
} 1
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
} {0}
test utf-21.10 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
test utf-21.11 {TclUniCharIsControl} {
    # [Bug 3464428]
    string is control \u00ad
} {1}
test utf-21.12 {unicode control char in regc_locale.c} {
    # [Bug 3464428], [Bug a876646efe]
    regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad
} {1}

test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0
} 10

test utf-23.1 {TclUniCharIsAlpha} {
    # this returns 1 with Unicode 6 compliance
    string is alpha \u021f\u0220
} {1}
test utf-23.2 {unicode alpha char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    regexp {^[[:alpha:]]+$} \u021f\u0220
} {1}

test utf-24.1 {TclUniCharIsDigit} {
    # this returns 1 with Unicode 6 compliance
    string is digit \u1040\uabf0
} {1}
test utf-24.2 {unicode digit char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0]
} {1 1}

test utf-24.3 {TclUniCharIsSpace} {
    # this returns 1 with Unicode 6 compliance
    string is space \u1680\u180e
} {1}
test utf-24.4 {unicode space char in regc_locale.c} {
    # this returns 1 with Unicode 6 compliance
    list [regexp {^[[:space:]]+$} \u1680\u180e] [regexp {^\s+$} \u1680\u180e]
} {1 1}

testConstraint teststringobj [llength [info commands teststringobj]]

test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \
    -setup {
	testobj freeallvars







|



|










|
|


|
|



|



|




|
|


|
|







330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
} {0}
test utf-21.10 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
test utf-21.11 {TclUniCharIsControl} {
    # [Bug 3464428]
    string is control \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}
test utf-21.12 {unicode control char in regc_locale.c} {
    # [Bug 3464428], [Bug a876646efe]
    regexp {^[[:cntrl:]]*$} \u0000\u001f\u00ad\u0605\u061c\u180e\u2066\ufeff
} {1}

test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0
} 10

test utf-23.1 {TclUniCharIsAlpha} {
    # this returns 1 with Unicode 7 compliance
    string is alpha \u021f\u0220\u037f\u052f
} {1}
test utf-23.2 {unicode alpha char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    regexp {^[[:alpha:]]+$} \u021f\u0220\u037f\u052f
} {1}

test utf-24.1 {TclUniCharIsDigit} {
    # this returns 1 with Unicode 7 compliance
    string is digit \u1040\uabf0
} {1}
test utf-24.2 {unicode digit char in regc_locale.c} {
    # this returns 1 with Unicode 7 compliance
    list [regexp {^[[:digit:]]+$} \u1040\uabf0] [regexp {^\d+$} \u1040\uabf0]
} {1 1}

test utf-24.3 {TclUniCharIsSpace} {
    # this returns 1 with Unicode 7/TIP 413 compliance
    string is space \u0085\u1680\u180e\u200b\u202f\u2060
} {1}
test utf-24.4 {unicode space char in regc_locale.c} {
    # this returns 1 with Unicode 7/TIP 413 compliance
    list [regexp {^[[:space:]]+$} \u0085\u1680\u180e\u200b\u202f\u2060] [regexp {^\s+$} \u0085\u1680\u180e\u200b\u202f\u2060]
} {1 1}

testConstraint teststringobj [llength [info commands teststringobj]]

test utf-25.1 {Tcl_UniCharNcasecmp} -constraints teststringobj \
    -setup {
	testobj freeallvars