Tcl Source Code

Check-in [e9a619e9dc]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[Bug 3464428] string is graph \u0120 is wrong
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: e9a619e9dc3cc5d617ea1d0682e9a763ece64435
User & Date: jan.nijtmans 2012-01-09 20:34:29
References
2013-06-17
04:50 Ticket [a876646efe] re_expr character class cntrl: should contain \u0000 - \u001f status still Open with 4 other changes artifact: fa040a5149 user: jan.nijtmans
Context
2012-01-12
13:23
[Bug 3466506]: Document more environment variables. check-in: 6f0fbae68b user: dkf tags: trunk
2012-01-09
20:34
[Bug 3464428] string is graph \u0120 is wrong check-in: e9a619e9dc user: jan.nijtmans tags: trunk
20:31
[Bug 3464428] string is graph \u0120 is wrong check-in: 14fc5c19b7 user: jan.nijtmans tags: core-8-5-branch
13:50
Revert mistaken commit. check-in: 38a63eef25 user: dgp tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ChangeLog.








1
2
3
4
5
6
7







2012-01-08  Kevin B. Kenny  <[email protected]>

	* library/clock.tcl (ReadZoneinfoFile): Corrected a bug where loading
	* tests/clock.test (clock-56.4):        zoneinfo would fail if one
	timezone abbreviation was a proper tail of another, and zic used the
	same bytes of the file to represent both of them.  Added a test case
	for the bug, using the same data that caused the observed failure
>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
14
2012-01-09  Jan Nijtmans  <[email protected]>

	* generic/tclUtf.c:      [Bug 3464428] string is graph \u0120 is wrong
	* generic/regc_locale.c: Add table for Unicode [:cntrl:] class
	* tools/uniClass.tcl:    Generate Unicode [:cntrl:] class table
	* tests/utf.test:

2012-01-08  Kevin B. Kenny  <[email protected]>

	* library/clock.tcl (ReadZoneinfoFile): Corrected a bug where loading
	* tests/clock.test (clock-56.4):        zoneinfo would fail if one
	timezone abbreviation was a proper tail of another, and zic used the
	same bytes of the file to represent both of them.  Added a test case
	for the bug, using the same data that caused the observed failure

Changes to generic/regc_locale.c.

219
220
221
222
223
224
225

















226
227
228
229
230
231
232
    0x2e2f, 0x3005, 0x3006, 0x303b, 0x303c, 0xa62a, 0xa62b, 0xa790, 0xa791,
    0xa8fb, 0xa9cf, 0xaa7a, 0xaab1, 0xaab5, 0xaab6, 0xaac0, 0xaac2, 0xfb1d,
    0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44
};

#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))


















/*
 * Unicode: decimal digit characters.
 */

static const crange digitRangeTable[] = {
    {0x0030, 0x0039}, {0x0660, 0x0669}, {0x06f0, 0x06f9}, {0x07c0, 0x07c9},
    {0x0966, 0x096f}, {0x09e6, 0x09ef}, {0x0a66, 0x0a6f}, {0x0ae6, 0x0aef},







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
    0x2e2f, 0x3005, 0x3006, 0x303b, 0x303c, 0xa62a, 0xa62b, 0xa790, 0xa791,
    0xa8fb, 0xa9cf, 0xaa7a, 0xaab1, 0xaab5, 0xaab6, 0xaac0, 0xaac2, 0xfb1d,
    0xfb3e, 0xfb40, 0xfb41, 0xfb43, 0xfb44
};

#define NUM_ALPHA_CHAR (sizeof(alphaCharTable)/sizeof(chr))

/*
 * Unicode: control characters.
 */

static const crange controlRangeTable[] = {
    {0x007f, 0x009f}, {0x0600, 0x0603}, {0x200b, 0x200f}, {0x202a, 0x202e},
    {0x2060, 0x2064}, {0x206a, 0x206f}, {0xe000, 0xf8ff}, {0xfff9, 0xfffb}
};

#define NUM_CONTROL_RANGE (sizeof(controlRangeTable)/sizeof(crange))

static const chr controlCharTable[] = {
    0x00ad, 0x06dd, 0x070f, 0x17b4, 0x17b5, 0xfeff
};

#define NUM_CONTROL_CHAR (sizeof(controlCharTable)/sizeof(chr))

/*
 * Unicode: decimal digit characters.
 */

static const crange digitRangeTable[] = {
    {0x0030, 0x0039}, {0x0660, 0x0669}, {0x06f0, 0x06f9}, {0x07c0, 0x07c9},
    {0x0966, 0x096f}, {0x09e6, 0x09ef}, {0x0a66, 0x0a6f}, {0x0ae6, 0x0aef},
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))

/*
 * Unicode: unicode print characters excluding space.
 */

static const crange graphRangeTable[] = {
    {0x0021, 0x007e}, {0x00a0, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
    {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x0527}, {0x0531, 0x0556},
    {0x0559, 0x055f}, {0x0561, 0x0587}, {0x0591, 0x05c7}, {0x05d0, 0x05ea},
    {0x05f0, 0x05f4}, {0x0606, 0x061b}, {0x061e, 0x06dc}, {0x06de, 0x070d},
    {0x0710, 0x074a}, {0x074d, 0x07b1}, {0x07c0, 0x07fa}, {0x0800, 0x082d},
    {0x0830, 0x083e}, {0x0840, 0x085b}, {0x0900, 0x0977}, {0x0979, 0x097f},
    {0x0981, 0x0983}, {0x0985, 0x098c}, {0x0993, 0x09a8}, {0x09aa, 0x09b0},
    {0x09b6, 0x09b9}, {0x09bc, 0x09c4}, {0x09cb, 0x09ce}, {0x09df, 0x09e3},







|







491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
#define NUM_UPPER_CHAR (sizeof(upperCharTable)/sizeof(chr))

/*
 * Unicode: unicode print characters excluding space.
 */

static const crange graphRangeTable[] = {
    {0x0021, 0x007e}, {0x00a1, 0x00ac}, {0x00ae, 0x0377}, {0x037a, 0x037e},
    {0x0384, 0x038a}, {0x038e, 0x03a1}, {0x03a3, 0x0527}, {0x0531, 0x0556},
    {0x0559, 0x055f}, {0x0561, 0x0587}, {0x0591, 0x05c7}, {0x05d0, 0x05ea},
    {0x05f0, 0x05f4}, {0x0606, 0x061b}, {0x061e, 0x06dc}, {0x06de, 0x070d},
    {0x0710, 0x074a}, {0x074d, 0x07b1}, {0x07c0, 0x07fa}, {0x0800, 0x082d},
    {0x0830, 0x083e}, {0x0840, 0x085b}, {0x0900, 0x0977}, {0x0979, 0x097f},
    {0x0981, 0x0983}, {0x0985, 0x098c}, {0x0993, 0x09a8}, {0x09aa, 0x09b0},
    {0x09b6, 0x09b9}, {0x09bc, 0x09c4}, {0x09cb, 0x09ce}, {0x09df, 0x09e3},
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
    {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb9}, {0x0ebb, 0x0ebd}, {0x0ec0, 0x0ec4},
    {0x0ec8, 0x0ecd}, {0x0ed0, 0x0ed9}, {0x0f00, 0x0f47}, {0x0f49, 0x0f6c},
    {0x0f71, 0x0f97}, {0x0f99, 0x0fbc}, {0x0fbe, 0x0fcc}, {0x0fce, 0x0fda},
    {0x1000, 0x10c5}, {0x10d0, 0x10fc}, {0x1100, 0x1248}, {0x124a, 0x124d},
    {0x1250, 0x1256}, {0x125a, 0x125d}, {0x1260, 0x1288}, {0x128a, 0x128d},
    {0x1290, 0x12b0}, {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5},
    {0x12c8, 0x12d6}, {0x12d8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135a},
    {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x169c},
    {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714}, {0x1720, 0x1736},
    {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770}, {0x1780, 0x17b3},
    {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9}, {0x1800, 0x180e},
    {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa}, {0x18b0, 0x18f5},
    {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b}, {0x1944, 0x196d},
    {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9}, {0x19d0, 0x19da},
    {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c}, {0x1a7f, 0x1a89},
    {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b}, {0x1b50, 0x1b7c},
    {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3}, {0x1bfc, 0x1c37},
    {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2}, {0x1d00, 0x1de6},
    {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45}, {0x1f48, 0x1f4d},
    {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4}, {0x1fb6, 0x1fc4},
    {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef}, {0x1ff2, 0x1ff4},
    {0x1ff6, 0x1ffe}, {0x2000, 0x200a}, {0x2010, 0x2029}, {0x202f, 0x205f},
    {0x2074, 0x208e}, {0x2090, 0x209c}, {0x20a0, 0x20b9}, {0x20d0, 0x20f0},
    {0x2100, 0x2189}, {0x2190, 0x23f3}, {0x2400, 0x2426}, {0x2440, 0x244a},
    {0x2460, 0x26ff}, {0x2701, 0x27ca}, {0x27ce, 0x2b4c}, {0x2b50, 0x2b59},
    {0x2c00, 0x2c2e}, {0x2c30, 0x2c5e}, {0x2c60, 0x2cf1}, {0x2cf9, 0x2d25},
    {0x2d30, 0x2d65}, {0x2d7f, 0x2d96}, {0x2da0, 0x2da6}, {0x2da8, 0x2dae},
    {0x2db0, 0x2db6}, {0x2db8, 0x2dbe}, {0x2dc0, 0x2dc6}, {0x2dc8, 0x2dce},
    {0x2dd0, 0x2dd6}, {0x2dd8, 0x2dde}, {0x2de0, 0x2e31}, {0x2e80, 0x2e99},
    {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3000, 0x303f},
    {0x3041, 0x3096}, {0x3099, 0x30ff}, {0x3105, 0x312d}, {0x3131, 0x318e},
    {0x3190, 0x31ba}, {0x31c0, 0x31e3}, {0x31f0, 0x321e}, {0x3220, 0x32fe},
    {0x3300, 0x4db5}, {0x4dc0, 0x9fcb}, {0xa000, 0xa48c}, {0xa490, 0xa4c6},
    {0xa4d0, 0xa62b}, {0xa640, 0xa673}, {0xa67c, 0xa697}, {0xa6a0, 0xa6f7},
    {0xa700, 0xa78e}, {0xa7a0, 0xa7a9}, {0xa7fa, 0xa82b}, {0xa830, 0xa839},
    {0xa840, 0xa877}, {0xa880, 0xa8c4}, {0xa8ce, 0xa8d9}, {0xa8e0, 0xa8fb},
    {0xa900, 0xa953}, {0xa95f, 0xa97c}, {0xa980, 0xa9cd}, {0xa9cf, 0xa9d9},







|
|
|
|
|
|
|
|
|
|
|
|
|
|
|







|







526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
    {0x0ea1, 0x0ea3}, {0x0ead, 0x0eb9}, {0x0ebb, 0x0ebd}, {0x0ec0, 0x0ec4},
    {0x0ec8, 0x0ecd}, {0x0ed0, 0x0ed9}, {0x0f00, 0x0f47}, {0x0f49, 0x0f6c},
    {0x0f71, 0x0f97}, {0x0f99, 0x0fbc}, {0x0fbe, 0x0fcc}, {0x0fce, 0x0fda},
    {0x1000, 0x10c5}, {0x10d0, 0x10fc}, {0x1100, 0x1248}, {0x124a, 0x124d},
    {0x1250, 0x1256}, {0x125a, 0x125d}, {0x1260, 0x1288}, {0x128a, 0x128d},
    {0x1290, 0x12b0}, {0x12b2, 0x12b5}, {0x12b8, 0x12be}, {0x12c2, 0x12c5},
    {0x12c8, 0x12d6}, {0x12d8, 0x1310}, {0x1312, 0x1315}, {0x1318, 0x135a},
    {0x135d, 0x137c}, {0x1380, 0x1399}, {0x13a0, 0x13f4}, {0x1400, 0x167f},
    {0x1681, 0x169c}, {0x16a0, 0x16f0}, {0x1700, 0x170c}, {0x170e, 0x1714},
    {0x1720, 0x1736}, {0x1740, 0x1753}, {0x1760, 0x176c}, {0x176e, 0x1770},
    {0x1780, 0x17b3}, {0x17b6, 0x17dd}, {0x17e0, 0x17e9}, {0x17f0, 0x17f9},
    {0x1800, 0x180d}, {0x1810, 0x1819}, {0x1820, 0x1877}, {0x1880, 0x18aa},
    {0x18b0, 0x18f5}, {0x1900, 0x191c}, {0x1920, 0x192b}, {0x1930, 0x193b},
    {0x1944, 0x196d}, {0x1970, 0x1974}, {0x1980, 0x19ab}, {0x19b0, 0x19c9},
    {0x19d0, 0x19da}, {0x19de, 0x1a1b}, {0x1a1e, 0x1a5e}, {0x1a60, 0x1a7c},
    {0x1a7f, 0x1a89}, {0x1a90, 0x1a99}, {0x1aa0, 0x1aad}, {0x1b00, 0x1b4b},
    {0x1b50, 0x1b7c}, {0x1b80, 0x1baa}, {0x1bae, 0x1bb9}, {0x1bc0, 0x1bf3},
    {0x1bfc, 0x1c37}, {0x1c3b, 0x1c49}, {0x1c4d, 0x1c7f}, {0x1cd0, 0x1cf2},
    {0x1d00, 0x1de6}, {0x1dfc, 0x1f15}, {0x1f18, 0x1f1d}, {0x1f20, 0x1f45},
    {0x1f48, 0x1f4d}, {0x1f50, 0x1f57}, {0x1f5f, 0x1f7d}, {0x1f80, 0x1fb4},
    {0x1fb6, 0x1fc4}, {0x1fc6, 0x1fd3}, {0x1fd6, 0x1fdb}, {0x1fdd, 0x1fef},
    {0x1ff2, 0x1ff4}, {0x1ff6, 0x1ffe}, {0x2010, 0x2027}, {0x2030, 0x205e},
    {0x2074, 0x208e}, {0x2090, 0x209c}, {0x20a0, 0x20b9}, {0x20d0, 0x20f0},
    {0x2100, 0x2189}, {0x2190, 0x23f3}, {0x2400, 0x2426}, {0x2440, 0x244a},
    {0x2460, 0x26ff}, {0x2701, 0x27ca}, {0x27ce, 0x2b4c}, {0x2b50, 0x2b59},
    {0x2c00, 0x2c2e}, {0x2c30, 0x2c5e}, {0x2c60, 0x2cf1}, {0x2cf9, 0x2d25},
    {0x2d30, 0x2d65}, {0x2d7f, 0x2d96}, {0x2da0, 0x2da6}, {0x2da8, 0x2dae},
    {0x2db0, 0x2db6}, {0x2db8, 0x2dbe}, {0x2dc0, 0x2dc6}, {0x2dc8, 0x2dce},
    {0x2dd0, 0x2dd6}, {0x2dd8, 0x2dde}, {0x2de0, 0x2e31}, {0x2e80, 0x2e99},
    {0x2e9b, 0x2ef3}, {0x2f00, 0x2fd5}, {0x2ff0, 0x2ffb}, {0x3001, 0x303f},
    {0x3041, 0x3096}, {0x3099, 0x30ff}, {0x3105, 0x312d}, {0x3131, 0x318e},
    {0x3190, 0x31ba}, {0x31c0, 0x31e3}, {0x31f0, 0x321e}, {0x3220, 0x32fe},
    {0x3300, 0x4db5}, {0x4dc0, 0x9fcb}, {0xa000, 0xa48c}, {0xa490, 0xa4c6},
    {0xa4d0, 0xa62b}, {0xa640, 0xa673}, {0xa67c, 0xa697}, {0xa6a0, 0xa6f7},
    {0xa700, 0xa78e}, {0xa7a0, 0xa7a9}, {0xa7fa, 0xa82b}, {0xa830, 0xa839},
    {0xa840, 0xa877}, {0xa880, 0xa8c4}, {0xa8ce, 0xa8d9}, {0xa8e0, 0xa8fb},
    {0xa900, 0xa953}, {0xa95f, 0xa97c}, {0xa980, 0xa9cd}, {0xa9cf, 0xa9d9},
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814








815
816
817
818
819
820
821
     * Extract the class name
     */

    len = endp - startp;
    Tcl_DStringInit(&ds);
    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);

    /*
     * Remap lower and upper to alpha if the match is case insensitive.
     */

    if (cases && len == 5 && (strncmp("lower", np, 5) == 0
	    || strncmp("upper", np, 5) == 0)) {
	np = "alpha";
    }

    /*
     * Map the name to the corresponding enumerated value.
     */

    index = -1;
    for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
	if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) {
	    index = i;
	    break;
	}
    }
    Tcl_DStringFree(&ds);
    if (index == -1) {
	ERR(REG_ECTYPE);
	return NULL;
    }









    /*
     * Now compute the character class contents.
     */

    switch((enum classes) index) {
    case CC_ALNUM:







<
<
<
<
<
<
<
<
<
















>
>
>
>
>
>
>
>







800
801
802
803
804
805
806









807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
     * Extract the class name
     */

    len = endp - startp;
    Tcl_DStringInit(&ds);
    np = Tcl_UniCharToUtfDString(startp, (int)len, &ds);










    /*
     * Map the name to the corresponding enumerated value.
     */

    index = -1;
    for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
	if ((strlen(*namePtr) == len) && (strncmp(*namePtr, np, len) == 0)) {
	    index = i;
	    break;
	}
    }
    Tcl_DStringFree(&ds);
    if (index == -1) {
	ERR(REG_ECTYPE);
	return NULL;
    }

    /*
     * Remap lower and upper to alpha if the match is case insensitive.
     */

    if (cases && ((index == CC_LOWER) || (index == CC_UPPER))) {
	index = CC_ALNUM;
    }

    /*
     * Now compute the character class contents.
     */

    switch((enum classes) index) {
    case CC_ALNUM:
854
855
856
857
858
859
860
861
862

863






864
865
866
867
868
869
870
	break;
    case CC_BLANK:
	cv = getcvec(v, 2, 0);
	addchr(cv, '\t');
	addchr(cv, ' ');
	break;
    case CC_CNTRL:
	cv = getcvec(v, 0, 2);
	addrange(cv, 0x0, 0x1f);

	addrange(cv, 0x7f, 0x9f);






	break;
    case CC_DIGIT:
	cv = getcvec(v, 0, NUM_DIGIT_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) {
		addrange(cv, digitRangeTable[i].start,
			digitRangeTable[i].end);







|
|
>
|
>
>
>
>
>
>







870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
	break;
    case CC_BLANK:
	cv = getcvec(v, 2, 0);
	addchr(cv, '\t');
	addchr(cv, ' ');
	break;
    case CC_CNTRL:
	cv = getcvec(v, NUM_CONTROL_CHAR, NUM_CONTROL_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_CONTROL_RANGE ; i++) {
		addrange(cv, controlRangeTable[i].start,
			controlRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_CONTROL_CHAR ; i++) {
		addchr(cv, controlCharTable[i]);
	    }
	}
	break;
    case CC_DIGIT:
	cv = getcvec(v, 0, NUM_DIGIT_RANGE);
	if (cv) {
	    for (i=0 ; (size_t)i<NUM_DIGIT_RANGE ; i++) {
		addrange(cv, digitRangeTable[i].start,
			digitRangeTable[i].end);
933
934
935
936
937
938
939


















940
941
942
943
944
945
946
947
948
949
950
951
952
953
	    }
	    for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) {
		addchr(cv, upperCharTable[i]);
	    }
	}
	break;
    case CC_PRINT:


















    case CC_GRAPH:
	cv  = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE);
	if (cv) {
		/* For CC_PRINT, include space as well */
		addrange(cv, graphRangeTable[0].start - (index == CC_PRINT),
			graphRangeTable[0].end);
	    for (i=1 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
		addrange(cv, graphRangeTable[i].start,
			graphRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
		addchr(cv, graphCharTable[i]);
	    }
	}







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



<
<
<
|







956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983



984
985
986
987
988
989
990
991
	    }
	    for (i=0 ; (size_t)i<NUM_UPPER_CHAR ; i++) {
		addchr(cv, upperCharTable[i]);
	    }
	}
	break;
    case CC_PRINT:
    	cv  = getcvec(v, NUM_SPACE_CHAR + NUM_GRAPH_CHAR, NUM_SPACE_RANGE + NUM_GRAPH_RANGE  - 1);
    	if (cv) {
    	    for (i=1 ; (size_t)i<NUM_SPACE_RANGE ; i++) {
    		addrange(cv, spaceRangeTable[i].start,
    				spaceRangeTable[i].end);
    	    }
    	    for (i=0 ; (size_t)i<NUM_SPACE_CHAR ; i++) {
    		addchr(cv, spaceCharTable[i]);
    	    }
    	    for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
    		addrange(cv, graphRangeTable[i].start,
    				graphRangeTable[i].end);
    	    }
    	    for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
    		addchr(cv, graphCharTable[i]);
    	    }
    	}
    	break;
    case CC_GRAPH:
	cv  = getcvec(v, NUM_GRAPH_CHAR, NUM_GRAPH_RANGE);
	if (cv) {



	    for (i=0 ; (size_t)i<NUM_GRAPH_RANGE ; i++) {
		addrange(cv, graphRangeTable[i].start,
			graphRangeTable[i].end);
	    }
	    for (i=0 ; (size_t)i<NUM_GRAPH_CHAR ; i++) {
		addchr(cv, graphCharTable[i]);
	    }
	}

Changes to generic/tclUtf.c.

22
23
24
25
26
27
28


29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49







50
51
52
53
54
55
56
 * values are shifted right by the category value to determine whether the
 * given category is included in the set.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))



#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
	| (1 << PARAGRAPH_SEPARATOR))

#define CONNECTOR_BITS (1 << CONNECTOR_PUNCTUATION)

#define PRINT_BITS (ALPHA_BITS | DIGIT_BITS | SPACE_BITS | \
	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
	(1 << OTHER_NUMBER) | (1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION) | \
	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))








/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
 */

#define UNICODE_SELF	0x80







>
>





<
<
|
<
<
<
<
<
<
<
<





>
>
>
>
>
>
>







22
23
24
25
26
27
28
29
30
31
32
33
34
35


36








37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
 * values are shifted right by the category value to determine whether the
 * given category is included in the set.
 */

#define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))

#define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))

#define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)

#define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
	| (1 << PARAGRAPH_SEPARATOR))



#define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))









#define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))

#define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
	(1 << OTHER_NUMBER) | \
	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))

/*
 * Unicode characters less than this value are represented by themselves in
 * UTF-8 strings.
 */

#define UNICODE_SELF	0x80
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsAlnum(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

    return (((ALPHA_BITS | DIGIT_BITS) >> category) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlpha --
 *







<
<
|







1324
1325
1326
1327
1328
1329
1330


1331
1332
1333
1334
1335
1336
1337
1338
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsAlnum(
    int ch)			/* Unicode character to test. */
{


    return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsAlpha --
 *
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsAlpha(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
    return ((ALPHA_BITS >> category) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsControl --
 *







<
|







1347
1348
1349
1350
1351
1352
1353

1354
1355
1356
1357
1358
1359
1360
1361
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsAlpha(
    int ch)			/* Unicode character to test. */
{

    return ((ALPHA_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsControl --
 *
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsControl(
    int ch)			/* Unicode character to test. */
{
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == CONTROL);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsDigit --
 *







|







1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsControl(
    int ch)			/* Unicode character to test. */
{
    return ((CONTROL_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsDigit --
 *
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsDigit(
    int ch)			/* Unicode character to test. */
{
    return (GetUniCharInfo(ch)&UNICODE_CATEGORY_MASK) == DECIMAL_DIGIT_NUMBER;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsGraph --
 *







|







1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsDigit(
    int ch)			/* Unicode character to test. */
{
    return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsGraph --
 *
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsGraph(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
    return (((PRINT_BITS >> category) & 1) && (ch != ' '));
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsLower --
 *







<
|







1416
1417
1418
1419
1420
1421
1422

1423
1424
1425
1426
1427
1428
1429
1430
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsGraph(
    int ch)			/* Unicode character to test. */
{

    return ((GRAPH_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsLower --
 *
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsLower(
    int ch)			/* Unicode character to test. */
{
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == LOWERCASE_LETTER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPrint --
 *







|







1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsLower(
    int ch)			/* Unicode character to test. */
{
    return (GetCategory(ch) == LOWERCASE_LETTER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPrint --
 *
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsPrint(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
    return ((PRINT_BITS >> category) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPunct --
 *







<
|







1462
1463
1464
1465
1466
1467
1468

1469
1470
1471
1472
1473
1474
1475
1476
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsPrint(
    int ch)			/* Unicode character to test. */
{

    return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsPunct --
 *
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsPunct(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
    return ((PUNCT_BITS >> category) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *







<
|







1485
1486
1487
1488
1489
1490
1491

1492
1493
1494
1495
1496
1497
1498
1499
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsPunct(
    int ch)			/* Unicode character to test. */
{

    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsSpace(
    int ch)			/* Unicode character to test. */
{
    register int category;

    /*
     * If the character is within the first 127 characters, just use the
     * standard C function, otherwise consult the Unicode table.
     */

    if (ch < 0x80) {
	return TclIsSpaceProc((char)ch);
    } else {
	category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);
	return ((SPACE_BITS >> category) & 1);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --







<
<





|
|

<
|







1508
1509
1510
1511
1512
1513
1514


1515
1516
1517
1518
1519
1520
1521
1522

1523
1524
1525
1526
1527
1528
1529
1530
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsSpace(
    int ch)			/* Unicode character to test. */
{


    /*
     * If the character is within the first 127 characters, just use the
     * standard C function, otherwise consult the Unicode table.
     */

    if (((Tcl_UniChar) ch) < ((Tcl_UniChar) 0x80)) {
	return isspace(UCHAR(ch)); /* INTL: ISO space */
    } else {

	return ((SPACE_BITS >> GetCategory(ch)) & 1);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsUpper(
    int ch)			/* Unicode character to test. */
{
    return ((GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK) == UPPERCASE_LETTER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *







|







1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsUpper(
    int ch)			/* Unicode character to test. */
{
    return (GetCategory(ch) == UPPERCASE_LETTER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsWordChar(
    int ch)			/* Unicode character to test. */
{
    register int category = (GetUniCharInfo(ch) & UNICODE_CATEGORY_MASK);

    return (((ALPHA_BITS | DIGIT_BITS | CONNECTOR_BITS) >> category) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharCaseMatch --
 *







<
|
<







1563
1564
1565
1566
1567
1568
1569

1570

1571
1572
1573
1574
1575
1576
1577
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsWordChar(
    int ch)			/* Unicode character to test. */
{

    return ((WORD_BITS >> GetCategory(ch)) & 1);

}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharCaseMatch --
 *

Changes to tests/utf.test.

304
305
306
307
308
309
310




























311
312
313
314
315
316
317
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {
    # [Bug 3464428]
    regexp {^[[:graph:]]+$} \u0120




























} {1}

test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
test utf-21.4 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u0120
} {1}
test utf-21.5 {unicode graph char in regc_locale.c} {
    # [Bug 3464428]
    regexp {^[[:graph:]]+$} \u0120
} {1}
test utf-21.6 {TclUniCharIsGraph} {
    # [Bug 3464428]
    string is graph \u00a0
} {0}
test utf-21.7 {unicode graph char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:graph:]]} \u0020\u00a0\u2028\u2029
} {0}
test utf-21.8 {TclUniCharIsPrint} {
    # [Bug 3464428]
    string is print \u0009
} {0}
test utf-21.9 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
test utf-21.10 {unicode print char in regc_locale.c} {
    # [Bug 3464428]
    regexp {[[:print:]]} \u0009
} {0}
test utf-21.11 {TclUniCharIsControl} {
    # [Bug 3464428]
    string is control \u00ad
} {1}
test utf-21.12 {unicode control char in regc_locale.c} {
    # [Bug 3464428]
    regexp {^[[:cntrl:]]$} \u00ad
} {1}

test utf-22.1 {TclUniCharIsWordChar} {
    string wordend "xyz123_bar fg" 0
} 10
test utf-22.2 {TclUniCharIsWordChar} {
    string wordend "x\u5080z123_bar\u203c fg" 0

Changes to tools/uniClass.tcl.

83
84
85
86
87
88
89

90
91
92
93
94
95
96
 *	is automatically generated by the tools/uniClass.tcl script
 *	and used in generic/regc_locale.c.  Do not modify by hand.
 */
"

foreach {type desc} {
    alpha "alphabetic characters"

    digit "decimal digit characters"
    punct "punctuation characters"
    space "white space characters"
    lower "lowercase characters"
    upper "uppercase characters"
    graph "unicode print characters excluding space"
} {







>







83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
 *	is automatically generated by the tools/uniClass.tcl script
 *	and used in generic/regc_locale.c.  Do not modify by hand.
 */
"

foreach {type desc} {
    alpha "alphabetic characters"
    control "control characters"
    digit "decimal digit characters"
    punct "punctuation characters"
    space "white space characters"
    lower "lowercase characters"
    upper "uppercase characters"
    graph "unicode print characters excluding space"
} {