Tcl Source Code

Check-in [1f5f4a676e]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:[Bug 3444754] string tolower \u01c5 is wrong
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 1f5f4a676e1181cbafe2ac97dc839ded45c65618
User & Date: jan.nijtmans 2011-12-07 06:23:59
Context
2011-12-11
09:17
Some Unicode 6.0 chars not in [:print:] class check-in: 95f8aa7255 user: jan.nijtmans tags: trunk
2011-12-07
06:23
[Bug 3444754] string tolower \u01c5 is wrong check-in: 1f5f4a676e user: jan.nijtmans tags: trunk
06:17
[Bug 3444754] string tolower \u01c5 is wrong check-in: 74cab55c44 user: jan.nijtmans tags: core-8-5-branch
2011-12-01
09:18
Note incompatibility caused by switch of flags to dlopen() for Bug 3216070. check-in: 41679f73e6 user: dkf tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to ChangeLog.







1
2
3
4
5
6
7






2011-11-30  Jan Nijtmans  <[email protected]>

	* library/tcltest/tcltest.tcl: [Bug 967195]: Make tcltest work
	when tclsh is compiled without using the setargv() function on mingw.
	(no need to incr the version, since 2.2.10 is never released)

2011-11-29  Jan Nijtmans  <[email protected]>
>
>
>
>
>
>







1
2
3
4
5
6
7
8
9
10
11
12
13
2011-12-07  Jan Nijtmans  <[email protected]>

	* tools/uniParse.tcl:    [Bug 3444754] string tolower \u01c5 is wrong
	* generic/tclUniData.c:
	* tests/utf.test:

2011-11-30  Jan Nijtmans  <[email protected]>

	* library/tcltest/tcltest.tcl: [Bug 967195]: Make tcltest work
	when tclsh is compiled without using the setargv() function on mingw.
	(no need to incr the version, since 2.2.10 is never released)

2011-11-29  Jan Nijtmans  <[email protected]>

Changes to generic/tclUniData.c.

747
748
749
750
751
752
753
754
755
756
757
758
759
760
761

static const int groups[] = {
    0, 15, 12, 25, 27, 21, 22, 26, 20, 9, 1048641, 28, 19, 1048706,
    29, 2, 23, 16, 11, -24346494, 24, -3964798, 32833, 32898, -6520767,
    7602306, -3964863, 9830530, -6389630, 6881345, 6750273, 6717505,
    2588737, 6619201, 6651969, 6783041, -3178366, 6914113, 6848577,
    -5341054, 6979649, -4259710, 7012417, 7143489, 7110721, 7176257,
    5, -1834878, 65633, 32931, 65698, 2588802, -3178431, -1834943,
    -4259775, 353730625, -5341119, 353632321, -354385790, -6389695,
    2261057, 2326593, -353337214, -353238910, -353304446, 6881410,
    6750338, 6717570, 6619266, 6652034, 6783106, -1385430910, 6848642,
    6914178, -352026494, -352223102, 6979714, 7012482, -351502206,
    7143554, 2261122, 7110786, 2326658, 7176322, 4, 6, -2752378, 1245249,
    1212481, 2097217, 2064449, 1245314, 1212546, 1015938, 2097282,
    2064514, 262209, 2031746, 1867906, 1, 1540226, 1769602, 262274,







|







747
748
749
750
751
752
753
754
755
756
757
758
759
760
761

static const int groups[] = {
    0, 15, 12, 25, 27, 21, 22, 26, 20, 9, 1048641, 28, 19, 1048706,
    29, 2, 23, 16, 11, -24346494, 24, -3964798, 32833, 32898, -6520767,
    7602306, -3964863, 9830530, -6389630, 6881345, 6750273, 6717505,
    2588737, 6619201, 6651969, 6783041, -3178366, 6914113, 6848577,
    -5341054, 6979649, -4259710, 7012417, 7143489, 7110721, 7176257,
    5, -1834878, 65633, 32963, 65698, 2588802, -3178431, -1834943,
    -4259775, 353730625, -5341119, 353632321, -354385790, -6389695,
    2261057, 2326593, -353337214, -353238910, -353304446, 6881410,
    6750338, 6717570, 6619266, 6652034, 6783106, -1385430910, 6848642,
    6914178, -352026494, -352223102, 6979714, 7012482, -351502206,
    7143554, 2261122, 7110786, 2326658, 7176322, 4, 6, -2752378, 1245249,
    1212481, 2097217, 2064449, 1245314, 1212546, 1015938, 2097282,
    2064514, 262209, 2031746, 1867906, 1, 1540226, 1769602, 262274,
771
772
773
774
775
776
777
778

779
780
781
782
783
784
785
};

/*
 * The following constants are used to determine the category of a
 * Unicode character.
 */

#define UNICODE_CATEGORY_MASK 0X1F


enum {
    UNASSIGNED,
    UPPERCASE_LETTER,
    LOWERCASE_LETTER,
    TITLECASE_LETTER,
    MODIFIER_LETTER,







|
>







771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
};

/*
 * The following constants are used to determine the category of a
 * Unicode character.
 */

#define UNICODE_CATEGORY_MASK 0x1F
#define UNICODE_OUT_OF_RANGE 0x10000u

enum {
    UNASSIGNED,
    UPPERCASE_LETTER,
    LOWERCASE_LETTER,
    TITLECASE_LETTER,
    MODIFIER_LETTER,
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
/*
 * The following macros extract the fields of the character info.  The
 * GetDelta() macro is complicated because we can't rely on the C compiler
 * to do sign extension on right shifts.
 */

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(info) ((info) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15)))

/*
 * This macro extracts the information about a character from the
 * Unicode character tables.
 */

#define GetUniCharInfo(ch) (groups[groupMap[(pageMap[(((int)(ch)) & 0xffff) >> OFFSET_BITS] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))]])








|









814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
/*
 * The following macros extract the fields of the character info.  The
 * GetDelta() macro is complicated because we can't rely on the C compiler
 * to do sign extension on right shifts.
 */

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15)))

/*
 * This macro extracts the information about a character from the
 * Unicode character tables.
 */

#define GetUniCharInfo(ch) (groups[groupMap[(pageMap[(((int)(ch)) & 0xffff) >> OFFSET_BITS] << OFFSET_BITS) | ((ch) & ((1 << OFFSET_BITS)-1))]])

Changes to tests/utf.test.

255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    string toupper !
} !

test utf-16.1 {Tcl_UniCharToLower, negative delta} {
    string tolower aA
} aa
test utf-16.2 {Tcl_UniCharToLower, positive delta} {
    string tolower \u0178\u00ff\uA78D
} \u00ff\u00ff\u0265

test utf-17.1 {Tcl_UniCharToLower, no delta} {
    string tolower !
} !

test utf-18.1 {Tcl_UniCharToTitle, add one for title} {
    string totitle \u01c4







|
|







255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
    string toupper !
} !

test utf-16.1 {Tcl_UniCharToLower, negative delta} {
    string tolower aA
} aa
test utf-16.2 {Tcl_UniCharToLower, positive delta} {
    string tolower \u0178\u00ff\uA78D\u01c5
} \u00ff\u00ff\u0265\u01c6

test utf-17.1 {Tcl_UniCharToLower, no delta} {
    string tolower !
} !

test utf-18.1 {Tcl_UniCharToTitle, add one for title} {
    string totitle \u01c4

Changes to tools/uniParse.tcl.

40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
proc uni::getValue {items index} {
    variable categories
    variable titleCount

    # Extract character info

    set category [lindex $items 2]
    if {[scan [lindex $items 12] %4x toupper] == 1} {
	set toupper [expr {$index - $toupper}]
    } else {
	set toupper {}
    }
    if {[scan [lindex $items 13] %4x tolower] == 1} {
	set tolower [expr {$tolower - $index}]
    } else {
	set tolower {}
    }
    if {[scan [lindex $items 14] %4x totitle] == 1} {
	set totitle [expr {$index - $totitle}]
    } else {
	set totitle {}
    }

    set categoryIndex [lsearch -exact $categories $category]
    if {$categoryIndex < 0} {
	puts "Unexpected character category: $index($category)"
	set categoryIndex 0
    } elseif {$category eq "Lt"} {







|


|

|


|

|


|







40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
proc uni::getValue {items index} {
    variable categories
    variable titleCount

    # Extract character info

    set category [lindex $items 2]
    if {[scan [lindex $items 12] %6x toupper] == 1} {
	set toupper [expr {$index - $toupper}]
    } else {
	set toupper 0
    }
    if {[scan [lindex $items 13] %6x tolower] == 1} {
	set tolower [expr {$tolower - $index}]
    } else {
	set tolower 0
    }
    if {[scan [lindex $items 14] %6x totitle] == 1} {
	set totitle [expr {$index - $totitle}]
    } else {
	set totitle 0
    }

    set categoryIndex [lsearch -exact $categories $category]
    if {$categoryIndex < 0} {
	puts "Unexpected character category: $index($category)"
	set categoryIndex 0
    } elseif {$category eq "Lt"} {
96
97
98
99
100
101
102
103

104
105
106
107
108
109
110
111






112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
}

proc uni::buildTables {data} {
    variable shift

    variable pMap {}
    variable pages {}
    variable groups {{0,,,}}

    set info {}			;# temporary page info

    set mask [expr {(1 << $shift) - 1}]

    set next 0

    foreach line [split $data \n] {
	if {$line eq ""} {






	    set line "FFFF;;Cn;0;ON;;;;;N;;;;;\n"
	}

	set items [split $line \;]

	scan [lindex $items 0] %x index
	if {$index > 0xFFFF} then {
	    # Ignore non-BMP characters, as long as Tcl doesn't support them
	    continue
	}
	set index [format 0x%0.4x $index]

	set gIndex [getGroup [getValue $items $index]]

	# Since the input table omits unassigned characters, these will
	# show up as gaps in the index sequence.  There are a few special cases
	# where the gaps correspond to a uniform block of assigned characters.
	# These are indicated as such in the character name.







|
>




<
<


>
>
>
>
>
>
|









|







96
97
98
99
100
101
102
103
104
105
106
107
108


109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
}

proc uni::buildTables {data} {
    variable shift

    variable pMap {}
    variable pages {}
    variable groups {{0,0,0,0}}
    variable next 0
    set info {}			;# temporary page info

    set mask [expr {(1 << $shift) - 1}]



    foreach line [split $data \n] {
	if {$line eq ""} {
	    if {!($next & $mask)} {
		# next character is already on page boundary
		continue
	    }
	    # fill remaining page
	    set line [format %X [expr {($next-1)|$mask}]]
	    append line ";;Cn;0;ON;;;;;N;;;;;\n"
	}

	set items [split $line \;]

	scan [lindex $items 0] %x index
	if {$index > 0xFFFF} then {
	    # Ignore non-BMP characters, as long as Tcl doesn't support them
	    continue
	}
	set index [format %d $index]

	set gIndex [getGroup [getValue $items $index]]

	# Since the input table omits unassigned characters, these will
	# show up as gaps in the index sequence.  There are a few special cases
	# where the gaps correspond to a uniform block of assigned characters.
	# These are indicated as such in the character name.
163
164
165
166
167
168
169

170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
proc uni::main {} {
    global argc argv0 argv
    variable pMap
    variable pages
    variable groups
    variable shift
    variable titleCount


    if {$argc != 2} {
	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
	exit 1
    }
    set f [open [lindex $argv 0] r]
    set data [read $f]
    close $f

    buildTables $data
    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
    set size [expr {[llength $pMap] + [llength $pages]*(1<<$shift)}]
    puts "shift = $shift, space = $size"
    puts "title case count = $titleCount"

    set f [open [file join [lindex $argv 1] tclUniData.c] w]
    fconfigure $f -translation lf
    puts $f "/*
 * tclUniData.c --







>











|







168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
proc uni::main {} {
    global argc argv0 argv
    variable pMap
    variable pages
    variable groups
    variable shift
    variable titleCount
    variable next

    if {$argc != 2} {
	puts stderr "\nusage: $argv0 <datafile> <outdir>\n"
	exit 1
    }
    set f [open [lindex $argv 0] r]
    set data [read $f]
    close $f

    buildTables $data
    puts "X = [llength $pMap]  Y= [llength $pages]  A= [llength $groups]"
    set size [expr {[llength $pMap]*2 + [llength $pages]*(1<<$shift)}]
    puts "shift = $shift, space = $size"
    puts "title case count = $titleCount"

    set f [open [file join [lindex $argv 1] tclUniData.c] w]
    fconfigure $f -translation lf
    puts $f "/*
 * tclUniData.c --
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327


328
329
330
331
332
333
334
    set line "    "
    set last [expr {[llength $groups] - 1}]
    for {set i 0} {$i <= $last} {incr i} {
	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}

	# Compute the case conversion type and delta

	if {$totitle ne ""} {
	    if {$totitle == $toupper} {
		# subtract delta for title or upper
		set case 4
		set delta $toupper
	    } elseif {$toupper ne ""} {
		# subtract delta for upper, subtract 1 for title
		set case 5
		set delta $toupper
	    } else {
		# add delta for lower, add 1 for title
		set case 3
		set delta $tolower
	    }
	} elseif {$toupper ne ""} {
	    # subtract delta for upper, add delta for lower
	    set case 6
	    set delta $toupper
	} elseif {$tolower ne ""} {
	    # add delta for lower
	    set case 2
	    set delta $tolower
	} else {
	    # noop
	    set case 0
	    set delta 0
	}

	append line [expr {($delta << 15) | ($case << 5) | $type}]
	if {$i != $last} {
	    append line ", "
	}
	if {[string length $line] > 65} {
	    puts $f [string trimright $line]
	    set line "    "
	}
    }
    puts $f $line
    puts $f "};

/*
 * The following constants are used to determine the category of a
 * Unicode character.
 */

#define UNICODE_CATEGORY_MASK 0X1F



enum {
    UNASSIGNED,
    UPPERCASE_LETTER,
    LOWERCASE_LETTER,
    TITLECASE_LETTER,
    MODIFIER_LETTER,
    OTHER_LETTER,







|




|








|



|



















|






|
|
>
>







280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
    set line "    "
    set last [expr {[llength $groups] - 1}]
    for {set i 0} {$i <= $last} {incr i} {
	foreach {type toupper tolower totitle} [split [lindex $groups $i] ,] {}

	# Compute the case conversion type and delta

	if {$totitle} {
	    if {$totitle == $toupper} {
		# subtract delta for title or upper
		set case 4
		set delta $toupper
	    } elseif {$toupper} {
		# subtract delta for upper, subtract 1 for title
		set case 5
		set delta $toupper
	    } else {
		# add delta for lower, add 1 for title
		set case 3
		set delta $tolower
	    }
	} elseif {$toupper} {
	    # subtract delta for upper, add delta for lower
	    set case 6
	    set delta $toupper
	} elseif {$tolower} {
	    # add delta for lower
	    set case 2
	    set delta $tolower
	} else {
	    # noop
	    set case 0
	    set delta 0
	}

	append line [expr {($delta << 15) | ($case << 5) | $type}]
	if {$i != $last} {
	    append line ", "
	}
	if {[string length $line] > 65} {
	    puts $f [string trimright $line]
	    set line "    "
	}
    }
    puts $f $line
    puts -nonewline $f "};

/*
 * The following constants are used to determine the category of a
 * Unicode character.
 */

#define UNICODE_CATEGORY_MASK 0x1F
#define UNICODE_OUT_OF_RANGE "
    puts $f [format 0x%Xu $next]
    puts $f "
enum {
    UNASSIGNED,
    UPPERCASE_LETTER,
    LOWERCASE_LETTER,
    TITLECASE_LETTER,
    MODIFIER_LETTER,
    OTHER_LETTER,
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
/*
 * The following macros extract the fields of the character info.  The
 * GetDelta() macro is complicated because we can't rely on the C compiler
 * to do sign extension on right shifts.
 */

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(info) ((info) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15)))

/*
 * This macro extracts the information about a character from the
 * Unicode character tables.
 */








|







369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
/*
 * The following macros extract the fields of the character info.  The
 * GetDelta() macro is complicated because we can't rely on the C compiler
 * to do sign extension on right shifts.
 */

#define GetCaseType(info) (((info) & 0xE0) >> 5)
#define GetCategory(ch) (GetUniCharInfo(ch) & 0x1F)
#define GetDelta(info) (((info) > 0) ? ((info) >> 15) : (~(~((info)) >> 15)))

/*
 * This macro extracts the information about a character from the
 * Unicode character tables.
 */