Check-in [7c97e3ea95]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
SHA1 Hash:7c97e3ea95fb60eca40a0b97b23181a43685cfed
Date: 2011-07-31 23:14:02
User: dkf
Comment:Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass).
Tags And Properties
Changes

Changes to generic/tclEncoding.c

2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
....
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
				 * large enough to hold the UTF-8 character
				 * (at most 6 bytes). */
{
    if ((ch > 0) && (ch < 0x80)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	three:
	    buf[2] = (char) ((ch | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 12) | 0xE0);
	    return 3;
	}
	if (ch <= 0x1FFFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
	if (ch <= 0x3FFFFFF) {
	    buf[4] = (char) ((ch | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 24) | 0xF8);
	    return 5;
	}
	if (ch <= 0x7FFFFFFF) {
	    buf[5] = (char) ((ch | 0x80) & 0xBF);
	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 30) | 0xFC);
	    return 6;
	}
    }

    ch = 0xFFFD;
    goto three;
}

static INLINE int
................................................................................

		src += Tcl_UtfToUniChar(src, &ch);
		if (ch >= 0xD800 && ch < 0xDBFF) {
		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;

		    src += Tcl_UtfToUniChar(src, &ch);
		    if (ch >= 0xDC00 && ch < 0xDFFF) {
			fullChar += (unsigned) (ch - 0xDC00);
			dst += IntToUtf(fullChar, dst);
			continue;
		    } else {
			src = origin + Tcl_UtfToUniChar(origin, &ch);
		    }
		}
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		unsigned fullChar;

		src += UtfToInt(src, &fullChar);
		if (fullChar > 0xFFFF) {
		    fullChar -= 0x10000;
		    ch = (Tcl_UniChar) (((fullChar & 0xFFC00) >> 10) + 0xD800);
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
		    dst += Tcl_UniCharToUtf(ch, dst);
		} else {
		    ch = (Tcl_UniChar) fullChar;
		    dst += Tcl_UniCharToUtf(ch, dst);
		}







<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<







 







|













|







2245
2246
2247
2248
2249
2250
2251

2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286

2287
2288
2289
2290
2291
2292
2293
....
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
				 * large enough to hold the UTF-8 character
				 * (at most 6 bytes). */
{
    if ((ch > 0) && (ch < 0x80)) {
	buf[0] = (char) ch;
	return 1;
    }

    if (ch <= 0x7FF) {
	buf[1] = (char) ((ch | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 6) | 0xC0);
	return 2;
    }
    if (ch <= 0xFFFF) {
    three:
	buf[2] = (char) ((ch | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 12) | 0xE0);
	return 3;
    }
    if (ch <= 0x1FFFFF) {
	buf[3] = (char) ((ch | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 18) | 0xF0);
	return 4;
    }
    if (ch <= 0x3FFFFFF) {
	buf[4] = (char) ((ch | 0x80) & 0xBF);
	buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 24) | 0xF8);
	return 5;
    }
    if (ch <= 0x7FFFFFFF) {
	buf[5] = (char) ((ch | 0x80) & 0xBF);
	buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 30) | 0xFC);
	return 6;

    }

    ch = 0xFFFD;
    goto three;
}

static INLINE int
................................................................................

		src += Tcl_UtfToUniChar(src, &ch);
		if (ch >= 0xD800 && ch < 0xDBFF) {
		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;

		    src += Tcl_UtfToUniChar(src, &ch);
		    if (ch >= 0xDC00 && ch < 0xDFFF) {
			fullChar += 0x2400 + (unsigned) ch;
			dst += IntToUtf(fullChar, dst);
			continue;
		    } else {
			src = origin + Tcl_UtfToUniChar(origin, &ch);
		    }
		}
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		unsigned fullChar;

		src += UtfToInt(src, &fullChar);
		if (fullChar > 0xFFFF) {
		    fullChar -= 0x10000;
		    ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800);
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
		    dst += Tcl_UniCharToUtf(ch, dst);
		} else {
		    ch = (Tcl_UniChar) fullChar;
		    dst += Tcl_UniCharToUtf(ch, dst);
		}

Changes to tests/encoding.test

311
312
313
314
315
316
317









318
319
320
321
322
323
324

test encoding-15.3 {UtfToUtfProc null character input} {
    set x [encoding convertfrom identity \x00]
    set y [encoding convertfrom utf-8 $x]
    binary scan [encoding convertto identity $y] H* z
    list [string bytelength $x] [string bytelength $y] $z
} {1 2 c080}










test encoding-16.1 {UnicodeToUtfProc} {
    set val [encoding convertfrom unicode NN]
    list $val [format %x [scan $val %c]]
} "\u4e4e 4e4e"

test encoding-17.1 {UtfToUnicodeProc} {







>
>
>
>
>
>
>
>
>







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

# encoding-15.3: a single NUL byte taken in through the identity encoding has
# byte length 1; after [encoding convertfrom utf-8] the value has byte length 2
# and reads back through the identity encoding as the two bytes c0 80.
test encoding-15.3 {UtfToUtfProc null character input} {
    set x [encoding convertfrom identity \x00]
    set y [encoding convertfrom utf-8 $x]
    binary scan [encoding convertto identity $y] H* z
    list [string bytelength $x] [string bytelength $y] $z
} {1 2 c080}

# encoding-15.4: round-trip a 4-byte UTF-8 sequence (U+24B62, outside the BMP)
# through [encoding convertfrom utf-8] and back via [encoding convertto utf-8].
# The expected result pins the intermediate value at two characters, the
# surrogate pair 0xd852 0xdf62, and the final output at the original four
# bytes f0 a4 ad a2.
test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} {
    set x \xF0\xA4\xAD\xA2; # U+024B62
    set y [encoding convertfrom utf-8 $x]
    set z [encoding convertto utf-8 $y]
    list [string length $x] [string length $y] [string length $z] \
	[format 0x%04x.0x%04x {*}[scan $y %c%c]] \
	[format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]]
} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2}

# encoding-16.1: decode the two bytes "NN" (0x4e 0x4e) with the "unicode"
# encoding; the expected result is the single character U+4E4E.
test encoding-16.1 {UnicodeToUtfProc} {
    set val [encoding convertfrom unicode NN]
    list $val [format %x [scan $val %c]]
} "\u4e4e 4e4e"

test encoding-17.1 {UtfToUnicodeProc} {