Tcl Source Code

Check-in [7c97e3ea95]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass).
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dkf-utf16-branch
Files: files | file ages | folders
SHA1: 7c97e3ea95fb60eca40a0b97b23181a43685cfed
User & Date: dkf 2011-07-31 23:14:02
Context
2011-07-31
23:16
Bring up to date with mainline. check-in: e878fe8df0 user: dkf tags: dkf-utf16-branch
23:14
Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass). check-in: 7c97e3ea95 user: dkf tags: dkf-utf16-branch
2011-07-28
14:24
Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and bac... check-in: fed6086a43 user: dkf tags: dkf-utf16-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
				 * large enough to hold the UTF-8 character
				 * (at most 6 bytes). */
{
    if ((ch > 0) && (ch < 0x80)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	three:
	    buf[2] = (char) ((ch | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 12) | 0xE0);
	    return 3;
	}
	if (ch <= 0x1FFFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
	if (ch <= 0x3FFFFFF) {
	    buf[4] = (char) ((ch | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 24) | 0xF8);
	    return 5;
	}
	if (ch <= 0x7FFFFFFF) {
	    buf[5] = (char) ((ch | 0x80) & 0xBF);
	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 30) | 0xFC);
	    return 6;
	}
    }

    ch = 0xFFFD;
    goto three;
}

static INLINE int







<
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
<







2245
2246
2247
2248
2249
2250
2251

2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286

2287
2288
2289
2290
2291
2292
2293
				 * large enough to hold the UTF-8 character
				 * (at most 6 bytes). */
{
    if ((ch > 0) && (ch < 0x80)) {
	buf[0] = (char) ch;
	return 1;
    }

    if (ch <= 0x7FF) {
	buf[1] = (char) ((ch | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 6) | 0xC0);
	return 2;
    }
    if (ch <= 0xFFFF) {
    three:
	buf[2] = (char) ((ch | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 12) | 0xE0);
	return 3;
    }
    if (ch <= 0x1FFFFF) {
	buf[3] = (char) ((ch | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 18) | 0xF0);
	return 4;
    }
    if (ch <= 0x3FFFFFF) {
	buf[4] = (char) ((ch | 0x80) & 0xBF);
	buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 24) | 0xF8);
	return 5;
    }
    if (ch <= 0x7FFFFFFF) {
	buf[5] = (char) ((ch | 0x80) & 0xBF);
	buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	buf[0] = (char) ((ch >> 30) | 0xFC);
	return 6;

    }

    ch = 0xFFFD;
    goto three;
}

static INLINE int
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512

		src += Tcl_UtfToUniChar(src, &ch);
		if (ch >= 0xD800 && ch < 0xDBFF) {
		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;

		    src += Tcl_UtfToUniChar(src, &ch);
		    if (ch >= 0xDC00 && ch < 0xDFFF) {
			fullChar += (unsigned) (ch - 0xDC00);
			dst += IntToUtf(fullChar, dst);
			continue;
		    } else {
			src = origin + Tcl_UtfToUniChar(origin, &ch);
		    }
		}
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		unsigned fullChar;

		src += UtfToInt(src, &fullChar);
		if (fullChar > 0xFFFF) {
		    fullChar -= 0x10000;
		    ch = (Tcl_UniChar) (((fullChar & 0xFFC00) >> 10) + 0xD800);
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
		    dst += Tcl_UniCharToUtf(ch, dst);
		} else {
		    ch = (Tcl_UniChar) fullChar;
		    dst += Tcl_UniCharToUtf(ch, dst);
		}







|













|







2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510

		src += Tcl_UtfToUniChar(src, &ch);
		if (ch >= 0xD800 && ch < 0xDBFF) {
		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;

		    src += Tcl_UtfToUniChar(src, &ch);
		    if (ch >= 0xDC00 && ch < 0xDFFF) {
			fullChar += 0x2400 + (unsigned) ch;
			dst += IntToUtf(fullChar, dst);
			continue;
		    } else {
			src = origin + Tcl_UtfToUniChar(origin, &ch);
		    }
		}
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		unsigned fullChar;

		src += UtfToInt(src, &fullChar);
		if (fullChar > 0xFFFF) {
		    fullChar -= 0x10000;
		    ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800);
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
		    dst += Tcl_UniCharToUtf(ch, dst);
		} else {
		    ch = (Tcl_UniChar) fullChar;
		    dst += Tcl_UniCharToUtf(ch, dst);
		}

Changes to tests/encoding.test.

311
312
313
314
315
316
317









318
319
320
321
322
323
324

# Feed a single NUL byte through the utf-8 decoder. The expected result pins
# three things: the raw input is 1 byte long, the decoded value is 2 bytes
# long, and those two bytes are C0 80 when dumped as hex.
test encoding-15.3 {UtfToUtfProc null character input} {
    set x [encoding convertfrom identity \x00]
    set y [encoding convertfrom utf-8 $x]
    binary scan [encoding convertto identity $y] H* z
    list [string bytelength $x] [string bytelength $y] $z
} {1 2 c080}










# Decode the two bytes of "NN" (0x4E 0x4E) with the unicode encoding. The
# expected result is the single character U+4E4E; since both bytes are equal,
# either byte order would combine them into the same value.
test encoding-16.1 {UnicodeToUtfProc} {
    set val [encoding convertfrom unicode NN]
    list $val [format %x [scan $val %c]]
} "\u4e4e 4e4e"

test encoding-17.1 {UtfToUnicodeProc} {







>
>
>
>
>
>
>
>
>







311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333

# Feed a single NUL byte through the utf-8 decoder. The expected result pins
# three things: the raw input is 1 byte long, the decoded value is 2 bytes
# long, and those two bytes are C0 80 when dumped as hex.
test encoding-15.3 {UtfToUtfProc null character input} {
    set x [encoding convertfrom identity \x00]
    set y [encoding convertfrom utf-8 $x]
    binary scan [encoding convertto identity $y] H* z
    list [string bytelength $x] [string bytelength $y] $z
} {1 2 c080}

# Round-trip a character outside the Basic Multilingual Plane. The four input
# bytes are the UTF-8 form of U+24B62. Per the expected result, decoding must
# yield two code units, the surrogate pair 0xD852 0xDF62
# (0x24B62 - 0x10000 = 0x14B62; high = 0xD800 + 0x52, low = 0xDC00 + 0x362),
# and re-encoding must reproduce the original four bytes F0 A4 AD A2.
test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} {
    set x \xF0\xA4\xAD\xA2; # U+024B62
    set y [encoding convertfrom utf-8 $x]
    set z [encoding convertto utf-8 $y]
    list [string length $x] [string length $y] [string length $z] \
	[format 0x%04x.0x%04x {*}[scan $y %c%c]] \
	[format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]]
} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2}

# Decode the two bytes of "NN" (0x4E 0x4E) with the unicode encoding. The
# expected result is the single character U+4E4E; since both bytes are equal,
# either byte order would combine them into the same value.
test encoding-16.1 {UnicodeToUtfProc} {
    set val [encoding convertfrom unicode NN]
    list $val [format %x [scan $val %c]]
} "\u4e4e 4e4e"

test encoding-17.1 {UtfToUnicodeProc} {