Tcl Source Code

Check-in [fed6086a43]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and back again.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dkf-utf16-branch
Files: files | file ages | folders
SHA1: fed6086a43d51f8642be98d47322831bffc4ccf0
User & Date: dkf 2011-07-28 14:24:36
Context
2011-07-31
23:14
Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass). check-in: 7c97e3ea95 user: dkf tags: dkf-utf16-branch
2011-07-28
14:24
Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and bac... check-in: fed6086a43 user: dkf tags: dkf-utf16-branch
2011-07-27
10:40
Start work towards being able to work with utf8 fully and utf16 and other things outside the BMP. check-in: f9f8c8425c user: dkf tags: dkf-utf16-branch
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

2231
2232
2233
2234
2235
2236
2237





















































































































































2238
2239
2240
2241
2242
2243
2244
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */






















































































































































static int
UtfToUtfProc(
    ClientData clientData,	/* Not used. */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2231
2232
2233
2234
2235
2236
2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285
2286
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299
2300
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
2358
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
2374
2375
2376
2377
2378
2379
2380
2381
2382
2383
2384
2385
2386
2387
2388
2389
2390
2391
2392
2393
 *	Returns TCL_OK if conversion was successful.
 *
 * Side effects:
 *	None.
 *
 *-------------------------------------------------------------------------
 */

static INLINE int
IntToUtf(
    unsigned ch,		/* The character to be stored in the
				 * buffer. */
    char *buf)			/* Buffer in which the UTF-8 representation of
				 * the character is stored. Buffer must be
				 * large enough to hold the UTF-8 character
				 * (at most 6 bytes). */
{
    if ((ch > 0) && (ch < 0x80)) {
	buf[0] = (char) ch;
	return 1;
    }
    if (ch >= 0) {
	if (ch <= 0x7FF) {
	    buf[1] = (char) ((ch | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 6) | 0xC0);
	    return 2;
	}
	if (ch <= 0xFFFF) {
	three:
	    buf[2] = (char) ((ch | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 12) | 0xE0);
	    return 3;
	}
	if (ch <= 0x1FFFFF) {
	    buf[3] = (char) ((ch | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 18) | 0xF0);
	    return 4;
	}
	if (ch <= 0x3FFFFFF) {
	    buf[4] = (char) ((ch | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 24) | 0xF8);
	    return 5;
	}
	if (ch <= 0x7FFFFFFF) {
	    buf[5] = (char) ((ch | 0x80) & 0xBF);
	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
	    buf[0] = (char) ((ch >> 30) | 0xFC);
	    return 6;
	}
    }

    ch = 0xFFFD;
    goto three;
}

static INLINE int
UtfToInt(
    const char *src,		/* The UTF-8 string. */
    unsigned *chPtr)		/* Filled with the character represented by
				 * the front of the UTF-8 string. */
{
    register int byte;

    /*
     * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones.
     */

    byte = *((unsigned char *) src);
    if (byte < 0xC0) {
	/*
	 * Handles properly formed UTF-8 characters between 0x01 and 0x7F.
	 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid
	 * characters representing themselves.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xE0) {
	if ((src[1] & 0xC0) == 0x80) {
	    /*
	     * Two-byte-character lead-byte followed by a trail-byte.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F));
	    return 2;
	}

	/*
	 * A two-byte-character lead-byte not followed by trail-byte
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else if (byte < 0xF0) {
	if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) {
	    /*
	     * Three-byte-character lead byte followed by two trail bytes.
	     */

	    *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12)
		    | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F));
	    return 3;
	}

	/*
	 * A three-byte-character lead-byte not followed by two trail-bytes
	 * represents itself.
	 */

	*chPtr = (Tcl_UniChar) byte;
	return 1;
    } else {
	int ch, total, trail;
	static const unsigned char totalBytes[256] = {
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
	    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
	    3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6
	};

	total = totalBytes[byte];
	trail = total - 1;
	if (trail > 0) {
	    ch = byte & (0x3F >> trail);
	    do {
		src++;
		if ((*src & 0xC0) != 0x80) {
		    *chPtr = byte;
		    return 1;
		}
		ch <<= 6;
		ch |= (*src & 0x3F);
		trail--;
	    } while (trail > 0);
	    *chPtr = ch;
	    return total;
	} else {
	    *chPtr = (Tcl_UniChar) byte;
	    return 1;
	}
    }
}

static int
UtfToUtfProc(
    ClientData clientData,	/* Not used. */
    const char *src,		/* Source string in UTF-8. */
    int srcLen,			/* Source string length in bytes. */
    int flags,			/* Conversion control flags. */
2326
2327
2328
2329
2330
2331
2332



2333












2334















2335
2336
2337
2338
2339
2340
2341
	} else {
	    /*
	     * This is where we ought to do surrogate pair handling, with the
	     * correct way of doing it depending on the conversionMode
	     * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE!
	     */




	    src += Tcl_UtfToUniChar(src, &ch);












	    dst += Tcl_UniCharToUtf(ch, dst);















	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;







>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
2485
2486
2487
2488
2489
2490
2491
2492
2493
2494
2495
2496
2497
2498
2499
2500
2501
2502
2503
2504
2505
2506
2507
2508
2509
2510
2511
2512
2513
2514
2515
2516
2517
2518
2519
2520
	} else {
	    /*
	     * This is where we ought to do surrogate pair handling, with the
	     * correct way of doing it depending on the conversionMode
	     * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE!
	     */

	    if (conversionMode == TO_STANDARD_UTF8) {
		const char *origin = src;

		src += Tcl_UtfToUniChar(src, &ch);
		if (ch >= 0xD800 && ch < 0xDBFF) {
		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;

		    src += Tcl_UtfToUniChar(src, &ch);
		    if (ch >= 0xDC00 && ch < 0xDFFF) {
			fullChar += (unsigned) (ch - 0xDC00);
			dst += IntToUtf(fullChar, dst);
			continue;
		    } else {
			src = origin + Tcl_UtfToUniChar(origin, &ch);
		    }
		}
		dst += Tcl_UniCharToUtf(ch, dst);
	    } else {
		unsigned fullChar;

		src += UtfToInt(src, &fullChar);
		if (fullChar > 0xFFFF) {
		    fullChar -= 0x10000;
		    ch = (Tcl_UniChar) (((fullChar & 0xFFC00) >> 10) + 0xD800);
		    dst += Tcl_UniCharToUtf(ch, dst);
		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
		    dst += Tcl_UniCharToUtf(ch, dst);
		} else {
		    ch = (Tcl_UniChar) fullChar;
		    dst += Tcl_UniCharToUtf(ch, dst);
		}
	    }
	}
    }

    *srcReadPtr = src - srcStart;
    *dstWrotePtr = dst - dstStart;
    *dstCharsPtr = numChars;
    return result;