Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and back again. |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | descendants | both | dkf-utf16-branch |
Files: | files | file ages | folders |
SHA1: |
fed6086a43d51f8642be98d47322831b |
User & Date: | dkf 2011-07-28 14:24:36 |
Context
2011-07-31
| ||
23:14 | Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass). check-in: 7c97e3ea95 user: dkf tags: dkf-utf16-branch | |
2011-07-28
| ||
14:24 | Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and bac... check-in: fed6086a43 user: dkf tags: dkf-utf16-branch | |
2011-07-27
| ||
10:40 | Start work towards being able to work with utf8 fully and utf16 and other things outside the BMP. check-in: f9f8c8425c user: dkf tags: dkf-utf16-branch | |
Changes
Changes to generic/tclEncoding.c.
︙ | ︙ | |||
2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 | * Returns TCL_OK if conversion was successful. * * Side effects: * None. * *------------------------------------------------------------------------- */ static int UtfToUtfProc( ClientData clientData, /* Not used. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 2378 2379 2380 2381 2382 2383 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 | * Returns TCL_OK if conversion was successful. * * Side effects: * None. * *------------------------------------------------------------------------- */ static INLINE int IntToUtf( unsigned ch, /* The character to be stored in the * buffer. */ char *buf) /* Buffer in which the UTF-8 representation of * the character is stored. Buffer must be * large enough to hold the UTF-8 character * (at most 6 bytes). */ { if ((ch > 0) && (ch < 0x80)) { buf[0] = (char) ch; return 1; } if (ch >= 0) { if (ch <= 0x7FF) { buf[1] = (char) ((ch | 0x80) & 0xBF); buf[0] = (char) ((ch >> 6) | 0xC0); return 2; } if (ch <= 0xFFFF) { three: buf[2] = (char) ((ch | 0x80) & 0xBF); buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 12) | 0xE0); return 3; } if (ch <= 0x1FFFFF) { buf[3] = (char) ((ch | 0x80) & 0xBF); buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 18) | 0xF0); return 4; } if (ch <= 0x3FFFFFF) { buf[4] = (char) ((ch | 0x80) & 0xBF); buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 24) | 0xF8); return 5; } if (ch <= 0x7FFFFFFF) { buf[5] = (char) ((ch | 0x80) & 0xBF); buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF); buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF); buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF); buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF); buf[0] = (char) ((ch >> 30) | 0xFC); return 6; } } ch = 0xFFFD; goto three; } static INLINE int UtfToInt( const char *src, /* The UTF-8 string. */ unsigned *chPtr) /* Filled with the character represented by * the front of the UTF-8 string. */ { register int byte; /* * Unroll 1 to 6 byte UTF-8 sequences, use loop to handle longer ones. */ byte = *((unsigned char *) src); if (byte < 0xC0) { /* * Handles properly formed UTF-8 characters between 0x01 and 0x7F. * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid * characters representing themselves. */ *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xE0) { if ((src[1] & 0xC0) == 0x80) { /* * Two-byte-character lead-byte followed by a trail-byte. */ *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); return 2; } /* * A two-byte-character lead-byte not followed by trail-byte * represents itself. */ *chPtr = (Tcl_UniChar) byte; return 1; } else if (byte < 0xF0) { if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { /* * Three-byte-character lead byte followed by two trail bytes. */ *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); return 3; } /* * A three-byte-character lead-byte not followed by two trail-bytes * represents itself. */ *chPtr = (Tcl_UniChar) byte; return 1; } else { int ch, total, trail; static const unsigned char totalBytes[256] = { 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,6,6 }; total = totalBytes[byte]; trail = total - 1; if (trail > 0) { ch = byte & (0x3F >> trail); do { src++; if ((*src & 0xC0) != 0x80) { *chPtr = byte; return 1; } ch <<= 6; ch |= (*src & 0x3F); trail--; } while (trail > 0); *chPtr = ch; return total; } else { *chPtr = (Tcl_UniChar) byte; return 1; } } } static int UtfToUtfProc( ClientData clientData, /* Not used. */ const char *src, /* Source string in UTF-8. */ int srcLen, /* Source string length in bytes. */ int flags, /* Conversion control flags. */ |
︙ | ︙ | |||
2326 2327 2328 2329 2330 2331 2332 | } else { /* * This is where we ought to do surrogate pair handling, with the * correct way of doing it depending on the conversionMode * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE! */ | > > > | > > > > > > > > > > > > | > > > > > > > > > > > > > > > | 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 2490 2491 2492 2493 2494 2495 2496 2497 2498 2499 2500 2501 2502 2503 2504 2505 2506 2507 2508 2509 2510 2511 2512 2513 2514 2515 2516 2517 2518 2519 2520 | } else { /* * This is where we ought to do surrogate pair handling, with the * correct way of doing it depending on the conversionMode * parameter. But we don't. Yet. KNOWN BUG/MISFEATURE! */ if (conversionMode == TO_STANDARD_UTF8) { const char *origin = src; src += Tcl_UtfToUniChar(src, &ch); if (ch >= 0xD800 && ch < 0xDBFF) { unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10; src += Tcl_UtfToUniChar(src, &ch); if (ch >= 0xDC00 && ch < 0xDFFF) { fullChar += (unsigned) (ch - 0xDC00); dst += IntToUtf(fullChar, dst); continue; } else { src = origin + Tcl_UtfToUniChar(origin, &ch); } } dst += Tcl_UniCharToUtf(ch, dst); } else { unsigned fullChar; src += UtfToInt(src, &fullChar); if (fullChar > 0xFFFF) { fullChar -= 0x10000; ch = (Tcl_UniChar) (((fullChar & 0xFFC00) >> 10) + 0xD800); dst += Tcl_UniCharToUtf(ch, dst); ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00); dst += Tcl_UniCharToUtf(ch, dst); } else { ch = (Tcl_UniChar) fullChar; dst += Tcl_UniCharToUtf(ch, dst); } } } } *srcReadPtr = src - srcStart; *dstWrotePtr = dst - dstStart; *dstCharsPtr = numChars; return result; |
︙ | ︙ |