Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Feature branch to explore making use of the Hoehrmann UTF-8 decoder. http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ |
---|---|
Downloads: | Tarball | ZIP archive | SQL archive |
Timelines: | family | ancestors | dgp-hoehrmann-decoder |
Files: | files | file ages | folders |
SHA1: |
8e353afbe61701af8d6d2f50923ef3bc |
User & Date: | dgp 2012-07-03 19:02:36 |
Context
2012-07-03
| ||
19:02 | Feature branch to explore making use of the Hoehrmann UTF-8 decoder. http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ Closed-Leaf check-in: 8e353afbe6 user: dgp tags: dgp-hoehrmann-decoder | |
14:52 | Factor out a number of common patterns of use of Tcl_DStringAppend. check-in: 4dd736c95f user: dkf tags: trunk | |
Changes
Changes to generic/tclUtf.c.
︙ | ︙ | |||
282 283 284 285 286 287 288 289 290 291 292 293 294 295 | * number of bytes from the UTF-8 string that were consumed. * * Side effects: * None. * *--------------------------------------------------------------------------- */ int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. */ { | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < | < < < | < < < < < < < | < < < < < < < < < < < < < < < < < < < < < < < < > > > | < | < | < < < < < | | < < < < | < < < < | | < < < | < < < < < < | < < < | | 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 | * number of bytes from the UTF-8 string that were consumed. * * Side effects: * None. 
* *--------------------------------------------------------------------------- */ #define UTF8_ACCEPT 0 #define UTF8_REJECT 1 static const unsigned char utf8d[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf 2,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 }; static unsigned int decode( unsigned int *state, unsigned int *codep, unsigned int byte) { unsigned int type = utf8d[byte]; *codep = (*state != UTF8_ACCEPT) ? (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & (byte); *state = utf8d[256 + *state*16 + type]; return *state; } int Tcl_UtfToUniChar( register const char *src, /* The UTF-8 string. */ register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by * the UTF-8 string. 
*/ { const char *p = src; unsigned int codepoint, state = UTF8_ACCEPT; int count = 1; while (*p && count <= TCL_UTF_MAX) { switch (decode(&state, &codepoint, (unsigned char)*p)) { case UTF8_ACCEPT: *chPtr = (Tcl_UniChar) codepoint; return count; case UTF8_REJECT: *chPtr = (Tcl_UniChar)(*src); return 1; default: count++; p++; } } *chPtr = (Tcl_UniChar)(*src); return 1; } /* *--------------------------------------------------------------------------- * * Tcl_UtfToUniCharDString -- |
︙ | ︙ |