Tcl Source Code

Check-in [7c97e3ea95]
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2018 Conference, Houston/TX, US, Oct 15-19
Send your abstracts to tclconference@googlegroups.com
or submit via the online form by Aug 20.

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment: Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass).
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | dkf-utf16-branch
Files: files | file ages | folders
SHA1: 7c97e3ea95fb60eca40a0b97b23181a43685cfed
User & Date: dkf 2011-07-31 23:14:02
Context
2011-07-31
23:16
Bring up to date with mainline. check-in: e878fe8df0 user: dkf tags: dkf-utf16-branch
23:14
Add test of UTF-8 -> internal -> UTF-8 round trip (and make it pass). check-in: 7c97e3ea95 user: dkf tags: dkf-utf16-branch
2011-07-28
14:24
Actual possible implementation of code to convert external UTF-8 to internal surrogate pairs and bac... check-in: fed6086a43 user: dkf tags: dkf-utf16-branch
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to generic/tclEncoding.c.

  2245   2245   				 * large enough to hold the UTF-8 character
  2246   2246   				 * (at most 6 bytes). */
  2247   2247   {
  2248   2248       if ((ch > 0) && (ch < 0x80)) {
  2249   2249   	buf[0] = (char) ch;
  2250   2250   	return 1;
  2251   2251       }
  2252         -    if (ch >= 0) {
  2253         -	if (ch <= 0x7FF) {
  2254         -	    buf[1] = (char) ((ch | 0x80) & 0xBF);
  2255         -	    buf[0] = (char) ((ch >> 6) | 0xC0);
  2256         -	    return 2;
  2257         -	}
  2258         -	if (ch <= 0xFFFF) {
  2259         -	three:
  2260         -	    buf[2] = (char) ((ch | 0x80) & 0xBF);
  2261         -	    buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
  2262         -	    buf[0] = (char) ((ch >> 12) | 0xE0);
  2263         -	    return 3;
  2264         -	}
  2265         -	if (ch <= 0x1FFFFF) {
  2266         -	    buf[3] = (char) ((ch | 0x80) & 0xBF);
  2267         -	    buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
  2268         -	    buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
  2269         -	    buf[0] = (char) ((ch >> 18) | 0xF0);
  2270         -	    return 4;
  2271         -	}
  2272         -	if (ch <= 0x3FFFFFF) {
  2273         -	    buf[4] = (char) ((ch | 0x80) & 0xBF);
  2274         -	    buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
  2275         -	    buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
  2276         -	    buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
  2277         -	    buf[0] = (char) ((ch >> 24) | 0xF8);
  2278         -	    return 5;
  2279         -	}
  2280         -	if (ch <= 0x7FFFFFFF) {
  2281         -	    buf[5] = (char) ((ch | 0x80) & 0xBF);
  2282         -	    buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
  2283         -	    buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
  2284         -	    buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
  2285         -	    buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
  2286         -	    buf[0] = (char) ((ch >> 30) | 0xFC);
  2287         -	    return 6;
  2288         -	}
         2252  +    if (ch <= 0x7FF) {
         2253  +	buf[1] = (char) ((ch | 0x80) & 0xBF);
         2254  +	buf[0] = (char) ((ch >> 6) | 0xC0);
         2255  +	return 2;
         2256  +    }
         2257  +    if (ch <= 0xFFFF) {
         2258  +    three:
         2259  +	buf[2] = (char) ((ch | 0x80) & 0xBF);
         2260  +	buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF);
         2261  +	buf[0] = (char) ((ch >> 12) | 0xE0);
         2262  +	return 3;
         2263  +    }
         2264  +    if (ch <= 0x1FFFFF) {
         2265  +	buf[3] = (char) ((ch | 0x80) & 0xBF);
         2266  +	buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF);
         2267  +	buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF);
         2268  +	buf[0] = (char) ((ch >> 18) | 0xF0);
         2269  +	return 4;
         2270  +    }
         2271  +    if (ch <= 0x3FFFFFF) {
         2272  +	buf[4] = (char) ((ch | 0x80) & 0xBF);
         2273  +	buf[3] = (char) (((ch >> 6) | 0x80) & 0xBF);
         2274  +	buf[2] = (char) (((ch >> 12) | 0x80) & 0xBF);
         2275  +	buf[1] = (char) (((ch >> 18) | 0x80) & 0xBF);
         2276  +	buf[0] = (char) ((ch >> 24) | 0xF8);
         2277  +	return 5;
         2278  +    }
         2279  +    if (ch <= 0x7FFFFFFF) {
         2280  +	buf[5] = (char) ((ch | 0x80) & 0xBF);
         2281  +	buf[4] = (char) (((ch >> 6) | 0x80) & 0xBF);
         2282  +	buf[3] = (char) (((ch >> 12) | 0x80) & 0xBF);
         2283  +	buf[2] = (char) (((ch >> 18) | 0x80) & 0xBF);
         2284  +	buf[1] = (char) (((ch >> 24) | 0x80) & 0xBF);
         2285  +	buf[0] = (char) ((ch >> 30) | 0xFC);
         2286  +	return 6;
  2289   2287       }
  2290   2288   
  2291   2289       ch = 0xFFFD;
  2292   2290       goto three;
  2293   2291   }
  2294   2292   
  2295   2293   static INLINE int
................................................................................
  2484   2482   
  2485   2483   		src += Tcl_UtfToUniChar(src, &ch);
  2486   2484   		if (ch >= 0xD800 && ch < 0xDBFF) {
  2487   2485   		    unsigned fullChar = ((unsigned)(ch - 0xD800)) << 10;
  2488   2486   
  2489   2487   		    src += Tcl_UtfToUniChar(src, &ch);
  2490   2488   		    if (ch >= 0xDC00 && ch < 0xDFFF) {
  2491         -			fullChar += (unsigned) (ch - 0xDC00);
         2489  +			fullChar += 0x2400 + (unsigned) ch;
  2492   2490   			dst += IntToUtf(fullChar, dst);
  2493   2491   			continue;
  2494   2492   		    } else {
  2495   2493   			src = origin + Tcl_UtfToUniChar(origin, &ch);
  2496   2494   		    }
  2497   2495   		}
  2498   2496   		dst += Tcl_UniCharToUtf(ch, dst);
  2499   2497   	    } else {
  2500   2498   		unsigned fullChar;
  2501   2499   
  2502   2500   		src += UtfToInt(src, &fullChar);
  2503   2501   		if (fullChar > 0xFFFF) {
  2504   2502   		    fullChar -= 0x10000;
  2505         -		    ch = (Tcl_UniChar) (((fullChar & 0xFFC00) >> 10) + 0xD800);
         2503  +		    ch = (Tcl_UniChar) ((fullChar >> 10) + 0xD800);
  2506   2504   		    dst += Tcl_UniCharToUtf(ch, dst);
  2507   2505   		    ch = (Tcl_UniChar) ((fullChar & 0x3FF) + 0xDC00);
  2508   2506   		    dst += Tcl_UniCharToUtf(ch, dst);
  2509   2507   		} else {
  2510   2508   		    ch = (Tcl_UniChar) fullChar;
  2511   2509   		    dst += Tcl_UniCharToUtf(ch, dst);
  2512   2510   		}

Changes to tests/encoding.test.

   311    311   
   312    312   test encoding-15.3 {UtfToUtfProc null character input} {
   313    313       set x [encoding convertfrom identity \x00]
   314    314       set y [encoding convertfrom utf-8 $x]
   315    315       binary scan [encoding convertto identity $y] H* z
   316    316       list [string bytelength $x] [string bytelength $y] $z
   317    317   } {1 2 c080}
          318  +
          319  +test encoding-15.4 {UtfToUtfProc: UTF-8 to UTF-16 and back} {
          320  +    set x \xF0\xA4\xAD\xA2; # U+024B62
          321  +    set y [encoding convertfrom utf-8 $x]
          322  +    set z [encoding convertto utf-8 $y]
          323  +    list [string length $x] [string length $y] [string length $z] \
          324  +	[format 0x%04x.0x%04x {*}[scan $y %c%c]] \
          325  +	[format %02x.%02x.%02x.%02x {*}[scan $z %c%c%c%c]]
          326  +} {4 2 4 0xd852.0xdf62 f0.a4.ad.a2}
   318    327   
   319    328   test encoding-16.1 {UnicodeToUtfProc} {
   320    329       set val [encoding convertfrom unicode NN]
   321    330       list $val [format %x [scan $val %c]]
   322    331   } "\u4e4e 4e4e"
   323    332   
   324    333   test encoding-17.1 {UtfToUnicodeProc} {