Tcl Source Code

Artifact Content
Login
Bounty program for improvements to Tcl and certain Tcl packages.
Tcl 2019 Conference, Houston/TX, US, Nov 4-8
Send your abstracts to tclconference@googlegroups.com
or submit via the online form by Sep 9.

Artifact 18caccb00b6196fe42171ccfa9d6088ecf0def9ed39103b56d8587cdd13fbc10:


     1  /*
     2   * tclUtf.c --
     3   *
     4   *	Routines for manipulating UTF-8 strings.
     5   *
     6   * Copyright (c) 1997-1998 Sun Microsystems, Inc.
     7   *
     8   * See the file "license.terms" for information on usage and redistribution of
     9   * this file, and for a DISCLAIMER OF ALL WARRANTIES.
    10   */
    11  
    12  #include "tclInt.h"
    13  
    14  /*
    15   * Include the static character classification tables and macros.
    16   */
    17  
    18  #include "tclUniData.c"
    19  
    20  /*
    21   * The following macros are used for fast character category tests. The x_BITS
    22   * values are shifted right by the category value to determine whether the
    23   * given category is included in the set.
    24   */
    25  
    26  #define ALPHA_BITS ((1 << UPPERCASE_LETTER) | (1 << LOWERCASE_LETTER) \
    27  	| (1 << TITLECASE_LETTER) | (1 << MODIFIER_LETTER) | (1<<OTHER_LETTER))
    28  
    29  #define CONTROL_BITS ((1 << CONTROL) | (1 << FORMAT) | (1 << PRIVATE_USE))
    30  
    31  #define DIGIT_BITS (1 << DECIMAL_DIGIT_NUMBER)
    32  
    33  #define SPACE_BITS ((1 << SPACE_SEPARATOR) | (1 << LINE_SEPARATOR) \
    34  	| (1 << PARAGRAPH_SEPARATOR))
    35  
    36  #define WORD_BITS (ALPHA_BITS | DIGIT_BITS | (1 << CONNECTOR_PUNCTUATION))
    37  
    38  #define PUNCT_BITS ((1 << CONNECTOR_PUNCTUATION) | \
    39  	(1 << DASH_PUNCTUATION) | (1 << OPEN_PUNCTUATION) | \
    40  	(1 << CLOSE_PUNCTUATION) | (1 << INITIAL_QUOTE_PUNCTUATION) | \
    41  	(1 << FINAL_QUOTE_PUNCTUATION) | (1 << OTHER_PUNCTUATION))
    42  
    43  #define GRAPH_BITS (WORD_BITS | PUNCT_BITS | \
    44  	(1 << NON_SPACING_MARK) | (1 << ENCLOSING_MARK) | \
    45  	(1 << COMBINING_SPACING_MARK) | (1 << LETTER_NUMBER) | \
    46  	(1 << OTHER_NUMBER) | \
    47  	(1 << MATH_SYMBOL) | (1 << CURRENCY_SYMBOL) | \
    48  	(1 << MODIFIER_SYMBOL) | (1 << OTHER_SYMBOL))
    49  
    50  /*
    51   * Unicode characters less than this value are represented by themselves in
    52   * UTF-8 strings.
    53   */
    54  
    55  #define UNICODE_SELF	0x80
    56  
    57  /*
    58   * The following structures are used when mapping between Unicode (UCS-2) and
    59   * UTF-8.
    60   */
    61  
    62  static const unsigned char totalBytes[256] = {
    63      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    64      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    65      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    66      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    67      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    68      1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    69      2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    70      3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,
    71  #if TCL_UTF_MAX > 3
    72      4,4,4,4,4,4,4,4,
    73  #else
    74      1,1,1,1,1,1,1,1,
    75  #endif
    76      1,1,1,1,1,1,1,1
    77  };
    78  
    79  /*
    80   * Functions used only in this module.
    81   */
    82  
    83  static int		UtfCount(int ch);
    84  
    85  /*
    86   *---------------------------------------------------------------------------
    87   *
    88   * UtfCount --
    89   *
    90   *	Find the number of bytes in the Utf character "ch".
    91   *
    92   * Results:
    93   *	The return values is the number of bytes in the Utf character "ch".
    94   *
    95   * Side effects:
    96   *	None.
    97   *
    98   *---------------------------------------------------------------------------
    99   */
   100  
   101  static inline int
   102  UtfCount(
   103      int ch)			/* The Unicode character whose size is returned. */
   104  {
   105      if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
   106  	return 1;
   107      }
   108      if (ch <= 0x7FF) {
   109  	return 2;
   110      }
   111  #if TCL_UTF_MAX > 3
   112      if (((unsigned)(ch - 0x10000) <= 0xFFFFF)) {
   113  	return 4;
   114      }
   115  #endif
   116      return 3;
   117  }
   118  
   119  /*
   120   *---------------------------------------------------------------------------
   121   *
   122   * Tcl_UniCharToUtf --
   123   *
   124   *	Store the given Tcl_UniChar as a sequence of UTF-8 bytes in the
   125   *	provided buffer. Equivalent to Plan 9 runetochar().
   126   *
   127   * Results:
   128   *	The return values is the number of bytes in the buffer that were
   129   *	consumed.
   130   *
   131   * Side effects:
   132   *	None.
   133   *
   134   *---------------------------------------------------------------------------
   135   */
   136  
   137  int
   138  Tcl_UniCharToUtf(
   139      int ch,			/* The Tcl_UniChar to be stored in the
   140  				 * buffer. */
   141      char *buf)			/* Buffer in which the UTF-8 representation of
   142  				 * the Tcl_UniChar is stored. Buffer must be
   143  				 * large enough to hold the UTF-8 character
   144  				 * (at most TCL_UTF_MAX bytes). */
   145  {
   146      if ((unsigned)(ch - 1) < (UNICODE_SELF - 1)) {
   147  	buf[0] = (char) ch;
   148  	return 1;
   149      }
   150      if (ch >= 0) {
   151  	if (ch <= 0x7FF) {
   152  	    buf[1] = (char) ((ch | 0x80) & 0xBF);
   153  	    buf[0] = (char) ((ch >> 6) | 0xC0);
   154  	    return 2;
   155  	}
   156  	if (ch <= 0xFFFF) {
   157  #if TCL_UTF_MAX > 3
158 if ((ch & 0xF800) == 0xD800) { 159 if (ch & 0x0400) { 160 /* Low surrogate */ 161 if (((buf[0] & 0xF8) == 0xF0) && ((buf[1] & 0xC0) == 0x80) 162 && ((buf[2] & 0xCF) == 0)) { 163 /* Previous Tcl_UniChar was a High surrogate, so combine */ 164 buf[3] = (char) ((ch & 0x3F) | 0x80); 165 buf[2] |= (char) (((ch >> 6) & 0x0F) | 0x80); 166 return 4; 167 } 168 /* Previous Tcl_UniChar was not a High surrogate, so just output */ 169 } else { 170 /* High surrogate */ 171 ch += 0x40; 172 /* Fill buffer with specific 3-byte (invalid) byte combination, 173 so following Low surrogate can recognize it and combine */ 174 buf[2] = (char) ((ch << 4) & 0x30); 175 buf[1] = (char) (((ch >> 2) & 0x3F) | 0x80); 176 buf[0] = (char) (((ch >> 8) & 0x07) | 0xF0); 177 return 0; 178 } 179 }
180 #endif 181 goto three; 182 } 183 184 #if TCL_UTF_MAX > 3 185 if (ch <= 0x10FFFF) { 186 buf[3] = (char) ((ch | 0x80) & 0xBF); 187 buf[2] = (char) (((ch >> 6) | 0x80) & 0xBF); 188 buf[1] = (char) (((ch >> 12) | 0x80) & 0xBF); 189 buf[0] = (char) ((ch >> 18) | 0xF0); 190 return 4; 191 } 192 #endif 193 } 194 195 ch = 0xFFFD; 196 three: 197 buf[2] = (char) ((ch | 0x80) & 0xBF); 198 buf[1] = (char) (((ch >> 6) | 0x80) & 0xBF); 199 buf[0] = (char) ((ch >> 12) | 0xE0); 200 return 3; 201 } 202 203 /* 204 *--------------------------------------------------------------------------- 205 * 206 * Tcl_UniCharToUtfDString -- 207 * 208 * Convert the given Unicode string to UTF-8. 209 * 210 * Results: 211 * The return value is a pointer to the UTF-8 representation of the 212 * Unicode string. Storage for the return value is appended to the end of 213 * dsPtr. 214 * 215 * Side effects: 216 * None. 217 * 218 *--------------------------------------------------------------------------- 219 */ 220 221 char * 222 Tcl_UniCharToUtfDString( 223 const Tcl_UniChar *uniStr, /* Unicode string to convert to UTF-8. */ 224 int uniLength, /* Length of Unicode string in Tcl_UniChars 225 * (must be >= 0). */ 226 Tcl_DString *dsPtr) /* UTF-8 representation of string is appended 227 * to this previously initialized DString. */ 228 { 229 const Tcl_UniChar *w, *wEnd; 230 char *p, *string; 231 int oldLength; 232 233 /* 234 * UTF-8 string length in bytes will be <= Unicode string length * 235 * TCL_UTF_MAX. 
236 */ 237 238 oldLength = Tcl_DStringLength(dsPtr); 239 Tcl_DStringSetLength(dsPtr, (oldLength + uniLength + 1) * TCL_UTF_MAX); 240 string = Tcl_DStringValue(dsPtr) + oldLength; 241 242 p = string; 243 wEnd = uniStr + uniLength; 244 for (w = uniStr; w < wEnd; ) { 245 p += Tcl_UniCharToUtf(*w, p); 246 w++; 247 } 248 Tcl_DStringSetLength(dsPtr, oldLength + (p - string)); 249 250 return string; 251 } 252 253 /* 254 *--------------------------------------------------------------------------- 255 * 256 * Tcl_UtfToUniChar -- 257 * 258 * Extract the Tcl_UniChar represented by the UTF-8 string. Bad UTF-8 259 * sequences are converted to valid Tcl_UniChars and processing 260 * continues. Equivalent to Plan 9 chartorune(). 261 * 262 * The caller must ensure that the source buffer is long enough that this 263 * routine does not run off the end and dereference non-existent memory 264 * looking for trail bytes. If the source buffer is known to be '\0' 265 * terminated, this cannot happen. Otherwise, the caller should call 266 * Tcl_UtfCharComplete() before calling this routine to ensure that 267 * enough bytes remain in the string. 268 * 269 * If TCL_UTF_MAX == 4, special handling of Surrogate pairs is done: 270 * For any UTF-8 string containing a character outside of the BMP, the 271 * first call to this function will fill *chPtr with the high surrogate 272 * and generate a return value of 0. Calling Tcl_UtfToUniChar again 273 * will produce the low surrogate and a return value of 4. Because *chPtr 274 * is used to remember whether the high surrogate is already produced, it 275 * is recommended to initialize the variable it points to as 0 before 276 * the first call to Tcl_UtfToUniChar is done. 277 * 278 * Results: 279 * *chPtr is filled with the Tcl_UniChar, and the return value is the 280 * number of bytes from the UTF-8 string that were consumed. 281 * 282 * Side effects: 283 * None. 
284 * 285 *--------------------------------------------------------------------------- 286 */ 287 288 int 289 Tcl_UtfToUniChar( 290 register const char *src, /* The UTF-8 string. */ 291 register Tcl_UniChar *chPtr)/* Filled with the Tcl_UniChar represented by 292 * the UTF-8 string. */ 293 { 294 register int byte; 295 296 /* 297 * Unroll 1 to 3 (or 4) byte UTF-8 sequences. 298 */ 299 300 byte = *((unsigned char *) src); 301 if (byte < 0xC0) { 302 /* 303 * Handles properly formed UTF-8 characters between 0x01 and 0x7F. 304 * Also treats \0 and naked trail bytes 0x80 to 0xBF as valid 305 * characters representing themselves. 306 */ 307 308 *chPtr = (Tcl_UniChar) byte; 309 return 1; 310 } else if (byte < 0xE0) { 311 if ((src[1] & 0xC0) == 0x80) { 312 /* 313 * Two-byte-character lead-byte followed by a trail-byte. 314 */ 315 316 *chPtr = (Tcl_UniChar) (((byte & 0x1F) << 6) | (src[1] & 0x3F)); 317 if ((unsigned)(*chPtr - 1) >= (UNICODE_SELF - 1)) { 318 return 2; 319 } 320 } 321 322 /* 323 * A two-byte-character lead-byte not followed by trail-byte 324 * represents itself. 325 */ 326 } else if (byte < 0xF0) { 327 if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80)) { 328 /* 329 * Three-byte-character lead byte followed by two trail bytes. 330 */ 331 332 *chPtr = (Tcl_UniChar) (((byte & 0x0F) << 12) 333 | ((src[1] & 0x3F) << 6) | (src[2] & 0x3F)); 334 if (*chPtr > 0x7FF) { 335 return 3; 336 } 337 } 338 339 /* 340 * A three-byte-character lead-byte not followed by two trail-bytes 341 * represents itself. 342 */ 343 } 344 #if TCL_UTF_MAX > 3 345 else if (byte < 0xF8) { 346 if (((src[1] & 0xC0) == 0x80) && ((src[2] & 0xC0) == 0x80) && ((src[3] & 0xC0) == 0x80)) { 347 /* 348 * Four-byte-character lead byte followed by three trail bytes. 
349 */ 350 #if TCL_UTF_MAX == 4 351 Tcl_UniChar surrogate; 352 353 byte = (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) 354 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)) - 0x10000; 355 surrogate = (Tcl_UniChar) (0xD800 + (byte >> 10)); 356 if (byte & 0x100000) { 357 /* out of range, < 0x10000 or > 0x10ffff */ 358 } else if (*chPtr != surrogate) { 359 /* produce high surrogate, but don't advance source pointer */ 360 *chPtr = surrogate; 361 return 0; 362 } else { 363 /* produce low surrogate, and advance source pointer */ 364 *chPtr = (Tcl_UniChar) (0xDC00 | (byte & 0x3FF)); 365 return 4; 366 } 367 #else 368 *chPtr = (Tcl_UniChar) (((byte & 0x07) << 18) | ((src[1] & 0x3F) << 12) 369 | ((src[2] & 0x3F) << 6) | (src[3] & 0x3F)); 370 if ((unsigned)(*chPtr - 0x10000) <= 0xFFFFF) { 371 return 4; 372 } 373 #endif 374 } 375 376 /* 377 * A four-byte-character lead-byte not followed by two trail-bytes 378 * represents itself. 379 */ 380 } 381 #endif 382 383 *chPtr = (Tcl_UniChar) byte; 384 return 1; 385 } 386 387 /* 388 *--------------------------------------------------------------------------- 389 * 390 * Tcl_UtfToUniCharDString -- 391 * 392 * Convert the UTF-8 string to Unicode. 393 * 394 * Results: 395 * The return value is a pointer to the Unicode representation of the 396 * UTF-8 string. Storage for the return value is appended to the end of 397 * dsPtr. The Unicode string is terminated with a Unicode NULL character. 398 * 399 * Side effects: 400 * None. 401 * 402 *--------------------------------------------------------------------------- 403 */ 404 405 Tcl_UniChar * 406 Tcl_UtfToUniCharDString( 407 const char *src, /* UTF-8 string to convert to Unicode. */ 408 int length, /* Length of UTF-8 string in bytes, or -1 for 409 * strlen(). */ 410 Tcl_DString *dsPtr) /* Unicode representation of string is 411 * appended to this previously initialized 412 * DString. 
*/ 413 { 414 Tcl_UniChar ch = 0, *w, *wString; 415 const char *p, *end; 416 int oldLength; 417 418 if (length < 0) { 419 length = strlen(src); 420 } 421 422 /* 423 * Unicode string length in Tcl_UniChars will be <= UTF-8 string length in 424 * bytes. 425 */ 426 427 oldLength = Tcl_DStringLength(dsPtr); 428 /* TODO: fix overreach! */ 429 Tcl_DStringSetLength(dsPtr, 430 (int) ((oldLength + length + 1) * sizeof(Tcl_UniChar))); 431 wString = (Tcl_UniChar *) (Tcl_DStringValue(dsPtr) + oldLength); 432 433 w = wString; 434 end = src + length; 435 for (p = src; p < end; ) { 436 p += TclUtfToUniChar(p, &ch); 437 *w++ = ch; 438 } 439 *w = '\0'; 440 Tcl_DStringSetLength(dsPtr, 441 (oldLength + ((char *) w - (char *) wString))); 442 443 return wString; 444 } 445 446 /* 447 *--------------------------------------------------------------------------- 448 * 449 * Tcl_UtfCharComplete -- 450 * 451 * Determine if the UTF-8 string of the given length is long enough to be 452 * decoded by Tcl_UtfToUniChar(). This does not ensure that the UTF-8 453 * string is properly formed. Equivalent to Plan 9 fullrune(). 454 * 455 * Results: 456 * The return value is 0 if the string is not long enough, non-zero 457 * otherwise. 458 * 459 * Side effects: 460 * None. 461 * 462 *--------------------------------------------------------------------------- 463 */ 464 465 int 466 Tcl_UtfCharComplete( 467 const char *src, /* String to check if first few bytes contain 468 * a complete UTF-8 character. */ 469 int length) /* Length of above string in bytes. */ 470 { 471 return length >= totalBytes[(unsigned char)*src]; 472 } 473 474 /* 475 *--------------------------------------------------------------------------- 476 * 477 * Tcl_NumUtfChars -- 478 * 479 * Returns the number of characters (not bytes) in the UTF-8 string, not 480 * including the terminating NULL byte. This is equivalent to Plan 9 481 * utflen() and utfnlen(). 482 * 483 * Results: 484 * As above. 485 * 486 * Side effects: 487 * None. 
488 * 489 *--------------------------------------------------------------------------- 490 */ 491 492 int 493 Tcl_NumUtfChars( 494 register const char *src, /* The UTF-8 string to measure. */ 495 int length) /* The length of the string in bytes, or -1 496 * for strlen(string). */ 497 { 498 Tcl_UniChar ch = 0; 499 register int i = 0; 500 501 /* 502 * The separate implementations are faster. 503 * 504 * Since this is a time-sensitive function, we also do the check for the 505 * single-byte char case specially. 506 */ 507 508 if (length < 0) { 509 while (*src != '\0') { 510 src += TclUtfToUniChar(src, &ch); 511 i++; 512 } 513 if (i < 0) i = INT_MAX; /* Bug [2738427] */ 514 } else { 515 register const char *endPtr = src + length - TCL_UTF_MAX; 516 517 while (src < endPtr) { 518 src += TclUtfToUniChar(src, &ch); 519 i++; 520 } 521 endPtr += TCL_UTF_MAX; 522 while ((src < endPtr) && Tcl_UtfCharComplete(src, endPtr - src)) { 523 src += TclUtfToUniChar(src, &ch); 524 i++; 525 } 526 if (src < endPtr) { 527 i += endPtr - src; 528 } 529 } 530 return i; 531 } 532 533 /* 534 *--------------------------------------------------------------------------- 535 * 536 * Tcl_UtfFindFirst -- 537 * 538 * Returns a pointer to the first occurance of the given Unicode character 539 * in the NULL-terminated UTF-8 string. The NULL terminator is considered 540 * part of the UTF-8 string. Equivalent to Plan 9 utfrune(). 541 * 542 * Results: 543 * As above. If the Unicode character does not exist in the given string, 544 * the return value is NULL. 545 * 546 * Side effects: 547 * None. 548 * 549 *--------------------------------------------------------------------------- 550 */ 551 552 const char * 553 Tcl_UtfFindFirst( 554 const char *src, /* The UTF-8 string to be searched. */ 555 int ch) /* The Unicode character to search for. 
*/ 556 { 557 int len, fullchar; 558 Tcl_UniChar find = 0; 559 560 while (1) { 561 len = TclUtfToUniChar(src, &find); 562 fullchar = find; 563 #if TCL_UTF_MAX == 4 564 if (!len) { 565 len += TclUtfToUniChar(src, &find); 566 fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; 567 } 568 #endif 569 if (fullchar == ch) { 570 return src; 571 } 572 if (*src == '\0') { 573 return NULL; 574 } 575 src += len; 576 } 577 } 578 579 /* 580 *--------------------------------------------------------------------------- 581 * 582 * Tcl_UtfFindLast -- 583 * 584 * Returns a pointer to the last occurance of the given Unicode character 585 * in the NULL-terminated UTF-8 string. The NULL terminator is considered 586 * part of the UTF-8 string. Equivalent to Plan 9 utfrrune(). 587 * 588 * Results: 589 * As above. If the Unicode character does not exist in the given string, the 590 * return value is NULL. 591 * 592 * Side effects: 593 * None. 594 * 595 *--------------------------------------------------------------------------- 596 */ 597 598 const char * 599 Tcl_UtfFindLast( 600 const char *src, /* The UTF-8 string to be searched. */ 601 int ch) /* The Unicode character to search for. */ 602 { 603 int len, fullchar; 604 Tcl_UniChar find = 0; 605 const char *last; 606 607 last = NULL; 608 while (1) { 609 len = TclUtfToUniChar(src, &find); 610 fullchar = find; 611 #if TCL_UTF_MAX == 4 612 if (!len) { 613 len += TclUtfToUniChar(src, &find); 614 fullchar = (((fullchar & 0x3ff) << 10) | (find & 0x3ff)) + 0x10000; 615 } 616 #endif 617 if (fullchar == ch) { 618 last = src; 619 } 620 if (*src == '\0') { 621 break; 622 } 623 src += len; 624 } 625 return last; 626 } 627 628 /* 629 *--------------------------------------------------------------------------- 630 * 631 * Tcl_UtfNext -- 632 * 633 * Given a pointer to some current location in a UTF-8 string, move 634 * forward one character. 
The caller must ensure that they are not asking 635 * for the next character after the last character in the string. 636 * 637 * Results: 638 * The return value is the pointer to the next character in the UTF-8 639 * string. 640 * 641 * Side effects: 642 * None. 643 * 644 *--------------------------------------------------------------------------- 645 */ 646 647 const char * 648 Tcl_UtfNext( 649 const char *src) /* The current location in the string. */ 650 { 651 Tcl_UniChar ch = 0; 652 int len = TclUtfToUniChar(src, &ch); 653 654 #if TCL_UTF_MAX == 4 655 if (len == 0) { 656 len = TclUtfToUniChar(src, &ch); 657 } 658 #endif 659 return src + len; 660 } 661 662 /* 663 *--------------------------------------------------------------------------- 664 * 665 * Tcl_UtfPrev -- 666 * 667 * Given a pointer to some current location in a UTF-8 string, move 668 * backwards one character. This works correctly when the pointer is in 669 * the middle of a UTF-8 character. 670 * 671 * Results: 672 * The return value is a pointer to the previous character in the UTF-8 673 * string. If the current location was already at the beginning of the 674 * string, the return value will also be a pointer to the beginning of 675 * the string. 676 * 677 * Side effects: 678 * None. 679 * 680 *--------------------------------------------------------------------------- 681 */ 682 683 const char * 684 Tcl_UtfPrev( 685 const char *src, /* The current location in the string. */ 686 const char *start) /* Pointer to the beginning of the string, to 687 * avoid going backwards too far. 
*/ 688 { 689 const char *look; 690 int i, byte; 691 692 look = --src; 693 for (i = 0; i < TCL_UTF_MAX; i++) { 694 if (look < start) { 695 if (src < start) { 696 src = start; 697 } 698 break; 699 } 700 byte = *((unsigned char *) look); 701 if (byte < 0x80) { 702 break; 703 } 704 if (byte >= 0xC0) { 705 return look; 706 } 707 look--; 708 } 709 return src; 710 } 711 712 /* 713 *--------------------------------------------------------------------------- 714 * 715 * Tcl_UniCharAtIndex -- 716 * 717 * Returns the Tcl_UniChar represented at the specified character 718 * (not byte) position in the UTF-8 string. 719 * 720 * Results: 721 * As above. 722 * 723 * Side effects: 724 * None. 725 * 726 *--------------------------------------------------------------------------- 727 */ 728 729 Tcl_UniChar 730 Tcl_UniCharAtIndex( 731 register const char *src, /* The UTF-8 string to dereference. */ 732 register int index) /* The position of the desired character. */ 733 { 734 Tcl_UniChar ch = 0; 735 736 while (index-- >= 0) { 737 src += TclUtfToUniChar(src, &ch); 738 } 739 return ch; 740 } 741 742 /* 743 *--------------------------------------------------------------------------- 744 * 745 * Tcl_UtfAtIndex -- 746 * 747 * Returns a pointer to the specified character (not byte) position in 748 * the UTF-8 string. 749 * 750 * Results: 751 * As above. 752 * 753 * Side effects: 754 * None. 755 * 756 *--------------------------------------------------------------------------- 757 */ 758 759 const char * 760 Tcl_UtfAtIndex( 761 register const char *src, /* The UTF-8 string. */ 762 register int index) /* The position of the desired character. 
*/ 763 { 764 Tcl_UniChar ch = 0; 765 int len = 1; 766 767 while (index-- > 0) { 768 len = TclUtfToUniChar(src, &ch); 769 src += len; 770 } 771 #if TCL_UTF_MAX == 4 772 if (!len) { 773 /* Index points at character following High Surrogate */ 774 src += TclUtfToUniChar(src, &ch); 775 } 776 #endif 777 return src; 778 } 779 780 /* 781 *--------------------------------------------------------------------------- 782 * 783 * Tcl_UtfBackslash -- 784 * 785 * Figure out how to handle a backslash sequence. 786 * 787 * Results: 788 * Stores the bytes represented by the backslash sequence in dst and 789 * returns the number of bytes written to dst. At most TCL_UTF_MAX bytes 790 * are written to dst; dst must have been large enough to accept those 791 * bytes. If readPtr isn't NULL then it is filled in with a count of the 792 * number of bytes in the backslash sequence. 793 * 794 * Side effects: 795 * The maximum number of bytes it takes to represent a Unicode character 796 * in UTF-8 is guaranteed to be less than the number of bytes used to 797 * express the backslash sequence that represents that Unicode character. 798 * If the target buffer into which the caller is going to store the bytes 799 * that represent the Unicode character is at least as large as the 800 * source buffer from which the backslashed sequence was extracted, no 801 * buffer overruns should occur. 802 * 803 *--------------------------------------------------------------------------- 804 */ 805 806 int 807 Tcl_UtfBackslash( 808 const char *src, /* Points to the backslash character of a 809 * backslash sequence. */ 810 int *readPtr, /* Fill in with number of characters read from 811 * src, unless NULL. */ 812 char *dst) /* Filled with the bytes represented by the 813 * backslash sequence. */ 814 { 815 #define LINE_LENGTH 128 816 int numRead; 817 int result; 818 819 result = TclParseBackslash(src, LINE_LENGTH, &numRead, dst); 820 if (numRead == LINE_LENGTH) { 821 /* 822 * We ate a whole line. 
Pay the price of a strlen() 823 */ 824 825 result = TclParseBackslash(src, (int)strlen(src), &numRead, dst); 826 } 827 if (readPtr != NULL) { 828 *readPtr = numRead; 829 } 830 return result; 831 } 832 833 /* 834 *---------------------------------------------------------------------- 835 * 836 * Tcl_UtfToUpper -- 837 * 838 * Convert lowercase characters to uppercase characters in a UTF string 839 * in place. The conversion may shrink the UTF string. 840 * 841 * Results: 842 * Returns the number of bytes in the resulting string excluding the 843 * trailing null. 844 * 845 * Side effects: 846 * Writes a terminating null after the last converted character. 847 * 848 *---------------------------------------------------------------------- 849 */ 850 851 int 852 Tcl_UtfToUpper( 853 char *str) /* String to convert in place. */ 854 { 855 Tcl_UniChar ch = 0, upChar; 856 char *src, *dst; 857 int bytes; 858 859 /* 860 * Iterate over the string until we hit the terminating null. 861 */ 862 863 src = dst = str; 864 while (*src) { 865 bytes = TclUtfToUniChar(src, &ch); 866 upChar = Tcl_UniCharToUpper(ch); 867 868 /* 869 * To keep badly formed Utf strings from getting inflated by the 870 * conversion (thereby causing a segfault), only copy the upper case 871 * char to dst if its size is <= the original char. 872 */ 873 874 if (bytes < UtfCount(upChar)) { 875 memcpy(dst, src, (size_t) bytes); 876 dst += bytes; 877 } else { 878 dst += Tcl_UniCharToUtf(upChar, dst); 879 } 880 src += bytes; 881 } 882 *dst = '\0'; 883 return (dst - str); 884 } 885 886 /* 887 *---------------------------------------------------------------------- 888 * 889 * Tcl_UtfToLower -- 890 * 891 * Convert uppercase characters to lowercase characters in a UTF string 892 * in place. The conversion may shrink the UTF string. 893 * 894 * Results: 895 * Returns the number of bytes in the resulting string excluding the 896 * trailing null. 
897 * 898 * Side effects: 899 * Writes a terminating null after the last converted character. 900 * 901 *---------------------------------------------------------------------- 902 */ 903 904 int 905 Tcl_UtfToLower( 906 char *str) /* String to convert in place. */ 907 { 908 Tcl_UniChar ch = 0, lowChar; 909 char *src, *dst; 910 int bytes; 911 912 /* 913 * Iterate over the string until we hit the terminating null. 914 */ 915 916 src = dst = str; 917 while (*src) { 918 bytes = TclUtfToUniChar(src, &ch); 919 lowChar = Tcl_UniCharToLower(ch); 920 921 /* 922 * To keep badly formed Utf strings from getting inflated by the 923 * conversion (thereby causing a segfault), only copy the lower case 924 * char to dst if its size is <= the original char. 925 */ 926 927 if (bytes < UtfCount(lowChar)) { 928 memcpy(dst, src, (size_t) bytes); 929 dst += bytes; 930 } else { 931 dst += Tcl_UniCharToUtf(lowChar, dst); 932 } 933 src += bytes; 934 } 935 *dst = '\0'; 936 return (dst - str); 937 } 938 939 /* 940 *---------------------------------------------------------------------- 941 * 942 * Tcl_UtfToTitle -- 943 * 944 * Changes the first character of a UTF string to title case or uppercase 945 * and the rest of the string to lowercase. The conversion happens in 946 * place and may shrink the UTF string. 947 * 948 * Results: 949 * Returns the number of bytes in the resulting string excluding the 950 * trailing null. 951 * 952 * Side effects: 953 * Writes a terminating null after the last converted character. 954 * 955 *---------------------------------------------------------------------- 956 */ 957 958 int 959 Tcl_UtfToTitle( 960 char *str) /* String to convert in place. */ 961 { 962 Tcl_UniChar ch = 0, titleChar, lowChar; 963 char *src, *dst; 964 int bytes; 965 966 /* 967 * Capitalize the first character and then lowercase the rest of the 968 * characters until we get to a null. 
969 */ 970 971 src = dst = str; 972 973 if (*src) { 974 bytes = TclUtfToUniChar(src, &ch); 975 titleChar = Tcl_UniCharToTitle(ch); 976 977 if (bytes < UtfCount(titleChar)) { 978 memcpy(dst, src, (size_t) bytes); 979 dst += bytes; 980 } else { 981 dst += Tcl_UniCharToUtf(titleChar, dst); 982 } 983 src += bytes; 984 } 985 while (*src) { 986 bytes = TclUtfToUniChar(src, &ch); 987 lowChar = ch; 988 /* Special exception for Georgian Asomtavruli chars, no titlecase. */ 989 if ((unsigned)(lowChar - 0x1C90) >= 0x30) { 990 lowChar = Tcl_UniCharToLower(lowChar); 991 } 992 993 if (bytes < UtfCount(lowChar)) { 994 memcpy(dst, src, (size_t) bytes); 995 dst += bytes; 996 } else { 997 dst += Tcl_UniCharToUtf(lowChar, dst); 998 } 999 src += bytes; 1000 } 1001 *dst = '\0'; 1002 return (dst - str); 1003 } 1004 1005 /* 1006 *---------------------------------------------------------------------- 1007 * 1008 * TclpUtfNcmp2 -- 1009 * 1010 * Compare at most numBytes bytes of utf-8 strings cs and ct. Both cs and 1011 * ct are assumed to be at least numBytes bytes long. 1012 * 1013 * Results: 1014 * Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. 1015 * 1016 * Side effects: 1017 * None. 1018 * 1019 *---------------------------------------------------------------------- 1020 */ 1021 1022 int 1023 TclpUtfNcmp2( 1024 const char *cs, /* UTF string to compare to ct. */ 1025 const char *ct, /* UTF string cs is compared to. */ 1026 unsigned long numBytes) /* Number of *bytes* to compare. */ 1027 { 1028 /* 1029 * We can't simply call 'memcmp(cs, ct, numBytes);' because we need to 1030 * check for Tcl's \xC0\x80 non-utf-8 null encoding. Otherwise utf-8 lexes 1031 * fine in the strcmp manner. 
 */

    register int result = 0;

    for ( ; numBytes != 0; numBytes--, cs++, ct++) {
	if (*cs != *ct) {
	    result = UCHAR(*cs) - UCHAR(*ct);
	    break;
	}
    }
    if (numBytes && ((UCHAR(*cs) == 0xC0) || (UCHAR(*ct) == 0xC0))) {
	unsigned char c1, c2;

	c1 = ((UCHAR(*cs) == 0xC0) && (UCHAR(cs[1]) == 0x80)) ? 0 : UCHAR(*cs);
	c2 = ((UCHAR(*ct) == 0xC0) && (UCHAR(ct[1]) == 0x80)) ? 0 : UCHAR(*ct);
	result = (c1 - c2);
    }
    return result;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfNcmp --
 *
 *	Compare at most numChars UTF chars of string cs to string ct. Both cs
 *	and ct are assumed to be at least numChars UTF chars long.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. When
 *	TCL_UTF_MAX == 4 and exactly one of the two differing values lies in
 *	the range 0xD800-0xDBFF, that side is reported as the greater one.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UtfNcmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    /*
     * Cannot use 'memcmp(cs, ct, n);' as the byte representation of \u0000
     * (the pair of bytes 0xC0,0x80) is larger than the byte representation
     * of \u0001 (the byte 0x01).
     */

    while (numChars-- > 0) {
	/*
	 * numChars counts characters, not bytes. This function should be
	 * called only when both strings are at least numChars characters
	 * long, which is why there is no check for a terminating \0 here.
	 */

	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
	if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
	    /*
	     * Surrogates always report higher than non-surrogates. The mask
	     * test below matches values in the range 0xD800-0xDBFF only.
	     */
	    if (((ch1 & 0xFC00) == 0xD800)) {
		if ((ch2 & 0xFC00) != 0xD800) {
		    return ch1;
		}
	    } else if ((ch2 & 0xFC00) == 0xD800) {
		return -ch2;
	    }
#endif
	    return (ch1 - ch2);
	}
    }
    return 0;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UtfNcasecmp --
 *
 *	Compare at most numChars UTF chars of string cs to string ct case
 *	insensitive. Both cs and ct are assumed to be at least numChars UTF
 *	chars long.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct. Characters
 *	are compared after conversion with Tcl_UniCharToLower.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UtfNcasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct,		/* UTF string cs is compared to. */
    unsigned long numChars)	/* Number of UTF chars to compare. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (numChars-- > 0) {
	/*
	 * numChars counts characters, not bytes. This function should be
	 * called only when both strings are at least numChars characters
	 * long, which is why there is no check for a terminating \0 here.
	 */
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);

	/*
	 * Case conversion is only attempted when the raw values differ.
	 */

	if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
	    /*
	     * Surrogates always report higher than non-surrogates. The mask
	     * test below matches values in the range 0xD800-0xDBFF only.
	     */
	    if (((ch1 & 0xFC00) == 0xD800)) {
		if ((ch2 & 0xFC00) != 0xD800) {
		    return ch1;
		}
	    } else if ((ch2 & 0xFC00) == 0xD800) {
		return -ch2;
	    }
#endif
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return (ch1 - ch2);
	    }
	}
    }
    return 0;
}

/*
 *----------------------------------------------------------------------
 *
 * TclUtfCasecmp --
 *
 *	Compare UTF chars of string cs to string ct case insensitively.
 *	Replacement for strcasecmp in Tcl core, in places where UTF-8 should
 *	be handled. Both strings are scanned until one of them reaches its
 *	terminating NUL byte.
 *
 * Results:
 *	Return <0 if cs < ct, 0 if cs == ct, or >0 if cs > ct.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
TclUtfCasecmp(
    const char *cs,		/* UTF string to compare to ct. */
    const char *ct)		/* UTF string cs is compared to. */
{
    Tcl_UniChar ch1 = 0, ch2 = 0;

    while (*cs && *ct) {
	cs += TclUtfToUniChar(cs, &ch1);
	ct += TclUtfToUniChar(ct, &ch2);
	if (ch1 != ch2) {
#if TCL_UTF_MAX == 4
	    /*
	     * Surrogates always report higher than non-surrogates. The mask
	     * test below matches values in the range 0xD800-0xDBFF only.
	     */
	    if (((ch1 & 0xFC00) == 0xD800)) {
		if ((ch2 & 0xFC00) != 0xD800) {
		    return ch1;
		}
	    } else if ((ch2 & 0xFC00) == 0xD800) {
		return -ch2;
	    }
#endif
	    ch1 = Tcl_UniCharToLower(ch1);
	    ch2 = Tcl_UniCharToLower(ch2);
	    if (ch1 != ch2) {
		return ch1 - ch2;
	    }
	}
    }

    /*
     * At least one string is exhausted. Comparing the current bytes yields 0
     * when both ended together and otherwise orders the string that ended
     * first before the longer one.
     */

    return UCHAR(*cs) - UCHAR(*ct);
}


/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToUpper --
 *
 *	Compute the uppercase equivalent of the given Unicode character.
 *
 * Results:
 *	Returns the uppercase Unicode character, cast to Tcl_UniChar. The
 *	input is returned unchanged when its case type does not have the 0x04
 *	bit set.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

Tcl_UniChar
Tcl_UniCharToUpper(
    int ch)			/* Unicode character to convert. */
{
    int info = GetUniCharInfo(ch);

    /*
     * When the 0x04 bit of the case type is set, the uppercase form is
     * obtained by subtracting the delta stored for this character.
     */

    if (GetCaseType(info) & 0x04) {
	ch -= GetDelta(info);
    }
    return (Tcl_UniChar) ch;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToLower --
 *
 *	Compute the lowercase equivalent of the given Unicode character.
 *
 * Results:
 *	Returns the lowercase Unicode character, cast to Tcl_UniChar. The
 *	input is returned unchanged when its case type lacks the 0x02 bit or
 *	is exactly 0x7.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

Tcl_UniChar
Tcl_UniCharToLower(
    int ch)			/* Unicode character to convert. */
{
    int info = GetUniCharInfo(ch);
    int mode = GetCaseType(info);

    /*
     * When the 0x02 bit of the case type is set, the lowercase form is
     * obtained by adding the delta stored for this character. Case type 0x7
     * is excluded. NOTE(review): the meaning of case type 0x7 is defined by
     * the tables in tclUniData.c - confirm there.
     */

    if ((mode & 0x02) && (mode != 0x7)) {
	ch += GetDelta(info);
    }
    return (Tcl_UniChar) ch;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharToTitle --
 *
 *	Compute the titlecase equivalent of the given Unicode character.
 *
 * Results:
 *	Returns the titlecase Unicode character, cast to Tcl_UniChar.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

Tcl_UniChar
Tcl_UniCharToTitle(
    int ch)			/* Unicode character to convert. */
{
    int info = GetUniCharInfo(ch);
    int mode = GetCaseType(info);

    if (mode & 0x1) {
	/*
	 * Subtract or add one depending on the original case: subtract when
	 * the 0x4 bit is set, add otherwise. Case type 0x7 is left
	 * unchanged.
	 */

	if (mode != 0x7) {
	    ch += ((mode & 0x4) ? -1 : 1);
	}
    } else if (mode == 0x4) {
	/*
	 * Same adjustment as Tcl_UniCharToUpper applies for this case type.
	 */

	ch -= GetDelta(info);
    }
    return (Tcl_UniChar) ch;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharLen --
 *
 *	Find the length of a UniChar string. The str input must be null
 *	terminated.
 *
 * Results:
 *	Returns the length of str in UniChars (not bytes); the terminating
 *	zero element is not counted.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharLen(
    const Tcl_UniChar *uniStr)	/* Unicode string to find length of. */
{
    int len = 0;

    while (*uniStr != '\0') {
	len++;
	uniStr++;
    }
    return len;
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharNcmp --
 *
 *	Compare at most numChars unichars of string ucs to string uct.
 *	Both ucs and uct are assumed to be at least numChars unichars long.
1345 * 1346 * Results: 1347 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 1348 * 1349 * Side effects: 1350 * None. 1351 * 1352 *---------------------------------------------------------------------- 1353 */ 1354 1355 int 1356 Tcl_UniCharNcmp( 1357 const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 1358 const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 1359 unsigned long numChars) /* Number of unichars to compare. */ 1360 { 1361 #ifdef WORDS_BIGENDIAN 1362 /* 1363 * We are definitely on a big-endian machine; memcmp() is safe 1364 */ 1365 1366 return memcmp(ucs, uct, numChars*sizeof(Tcl_UniChar)); 1367 1368 #else /* !WORDS_BIGENDIAN */ 1369 /* 1370 * We can't simply call memcmp() because that is not lexically correct. 1371 */ 1372 1373 for ( ; numChars != 0; ucs++, uct++, numChars--) { 1374 if (*ucs != *uct) { 1375 return (*ucs - *uct); 1376 } 1377 } 1378 return 0; 1379 #endif /* WORDS_BIGENDIAN */ 1380 } 1381 1382 /* 1383 *---------------------------------------------------------------------- 1384 * 1385 * Tcl_UniCharNcasecmp -- 1386 * 1387 * Compare at most numChars unichars of string ucs to string uct case 1388 * insensitive. Both ucs and uct are assumed to be at least numChars 1389 * unichars long. 1390 * 1391 * Results: 1392 * Return <0 if ucs < uct, 0 if ucs == uct, or >0 if ucs > uct. 1393 * 1394 * Side effects: 1395 * None. 1396 * 1397 *---------------------------------------------------------------------- 1398 */ 1399 1400 int 1401 Tcl_UniCharNcasecmp( 1402 const Tcl_UniChar *ucs, /* Unicode string to compare to uct. */ 1403 const Tcl_UniChar *uct, /* Unicode string ucs is compared to. */ 1404 unsigned long numChars) /* Number of unichars to compare. 
*/ 1405 { 1406 for ( ; numChars != 0; numChars--, ucs++, uct++) { 1407 if (*ucs != *uct) { 1408 Tcl_UniChar lcs = Tcl_UniCharToLower(*ucs); 1409 Tcl_UniChar lct = Tcl_UniCharToLower(*uct); 1410 1411 if (lcs != lct) { 1412 return (lcs - lct); 1413 } 1414 } 1415 } 1416 return 0; 1417 } 1418 1419 /* 1420 *---------------------------------------------------------------------- 1421 * 1422 * Tcl_UniCharIsAlnum -- 1423 * 1424 * Test if a character is an alphanumeric Unicode character. 1425 * 1426 * Results: 1427 * Returns 1 if character is alphanumeric. 1428 * 1429 * Side effects: 1430 * None. 1431 * 1432 *---------------------------------------------------------------------- 1433 */ 1434 1435 int 1436 Tcl_UniCharIsAlnum( 1437 int ch) /* Unicode character to test. */ 1438 { 1439 #if TCL_UTF_MAX > 3 1440 if (UNICODE_OUT_OF_RANGE(ch)) { 1441 return 0; 1442 } 1443 #endif 1444 return (((ALPHA_BITS | DIGIT_BITS) >> GetCategory(ch)) & 1); 1445 } 1446 1447 /* 1448 *---------------------------------------------------------------------- 1449 * 1450 * Tcl_UniCharIsAlpha -- 1451 * 1452 * Test if a character is an alphabetic Unicode character. 1453 * 1454 * Results: 1455 * Returns 1 if character is alphabetic. 1456 * 1457 * Side effects: 1458 * None. 1459 * 1460 *---------------------------------------------------------------------- 1461 */ 1462 1463 int 1464 Tcl_UniCharIsAlpha( 1465 int ch) /* Unicode character to test. */ 1466 { 1467 #if TCL_UTF_MAX > 3 1468 if (UNICODE_OUT_OF_RANGE(ch)) { 1469 return 0; 1470 } 1471 #endif 1472 return ((ALPHA_BITS >> GetCategory(ch)) & 1); 1473 } 1474 1475 /* 1476 *---------------------------------------------------------------------- 1477 * 1478 * Tcl_UniCharIsControl -- 1479 * 1480 * Test if a character is a Unicode control character. 1481 * 1482 * Results: 1483 * Returns non-zero if character is a control. 1484 * 1485 * Side effects: 1486 * None. 
1487 * 1488 *---------------------------------------------------------------------- 1489 */ 1490 1491 int 1492 Tcl_UniCharIsControl( 1493 int ch) /* Unicode character to test. */ 1494 { 1495 #if TCL_UTF_MAX > 3 1496 if (UNICODE_OUT_OF_RANGE(ch)) { 1497 ch &= 0x1FFFFF; 1498 if ((ch == 0xE0001) || ((ch >= 0xE0020) && (ch <= 0xE007f))) { 1499 return 1; 1500 } 1501 if ((ch >= 0xF0000) && ((ch & 0xFFFF) <= 0xFFFD)) { 1502 return 1; 1503 } 1504 return 0; 1505 } 1506 #endif 1507 return ((CONTROL_BITS >> GetCategory(ch)) & 1); 1508 } 1509 1510 /* 1511 *---------------------------------------------------------------------- 1512 * 1513 * Tcl_UniCharIsDigit -- 1514 * 1515 * Test if a character is a numeric Unicode character. 1516 * 1517 * Results: 1518 * Returns non-zero if character is a digit. 1519 * 1520 * Side effects: 1521 * None. 1522 * 1523 *---------------------------------------------------------------------- 1524 */ 1525 1526 int 1527 Tcl_UniCharIsDigit( 1528 int ch) /* Unicode character to test. */ 1529 { 1530 #if TCL_UTF_MAX > 3 1531 if (UNICODE_OUT_OF_RANGE(ch)) { 1532 return 0; 1533 } 1534 #endif 1535 return (GetCategory(ch) == DECIMAL_DIGIT_NUMBER); 1536 } 1537 1538 /* 1539 *---------------------------------------------------------------------- 1540 * 1541 * Tcl_UniCharIsGraph -- 1542 * 1543 * Test if a character is any Unicode print character except space. 1544 * 1545 * Results: 1546 * Returns non-zero if character is printable, but not space. 1547 * 1548 * Side effects: 1549 * None. 1550 * 1551 *---------------------------------------------------------------------- 1552 */ 1553 1554 int 1555 Tcl_UniCharIsGraph( 1556 int ch) /* Unicode character to test. 
*/ 1557 { 1558 #if TCL_UTF_MAX > 3 1559 if (UNICODE_OUT_OF_RANGE(ch)) { 1560 ch &= 0x1FFFFF; 1561 return (ch >= 0xE0100) && (ch <= 0xE01EF); 1562 } 1563 #endif 1564 return ((GRAPH_BITS >> GetCategory(ch)) & 1); 1565 } 1566 1567 /* 1568 *---------------------------------------------------------------------- 1569 * 1570 * Tcl_UniCharIsLower -- 1571 * 1572 * Test if a character is a lowercase Unicode character. 1573 * 1574 * Results: 1575 * Returns non-zero if character is lowercase. 1576 * 1577 * Side effects: 1578 * None. 1579 * 1580 *---------------------------------------------------------------------- 1581 */ 1582 1583 int 1584 Tcl_UniCharIsLower( 1585 int ch) /* Unicode character to test. */ 1586 { 1587 #if TCL_UTF_MAX > 3 1588 if (UNICODE_OUT_OF_RANGE(ch)) { 1589 return 0; 1590 } 1591 #endif 1592 return (GetCategory(ch) == LOWERCASE_LETTER); 1593 } 1594 1595 /* 1596 *---------------------------------------------------------------------- 1597 * 1598 * Tcl_UniCharIsPrint -- 1599 * 1600 * Test if a character is a Unicode print character. 1601 * 1602 * Results: 1603 * Returns non-zero if character is printable. 1604 * 1605 * Side effects: 1606 * None. 1607 * 1608 *---------------------------------------------------------------------- 1609 */ 1610 1611 int 1612 Tcl_UniCharIsPrint( 1613 int ch) /* Unicode character to test. */ 1614 { 1615 #if TCL_UTF_MAX > 3 1616 if (UNICODE_OUT_OF_RANGE(ch)) { 1617 ch &= 0x1FFFFF; 1618 return (ch >= 0xE0100) && (ch <= 0xE01EF); 1619 } 1620 #endif 1621 return (((GRAPH_BITS|SPACE_BITS) >> GetCategory(ch)) & 1); 1622 } 1623 1624 /* 1625 *---------------------------------------------------------------------- 1626 * 1627 * Tcl_UniCharIsPunct -- 1628 * 1629 * Test if a character is a Unicode punctuation character. 1630 * 1631 * Results: 1632 * Returns non-zero if character is punct. 1633 * 1634 * Side effects: 1635 * None. 
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsPunct(
    int ch)			/* Unicode character to test. */
{
#if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
#endif
    /* Pick the PUNCT_BITS bit selected by the character's category. */
    return ((PUNCT_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsSpace --
 *
 *	Test if a character is a whitespace Unicode character.
 *
 * Results:
 *	Returns non-zero if character is a space.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsSpace(
    int ch)			/* Unicode character to test. */
{
#if TCL_UTF_MAX > 3
    /* Ignore upper 11 bits. */
    ch &= 0x1FFFFF;
#else
    /* Ignore upper 16 bits. */
    ch &= 0xFFFF;
#endif

    /*
     * Characters below 0x80 are delegated to TclIsSpaceProc; everything else
     * is decided by an explicit list of code points or, failing that, by the
     * category found in the Unicode table.
     */

    if (ch < 0x80) {
	return TclIsSpaceProc((char) ch);
#if TCL_UTF_MAX > 3
    } else if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
#endif
    } else if (ch == 0x0085 || ch == 0x180E || ch == 0x200B
	    || ch == 0x202F || ch == 0x2060 || ch == 0xFEFF) {
	/*
	 * These code points are always reported as spaces, without looking
	 * at their category.
	 */
	return 1;
    } else {
	return ((SPACE_BITS >> GetCategory(ch)) & 1);
    }
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsUpper --
 *
 *	Test if a character is an uppercase Unicode character, i.e. whether
 *	its category is UPPERCASE_LETTER.
 *
 * Results:
 *	Returns non-zero if character is uppercase.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsUpper(
    int ch)			/* Unicode character to test. */
{
#if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
#endif
    return (GetCategory(ch) == UPPERCASE_LETTER);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharIsWordChar --
 *
 *	Test if a character is alphanumeric or a connector punctuation mark.
 *
 * Results:
 *	Returns 1 if character is a word character.
 *
 * Side effects:
 *	None.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharIsWordChar(
    int ch)			/* Unicode character to test. */
{
#if TCL_UTF_MAX > 3
    if (UNICODE_OUT_OF_RANGE(ch)) {
	return 0;
    }
#endif
    /* Pick the WORD_BITS bit selected by the character's category. */
    return ((WORD_BITS >> GetCategory(ch)) & 1);
}

/*
 *----------------------------------------------------------------------
 *
 * Tcl_UniCharCaseMatch --
 *
 *	See if a particular Unicode string matches a particular pattern.
 *	Allows case insensitivity. This is the Unicode equivalent of the char*
 *	Tcl_StringCaseMatch. The UniChar strings must be NULL-terminated.
 *	This has no provision for counted UniChar strings, thus should not be
 *	used where NULLs are expected in the UniChar string. Use
 *	TclUniCharMatch where possible.
 *
 * Results:
 *	The return value is 1 if string matches pattern, and 0 otherwise. The
 *	matching operation permits the following special characters in the
 *	pattern: *?\[] (see the manual entry for details on what these mean).
 *
 * Side effects:
 *	None. The function calls itself recursively once for every candidate
 *	position tried after a "*" in the pattern.
 *
 *----------------------------------------------------------------------
 */

int
Tcl_UniCharCaseMatch(
    const Tcl_UniChar *uniStr,	/* Unicode String. */
    const Tcl_UniChar *uniPattern,
				/* Pattern, which may contain special
				 * characters. */
    int nocase)			/* 0 for case sensitive, 1 for insensitive */
{
    Tcl_UniChar ch1 = 0, p;

    while (1) {
	p = *uniPattern;

	/*
	 * See if we're at the end of both the pattern and the string. If so,
	 * we succeeded. If we're at the end of the pattern but not at the end
	 * of the string, we failed. An exhausted string can only still match
	 * if the next pattern character is "*".
	 */

	if (p == 0) {
	    return (*uniStr == 0);
	}
	if ((*uniStr == 0) && (p != '*')) {
	    return 0;
	}

	/*
	 * Check for a "*" as the next pattern character. It matches any
	 * substring. We handle this by skipping all the characters up to the
	 * next matching one in the pattern, and then calling ourselves
	 * recursively for each postfix of string, until either we match or we
	 * reach the end of the string.
	 */

	if (p == '*') {
	    /*
	     * Skip all successive *'s in the pattern
	     */

	    while (*(++uniPattern) == '*') {
		/* empty body */
	    }
	    p = *uniPattern;
	    if (p == 0) {
		/*
		 * The pattern ends in "*": whatever is left of the string
		 * matches.
		 */
		return 1;
	    }
	    if (nocase) {
		p = Tcl_UniCharToLower(p);
	    }
	    while (1) {
		/*
		 * Optimization for matching - cruise through the string
		 * quickly if the next char in the pattern isn't a special
		 * character
		 */

		if ((p != '[') && (p != '?') && (p != '\\')) {
		    if (nocase) {
			while (*uniStr && (p != *uniStr)
				&& (p != Tcl_UniCharToLower(*uniStr))) {
			    uniStr++;
			}
		    } else {
			while (*uniStr && (p != *uniStr)) {
			    uniStr++;
			}
		    }
		}
		if (Tcl_UniCharCaseMatch(uniStr, uniPattern, nocase)) {
		    return 1;
		}
		if (*uniStr == 0) {
		    return 0;
		}
		uniStr++;
	    }
	}

	/*
	 * Check for a "?" as the next pattern character. It matches any
	 * single character. The string is known not to be exhausted here
	 * because of the end-of-string test at the top of the loop.
	 */

	if (p == '?') {
	    uniPattern++;
	    uniStr++;
	    continue;
	}

	/*
	 * Check for a "[" as the next pattern character. It is followed by a
	 * list of characters that are acceptable, or by a range (two
	 * characters separated by "-").
	 */

	if (p == '[') {
	    Tcl_UniChar startChar, endChar;

	    uniPattern++;
	    ch1 = (nocase ? Tcl_UniCharToLower(*uniStr) : *uniStr);
	    uniStr++;
	    while (1) {
		/*
		 * Reaching the closing bracket or the end of the pattern
		 * means no member of the set matched ch1.
		 */

		if ((*uniPattern == ']') || (*uniPattern == 0)) {
		    return 0;
		}
		startChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
			: *uniPattern);
		uniPattern++;
		if (*uniPattern == '-') {
		    uniPattern++;
		    if (*uniPattern == 0) {
			return 0;
		    }
		    endChar = (nocase ? Tcl_UniCharToLower(*uniPattern)
			    : *uniPattern);
		    uniPattern++;
		    if (((startChar <= ch1) && (ch1 <= endChar))
			    || ((endChar <= ch1) && (ch1 <= startChar))) {
			/*
			 * Matches ranges of form [a-z] or [z-a].
			 */
			break;
		    }
		} else if (startChar == ch1) {
		    break;
		}
	    }

	    /*
	     * A member matched: skip the rest of the set. If the pattern
	     * ends before a "]" is found, back up one element so that the
	     * increment below leaves uniPattern on the terminating zero.
	     */

	    while (*uniPattern != ']') {
		if (*uniPattern == 0) {
		    uniPattern--;
		    break;
		}
		uniPattern++;
	    }
	    uniPattern++;
	    continue;
	}

	/*
	 * If the next pattern character is '\', just strip off the '\' so we
	 * do exact matching on the character that follows. A pattern that
	 * ends right after the '\' never matches.
	 */

	if (p == '\\') {
	    if (*(++uniPattern) == '\0') {
		return 0;
	    }
	}

	/*
	 * There's no special character. Just make sure that the next
	 * characters of each string match.
	 */

	if (nocase) {
	    if (Tcl_UniCharToLower(*uniStr) !=
		    Tcl_UniCharToLower(*uniPattern)) {
		return 0;
	    }
	} else if (*uniStr != *uniPattern) {
	    return 0;
	}
	uniStr++;
	uniPattern++;
    }
}

/*
 *----------------------------------------------------------------------
 *
 * TclUniCharMatch --
 *
 *	See if a particular Unicode string matches a particular pattern.
 *	Allows case insensitivity.
This is the Unicode equivalent of the char* 1952 * Tcl_StringCaseMatch. This variant of Tcl_UniCharCaseMatch uses counted 1953 * Strings, so embedded NULLs are allowed. 1954 * 1955 * Results: 1956 * The return value is 1 if string matches pattern, and 0 otherwise. The 1957 * matching operation permits the following special characters in the 1958 * pattern: *?\[] (see the manual entry for details on what these mean). 1959 * 1960 * Side effects: 1961 * None. 1962 * 1963 *---------------------------------------------------------------------- 1964 */ 1965 1966 int 1967 TclUniCharMatch( 1968 const Tcl_UniChar *string, /* Unicode String. */ 1969 int strLen, /* Length of String */ 1970 const Tcl_UniChar *pattern, /* Pattern, which may contain special 1971 * characters. */ 1972 int ptnLen, /* Length of Pattern */ 1973 int nocase) /* 0 for case sensitive, 1 for insensitive */ 1974 { 1975 const Tcl_UniChar *stringEnd, *patternEnd; 1976 Tcl_UniChar p; 1977 1978 stringEnd = string + strLen; 1979 patternEnd = pattern + ptnLen; 1980 1981 while (1) { 1982 /* 1983 * See if we're at the end of both the pattern and the string. If so, 1984 * we succeeded. If we're at the end of the pattern but not at the end 1985 * of the string, we failed. 1986 */ 1987 1988 if (pattern == patternEnd) { 1989 return (string == stringEnd); 1990 } 1991 p = *pattern; 1992 if ((string == stringEnd) && (p != '*')) { 1993 return 0; 1994 } 1995 1996 /* 1997 * Check for a "*" as the next pattern character. It matches any 1998 * substring. We handle this by skipping all the characters up to the 1999 * next matching one in the pattern, and then calling ourselves 2000 * recursively for each postfix of string, until either we match or we 2001 * reach the end of the string. 2002 */ 2003 2004 if (p == '*') { 2005 /* 2006 * Skip all successive *'s in the pattern. 
2007 */ 2008 2009 while (*(++pattern) == '*') { 2010 /* empty body */ 2011 } 2012 if (pattern == patternEnd) { 2013 return 1; 2014 } 2015 p = *pattern; 2016 if (nocase) { 2017 p = Tcl_UniCharToLower(p); 2018 } 2019 while (1) { 2020 /* 2021 * Optimization for matching - cruise through the string 2022 * quickly if the next char in the pattern isn't a special 2023 * character. 2024 */ 2025 2026 if ((p != '[') && (p != '?') && (p != '\\')) { 2027 if (nocase) { 2028 while ((string < stringEnd) && (p != *string) 2029 && (p != Tcl_UniCharToLower(*string))) { 2030 string++; 2031 } 2032 } else { 2033 while ((string < stringEnd) && (p != *string)) { 2034 string++; 2035 } 2036 } 2037 } 2038 if (TclUniCharMatch(string, stringEnd - string, 2039 pattern, patternEnd - pattern, nocase)) { 2040 return 1; 2041 } 2042 if (string == stringEnd) { 2043 return 0; 2044 } 2045 string++; 2046 } 2047 } 2048 2049 /* 2050 * Check for a "?" as the next pattern character. It matches any 2051 * single character. 2052 */ 2053 2054 if (p == '?') { 2055 pattern++; 2056 string++; 2057 continue; 2058 } 2059 2060 /* 2061 * Check for a "[" as the next pattern character. It is followed by a 2062 * list of characters that are acceptable, or by a range (two 2063 * characters separated by "-"). 2064 */ 2065 2066 if (p == '[') { 2067 Tcl_UniChar ch1, startChar, endChar; 2068 2069 pattern++; 2070 ch1 = (nocase ? Tcl_UniCharToLower(*string) : *string); 2071 string++; 2072 while (1) { 2073 if ((*pattern == ']') || (pattern == patternEnd)) { 2074 return 0; 2075 } 2076 startChar = (nocase ? Tcl_UniCharToLower(*pattern) : *pattern); 2077 pattern++; 2078 if (*pattern == '-') { 2079 pattern++; 2080 if (pattern == patternEnd) { 2081 return 0; 2082 } 2083 endChar = (nocase ? Tcl_UniCharToLower(*pattern) 2084 : *pattern); 2085 pattern++; 2086 if (((startChar <= ch1) && (ch1 <= endChar)) 2087 || ((endChar <= ch1) && (ch1 <= startChar))) { 2088 /* 2089 * Matches ranges of form [a-z] or [z-a]. 
2090 */ 2091 break; 2092 } 2093 } else if (startChar == ch1) { 2094 break; 2095 } 2096 } 2097 while (*pattern != ']') { 2098 if (pattern == patternEnd) { 2099 pattern--; 2100 break; 2101 } 2102 pattern++; 2103 } 2104 pattern++; 2105 continue; 2106 } 2107 2108 /* 2109 * If the next pattern character is '\', just strip off the '\' so we 2110 * do exact matching on the character that follows. 2111 */ 2112 2113 if (p == '\\') { 2114 if (++pattern == patternEnd) { 2115 return 0; 2116 } 2117 } 2118 2119 /* 2120 * There's no special character. Just make sure that the next bytes of 2121 * each string match. 2122 */ 2123 2124 if (nocase) { 2125 if (Tcl_UniCharToLower(*string) != Tcl_UniCharToLower(*pattern)) { 2126 return 0; 2127 } 2128 } else if (*string != *pattern) { 2129 return 0; 2130 } 2131 string++; 2132 pattern++; 2133 } 2134 } 2135 2136 /* 2137 * Local Variables: 2138 * mode: c 2139 * c-basic-offset: 4 2140 * fill-column: 78 2141 * End: 2142 */