source/lib/charset/util_unistr.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20 */
  21
  22 #include "includes.h"
  23 #include "system/iconv.h"
  24
  25 /**
  26  * @file
  27  * @brief Unicode string manipulation
  28  */
  29
/* these 2 tables define the unicode case handling.  They are loaded
   on first use by load_case_tables(); a value of (void *)-1 marks a
   table that could not be loaded */
  32 static void *upcase_table;
  33 static void *lowcase_table;
  34
  35
  36 /*******************************************************************
  37 load the case handling tables
  38 ********************************************************************/
  39 static void load_case_tables(void)
  40 {
  41         TALLOC_CTX *mem_ctx;
  42
  43         mem_ctx = talloc_init("load_case_tables");
  44         if (!mem_ctx) {
  45                 smb_panic("No memory for case_tables");
  46         }
  47         upcase_table = map_file(data_path(mem_ctx, "upcase.dat"), 0x20000);
  48         lowcase_table = map_file(data_path(mem_ctx, "lowcase.dat"), 0x20000);
  49         talloc_free(mem_ctx);
  50         if (upcase_table == NULL) {
  51                 /* try also under codepages for testing purposes */
  52                 upcase_table = map_file("codepages/upcase.dat", 0x20000);
  53                 if (upcase_table == NULL) {
  54                         upcase_table = (void *)-1;
  55                 }
  56         }
  57         if (lowcase_table == NULL) {
  58                 /* try also under codepages for testing purposes */
  59                 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
  60                 if (lowcase_table == NULL) {
  61                         lowcase_table = (void *)-1;
  62                 }
  63         }
  64 }
  65
  66 /**
  67  Convert a codepoint_t to upper case.
  68 **/
  69 codepoint_t toupper_w(codepoint_t val)
  70 {
  71         if (val < 128) {
  72                 return toupper(val);
  73         }
  74         if (upcase_table == NULL) {
  75                 load_case_tables();
  76         }
  77         if (upcase_table == (void *)-1) {
  78                 return val;
  79         }
  80         if (val & 0xFFFF0000) {
  81                 return val;
  82         }
  83         return SVAL(upcase_table, val*2);
  84 }
  85
  86 /**
  87  Convert a codepoint_t to lower case.
  88 **/
  89 codepoint_t tolower_w(codepoint_t val)
  90 {
  91         if (val < 128) {
  92                 return tolower(val);
  93         }
  94         if (lowcase_table == NULL) {
  95                 load_case_tables();
  96         }
  97         if (lowcase_table == (void *)-1) {
  98                 return val;
  99         }
 100         if (val & 0xFFFF0000) {
 101                 return val;
 102         }
 103         return SVAL(lowcase_table, val*2);
 104 }
 105
 106 /**
 107   compare two codepoints case insensitively
 108 */
 109 int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 110 {
 111         if (c1 == c2 ||
 112             toupper_w(c1) == toupper_w(c2)) {
 113                 return 0;
 114         }
 115         return c1 - c2;
 116 }
 117
 118 /**
 119  Case insensitive string compararison
 120 **/
 121 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
 122 {
 123         codepoint_t c1=0, c2=0;
 124         size_t size1, size2;
 125
 126         while (*s1 && *s2) {
 127                 c1 = next_codepoint(s1, &size1);
 128                 c2 = next_codepoint(s2, &size2);
 129
 130                 s1 += size1;
 131                 s2 += size2;
 132
 133                 if (c1 == c2) {
 134                         continue;
 135                 }
 136
 137                 if (c1 == INVALID_CODEPOINT ||
 138                     c2 == INVALID_CODEPOINT) {
 139                         /* what else can we do?? */
 140                         return strcasecmp(s1, s2);
 141                 }
 142
 143                 if (toupper_w(c1) != toupper_w(c2)) {
 144                         return c1 - c2;
 145                 }
 146         }
 147
 148         return *s1 - *s2;
 149 }
 150
 151 /**
 152  * Get the next token from a string, return False if none found.
 153  * Handles double-quotes.
 154  *
 155  * Based on a routine by GJC@VILLAGE.COM.
 156  * Extensively modified by Andrew.Tridgell@anu.edu.au
 157  **/
 158 _PUBLIC_ BOOL next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
 159 {
 160         const char *s;
 161         BOOL quoted;
 162         size_t len=1;
 163
 164         if (!ptr)
 165                 return(False);
 166
 167         s = *ptr;
 168
 169         /* default to simple separators */
 170         if (!sep)
 171                 sep = " \t\n\r";
 172
 173         /* find the first non sep char */
 174         while (*s && strchr_m(sep,*s))
 175                 s++;
 176
 177         /* nothing left? */
 178         if (! *s)
 179                 return(False);
 180
 181         /* copy over the token */
 182         for (quoted = False; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
 183                 if (*s == '\"') {
 184                         quoted = !quoted;
 185                 } else {
 186                         len++;
 187                         *buff++ = *s;
 188                 }
 189         }
 190
 191         *ptr = (*s) ? s+1 : s;
 192         *buff = 0;
 193
 194         return(True);
 195 }
 196
 197 /**
 198  Case insensitive string compararison, length limited
 199 **/
 200 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 201 {
 202         codepoint_t c1=0, c2=0;
 203         size_t size1, size2;
 204
 205         while (*s1 && *s2 && n) {
 206                 n--;
 207
 208                 c1 = next_codepoint(s1, &size1);
 209                 c2 = next_codepoint(s2, &size2);
 210
 211                 s1 += size1;
 212                 s2 += size2;
 213
 214                 if (c1 == c2) {
 215                         continue;
 216                 }
 217
 218                 if (c1 == INVALID_CODEPOINT ||
 219                     c2 == INVALID_CODEPOINT) {
 220                         /* what else can we do?? */
 221                         return strcasecmp(s1, s2);
 222                 }
 223
 224                 if (toupper_w(c1) != toupper_w(c2)) {
 225                         return c1 - c2;
 226                 }
 227         }
 228
 229         if (n == 0) {
 230                 return 0;
 231         }
 232
 233         return *s1 - *s2;
 234 }
 235
 236 /**
 237  * Compare 2 strings.
 238  *
 239  * @note The comparison is case-insensitive.
 240  **/
 241 _PUBLIC_ BOOL strequal_w(const char *s1, const char *s2)
 242 {
 243         if (s1 == s2)
 244                 return(True);
 245         if (!s1 || !s2)
 246                 return(False);
 247
 248         return strcasecmp_m(s1,s2) == 0;
 249 }
 250
 251 /**
 252  Compare 2 strings (case sensitive).
 253 **/
 254 _PUBLIC_ BOOL strcsequal_w(const char *s1,const char *s2)
 255 {
 256         if (s1 == s2)
 257                 return(True);
 258         if (!s1 || !s2)
 259                 return(False);
 260
 261         return strcmp(s1,s2) == 0;
 262 }
 263
 264
 265 /**
 266  String replace.
 267  NOTE: oldc and newc must be 7 bit characters
 268 **/
 269 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
 270 {
 271         for (; s && *s; s++) {
 272                 size_t size;
 273                 codepoint_t c = next_codepoint(s, &size);
 274                 if (c == oldc) {
 275                         *s = newc;
 276                 }
 277                 s += size;
 278         }
 279 }
 280
 281 /**
 282  Paranoid strcpy into a buffer of given length (includes terminating
 283  zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
 284  and replaces with '_'. Deliberately does *NOT* check for multibyte
 285  characters. Don't change it !
 286 **/
 287
 288 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
 289 {
 290         size_t len, i;
 291
 292         if (maxlength == 0) {
 293                 /* can't fit any bytes at all! */
 294                 return NULL;
 295         }
 296
 297         if (!dest) {
 298                 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
 299                 return NULL;
 300         }
 301
 302         if (!src) {
 303                 *dest = 0;
 304                 return dest;
 305         }
 306
 307         len = strlen(src);
 308         if (len >= maxlength)
 309                 len = maxlength - 1;
 310
 311         if (!other_safe_chars)
 312                 other_safe_chars = "";
 313
 314         for(i = 0; i < len; i++) {
 315                 int val = (src[i] & 0xff);
 316                 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
 317                         dest[i] = src[i];
 318                 else
 319                         dest[i] = '_';
 320         }
 321
 322         dest[i] = '\0';
 323
 324         return dest;
 325 }
 326
 327 /**
 328  Count the number of UCS2 characters in a string. Normally this will
 329  be the same as the number of bytes in a string for single byte strings,
 330  but will be different for multibyte.
 331 **/
 332 _PUBLIC_ size_t strlen_m(const char *s)
 333 {
 334         size_t count = 0;
 335
 336         if (!s) {
 337                 return 0;
 338         }
 339
 340         while (*s && !(((uint8_t)*s) & 0x80)) {
 341                 s++;
 342                 count++;
 343         }
 344
 345         if (!*s) {
 346                 return count;
 347         }
 348
 349         while (*s) {
 350                 size_t c_size;
 351                 codepoint_t c = next_codepoint(s, &c_size);
 352                 if (c < 0x10000) {
 353                         count += 1;
 354                 } else {
 355                         count += 2;
 356                 }
 357                 s += c_size;
 358         }
 359
 360         return count;
 361 }
 362
 363 /**
 364    Work out the number of multibyte chars in a string, including the NULL
 365    terminator.
 366 **/
 367 _PUBLIC_ size_t strlen_m_term(const char *s)
 368 {
 369         if (!s) {
 370                 return 0;
 371         }
 372
 373         return strlen_m(s) + 1;
 374 }
 375
 376 /**
 377  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 378 **/
 379 _PUBLIC_ char *strchr_m(const char *s, char c)
 380 {
 381         /* characters below 0x3F are guaranteed to not appear in
 382            non-initial position in multi-byte charsets */
 383         if ((c & 0xC0) == 0) {
 384                 return strchr(s, c);
 385         }
 386
 387         while (*s) {
 388                 size_t size;
 389                 codepoint_t c2 = next_codepoint(s, &size);
 390                 if (c2 == c) {
 391                         return discard_const(s);
 392                 }
 393                 s += size;
 394         }
 395
 396         return NULL;
 397 }
 398
 399 /**
 400  * Multibyte-character version of strrchr
 401  */
 402 _PUBLIC_ char *strrchr_m(const char *s, char c)
 403 {
 404         char *ret = NULL;
 405
 406         /* characters below 0x3F are guaranteed to not appear in
 407            non-initial position in multi-byte charsets */
 408         if ((c & 0xC0) == 0) {
 409                 return strrchr(s, c);
 410         }
 411
 412         while (*s) {
 413                 size_t size;
 414                 codepoint_t c2 = next_codepoint(s, &size);
 415                 if (c2 == c) {
 416                         ret = discard_const(s);
 417                 }
 418                 s += size;
 419         }
 420
 421         return ret;
 422 }
 423
 424 /**
 425   return True if any (multi-byte) character is lower case
 426 */
 427 _PUBLIC_ BOOL strhaslower(const char *string)
 428 {
 429         while (*string) {
 430                 size_t c_size;
 431                 codepoint_t s;
 432                 codepoint_t t;
 433
 434                 s = next_codepoint(string, &c_size);
 435                 string += c_size;
 436
 437                 t = toupper_w(s);
 438
 439                 if (s != t) {
 440                         return True; /* that means it has lower case chars */
 441                 }
 442         }
 443
 444         return False;
 445 }
 446
 447 /**
 448   return True if any (multi-byte) character is upper case
 449 */
 450 _PUBLIC_ BOOL strhasupper(const char *string)
 451 {
 452         while (*string) {
 453                 size_t c_size;
 454                 codepoint_t s;
 455                 codepoint_t t;
 456
 457                 s = next_codepoint(string, &c_size);
 458                 string += c_size;
 459
 460                 t = tolower_w(s);
 461
 462                 if (s != t) {
 463                         return True; /* that means it has upper case chars */
 464                 }
 465         }
 466
 467         return False;
 468 }
 469
 470 /**
 471  Convert a string to lower case, allocated with talloc
 472 **/
 473 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
 474 {
 475         size_t size=0;
 476         char *dest;
 477
 478         /* this takes advantage of the fact that upper/lower can't
 479            change the length of a character by more than 1 byte */
 480         dest = talloc_size(ctx, 2*(strlen(src))+1);
 481         if (dest == NULL) {
 482                 return NULL;
 483         }
 484
 485         while (*src) {
 486                 size_t c_size;
 487                 codepoint_t c = next_codepoint(src, &c_size);
 488                 src += c_size;
 489
 490                 c = tolower_w(c);
 491
 492                 c_size = push_codepoint(dest+size, c);
 493                 if (c_size == -1) {
 494                         talloc_free(dest);
 495                         return NULL;
 496                 }
 497                 size += c_size;
 498         }
 499
 500         dest[size] = 0;
 501
 502         return dest;
 503 }
 504
 505 /**
 506  Convert a string to UPPER case, allocated with talloc
 507 **/
 508 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
 509 {
 510         size_t size=0;
 511         char *dest;
 512
 513         if (!src) {
 514                 return NULL;
 515         }
 516
 517         /* this takes advantage of the fact that upper/lower can't
 518            change the length of a character by more than 1 byte */
 519         dest = talloc_size(ctx, 2*(strlen(src))+1);
 520         if (dest == NULL) {
 521                 return NULL;
 522         }
 523
 524         while (*src) {
 525                 size_t c_size;
 526                 codepoint_t c = next_codepoint(src, &c_size);
 527                 src += c_size;
 528
 529                 c = toupper_w(c);
 530
 531                 c_size = push_codepoint(dest+size, c);
 532                 if (c_size == -1) {
 533                         talloc_free(dest);
 534                         return NULL;
 535                 }
 536                 size += c_size;
 537         }
 538
 539         dest[size] = 0;
 540
 541         return dest;
 542 }
 543
 544 /**
 545  Convert a string to lower case.
 546 **/
 547 _PUBLIC_ void strlower_m(char *s)
 548 {
 549         char *d;
 550
 551         /* this is quite a common operation, so we want it to be
 552            fast. We optimise for the ascii case, knowing that all our
 553            supported multi-byte character sets are ascii-compatible
 554            (ie. they match for the first 128 chars) */
 555         while (*s && !(((uint8_t)*s) & 0x80)) {
 556                 *s = tolower((uint8_t)*s);
 557                 s++;
 558         }
 559
 560         if (!*s)
 561                 return;
 562
 563         d = s;
 564
 565         while (*s) {
 566                 size_t c_size, c_size2;
 567                 codepoint_t c = next_codepoint(s, &c_size);
 568                 c_size2 = push_codepoint(d, tolower_w(c));
 569                 if (c_size2 > c_size) {
 570                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
 571                                  c, tolower_w(c), (int)c_size, (int)c_size2));
 572                         smb_panic("codepoint expansion in strlower_m\n");
 573                 }
 574                 s += c_size;
 575                 d += c_size2;
 576         }
 577         *d = 0;
 578 }
 579
 580 /**
 581  Convert a string to UPPER case.
 582 **/
 583 _PUBLIC_ void strupper_m(char *s)
 584 {
 585         char *d;
 586
 587         /* this is quite a common operation, so we want it to be
 588            fast. We optimise for the ascii case, knowing that all our
 589            supported multi-byte character sets are ascii-compatible
 590            (ie. they match for the first 128 chars) */
 591         while (*s && !(((uint8_t)*s) & 0x80)) {
 592                 *s = toupper((uint8_t)*s);
 593                 s++;
 594         }
 595
 596         if (!*s)
 597                 return;
 598
 599         d = s;
 600
 601         while (*s) {
 602                 size_t c_size, c_size2;
 603                 codepoint_t c = next_codepoint(s, &c_size);
 604                 c_size2 = push_codepoint(d, toupper_w(c));
 605                 if (c_size2 > c_size) {
 606                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
 607                                  c, toupper_w(c), (int)c_size, (int)c_size2));
 608                         smb_panic("codepoint expansion in strupper_m\n");
 609                 }
 610                 s += c_size;
 611                 d += c_size2;
 612         }
 613         *d = 0;
 614 }
 615
 616
 617 /**
 618  Find the number of 'c' chars in a string
 619 **/
 620 _PUBLIC_ size_t count_chars_w(const char *s, char c)
 621 {
 622         size_t count = 0;
 623
 624         while (*s) {
 625                 size_t size;
 626                 codepoint_t c2 = next_codepoint(s, &size);
 627                 if (c2 == c) count++;
 628                 s += size;
 629         }
 630
 631         return count;
 632 }
 633
 634