source4/lib/charset/util_unistr.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "system/locale.h"
  23 #include "dynconfig.h"
  24
  25 /**
  26  * @file
  27  * @brief Unicode string manipulation
  28  */
  29
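/* these 2 tables define the unicode case handling.  They are loaded
   lazily by load_case_tables() the first time a non-ASCII codepoint is
   case-converted.  NULL means "not loaded yet"; (void *)-1 means the
   table could not be loaded, in which case codepoints are returned
   unchanged */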
  32 static void *upcase_table;
  33 static void *lowcase_table;
  34
  35
  36 /*******************************************************************
  37 load the case handling tables
  38 ********************************************************************/
  39 static void load_case_tables(void)
  40 {
  41         TALLOC_CTX *mem_ctx;
  42
  43         mem_ctx = talloc_init("load_case_tables");
  44         if (!mem_ctx) {
  45                 smb_panic("No memory for case_tables");
  46         }
  47         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
  48         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
  49         talloc_free(mem_ctx);
  50         if (upcase_table == NULL) {
  51                 /* try also under codepages for testing purposes */
  52                 upcase_table = map_file("codepages/upcase.dat", 0x20000);
  53                 if (upcase_table == NULL) {
  54                         upcase_table = (void *)-1;
  55                 }
  56         }
  57         if (lowcase_table == NULL) {
  58                 /* try also under codepages for testing purposes */
  59                 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
  60                 if (lowcase_table == NULL) {
  61                         lowcase_table = (void *)-1;
  62                 }
  63         }
  64 }
  65
  66 /**
  67  Convert a codepoint_t to upper case.
  68 **/
  69 codepoint_t toupper_w(codepoint_t val)
  70 {
  71         if (val < 128) {
  72                 return toupper(val);
  73         }
  74         if (upcase_table == NULL) {
  75                 load_case_tables();
  76         }
  77         if (upcase_table == (void *)-1) {
  78                 return val;
  79         }
  80         if (val & 0xFFFF0000) {
  81                 return val;
  82         }
  83         return SVAL(upcase_table, val*2);
  84 }
  85
  86 /**
  87  Convert a codepoint_t to lower case.
  88 **/
  89 codepoint_t tolower_w(codepoint_t val)
  90 {
  91         if (val < 128) {
  92                 return tolower(val);
  93         }
  94         if (lowcase_table == NULL) {
  95                 load_case_tables();
  96         }
  97         if (lowcase_table == (void *)-1) {
  98                 return val;
  99         }
 100         if (val & 0xFFFF0000) {
 101                 return val;
 102         }
 103         return SVAL(lowcase_table, val*2);
 104 }
 105
 106 /**
 107   compare two codepoints case insensitively
 108 */
 109 int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 110 {
 111         if (c1 == c2 ||
 112             toupper_w(c1) == toupper_w(c2)) {
 113                 return 0;
 114         }
 115         return c1 - c2;
 116 }
 117
 118 /**
 119  Case insensitive string compararison
 120 **/
 121 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
 122 {
 123         codepoint_t c1=0, c2=0;
 124         size_t size1, size2;
 125
 126         /* handle null ptr comparisons to simplify the use in qsort */
 127         if (s1 == s2) return 0;
 128         if (s1 == NULL) return -1;
 129         if (s2 == NULL) return 1;
 130
 131         while (*s1 && *s2) {
 132                 c1 = next_codepoint(s1, &size1);
 133                 c2 = next_codepoint(s2, &size2);
 134
 135                 s1 += size1;
 136                 s2 += size2;
 137
 138                 if (c1 == c2) {
 139                         continue;
 140                 }
 141
 142                 if (c1 == INVALID_CODEPOINT ||
 143                     c2 == INVALID_CODEPOINT) {
 144                         /* what else can we do?? */
 145                         return strcasecmp(s1, s2);
 146                 }
 147
 148                 if (toupper_w(c1) != toupper_w(c2)) {
 149                         return c1 - c2;
 150                 }
 151         }
 152
 153         return *s1 - *s2;
 154 }
 155
 156 /**
 157  * Get the next token from a string, return False if none found.
 158  * Handles double-quotes.
 159  *
 160  * Based on a routine by GJC@VILLAGE.COM.
 161  * Extensively modified by Andrew.Tridgell@anu.edu.au
 162  **/
 163 _PUBLIC_ BOOL next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
 164 {
 165         const char *s;
 166         BOOL quoted;
 167         size_t len=1;
 168
 169         if (!ptr)
 170                 return(False);
 171
 172         s = *ptr;
 173
 174         /* default to simple separators */
 175         if (!sep)
 176                 sep = " \t\n\r";
 177
 178         /* find the first non sep char */
 179         while (*s && strchr_m(sep,*s))
 180                 s++;
 181
 182         /* nothing left? */
 183         if (! *s)
 184                 return(False);
 185
 186         /* copy over the token */
 187         for (quoted = False; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
 188                 if (*s == '\"') {
 189                         quoted = !quoted;
 190                 } else {
 191                         len++;
 192                         *buff++ = *s;
 193                 }
 194         }
 195
 196         *ptr = (*s) ? s+1 : s;
 197         *buff = 0;
 198
 199         return(True);
 200 }
 201
 202 /**
 203  Case insensitive string compararison, length limited
 204 **/
 205 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 206 {
 207         codepoint_t c1=0, c2=0;
 208         size_t size1, size2;
 209
 210         /* handle null ptr comparisons to simplify the use in qsort */
 211         if (s1 == s2) return 0;
 212         if (s1 == NULL) return -1;
 213         if (s2 == NULL) return 1;
 214
 215         while (*s1 && *s2 && n) {
 216                 n--;
 217
 218                 c1 = next_codepoint(s1, &size1);
 219                 c2 = next_codepoint(s2, &size2);
 220
 221                 s1 += size1;
 222                 s2 += size2;
 223
 224                 if (c1 == c2) {
 225                         continue;
 226                 }
 227
 228                 if (c1 == INVALID_CODEPOINT ||
 229                     c2 == INVALID_CODEPOINT) {
 230                         /* what else can we do?? */
 231                         return strcasecmp(s1, s2);
 232                 }
 233
 234                 if (toupper_w(c1) != toupper_w(c2)) {
 235                         return c1 - c2;
 236                 }
 237         }
 238
 239         if (n == 0) {
 240                 return 0;
 241         }
 242
 243         return *s1 - *s2;
 244 }
 245
 246 /**
 247  * Compare 2 strings.
 248  *
 249  * @note The comparison is case-insensitive.
 250  **/
 251 _PUBLIC_ BOOL strequal_w(const char *s1, const char *s2)
 252 {
 253         return strcasecmp_m(s1,s2) == 0;
 254 }
 255
 256 /**
 257  Compare 2 strings (case sensitive).
 258 **/
 259 _PUBLIC_ BOOL strcsequal_w(const char *s1,const char *s2)
 260 {
 261         if (s1 == s2)
 262                 return(True);
 263         if (!s1 || !s2)
 264                 return(False);
 265
 266         return strcmp(s1,s2) == 0;
 267 }
 268
 269
 270 /**
 271  String replace.
 272  NOTE: oldc and newc must be 7 bit characters
 273 **/
 274 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
 275 {
 276         while (s && *s) {
 277                 size_t size;
 278                 codepoint_t c = next_codepoint(s, &size);
 279                 if (c == oldc) {
 280                         *s = newc;
 281                 }
 282                 s += size;
 283         }
 284 }
 285
 286 /**
 287  Paranoid strcpy into a buffer of given length (includes terminating
 288  zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
 289  and replaces with '_'. Deliberately does *NOT* check for multibyte
 290  characters. Don't change it !
 291 **/
 292
 293 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
 294 {
 295         size_t len, i;
 296
 297         if (maxlength == 0) {
 298                 /* can't fit any bytes at all! */
 299                 return NULL;
 300         }
 301
 302         if (!dest) {
 303                 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
 304                 return NULL;
 305         }
 306
 307         if (!src) {
 308                 *dest = 0;
 309                 return dest;
 310         }
 311
 312         len = strlen(src);
 313         if (len >= maxlength)
 314                 len = maxlength - 1;
 315
 316         if (!other_safe_chars)
 317                 other_safe_chars = "";
 318
 319         for(i = 0; i < len; i++) {
 320                 int val = (src[i] & 0xff);
 321                 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
 322                         dest[i] = src[i];
 323                 else
 324                         dest[i] = '_';
 325         }
 326
 327         dest[i] = '\0';
 328
 329         return dest;
 330 }
 331
 332 /**
 333  Count the number of UCS2 characters in a string. Normally this will
 334  be the same as the number of bytes in a string for single byte strings,
 335  but will be different for multibyte.
 336 **/
 337 _PUBLIC_ size_t strlen_m(const char *s)
 338 {
 339         size_t count = 0;
 340
 341         if (!s) {
 342                 return 0;
 343         }
 344
 345         while (*s && !(((uint8_t)*s) & 0x80)) {
 346                 s++;
 347                 count++;
 348         }
 349
 350         if (!*s) {
 351                 return count;
 352         }
 353
 354         while (*s) {
 355                 size_t c_size;
 356                 codepoint_t c = next_codepoint(s, &c_size);
 357                 if (c < 0x10000) {
 358                         count += 1;
 359                 } else {
 360                         count += 2;
 361                 }
 362                 s += c_size;
 363         }
 364
 365         return count;
 366 }
 367
 368 /**
 369    Work out the number of multibyte chars in a string, including the NULL
 370    terminator.
 371 **/
 372 _PUBLIC_ size_t strlen_m_term(const char *s)
 373 {
 374         if (!s) {
 375                 return 0;
 376         }
 377
 378         return strlen_m(s) + 1;
 379 }
 380
 381 /**
 382  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 383 **/
 384 _PUBLIC_ char *strchr_m(const char *s, char c)
 385 {
 386         /* characters below 0x3F are guaranteed to not appear in
 387            non-initial position in multi-byte charsets */
 388         if ((c & 0xC0) == 0) {
 389                 return strchr(s, c);
 390         }
 391
 392         while (*s) {
 393                 size_t size;
 394                 codepoint_t c2 = next_codepoint(s, &size);
 395                 if (c2 == c) {
 396                         return discard_const(s);
 397                 }
 398                 s += size;
 399         }
 400
 401         return NULL;
 402 }
 403
 404 /**
 405  * Multibyte-character version of strrchr
 406  */
 407 _PUBLIC_ char *strrchr_m(const char *s, char c)
 408 {
 409         char *ret = NULL;
 410
 411         /* characters below 0x3F are guaranteed to not appear in
 412            non-initial position in multi-byte charsets */
 413         if ((c & 0xC0) == 0) {
 414                 return strrchr(s, c);
 415         }
 416
 417         while (*s) {
 418                 size_t size;
 419                 codepoint_t c2 = next_codepoint(s, &size);
 420                 if (c2 == c) {
 421                         ret = discard_const(s);
 422                 }
 423                 s += size;
 424         }
 425
 426         return ret;
 427 }
 428
 429 /**
 430   return True if any (multi-byte) character is lower case
 431 */
 432 _PUBLIC_ BOOL strhaslower(const char *string)
 433 {
 434         while (*string) {
 435                 size_t c_size;
 436                 codepoint_t s;
 437                 codepoint_t t;
 438
 439                 s = next_codepoint(string, &c_size);
 440                 string += c_size;
 441
 442                 t = toupper_w(s);
 443
 444                 if (s != t) {
 445                         return True; /* that means it has lower case chars */
 446                 }
 447         }
 448
 449         return False;
 450 }
 451
 452 /**
 453   return True if any (multi-byte) character is upper case
 454 */
 455 _PUBLIC_ BOOL strhasupper(const char *string)
 456 {
 457         while (*string) {
 458                 size_t c_size;
 459                 codepoint_t s;
 460                 codepoint_t t;
 461
 462                 s = next_codepoint(string, &c_size);
 463                 string += c_size;
 464
 465                 t = tolower_w(s);
 466
 467                 if (s != t) {
 468                         return True; /* that means it has upper case chars */
 469                 }
 470         }
 471
 472         return False;
 473 }
 474
 475 /**
 476  Convert a string to lower case, allocated with talloc
 477 **/
 478 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
 479 {
 480         size_t size=0;
 481         char *dest;
 482
 483         /* this takes advantage of the fact that upper/lower can't
 484            change the length of a character by more than 1 byte */
 485         dest = talloc_size(ctx, 2*(strlen(src))+1);
 486         if (dest == NULL) {
 487                 return NULL;
 488         }
 489
 490         while (*src) {
 491                 size_t c_size;
 492                 codepoint_t c = next_codepoint(src, &c_size);
 493                 src += c_size;
 494
 495                 c = tolower_w(c);
 496
 497                 c_size = push_codepoint(dest+size, c);
 498                 if (c_size == -1) {
 499                         talloc_free(dest);
 500                         return NULL;
 501                 }
 502                 size += c_size;
 503         }
 504
 505         dest[size] = 0;
 506
 507         /* trim it so talloc_append_string() works */
 508         dest = talloc_realloc_size(ctx, dest, size+1);
 509
 510         talloc_set_name_const(dest, dest);
 511
 512         return dest;
 513 }
 514
 515 /**
 516  Convert a string to UPPER case, allocated with talloc
 517 **/
 518 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
 519 {
 520         size_t size=0;
 521         char *dest;
 522
 523         if (!src) {
 524                 return NULL;
 525         }
 526
 527         /* this takes advantage of the fact that upper/lower can't
 528            change the length of a character by more than 1 byte */
 529         dest = talloc_size(ctx, 2*(strlen(src))+1);
 530         if (dest == NULL) {
 531                 return NULL;
 532         }
 533
 534         while (*src) {
 535                 size_t c_size;
 536                 codepoint_t c = next_codepoint(src, &c_size);
 537                 src += c_size;
 538
 539                 c = toupper_w(c);
 540
 541                 c_size = push_codepoint(dest+size, c);
 542                 if (c_size == -1) {
 543                         talloc_free(dest);
 544                         return NULL;
 545                 }
 546                 size += c_size;
 547         }
 548
 549         dest[size] = 0;
 550
 551         /* trim it so talloc_append_string() works */
 552         dest = talloc_realloc_size(ctx, dest, size+1);
 553
 554         talloc_set_name_const(dest, dest);
 555
 556         return dest;
 557 }
 558
 559 /**
 560  Convert a string to lower case.
 561 **/
 562 _PUBLIC_ void strlower_m(char *s)
 563 {
 564         char *d;
 565
 566         /* this is quite a common operation, so we want it to be
 567            fast. We optimise for the ascii case, knowing that all our
 568            supported multi-byte character sets are ascii-compatible
 569            (ie. they match for the first 128 chars) */
 570         while (*s && !(((uint8_t)*s) & 0x80)) {
 571                 *s = tolower((uint8_t)*s);
 572                 s++;
 573         }
 574
 575         if (!*s)
 576                 return;
 577
 578         d = s;
 579
 580         while (*s) {
 581                 size_t c_size, c_size2;
 582                 codepoint_t c = next_codepoint(s, &c_size);
 583                 c_size2 = push_codepoint(d, tolower_w(c));
 584                 if (c_size2 > c_size) {
 585                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
 586                                  c, tolower_w(c), (int)c_size, (int)c_size2));
 587                         smb_panic("codepoint expansion in strlower_m\n");
 588                 }
 589                 s += c_size;
 590                 d += c_size2;
 591         }
 592         *d = 0;
 593 }
 594
 595 /**
 596  Convert a string to UPPER case.
 597 **/
 598 _PUBLIC_ void strupper_m(char *s)
 599 {
 600         char *d;
 601
 602         /* this is quite a common operation, so we want it to be
 603            fast. We optimise for the ascii case, knowing that all our
 604            supported multi-byte character sets are ascii-compatible
 605            (ie. they match for the first 128 chars) */
 606         while (*s && !(((uint8_t)*s) & 0x80)) {
 607                 *s = toupper((uint8_t)*s);
 608                 s++;
 609         }
 610
 611         if (!*s)
 612                 return;
 613
 614         d = s;
 615
 616         while (*s) {
 617                 size_t c_size, c_size2;
 618                 codepoint_t c = next_codepoint(s, &c_size);
 619                 c_size2 = push_codepoint(d, toupper_w(c));
 620                 if (c_size2 > c_size) {
 621                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
 622                                  c, toupper_w(c), (int)c_size, (int)c_size2));
 623                         smb_panic("codepoint expansion in strupper_m\n");
 624                 }
 625                 s += c_size;
 626                 d += c_size2;
 627         }
 628         *d = 0;
 629 }
 630
 631
 632 /**
 633  Find the number of 'c' chars in a string
 634 **/
 635 _PUBLIC_ size_t count_chars_w(const char *s, char c)
 636 {
 637         size_t count = 0;
 638
 639         while (*s) {
 640                 size_t size;
 641                 codepoint_t c2 = next_codepoint(s, &size);
 642                 if (c2 == c) count++;
 643                 s += size;
 644         }
 645
 646         return count;
 647 }
 648
 649