source4/lib/charset/util_unistr.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "system/locale.h"
  23 #include "dynconfig/dynconfig.h"
  24 #include "param/param.h"
  25
  26 /**
  27  * @file
  28  * @brief Unicode string manipulation
  29  */
  30
  31 /* these 2 tables define the unicode case handling.  They are loaded
  32    at startup either via mmap() or read() from the lib directory */
  33 static void *upcase_table;
  34 static void *lowcase_table;
  35
  36
  37 /*******************************************************************
  38 load the case handling tables
  39 ********************************************************************/
  40 static void load_case_tables(void)
  41 {
  42         TALLOC_CTX *mem_ctx;
  43
  44         mem_ctx = talloc_init("load_case_tables");
  45         if (!mem_ctx) {
  46                 smb_panic("No memory for case_tables");
  47         }
  48         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
  49         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
  50         talloc_free(mem_ctx);
  51         if (upcase_table == NULL) {
  52                 /* try also under codepages for testing purposes */
  53                 upcase_table = map_file("codepages/upcase.dat", 0x20000);
  54                 if (upcase_table == NULL) {
  55                         upcase_table = (void *)-1;
  56                 }
  57         }
  58         if (lowcase_table == NULL) {
  59                 /* try also under codepages for testing purposes */
  60                 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
  61                 if (lowcase_table == NULL) {
  62                         lowcase_table = (void *)-1;
  63                 }
  64         }
  65 }
  66
  67 /**
  68  Convert a codepoint_t to upper case.
  69 **/
  70 _PUBLIC_ codepoint_t toupper_w(codepoint_t val)
  71 {
  72         if (val < 128) {
  73                 return toupper(val);
  74         }
  75         if (upcase_table == NULL) {
  76                 load_case_tables();
  77         }
  78         if (upcase_table == (void *)-1) {
  79                 return val;
  80         }
  81         if (val & 0xFFFF0000) {
  82                 return val;
  83         }
  84         return SVAL(upcase_table, val*2);
  85 }
  86
  87 /**
  88  Convert a codepoint_t to lower case.
  89 **/
  90 _PUBLIC_ codepoint_t tolower_w(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return tolower(val);
  94         }
  95         if (lowcase_table == NULL) {
  96                 load_case_tables();
  97         }
  98         if (lowcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(lowcase_table, val*2);
 105 }
 106
 107 /**
 108   compare two codepoints case insensitively
 109 */
 110 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 111 {
 112         if (c1 == c2 ||
 113             toupper_w(c1) == toupper_w(c2)) {
 114                 return 0;
 115         }
 116         return c1 - c2;
 117 }
 118
 119 /**
 120  Case insensitive string compararison
 121 **/
 122 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
 123 {
 124         codepoint_t c1=0, c2=0;
 125         size_t size1, size2;
 126         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 127
 128         /* handle null ptr comparisons to simplify the use in qsort */
 129         if (s1 == s2) return 0;
 130         if (s1 == NULL) return -1;
 131         if (s2 == NULL) return 1;
 132
 133         while (*s1 && *s2) {
 134                 c1 = next_codepoint(iconv_convenience, s1, &size1);
 135                 c2 = next_codepoint(iconv_convenience, s2, &size2);
 136
 137                 s1 += size1;
 138                 s2 += size2;
 139
 140                 if (c1 == c2) {
 141                         continue;
 142                 }
 143
 144                 if (c1 == INVALID_CODEPOINT ||
 145                     c2 == INVALID_CODEPOINT) {
 146                         /* what else can we do?? */
 147                         return strcasecmp(s1, s2);
 148                 }
 149
 150                 if (toupper_w(c1) != toupper_w(c2)) {
 151                         return c1 - c2;
 152                 }
 153         }
 154
 155         return *s1 - *s2;
 156 }
 157
 158 /**
 159  * Get the next token from a string, return False if none found.
 160  * Handles double-quotes.
 161  *
 162  * Based on a routine by GJC@VILLAGE.COM.
 163  * Extensively modified by Andrew.Tridgell@anu.edu.au
 164  **/
 165 _PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
 166 {
 167         const char *s;
 168         bool quoted;
 169         size_t len=1;
 170
 171         if (!ptr)
 172                 return false;
 173
 174         s = *ptr;
 175
 176         /* default to simple separators */
 177         if (!sep)
 178                 sep = " \t\n\r";
 179
 180         /* find the first non sep char */
 181         while (*s && strchr_m(sep,*s))
 182                 s++;
 183
 184         /* nothing left? */
 185         if (!*s)
 186                 return false;
 187
 188         /* copy over the token */
 189         for (quoted = false; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
 190                 if (*s == '\"') {
 191                         quoted = !quoted;
 192                 } else {
 193                         len++;
 194                         *buff++ = *s;
 195                 }
 196         }
 197
 198         *ptr = (*s) ? s+1 : s;
 199         *buff = 0;
 200
 201         return true;
 202 }
 203
 204 /**
 205  Case insensitive string compararison, length limited
 206 **/
 207 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 208 {
 209         codepoint_t c1=0, c2=0;
 210         size_t size1, size2;
 211         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 212
 213         /* handle null ptr comparisons to simplify the use in qsort */
 214         if (s1 == s2) return 0;
 215         if (s1 == NULL) return -1;
 216         if (s2 == NULL) return 1;
 217
 218         while (*s1 && *s2 && n) {
 219                 n--;
 220
 221                 c1 = next_codepoint(iconv_convenience, s1, &size1);
 222                 c2 = next_codepoint(iconv_convenience, s2, &size2);
 223
 224                 s1 += size1;
 225                 s2 += size2;
 226
 227                 if (c1 == c2) {
 228                         continue;
 229                 }
 230
 231                 if (c1 == INVALID_CODEPOINT ||
 232                     c2 == INVALID_CODEPOINT) {
 233                         /* what else can we do?? */
 234                         return strcasecmp(s1, s2);
 235                 }
 236
 237                 if (toupper_w(c1) != toupper_w(c2)) {
 238                         return c1 - c2;
 239                 }
 240         }
 241
 242         if (n == 0) {
 243                 return 0;
 244         }
 245
 246         return *s1 - *s2;
 247 }
 248
 249 /**
 250  * Compare 2 strings.
 251  *
 252  * @note The comparison is case-insensitive.
 253  **/
 254 _PUBLIC_ bool strequal_w(const char *s1, const char *s2)
 255 {
 256         return strcasecmp_m(s1,s2) == 0;
 257 }
 258
 259 /**
 260  Compare 2 strings (case sensitive).
 261 **/
 262 _PUBLIC_ bool strcsequal_w(const char *s1,const char *s2)
 263 {
 264         if (s1 == s2)
 265                 return true;
 266         if (!s1 || !s2)
 267                 return false;
 268
 269         return strcmp(s1,s2) == 0;
 270 }
 271
 272
 273 /**
 274  String replace.
 275  NOTE: oldc and newc must be 7 bit characters
 276 **/
 277 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
 278 {
 279         while (s && *s) {
 280                 size_t size;
 281                 codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 282                 if (c == oldc) {
 283                         *s = newc;
 284                 }
 285                 s += size;
 286         }
 287 }
 288
 289 /**
 290  Paranoid strcpy into a buffer of given length (includes terminating
 291  zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
 292  and replaces with '_'. Deliberately does *NOT* check for multibyte
 293  characters. Don't change it !
 294 **/
 295
 296 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
 297 {
 298         size_t len, i;
 299
 300         if (maxlength == 0) {
 301                 /* can't fit any bytes at all! */
 302                 return NULL;
 303         }
 304
 305         if (!dest) {
 306                 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
 307                 return NULL;
 308         }
 309
 310         if (!src) {
 311                 *dest = 0;
 312                 return dest;
 313         }
 314
 315         len = strlen(src);
 316         if (len >= maxlength)
 317                 len = maxlength - 1;
 318
 319         if (!other_safe_chars)
 320                 other_safe_chars = "";
 321
 322         for(i = 0; i < len; i++) {
 323                 int val = (src[i] & 0xff);
 324                 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
 325                         dest[i] = src[i];
 326                 else
 327                         dest[i] = '_';
 328         }
 329
 330         dest[i] = '\0';
 331
 332         return dest;
 333 }
 334
 335 /**
 336  Count the number of UCS2 characters in a string. Normally this will
 337  be the same as the number of bytes in a string for single byte strings,
 338  but will be different for multibyte.
 339 **/
 340 _PUBLIC_ size_t strlen_m(const char *s)
 341 {
 342         size_t count = 0;
 343
 344         if (!s) {
 345                 return 0;
 346         }
 347
 348         while (*s && !(((uint8_t)*s) & 0x80)) {
 349                 s++;
 350                 count++;
 351         }
 352
 353         if (!*s) {
 354                 return count;
 355         }
 356
 357         while (*s) {
 358                 size_t c_size;
 359                 codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &c_size);
 360                 if (c < 0x10000) {
 361                         count += 1;
 362                 } else {
 363                         count += 2;
 364                 }
 365                 s += c_size;
 366         }
 367
 368         return count;
 369 }
 370
 371 /**
 372    Work out the number of multibyte chars in a string, including the NULL
 373    terminator.
 374 **/
 375 _PUBLIC_ size_t strlen_m_term(const char *s)
 376 {
 377         if (!s) {
 378                 return 0;
 379         }
 380
 381         return strlen_m(s) + 1;
 382 }
 383
 384 /**
 385  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 386 **/
 387 _PUBLIC_ char *strchr_m(const char *s, char c)
 388 {
 389         if (s == NULL) {
 390                 return NULL;
 391         }
 392         /* characters below 0x3F are guaranteed to not appear in
 393            non-initial position in multi-byte charsets */
 394         if ((c & 0xC0) == 0) {
 395                 return strchr(s, c);
 396         }
 397
 398         while (*s) {
 399                 size_t size;
 400                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 401                 if (c2 == c) {
 402                         return discard_const_p(char, s);
 403                 }
 404                 s += size;
 405         }
 406
 407         return NULL;
 408 }
 409
 410 /**
 411  * Multibyte-character version of strrchr
 412  */
 413 _PUBLIC_ char *strrchr_m(const char *s, char c)
 414 {
 415         char *ret = NULL;
 416
 417         if (s == NULL) {
 418                 return NULL;
 419         }
 420
 421         /* characters below 0x3F are guaranteed to not appear in
 422            non-initial position in multi-byte charsets */
 423         if ((c & 0xC0) == 0) {
 424                 return strrchr(s, c);
 425         }
 426
 427         while (*s) {
 428                 size_t size;
 429                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 430                 if (c2 == c) {
 431                         ret = discard_const_p(char, s);
 432                 }
 433                 s += size;
 434         }
 435
 436         return ret;
 437 }
 438
 439 /**
 440   return True if any (multi-byte) character is lower case
 441 */
 442 _PUBLIC_ bool strhaslower(const char *string)
 443 {
 444         while (*string) {
 445                 size_t c_size;
 446                 codepoint_t s;
 447                 codepoint_t t;
 448
 449                 s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
 450                 string += c_size;
 451
 452                 t = toupper_w(s);
 453
 454                 if (s != t) {
 455                         return true; /* that means it has lower case chars */
 456                 }
 457         }
 458
 459         return false;
 460 }
 461
 462 /**
 463   return True if any (multi-byte) character is upper case
 464 */
 465 _PUBLIC_ bool strhasupper(const char *string)
 466 {
 467         while (*string) {
 468                 size_t c_size;
 469                 codepoint_t s;
 470                 codepoint_t t;
 471
 472                 s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
 473                 string += c_size;
 474
 475                 t = tolower_w(s);
 476
 477                 if (s != t) {
 478                         return true; /* that means it has upper case chars */
 479                 }
 480         }
 481
 482         return false;
 483 }
 484
 485 /**
 486  Convert a string to lower case, allocated with talloc
 487 **/
 488 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
 489 {
 490         size_t size=0;
 491         char *dest;
 492         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 493
 494         /* this takes advantage of the fact that upper/lower can't
 495            change the length of a character by more than 1 byte */
 496         dest = talloc_array(ctx, char, 2*(strlen(src))+1);
 497         if (dest == NULL) {
 498                 return NULL;
 499         }
 500
 501         while (*src) {
 502                 size_t c_size;
 503                 codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
 504                 src += c_size;
 505
 506                 c = tolower_w(c);
 507
 508                 c_size = push_codepoint(iconv_convenience, dest+size, c);
 509                 if (c_size == -1) {
 510                         talloc_free(dest);
 511                         return NULL;
 512                 }
 513                 size += c_size;
 514         }
 515
 516         dest[size] = 0;
 517
 518         /* trim it so talloc_append_string() works */
 519         dest = talloc_realloc(ctx, dest, char, size+1);
 520
 521         talloc_set_name_const(dest, dest);
 522
 523         return dest;
 524 }
 525
 526 /**
 527  Convert a string to UPPER case, allocated with talloc
 528  source length limited to n bytes
 529 **/
 530 _PUBLIC_ char *strupper_talloc_n(TALLOC_CTX *ctx, const char *src, size_t n)
 531 {
 532         size_t size=0;
 533         char *dest;
 534         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 535
 536         if (!src) {
 537                 return NULL;
 538         }
 539
 540         /* this takes advantage of the fact that upper/lower can't
 541            change the length of a character by more than 1 byte */
 542         dest = talloc_array(ctx, char, 2*(n+1));
 543         if (dest == NULL) {
 544                 return NULL;
 545         }
 546
 547         while (*src && n--) {
 548                 size_t c_size;
 549                 codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
 550                 src += c_size;
 551
 552                 c = toupper_w(c);
 553
 554                 c_size = push_codepoint(iconv_convenience, dest+size, c);
 555                 if (c_size == -1) {
 556                         talloc_free(dest);
 557                         return NULL;
 558                 }
 559                 size += c_size;
 560         }
 561
 562         dest[size] = 0;
 563
 564         /* trim it so talloc_append_string() works */
 565         dest = talloc_realloc(ctx, dest, char, size+1);
 566
 567         talloc_set_name_const(dest, dest);
 568
 569         return dest;
 570 }
 571
 572 /**
 573  Convert a string to UPPER case, allocated with talloc
 574 **/
 575 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
 576 {
 577         return strupper_talloc_n(ctx, src, src?strlen(src):0);
 578 }
 579
 580 /**
 581  talloc_strdup() a unix string to upper case.
 582 **/
 583 _PUBLIC_ char *talloc_strdup_upper(TALLOC_CTX *ctx, const char *src)
 584 {
 585         return strupper_talloc(ctx, src);
 586 }
 587
 588 /**
 589  Convert a string to lower case.
 590 **/
 591 _PUBLIC_ void strlower_m(char *s)
 592 {
 593         char *d;
 594         struct smb_iconv_convenience *iconv_convenience;
 595
 596         /* this is quite a common operation, so we want it to be
 597            fast. We optimise for the ascii case, knowing that all our
 598            supported multi-byte character sets are ascii-compatible
 599            (ie. they match for the first 128 chars) */
 600         while (*s && !(((uint8_t)*s) & 0x80)) {
 601                 *s = tolower((uint8_t)*s);
 602                 s++;
 603         }
 604
 605         if (!*s)
 606                 return;
 607
 608         iconv_convenience = lp_iconv_convenience(global_loadparm);
 609
 610         d = s;
 611
 612         while (*s) {
 613                 size_t c_size, c_size2;
 614                 codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
 615                 c_size2 = push_codepoint(iconv_convenience, d, tolower_w(c));
 616                 if (c_size2 > c_size) {
 617                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
 618                                  c, tolower_w(c), (int)c_size, (int)c_size2));
 619                         smb_panic("codepoint expansion in strlower_m\n");
 620                 }
 621                 s += c_size;
 622                 d += c_size2;
 623         }
 624         *d = 0;
 625 }
 626
 627 /**
 628  Convert a string to UPPER case.
 629 **/
 630 _PUBLIC_ void strupper_m(char *s)
 631 {
 632         char *d;
 633         struct smb_iconv_convenience *iconv_convenience;
 634
 635         /* this is quite a common operation, so we want it to be
 636            fast. We optimise for the ascii case, knowing that all our
 637            supported multi-byte character sets are ascii-compatible
 638            (ie. they match for the first 128 chars) */
 639         while (*s && !(((uint8_t)*s) & 0x80)) {
 640                 *s = toupper((uint8_t)*s);
 641                 s++;
 642         }
 643
 644         if (!*s)
 645                 return;
 646
 647         iconv_convenience = lp_iconv_convenience(global_loadparm);
 648
 649         d = s;
 650
 651         while (*s) {
 652                 size_t c_size, c_size2;
 653                 codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
 654                 c_size2 = push_codepoint(iconv_convenience, d, toupper_w(c));
 655                 if (c_size2 > c_size) {
 656                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
 657                                  c, toupper_w(c), (int)c_size, (int)c_size2));
 658                         smb_panic("codepoint expansion in strupper_m\n");
 659                 }
 660                 s += c_size;
 661                 d += c_size2;
 662         }
 663         *d = 0;
 664 }
 665
 666
 667 /**
 668  Find the number of 'c' chars in a string
 669 **/
 670 _PUBLIC_ size_t count_chars_w(const char *s, char c)
 671 {
 672         size_t count = 0;
 673
 674         while (*s) {
 675                 size_t size;
 676                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 677                 if (c2 == c) count++;
 678                 s += size;
 679         }
 680
 681         return count;
 682 }
 683
 684