source4/lib/charset/util_unistr.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 3 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  19 */
  20
  21 #include "includes.h"
  22 #include "system/locale.h"
  23 #include "dynconfig/dynconfig.h"
  24 #include "param/param.h"
  25
  26 /**
  27  * @file
  28  * @brief Unicode string manipulation
  29  */
  30
  31 /* these 2 tables define the unicode case handling.  They are loaded
  32    at startup either via mmap() or read() from the lib directory */
  33 static void *upcase_table;
  34 static void *lowcase_table;
  35
  36
  37 /*******************************************************************
  38 load the case handling tables
  39 ********************************************************************/
  40 static void load_case_tables(void)
  41 {
  42         TALLOC_CTX *mem_ctx;
  43
  44         mem_ctx = talloc_init("load_case_tables");
  45         if (!mem_ctx) {
  46                 smb_panic("No memory for case_tables");
  47         }
  48         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_DATADIR), 0x20000);
  49         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_DATADIR), 0x20000);
  50         talloc_free(mem_ctx);
  51         if (upcase_table == NULL) {
  52                 /* try also under codepages for testing purposes */
  53                 upcase_table = map_file("codepages/upcase.dat", 0x20000);
  54                 if (upcase_table == NULL) {
  55                         upcase_table = (void *)-1;
  56                 }
  57         }
  58         if (lowcase_table == NULL) {
  59                 /* try also under codepages for testing purposes */
  60                 lowcase_table = map_file("codepages/lowcase.dat", 0x20000);
  61                 if (lowcase_table == NULL) {
  62                         lowcase_table = (void *)-1;
  63                 }
  64         }
  65 }
  66
  67 /**
  68  Convert a codepoint_t to upper case.
  69 **/
  70 _PUBLIC_ codepoint_t toupper_w(codepoint_t val)
  71 {
  72         if (val < 128) {
  73                 return toupper(val);
  74         }
  75         if (upcase_table == NULL) {
  76                 load_case_tables();
  77         }
  78         if (upcase_table == (void *)-1) {
  79                 return val;
  80         }
  81         if (val & 0xFFFF0000) {
  82                 return val;
  83         }
  84         return SVAL(upcase_table, val*2);
  85 }
  86
  87 /**
  88  Convert a codepoint_t to lower case.
  89 **/
  90 _PUBLIC_ codepoint_t tolower_w(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return tolower(val);
  94         }
  95         if (lowcase_table == NULL) {
  96                 load_case_tables();
  97         }
  98         if (lowcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(lowcase_table, val*2);
 105 }
 106
 107 /**
 108   compare two codepoints case insensitively
 109 */
 110 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 111 {
 112         if (c1 == c2 ||
 113             toupper_w(c1) == toupper_w(c2)) {
 114                 return 0;
 115         }
 116         return c1 - c2;
 117 }
 118
 119 /**
 120  Case insensitive string compararison
 121 **/
 122 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
 123 {
 124         codepoint_t c1=0, c2=0;
 125         size_t size1, size2;
 126         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 127
 128         /* handle null ptr comparisons to simplify the use in qsort */
 129         if (s1 == s2) return 0;
 130         if (s1 == NULL) return -1;
 131         if (s2 == NULL) return 1;
 132
 133         while (*s1 && *s2) {
 134                 c1 = next_codepoint(iconv_convenience, s1, &size1);
 135                 c2 = next_codepoint(iconv_convenience, s2, &size2);
 136
 137                 s1 += size1;
 138                 s2 += size2;
 139
 140                 if (c1 == c2) {
 141                         continue;
 142                 }
 143
 144                 if (c1 == INVALID_CODEPOINT ||
 145                     c2 == INVALID_CODEPOINT) {
 146                         /* what else can we do?? */
 147                         return strcasecmp(s1, s2);
 148                 }
 149
 150                 if (toupper_w(c1) != toupper_w(c2)) {
 151                         return c1 - c2;
 152                 }
 153         }
 154
 155         return *s1 - *s2;
 156 }
 157
 158 /**
 159  * Get the next token from a string, return False if none found.
 160  * Handles double-quotes.
 161  *
 162  * Based on a routine by GJC@VILLAGE.COM.
 163  * Extensively modified by Andrew.Tridgell@anu.edu.au
 164  **/
 165 _PUBLIC_ bool next_token(const char **ptr,char *buff, const char *sep, size_t bufsize)
 166 {
 167         const char *s;
 168         bool quoted;
 169         size_t len=1;
 170
 171         if (!ptr)
 172                 return false;
 173
 174         s = *ptr;
 175
 176         /* default to simple separators */
 177         if (!sep)
 178                 sep = " \t\n\r";
 179
 180         /* find the first non sep char */
 181         while (*s && strchr_m(sep,*s))
 182                 s++;
 183
 184         /* nothing left? */
 185         if (!*s)
 186                 return false;
 187
 188         /* copy over the token */
 189         for (quoted = false; len < bufsize && *s && (quoted || !strchr_m(sep,*s)); s++) {
 190                 if (*s == '\"') {
 191                         quoted = !quoted;
 192                 } else {
 193                         len++;
 194                         *buff++ = *s;
 195                 }
 196         }
 197
 198         *ptr = (*s) ? s+1 : s;
 199         *buff = 0;
 200
 201         return true;
 202 }
 203
 204 /**
 205  Case insensitive string compararison, length limited
 206 **/
 207 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
 208 {
 209         codepoint_t c1=0, c2=0;
 210         size_t size1, size2;
 211         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 212
 213         /* handle null ptr comparisons to simplify the use in qsort */
 214         if (s1 == s2) return 0;
 215         if (s1 == NULL) return -1;
 216         if (s2 == NULL) return 1;
 217
 218         while (*s1 && *s2 && n) {
 219                 n--;
 220
 221                 c1 = next_codepoint(iconv_convenience, s1, &size1);
 222                 c2 = next_codepoint(iconv_convenience, s2, &size2);
 223
 224                 s1 += size1;
 225                 s2 += size2;
 226
 227                 if (c1 == c2) {
 228                         continue;
 229                 }
 230
 231                 if (c1 == INVALID_CODEPOINT ||
 232                     c2 == INVALID_CODEPOINT) {
 233                         /* what else can we do?? */
 234                         return strcasecmp(s1, s2);
 235                 }
 236
 237                 if (toupper_w(c1) != toupper_w(c2)) {
 238                         return c1 - c2;
 239                 }
 240         }
 241
 242         if (n == 0) {
 243                 return 0;
 244         }
 245
 246         return *s1 - *s2;
 247 }
 248
 249 /**
 250  * Compare 2 strings.
 251  *
 252  * @note The comparison is case-insensitive.
 253  **/
 254 _PUBLIC_ bool strequal_w(const char *s1, const char *s2)
 255 {
 256         return strcasecmp_m(s1,s2) == 0;
 257 }
 258
 259 /**
 260  Compare 2 strings (case sensitive).
 261 **/
 262 _PUBLIC_ bool strcsequal_w(const char *s1,const char *s2)
 263 {
 264         if (s1 == s2)
 265                 return true;
 266         if (!s1 || !s2)
 267                 return false;
 268
 269         return strcmp(s1,s2) == 0;
 270 }
 271
 272
 273 /**
 274  String replace.
 275  NOTE: oldc and newc must be 7 bit characters
 276 **/
 277 _PUBLIC_ void string_replace_w(char *s, char oldc, char newc)
 278 {
 279         while (s && *s) {
 280                 size_t size;
 281                 codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 282                 if (c == oldc) {
 283                         *s = newc;
 284                 }
 285                 s += size;
 286         }
 287 }
 288
 289 /**
 290  Paranoid strcpy into a buffer of given length (includes terminating
 291  zero. Strips out all but 'a-Z0-9' and the character in other_safe_chars
 292  and replaces with '_'. Deliberately does *NOT* check for multibyte
 293  characters. Don't change it !
 294 **/
 295
 296 _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_chars, size_t maxlength)
 297 {
 298         size_t len, i;
 299
 300         if (maxlength == 0) {
 301                 /* can't fit any bytes at all! */
 302                 return NULL;
 303         }
 304
 305         if (!dest) {
 306                 DEBUG(0,("ERROR: NULL dest in alpha_strcpy\n"));
 307                 return NULL;
 308         }
 309
 310         if (!src) {
 311                 *dest = 0;
 312                 return dest;
 313         }
 314
 315         len = strlen(src);
 316         if (len >= maxlength)
 317                 len = maxlength - 1;
 318
 319         if (!other_safe_chars)
 320                 other_safe_chars = "";
 321
 322         for(i = 0; i < len; i++) {
 323                 int val = (src[i] & 0xff);
 324                 if (isupper(val) || islower(val) || isdigit(val) || strchr_m(other_safe_chars, val))
 325                         dest[i] = src[i];
 326                 else
 327                         dest[i] = '_';
 328         }
 329
 330         dest[i] = '\0';
 331
 332         return dest;
 333 }
 334
 335 /**
 336  Count the number of UCS2 characters in a string. Normally this will
 337  be the same as the number of bytes in a string for single byte strings,
 338  but will be different for multibyte.
 339 **/
 340 _PUBLIC_ size_t strlen_m(const char *s)
 341 {
 342         size_t count = 0;
 343
 344         if (!s) {
 345                 return 0;
 346         }
 347
 348         while (*s && !(((uint8_t)*s) & 0x80)) {
 349                 s++;
 350                 count++;
 351         }
 352
 353         if (!*s) {
 354                 return count;
 355         }
 356
 357         while (*s) {
 358                 size_t c_size;
 359                 codepoint_t c = next_codepoint(lp_iconv_convenience(global_loadparm), s, &c_size);
 360                 if (c < 0x10000) {
 361                         count += 1;
 362                 } else {
 363                         count += 2;
 364                 }
 365                 s += c_size;
 366         }
 367
 368         return count;
 369 }
 370
 371 /**
 372    Work out the number of multibyte chars in a string, including the NULL
 373    terminator.
 374 **/
 375 _PUBLIC_ size_t strlen_m_term(const char *s)
 376 {
 377         if (!s) {
 378                 return 0;
 379         }
 380
 381         return strlen_m(s) + 1;
 382 }
 383
 384 /**
 385  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 386 **/
 387 _PUBLIC_ char *strchr_m(const char *s, char c)
 388 {
 389         /* characters below 0x3F are guaranteed to not appear in
 390            non-initial position in multi-byte charsets */
 391         if ((c & 0xC0) == 0) {
 392                 return strchr(s, c);
 393         }
 394
 395         while (*s) {
 396                 size_t size;
 397                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 398                 if (c2 == c) {
 399                         return discard_const_p(char, s);
 400                 }
 401                 s += size;
 402         }
 403
 404         return NULL;
 405 }
 406
 407 /**
 408  * Multibyte-character version of strrchr
 409  */
 410 _PUBLIC_ char *strrchr_m(const char *s, char c)
 411 {
 412         char *ret = NULL;
 413
 414         /* characters below 0x3F are guaranteed to not appear in
 415            non-initial position in multi-byte charsets */
 416         if ((c & 0xC0) == 0) {
 417                 return strrchr(s, c);
 418         }
 419
 420         while (*s) {
 421                 size_t size;
 422                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 423                 if (c2 == c) {
 424                         ret = discard_const_p(char, s);
 425                 }
 426                 s += size;
 427         }
 428
 429         return ret;
 430 }
 431
 432 /**
 433   return True if any (multi-byte) character is lower case
 434 */
 435 _PUBLIC_ bool strhaslower(const char *string)
 436 {
 437         while (*string) {
 438                 size_t c_size;
 439                 codepoint_t s;
 440                 codepoint_t t;
 441
 442                 s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
 443                 string += c_size;
 444
 445                 t = toupper_w(s);
 446
 447                 if (s != t) {
 448                         return true; /* that means it has lower case chars */
 449                 }
 450         }
 451
 452         return false;
 453 }
 454
 455 /**
 456   return True if any (multi-byte) character is upper case
 457 */
 458 _PUBLIC_ bool strhasupper(const char *string)
 459 {
 460         while (*string) {
 461                 size_t c_size;
 462                 codepoint_t s;
 463                 codepoint_t t;
 464
 465                 s = next_codepoint(lp_iconv_convenience(global_loadparm), string, &c_size);
 466                 string += c_size;
 467
 468                 t = tolower_w(s);
 469
 470                 if (s != t) {
 471                         return true; /* that means it has upper case chars */
 472                 }
 473         }
 474
 475         return false;
 476 }
 477
 478 /**
 479  Convert a string to lower case, allocated with talloc
 480 **/
 481 _PUBLIC_ char *strlower_talloc(TALLOC_CTX *ctx, const char *src)
 482 {
 483         size_t size=0;
 484         char *dest;
 485         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 486
 487         /* this takes advantage of the fact that upper/lower can't
 488            change the length of a character by more than 1 byte */
 489         dest = talloc_array(ctx, char, 2*(strlen(src))+1);
 490         if (dest == NULL) {
 491                 return NULL;
 492         }
 493
 494         while (*src) {
 495                 size_t c_size;
 496                 codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
 497                 src += c_size;
 498
 499                 c = tolower_w(c);
 500
 501                 c_size = push_codepoint(iconv_convenience, dest+size, c);
 502                 if (c_size == -1) {
 503                         talloc_free(dest);
 504                         return NULL;
 505                 }
 506                 size += c_size;
 507         }
 508
 509         dest[size] = 0;
 510
 511         /* trim it so talloc_append_string() works */
 512         dest = talloc_realloc(ctx, dest, char, size+1);
 513
 514         talloc_set_name_const(dest, dest);
 515
 516         return dest;
 517 }
 518
 519 /**
 520  Convert a string to UPPER case, allocated with talloc
 521 **/
 522 _PUBLIC_ char *strupper_talloc(TALLOC_CTX *ctx, const char *src)
 523 {
 524         size_t size=0;
 525         char *dest;
 526         struct smb_iconv_convenience *iconv_convenience = lp_iconv_convenience(global_loadparm);
 527
 528         if (!src) {
 529                 return NULL;
 530         }
 531
 532         /* this takes advantage of the fact that upper/lower can't
 533            change the length of a character by more than 1 byte */
 534         dest = talloc_array(ctx, char, 2*(strlen(src))+1);
 535         if (dest == NULL) {
 536                 return NULL;
 537         }
 538
 539         while (*src) {
 540                 size_t c_size;
 541                 codepoint_t c = next_codepoint(iconv_convenience, src, &c_size);
 542                 src += c_size;
 543
 544                 c = toupper_w(c);
 545
 546                 c_size = push_codepoint(iconv_convenience, dest+size, c);
 547                 if (c_size == -1) {
 548                         talloc_free(dest);
 549                         return NULL;
 550                 }
 551                 size += c_size;
 552         }
 553
 554         dest[size] = 0;
 555
 556         /* trim it so talloc_append_string() works */
 557         dest = talloc_realloc(ctx, dest, char, size+1);
 558
 559         talloc_set_name_const(dest, dest);
 560
 561         return dest;
 562 }
 563
 564 /**
 565  Convert a string to lower case.
 566 **/
 567 _PUBLIC_ void strlower_m(char *s)
 568 {
 569         char *d;
 570         struct smb_iconv_convenience *iconv_convenience;
 571
 572         /* this is quite a common operation, so we want it to be
 573            fast. We optimise for the ascii case, knowing that all our
 574            supported multi-byte character sets are ascii-compatible
 575            (ie. they match for the first 128 chars) */
 576         while (*s && !(((uint8_t)*s) & 0x80)) {
 577                 *s = tolower((uint8_t)*s);
 578                 s++;
 579         }
 580
 581         if (!*s)
 582                 return;
 583
 584         iconv_convenience = lp_iconv_convenience(global_loadparm);
 585
 586         d = s;
 587
 588         while (*s) {
 589                 size_t c_size, c_size2;
 590                 codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
 591                 c_size2 = push_codepoint(iconv_convenience, d, tolower_w(c));
 592                 if (c_size2 > c_size) {
 593                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strlower_m\n",
 594                                  c, tolower_w(c), (int)c_size, (int)c_size2));
 595                         smb_panic("codepoint expansion in strlower_m\n");
 596                 }
 597                 s += c_size;
 598                 d += c_size2;
 599         }
 600         *d = 0;
 601 }
 602
 603 /**
 604  Convert a string to UPPER case.
 605 **/
 606 _PUBLIC_ void strupper_m(char *s)
 607 {
 608         char *d;
 609         struct smb_iconv_convenience *iconv_convenience;
 610
 611         /* this is quite a common operation, so we want it to be
 612            fast. We optimise for the ascii case, knowing that all our
 613            supported multi-byte character sets are ascii-compatible
 614            (ie. they match for the first 128 chars) */
 615         while (*s && !(((uint8_t)*s) & 0x80)) {
 616                 *s = toupper((uint8_t)*s);
 617                 s++;
 618         }
 619
 620         if (!*s)
 621                 return;
 622
 623         iconv_convenience = lp_iconv_convenience(global_loadparm);
 624
 625         d = s;
 626
 627         while (*s) {
 628                 size_t c_size, c_size2;
 629                 codepoint_t c = next_codepoint(iconv_convenience, s, &c_size);
 630                 c_size2 = push_codepoint(iconv_convenience, d, toupper_w(c));
 631                 if (c_size2 > c_size) {
 632                         DEBUG(0,("FATAL: codepoint 0x%x (0x%x) expanded from %d to %d bytes in strupper_m\n",
 633                                  c, toupper_w(c), (int)c_size, (int)c_size2));
 634                         smb_panic("codepoint expansion in strupper_m\n");
 635                 }
 636                 s += c_size;
 637                 d += c_size2;
 638         }
 639         *d = 0;
 640 }
 641
 642
 643 /**
 644  Find the number of 'c' chars in a string
 645 **/
 646 _PUBLIC_ size_t count_chars_w(const char *s, char c)
 647 {
 648         size_t count = 0;
 649
 650         while (*s) {
 651                 size_t size;
 652                 codepoint_t c2 = next_codepoint(lp_iconv_convenience(global_loadparm), s, &size);
 653                 if (c2 == c) count++;
 654                 s += size;
 655         }
 656
 657         return count;
 658 }
 659
 660