lib/util/charset/codepoints.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Character set conversion Extensions
   4    Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
   5    Copyright (C) Andrew Tridgell 2001
   6    Copyright (C) Simo Sorce 2001
   7    Copyright (C) Jelmer Vernooij 2007
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21
  22 */
  23 #include "includes.h"
  24 #include "lib/util/charset/charset.h"
  25 #include "system/locale.h"
  26 #include "dynconfig/dynconfig.h"
  27
  28 #ifdef strcasecmp
  29 #undef strcasecmp
  30 #endif
  31
  32 /**
  33  * @file
  34  * @brief Unicode string manipulation
  35  */
  36
/* these 2 tables define the unicode case handling.  They are loaded
   at startup either via mmap() or read() from the lib directory.

   Each pointer is in one of three states:
     NULL        - load_case_tables_library() has not run yet
     (void *)-1  - loading failed; callers then leave non-ASCII
                   codepoints unchanged
     otherwise   - a 0x20000 byte table holding one 16-bit entry per
                   codepoint below 0x10000, read with SVAL(table, val*2) */
static void *upcase_table;
static void *lowcase_table;
  41
  42
  43 /*******************************************************************
  44 load the case handling tables
  45
  46 This is the function that should be called from library code.
  47 ********************************************************************/
  48 void load_case_tables_library(void)
  49 {
  50         TALLOC_CTX *mem_ctx;
  51
  52         mem_ctx = talloc_init("load_case_tables");
  53         if (!mem_ctx) {
  54                 smb_panic("No memory for case_tables");
  55         }
  56         upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  57         lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
  58         talloc_free(mem_ctx);
  59         if (upcase_table == NULL) {
  60                 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  61                 upcase_table = (void *)-1;
  62         }
  63         if (lowcase_table == NULL) {
  64                 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
  65                 lowcase_table = (void *)-1;
  66         }
  67 }
  68
  69 /*******************************************************************
  70 load the case handling tables
  71
  72 This MUST only be called from main() in application code, never from a
  73 library.  We don't know if the calling program has already done
  74 setlocale() to another value, and can't tell if they have.
  75 ********************************************************************/
  76 void load_case_tables(void)
  77 {
  78         /* This is a useful global hook where we can ensure that the
  79          * locale is set from the environment.  This is needed so that
  80          * we can use LOCALE as a codepage */
  81 #ifdef HAVE_SETLOCALE
  82         setlocale(LC_ALL, "");
  83 #endif
  84         load_case_tables_library();
  85 }
  86
  87 /**
  88  Convert a codepoint_t to upper case.
  89 **/
  90 _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
  91 {
  92         if (val < 128) {
  93                 return toupper(val);
  94         }
  95         if (upcase_table == NULL) {
  96                 load_case_tables_library();
  97         }
  98         if (upcase_table == (void *)-1) {
  99                 return val;
 100         }
 101         if (val & 0xFFFF0000) {
 102                 return val;
 103         }
 104         return SVAL(upcase_table, val*2);
 105 }
 106
 107 /**
 108  Convert a codepoint_t to lower case.
 109 **/
 110 _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
 111 {
 112         if (val < 128) {
 113                 return tolower(val);
 114         }
 115         if (lowcase_table == NULL) {
 116                 load_case_tables_library();
 117         }
 118         if (lowcase_table == (void *)-1) {
 119                 return val;
 120         }
 121         if (val & 0xFFFF0000) {
 122                 return val;
 123         }
 124         return SVAL(lowcase_table, val*2);
 125 }
 126
 127 /**
 128  If we upper cased this character, would we get the same character?
 129 **/
 130 _PUBLIC_ bool islower_m(codepoint_t val)
 131 {
 132         return (toupper_m(val) != val);
 133 }
 134
 135 /**
 136  If we lower cased this character, would we get the same character?
 137 **/
 138 _PUBLIC_ bool isupper_m(codepoint_t val)
 139 {
 140         return (tolower_m(val) != val);
 141 }
 142
 143 /**
 144   compare two codepoints case insensitively
 145 */
 146 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
 147 {
 148         if (c1 == c2 ||
 149             toupper_m(c1) == toupper_m(c2)) {
 150                 return 0;
 151         }
 152         return c1 - c2;
 153 }
 154
 155
/*
  State for charset conversion: the configured charset names plus a
  cache of the conversion descriptors opened so far.
 */
struct smb_iconv_handle {
        /* talloc child of this structure that owns the charset name
           strings; freed and recreated by smb_iconv_handle_reinit() */
        TALLOC_CTX *child_ctx;
        /* name reported by charset_name() for CH_UNIX */
        const char *unix_charset;
        /* name reported by charset_name() for CH_DOS; get_conv_handle()
           replaces it with "ASCII" when the configured one is unusable */
        const char *dos_charset;
        /* NOTE(review): not referenced by the code visible in this file */
        const char *display_charset;
        /* passed through to smb_iconv_open_ex() */
        bool native_iconv;
        /* cache indexed [from][to]: NULL = not opened yet,
           (smb_iconv_t)-1 = the open attempt failed */
        smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
};

/* process-wide default handle, created on demand by get_iconv_handle() */
struct smb_iconv_handle *global_iconv_handle = NULL;
 166
 167 struct smb_iconv_handle *get_iconv_handle(void)
 168 {
 169         if (global_iconv_handle == NULL)
 170                 global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
 171                                                               "ASCII", "UTF-8", true, NULL);
 172         return global_iconv_handle;
 173 }
 174
 175 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx,
 176                                                   const char *dos_charset,
 177                                                   const char *unix_charset)
 178 {
 179         return smb_iconv_handle_reinit(mem_ctx,
 180                                        dos_charset, unix_charset, true, NULL);
 181 }
 182
 183 /**
 184  * Return the name of a charset to give to iconv().
 185  **/
 186 const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
 187 {
 188         switch (ch) {
 189         case CH_UTF16: return "UTF-16LE";
 190         case CH_UNIX: return ic->unix_charset;
 191         case CH_DOS: return ic->dos_charset;
 192         case CH_UTF8: return "UTF8";
 193         case CH_UTF16BE: return "UTF-16BE";
 194         case CH_UTF16MUNGED: return "UTF16_MUNGED";
 195         default:
 196         return "ASCII";
 197         }
 198 }
 199
 200 /**
 201  re-initialize iconv conversion descriptors
 202 **/
 203 static int close_iconv_handle(struct smb_iconv_handle *data)
 204 {
 205         unsigned c1, c2;
 206         for (c1=0;c1<NUM_CHARSETS;c1++) {
 207                 for (c2=0;c2<NUM_CHARSETS;c2++) {
 208                         if (data->conv_handles[c1][c2] != NULL) {
 209                                 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
 210                                         smb_iconv_close(data->conv_handles[c1][c2]);
 211                                 }
 212                                 data->conv_handles[c1][c2] = NULL;
 213                         }
 214                 }
 215         }
 216
 217         return 0;
 218 }
 219
 220 static const char *map_locale(const char *charset)
 221 {
 222         if (strcmp(charset, "LOCALE") != 0) {
 223                 return charset;
 224         }
 225 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
 226         {
 227                 const char *ln;
 228                 smb_iconv_t handle;
 229
 230                 ln = nl_langinfo(CODESET);
 231                 if (ln == NULL) {
 232                         DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
 233                         return "ASCII";
 234                 }
 235                 /* Check whether the charset name is supported
 236                    by iconv */
 237                 handle = smb_iconv_open(ln, "UCS-2LE");
 238                 if (handle == (smb_iconv_t) -1) {
 239                         DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
 240                         return "ASCII";
 241                 } else {
 242                         DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
 243                         smb_iconv_close(handle);
 244                 }
 245                 return ln;
 246         }
 247 #endif
 248         return "ASCII";
 249 }
 250
 251 /*
 252   the old_ic is passed in here as the smb_iconv_handle structure
 253   is used as a global pointer in some places (eg. python modules). We
 254   don't want to invalidate those global pointers, but we do want to
 255   update them with the right charset information when loadparm
 256   runs. To do that we need to re-use the structure pointer, but
 257   re-fill the elements in the structure with the updated values
 258  */
 259 _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
 260                                                                     const char *dos_charset,
 261                                                                     const char *unix_charset,
 262                                                                     bool native_iconv,
 263                                                                     struct smb_iconv_handle *old_ic)
 264 {
 265         struct smb_iconv_handle *ret;
 266
 267         if (old_ic != NULL) {
 268                 ret = old_ic;
 269                 close_iconv_handle(ret);
 270                 talloc_free(ret->child_ctx);
 271                 ZERO_STRUCTP(ret);
 272         } else {
 273                 ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
 274         }
 275         if (ret == NULL) {
 276                 return NULL;
 277         }
 278
 279         /* we use a child context to allow us to free all ptrs without
 280            freeing the structure itself */
 281         ret->child_ctx = talloc_new(ret);
 282         if (ret->child_ctx == NULL) {
 283                 return NULL;
 284         }
 285
 286         talloc_set_destructor(ret, close_iconv_handle);
 287
 288         if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
 289                 DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
 290                 dos_charset = "CP850";
 291         }
 292
 293         ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
 294         ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
 295         ret->native_iconv = native_iconv;
 296
 297         return ret;
 298 }
 299
 300 /*
 301   on-demand initialisation of conversion handles
 302 */
 303 smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
 304                             charset_t from, charset_t to)
 305 {
 306         const char *n1, *n2;
 307
 308         if (ic->conv_handles[from][to]) {
 309                 return ic->conv_handles[from][to];
 310         }
 311
 312         n1 = charset_name(ic, from);
 313         n2 = charset_name(ic, to);
 314
 315         ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
 316                                                        ic->native_iconv);
 317
 318         if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
 319                 if ((from == CH_DOS || to == CH_DOS) &&
 320                     strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
 321                         DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
 322                                  charset_name(ic, CH_DOS)));
 323                         ic->dos_charset = "ASCII";
 324
 325                         n1 = charset_name(ic, from);
 326                         n2 = charset_name(ic, to);
 327
 328                         ic->conv_handles[from][to] =
 329                                 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
 330                 }
 331         }
 332
 333         return ic->conv_handles[from][to];
 334 }
 335
 336 /**
 337  * Return the unicode codepoint for the next character in the input
 338  * string in the given src_charset.
 339  * The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
 340  *
 341  * Also return the number of bytes consumed (which tells the caller
 342  * how many bytes to skip to get to the next src_charset-character).
 343  *
 344  * This is implemented (in the non-ascii-case) by first converting the
 345  * next character in the input string to UTF16_LE and then calculating
 346  * the unicode codepoint from that.
 347  *
 348  * Return INVALID_CODEPOINT if the next character cannot be converted.
 349  */
 350 _PUBLIC_ codepoint_t next_codepoint_handle_ext(
 351                         struct smb_iconv_handle *ic,
 352                         const char *str, charset_t src_charset,
 353                         size_t *bytes_consumed)
 354 {
 355         /* it cannot occupy more than 4 bytes in UTF16 format */
 356         uint8_t buf[4];
 357         smb_iconv_t descriptor;
 358         size_t ilen_orig;
 359         size_t ilen;
 360         size_t olen;
 361         char *outbuf;
 362
 363         if ((str[0] & 0x80) == 0) {
 364                 *bytes_consumed = 1;
 365                 return (codepoint_t)str[0];
 366         }
 367
 368         /*
 369          * we assume that no multi-byte character can take more than 5 bytes.
 370          * This is OK as we only support codepoints up to 1M (U+100000)
 371          */
 372         ilen_orig = strnlen(str, 5);
 373         ilen = ilen_orig;
 374
 375         descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
 376         if (descriptor == (smb_iconv_t)-1) {
 377                 *bytes_consumed = 1;
 378                 return INVALID_CODEPOINT;
 379         }
 380
 381         /*
 382          * this looks a little strange, but it is needed to cope with
 383          * codepoints above 64k (U+1000) which are encoded as per RFC2781.
 384          */
 385         olen = 2;
 386         outbuf = (char *)buf;
 387         smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
 388         if (olen == 2) {
 389                 olen = 4;
 390                 outbuf = (char *)buf;
 391                 smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
 392                 if (olen == 4) {
 393                         /* we didn't convert any bytes */
 394                         *bytes_consumed = 1;
 395                         return INVALID_CODEPOINT;
 396                 }
 397                 olen = 4 - olen;
 398         } else {
 399                 olen = 2 - olen;
 400         }
 401
 402         *bytes_consumed = ilen_orig - ilen;
 403
 404         if (olen == 2) {
 405                 return (codepoint_t)SVAL(buf, 0);
 406         }
 407         if (olen == 4) {
 408                 /* decode a 4 byte UTF16 character manually */
 409                 return (codepoint_t)0x10000 +
 410                         (buf[2] | ((buf[3] & 0x3)<<8) |
 411                          (buf[0]<<10) | ((buf[1] & 0x3)<<18));
 412         }
 413
 414         /* no other length is valid */
 415         return INVALID_CODEPOINT;
 416 }
 417
 418 /*
 419   return the unicode codepoint for the next multi-byte CH_UNIX character
 420   in the string
 421
 422   also return the number of bytes consumed (which tells the caller
 423   how many bytes to skip to get to the next CH_UNIX character)
 424
 425   return INVALID_CODEPOINT if the next character cannot be converted
 426 */
 427 _PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
 428                                     const char *str, size_t *size)
 429 {
 430         return next_codepoint_handle_ext(ic, str, CH_UNIX, size);
 431 }
 432
 433 /*
 434   push a single codepoint into a CH_UNIX string the target string must
 435   be able to hold the full character, which is guaranteed if it is at
 436   least 5 bytes in size. The caller may pass less than 5 bytes if they
 437   are sure the character will fit (for example, you can assume that
 438   uppercase/lowercase of a character will not add more than 1 byte)
 439
 440   return the number of bytes occupied by the CH_UNIX character, or
 441   -1 on failure
 442 */
 443 _PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
 444                                 char *str, codepoint_t c)
 445 {
 446         smb_iconv_t descriptor;
 447         uint8_t buf[4];
 448         size_t ilen, olen;
 449         const char *inbuf;
 450
 451         if (c < 128) {
 452                 *str = c;
 453                 return 1;
 454         }
 455
 456         descriptor = get_conv_handle(ic,
 457                                      CH_UTF16, CH_UNIX);
 458         if (descriptor == (smb_iconv_t)-1) {
 459                 return -1;
 460         }
 461
 462         if (c < 0x10000) {
 463                 ilen = 2;
 464                 olen = 5;
 465                 inbuf = (char *)buf;
 466                 SSVAL(buf, 0, c);
 467                 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 468                 if (ilen != 0) {
 469                         return -1;
 470                 }
 471                 return 5 - olen;
 472         }
 473
 474         c -= 0x10000;
 475
 476         buf[0] = (c>>10) & 0xFF;
 477         buf[1] = (c>>18) | 0xd8;
 478         buf[2] = c & 0xFF;
 479         buf[3] = ((c>>8) & 0x3) | 0xdc;
 480
 481         ilen = 4;
 482         olen = 5;
 483         inbuf = (char *)buf;
 484
 485         smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
 486         if (ilen != 0) {
 487                 return -1;
 488         }
 489         return 5 - olen;
 490 }
 491
 492 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
 493                                         size_t *size)
 494 {
 495         return next_codepoint_handle_ext(get_iconv_handle(), str,
 496                                               src_charset, size);
 497 }
 498
 499 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
 500 {
 501         return next_codepoint_handle(get_iconv_handle(), str, size);
 502 }
 503
 504 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
 505 {
 506         return push_codepoint_handle(get_iconv_handle(), str, c);
 507 }