source4/lib/iconv.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    minimal iconv implementation
   4    Copyright (C) Andrew Tridgell 2001
   5    Copyright (C) Jelmer Vernooij 2002
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20 */
  21
  22 #include "includes.h"
  23 #include "dlinklist.h"
  24 #include "system/iconv.h"
  25
  26
  27 /**
  28  * @file
  29  *
  30  * @brief Samba wrapper/stub for iconv character set conversion.
  31  *
  32  * iconv is the XPG2 interface for converting between character
  33  * encodings.  This file provides a Samba wrapper around it, and also
  34  * a simple reimplementation that is used if the system does not
  35  * implement iconv.
  36  *
  37  * Samba only works with encodings that are supersets of ASCII: ascii
  38  * characters like whitespace can be tested for directly, multibyte
  39  * sequences start with a byte with the high bit set, and strings are
  40  * terminated by a nul byte.
  41  *
  42  * Note that the only function provided by iconv is conversion between
  43  * characters.  It doesn't directly support operations like
  44  * uppercasing or comparison.  We have to convert to UTF-16LE and
  45  * compare there.
  46  *
  47  * @sa Samba Developers Guide
  48  **/
  49
  50 static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
  51 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
  52 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
  53 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
  54 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
  55 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
  56 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
  57 static size_t iconv_swab  (void *,const char **, size_t *, char **, size_t *);
  58
  59 static const struct charset_functions const builtin_functions[] = {
  60         /* windows is closest to UTF-16 */
  61         {"UCS-2LE",  iconv_copy, iconv_copy},
  62         {"UTF-16LE",  iconv_copy, iconv_copy},
  63         {"UCS-2BE",  iconv_swab, iconv_swab},
  64         {"UTF-16BE",  iconv_swab, iconv_swab},
  65
  66         /* we include the UTF-8 alias to cope with differing locale settings */
  67         {"UTF8",   utf8_pull,  utf8_push},
  68         {"UTF-8",   utf8_pull,  utf8_push},
  69         {"ASCII", ascii_pull, ascii_push},
  70         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
  71         {NULL, NULL, NULL}
  72 };
  73
  74 static struct charset_functions *charsets = NULL;
  75
  76 static NTSTATUS charset_register_backend(const void *_funcs)
  77 {
  78         struct charset_functions *funcs = memdup(_funcs,sizeof(struct charset_functions));
  79         struct charset_functions *c = charsets;
  80
  81         /* Check whether we already have this charset... */
  82         while(c) {
  83                 if(!strcasecmp(c->name, funcs->name)){
  84                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
  85                         return NT_STATUS_OBJECT_NAME_COLLISION;
  86                 }
  87                 c = c->next;
  88         }
  89
  90         funcs->next = funcs->prev = NULL;
  91         DLIST_ADD(charsets, funcs);
  92         return NT_STATUS_OK;
  93 }
  94
  95 static void lazy_initialize_iconv(void)
  96 {
  97         static BOOL initialized = False;
  98         int i;
  99
 100         if (!initialized) {
 101                 initialized = True;
 102                 register_subsystem("charset", charset_register_backend);
 103
 104                 for(i = 0; builtin_functions[i].name; i++)
 105                         register_backend("charset", &builtin_functions[i]);
 106         }
 107 }
 108
 109 #ifdef HAVE_NATIVE_ICONV
 110 /* if there was an error then reset the internal state,
 111    this ensures that we don't have a shift state remaining for
 112    character sets like SJIS */
 113 static size_t sys_iconv(void *cd,
 114                         const char **inbuf, size_t *inbytesleft,
 115                         char **outbuf, size_t *outbytesleft)
 116 {
 117         size_t ret = iconv((iconv_t)cd,
 118                            discard_const_p(char *, inbuf), inbytesleft,
 119                            outbuf, outbytesleft);
 120         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
 121         return ret;
 122 }
 123 #endif
 124
 125 /**
 126  * This is a simple portable iconv() implementaion.
 127  *
 128  * It only knows about a very small number of character sets - just
 129  * enough that Samba works on systems that don't have iconv.
 130  **/
 131 size_t smb_iconv(smb_iconv_t cd,
 132                  const char **inbuf, size_t *inbytesleft,
 133                  char **outbuf, size_t *outbytesleft)
 134 {
 135         char cvtbuf[2048];
 136         size_t bufsize;
 137
 138         /* in many cases we can go direct */
 139         if (cd->direct) {
 140                 return cd->direct(cd->cd_direct,
 141                                   inbuf, inbytesleft, outbuf, outbytesleft);
 142         }
 143
 144
 145         /* otherwise we have to do it chunks at a time */
 146         while (*inbytesleft > 0) {
 147                 char *bufp1 = cvtbuf;
 148                 const char *bufp2 = cvtbuf;
 149
 150                 bufsize = sizeof(cvtbuf);
 151
 152                 if (cd->pull(cd->cd_pull,
 153                              inbuf, inbytesleft, &bufp1, &bufsize) == -1
 154                     && errno != E2BIG) return -1;
 155
 156                 bufsize = sizeof(cvtbuf) - bufsize;
 157
 158                 if (cd->push(cd->cd_push,
 159                              &bufp2, &bufsize,
 160                              outbuf, outbytesleft) == -1) return -1;
 161         }
 162
 163         return 0;
 164 }
 165
 166 static BOOL is_utf16(const char *name)
 167 {
 168         return strcasecmp(name, "UCS-2LE") == 0 ||
 169                 strcasecmp(name, "UTF-16LE") == 0;
 170 }
 171
 172 /*
 173   simple iconv_open() wrapper
 174  */
 175 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
 176 {
 177         smb_iconv_t ret;
 178         struct charset_functions *from, *to;
 179
 180         lazy_initialize_iconv();
 181         from = charsets;
 182         to = charsets;
 183
 184         ret = (smb_iconv_t)talloc_named(NULL, sizeof(*ret),
 185                                         "iconv(%s,%s)", tocode, fromcode);
 186         if (!ret) {
 187                 errno = ENOMEM;
 188                 return (smb_iconv_t)-1;
 189         }
 190         memset(ret, 0, sizeof(*ret));
 191
 192         /* check for the simplest null conversion */
 193         if (strcmp(fromcode, tocode) == 0) {
 194                 ret->direct = iconv_copy;
 195                 return ret;
 196         }
 197
 198         while (from) {
 199                 if (strcasecmp(from->name, fromcode) == 0) break;
 200                 from = from->next;
 201         }
 202
 203         while (to) {
 204                 if (strcasecmp(to->name, tocode) == 0) break;
 205                 to = to->next;
 206         }
 207
 208 #ifdef HAVE_NATIVE_ICONV
 209         if (!from) {
 210                 ret->pull = sys_iconv;
 211                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
 212                 if (ret->cd_pull == (iconv_t)-1)
 213                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
 214                 if (ret->cd_pull == (iconv_t)-1) goto failed;
 215         }
 216
 217         if (!to) {
 218                 ret->push = sys_iconv;
 219                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
 220                 if (ret->cd_push == (iconv_t)-1)
 221                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
 222                 if (ret->cd_push == (iconv_t)-1) goto failed;
 223         }
 224 #else
 225         if (!from || !to) {
 226                 goto failed;
 227         }
 228 #endif
 229
 230         /* check for conversion to/from ucs2 */
 231         if (is_utf16(fromcode) && to) {
 232                 ret->direct = to->push;
 233                 return ret;
 234         }
 235         if (is_utf16(tocode) && from) {
 236                 ret->direct = from->pull;
 237                 return ret;
 238         }
 239
 240 #ifdef HAVE_NATIVE_ICONV
 241         if (is_utf16(fromcode)) {
 242                 ret->direct = sys_iconv;
 243                 ret->cd_direct = ret->cd_push;
 244                 ret->cd_push = NULL;
 245                 return ret;
 246         }
 247         if (is_utf16(tocode)) {
 248                 ret->direct = sys_iconv;
 249                 ret->cd_direct = ret->cd_pull;
 250                 ret->cd_pull = NULL;
 251                 return ret;
 252         }
 253 #endif
 254
 255         /* the general case has to go via a buffer */
 256         if (!ret->pull) ret->pull = from->pull;
 257         if (!ret->push) ret->push = to->push;
 258         return ret;
 259
 260 failed:
 261         talloc_free(ret);
 262         errno = EINVAL;
 263         return (smb_iconv_t)-1;
 264 }
 265
 266 /*
 267   simple iconv_close() wrapper
 268 */
 269 int smb_iconv_close(smb_iconv_t cd)
 270 {
 271 #ifdef HAVE_NATIVE_ICONV
 272         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
 273         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
 274         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
 275 #endif
 276
 277         talloc_free(cd);
 278         return 0;
 279 }
 280
 281
/**********************************************************************
 The following functions implement the builtin character sets in Samba,
 and also the "test" character sets that are designed to test
 multi-byte character set support for English users.
***********************************************************************/
 287 static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 288                          char **outbuf, size_t *outbytesleft)
 289 {
 290         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 291                 (*outbuf)[0] = (*inbuf)[0];
 292                 (*outbuf)[1] = 0;
 293                 (*inbytesleft)  -= 1;
 294                 (*outbytesleft) -= 2;
 295                 (*inbuf)  += 1;
 296                 (*outbuf) += 2;
 297         }
 298
 299         if (*inbytesleft > 0) {
 300                 errno = E2BIG;
 301                 return -1;
 302         }
 303
 304         return 0;
 305 }
 306
 307 static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
 308                          char **outbuf, size_t *outbytesleft)
 309 {
 310         int ir_count=0;
 311
 312         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 313                 (*outbuf)[0] = (*inbuf)[0] & 0x7F;
 314                 if ((*inbuf)[1]) ir_count++;
 315                 (*inbytesleft)  -= 2;
 316                 (*outbytesleft) -= 1;
 317                 (*inbuf)  += 2;
 318                 (*outbuf) += 1;
 319         }
 320
 321         if (*inbytesleft == 1) {
 322                 errno = EINVAL;
 323                 return -1;
 324         }
 325
 326         if (*inbytesleft > 1) {
 327                 errno = E2BIG;
 328                 return -1;
 329         }
 330
 331         return ir_count;
 332 }
 333
 334
 335 static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 336                          char **outbuf, size_t *outbytesleft)
 337 {
 338         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
 339                 uint_t v;
 340
 341                 if ((*inbuf)[0] != '@') {
 342                         /* seven bit ascii case */
 343                         (*outbuf)[0] = (*inbuf)[0];
 344                         (*outbuf)[1] = 0;
 345                         (*inbytesleft)  -= 1;
 346                         (*outbytesleft) -= 2;
 347                         (*inbuf)  += 1;
 348                         (*outbuf) += 2;
 349                         continue;
 350                 }
 351                 /* it's a hex character */
 352                 if (*inbytesleft < 5) {
 353                         errno = EINVAL;
 354                         return -1;
 355                 }
 356
 357                 if (sscanf(&(*inbuf)[1], "%04x", &v) != 1) {
 358                         errno = EILSEQ;
 359                         return -1;
 360                 }
 361
 362                 (*outbuf)[0] = v&0xff;
 363                 (*outbuf)[1] = v>>8;
 364                 (*inbytesleft)  -= 5;
 365                 (*outbytesleft) -= 2;
 366                 (*inbuf)  += 5;
 367                 (*outbuf) += 2;
 368         }
 369
 370         if (*inbytesleft > 0) {
 371                 errno = E2BIG;
 372                 return -1;
 373         }
 374
 375         return 0;
 376 }
 377
 378 static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
 379                            char **outbuf, size_t *outbytesleft)
 380 {
 381         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
 382                 char buf[6];
 383
 384                 if ((*inbuf)[1] == 0 &&
 385                     ((*inbuf)[0] & 0x80) == 0 &&
 386                     (*inbuf)[0] != '@') {
 387                         (*outbuf)[0] = (*inbuf)[0];
 388                         (*inbytesleft)  -= 2;
 389                         (*outbytesleft) -= 1;
 390                         (*inbuf)  += 2;
 391                         (*outbuf) += 1;
 392                         continue;
 393                 }
 394                 if (*outbytesleft < 5) {
 395                         errno = E2BIG;
 396                         return -1;
 397                 }
 398                 snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
 399                 memcpy(*outbuf, buf, 5);
 400                 (*inbytesleft)  -= 2;
 401                 (*outbytesleft) -= 5;
 402                 (*inbuf)  += 2;
 403                 (*outbuf) += 5;
 404         }
 405
 406         if (*inbytesleft == 1) {
 407                 errno = EINVAL;
 408                 return -1;
 409         }
 410
 411         if (*inbytesleft > 1) {
 412                 errno = E2BIG;
 413                 return -1;
 414         }
 415
 416         return 0;
 417 }
 418
 419 static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
 420                          char **outbuf, size_t *outbytesleft)
 421 {
 422         int n;
 423
 424         n = MIN(*inbytesleft, *outbytesleft);
 425
 426         swab(*inbuf, *outbuf, (n&~1));
 427         if (n&1) {
 428                 (*outbuf)[n-1] = 0;
 429         }
 430
 431         (*inbytesleft) -= n;
 432         (*outbytesleft) -= n;
 433         (*inbuf) += n;
 434         (*outbuf) += n;
 435
 436         if (*inbytesleft > 0) {
 437                 errno = E2BIG;
 438                 return -1;
 439         }
 440
 441         return 0;
 442 }
 443
 444
 445 static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
 446                          char **outbuf, size_t *outbytesleft)
 447 {
 448         int n;
 449
 450         n = MIN(*inbytesleft, *outbytesleft);
 451
 452         memmove(*outbuf, *inbuf, n);
 453
 454         (*inbytesleft) -= n;
 455         (*outbytesleft) -= n;
 456         (*inbuf) += n;
 457         (*outbuf) += n;
 458
 459         if (*inbytesleft > 0) {
 460                 errno = E2BIG;
 461                 return -1;
 462         }
 463
 464         return 0;
 465 }
 466
 467 static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
 468                          char **outbuf, size_t *outbytesleft)
 469 {
 470         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 471         const uint8_t *c = (const uint8_t *)*inbuf;
 472         uint8_t *uc = (uint8_t *)*outbuf;
 473
 474         while (in_left >= 1 && out_left >= 2) {
 475                 if ((c[0] & 0x80) == 0) {
 476                         uc[0] = c[0];
 477                         uc[1] = 0;
 478                         c  += 1;
 479                         in_left  -= 1;
 480                         out_left -= 2;
 481                         uc += 2;
 482                         continue;
 483                 }
 484
 485                 if ((c[0] & 0xe0) == 0xc0) {
 486                         if (in_left < 2 ||
 487                             (c[1] & 0xc0) != 0x80) {
 488                                 errno = EILSEQ;
 489                                 goto error;
 490                         }
 491                         uc[1] = (c[0]>>2) & 0x7;
 492                         uc[0] = (c[0]<<6) | (c[1]&0x3f);
 493                         c  += 2;
 494                         in_left  -= 2;
 495                         out_left -= 2;
 496                         uc += 2;
 497                         continue;
 498                 }
 499
 500                 if ((c[0] & 0xf0) == 0xe0) {
 501                         if (in_left < 3 ||
 502                             (c[1] & 0xc0) != 0x80 ||
 503                             (c[2] & 0xc0) != 0x80) {
 504                                 errno = EILSEQ;
 505                                 goto error;
 506                         }
 507                         uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
 508                         uc[0] = (c[1]<<6) | (c[2]&0x3f);
 509                         c  += 3;
 510                         in_left  -= 3;
 511                         out_left -= 2;
 512                         uc += 2;
 513                         continue;
 514                 }
 515
 516                 if ((c[0] & 0xf8) == 0xf0) {
 517                         unsigned int codepoint;
 518                         if (in_left < 4 ||
 519                             (c[1] & 0xc0) != 0x80 ||
 520                             (c[2] & 0xc0) != 0x80 ||
 521                             (c[3] & 0xc0) != 0x80) {
 522                                 errno = EILSEQ;
 523                                 goto error;
 524                         }
 525                         codepoint =
 526                                 (c[3]&0x3f) |
 527                                 ((c[2]&0x3f)<<6) |
 528                                 ((c[1]&0x3f)<<12) |
 529                                 ((c[0]&0x7)<<18);
 530                         if (codepoint < 0x10000) {
 531                                 /* accept UTF-8 characters that are not
 532                                    minimally packed, but pack the result */
 533                                 uc[0] = (codepoint & 0xFF);
 534                                 uc[1] = (codepoint >> 8);
 535                                 c += 4;
 536                                 in_left -= 4;
 537                                 out_left -= 2;
 538                                 uc += 2;
 539                                 continue;
 540                         }
 541
 542                         codepoint -= 0x10000;
 543
 544                         if (out_left < 4) {
 545                                 errno = E2BIG;
 546                                 goto error;
 547                         }
 548
 549                         uc[0] = (codepoint>>10) & 0xFF;
 550                         uc[1] = (codepoint>>18) | 0xd8;
 551                         uc[2] = codepoint & 0xFF;
 552                         uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
 553                         c  += 4;
 554                         in_left  -= 4;
 555                         out_left -= 4;
 556                         uc += 4;
 557                         continue;
 558                 }
 559
 560                 /* we don't handle 5 byte sequences */
 561                 errno = EINVAL;
 562                 goto error;
 563         }
 564
 565         if (in_left > 0) {
 566                 errno = E2BIG;
 567                 goto error;
 568         }
 569
 570         *inbytesleft = in_left;
 571         *outbytesleft = out_left;
 572         *inbuf = c;
 573         *outbuf = uc;
 574         return 0;
 575
 576 error:
 577         *inbytesleft = in_left;
 578         *outbytesleft = out_left;
 579         *inbuf = c;
 580         *outbuf = uc;
 581         return -1;
 582 }
 583
 584 static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
 585                         char **outbuf, size_t *outbytesleft)
 586 {
 587         size_t in_left=*inbytesleft, out_left=*outbytesleft;
 588         uint8_t *c = (uint8_t *)*outbuf;
 589         const uint8_t *uc = (const uint8_t *)*inbuf;
 590
 591         while (in_left >= 2 && out_left >= 1) {
 592                 unsigned int codepoint;
 593
 594                 if (uc[1] == 0 && !(uc[0] & 0x80)) {
 595                         /* simplest case */
 596                         c[0] = uc[0];
 597                         in_left  -= 2;
 598                         out_left -= 1;
 599                         uc += 2;
 600                         c  += 1;
 601                         continue;
 602                 }
 603
 604                 if ((uc[1]&0xf8) == 0) {
 605                         /* next simplest case */
 606                         if (out_left < 2) {
 607                                 errno = E2BIG;
 608                                 goto error;
 609                         }
 610                         c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
 611                         c[1] = 0x80 | (uc[0] & 0x3f);
 612                         in_left  -= 2;
 613                         out_left -= 2;
 614                         uc += 2;
 615                         c  += 2;
 616                         continue;
 617                 }
 618
 619                 if ((uc[1] & 0xfc) == 0xdc) {
 620                         /* its the second part of a 4 byte sequence. Illegal */
 621                         if (in_left < 4) {
 622                                 errno = EINVAL;
 623                         } else {
 624                                 errno = EILSEQ;
 625                         }
 626                         goto error;
 627                 }
 628
 629                 if ((uc[1] & 0xfc) != 0xd8) {
 630                         codepoint = uc[0] | (uc[1]<<8);
 631                         if (out_left < 3) {
 632                                 errno = E2BIG;
 633                                 goto error;
 634                         }
 635                         c[0] = 0xe0 | (codepoint >> 12);
 636                         c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
 637                         c[2] = 0x80 | (codepoint & 0x3f);
 638
 639                         in_left  -= 2;
 640                         out_left -= 3;
 641                         uc  += 2;
 642                         c   += 3;
 643                         continue;
 644                 }
 645
 646                 /* its the first part of a 4 byte sequence */
 647                 if (in_left < 4) {
 648                         errno = EINVAL;
 649                         goto error;
 650                 }
 651                 if ((uc[3] & 0xfc) != 0xdc) {
 652                         errno = EILSEQ;
 653                         goto error;
 654                 }
 655                 codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) |
 656                                        (uc[0]<<10) | ((uc[1] & 0x3)<<18));
 657
 658                 if (out_left < 4) {
 659                         errno = E2BIG;
 660                         goto error;
 661                 }
 662                 c[0] = 0xf0 | (codepoint >> 18);
 663                 c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
 664                 c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
 665                 c[3] = 0x80 | (codepoint & 0x3f);
 666
 667                 in_left  -= 4;
 668                 out_left -= 4;
 669                 uc       += 4;
 670                 c        += 4;
 671         }
 672
 673         if (in_left == 1) {
 674                 errno = EINVAL;
 675                 goto error;
 676         }
 677
 678         if (in_left > 1) {
 679                 errno = E2BIG;
 680                 goto error;
 681         }
 682
 683         *inbytesleft = in_left;
 684         *outbytesleft = out_left;
 685         *inbuf  = uc;
 686         *outbuf = c;
 687
 688         return 0;
 689
 690 error:
 691         *inbytesleft = in_left;
 692         *outbytesleft = out_left;
 693         *inbuf  = uc;
 694         *outbuf = c;
 695         return -1;
 696 }
 697
 698
 699