r4267: fixed the charset code to use the builtin_functions.
[samba.git] / source4 / lib / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 #include "includes.h"
23 #include "dlinklist.h"
24 #include "system/iconv.h"
25
26
27 /**
28  * @file
29  *
30  * @brief Samba wrapper/stub for iconv character set conversion.
31  *
32  * iconv is the XPG2 interface for converting between character
33  * encodings.  This file provides a Samba wrapper around it, and also
34  * a simple reimplementation that is used if the system does not
35  * implement iconv.
36  *
37  * Samba only works with encodings that are supersets of ASCII: ascii
38  * characters like whitespace can be tested for directly, multibyte
39  * sequences start with a byte with the high bit set, and strings are
40  * terminated by a nul byte.
41  *
42  * Note that the only function provided by iconv is conversion between
43  * characters.  It doesn't directly support operations like
44  * uppercasing or comparison.  We have to convert to UTF-16LE and
45  * compare there.
46  *
47  * @sa Samba Developers Guide
48  **/
49
/* Forward declarations of the converters defined later in this file.
   They all share one iconv()-like signature so that they can be stored
   in the pull/push/direct hooks of a conversion descriptor. */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
58
59 static const struct charset_functions builtin_functions[] = {
60         /* windows is closest to UTF-16 */
61         {"UCS-2LE",  iconv_copy, iconv_copy},
62         {"UTF-16LE",  iconv_copy, iconv_copy},
63         {"UCS-2BE",  iconv_swab, iconv_swab},
64         {"UTF-16BE",  iconv_swab, iconv_swab},
65
66         /* we include the UTF-8 alias to cope with differing locale settings */
67         {"UTF8",   utf8_pull,  utf8_push},
68         {"UTF-8",   utf8_pull,  utf8_push},
69         {"ASCII", ascii_pull, ascii_push},
70         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
71 };
72
73 static struct charset_functions *charsets = NULL;
74
75 NTSTATUS charset_register_backend(const void *_funcs) 
76 {
77         struct charset_functions *funcs = memdup(_funcs,sizeof(struct charset_functions));
78         struct charset_functions *c = charsets;
79
80         /* Check whether we already have this charset... */
81         while(c) {
82                 if(!strcasecmp(c->name, funcs->name)){ 
83                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
84                         return NT_STATUS_OBJECT_NAME_COLLISION;
85                 }
86                 c = c->next;
87         }
88
89         funcs->next = funcs->prev = NULL;
90         DLIST_ADD(charsets, funcs);
91         return NT_STATUS_OK;
92 }
93
#ifdef HAVE_NATIVE_ICONV
/* Thin wrapper around the system iconv().

   if there was an error then reset the internal state,
   this ensures that we don't have a shift state remaining for
   character sets like SJIS.

   errno from the failed conversion is preserved across the reset call,
   because callers (see smb_iconv) inspect it to tell E2BIG apart from
   real conversion errors.

   Returns whatever iconv() returned for the conversion call. */
static size_t sys_iconv(void *cd, 
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd, 
			   discard_const_p(char *, inbuf), inbytesleft, 
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		int saved_errno = errno;

		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
109
110 /**
111  * This is a simple portable iconv() implementaion.
112  *
113  * It only knows about a very small number of character sets - just
114  * enough that Samba works on systems that don't have iconv.
115  **/
116 size_t smb_iconv(smb_iconv_t cd, 
117                  const char **inbuf, size_t *inbytesleft,
118                  char **outbuf, size_t *outbytesleft)
119 {
120         char cvtbuf[2048];
121         size_t bufsize;
122
123         /* in many cases we can go direct */
124         if (cd->direct) {
125                 return cd->direct(cd->cd_direct, 
126                                   inbuf, inbytesleft, outbuf, outbytesleft);
127         }
128
129
130         /* otherwise we have to do it chunks at a time */
131         while (*inbytesleft > 0) {
132                 char *bufp1 = cvtbuf;
133                 const char *bufp2 = cvtbuf;
134
135                 bufsize = sizeof(cvtbuf);
136                 
137                 if (cd->pull(cd->cd_pull, 
138                              inbuf, inbytesleft, &bufp1, &bufsize) == -1
139                     && errno != E2BIG) return -1;
140
141                 bufsize = sizeof(cvtbuf) - bufsize;
142
143                 if (cd->push(cd->cd_push, 
144                              &bufp2, &bufsize, 
145                              outbuf, outbytesleft) == -1) return -1;
146         }
147
148         return 0;
149 }
150
151 static BOOL is_utf16(const char *name)
152 {
153         return strcasecmp(name, "UCS-2LE") == 0 ||
154                 strcasecmp(name, "UTF-16LE") == 0;
155 }
156
157 /*
158   simple iconv_open() wrapper
159  */
160 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
161 {
162         smb_iconv_t ret;
163         const struct charset_functions *from=NULL, *to=NULL;
164         int i;
165
166         ret = (smb_iconv_t)talloc_named(NULL, sizeof(*ret), 
167                                         "iconv(%s,%s)", tocode, fromcode);
168         if (!ret) {
169                 errno = ENOMEM;
170                 return (smb_iconv_t)-1;
171         }
172         memset(ret, 0, sizeof(*ret));
173
174         /* check for the simplest null conversion */
175         if (strcmp(fromcode, tocode) == 0) {
176                 ret->direct = iconv_copy;
177                 return ret;
178         }
179
180         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
181                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
182                         from = &builtin_functions[i];
183                 }
184                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
185                         to = &builtin_functions[i];
186                 }
187         }
188
189         if (from == NULL) {
190                 for (from=charsets; from; from=from->next) {
191                         if (strcasecmp(from->name, fromcode) == 0) break;
192                 }
193         }
194
195         if (to == NULL) {
196                 for (to=charsets; to; to=to->next) {
197                         if (strcasecmp(to->name, tocode) == 0) break;
198                 }
199         }
200
201 #ifdef HAVE_NATIVE_ICONV
202         if (!from) {
203                 ret->pull = sys_iconv;
204                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
205                 if (ret->cd_pull == (iconv_t)-1)
206                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
207                 if (ret->cd_pull == (iconv_t)-1) goto failed;
208         }
209
210         if (!to) {
211                 ret->push = sys_iconv;
212                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
213                 if (ret->cd_push == (iconv_t)-1)
214                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
215                 if (ret->cd_push == (iconv_t)-1) goto failed;
216         }
217 #else
218         if (!from || !to) {
219                 goto failed;
220         }
221 #endif
222
223         /* check for conversion to/from ucs2 */
224         if (is_utf16(fromcode) && to) {
225                 ret->direct = to->push;
226                 return ret;
227         }
228         if (is_utf16(tocode) && from) {
229                 ret->direct = from->pull;
230                 return ret;
231         }
232
233 #ifdef HAVE_NATIVE_ICONV
234         if (is_utf16(fromcode)) {
235                 ret->direct = sys_iconv;
236                 ret->cd_direct = ret->cd_push;
237                 ret->cd_push = NULL;
238                 return ret;
239         }
240         if (is_utf16(tocode)) {
241                 ret->direct = sys_iconv;
242                 ret->cd_direct = ret->cd_pull;
243                 ret->cd_pull = NULL;
244                 return ret;
245         }
246 #endif
247
248         /* the general case has to go via a buffer */
249         if (!ret->pull) ret->pull = from->pull;
250         if (!ret->push) ret->push = to->push;
251         return ret;
252
253 failed:
254         talloc_free(ret);
255         errno = EINVAL;
256         return (smb_iconv_t)-1;
257 }
258
259 /*
260   simple iconv_close() wrapper
261 */
262 int smb_iconv_close(smb_iconv_t cd)
263 {
264 #ifdef HAVE_NATIVE_ICONV
265         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
266         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
267         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
268 #endif
269
270         talloc_free(cd);
271         return 0;
272 }
273
274
275 /**********************************************************************
276  the following functions implement the builtin character sets in Samba
277  and also the "test" character sets that are designed to test
278  multi-byte character set support for english users
279 ***********************************************************************/
/* Pull converter for ASCII: every input byte becomes one two-byte output
   unit (the byte itself followed by a zero byte).
   Returns 0 when all input was consumed, otherwise -1 with errno set to
   E2BIG (output space ran out).  cd is unused. */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		*dst++ = *src++;
		*dst++ = 0;
	}

	/* publish the progress made, whether or not we finished */
	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
299
/* Push converter for ASCII: each two-byte input unit yields one output
   byte, the first byte of the unit masked to 7 bits.
   Returns the number of units whose second byte was non-zero, or -1
   with errno set to EINVAL (a single trailing input byte is left) or
   E2BIG (output space ran out).  cd is unused. */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int lossy_units = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		**outbuf = unit[0] & 0x7F;
		if (unit[1] != 0) {
			lossy_units++;
		}

		*inbuf += 2;
		*outbuf += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	if (*inbytesleft == 0) {
		return lossy_units;
	}

	/* one stray byte is an incomplete unit; more means no room left */
	errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
	return -1;
}
326
327
/* Decode one hexadecimal digit.  Returns its value (0..15), or -1 if ch
   is not a hexadecimal digit. */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/* Pull converter for the UCS2-HEX test charset.
   Any byte other than '@' is copied as a two-byte unit (byte, 0).  An
   '@' introduces an escape of exactly four hexadecimal digits giving a
   16-bit value, which is stored low byte first.

   The digits are parsed by hand rather than with sscanf("%04x"): the
   input is not NUL terminated, and sscanf would skip leading white
   space (possibly reading beyond the escape), accept a sign or "0x"
   prefix, and accept fewer than four digits.

   Returns 0 when all input was consumed, otherwise -1 with errno set to
   EINVAL (escape truncated by end of input), EILSEQ (malformed escape)
   or E2BIG (output space ran out).  cd is unused. */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}
		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		/* all four characters after the '@' must be hex digits */
		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);

			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v&0xff;
		(*outbuf)[1] = v>>8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}
	
	return 0;
}
370
/* Push converter for the UCS2-HEX test charset.
   A two-byte unit whose second byte is zero and whose first byte has the
   high bit clear and is not '@' is written as that single byte.  Every
   other unit is written as the five characters "@%04x" of SVAL() of the
   unit.
   Returns 0 when all input was consumed, otherwise -1 with errno set to
   E2BIG (output space ran out) or EINVAL (a single trailing input byte
   is left).  cd is unused. */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		int is_plain = (unit[1] == 0) &&
			       ((unit[0] & 0x80) == 0) &&
			       (unit[0] != '@');
		char escaped[6];

		if (is_plain) {
			**outbuf = unit[0];
			*inbuf += 2;
			*outbuf += 1;
			*inbytesleft -= 2;
			*outbytesleft -= 1;
			continue;
		}

		/* an escape needs five output bytes */
		if (*outbytesleft < 5) {
			errno = E2BIG;
			return -1;
		}

		/* format with the terminating NUL, but only emit the
		   five visible characters */
		snprintf(escaped, sizeof(escaped), "@%04x", SVAL(*inbuf, 0));
		memcpy(*outbuf, escaped, 5);

		*inbuf += 2;
		*outbuf += 5;
		*inbytesleft -= 2;
		*outbytesleft -= 5;
	}

	if (*inbytesleft == 0) {
		return 0;
	}

	/* one stray byte is an incomplete unit; more means no room left */
	errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
	return -1;
}
411
/* Converter that swaps the two bytes of every 16-bit unit (used for the
   big-endian UCS-2/UTF-16 charsets).

   min(*inbytesleft, *outbytesleft) bytes are processed.  If that count
   is odd, the last output byte is set to zero and the odd input byte is
   still consumed.

   The length is kept in a size_t so that large buffers are not
   truncated, and the pairs are swapped through temporaries so the
   conversion also works when input and output are the same buffer
   (swab() leaves overlapping copies undefined).

   Returns 0 when all input was consumed, otherwise -1 with errno set to
   E2BIG.  cd is unused. */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t even = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < even; i += 2) {
		char first = src[i];
		char second = src[i + 1];

		dst[i] = second;
		dst[i + 1] = first;
	}
	if (n & 1) {
		dst[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
436
437
/* Converter that copies bytes unchanged (used for the null conversion
   and for the little-endian UCS-2/UTF-16 charsets).

   min(*inbytesleft, *outbytesleft) bytes are copied with memmove(), so
   overlapping buffers are fine.  The length is kept in a size_t so that
   large buffers are not truncated to a (possibly negative) int.

   Returns 0 when all input was consumed, otherwise -1 with errno set to
   E2BIG.  cd is unused. */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
459
/* Pull converter for UTF-8: decodes UTF-8 into 16-bit units stored low
   byte first.  Code points of 0x10000 and above are written as a
   surrogate pair (four output bytes).

   Four-byte sequences that encode a value below 0x10000 are accepted and
   written as a single unit.  Four-byte sequences that encode a value
   above 0x10FFFF are rejected with EILSEQ: they cannot be expressed as a
   surrogate pair, and splitting them anyway would emit a low surrogate
   in the leading position.

   On return the in/out pointers and counters describe exactly what was
   consumed and produced, for both success and failure.

   Returns 0 when all input was consumed, otherwise -1 with errno set to
   EILSEQ (bad or truncated multi-byte sequence), EINVAL (unsupported
   lead byte) or E2BIG (output space ran out).  cd is unused. */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = 0;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte (7 bit) character */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence: 11 bits of payload */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence: 16 bits of payload */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 || 
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence: 21 bits of payload */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 || 
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint = 
				(c[3]&0x3f) | 
				((c[2]&0x3f)<<6) | 
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10FFFF) {
				/* outside the range a surrogate pair
				   can represent */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate 0xD800 | top 10 bits, then low
			   surrogate 0xDC00 | bottom 10 bits, each stored
			   low byte first */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = ((codepoint>>18) & 0x3) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	goto done;

error:
	ret = (size_t)-1;

done:
	/* report progress to the caller on success and failure alike */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
576
/* Push converter for UTF-8: encodes 16-bit units (stored low byte
   first) as UTF-8.  A high surrogate followed by a low surrogate is
   combined into one four-byte sequence.

   On return the in/out pointers and counters describe exactly what was
   consumed and produced, for both success and failure.

   Returns 0 when all input was consumed, otherwise -1 with errno set to
   E2BIG (output space ran out), EINVAL (input ends inside a unit or a
   surrogate pair) or EILSEQ (misplaced surrogate).  cd is unused. */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	size_t status = 0;

	while (in_left >= 2 && out_left >= 1) {
		const uint8_t lo = src[0];
		const uint8_t hi = src[1];
		const unsigned int unit = lo | ((unsigned int)hi << 8);
		unsigned int codepoint;

		/* simplest case: 7 bit character, one output byte */
		if (hi == 0 && (lo & 0x80) == 0) {
			dst[0] = lo;
			src += 2;
			dst += 1;
			in_left -= 2;
			out_left -= 1;
			continue;
		}

		/* next simplest case: below 0x800, two output bytes */
		if ((hi & 0xf8) == 0) {
			if (out_left < 2) {
				errno = E2BIG;
				goto error;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			dst += 2;
			in_left -= 2;
			out_left -= 2;
			continue;
		}

		/* a low surrogate must never come first */
		if ((hi & 0xfc) == 0xdc) {
			errno = (in_left < 4) ? EINVAL : EILSEQ;
			goto error;
		}

		/* anything that is not a high surrogate: three bytes */
		if ((hi & 0xfc) != 0xd8) {
			if (out_left < 3) {
				errno = E2BIG;
				goto error;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			dst += 3;
			in_left -= 2;
			out_left -= 3;
			continue;
		}

		/* high surrogate: the low surrogate has to follow */
		if (in_left < 4) {
			errno = EINVAL;
			goto error;
		}
		if ((src[3] & 0xfc) != 0xdc) {
			errno = EILSEQ;
			goto error;
		}

		/* ten payload bits from each half, offset by 0x10000 */
		codepoint = 0x10000 +
			(((unit & 0x3ff) << 10) |
			 ((unsigned int)(src[3] & 0x3) << 8) |
			 src[2]);

		if (out_left < 4) {
			errno = E2BIG;
			goto error;
		}
		dst[0] = 0xf0 | (codepoint >> 18);
		dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
		dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
		dst[3] = 0x80 | (codepoint & 0x3f);
		src += 4;
		dst += 4;
		in_left -= 4;
		out_left -= 4;
	}

	if (in_left == 0) {
		goto done;
	}

	/* one stray byte is an incomplete unit; more means no room left */
	errno = (in_left == 1) ? EINVAL : E2BIG;

error:
	status = (size_t)-1;

done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)src;
	*outbuf = (char *)dst;
	return status;
}
690
691
692