allow all ucs2 chars in utf8, rather than mapping some to a single
[samba.git] / source / lib / iconv.c
1 /* 
2    Unix SMB/Netbios implementation.
3    Version 3.0
4    minimal iconv implementation
5    Copyright (C) Andrew Tridgell 2001
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 #include "includes.h"
23
static size_t ascii_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t weird_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t weird_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);

/*
  Table of the built-in character sets. For each charset we have
    pull - converts from that charset into a ucs2 buffer
    push - converts from a ucs2 buffer into that charset

  Entry 0 must stay UCS-2LE: smb_iconv_open() treats index 0 as the
  "already ucs2" side of a conversion. The table is terminated by an
  entry whose name is NULL.
*/
static struct {
        const char *name;       /* charset name, matched case-insensitively */
        size_t (*pull)(void *, char **inbuf, size_t *inbytesleft,
                       char **outbuf, size_t *outbytesleft);
        size_t (*push)(void *, char **inbuf, size_t *inbytesleft,
                       char **outbuf, size_t *outbytesleft);
} charsets[] = {
        {"UCS-2LE",  iconv_copy, iconv_copy},
        {"UTF8",   utf8_pull,  utf8_push},
        {"ASCII", ascii_pull, ascii_push},
        {"WEIRD", weird_pull, weird_push},
        {"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
        {NULL, NULL, NULL}
};
52
53
/*
  Wrapper around the system iconv().

  If the conversion fails then the internal state of the descriptor is
  reset; this ensures that we don't have a shift state remaining for
  character sets like SJIS. The errno value of the failed conversion is
  saved and restored around the reset call, because callers look at
  errno (e.g. E2BIG) to decide how to continue and the reset call is
  free to overwrite it.

  Without HAVE_NATIVE_ICONV this always fails with errno set to EINVAL.

  Returns whatever iconv() returned, or (size_t)-1 on error.
*/
static size_t sys_iconv(void *cd, 
                        char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
#ifdef HAVE_NATIVE_ICONV
        size_t ret = iconv((iconv_t)cd, 
                           inbuf, inbytesleft, 
                           outbuf, outbytesleft);
        if (ret == (size_t)-1) {
                int saved_errno = errno;

                /* all-NULL arguments put the descriptor back in its initial state */
                iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
                errno = saved_errno;
        }
        return ret;
#else
        errno = EINVAL;
        return (size_t)-1;
#endif
}
72
73 /*
74   this is a simple portable iconv() implementaion. It only knows about
75   a very small number of character sets - just enough that Samba works
76   on systems that don't have iconv
77  */
78 size_t smb_iconv(smb_iconv_t cd, 
79                  char **inbuf, size_t *inbytesleft,
80                  char **outbuf, size_t *outbytesleft)
81 {
82         char cvtbuf[2048];
83         char *bufp = cvtbuf;
84         size_t bufsize;
85
86         /* in many cases we can go direct */
87         if (cd->direct) {
88                 return cd->direct(cd->cd_direct, 
89                                   inbuf, inbytesleft, outbuf, outbytesleft);
90         }
91
92
93         /* otherwise we have to do it chunks at a time */
94         while (*inbytesleft > 0) {
95                 bufp = cvtbuf;
96                 bufsize = sizeof(cvtbuf);
97                 
98                 if (cd->pull(cd->cd_pull, 
99                              inbuf, inbytesleft, &bufp, &bufsize) == -1
100                     && errno != E2BIG) return -1;
101
102                 bufp = cvtbuf;
103                 bufsize = sizeof(cvtbuf) - bufsize;
104
105                 if (cd->push(cd->cd_push, 
106                              &bufp, &bufsize, 
107                              outbuf, outbytesleft) == -1) return -1;
108         }
109
110         return 0;
111 }
112
113 /*
114   simple iconv_open() wrapper
115  */
116 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
117 {
118         smb_iconv_t ret;
119         int from, to;
120
121         ret = (smb_iconv_t)malloc(sizeof(*ret));
122         if (!ret) {
123                 errno = ENOMEM;
124                 return (smb_iconv_t)-1;
125         }
126         memset(ret, 0, sizeof(*ret));
127
128         /* check for the simplest null conversion */
129         if (strcmp(fromcode, tocode) == 0) {
130                 ret->direct = iconv_copy;
131                 return ret;
132         }
133
134         for (from=0; charsets[from].name; from++) {
135                 if (strcasecmp(charsets[from].name, fromcode) == 0) break;
136         }
137         for (to=0; charsets[to].name; to++) {
138                 if (strcasecmp(charsets[to].name, tocode) == 0) break;
139         }
140
141 #ifdef HAVE_NATIVE_ICONV
142         if (!charsets[from].name) {
143                 ret->pull = sys_iconv;
144                 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
145                 if (ret->cd_pull == (iconv_t)-1) goto failed;
146         }
147         if (!charsets[to].name) {
148                 ret->push = sys_iconv;
149                 ret->cd_push = iconv_open(tocode, "UCS-2LE");
150                 if (ret->cd_push == (iconv_t)-1) goto failed;
151         }
152 #else
153         if (!charsets[from].name || !charsets[to].name) {
154                 goto failed;
155         }
156 #endif
157
158         /* check for conversion to/from ucs2 */
159         if (from == 0 && charsets[to].name) {
160                 ret->direct = charsets[to].push;
161                 return ret;
162         }
163         if (to == 0 && charsets[from].name) {
164                 ret->direct = charsets[from].pull;
165                 return ret;
166         }
167
168 #ifdef HAVE_NATIVE_ICONV
169         if (from == 0) {
170                 ret->direct = sys_iconv;
171                 ret->cd_direct = ret->cd_push;
172                 ret->cd_push = NULL;
173                 return ret;
174         }
175         if (to == 0) {
176                 ret->direct = sys_iconv;
177                 ret->cd_direct = ret->cd_pull;
178                 ret->cd_pull = NULL;
179                 return ret;
180         }
181 #endif
182
183         /* the general case has to go via a buffer */
184         if (!ret->pull) ret->pull = charsets[from].pull;
185         if (!ret->push) ret->push = charsets[to].push;
186         return ret;
187
188 failed:
189         SAFE_FREE(ret);
190         errno = EINVAL;
191         return (smb_iconv_t)-1;
192 }
193
194 /*
195   simple iconv_close() wrapper
196 */
197 int smb_iconv_close (smb_iconv_t cd)
198 {
199 #ifdef HAVE_NATIVE_ICONV
200         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
201         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
202         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
203 #endif
204
205         memset(cd, 0, sizeof(*cd));
206         SAFE_FREE(cd);
207         return 0;
208 }
209
210
211 /**********************************************************************
212  the following functions implement the builtin character sets in Samba
213  and also the "test" character sets that are designed to test
214  multi-byte character set support for english users
215 ***********************************************************************/
216
/*
  ASCII -> ucs2: every input byte becomes two output bytes, the byte
  itself followed by a zero byte.

  The buffer pointers and remaining counts are advanced as data is
  consumed/produced. Returns 0 when all input was converted, or -1 with
  errno set to E2BIG when the output ran out of room first.
*/
static size_t ascii_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft > 0 && *outbytesleft > 1) {
                char *dst = *outbuf;

                dst[0] = **inbuf;
                dst[1] = 0;

                *inbuf += 1;
                *inbytesleft -= 1;
                *outbuf += 2;
                *outbytesleft -= 2;
        }

        if (*inbytesleft == 0) {
                return 0;
        }

        /* input left over: the output buffer was too small */
        errno = E2BIG;
        return (size_t)-1;
}
236
/*
  ucs2 -> ASCII: the first byte of every two byte input character is
  copied to the output. Characters whose second byte is non-zero are
  still written, but they are counted.

  Returns the number of such counted characters on success. Returns -1
  with errno EINVAL when a single trailing input byte is left over, or
  with errno E2BIG when the output ran out of room.
*/
static size_t ascii_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        int lossy = 0;

        while (*inbytesleft > 1 && *outbytesleft > 0) {
                const char *src = *inbuf;

                **outbuf = src[0];
                if (src[1] != 0) {
                        lossy++;
                }

                *inbuf += 2;
                *inbytesleft -= 2;
                *outbuf += 1;
                *outbytesleft -= 1;
        }

        if (*inbytesleft == 0) {
                return lossy;
        }

        /* one stray byte is half a character, anything more means
           the output buffer filled up */
        errno = (*inbytesleft == 1) ? EINVAL : E2BIG;
        return (size_t)-1;
}
263
264
/* value of one hex digit, or -1 if ch is not a hex digit */
static int ucs2hex_digit(char ch)
{
        if (ch >= '0' && ch <= '9') return ch - '0';
        if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
        return -1;
}

/*
  UCS2-HEX -> ucs2. Any byte other than '@' is copied through as a
  character with a zero second byte. '@' introduces an escape made of
  exactly four hex digits giving the 16 bit character value, which is
  stored low byte first.

  The digits are parsed by hand rather than with sscanf(): the input is
  a counted buffer that need not be NUL terminated, and sscanf() would
  also accept short digit runs, leading whitespace, signs and "0x"
  while five bytes were still being consumed.

  Returns 0 when all input was converted. Returns -1 with errno set to
    EINVAL - fewer than 5 bytes remain for an '@' escape
    EILSEQ - the four bytes after '@' are not all hex digits
    E2BIG  - the output ran out of room
*/
static size_t ucs2hex_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned v = 0;
                int i;

                if ((*inbuf)[0] != '@') {
                        /* seven bit ascii case */
                        (*outbuf)[0] = (*inbuf)[0];
                        (*outbuf)[1] = 0;
                        (*inbytesleft)  -= 1;
                        (*outbytesleft) -= 2;
                        (*inbuf)  += 1;
                        (*outbuf) += 2;
                        continue;
                }
                /* it's a hex character */
                if (*inbytesleft < 5) {
                        errno = EINVAL;
                        return (size_t)-1;
                }

                for (i = 1; i <= 4; i++) {
                        int digit = ucs2hex_digit((*inbuf)[i]);

                        if (digit < 0) {
                                errno = EILSEQ;
                                return (size_t)-1;
                        }
                        v = (v << 4) | (unsigned)digit;
                }

                (*outbuf)[0] = (char)(v & 0xff);
                (*outbuf)[1] = (char)(v >> 8);
                (*inbytesleft)  -= 5;
                (*outbytesleft) -= 2;
                (*inbuf)  += 5;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return (size_t)-1;
        }
        
        return 0;
}
307
/*
  ucs2 -> UCS2-HEX. A character is written as a single plain byte when
  its second byte is zero, its first byte has the top bit clear and it
  is not '@'. Every other character is written as the five byte escape
  "@xxxx" built from SVAL(*inbuf, 0).

  Returns 0 when all input was converted. Returns -1 with errno E2BIG
  when the output cannot take the next character, or with errno EINVAL
  when a single trailing input byte is left over.
*/
static size_t ucs2hex_push(void *cd, char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                char hex[6];
                int needs_escape = (*inbuf)[1] != 0 ||
                                   ((*inbuf)[0] & 0x80) != 0 ||
                                   (*inbuf)[0] == '@';

                if (!needs_escape) {
                        **outbuf = (*inbuf)[0];
                        *inbuf += 2;
                        *inbytesleft -= 2;
                        *outbuf += 1;
                        *outbytesleft -= 1;
                        continue;
                }

                /* an escape needs five bytes of output */
                if (*outbytesleft < 5) {
                        errno = E2BIG;
                        return (size_t)-1;
                }

                /* format into a scratch buffer so the terminating NUL
                   never lands in the caller's output */
                snprintf(hex, sizeof(hex), "@%04x", SVAL(*inbuf, 0));
                memcpy(*outbuf, hex, 5);

                *inbuf += 2;
                *inbytesleft -= 2;
                *outbuf += 5;
                *outbytesleft -= 5;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return (size_t)-1;
        }
        if (*inbytesleft > 1) {
                errno = E2BIG;
                return (size_t)-1;
        }

        return 0;
}
348
349
/*
  The "weird" character set is very useful for testing multi-byte
  support and finding bugs. Don't use on a production system!

  Each entry maps one single byte character ("from") to a multi-byte
  sequence ("to") that is "len" bytes long. The table ends at the entry
  whose "from" is 0.
*/
static struct {
        char from;
        char *to;
        int len;
} weird_table[] = {
        { 'q', "^q^", 3 },
        { 'Q', "^Q^", 3 },
        { 0, NULL, 0 }
};
362
363 static size_t weird_pull(void *cd, char **inbuf, size_t *inbytesleft,
364                          char **outbuf, size_t *outbytesleft)
365 {
366         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
367                 int i;
368                 int done = 0;
369                 for (i=0;weird_table[i].from;i++) {
370                         if (strncmp((*inbuf), 
371                                     weird_table[i].to, 
372                                     weird_table[i].len) == 0) {
373                                 if (*inbytesleft < weird_table[i].len) {
374                                         DEBUG(0,("ERROR: truncated weird string\n"));
375                                         /* smb_panic("weird_pull"); */
376
377                                 } else {
378                                         (*outbuf)[0] = weird_table[i].from;
379                                         (*outbuf)[1] = 0;
380                                         (*inbytesleft)  -= weird_table[i].len;
381                                         (*outbytesleft) -= 2;
382                                         (*inbuf)  += weird_table[i].len;
383                                         (*outbuf) += 2;
384                                         done = 1;
385                                         break;
386                                 }
387                         }
388                 }
389                 if (done) continue;
390                 (*outbuf)[0] = (*inbuf)[0];
391                 (*outbuf)[1] = 0;
392                 (*inbytesleft)  -= 1;
393                 (*outbytesleft) -= 2;
394                 (*inbuf)  += 1;
395                 (*outbuf) += 2;
396         }
397
398         if (*inbytesleft > 0) {
399                 errno = E2BIG;
400                 return -1;
401         }
402         
403         return 0;
404 }
405
406 static size_t weird_push(void *cd, char **inbuf, size_t *inbytesleft,
407                          char **outbuf, size_t *outbytesleft)
408 {
409         int ir_count=0;
410
411         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
412                 int i;
413                 int done=0;
414                 for (i=0;weird_table[i].from;i++) {
415                         if ((*inbuf)[0] == weird_table[i].from &&
416                             (*inbuf)[1] == 0) {
417                                 if (*outbytesleft < weird_table[i].len) {
418                                         DEBUG(0,("No room for weird character\n"));
419                                         /* smb_panic("weird_push"); */
420                                 } else {
421                                         memcpy(*outbuf, weird_table[i].to, 
422                                                weird_table[i].len);
423                                         (*inbytesleft)  -= 2;
424                                         (*outbytesleft) -= weird_table[i].len;
425                                         (*inbuf)  += 2;
426                                         (*outbuf) += weird_table[i].len;
427                                         done = 1;
428                                         break;
429                                 }
430                         }
431                 }
432                 if (done) continue;
433
434                 (*outbuf)[0] = (*inbuf)[0];
435                 if ((*inbuf)[1]) ir_count++;
436                 (*inbytesleft)  -= 2;
437                 (*outbytesleft) -= 1;
438                 (*inbuf)  += 2;
439                 (*outbuf) += 1;
440         }
441
442         if (*inbytesleft == 1) {
443                 errno = EINVAL;
444                 return -1;
445         }
446
447         if (*inbytesleft > 1) {
448                 errno = E2BIG;
449                 return -1;
450         }
451         
452         return ir_count;
453 }
454
/*
  Null conversion: copy as many bytes as fit from the input to the
  output. memmove() is used, so the two buffers may overlap.

  The byte count is kept in a size_t; holding it in an int would
  truncate (or turn negative) for buffers larger than INT_MAX.

  Returns 0 when all input was copied, or -1 with errno set to E2BIG
  when the output was too small to take everything.
*/
static size_t iconv_copy(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n;

        n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

        memmove(*outbuf, *inbuf, n);

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return (size_t)-1;
        }

        return 0;
}
476
/*
  UTF-8 -> ucs2 (two bytes per character, low byte first). One, two
  and three byte UTF-8 sequences are decoded.

  A lead byte that starts none of those forms (a stray continuation
  byte 0x80-0xBF, or 0xF0 and above) is rejected. Letting it through
  would advance the output by two bytes that were never written.

  Returns 0 when all input was converted. Returns -1 with errno set to
    EINVAL - the input ends in the middle of a multi-byte sequence
    EILSEQ - the lead byte is not valid here
    E2BIG  - the output ran out of room
*/
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned char *c = (unsigned char *)*inbuf;
                unsigned char *uc = (unsigned char *)*outbuf;
                int len = 1;

                if ((c[0] & 0x80) == 0) {
                        /* 0xxxxxxx */
                        uc[0] = c[0];
                        uc[1] = 0;
                } else if ((c[0] & 0xf0) == 0xe0) {
                        /* 1110xxxx 10xxxxxx 10xxxxxx */
                        if (*inbytesleft < 3) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                        uc[0] = (c[1]<<6) | (c[2]&0x3f);
                        len = 3;
                } else if ((c[0] & 0xe0) == 0xc0) {
                        /* 110xxxxx 10xxxxxx */
                        if (*inbytesleft < 2) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        uc[1] = (c[0]>>2) & 0x7;
                        uc[0] = (c[0]<<6) | (c[1]&0x3f);
                        len = 2;
                } else {
                        /* not a lead byte we can decode */
                        errno = EILSEQ;
                        return (size_t)-1;
                }

                (*inbuf)  += len;
                (*inbytesleft)  -= len;
                (*outbytesleft) -= 2;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return (size_t)-1;
        }
        
        return 0;

badseq:
        errno = EINVAL;
        return (size_t)-1;
}
523
/*
  ucs2 (two bytes per character, low byte first) -> UTF-8. The length
  of the encoding follows from the bits that are set in the character:
    any of the top five bits of the high byte set   -> three bytes
    otherwise high byte non-zero or low byte >= 0x80 -> two bytes
    otherwise                                         -> one byte

  Returns 0 when all input was converted. Returns -1 with errno E2BIG
  when the output cannot take the next character, or with errno EINVAL
  when a single trailing input byte is left over.
*/
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                unsigned char *dst = (unsigned char *)*outbuf;
                unsigned char *src = (unsigned char *)*inbuf;
                int enc_len;

                /* work out how long the encoding is */
                if (src[1] & 0xf8) {
                        enc_len = 3;
                } else if (src[1] | (src[0] & 0x80)) {
                        enc_len = 2;
                } else {
                        enc_len = 1;
                }

                /* the loop condition already guarantees one byte */
                if (enc_len > 1 && *outbytesleft < (size_t)enc_len) {
                        DEBUG(0,("short utf8 write\n"));
                        errno = E2BIG;
                        return (size_t)-1;
                }

                switch (enc_len) {
                case 3:
                        dst[0] = 0xe0 | (src[1]>>4);
                        dst[1] = 0x80 | ((src[1]&0xF)<<2) | (src[0]>>6);
                        dst[2] = 0x80 | (src[0]&0x3f);
                        break;
                case 2:
                        dst[0] = 0xc0 | (src[1]<<2) | (src[0]>>6);
                        dst[1] = 0x80 | (src[0]&0x3f);
                        break;
                default:
                        dst[0] = src[0];
                        break;
                }

                *inbuf += 2;
                *inbytesleft -= 2;
                *outbuf += enc_len;
                *outbytesleft -= enc_len;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return (size_t)-1;
        }
        if (*inbytesleft > 1) {
                errno = E2BIG;
                return (size_t)-1;
        }

        return 0;
}
576