Add smb_register_charset() and use it
[samba.git] / source / lib / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 #include "includes.h"
23
24 static size_t ascii_pull(void *,char **, size_t *, char **, size_t *);
25 static size_t ascii_push(void *,char **, size_t *, char **, size_t *);
26 static size_t  utf8_pull(void *,char **, size_t *, char **, size_t *);
27 static size_t  utf8_push(void *,char **, size_t *, char **, size_t *);
28 static size_t weird_pull(void *,char **, size_t *, char **, size_t *);
29 static size_t weird_push(void *,char **, size_t *, char **, size_t *);
30 static size_t ucs2hex_pull(void *,char **, size_t *, char **, size_t *);
31 static size_t ucs2hex_push(void *,char **, size_t *, char **, size_t *);
32 static size_t iconv_copy(void *,char **, size_t *, char **, size_t *);
33
34 struct charset_functions builtin_functions[] = {
35                 {"UCS-2LE",  iconv_copy, iconv_copy},
36                 {"UTF8",   utf8_pull,  utf8_push},
37                 {"ASCII", ascii_pull, ascii_push},
38                 {"WEIRD", weird_pull, weird_push},
39                 {"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
40                 {NULL, NULL, NULL}
41 };
42
43 static struct charset_functions *charsets = NULL;
44
45 BOOL smb_register_charset(struct charset_functions *funcs) 
46 {
47         struct charset_functions *c = charsets;
48
49         DEBUG(5, ("Attempting to register new charset %s\n", funcs->name));
50         /* Check whether we already have this charset... */
51         while(c) {
52                 if(!strcasecmp(c->name, funcs->name)){ 
53                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
54                         return False;
55                 }
56                 c = c->next;
57         }
58
59         funcs->next = funcs->prev = NULL;
60         DEBUG(5, ("Registered charset %s\n", c->name));
61         DLIST_ADD(charsets, funcs);
62         return True;
63 }
64
65 void lazy_initialize_iconv(void)
66 {
67         static BOOL initialized = False;
68         int i;
69
70         if (!initialized) {
71                 initialized = True;
72                 for(i = 0; builtin_functions[i].name; i++) 
73                         smb_register_charset(&builtin_functions[i]);
74         }
75 }
76
/* wrapper around the system iconv().

   if there was an error then reset the internal state,
   this ensures that we don't have a shift state remaining for
   character sets like SJIS. errno from the failed conversion is
   restored after the reset, so the caller still sees why the
   conversion failed (smb_iconv() tests it for E2BIG).

   without HAVE_NATIVE_ICONV this always fails with errno = EINVAL */
static size_t sys_iconv(void *cd, 
                        char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
#ifdef HAVE_NATIVE_ICONV
        size_t ret = iconv((iconv_t)cd, 
                           inbuf, inbytesleft, 
                           outbuf, outbytesleft);
        if (ret == (size_t)-1) {
                int saved_errno = errno;

                iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
                errno = saved_errno;
        }
        return ret;
#else
        errno = EINVAL;
        return (size_t)-1;
#endif
}
95
96 /*
97   this is a simple portable iconv() implementaion. It only knows about
98   a very small number of character sets - just enough that Samba works
99   on systems that don't have iconv
100  */
101 size_t smb_iconv(smb_iconv_t cd, 
102                  const char **inbuf, size_t *inbytesleft,
103                  char **outbuf, size_t *outbytesleft)
104 {
105         char cvtbuf[2048];
106         char *bufp = cvtbuf;
107         size_t bufsize;
108
109         /* in many cases we can go direct */
110         if (cd->direct) {
111                 return cd->direct(cd->cd_direct, 
112                                   (char **)inbuf, inbytesleft, outbuf, outbytesleft);
113         }
114
115
116         /* otherwise we have to do it chunks at a time */
117         while (*inbytesleft > 0) {
118                 bufp = cvtbuf;
119                 bufsize = sizeof(cvtbuf);
120                 
121                 if (cd->pull(cd->cd_pull, 
122                              (char **)inbuf, inbytesleft, &bufp, &bufsize) == -1
123                     && errno != E2BIG) return -1;
124
125                 bufp = cvtbuf;
126                 bufsize = sizeof(cvtbuf) - bufsize;
127
128                 if (cd->push(cd->cd_push, 
129                              &bufp, &bufsize, 
130                              outbuf, outbytesleft) == -1) return -1;
131         }
132
133         return 0;
134 }
135
136 /*
137   simple iconv_open() wrapper
138  */
139 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
140 {
141         smb_iconv_t ret;
142         struct charset_functions *from, *to;
143         
144         lazy_initialize_iconv();
145         from = charsets;
146         to = charsets;
147
148         ret = (smb_iconv_t)malloc(sizeof(*ret));
149         if (!ret) {
150                 errno = ENOMEM;
151                 return (smb_iconv_t)-1;
152         }
153         memset(ret, 0, sizeof(*ret));
154
155         ret->from_name = strdup(fromcode);
156         ret->to_name = strdup(tocode);
157
158         /* check for the simplest null conversion */
159         if (strcmp(fromcode, tocode) == 0) {
160                 ret->direct = iconv_copy;
161                 return ret;
162         }
163
164         while (from) {
165                 if (strcasecmp(from->name, fromcode) == 0) break;
166                 from = from->next;
167         }
168
169         while (to) {
170                 if (strcasecmp(to->name, tocode) == 0) break;
171                 to = to->next;
172         }
173
174 #ifdef HAVE_NATIVE_ICONV
175         if (!from) {
176                 ret->pull = sys_iconv;
177                 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
178                 if (ret->cd_pull == (iconv_t)-1) goto failed;
179         }
180
181         if (!to) {
182                 ret->push = sys_iconv;
183                 ret->cd_push = iconv_open(tocode, "UCS-2LE");
184                 if (ret->cd_push == (iconv_t)-1) goto failed;
185         }
186 #else
187         if (!from || !to) {
188                 goto failed;
189         }
190 #endif
191
192         /* check for conversion to/from ucs2 */
193         if (strcasecmp(fromcode, "UCS-2LE") == 0 && to) {
194                 ret->direct = to->push;
195                 return ret;
196         }
197         if (strcasecmp(tocode, "UCS-2LE") == 0 && from) {
198                 ret->direct = from->pull;
199                 return ret;
200         }
201
202 #ifdef HAVE_NATIVE_ICONV
203         if (strcasecmp(fromcode, "UCS-2LE") == 0) {
204                 ret->direct = sys_iconv;
205                 ret->cd_direct = ret->cd_push;
206                 ret->cd_push = NULL;
207                 return ret;
208         }
209         if (strcasecmp(tocode, "UCS-2LE") == 0) {
210                 ret->direct = sys_iconv;
211                 ret->cd_direct = ret->cd_pull;
212                 ret->cd_pull = NULL;
213                 return ret;
214         }
215 #endif
216
217         /* the general case has to go via a buffer */
218         if (!ret->pull) ret->pull = from->pull;
219         if (!ret->push) ret->push = to->push;
220         return ret;
221
222 failed:
223         SAFE_FREE(ret);
224         errno = EINVAL;
225         return (smb_iconv_t)-1;
226 }
227
228 /*
229   simple iconv_close() wrapper
230 */
231 int smb_iconv_close (smb_iconv_t cd)
232 {
233 #ifdef HAVE_NATIVE_ICONV
234         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
235         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
236         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
237 #endif
238
239         SAFE_FREE(cd->from_name);
240         SAFE_FREE(cd->to_name);
241
242         memset(cd, 0, sizeof(*cd));
243         SAFE_FREE(cd);
244         return 0;
245 }
246
247
248 /**********************************************************************
249  the following functions implement the builtin character sets in Samba
250  and also the "test" character sets that are designed to test
251  multi-byte character set support for english users
252 ***********************************************************************/
253
/*
  widen single bytes to two-byte characters: each input byte is copied
  to the first output byte and the second output byte is set to 0.

  stops when the input is used up or fewer than 2 output bytes remain.
  returns 0 when all input was consumed, otherwise -1 (as a size_t)
  with errno set to E2BIG.
 */
static size_t ascii_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        for (;;) {
                char *dst;

                if (*inbytesleft < 1 || *outbytesleft < 2) {
                        break;
                }

                dst = *outbuf;
                dst[0] = **inbuf;
                dst[1] = 0;

                *inbuf += 1;
                *inbytesleft -= 1;
                *outbuf += 2;
                *outbytesleft -= 2;
        }

        if (*inbytesleft == 0) {
                return 0;
        }

        /* ran out of output space with input still pending */
        errno = E2BIG;
        return (size_t)-1;
}
273
/*
  narrow two-byte characters to single bytes: the first byte of each
  pair is written with its top bit masked off (& 0x7F), the second byte
  is dropped.

  returns the number of characters whose second byte was non-zero.
  fails with -1 (as a size_t) and errno EINVAL when a single trailing
  input byte is left, or E2BIG when whole characters are left because
  the output filled up.
 */
static size_t ascii_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        int lossy = 0;

        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const char *src = *inbuf;

                **outbuf = src[0] & 0x7F;
                if (src[1] != 0) {
                        lossy++;
                }

                *inbuf += 2;
                *inbytesleft -= 2;
                *outbuf += 1;
                *outbytesleft -= 1;
        }

        if (*inbytesleft == 1) {
                /* half a character left over */
                errno = EINVAL;
                return (size_t)-1;
        }
        if (*inbytesleft > 1) {
                errno = E2BIG;
                return (size_t)-1;
        }

        return lossy;
}
300
301
/* value of one hex digit (upper or lower case), or -1 if c is not one */
static int ucs2hex_digit(char c)
{
        if (c >= '0' && c <= '9') return c - '0';
        if (c >= 'a' && c <= 'f') return c - 'a' + 10;
        if (c >= 'A' && c <= 'F') return c - 'A' + 10;
        return -1;
}

/*
  decode the UCS2-HEX test charset into two-byte characters (low byte
  first).

  any byte other than '@' is copied to the low output byte with a zero
  high byte. '@' must be followed by exactly four hex digits giving the
  16-bit character value.

  the digits are parsed by hand rather than with sscanf("%04x"): sscanf
  needs a NUL-terminated string, skips leading white space (which can
  take it past the five bytes checked here) and accepts signs and short
  digit runs.

  returns 0 when all input was consumed, otherwise -1 (as a size_t) with
  errno set to
    EINVAL  - '@' with fewer than four bytes after it
    EILSEQ  - '@' followed by something that is not four hex digits
    E2BIG   - output space ran out
 */
static size_t ucs2hex_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned v = 0;
                int i;

                if ((*inbuf)[0] != '@') {
                        /* seven bit ascii case */
                        (*outbuf)[0] = (*inbuf)[0];
                        (*outbuf)[1] = 0;
                        (*inbytesleft)  -= 1;
                        (*outbytesleft) -= 2;
                        (*inbuf)  += 1;
                        (*outbuf) += 2;
                        continue;
                }
                /* it's a hex character */
                if (*inbytesleft < 5) {
                        errno = EINVAL;
                        return -1;
                }

                for (i = 1; i <= 4; i++) {
                        int digit = ucs2hex_digit((*inbuf)[i]);

                        if (digit < 0) {
                                errno = EILSEQ;
                                return -1;
                        }
                        v = (v << 4) | (unsigned)digit;
                }

                (*outbuf)[0] = v&0xff;
                (*outbuf)[1] = v>>8;
                (*inbytesleft)  -= 5;
                (*outbytesleft) -= 2;
                (*inbuf)  += 5;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }
        
        return 0;
}
344
/*
  encode two-byte characters into the UCS2-HEX test charset.

  a character is written as a single byte when its second byte is 0,
  its first byte has the top bit clear and it is not '@'. Everything
  else is written as '@' followed by four lower-case hex digits of
  SVAL(*inbuf, 0).

  returns 0 when all input was consumed, otherwise -1 (as a size_t)
  with errno EINVAL (a single trailing input byte) or E2BIG (output
  space ran out).
 */
static size_t ucs2hex_push(void *cd, char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const char *src = *inbuf;
                int plain = (src[1] == 0 &&
                             (src[0] & 0x80) == 0 &&
                             src[0] != '@');

                if (plain) {
                        **outbuf = src[0];
                        *outbytesleft -= 1;
                        *outbuf += 1;
                } else {
                        char hex[6];

                        /* the escaped form needs 5 output bytes */
                        if (*outbytesleft < 5) {
                                errno = E2BIG;
                                return -1;
                        }
                        snprintf(hex, 6, "@%04x", SVAL(*inbuf, 0));
                        memcpy(*outbuf, hex, 5);
                        *outbytesleft -= 5;
                        *outbuf += 5;
                }

                /* either way one two-byte character was consumed */
                *inbytesleft -= 2;
                *inbuf += 2;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return -1;
        }
        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
385
386
/* the "weird" character set is very useful for testing multi-byte
   support and finding bugs. Don't use on a production system! 

   each entry pairs a single character ('from') with the multi-byte
   sequence used for it ('to', 'len' bytes long). The table ends at the
   entry whose 'from' is 0.
*/
static struct {
        char from;
        char *to;
        int len;
} weird_table[] = {
        { 'q', "^q^", 3 },
        { 'Q', "^Q^", 3 },
        { 0,   NULL,  0 }
};
399
400 static size_t weird_pull(void *cd, char **inbuf, size_t *inbytesleft,
401                          char **outbuf, size_t *outbytesleft)
402 {
403         while (*inbytesleft >= 1 && *outbytesleft >= 2) {
404                 int i;
405                 int done = 0;
406                 for (i=0;weird_table[i].from;i++) {
407                         if (strncmp((*inbuf), 
408                                     weird_table[i].to, 
409                                     weird_table[i].len) == 0) {
410                                 if (*inbytesleft < weird_table[i].len) {
411                                         DEBUG(0,("ERROR: truncated weird string\n"));
412                                         /* smb_panic("weird_pull"); */
413
414                                 } else {
415                                         (*outbuf)[0] = weird_table[i].from;
416                                         (*outbuf)[1] = 0;
417                                         (*inbytesleft)  -= weird_table[i].len;
418                                         (*outbytesleft) -= 2;
419                                         (*inbuf)  += weird_table[i].len;
420                                         (*outbuf) += 2;
421                                         done = 1;
422                                         break;
423                                 }
424                         }
425                 }
426                 if (done) continue;
427                 (*outbuf)[0] = (*inbuf)[0];
428                 (*outbuf)[1] = 0;
429                 (*inbytesleft)  -= 1;
430                 (*outbytesleft) -= 2;
431                 (*inbuf)  += 1;
432                 (*outbuf) += 2;
433         }
434
435         if (*inbytesleft > 0) {
436                 errno = E2BIG;
437                 return -1;
438         }
439         
440         return 0;
441 }
442
443 static size_t weird_push(void *cd, char **inbuf, size_t *inbytesleft,
444                          char **outbuf, size_t *outbytesleft)
445 {
446         int ir_count=0;
447
448         while (*inbytesleft >= 2 && *outbytesleft >= 1) {
449                 int i;
450                 int done=0;
451                 for (i=0;weird_table[i].from;i++) {
452                         if ((*inbuf)[0] == weird_table[i].from &&
453                             (*inbuf)[1] == 0) {
454                                 if (*outbytesleft < weird_table[i].len) {
455                                         DEBUG(0,("No room for weird character\n"));
456                                         /* smb_panic("weird_push"); */
457                                 } else {
458                                         memcpy(*outbuf, weird_table[i].to, 
459                                                weird_table[i].len);
460                                         (*inbytesleft)  -= 2;
461                                         (*outbytesleft) -= weird_table[i].len;
462                                         (*inbuf)  += 2;
463                                         (*outbuf) += weird_table[i].len;
464                                         done = 1;
465                                         break;
466                                 }
467                         }
468                 }
469                 if (done) continue;
470
471                 (*outbuf)[0] = (*inbuf)[0];
472                 if ((*inbuf)[1]) ir_count++;
473                 (*inbytesleft)  -= 2;
474                 (*outbytesleft) -= 1;
475                 (*inbuf)  += 2;
476                 (*outbuf) += 1;
477         }
478
479         if (*inbytesleft == 1) {
480                 errno = EINVAL;
481                 return -1;
482         }
483
484         if (*inbytesleft > 1) {
485                 errno = E2BIG;
486                 return -1;
487         }
488         
489         return ir_count;
490 }
491
/*
  the null conversion: copy as many bytes as fit from input to output.

  the byte count is kept in a size_t; an int would truncate large
  lengths. memmove is used, so the two buffers may overlap.

  returns 0 when all input was copied, otherwise -1 (as a size_t) with
  errno set to E2BIG.
 */
static size_t iconv_copy(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n;

        n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

        memmove(*outbuf, *inbuf, n);

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
513
/*
  decode UTF-8 into two-byte characters (low byte first).

  one-, two- and three-byte sequences are handled, which covers every
  value that fits in 16 bits. Continuation bytes must have the form
  10xxxxxx. A stray continuation byte, or a lead byte that starts a
  longer sequence, is rejected; copying it through would leave the
  output bytes unwritten.

  returns 0 when all input was consumed, otherwise -1 (as a size_t)
  with errno set to
    EINVAL  - the input ends in the middle of a sequence
    EILSEQ  - invalid lead or continuation byte
    E2BIG   - output space ran out
 */
static size_t utf8_pull(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned char *c = (unsigned char *)*inbuf;
                unsigned char *uc = (unsigned char *)*outbuf;
                int len = 1;

                if ((c[0] & 0x80) == 0) {
                        /* 0xxxxxxx: plain 7 bit character */
                        uc[0] = c[0];
                        uc[1] = 0;
                } else if ((c[0] & 0xf0) == 0xe0) {
                        /* 1110xxxx 10xxxxxx 10xxxxxx */
                        if (*inbytesleft < 3) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80) {
                                goto illseq;
                        }
                        uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                        uc[0] = (c[1]<<6) | (c[2]&0x3f);
                        len = 3;
                } else if ((c[0] & 0xe0) == 0xc0) {
                        /* 110xxxxx 10xxxxxx */
                        if (*inbytesleft < 2) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        if ((c[1] & 0xc0) != 0x80) {
                                goto illseq;
                        }
                        uc[1] = (c[0]>>2) & 0x7;
                        uc[0] = (c[0]<<6) | (c[1]&0x3f);
                        len = 2;
                } else {
                        /* stray continuation byte, or the start of a
                           sequence too long for a 16 bit character */
                        goto illseq;
                }

                (*inbuf)  += len;
                (*inbytesleft)  -= len;
                (*outbytesleft) -= 2;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }
        
        return 0;

badseq:
        errno = EINVAL;
        return -1;

illseq:
        errno = EILSEQ;
        return -1;
}
560
/*
  encode two-byte characters (low byte first) as UTF-8.

  values of 0x800 and above take three output bytes, values from 0x80
  to 0x7ff take two, anything smaller is copied as a single byte.

  returns 0 when all input was consumed, otherwise -1 (as a size_t)
  with errno EINVAL (a single trailing input byte) or E2BIG (output
  space ran out).
 */
static size_t utf8_push(void *cd, char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const unsigned char *src = (const unsigned char *)*inbuf;
                unsigned char *dst = (unsigned char *)*outbuf;
                unsigned int v = (unsigned int)src[0] | ((unsigned int)src[1] << 8);
                int len;

                if (v >= 0x800) {
                        /* 1110xxxx 10xxxxxx 10xxxxxx */
                        if (*outbytesleft < 3) {
                                DEBUG(0,("short utf8 write\n"));
                                goto toobig;
                        }
                        dst[0] = 0xe0 | (v >> 12);
                        dst[1] = 0x80 | ((v >> 6) & 0x3f);
                        dst[2] = 0x80 | (v & 0x3f);
                        len = 3;
                } else if (v >= 0x80) {
                        /* 110xxxxx 10xxxxxx */
                        if (*outbytesleft < 2) {
                                DEBUG(0,("short utf8 write\n"));
                                goto toobig;
                        }
                        dst[0] = 0xc0 | (v >> 6);
                        dst[1] = 0x80 | (v & 0x3f);
                        len = 2;
                } else {
                        dst[0] = (unsigned char)v;
                        len = 1;
                }

                *inbuf += 2;
                *inbytesleft -= 2;
                *outbuf += len;
                *outbytesleft -= len;
        }

        if (*inbytesleft == 1) {
                errno = EINVAL;
                return -1;
        }
        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;

toobig:
        errno = E2BIG;
        return -1;
}
613