first public release of samba4 code
[bbaumbach/samba-autobuild/.git] / source4 / lib / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 2 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program; if not, write to the Free Software
19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 */
21
22 #include "includes.h"
23
24
25 /**
26  * @file
27  *
28  * @brief Samba wrapper/stub for iconv character set conversion.
29  *
30  * iconv is the XPG2 interface for converting between character
31  * encodings.  This file provides a Samba wrapper around it, and also
32  * a simple reimplementation that is used if the system does not
33  * implement iconv.
34  *
35  * Samba only works with encodings that are supersets of ASCII: ascii
36  * characters like whitespace can be tested for directly, multibyte
37  * sequences start with a byte with the high bit set, and strings are
38  * terminated by a nul byte.
39  *
40  * Note that the only function provided by iconv is conversion between
41  * characters.  It doesn't directly support operations like
42  * uppercasing or comparison.  We have to convert to UCS-2 and compare
43  * there.
44  *
45  * @sa Samba Developers Guide
46  **/
47
/*
 * Forward declarations of the builtin converters defined later in this
 * file.  They all share one iconv()-style signature so that they can be
 * stored in a struct charset_functions and called through the same
 * function pointers as the system iconv() wrapper.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft);
55
56 static struct charset_functions builtin_functions[] = {
57         {"UCS-2LE",  iconv_copy, iconv_copy},
58         {"UTF8",   utf8_pull,  utf8_push},
59         {"ASCII", ascii_pull, ascii_push},
60         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push},
61         {NULL, NULL, NULL}
62 };
63
64 static struct charset_functions *charsets = NULL;
65
66 BOOL smb_register_charset(struct charset_functions *funcs) 
67 {
68         struct charset_functions *c = charsets;
69
70         DEBUG(5, ("Attempting to register new charset %s\n", funcs->name));
71         /* Check whether we already have this charset... */
72         while(c) {
73                 if(!strcasecmp(c->name, funcs->name)){ 
74                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
75                         return False;
76                 }
77                 c = c->next;
78         }
79
80         funcs->next = funcs->prev = NULL;
81         DEBUG(5, ("Registered charset %s\n", funcs->name));
82         DLIST_ADD(charsets, funcs);
83         return True;
84 }
85
86 static void lazy_initialize_iconv(void)
87 {
88         static BOOL initialized = False;
89         int i;
90
91         if (!initialized) {
92                 initialized = True;
93                 for(i = 0; builtin_functions[i].name; i++) 
94                         smb_register_charset(&builtin_functions[i]);
95         }
96 }
97
#ifdef HAVE_NATIVE_ICONV
/*
 * Thin wrapper around the system iconv().
 *
 * If there was an error then reset the internal state; this ensures
 * that we don't have a shift state remaining for character sets like
 * SJIS.  The errno value reported by the failed conversion is preserved
 * across the reset call, because callers inspect it (for example to
 * tell E2BIG apart from other failures).
 *
 * Returns whatever iconv() returned: (size_t)-1 on error with errno
 * set, otherwise iconv()'s own result.
 */
static size_t sys_iconv(void *cd, 
                        const char **inbuf, size_t *inbytesleft,
                        char **outbuf, size_t *outbytesleft)
{
        size_t ret = iconv((iconv_t)cd, 
                           inbuf, inbytesleft, 
                           outbuf, outbytesleft);
        if (ret == (size_t)-1) {
                int saved_errno = errno;

                /* reset the conversion state without losing the error code */
                iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
                errno = saved_errno;
        }
        return ret;
}
#endif
113
114 /**
115  * This is a simple portable iconv() implementaion.
116  *
117  * It only knows about a very small number of character sets - just
118  * enough that Samba works on systems that don't have iconv.
119  **/
120 size_t smb_iconv(smb_iconv_t cd, 
121                  const char **inbuf, size_t *inbytesleft,
122                  char **outbuf, size_t *outbytesleft)
123 {
124         char cvtbuf[2048];
125         char *bufp = cvtbuf;
126         size_t bufsize;
127
128         /* in many cases we can go direct */
129         if (cd->direct) {
130                 return cd->direct(cd->cd_direct, 
131                                   inbuf, inbytesleft, outbuf, outbytesleft);
132         }
133
134
135         /* otherwise we have to do it chunks at a time */
136         while (*inbytesleft > 0) {
137                 bufp = cvtbuf;
138                 bufsize = sizeof(cvtbuf);
139                 
140                 if (cd->pull(cd->cd_pull, 
141                              inbuf, inbytesleft, &bufp, &bufsize) == -1
142                     && errno != E2BIG) return -1;
143
144                 bufp = cvtbuf;
145                 bufsize = sizeof(cvtbuf) - bufsize;
146
147                 if (cd->push(cd->cd_push, 
148                              &bufp, &bufsize, 
149                              outbuf, outbytesleft) == -1) return -1;
150         }
151
152         return 0;
153 }
154
155 /*
156   simple iconv_open() wrapper
157  */
158 smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
159 {
160         smb_iconv_t ret;
161         struct charset_functions *from, *to;
162         
163         lazy_initialize_iconv();
164         from = charsets;
165         to = charsets;
166
167         ret = (smb_iconv_t)malloc(sizeof(*ret));
168         if (!ret) {
169                 errno = ENOMEM;
170                 return (smb_iconv_t)-1;
171         }
172         memset(ret, 0, sizeof(*ret));
173
174         ret->from_name = strdup(fromcode);
175         ret->to_name = strdup(tocode);
176
177         /* check for the simplest null conversion */
178         if (strcmp(fromcode, tocode) == 0) {
179                 ret->direct = iconv_copy;
180                 return ret;
181         }
182
183         while (from) {
184                 if (strcasecmp(from->name, fromcode) == 0) break;
185                 from = from->next;
186         }
187
188         while (to) {
189                 if (strcasecmp(to->name, tocode) == 0) break;
190                 to = to->next;
191         }
192
193 #ifdef HAVE_NATIVE_ICONV
194         if (!from) {
195                 ret->pull = sys_iconv;
196                 ret->cd_pull = iconv_open("UCS-2LE", fromcode);
197                 if (ret->cd_pull == (iconv_t)-1) goto failed;
198         }
199
200         if (!to) {
201                 ret->push = sys_iconv;
202                 ret->cd_push = iconv_open(tocode, "UCS-2LE");
203                 if (ret->cd_push == (iconv_t)-1) goto failed;
204         }
205 #else
206         if (!from || !to) {
207                 goto failed;
208         }
209 #endif
210
211         /* check for conversion to/from ucs2 */
212         if (strcasecmp(fromcode, "UCS-2LE") == 0 && to) {
213                 ret->direct = to->push;
214                 return ret;
215         }
216         if (strcasecmp(tocode, "UCS-2LE") == 0 && from) {
217                 ret->direct = from->pull;
218                 return ret;
219         }
220
221 #ifdef HAVE_NATIVE_ICONV
222         if (strcasecmp(fromcode, "UCS-2LE") == 0) {
223                 ret->direct = sys_iconv;
224                 ret->cd_direct = ret->cd_push;
225                 ret->cd_push = NULL;
226                 return ret;
227         }
228         if (strcasecmp(tocode, "UCS-2LE") == 0) {
229                 ret->direct = sys_iconv;
230                 ret->cd_direct = ret->cd_pull;
231                 ret->cd_pull = NULL;
232                 return ret;
233         }
234 #endif
235
236         /* the general case has to go via a buffer */
237         if (!ret->pull) ret->pull = from->pull;
238         if (!ret->push) ret->push = to->push;
239         return ret;
240
241 failed:
242         SAFE_FREE(ret);
243         errno = EINVAL;
244         return (smb_iconv_t)-1;
245 }
246
247 /*
248   simple iconv_close() wrapper
249 */
250 int smb_iconv_close (smb_iconv_t cd)
251 {
252 #ifdef HAVE_NATIVE_ICONV
253         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
254         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
255         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
256 #endif
257
258         SAFE_FREE(cd->from_name);
259         SAFE_FREE(cd->to_name);
260
261         memset(cd, 0, sizeof(*cd));
262         SAFE_FREE(cd);
263         return 0;
264 }
265
266
/**********************************************************************
 The following functions implement the builtin character sets in Samba
 and also the "test" character sets that are designed to test
 multi-byte character set support for English users.
***********************************************************************/
/*
 * Widen single bytes to two-byte units: each input byte is copied to
 * the first output byte and the second output byte is set to zero.
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns 0 when all input was consumed, or (size_t)-1
 * with errno = E2BIG when input remains because the output is full.
 * The cd argument is unused.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;

        for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
                *dst++ = *src++;
                *dst++ = 0;
        }

        /* publish the updated cursors back to the caller */
        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;

        if (in_left > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
291
/*
 * Narrow two-byte units to single bytes: the output byte is the first
 * input byte masked with 0x7F.  Units whose second byte is non-zero
 * are counted.
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns the number of counted units, or (size_t)-1 with
 * errno = EINVAL if a single trailing input byte remains, or
 * errno = E2BIG if whole units remain because the output is full.
 * The cd argument is unused.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        const char *src = *inbuf;
        char *dst = *outbuf;
        size_t in_left = *inbytesleft;
        size_t out_left = *outbytesleft;
        int ir_count = 0;

        while (in_left >= 2 && out_left >= 1) {
                *dst = src[0] & 0x7F;
                if (src[1] != 0) {
                        ir_count++;
                }
                src += 2;
                dst += 1;
                in_left -= 2;
                out_left -= 1;
        }

        /* publish the updated cursors back to the caller */
        *inbuf = src;
        *outbuf = dst;
        *inbytesleft = in_left;
        *outbytesleft = out_left;

        if (in_left == 1) {
                /* half a unit left over */
                errno = EINVAL;
                return -1;
        }

        if (in_left > 1) {
                errno = E2BIG;
                return -1;
        }

        return ir_count;
}
318
319
/* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
static int ucs2hex_digit(char ch)
{
        if (ch >= '0' && ch <= '9') return ch - '0';
        if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
        if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
        return -1;
}

/*
 * Decode the "UCS2-HEX" test encoding into two-byte units.
 *
 * Any input byte other than '@' is copied to the first output byte
 * with a zero second byte.  '@' introduces an escape of exactly four
 * hexadecimal digits; the value is stored low byte first.
 *
 * The digits are parsed by hand rather than with sscanf(): the input
 * is a counted buffer that need not be NUL-terminated, and sscanf()
 * would also accept whitespace, signs and fewer than four digits.
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns 0 when all input was consumed, or (size_t)-1
 * with errno set to:
 *   EINVAL - an '@' escape is cut short by the end of the input
 *   EILSEQ - an '@' is not followed by four hex digits
 *   E2BIG  - input remains because the output is full
 * The cd argument is unused.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                unsigned v = 0;
                int i;

                if ((*inbuf)[0] != '@') {
                        /* plain single byte case */
                        (*outbuf)[0] = (*inbuf)[0];
                        (*outbuf)[1] = 0;
                        (*inbytesleft)  -= 1;
                        (*outbytesleft) -= 2;
                        (*inbuf)  += 1;
                        (*outbuf) += 2;
                        continue;
                }
                /* it's a hex character */
                if (*inbytesleft < 5) {
                        errno = EINVAL;
                        return -1;
                }

                /* exactly four hex digits must follow the '@' */
                for (i = 1; i <= 4; i++) {
                        int digit = ucs2hex_digit((*inbuf)[i]);
                        if (digit < 0) {
                                errno = EILSEQ;
                                return -1;
                        }
                        v = (v << 4) | (unsigned)digit;
                }

                (*outbuf)[0] = v&0xff;
                (*outbuf)[1] = v>>8;
                (*inbytesleft)  -= 5;
                (*outbytesleft) -= 2;
                (*inbuf)  += 5;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }
        
        return 0;
}
362
/*
 * Encode two-byte units into the "UCS2-HEX" test encoding.
 *
 * A unit whose second byte is zero and whose first byte has the high
 * bit clear and is not '@' is written as that single byte.  Every
 * other unit is written as '@' followed by four lowercase hex digits
 * of SVAL(*inbuf, 0).
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns 0 when all input was consumed, or (size_t)-1
 * with errno = E2BIG when the output is too small, or errno = EINVAL
 * when a single trailing input byte remains.
 * The cd argument is unused.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
                           char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const char *unit = *inbuf;
                int plain = (unit[1] == 0 &&
                             (unit[0] & 0x80) == 0 &&
                             unit[0] != '@');

                if (plain) {
                        /* representable as one byte */
                        **outbuf = unit[0];
                        (*outbuf) += 1;
                        (*outbytesleft) -= 1;
                } else {
                        char buf[6];

                        /* an escape needs five output bytes */
                        if (*outbytesleft < 5) {
                                errno = E2BIG;
                                return -1;
                        }
                        snprintf(buf, 6, "@%04x", SVAL(*inbuf, 0));
                        memcpy(*outbuf, buf, 5);
                        (*outbuf) += 5;
                        (*outbytesleft) -= 5;
                }

                (*inbuf) += 2;
                (*inbytesleft) -= 2;
        }

        if (*inbytesleft == 1) {
                /* half a unit left over */
                errno = EINVAL;
                return -1;
        }

        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
403
404
/*
 * Identity conversion: copy as many bytes as fit from input to output.
 *
 * The byte count is kept in a size_t so that lengths larger than
 * INT_MAX are not truncated before being passed to memmove().
 *
 * The in/out pointers and byte counts are advanced past what was
 * copied.  Returns 0 when all input was copied, or (size_t)-1 with
 * errno = E2BIG when input remains because the output is full.
 * The cd argument is unused.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        size_t n;

        /* copy the smaller of the two remaining lengths */
        n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

        memmove(*outbuf, *inbuf, n);

        (*inbytesleft) -= n;
        (*outbytesleft) -= n;
        (*inbuf) += n;
        (*outbuf) += n;

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }

        return 0;
}
426
/*
 * Decode UTF-8 into two-byte units, low byte first.
 *
 * One-, two- and three-byte UTF-8 sequences are handled.  Lead bytes
 * that start none of those forms (stray continuation bytes, leads of
 * longer sequences) and continuation bytes that do not have the 10xxxxxx
 * form are rejected instead of being turned into output; previously
 * an unrecognised lead byte advanced the output without ever writing
 * to it.
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns 0 when all input was consumed, or (size_t)-1
 * with errno set to:
 *   EINVAL - a multi-byte sequence is cut short by the end of input
 *   EILSEQ - invalid lead byte or continuation byte
 *   E2BIG  - input remains because the output is full
 * The cd argument is unused.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 1 && *outbytesleft >= 2) {
                const unsigned char *c = (const unsigned char *)*inbuf;
                unsigned char *uc = (unsigned char *)*outbuf;
                int len = 1;

                if ((c[0] & 0x80) == 0) {
                        /* single byte: 0xxxxxxx */
                        uc[0] = c[0];
                        uc[1] = 0;
                } else if ((c[0] & 0xf0) == 0xe0) {
                        /* three bytes: 1110xxxx 10xxxxxx 10xxxxxx */
                        if (*inbytesleft < 3) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        if ((c[1] & 0xc0) != 0x80 || (c[2] & 0xc0) != 0x80) {
                                goto illseq;
                        }
                        uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
                        uc[0] = (c[1]<<6) | (c[2]&0x3f);
                        len = 3;
                } else if ((c[0] & 0xe0) == 0xc0) {
                        /* two bytes: 110xxxxx 10xxxxxx */
                        if (*inbytesleft < 2) {
                                DEBUG(0,("short utf8 char\n"));
                                goto badseq;
                        }
                        if ((c[1] & 0xc0) != 0x80) {
                                goto illseq;
                        }
                        uc[1] = (c[0]>>2) & 0x7;
                        uc[0] = (c[0]<<6) | (c[1]&0x3f);
                        len = 2;
                } else {
                        /* not a lead byte of any form handled here */
                        goto illseq;
                }

                (*inbuf)  += len;
                (*inbytesleft)  -= len;
                (*outbytesleft) -= 2;
                (*outbuf) += 2;
        }

        if (*inbytesleft > 0) {
                errno = E2BIG;
                return -1;
        }
        
        return 0;

badseq:
        errno = EINVAL;
        return -1;

illseq:
        errno = EILSEQ;
        return -1;
}
473
/*
 * Encode two-byte units (low byte first) as UTF-8.
 *
 * A unit is written as three bytes when any of the top five bits of
 * its high byte are set, as two bytes when the high byte is otherwise
 * non-zero or the low byte has its high bit set, and as one byte in
 * all other cases.
 *
 * The in/out pointers and byte counts are advanced past what was
 * converted.  Returns 0 when all input was consumed, or (size_t)-1
 * with errno = E2BIG when the output is too small, or errno = EINVAL
 * when a single trailing input byte remains.
 * The cd argument is unused.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
                         char **outbuf, size_t *outbytesleft)
{
        while (*inbytesleft >= 2 && *outbytesleft >= 1) {
                const unsigned char *src = (const unsigned char *)*inbuf;
                unsigned char *dst = (unsigned char *)*outbuf;
                unsigned char lo = src[0];
                unsigned char hi = src[1];
                int len;

                /* work out how many output bytes this unit needs */
                if (hi & 0xf8) {
                        len = 3;
                } else if (hi | (lo & 0x80)) {
                        len = 2;
                } else {
                        len = 1;
                }

                /* len == 1 always fits because of the loop condition */
                if (*outbytesleft < (size_t)len) {
                        DEBUG(0,("short utf8 write\n"));
                        goto toobig;
                }

                switch (len) {
                case 3:
                        dst[0] = 0xe0 | (hi>>4);
                        dst[1] = 0x80 | ((hi&0xF)<<2) | (lo>>6);
                        dst[2] = 0x80 | (lo&0x3f);
                        break;
                case 2:
                        dst[0] = 0xc0 | (hi<<2) | (lo>>6);
                        dst[1] = 0x80 | (lo&0x3f);
                        break;
                default:
                        dst[0] = lo;
                        break;
                }

                (*inbuf)  += 2;
                (*inbytesleft)  -= 2;
                (*outbuf) += len;
                (*outbytesleft) -= len;
        }

        if (*inbytesleft == 1) {
                /* half a unit left over */
                errno = EINVAL;
                return -1;
        }

        if (*inbytesleft > 1) {
                errno = E2BIG;
                return -1;
        }

        return 0;

toobig:
        errno = E2BIG;
        return -1;
}
526