lib: Fix a typo
[metze/samba/wip.git] / lib / util / charset / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #include "charset_proto.h"
26
27 #ifdef strcasecmp
28 #undef strcasecmp
29 #endif
30
31 /**
32  * @file
33  *
34  * @brief Samba wrapper/stub for iconv character set conversion.
35  *
36  * iconv is the XPG2 interface for converting between character
37  * encodings.  This file provides a Samba wrapper around it, and also
38  * a simple reimplementation that is used if the system does not
39  * implement iconv.
40  *
41  * Samba only works with encodings that are supersets of ASCII: ascii
42  * characters like whitespace can be tested for directly, multibyte
43  * sequences start with a byte with the high bit set, and strings are
44  * terminated by a nul byte.
45  *
46  * Note that the only function provided by iconv is conversion between
47  * characters.  It doesn't directly support operations like
48  * uppercasing or comparison.  We have to convert to UTF-16LE and
49  * compare there.
50  *
51  * @sa Samba Developers Guide
52  **/
53
/*
 * Forward declarations of the converters implemented in this file.
 * They all share one signature so they can be stored in
 * builtin_functions[] and called through the smb_iconv handle.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
65
66 static const struct charset_functions builtin_functions[] = {
67         /* windows is closest to UTF-16 */
68         {"UCS-2LE",  iconv_copy, iconv_copy},
69         {"UTF-16LE",  iconv_copy, iconv_copy},
70         {"UCS-2BE",  iconv_swab, iconv_swab},
71         {"UTF-16BE",  iconv_swab, iconv_swab},
72
73         /* we include the UTF-8 alias to cope with differing locale settings */
74         {"UTF8",   utf8_pull,  utf8_push},
75         {"UTF-8",   utf8_pull,  utf8_push},
76
77         /* this handles the munging needed for String2Key */
78         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy, true},
79
80         {"ASCII", ascii_pull, ascii_push},
81         {"646", ascii_pull, ascii_push},
82         {"ISO-8859-1", latin1_pull, latin1_push},
83 #ifdef DEVELOPER        
84         {"WEIRD", weird_pull, weird_push, true},
85 #endif
86 #ifdef DARWINOS
87         {"MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push, true},
88 #endif
89         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push, true}
90
91 };
92
#ifdef HAVE_NATIVE_ICONV
/**
 * Adapter that gives the system iconv() the same signature as the
 * builtin pull/push converters.
 *
 * If the conversion fails, the descriptor's internal state is reset so
 * that no shift state is left behind for character sets like SJIS.
 * The errno of the failed conversion is kept across that reset call,
 * because callers inspect it (smb_iconv() tests for E2BIG).
 *
 * @return whatever iconv() returned; (size_t)-1 on error with errno
 *         describing the conversion failure.
 **/
static size_t sys_iconv(void *cd,
			const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t ret = iconv((iconv_t)cd,
			   discard_const_p(char *, inbuf), inbytesleft,
			   outbuf, outbytesleft);
	if (ret == (size_t)-1) {
		/* the reset call must not change the errno reported
		   for the failed conversion */
		int saved_errno = errno;
		iconv((iconv_t)cd, NULL, NULL, NULL, NULL);
		errno = saved_errno;
	}
	return ret;
}
#endif
108
109 /**
110  * This is a simple portable iconv() implementaion.
111  *
112  * It only knows about a very small number of character sets - just
113  * enough that Samba works on systems that don't have iconv.
114  **/
115 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd, 
116                  const char **inbuf, size_t *inbytesleft,
117                  char **outbuf, size_t *outbytesleft)
118 {
119         /* in many cases we can go direct */
120         if (cd->direct) {
121                 return cd->direct(cd->cd_direct, 
122                                   inbuf, inbytesleft, outbuf, outbytesleft);
123         }
124
125         /* otherwise we have to do it chunks at a time */
126         {
127 #ifndef SMB_ICONV_BUFSIZE
128 #define SMB_ICONV_BUFSIZE 2048
129 #endif
130                 size_t bufsize;
131                 char cvtbuf[SMB_ICONV_BUFSIZE];
132
133                 while (*inbytesleft > 0) {
134                         char *bufp1 = cvtbuf;
135                         const char *bufp2 = cvtbuf;
136                         int saved_errno = errno;
137                         bool pull_failed = false;
138                         bufsize = SMB_ICONV_BUFSIZE;
139
140                         if (cd->pull(cd->cd_pull,
141                                      inbuf, inbytesleft, &bufp1, &bufsize) == -1
142                             && errno != E2BIG) {
143                                 saved_errno = errno;
144                                 pull_failed = true;
145                         }
146
147                         bufsize = SMB_ICONV_BUFSIZE - bufsize;
148
149                         if (cd->push(cd->cd_push,
150                                      &bufp2, &bufsize,
151                                      outbuf, outbytesleft) == -1) {
152                                 return -1;
153                         } else if (pull_failed) {
154                                 /* We want the pull errno if possible */
155                                 errno = saved_errno;
156                                 return -1;
157                         }
158                 }
159         }
160
161         return 0;
162 }
163
/* true if name is one of the little-endian UTF-16 charset names,
   compared case-insensitively */
static bool is_utf16(const char *name)
{
	static const char *const le_names[] = { "UCS-2LE", "UTF-16LE" };
	size_t idx;

	for (idx = 0; idx < sizeof(le_names) / sizeof(le_names[0]); idx++) {
		if (strcasecmp(name, le_names[idx]) == 0) {
			return true;
		}
	}
	return false;
}
169
170 static int smb_iconv_t_destructor(smb_iconv_t hwd)
171 {
172 #ifdef HAVE_NATIVE_ICONV
173         if (hwd->cd_pull != NULL && hwd->cd_pull != (iconv_t)-1)
174                 iconv_close(hwd->cd_pull);
175         if (hwd->cd_push != NULL && hwd->cd_push != (iconv_t)-1)
176                 iconv_close(hwd->cd_push);
177         if (hwd->cd_direct != NULL && hwd->cd_direct != (iconv_t)-1)
178                 iconv_close(hwd->cd_direct);
179 #endif
180
181         return 0;
182 }
183
184 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
185                               const char *fromcode, bool use_builtin_handlers)
186 {
187         smb_iconv_t ret;
188         const struct charset_functions *from=NULL, *to=NULL;
189         int i;
190
191         ret = (smb_iconv_t)talloc_named(mem_ctx,
192                                         sizeof(*ret), 
193                                         "iconv(%s,%s)", tocode, fromcode);
194         if (!ret) {
195                 errno = ENOMEM;
196                 return (smb_iconv_t)-1;
197         }
198         memset(ret, 0, sizeof(*ret));
199         talloc_set_destructor(ret, smb_iconv_t_destructor);
200
201         /* check for the simplest null conversion */
202         if (strcmp(fromcode, tocode) == 0) {
203                 ret->direct = iconv_copy;
204                 return ret;
205         }
206
207         /* check if we have a builtin function for this conversion */
208         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
209                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
210                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
211                                 from = &builtin_functions[i];
212                         }
213                 }
214                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) { 
215                         if (use_builtin_handlers || builtin_functions[i].samba_internal_charset) {
216                                 to = &builtin_functions[i];
217                         }
218                 }
219         }
220
221 #ifdef HAVE_NATIVE_ICONV
222         /* the from and to variables indicate a samba module or
223          * internal conversion, ret->pull and ret->push are
224          * initialised only in this block for iconv based
225          * conversions */
226
227         if (from == NULL) {
228                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
229                 if (ret->cd_pull == (iconv_t)-1)
230                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
231                 if (ret->cd_pull != (iconv_t)-1) {
232                         ret->pull = sys_iconv;
233                 }
234         }
235         
236         if (to == NULL) {
237                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
238                 if (ret->cd_push == (iconv_t)-1)
239                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
240                 if (ret->cd_push != (iconv_t)-1) {
241                         ret->push = sys_iconv;
242                 }
243         }
244 #endif
245
246         if (ret->pull == NULL && from == NULL) {
247                 goto failed;
248         }
249         
250         if (ret->push == NULL && to == NULL) {
251                 goto failed;
252         }
253
254         /* check for conversion to/from ucs2 */
255         if (is_utf16(fromcode) && to) {
256                 ret->direct = to->push;
257                 return ret;
258         }
259         if (is_utf16(tocode) && from) {
260                 ret->direct = from->pull;
261                 return ret;
262         }
263
264 #ifdef HAVE_NATIVE_ICONV
265         if (is_utf16(fromcode)) {
266                 ret->direct = sys_iconv;
267                 ret->cd_direct = ret->cd_push;
268                 ret->cd_push = NULL;
269                 return ret;
270         }
271         if (is_utf16(tocode)) {
272                 ret->direct = sys_iconv;
273                 ret->cd_direct = ret->cd_pull;
274                 ret->cd_pull = NULL;
275                 return ret;
276         }
277 #endif
278
279         /* the general case has to go via a buffer */
280         if (!ret->pull) ret->pull = from->pull;
281         if (!ret->push) ret->push = to->push;
282         return ret;
283
284 failed:
285         talloc_free(ret);
286         errno = EINVAL;
287         return (smb_iconv_t)-1;
288 }
289
290 /*
291   simple iconv_open() wrapper
292  */
293 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
294 {
295         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
296 }
297
298 /*
299   simple iconv_close() wrapper
300 */
301 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
302 {
303         talloc_free(cd);
304         return 0;
305 }
306
307
308 /**********************************************************************
309  the following functions implement the builtin character sets in Samba
310  and also the "test" character sets that are designed to test
311  multi-byte character set support for english users
312 ***********************************************************************/
313
/*
  this takes an ASCII sequence and produces a UTF16 sequence

  Each input byte becomes the first byte of a two-byte output unit and
  the second byte is set to zero.  A byte with any bit outside 0x7F set
  fails with EILSEQ; input left over when the output is full fails
  with E2BIG.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		char ch;

		if (*inbytesleft < 1 || *outbytesleft < 2) {
			break;
		}

		ch = (*inbuf)[0];
		if ((ch & 0x7F) != ch) {
			/* high bit set: not legal ASCII */
			errno = EILSEQ;
			return -1;
		}

		(*outbuf)[0] = ch;
		(*outbuf)[1] = 0;

		*inbuf  += 1;
		*outbuf += 2;
		*inbytesleft  -= 1;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
346
/*
  this takes a UTF16 sequence and produces an ASCII sequence

  A two-byte unit is accepted only if its first byte has no bit outside
  0x7F and its second byte is zero; that first byte is then the output.
  Anything else fails with EILSEQ, a trailing single byte with EINVAL,
  and input left over when the output is full with E2BIG.
 */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char first = (*inbuf)[0];

		if ((first & 0x7F) != first || (*inbuf)[1] != 0) {
			/* not representable as 7-bit ASCII */
			errno = EILSEQ;
			return -1;
		}

		(*outbuf)[0] = first;

		*inbuf  += 2;
		*outbuf += 1;
		*inbytesleft  -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 unit */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
385
/*
  this takes a latin1/ISO-8859-1 sequence and produces a UTF16 sequence

  The first 256 codepoints of unicode match latin1, so every input byte
  is copied into the first byte of a UTF16LE unit whose second byte is
  zero.  No input byte is rejected; the only failure is E2BIG when
  input is left over after the output runs out.
 */
static size_t latin1_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft)
{
	for (;;) {
		if (*inbytesleft < 1 || *outbytesleft < 2) {
			break;
		}

		(*outbuf)[0] = (*inbuf)[0];
		(*outbuf)[1] = 0;

		*inbuf  += 1;
		*outbuf += 2;
		*inbytesleft  -= 1;
		*outbytesleft -= 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
412
/*
  this takes a UTF16 sequence and produces a latin1/ISO-8859-1 sequence

  The first 256 codepoints of unicode match latin1, so a unit whose
  second byte is zero is emitted as its first byte.  A non-zero second
  byte fails with EILSEQ, a trailing single byte with EINVAL, and input
  left over when the output is full with E2BIG.
 */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		/* the low byte is stored before the unit is validated;
		   the pointers only advance once it is accepted */
		(*outbuf)[0] = (*inbuf)[0];

		if ((*inbuf)[1] != 0) {
			/* codepoint above 0xFF: not representable in latin1 */
			errno = EILSEQ;
			return -1;
		}

		*inbuf  += 2;
		*outbuf += 1;
		*inbytesleft  -= 2;
		*outbytesleft -= 1;
	}

	switch (*inbytesleft) {
	case 0:
		return 0;
	case 1:
		/* half a UTF16 unit */
		errno = EINVAL;
		return -1;
	default:
		errno = E2BIG;
		return -1;
	}
}
450
/* value of one hexadecimal digit (either case), or -1 if ch is not one */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') {
		return ch - '0';
	}
	if (ch >= 'a' && ch <= 'f') {
		return ch - 'a' + 10;
	}
	if (ch >= 'A' && ch <= 'F') {
		return ch - 'A' + 10;
	}
	return -1;
}

/*
  this takes a UCS2-HEX sequence and produces a UTF16 sequence

  Every byte other than '@' is copied into the first byte of a
  two-byte output unit whose second byte is zero.  An '@' must be
  followed by exactly four hexadecimal digits giving the 16 bit value,
  which is stored low byte first.

  The digits are parsed by hand rather than with sscanf(): the input is
  not NUL terminated, and "%04x" would also accept short, signed or
  whitespace-prefixed numbers while five bytes are always consumed.

  Errors: EINVAL if fewer than five bytes remain at an '@', EILSEQ if
  any of the four characters is not a hex digit, E2BIG if input is
  left over when the output is full.
 */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}
		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v&0xff;
		(*outbuf)[1] = v>>8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
493
/*
  this takes a UTF16 sequence and produces a UCS2-HEX sequence

  A unit whose second byte is zero and whose first byte has the high
  bit clear and is not '@' is written as that single byte.  Every other
  unit is written as '@' followed by four lowercase hex digits of
  SVAL(*inbuf, 0), which needs five output bytes.

  Errors: E2BIG if the output cannot take the next character or input
  is left over, EINVAL for a trailing single input byte.
 */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		size_t produced;

		if (unit[1] == 0 && (unit[0] & 0x80) == 0 && unit[0] != '@') {
			/* passes through unescaped */
			(*outbuf)[0] = unit[0];
			produced = 1;
		} else {
			char escaped[6];

			if (*outbytesleft < 5) {
				errno = E2BIG;
				return -1;
			}
			snprintf(escaped, 6, "@%04x", SVAL(*inbuf, 0));
			memcpy(*outbuf, escaped, 5);
			produced = 5;
		}

		*inbuf  += 2;
		*outbuf += produced;
		*inbytesleft  -= 2;
		*outbytesleft -= produced;
	}

	if (*inbytesleft == 1) {
		/* half a UTF16 unit */
		errno = EINVAL;
		return -1;
	}

	if (*inbytesleft > 1) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
534
/*
  this converts between the two byte orders of UTF16/UCS2 by swapping
  the bytes of every 16 bit unit

  As many bytes as fit (the smaller of the input and output sizes) are
  processed.  If that count is odd, the final output byte is set to
  zero and the odd input byte is still consumed.

  The count is kept in a size_t so that lengths which do not fit in an
  int are not truncated, and the swap is done with an explicit loop
  that reads both bytes of a unit before writing them, so that it is
  also well defined when *inbuf and *outbuf are the same buffer
  (swab() leaves overlapping buffers undefined).

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set to E2BIG.
 */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t even = n & ~(size_t)1;
	size_t i;

	for (i = 0; i < even; i += 2) {
		char first = src[i];
		char second = src[i + 1];

		dst[i] = second;
		dst[i + 1] = first;
	}
	if (n & 1) {
		dst[n - 1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
559
560
/*
  the null conversion: copies as many bytes as fit (the smaller of the
  input and output sizes) from *inbuf to *outbuf

  memmove() is used, so the two buffers may overlap.  The byte count is
  kept in a size_t so that lengths which do not fit in an int are not
  truncated.

  Returns 0 when all input was consumed, otherwise (size_t)-1 with
  errno set to E2BIG.
 */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
582
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  One to four byte UTF8 sequences are decoded and written as UTF16LE.
  Four byte sequences decoding to 0x10000 or above are written as a
  surrogate pair (four output bytes).  A four byte sequence that is not
  minimally packed (value below 0x10000) is accepted and written as a
  single unit.

  Four byte sequences decoding to a value above 0x10FFFF are rejected
  with EILSEQ: such a value cannot be expressed as a surrogate pair and
  would otherwise be written out as malformed UTF16.

  On return, success or failure, the four in/out parameters describe
  the position reached: everything before it has been converted.

  Errors: EILSEQ for a truncated or malformed multi-byte sequence or an
  out of range value, EINVAL for a lead byte that starts none of the
  handled sequence lengths, E2BIG when the output is too small.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;
	size_t ret = (size_t)-1;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte, 7 bit */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence, 11 bits */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence, 16 bits */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence, 21 bits */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 ||
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto out;
			}
			codepoint =
				(c[3]&0x3f) |
				((c[2]&0x3f)<<6) |
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint > 0x10FFFF) {
				/* beyond what a surrogate pair can hold */
				errno = EILSEQ;
				goto out;
			}
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto out;
			}

			/* high surrogate from the top 10 bits, low
			   surrogate from the bottom 10 bits */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto out;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto out;
	}

	ret = 0;

out:
	/* report the position reached, on success and on failure */
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return ret;
}
702
703
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Each little-endian 16 bit unit is written as one, two or three UTF8
  bytes depending on its value; a high surrogate followed by a low
  surrogate is combined and written as four bytes.

  On return, success or failure, the four in/out parameters describe
  the position reached: everything before it has been converted.

  Errors: E2BIG when the output cannot take the next character or
  input is left over; EILSEQ for a high surrogate not followed by a low
  one, or a leading low surrogate with at least four bytes of input
  left; EINVAL for a surrogate with fewer than four bytes of input
  left, or a trailing single byte.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = (size_t)-1;

	while (src_left >= 2 && dst_left >= 1) {
		const unsigned int unit = src[0] | (src[1] << 8);

		if (unit < 0x80) {
			/* simplest case: plain 7 bit */
			dst[0] = unit;
			src += 2;
			src_left -= 2;
			dst += 1;
			dst_left -= 1;
		} else if (unit < 0x800) {
			/* two byte sequence */
			if (dst_left < 2) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 2;
			dst_left -= 2;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* a low surrogate can't start a character */
			errno = (src_left < 4) ? EINVAL : EILSEQ;
			goto done;
		} else if ((unit & 0xfc00) != 0xd800) {
			/* three byte sequence */
			if (dst_left < 3) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			src_left -= 2;
			dst += 3;
			dst_left -= 3;
		} else {
			/* high surrogate: needs its low partner */
			unsigned int low;
			unsigned int codepoint;

			if (src_left < 4) {
				errno = EINVAL;
				goto done;
			}
			low = src[2] | (src[3] << 8);
			if ((low & 0xfc00) != 0xdc00) {
				errno = EILSEQ;
				goto done;
			}
			codepoint = 0x10000 +
				(((unit & 0x3ff) << 10) | (low & 0x3ff));

			if (dst_left < 4) {
				errno = E2BIG;
				goto done;
			}
			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			src += 4;
			src_left -= 4;
			dst += 4;
			dst_left -= 4;
		}
	}

	if (src_left == 1) {
		/* half a UTF16 unit */
		errno = EINVAL;
	} else if (src_left > 1) {
		errno = E2BIG;
	} else {
		result = 0;
	}

done:
	/* report the position reached, on success and on failure */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf  = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
820
821
/*
  this takes a UTF16 munged sequence, modifies it according to the
  string2key rules, and produces a UTF16 sequence

The rules are:

    1) any 0x0000 characters are mapped to 0x0001

    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
       U+FFFD (REPLACEMENT CHARACTER).

    3) the same for any low surrogate that was not preceded by a high surrogate.

  A well formed surrogate pair is copied through unchanged as four
  bytes.  On return, success or failure, the four in/out parameters
  describe the position reached.

  Errors: E2BIG when the output cannot take the next character or
  input is left over, EINVAL for a trailing single input byte.
 */
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			       char **outbuf, size_t *outbytesleft)
{
	size_t src_left = *inbytesleft;
	size_t dst_left = *outbytesleft;
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t result = (size_t)-1;

	while (src_left >= 2 && dst_left >= 2) {
		unsigned int unit = src[0] | (src[1] << 8);

		if ((unit & 0xfc00) == 0xd800) {
			/* a high surrogate: is it properly paired? */
			int paired = 0;

			if (src_left >= 4) {
				unsigned int next = src[2] | (src[3] << 8);
				paired = ((next & 0xfc00) == 0xdc00);
			}

			if (paired) {
				if (dst_left < 4) {
					errno = E2BIG;
					goto done;
				}
				memcpy(dst, src, 4);
				src += 4;
				src_left -= 4;
				dst += 4;
				dst_left -= 4;
				continue;
			}

			/* high surrogate not followed by low
			   surrogate: convert to 0xfffd */
			unit = 0xfffd;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* low surrogate not preceded by high
			   surrogate: convert to 0xfffd */
			unit = 0xfffd;
		} else if (unit == 0) {
			unit = 1;
		}

		/* emit a single 16 bit unit */
		dst[0] = unit & 0xFF;
		dst[1] = (unit >> 8) & 0xFF;
		src += 2;
		src_left -= 2;
		dst += 2;
		dst_left -= 2;
	}

	if (src_left == 1) {
		/* half a UTF16 unit */
		errno = EINVAL;
	} else if (src_left > 1) {
		errno = E2BIG;
	} else {
		result = 0;
	}

done:
	/* report the position reached, on success and on failure */
	*inbytesleft = src_left;
	*outbytesleft = dst_left;
	*inbuf  = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
918
919
920