]> git.samba.org - samba.git/blob - lib/util/charset/iconv.c
Merge branch 'master' of git://git.samba.org/samba into minschema
[samba.git] / lib / util / charset / iconv.c
1 /* 
2    Unix SMB/CIFS implementation.
3    minimal iconv implementation
4    Copyright (C) Andrew Tridgell 2001
5    Copyright (C) Jelmer Vernooij 2002
6    
7    This program is free software; you can redistribute it and/or modify
8    it under the terms of the GNU General Public License as published by
9    the Free Software Foundation; either version 3 of the License, or
10    (at your option) any later version.
11    
12    This program is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15    GNU General Public License for more details.
16    
17    You should have received a copy of the GNU General Public License
18    along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 #include "includes.h"
22 #include "../lib/util/dlinklist.h"
23 #include "system/iconv.h"
24 #include "system/filesys.h"
25 #undef strcasecmp
26
27
28 /**
29  * @file
30  *
31  * @brief Samba wrapper/stub for iconv character set conversion.
32  *
33  * iconv is the XPG2 interface for converting between character
34  * encodings.  This file provides a Samba wrapper around it, and also
35  * a simple reimplementation that is used if the system does not
36  * implement iconv.
37  *
38  * Samba only works with encodings that are supersets of ASCII: ascii
39  * characters like whitespace can be tested for directly, multibyte
40  * sequences start with a byte with the high bit set, and strings are
41  * terminated by a nul byte.
42  *
43  * Note that the only function provided by iconv is conversion between
44  * characters.  It doesn't directly support operations like
45  * uppercasing or comparison.  We have to convert to UTF-16LE and
46  * compare there.
47  *
48  * @sa Samba Developers Guide
49  **/
50
/*
 * Forward declarations of the builtin converters.  They all share one
 * signature: a conversion handle followed by the input and output
 * cursor/remaining-length pairs.
 */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			  char **outbuf, size_t *outbytesleft);
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft);
static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft);
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft);
61
62 static const struct charset_functions builtin_functions[] = {
63         /* windows is closest to UTF-16 */
64         {"UCS-2LE",  iconv_copy, iconv_copy},
65         {"UTF-16LE",  iconv_copy, iconv_copy},
66         {"UCS-2BE",  iconv_swab, iconv_swab},
67         {"UTF-16BE",  iconv_swab, iconv_swab},
68
69         /* we include the UTF-8 alias to cope with differing locale settings */
70         {"UTF8",   utf8_pull,  utf8_push},
71         {"UTF-8",   utf8_pull,  utf8_push},
72
73         /* this handles the munging needed for String2Key */
74         {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
75
76         {"ASCII", ascii_pull, ascii_push},
77         {"646", ascii_pull, ascii_push},
78         {"ISO-8859-1", ascii_pull, latin1_push},
79         {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
80 };
81
82 static struct charset_functions *charsets = NULL;
83
84 bool charset_register_backend(const void *_funcs) 
85 {
86         struct charset_functions *funcs = (struct charset_functions *)memdup(_funcs,sizeof(struct charset_functions));
87         struct charset_functions *c;
88
89         /* Check whether we already have this charset... */
90         for (c = charsets; c != NULL; c = c->next) {
91                 if(!strcasecmp(c->name, funcs->name)) { 
92                         DEBUG(2, ("Duplicate charset %s, not registering\n", funcs->name));
93                         return false;
94                 }
95         }
96
97         funcs->next = funcs->prev = NULL;
98         DLIST_ADD(charsets, funcs);
99         return true;
100 }
101
102 #ifdef HAVE_NATIVE_ICONV
103 /* if there was an error then reset the internal state,
104    this ensures that we don't have a shift state remaining for
105    character sets like SJIS */
106 static size_t sys_iconv(void *cd, 
107                         const char **inbuf, size_t *inbytesleft,
108                         char **outbuf, size_t *outbytesleft)
109 {
110         size_t ret = iconv((iconv_t)cd, 
111                            discard_const_p(char *, inbuf), inbytesleft, 
112                            outbuf, outbytesleft);
113         if (ret == (size_t)-1) iconv(cd, NULL, NULL, NULL, NULL);
114         return ret;
115 }
116 #endif
117
118 /**
119  * This is a simple portable iconv() implementaion.
120  *
121  * It only knows about a very small number of character sets - just
122  * enough that Samba works on systems that don't have iconv.
123  **/
124 _PUBLIC_ size_t smb_iconv(smb_iconv_t cd, 
125                  const char **inbuf, size_t *inbytesleft,
126                  char **outbuf, size_t *outbytesleft)
127 {
128         char cvtbuf[2048];
129         size_t bufsize;
130
131         /* in many cases we can go direct */
132         if (cd->direct) {
133                 return cd->direct(cd->cd_direct, 
134                                   inbuf, inbytesleft, outbuf, outbytesleft);
135         }
136
137
138         /* otherwise we have to do it chunks at a time */
139         while (*inbytesleft > 0) {
140                 char *bufp1 = cvtbuf;
141                 const char *bufp2 = cvtbuf;
142
143                 bufsize = sizeof(cvtbuf);
144                 
145                 if (cd->pull(cd->cd_pull, 
146                              inbuf, inbytesleft, &bufp1, &bufsize) == -1
147                     && errno != E2BIG) return -1;
148
149                 bufsize = sizeof(cvtbuf) - bufsize;
150
151                 if (cd->push(cd->cd_push, 
152                              &bufp2, &bufsize, 
153                              outbuf, outbytesleft) == -1) return -1;
154         }
155
156         return 0;
157 }
158
/* Tell whether a charset name is "UCS-2LE" or "UTF-16LE"; the
   comparison ignores case. */
static bool is_utf16(const char *name)
{
	if (strcasecmp(name, "UCS-2LE") == 0) {
		return true;
	}
	return strcasecmp(name, "UTF-16LE") == 0;
}
164
165
166
167 _PUBLIC_ smb_iconv_t smb_iconv_open_ex(TALLOC_CTX *mem_ctx, const char *tocode, 
168                               const char *fromcode, bool native_iconv)
169 {
170         smb_iconv_t ret;
171         const struct charset_functions *from=NULL, *to=NULL;
172         int i;
173
174         ret = (smb_iconv_t)talloc_named(mem_ctx,
175                                         sizeof(*ret), 
176                                         "iconv(%s,%s)", tocode, fromcode);
177         if (!ret) {
178                 errno = ENOMEM;
179                 return (smb_iconv_t)-1;
180         }
181         memset(ret, 0, sizeof(*ret));
182
183         /* check for the simplest null conversion */
184         if (strcmp(fromcode, tocode) == 0) {
185                 ret->direct = iconv_copy;
186                 return ret;
187         }
188
189         for (i=0;i<ARRAY_SIZE(builtin_functions);i++) {
190                 if (strcasecmp(fromcode, builtin_functions[i].name) == 0) {
191                         from = &builtin_functions[i];
192                 }
193                 if (strcasecmp(tocode, builtin_functions[i].name) == 0) {
194                         to = &builtin_functions[i];
195                 }
196         }
197
198         if (from == NULL) {
199                 for (from=charsets; from; from=from->next) {
200                         if (strcasecmp(from->name, fromcode) == 0) break;
201                 }
202         }
203
204         if (to == NULL) {
205                 for (to=charsets; to; to=to->next) {
206                         if (strcasecmp(to->name, tocode) == 0) break;
207                 }
208         }
209
210 #ifdef HAVE_NATIVE_ICONV
211         if ((!from || !to) && !native_iconv) {
212                 goto failed;
213         }
214         if (!from) {
215                 ret->pull = sys_iconv;
216                 ret->cd_pull = iconv_open("UTF-16LE", fromcode);
217                 if (ret->cd_pull == (iconv_t)-1)
218                         ret->cd_pull = iconv_open("UCS-2LE", fromcode);
219                 if (ret->cd_pull == (iconv_t)-1) goto failed;
220         }
221
222         if (!to) {
223                 ret->push = sys_iconv;
224                 ret->cd_push = iconv_open(tocode, "UTF-16LE");
225                 if (ret->cd_push == (iconv_t)-1)
226                         ret->cd_push = iconv_open(tocode, "UCS-2LE");
227                 if (ret->cd_push == (iconv_t)-1) goto failed;
228         }
229 #else
230         if (!from || !to) {
231                 goto failed;
232         }
233 #endif
234
235         /* check for conversion to/from ucs2 */
236         if (is_utf16(fromcode) && to) {
237                 ret->direct = to->push;
238                 return ret;
239         }
240         if (is_utf16(tocode) && from) {
241                 ret->direct = from->pull;
242                 return ret;
243         }
244
245 #ifdef HAVE_NATIVE_ICONV
246         if (is_utf16(fromcode)) {
247                 ret->direct = sys_iconv;
248                 ret->cd_direct = ret->cd_push;
249                 ret->cd_push = NULL;
250                 return ret;
251         }
252         if (is_utf16(tocode)) {
253                 ret->direct = sys_iconv;
254                 ret->cd_direct = ret->cd_pull;
255                 ret->cd_pull = NULL;
256                 return ret;
257         }
258 #endif
259
260         /* the general case has to go via a buffer */
261         if (!ret->pull) ret->pull = from->pull;
262         if (!ret->push) ret->push = to->push;
263         return ret;
264
265 failed:
266         talloc_free(ret);
267         errno = EINVAL;
268         return (smb_iconv_t)-1;
269 }
270
271 /*
272   simple iconv_open() wrapper
273  */
274 _PUBLIC_ smb_iconv_t smb_iconv_open(const char *tocode, const char *fromcode)
275 {
276         return smb_iconv_open_ex(NULL, tocode, fromcode, true);
277 }
278
279 /*
280   simple iconv_close() wrapper
281 */
282 _PUBLIC_ int smb_iconv_close(smb_iconv_t cd)
283 {
284 #ifdef HAVE_NATIVE_ICONV
285         if (cd->cd_direct) iconv_close((iconv_t)cd->cd_direct);
286         if (cd->cd_pull) iconv_close((iconv_t)cd->cd_pull);
287         if (cd->cd_push) iconv_close((iconv_t)cd->cd_push);
288 #endif
289
290         talloc_free(cd);
291         return 0;
292 }
293
294
295 /**********************************************************************
296  the following functions implement the builtin character sets in Samba
297  and also the "test" character sets that are designed to test
298  multi-byte character set support for english users
299 ***********************************************************************/
/* Expand every input byte into two output bytes: the byte itself
   followed by a zero byte.  Returns 0 when all input was consumed, or
   -1 with errno E2BIG when input remains because the output is full.
   The cursors and remaining counts are updated to reflect progress. */
static size_t ascii_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	const char *src = *inbuf;
	char *dst = *outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;

	for (; in_left >= 1 && out_left >= 2; in_left -= 1, out_left -= 2) {
		dst[0] = src[0];
		dst[1] = 0;
		src += 1;
		dst += 2;
	}

	*inbuf = src;
	*outbuf = dst;
	*inbytesleft = in_left;
	*outbytesleft = out_left;

	if (in_left > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
319
/* Reduce each two-byte input unit to one output byte: the first byte
   of the unit with its top bit cleared.  Units whose second byte is
   non-zero are counted, and that count is the return value.  Returns -1
   with errno EINVAL when a single trailing input byte is left, or with
   errno E2BIG when whole units remain because the output is full. */
static size_t ascii_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		**outbuf = unit[0] & 0x7F;
		if (unit[1] != 0) {
			ir_count++;
		}

		*inbuf = unit + 2;
		*outbuf += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	if (*inbytesleft >= 2) {
		errno = E2BIG;
		return -1;
	}
	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	return ir_count;
}
346
/* Reduce each two-byte input unit to one output byte by copying the
   first byte of the unit unchanged.  Units whose second byte is
   non-zero are counted, and that count is the return value.  Returns -1
   with errno EINVAL when a single trailing input byte is left, or with
   errno E2BIG when whole units remain because the output is full. */
static size_t latin1_push(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	int ir_count = 0;

	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;

		**outbuf = unit[0];
		if (unit[1] != 0) {
			ir_count++;
		}

		*inbuf = unit + 2;
		*outbuf += 1;
		*inbytesleft -= 2;
		*outbytesleft -= 1;
	}

	if (*inbytesleft >= 2) {
		errno = E2BIG;
		return -1;
	}
	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	return ir_count;
}
373
/* Value of one hexadecimal digit, or -1 if ch is not a hex digit. */
static int ucs2hex_digit(char ch)
{
	if (ch >= '0' && ch <= '9') return ch - '0';
	if (ch >= 'a' && ch <= 'f') return ch - 'a' + 10;
	if (ch >= 'A' && ch <= 'F') return ch - 'A' + 10;
	return -1;
}

/* Decode the "UCS2-HEX" form into two-byte units (low byte first).
   Any byte other than '@' is copied followed by a zero byte; '@' must
   be followed by exactly four hex digits giving the 16-bit value.

   Returns 0 when all input was consumed.  Returns -1 with errno EINVAL
   when an '@' escape is cut short by the end of the input, EILSEQ when
   the four characters after '@' are not all hex digits, or E2BIG when
   input remains because the output is full. */
static size_t ucs2hex_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 1 && *outbytesleft >= 2) {
		unsigned int v = 0;
		int i;

		if ((*inbuf)[0] != '@') {
			/* seven bit ascii case */
			(*outbuf)[0] = (*inbuf)[0];
			(*outbuf)[1] = 0;
			(*inbytesleft)  -= 1;
			(*outbytesleft) -= 2;
			(*inbuf)  += 1;
			(*outbuf) += 2;
			continue;
		}
		/* it's a hex character */
		if (*inbytesleft < 5) {
			errno = EINVAL;
			return -1;
		}

		/* Parse the digits by hand: the input is length-delimited
		   and need not be NUL terminated, so string functions
		   must not be used on it, and exactly four digits are
		   required because five bytes are consumed below. */
		for (i = 1; i <= 4; i++) {
			int digit = ucs2hex_digit((*inbuf)[i]);
			if (digit < 0) {
				errno = EILSEQ;
				return -1;
			}
			v = (v << 4) | (unsigned int)digit;
		}

		(*outbuf)[0] = v&0xff;
		(*outbuf)[1] = v>>8;
		(*inbytesleft)  -= 5;
		(*outbytesleft) -= 2;
		(*inbuf)  += 5;
		(*outbuf) += 2;
	}

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}
	
	return 0;
}
416
/* Encode two-byte units into the "UCS2-HEX" form.  A unit whose second
   byte is zero and whose first byte has the top bit clear and is not
   '@' is written as that single byte; every other unit is written as
   '@' followed by four hex digits.

   Returns 0 when all input was consumed.  Returns -1 with errno E2BIG
   when the output cannot take the next unit, or EINVAL when a single
   trailing input byte is left. */
static size_t ucs2hex_push(void *cd, const char **inbuf, size_t *inbytesleft,
			   char **outbuf, size_t *outbytesleft)
{
	while (*inbytesleft >= 2 && *outbytesleft >= 1) {
		const char *unit = *inbuf;
		char escaped[6];
		size_t produced;

		if (unit[1] != 0 ||
		    (unit[0] & 0x80) != 0 ||
		    unit[0] == '@') {
			/* needs the five byte escaped form */
			if (*outbytesleft < 5) {
				errno = E2BIG;
				return -1;
			}
			snprintf(escaped, sizeof(escaped), "@%04x", SVAL(*inbuf, 0));
			memcpy(*outbuf, escaped, 5);
			produced = 5;
		} else {
			(*outbuf)[0] = unit[0];
			produced = 1;
		}

		(*inbytesleft)  -= 2;
		(*outbytesleft) -= produced;
		(*inbuf)  += 2;
		(*outbuf) += produced;
	}

	if (*inbytesleft >= 2) {
		errno = E2BIG;
		return -1;
	}
	if (*inbytesleft == 1) {
		errno = EINVAL;
		return -1;
	}

	return 0;
}
457
/* Copy input to output, exchanging the two bytes of every pair.  As
   many bytes as fit in both buffers are processed; if that count is
   odd, the final output byte is set to zero.

   Returns 0 when all input was consumed, or -1 with errno E2BIG when
   input remains. */
static size_t iconv_swab(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* size_t rather than int, so that large lengths are not truncated */
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;
	size_t paired = n & ~(size_t)1;
	size_t i;

	/* Both bytes of a pair are read before either is written, so the
	   conversion is also well defined when input and output are the
	   same buffer (swab() gives no such guarantee). */
	for (i = 0; i < paired; i += 2) {
		char first = (*inbuf)[i];
		char second = (*inbuf)[i+1];

		(*outbuf)[i] = second;
		(*outbuf)[i+1] = first;
	}
	if (n & 1) {
		(*outbuf)[n-1] = 0;
	}

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
482
483
/* Copy input to output unchanged.  As many bytes as fit in both
   buffers are copied; memmove() is used, so the two regions may
   overlap.

   Returns 0 when all input was consumed, or -1 with errno E2BIG when
   input remains. */
static size_t iconv_copy(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	/* size_t rather than int, so that large lengths are not truncated */
	size_t n = (*inbytesleft < *outbytesleft) ? *inbytesleft : *outbytesleft;

	memmove(*outbuf, *inbuf, n);

	(*inbytesleft) -= n;
	(*outbytesleft) -= n;
	(*inbuf) += n;
	(*outbuf) += n;

	if (*inbytesleft > 0) {
		errno = E2BIG;
		return -1;
	}

	return 0;
}
505
/*
  this takes a UTF8 sequence and produces a UTF16 sequence

  Output units are written low byte first.  One to four byte UTF-8
  sequences are handled; a four byte sequence whose value is below
  0x10000 is accepted and written as a single unit, larger values are
  written as a surrogate pair.

  Returns 0 when all input was consumed, otherwise -1 with errno:
    EILSEQ - a sequence is cut short, has a bad continuation byte, or
             decodes to a value above 0x10FFFF
    EINVAL - the lead byte is not one handled here (e.g. 5 byte forms)
    E2BIG  - input remains because the output is full
  In every case the cursors and remaining counts are updated to the
  point reached.
 */
static size_t utf8_pull(void *cd, const char **inbuf, size_t *inbytesleft,
			 char **outbuf, size_t *outbytesleft)
{
	size_t in_left=*inbytesleft, out_left=*outbytesleft;
	const uint8_t *c = (const uint8_t *)*inbuf;
	uint8_t *uc = (uint8_t *)*outbuf;

	while (in_left >= 1 && out_left >= 2) {
		if ((c[0] & 0x80) == 0) {
			/* single byte (7 bit) character */
			uc[0] = c[0];
			uc[1] = 0;
			c  += 1;
			in_left  -= 1;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xe0) == 0xc0) {
			/* two byte sequence */
			if (in_left < 2 ||
			    (c[1] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = (c[0]>>2) & 0x7;
			uc[0] = (c[0]<<6) | (c[1]&0x3f);
			c  += 2;
			in_left  -= 2;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf0) == 0xe0) {
			/* three byte sequence */
			if (in_left < 3 ||
			    (c[1] & 0xc0) != 0x80 || 
			    (c[2] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			uc[1] = ((c[0]&0xF)<<4) | ((c[1]>>2)&0xF);
			uc[0] = (c[1]<<6) | (c[2]&0x3f);
			c  += 3;
			in_left  -= 3;
			out_left -= 2;
			uc += 2;
			continue;
		}

		if ((c[0] & 0xf8) == 0xf0) {
			/* four byte sequence */
			unsigned int codepoint;
			if (in_left < 4 ||
			    (c[1] & 0xc0) != 0x80 || 
			    (c[2] & 0xc0) != 0x80 ||
			    (c[3] & 0xc0) != 0x80) {
				errno = EILSEQ;
				goto error;
			}
			codepoint = 
				(c[3]&0x3f) | 
				((c[2]&0x3f)<<6) | 
				((c[1]&0x3f)<<12) |
				((c[0]&0x7)<<18);
			if (codepoint < 0x10000) {
				/* accept UTF-8 characters that are not
				   minimally packed, but pack the result */
				uc[0] = (codepoint & 0xFF);
				uc[1] = (codepoint >> 8);
				c += 4;
				in_left -= 4;
				out_left -= 2;
				uc += 2;
				continue;
			}

			if (codepoint > 0x10FFFF) {
				/* cannot be expressed as a surrogate pair:
				   the high half would spill out of the
				   0xD800-0xDBFF range */
				errno = EILSEQ;
				goto error;
			}

			codepoint -= 0x10000;

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}

			/* high surrogate, then low surrogate */
			uc[0] = (codepoint>>10) & 0xFF;
			uc[1] = (codepoint>>18) | 0xd8;
			uc[2] = codepoint & 0xFF;
			uc[3] = ((codepoint>>8) & 0x3) | 0xdc;
			c  += 4;
			in_left  -= 4;
			out_left -= 4;
			uc += 4;
			continue;
		}

		/* we don't handle 5 byte sequences */
		errno = EINVAL;
		goto error;
	}

	if (in_left > 0) {
		errno = E2BIG;
		goto error;
	}

	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return 0;

error:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf = (const char *)c;
	*outbuf = (char *)uc;
	return -1;
}
625
626
/*
  this takes a UTF16 sequence and produces a UTF8 sequence

  Input units are read low byte first.  Returns 0 when all input was
  consumed, otherwise -1 with errno:
    E2BIG  - the output cannot take the next character
    EINVAL - the input ends part way through a unit or surrogate pair
    EILSEQ - a surrogate is not correctly paired
  In every case the cursors and remaining counts are updated to the
  point reached.
 */
static size_t utf8_push(void *cd, const char **inbuf, size_t *inbytesleft,
			char **outbuf, size_t *outbytesleft)
{
	const uint8_t *src = (const uint8_t *)*inbuf;
	uint8_t *dst = (uint8_t *)*outbuf;
	size_t in_left = *inbytesleft;
	size_t out_left = *outbytesleft;
	size_t result = 0;

	while (in_left >= 2 && out_left >= 1) {
		unsigned int unit = src[0] | (src[1] << 8);
		unsigned int codepoint;
		unsigned int low;

		if (unit < 0x80) {
			/* plain 7 bit character: one byte */
			dst[0] = unit;
			src += 2;
			dst += 1;
			in_left  -= 2;
			out_left -= 1;
		} else if (unit < 0x800) {
			/* two byte form */
			if (out_left < 2) {
				errno = E2BIG;
				goto error;
			}
			dst[0] = 0xc0 | (unit >> 6);
			dst[1] = 0x80 | (unit & 0x3f);
			src += 2;
			dst += 2;
			in_left  -= 2;
			out_left -= 2;
		} else if ((unit & 0xfc00) == 0xdc00) {
			/* a low surrogate with no high surrogate before it */
			errno = (in_left < 4) ? EINVAL : EILSEQ;
			goto error;
		} else if ((unit & 0xfc00) != 0xd800) {
			/* three byte form */
			if (out_left < 3) {
				errno = E2BIG;
				goto error;
			}
			dst[0] = 0xe0 | (unit >> 12);
			dst[1] = 0x80 | ((unit >> 6) & 0x3f);
			dst[2] = 0x80 | (unit & 0x3f);
			src += 2;
			dst += 3;
			in_left  -= 2;
			out_left -= 3;
		} else {
			/* high surrogate: a low surrogate has to follow */
			if (in_left < 4) {
				errno = EINVAL;
				goto error;
			}
			low = src[2] | (src[3] << 8);
			if ((low & 0xfc00) != 0xdc00) {
				errno = EILSEQ;
				goto error;
			}
			codepoint = 0x10000 + (((unit & 0x3ff) << 10) | (low & 0x3ff));

			if (out_left < 4) {
				errno = E2BIG;
				goto error;
			}
			dst[0] = 0xf0 | (codepoint >> 18);
			dst[1] = 0x80 | ((codepoint >> 12) & 0x3f);
			dst[2] = 0x80 | ((codepoint >> 6) & 0x3f);
			dst[3] = 0x80 | (codepoint & 0x3f);
			src += 4;
			dst += 4;
			in_left  -= 4;
			out_left -= 4;
		}
	}

	if (in_left == 1) {
		errno = EINVAL;
		goto error;
	}
	if (in_left > 1) {
		errno = E2BIG;
		goto error;
	}
	goto done;

error:
	result = (size_t)-1;
done:
	*inbytesleft = in_left;
	*outbytesleft = out_left;
	*inbuf  = (const char *)src;
	*outbuf = (char *)dst;
	return result;
}
743
744
745 /*
746   this takes a UTF16 munged sequence, modifies it according to the
747   string2key rules, and produces a UTF16 sequence
748
749 The rules are:
750
751     1) any 0x0000 characters are mapped to 0x0001
752
753     2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
754        without an immediately following 0xDC00 - 0x0xDFFF (low surrogate) to
755        U+FFFD (OBJECT REPLACEMENT CHARACTER).
756
757     3) the same for any low surrogate that was not preceded by a high surrogate.
758
759  */
760 static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
761                                char **outbuf, size_t *outbytesleft)
762 {
763         size_t in_left=*inbytesleft, out_left=*outbytesleft;
764         uint8_t *c = (uint8_t *)*outbuf;
765         const uint8_t *uc = (const uint8_t *)*inbuf;
766
767         while (in_left >= 2 && out_left >= 2) {
768                 unsigned int codepoint = uc[0] | (uc[1]<<8);
769
770                 if (codepoint == 0) {
771                         codepoint = 1;
772                 }
773
774                 if ((codepoint & 0xfc00) == 0xd800) {
775                         /* a high surrogate */
776                         unsigned int codepoint2;
777                         if (in_left < 4) {
778                                 codepoint = 0xfffd;
779                                 goto codepoint16;                               
780                         }
781                         codepoint2 = uc[2] | (uc[3]<<8);
782                         if ((codepoint2 & 0xfc00) != 0xdc00) {
783                                 /* high surrogate not followed by low
784                                    surrogate: convert to 0xfffd */
785                                 codepoint = 0xfffd;
786                                 goto codepoint16;
787                         }
788                         if (out_left < 4) {
789                                 errno = E2BIG;
790                                 goto error;
791                         }
792                         memcpy(c, uc, 4);
793                         in_left  -= 4;
794                         out_left -= 4;
795                         uc       += 4;
796                         c        += 4;
797                         continue;
798                 }
799
800                 if ((codepoint & 0xfc00) == 0xdc00) {
801                         /* low surrogate not preceded by high
802                            surrogate: convert to 0xfffd */
803                         codepoint = 0xfffd;
804                 }
805
806         codepoint16:
807                 c[0] = codepoint & 0xFF;
808                 c[1] = (codepoint>>8) & 0xFF;
809                 
810                 in_left  -= 2;
811                 out_left -= 2;
812                 uc  += 2;
813                 c   += 2;
814                 continue;               
815         }
816
817         if (in_left == 1) {
818                 errno = EINVAL;
819                 goto error;
820         }
821
822         if (in_left > 1) {
823                 errno = E2BIG;
824                 goto error;
825         }
826
827         *inbytesleft = in_left;
828         *outbytesleft = out_left;
829         *inbuf  = (const char *)uc;
830         *outbuf = (char *)c;
831         
832         return 0;
833
834 error:
835         *inbytesleft = in_left;
836         *outbytesleft = out_left;
837         *inbuf  = (const char *)uc;
838         *outbuf = (char *)c;
839         return -1;
840 }
841
842
843