pyldb: avoid segfault when adding an element with no name
[nivanova/samba-autobuild/.git] / lib / util / charset / charset_macosxfs.c
1 /* 
2    Unix SMB/CIFS implementation.
3    Samba charset module for Mac OS X/Darwin
4    Copyright (C) Benjamin Riefenstahl 2003
5    
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10    
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15    
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*
21  * modules/charset_macosxfs.c
22  *
23  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
24  * and display encoding.
25  *
26  * Actually two implementations are provided here.  The default
27  * implementation is based on the official CFString API.  The other is
28  * based on internal CFString APIs as defined in the OpenDarwin
29  * source.
30  */
31
32 #include "includes.h"
33 #include "charset_proto.h"
34 #undef realloc
35
36 #ifdef DARWINOS
37
38 /*
39  * Include OS frameworks.  These are only needed in this module.
40  */
41 #include <CoreFoundation/CFString.h>
42
43 /*
44  * See if autoconf has found us the internal headers in some form.
45  */
46 #if defined(HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H)
47 #       include <CoreFoundation/CFStringEncodingConverter.h>
48 #       include <CoreFoundation/CFUnicodePrecomposition.h>
49 #       define USE_INTERNAL_API 1
50 #elif defined(HAVE_CFSTRINGENCODINGCONVERTER_H)
51 #       include <CFStringEncodingConverter.h>
52 #       include <CFUnicodePrecomposition.h>
53 #       define USE_INTERNAL_API 1
54 #endif
55
56 /*
57  * Compile time configuration: Do we want debug output?
58  */
59 /* #define DEBUG_STRINGS 1 */
60
/*
 * A simple, but efficient memory provider for our buffers.
 *
 * Makes sure the heap block `buffer`, whose capacity is tracked in
 * *size, can hold at least `newsize` bytes.  Nothing happens when the
 * block is already large enough.  Otherwise the block is reallocated
 * with 128 bytes of slack so that slowly growing requests do not
 * reallocate every time.
 *
 * Returns the (possibly moved) block.  On failure -- allocation error
 * or a request so large that adding the slack would wrap around --
 * the old block is released, *size is reset to 0 and NULL is
 * returned, so that the pointer/size pair kept by the caller stays
 * consistent and a later call starts from scratch.
 */
static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
{
        const size_t slack = 128;
        size_t wanted = 0;
        void *grown = NULL;

        if (newsize <= *size) {
                return buffer;
        }

        /* Only ask the allocator when newsize + slack does not wrap. */
        if (newsize <= (size_t)-1 - slack) {
                wanted = newsize + slack;
                grown = realloc(buffer, wanted);
        }

        if (NULL == grown) {
                /*
                 * realloc() leaves the old block untouched on
                 * failure; callers overwrite their pointer with our
                 * return value, so free it here instead of leaking.
                 */
                free(buffer);
                *size = 0;
                return NULL;
        }

        *size = wanted;
        return grown;
}
72
73 /*
74  * While there is a version of OpenDarwin for intel, the usual case is
75  * big-endian PPC.  So we need byte swapping to handle the
76  * little-endian byte order of the network protocol.  We also need an
77  * additional dynamic buffer to do this work for incoming data blocks,
78  * because we have to consider the original data as constant.
79  *
80  * We abstract the differences away by providing a simple facade with
81  * these functions/macros:
82  *
83  *      le_to_native(dst,src,len)
84  *      native_to_le(cp,len)
85  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
86  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
87  */
88 #ifdef WORDS_BIGENDIAN
89
/*
 * Copy `len` bytes from src to dst, exchanging the two bytes of every
 * complete 16-bit unit on the way.
 *
 * Only whole pairs are swapped.  If `len` is odd the single trailing
 * byte is copied unchanged, so exactly `len` bytes are read and
 * written and neither buffer is accessed past its end.
 */
static inline void swap_bytes (char * dst, const char * src, size_t len)
{
        const size_t paired = len & ~(size_t)1;
        size_t i;

        for (i = 0; i < paired; i += 2) {
                dst[i]     = src[i + 1];
                dst[i + 1] = src[i];
        }
        if (paired != len) {
                dst[paired] = src[paired];
        }
}
/*
 * Exchange the two bytes of every complete 16-bit unit in the `len`
 * bytes starting at cp.
 *
 * Only whole pairs are swapped; with an odd `len` the trailing byte
 * is left alone, so nothing beyond cp[len-1] is ever touched.
 */
static inline void swap_bytes_inplace (char * cp, size_t len)
{
        const size_t paired = len & ~(size_t)1;
        size_t i;

        for (i = 0; i < paired; i += 2) {
                char temp = cp[i];
                cp[i] = cp[i + 1];
                cp[i + 1] = temp;
        }
}

/*
 * Big-endian flavour of the facade: every conversion between the
 * little-endian wire format and native order is a byte swap, and
 * incoming data always has to be copied into our own buffer.
 */
#define le_to_native(dst,src,len)       swap_bytes(dst,src,len)
#define native_to_le(cp,len)            swap_bytes_inplace(cp,len)
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
116
117 #else   /* ! WORDS_BIGENDIAN */
118
/*
 * Little-endian flavour of the facade: wire order already is native
 * order, so "converting" is a plain copy, the in-place conversion is
 * a no-op, and incoming data can be used where it is without copying.
 *
 * native_to_le() expands to a void expression rather than to nothing,
 * so it can be used in any place where the big-endian variant (a call
 * to a void function) can.  Its arguments are not evaluated.
 */
#define le_to_native(dst,src,len)       memcpy(dst,src,len)
#define native_to_le(cp,len)            ((void)0)
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        (((void)(bufsize)),(UniChar*)(data))
123
124 #endif
125
126 static inline UniChar *set_ucbuffer_with_le_copy (
127         UniChar *buffer, size_t *bufsize,
128         const void *data, size_t size, size_t reserve)
129 {
130         buffer = resize_buffer(buffer, bufsize, size+reserve);
131         le_to_native((char*)buffer,data,size);
132         return buffer;
133 }
134
135
136 /*
137  * A simple hexdump function for debugging error conditions.
138  */
/*
 * Emit the string s at debug level 0.  The text is passed as an
 * argument to a "%s" format, never as the format string itself:
 * labels and hexdump lines can contain '%' characters taken from the
 * data being converted.
 */
#define debug_out(s)    DEBUG(0,("%s",(s)))
140
141 #ifdef DEBUG_STRINGS
142
/*
 * Write a hex/ASCII dump of the `len` bytes at `s` to the debug log,
 * framed by "<<<<<<<" / ">>>>>>>" marker lines and preceded by
 * `label`.
 *
 * Each output line covers up to 16 bytes: the offset in hex, two
 * groups of eight hex byte values (padded with blanks on a short last
 * line) and the same bytes as text, with everything outside the
 * printable ASCII range shown as '.'.
 */
static void hexdump( const char * label, const char * s, size_t len )
{
        size_t offset = 0;

        debug_out("<<<<<<<\n");
        debug_out(label);
        debug_out("\n");

        while (offset < len) {
                char line[100];
                char *out = line;
                const size_t remaining = len - offset;
                const size_t chunk = (remaining < 16) ? remaining : 16;
                size_t col;
#undef sprintf
                out += sprintf(out, "%04X ", (unsigned)offset);
                *out++ = ' ';

                /* Hex columns, with an extra blank between the halves. */
                for (col = 0; col < 16; ++col) {
                        if (8 == col) {
                                *out++ = ' ';
                        }
                        if (col < chunk) {
                                out += sprintf(out, "%02X ",
                                               ((unsigned)s[col]) & 0xFF);
                        } else {
                                out += sprintf(out, "   ");
                        }
                }
                *out++ = ' ';

                /* Text column. */
                for (col = 0; col < chunk; ++col) {
                        const char c = s[col];
                        if (c >= ' ' && c < 0x7F && isprint(c)) {
                                *out++ = c;
                        } else {
                                *out++ = '.';
                        }
                }
                *out++ = '\n';
                *out = 0;

                s += chunk;
                offset += chunk;
                debug_out(line);
        }

        debug_out(">>>>>>>\n");
}
184
185 #else   /* !DEBUG_STRINGS */
186
/*
 * Without DEBUG_STRINGS the dump is compiled out.  The macro expands
 * to a void expression (not to nothing) so a call is still a proper
 * statement or expression wherever it is written; the arguments are
 * not evaluated.
 */
#define hexdump(label,s,len) ((void)0)
188
189 #endif
190
191
192 #if !USE_INTERNAL_API
193
194 /*
195  * An implementation based on documented Mac OS X APIs.
196  *
197  * This does a certain amount of memory management, creating and
198  * manipulating CFString objects.  We try to minimize the impact by
199  * keeping those objects around and re-using them.  We also use
200  * external backing store for the CFStrings where this is possible and
201  * beneficial.
202  *
203  * The Unicode normalizations forms available at this level are
204  * generic, not specifically for the file system.  So they may not be
205  * perfect fits.
206  */
207 size_t macosxfs_encoding_pull(
208         void *cd,                               /* Encoder handle */
209         const char **inbuf, size_t *inbytesleft, /* Script string */
210         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
211 {
212         static const int script_code = kCFStringEncodingUTF8;
213         static CFMutableStringRef cfstring = NULL;
214         size_t outsize;
215         CFRange range;
216
217         (void) cd; /* UNUSED */
218
219         if (0 == *inbytesleft) {
220                 return 0;
221         }
222
223         if (NULL == cfstring) {
224                 /*
225                  * A version with an external backing store as in the
226                  * push function should have been more efficient, but
227                  * testing shows, that it is actually slower (!).
228                  * Maybe kCFAllocatorDefault gets shortcut evaluation
229                  * internally, while kCFAllocatorNull doesn't.
230                  */
231                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
232         }
233
234         /*
235          * Three methods of appending to a CFString, choose the most
236          * efficient.
237          */
238         if (0 == (*inbuf)[*inbytesleft-1]) {
239                 CFStringAppendCString(cfstring, *inbuf, script_code);
240         } else if (*inbytesleft <= 255) {
241                 Str255 buffer;
242                 buffer[0] = *inbytesleft;
243                 memcpy(buffer+1, *inbuf, buffer[0]);
244                 CFStringAppendPascalString(cfstring, buffer, script_code);
245         } else {
246                 /*
247                  * We would like to use a fixed buffer and a loop
248                  * here, but than we can't garantee that the input is
249                  * well-formed UTF-8, as we are supposed to do.
250                  */
251                 static char *buffer = NULL;
252                 static size_t buflen = 0;
253                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
254                 memcpy(buffer, *inbuf, *inbytesleft);
255                 buffer[*inbytesleft] = 0;
256                 CFStringAppendCString(cfstring, *inbuf, script_code);
257         }
258
259         /*
260          * Compose characters, using the non-canonical composition
261          * form.
262          */
263         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
264
265         outsize = CFStringGetLength(cfstring);
266         range = CFRangeMake(0,outsize);
267
268         if (outsize == 0) {
269                 /*
270                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
271                  * errors here.  That function will always pass 2
272                  * characters.  smbd/open.c:check_for_pipe() cuts a
273                  * patchname to 10 characters blindly.  Suppress the
274                  * debug output in those cases.
275                  */
276                 if(2 != *inbytesleft && 10 != *inbytesleft) {
277                         debug_out("String conversion: "
278                                   "An unknown error occurred\n");
279                         hexdump("UTF8->UTF16LE (old) input",
280                                 *inbuf, *inbytesleft);
281                 }
282                 errno = EILSEQ; /* Not sure, but this is what we have
283                                  * actually seen. */
284                 return -1;
285         }
286         if (outsize*2 > *outbytesleft) {
287                 CFStringDelete(cfstring, range);
288                 debug_out("String conversion: "
289                           "Output buffer too small\n");
290                 hexdump("UTF8->UTF16LE (old) input",
291                         *inbuf, *inbytesleft);
292                 errno = E2BIG;
293                 return -1;
294         }
295
296         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
297         CFStringDelete(cfstring, range);
298
299         native_to_le(*outbuf, outsize*2);
300
301         /*
302          * Add a converted null byte, if the CFString conversions
303          * prevented that until now.
304          */
305         if (0 == (*inbuf)[*inbytesleft-1] && 
306             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
307
308                 if ((outsize*2+2) > *outbytesleft) {
309                         debug_out("String conversion: "
310                                   "Output buffer too small\n");
311                         hexdump("UTF8->UTF16LE (old) input",
312                                 *inbuf, *inbytesleft);
313                         errno = E2BIG;
314                         return -1;
315                 }
316
317                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
318                 outsize += 2;
319         }
320
321         *inbuf += *inbytesleft;
322         *inbytesleft = 0;
323         *outbuf += outsize*2;
324         *outbytesleft -= outsize*2;
325
326         return 0;
327 }
328
329 size_t macosxfs_encoding_push(
330         void *cd,                               /* Encoder handle */
331         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
332         char **outbuf, size_t *outbytesleft)    /* Script string */
333 {
334         static const int script_code = kCFStringEncodingUTF8;
335         static CFMutableStringRef cfstring = NULL;
336         static UniChar *buffer = NULL;
337         static size_t buflen = 0;
338         CFIndex outsize, cfsize, charsconverted;
339
340         (void) cd; /* UNUSED */
341
342         if (0 == *inbytesleft) {
343                 return 0;
344         }
345
346         /*
347          * We need a buffer that can hold 4 times the original data,
348          * because that is the theoretical maximum that decomposition
349          * can create currently (in Unicode 4.0).
350          */
351         buffer = set_ucbuffer_with_le_copy(
352                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
353
354         if (NULL == cfstring) {
355                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
356                         kCFAllocatorDefault,
357                         buffer, *inbytesleft/2, buflen/2,
358                         kCFAllocatorNull);
359         } else {
360                 CFStringSetExternalCharactersNoCopy(
361                         cfstring,
362                         buffer, *inbytesleft/2, buflen/2);
363         }
364
365         /*
366          * Decompose characters, using the non-canonical decomposition
367          * form.
368          *
369          * NB: This isn't exactly what HFS+ wants (see note on
370          * kCFStringEncodingUseHFSPlusCanonical in
371          * CFStringEncodingConverter.h), but AFAIK it's the best that
372          * the official API can do.
373          */
374         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
375
376         cfsize = CFStringGetLength(cfstring);
377         charsconverted = CFStringGetBytes(
378                 cfstring, CFRangeMake(0,cfsize),
379                 script_code, 0, false,
380                 *outbuf, *outbytesleft, &outsize);
381
382         if (0 == charsconverted) {
383                 debug_out("String conversion: "
384                           "Buffer too small or not convertable\n");
385                 hexdump("UTF16LE->UTF8 (old) input",
386                         *inbuf, *inbytesleft);
387                 errno = EILSEQ; /* Probably more likely. */
388                 return -1;
389         }
390
391         /*
392          * Add a converted null byte, if the CFString conversions
393          * prevented that until now.
394          */
395         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
396             (0 != (*outbuf)[outsize-1])) {
397
398                 if (((size_t)outsize+1) > *outbytesleft) {
399                         debug_out("String conversion: "
400                                   "Output buffer too small\n");
401                         hexdump("UTF16LE->UTF8 (old) input",
402                                 *inbuf, *inbytesleft);
403                         errno = E2BIG;
404                         return -1;
405                 }
406
407                 (*outbuf)[outsize] = 0;
408                 ++outsize;
409         }
410
411         *inbuf += *inbytesleft;
412         *inbytesleft = 0;
413         *outbuf += outsize;
414         *outbytesleft -= outsize;
415
416         return 0;
417 }
418
419 #else /* USE_INTERNAL_API */
420
421 /*
422  * An implementation based on internal code as known from the
423  * OpenDarwin CVS.
424  *
425  * This code doesn't need much memory management because it uses
426  * functions that operate on the raw memory directly.
427  *
428  * The push routine here is faster and more compatible with HFS+ than
429  * the other implementation above.  The pull routine is only faster
430  * for some strings, slightly slower for others.  The pull routine
431  * looses because it has to iterate over the data twice, once to
432  * decode UTF-8 and than to do the character composition required by
433  * Windows.
434  */
435 static size_t macosxfs_encoding_pull(
436         void *cd,                               /* Encoder handle */
437         const char **inbuf, size_t *inbytesleft, /* Script string */
438         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
439 {
440         static const int script_code = kCFStringEncodingUTF8;
441         UInt32 srcCharsUsed = 0;
442         UInt32 dstCharsUsed = 0;
443         UInt32 result;
444         uint32_t dstDecomposedUsed = 0;
445         uint32_t dstPrecomposedUsed = 0;
446
447         (void) cd; /* UNUSED */
448
449         if (0 == *inbytesleft) {
450                 return 0;
451         }
452
453         result = CFStringEncodingBytesToUnicode(
454                 script_code, kCFStringEncodingComposeCombinings,
455                 *inbuf, *inbytesleft, &srcCharsUsed,
456                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
457
458         switch(result) {
459         case kCFStringEncodingConversionSuccess:
460                 if (*inbytesleft == srcCharsUsed) {
461                         break;
462                 }
463
464                 FALL_THROUGH;
465         case kCFStringEncodingInsufficientOutputBufferLength:
466                 debug_out("String conversion: "
467                           "Output buffer too small\n");
468                 hexdump("UTF8->UTF16LE (new) input",
469                         *inbuf, *inbytesleft);
470                 errno = E2BIG;
471                 return -1;
472         case kCFStringEncodingInvalidInputStream:
473                 /*
474                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
475                  * errors here.  That function will always pass 2
476                  * characters.  smbd/open.c:check_for_pipe() cuts a
477                  * patchname to 10 characters blindly.  Suppress the
478                  * debug output in those cases.
479                  */
480                 if(2 != *inbytesleft && 10 != *inbytesleft) {
481                         debug_out("String conversion: "
482                                   "Invalid input sequence\n");
483                         hexdump("UTF8->UTF16LE (new) input",
484                                 *inbuf, *inbytesleft);
485                 }
486                 errno = EILSEQ;
487                 return -1;
488         case kCFStringEncodingConverterUnavailable:
489                 debug_out("String conversion: "
490                           "Unknown encoding\n");
491                 hexdump("UTF8->UTF16LE (new) input",
492                         *inbuf, *inbytesleft);
493                 errno = EINVAL;
494                 return -1;
495         }
496
497         /*
498          * It doesn't look like CFStringEncodingBytesToUnicode() can
499          * produce precomposed characters (flags=ComposeCombinings
500          * doesn't do it), so we need another pass over the data here.
501          * We can do this in-place, as the string can only get
502          * shorter.
503          *
504          * (Actually in theory there should be an internal
505          * decomposition and reordering before the actual composition
506          * step.  But we should be able to rely on that we always get
507          * fully decomposed strings for input, so this can't create
508          * problems in reality.)
509          */
510         CFUniCharPrecompose(
511                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
512                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
513
514         native_to_le(*outbuf, dstPrecomposedUsed*2);
515
516         *inbuf += srcCharsUsed;
517         *inbytesleft -= srcCharsUsed;
518         *outbuf += dstPrecomposedUsed*2;
519         *outbytesleft -= dstPrecomposedUsed*2;
520
521         return 0;
522 }
523
524 static size_t macosxfs_encoding_push(
525         void *cd,                               /* Encoder handle */
526         const char **inbuf, size_t *inbytesleft, /* UTF-16-LE string */
527         char **outbuf, size_t *outbytesleft)    /* Script string */
528 {
529         static const int script_code = kCFStringEncodingUTF8;
530         static UniChar *buffer = NULL;
531         static size_t buflen = 0;
532         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
533
534         (void) cd; /* UNUSED */
535
536         if (0 == *inbytesleft) {
537                 return 0;
538         }
539
540         buffer = set_ucbuffer_with_le(
541                 buffer, &buflen, *inbuf, *inbytesleft);
542
543         result = CFStringEncodingUnicodeToBytes(
544                 script_code, kCFStringEncodingUseHFSPlusCanonical,
545                 buffer, *inbytesleft/2, &srcCharsUsed,
546                 *outbuf, *outbytesleft, &dstCharsUsed);
547
548         switch(result) {
549         case kCFStringEncodingConversionSuccess:
550                 if (*inbytesleft/2 == srcCharsUsed) {
551                         break;
552                 }
553
554                 FALL_THROUGH;
555         case kCFStringEncodingInsufficientOutputBufferLength:
556                 debug_out("String conversion: "
557                           "Output buffer too small\n");
558                 hexdump("UTF16LE->UTF8 (new) input",
559                         *inbuf, *inbytesleft);
560                 errno = E2BIG;
561                 return -1;
562         case kCFStringEncodingInvalidInputStream:
563                 /*
564                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
565                  * cuts a pathname to 10 characters blindly.  Suppress
566                  * the debug output in those cases.
567                  */
568                 if(10 != *inbytesleft) {
569                         debug_out("String conversion: "
570                                   "Invalid input sequence\n");
571                         hexdump("UTF16LE->UTF8 (new) input",
572                                 *inbuf, *inbytesleft);
573                 }
574                 errno = EILSEQ;
575                 return -1;
576         case kCFStringEncodingConverterUnavailable:
577                 debug_out("String conversion: "
578                           "Unknown encoding\n");
579                 hexdump("UTF16LE->UTF8 (new) input",
580                         *inbuf, *inbytesleft);
581                 errno = EINVAL;
582                 return -1;
583         }
584
585         *inbuf += srcCharsUsed*2;
586         *inbytesleft -= srcCharsUsed*2;
587         *outbuf += dstCharsUsed;
588         *outbytesleft -= dstCharsUsed;
589
590         return 0;
591 }
592
593 #endif /* USE_INTERNAL_API */
594
595 #else /* DARWIN */
596      
/*
 * Placeholder compiled when DARWINOS is not defined; it does nothing.
 * The prototype directly ahead of the definition gives the function a
 * prior declaration.
 */
void charset_macosfs_dummy(void);

void charset_macosfs_dummy(void)
{
}
602
603 #endif /* DARWIN */