r23784: use the GPLv3 boilerplate as recommended by the FSF and the license text
[tprouty/samba.git] / source / modules / charset_macosxfs.c
1 /* 
2    Unix SMB/CIFS implementation.
3    Samba charset module for Mac OS X/Darwin
4    Copyright (C) Benjamin Riefenstahl 2003
5    
6    This program is free software; you can redistribute it and/or modify
7    it under the terms of the GNU General Public License as published by
8    the Free Software Foundation; either version 3 of the License, or
9    (at your option) any later version.
10    
11    This program is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14    GNU General Public License for more details.
15    
16    You should have received a copy of the GNU General Public License
17    along with this program.  If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*
21  * modules/charset_macosxfs.c
22  *
23  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
24  * and display encoding.
25  *
26  * Actually two implementations are provided here.  The default
27  * implementation is based on the official CFString API.  The other is
28  * based on internal CFString APIs as defined in the OpenDarwin
29  * source.
30  */
31
32 #include "includes.h"
33
34 /*
35  * Include OS frameworks.  These are only needed in this module.
36  */
37 #include <CoreFoundation/CFString.h>
38
39 /*
40  * See if autoconf has found us the internal headers in some form.
41  */
42 #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
43 #       include <CoreFoundation/CFStringEncodingConverter.h>
44 #       include <CoreFoundation/CFUnicodePrecomposition.h>
45 #       define USE_INTERNAL_API 1
46 #elif HAVE_CFSTRINGENCODINGCONVERTER_H
47 #       include <CFStringEncodingConverter.h>
48 #       include <CFUnicodePrecomposition.h>
49 #       define USE_INTERNAL_API 1
50 #endif
51
52 /*
53  * Compile time configuration: Do we want debug output?
54  */
55 /* #define DEBUG_STRINGS 1 */
56
/*
 * A simple, but efficient memory provider for our buffers.
 *
 * Makes sure that `buffer` can hold at least `newsize` bytes.  A buffer
 * whose recorded capacity `*size` is already sufficient is returned
 * untouched.  Otherwise it is reallocated with 128 spare bytes, so that
 * slowly growing requests do not trigger a reallocation every time.
 *
 * Returns the (possibly moved) buffer, or NULL if the reallocation
 * failed.  On failure `*size` is reset to 0 so that the recorded
 * capacity never describes memory we do not own; the next call then
 * attempts a fresh allocation instead of handing back NULL as if it
 * were large enough.
 *
 * NOTE(review): whether SMB_REALLOC releases the old block on failure
 * is not visible from this file - confirm against its definition.
 */
static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
{
        if (newsize > *size) {
                size_t grown_size = newsize + 128;

                buffer = SMB_REALLOC(buffer, grown_size);
                /* Only record the new capacity once we really have it. */
                *size = (NULL != buffer) ? grown_size : 0;
        }
        return buffer;
}
68
69 /*
70  * While there is a version of OpenDarwin for intel, the usual case is
71  * big-endian PPC.  So we need byte swapping to handle the
72  * little-endian byte order of the network protocol.  We also need an
73  * additional dynamic buffer to do this work for incoming data blocks,
74  * because we have to consider the original data as constant.
75  *
76  * We abstract the differences away by providing a simple facade with
77  * these functions/macros:
78  *
79  *      le_to_native(dst,src,len)
80  *      native_to_le(cp,len)
81  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
82  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
83  */
84 #ifdef WORDS_BIGENDIAN
85
/*
 * Copy `len` bytes from `src` to `dst`, exchanging the two bytes of
 * every complete 16-bit unit on the way (UTF-16-LE <-> UTF-16-BE).
 *
 * `dst` and `src` must not overlap.  If `len` is odd, the final
 * unpaired byte is copied unchanged, so the function never touches
 * memory beyond `len` bytes in either buffer.
 */
static inline void swap_bytes (char * dst, const char * src, size_t len)
{
        size_t i;

        for (i = 0; i + 1 < len; i += 2) {
                dst[i] = src[i + 1];
                dst[i + 1] = src[i];
        }
        if (i < len) {
                /* Odd trailing byte: nothing to swap it with. */
                dst[i] = src[i];
        }
}
/*
 * Exchange the two bytes of every complete 16-bit unit in the first
 * `len` bytes of `cp`, in place.
 *
 * If `len` is odd, the final unpaired byte is left untouched, so no
 * byte beyond `cp[len-1]` is ever read or written.
 */
static inline void swap_bytes_inplace (char * cp, size_t len)
{
        size_t i;

        for (i = 0; i + 1 < len; i += 2) {
                char first = cp[i];

                cp[i] = cp[i + 1];
                cp[i + 1] = first;
        }
}
107
/*
 * Big-endian facade: every conversion between wire order and native
 * order is a real byte swap, and incoming data always has to be copied
 * into our own buffer before it can be swapped.
 */
#define le_to_native(dst,src,len)       swap_bytes((dst),(src),(len))
#define native_to_le(cp,len)            swap_bytes_inplace((cp),(len))
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        set_ucbuffer_with_le_copy((buffer),(bufsize),(data),(size),0)
112
113 #else   /* ! WORDS_BIGENDIAN */
114
/*
 * Little-endian facade: wire order already is native order.  Copying is
 * a plain memcpy, the in-place conversion expands to nothing, and
 * set_ucbuffer_with_le() simply hands back the caller's data pointer
 * (only `bufsize` and `data` are evaluated; no buffer is allocated).
 */
#define le_to_native(dst,src,len)       memcpy((dst),(src),(len))
#define native_to_le(cp,len)            /* nothing */
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        ((void)(bufsize), (UniChar*)(data))
119
120 #endif
121
122 static inline UniChar *set_ucbuffer_with_le_copy (
123         UniChar *buffer, size_t *bufsize,
124         const void *data, size_t size, size_t reserve)
125 {
126         buffer = resize_buffer(buffer, bufsize, size+reserve);
127         le_to_native((char*)buffer,data,size);
128         return buffer;
129 }
130
131
/*
 * Debug output helpers for error conditions: debug_out() emits one
 * string at debug level 0, hexdump() (below) dumps a data block.
 *
 * The string is passed as an argument to a fixed "%s" format and never
 * as the format itself, because hexdump() feeds labels and dumped data
 * through here and those may contain '%' characters.
 */
#define debug_out(s)    DEBUG(0,("%s",(s)))
136
137 #ifdef DEBUG_STRINGS
138
/*
 * Write a classic hex dump of the `len` bytes at `s` to the debug log,
 * framed by "<<<<<<<" / ">>>>>>>" markers and preceded by `label`.
 *
 * Each output line shows the offset, up to 16 bytes as two groups of
 * eight hex pairs, and the same bytes as text with everything outside
 * printable ASCII replaced by '.'.
 */
static void hexdump( const char * label, const char * s, size_t len )
{
        size_t remaining = len;

        debug_out("<<<<<<<\n");
        debug_out(label);
        debug_out("\n");

        while (remaining > 0) {
                char line[100];
                char *out = line;
                /* Number of real data bytes on this line. */
                size_t shown = (remaining < 16) ? remaining : 16;
                size_t col;
#undef sprintf
                out += sprintf(out, "%04X ", (unsigned)(len-remaining));
                *out++ = ' ';

                /* Hex columns; short lines are padded to full width. */
                for (col = 0; col < 16; ++col) {
                        if (8 == col) {
                                /* Gap between the two groups of eight. */
                                *out++ = ' ';
                        }
                        if (col < shown) {
                                out += sprintf(out, "%02X ",
                                               ((unsigned)s[col]) & 0xFF);
                        } else {
                                out += sprintf(out, "   ");
                        }
                }
                *out++ = ' ';

                /* Text column. */
                for (col = 0; col < shown; ++col) {
                        int unprintable = (s[col] < ' ' || s[col] >= 0x7F ||
                                           !isprint(s[col]));

                        *out++ = unprintable ? '.' : s[col];
                }
                *out++ = '\n';
                *out = 0;

                remaining -= shown;
                s += shown;
                debug_out(line);
        }

        debug_out(">>>>>>>\n");
}
180
181 #else   /* !DEBUG_STRINGS */
182
/* Without DEBUG_STRINGS a hexdump() call expands to nothing at all. */
#define hexdump(label,data,size) /* nothing */
184
185 #endif
186
187
188 #if !USE_INTERNAL_API
189
190 /*
191  * An implementation based on documented Mac OS X APIs.
192  *
193  * This does a certain amount of memory management, creating and
194  * manipulating CFString objects.  We try to minimize the impact by
195  * keeping those objects around and re-using them.  We also use
196  * external backing store for the CFStrings where this is possible and
197  * beneficial.
198  *
199  * The Unicode normalization forms available at this level are
200  * generic, not specifically for the file system.  So they may not be
201  * perfect fits.
202  */
203 static size_t macosxfs_encoding_pull(
204         void *cd,                                   /* Encoder handle */
205         const char **inbuf, size_t *inbytesleft,    /* Script string */
206         char **outbuf, size_t *outbytesleft)        /* UTF-16-LE string */
207 {
208         static const int script_code = kCFStringEncodingUTF8;
209         static CFMutableStringRef cfstring = NULL;
210         size_t outsize;
211         CFRange range;
212
213         (void) cd; /* UNUSED */
214
215         if (0 == *inbytesleft) {
216                 return 0;
217         }
218
219         if (NULL == cfstring) {
220                 /*
221                  * A version with an external backing store as in the
222                  * push function should have been more efficient, but
223                  * testing shows, that it is actually slower (!).
224                  * Maybe kCFAllocatorDefault gets shortcut evaluation
225                  * internally, while kCFAllocatorNull doesn't.
226                  */
227                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
228         }
229
230         /*
231          * Three methods of appending to a CFString, choose the most
232          * efficient.
233          */
234         if (0 == (*inbuf)[*inbytesleft-1]) {
235                 CFStringAppendCString(cfstring, *inbuf, script_code);
236         } else if (*inbytesleft <= 255) {
237                 Str255 buffer;
238                 buffer[0] = *inbytesleft;
239                 memcpy(buffer+1, *inbuf, buffer[0]);
240                 CFStringAppendPascalString(cfstring, buffer, script_code);
241         } else {
242                 /*
243                  * We would like to use a fixed buffer and a loop
244                  * here, but than we can't garantee that the input is
245                  * well-formed UTF-8, as we are supposed to do.
246                  */
247                 static char *buffer = NULL;
248                 static size_t buflen = 0;
249                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
250                 memcpy(buffer, *inbuf, *inbytesleft);
251                 buffer[*inbytesleft] = 0;
252                 CFStringAppendCString(cfstring, *inbuf, script_code);
253         }
254
255         /*
256          * Compose characters, using the non-canonical composition
257          * form.
258          */
259         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
260
261         outsize = CFStringGetLength(cfstring);
262         range = CFRangeMake(0,outsize);
263
264         if (outsize == 0) {
265                 /*
266                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
267                  * errors here.  That function will always pass 2
268                  * characters.  smbd/open.c:check_for_pipe() cuts a
269                  * patchname to 10 characters blindly.  Suppress the
270                  * debug output in those cases.
271                  */
272                 if(2 != *inbytesleft && 10 != *inbytesleft) {
273                         debug_out("String conversion: "
274                                   "An unknown error occurred\n");
275                         hexdump("UTF8->UTF16LE (old) input",
276                                 *inbuf, *inbytesleft);
277                 }
278                 errno = EILSEQ; /* Not sure, but this is what we have
279                                  * actually seen. */
280                 return -1;
281         }
282         if (outsize*2 > *outbytesleft) {
283                 CFStringDelete(cfstring, range);
284                 debug_out("String conversion: "
285                           "Output buffer too small\n");
286                 hexdump("UTF8->UTF16LE (old) input",
287                         *inbuf, *inbytesleft);
288                 errno = E2BIG;
289                 return -1;
290         }
291
292         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
293         CFStringDelete(cfstring, range);
294
295         native_to_le(*outbuf, outsize*2);
296
297         /*
298          * Add a converted null byte, if the CFString conversions
299          * prevented that until now.
300          */
301         if (0 == (*inbuf)[*inbytesleft-1] && 
302             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
303
304                 if ((outsize*2+2) > *outbytesleft) {
305                         debug_out("String conversion: "
306                                   "Output buffer too small\n");
307                         hexdump("UTF8->UTF16LE (old) input",
308                                 *inbuf, *inbytesleft);
309                         errno = E2BIG;
310                         return -1;
311                 }
312
313                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
314                 outsize += 2;
315         }
316
317         *inbuf += *inbytesleft;
318         *inbytesleft = 0;
319         *outbuf += outsize*2;
320         *outbytesleft -= outsize*2;
321
322         return 0;
323 }
324
325 static size_t macosxfs_encoding_push(
326         void *cd,                                   /* Encoder handle */
327         const char **inbuf, size_t *inbytesleft,    /* UTF-16-LE string */
328         char **outbuf, size_t *outbytesleft)        /* Script string */
329 {
330         static const int script_code = kCFStringEncodingUTF8;
331         static CFMutableStringRef cfstring = NULL;
332         static UniChar *buffer = NULL;
333         static size_t buflen = 0;
334         CFIndex outsize, cfsize, charsconverted;
335
336         (void) cd; /* UNUSED */
337
338         if (0 == *inbytesleft) {
339                 return 0;
340         }
341
342         /*
343          * We need a buffer that can hold 4 times the original data,
344          * because that is the theoretical maximum that decomposition
345          * can create currently (in Unicode 4.0).
346          */
347         buffer = set_ucbuffer_with_le_copy(
348                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
349
350         if (NULL == cfstring) {
351                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
352                         kCFAllocatorDefault,
353                         buffer, *inbytesleft/2, buflen/2,
354                         kCFAllocatorNull);
355         } else {
356                 CFStringSetExternalCharactersNoCopy(
357                         cfstring,
358                         buffer, *inbytesleft/2, buflen/2);
359         }
360
361         /*
362          * Decompose characters, using the non-canonical decomposition
363          * form.
364          *
365          * NB: This isn't exactly what HFS+ wants (see note on
366          * kCFStringEncodingUseHFSPlusCanonical in
367          * CFStringEncodingConverter.h), but AFAIK it's the best that
368          * the official API can do.
369          */
370         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
371
372         cfsize = CFStringGetLength(cfstring);
373         charsconverted = CFStringGetBytes(
374                 cfstring, CFRangeMake(0,cfsize),
375                 script_code, 0, False,
376                 (uint8_t *)(*outbuf), *outbytesleft, &outsize);
377
378         if (0 == charsconverted) {
379                 debug_out("String conversion: "
380                           "Buffer too small or not convertable\n");
381                 hexdump("UTF16LE->UTF8 (old) input",
382                         *inbuf, *inbytesleft);
383                 errno = EILSEQ; /* Probably more likely. */
384                 return -1;
385         }
386
387         /*
388          * Add a converted null byte, if the CFString conversions
389          * prevented that until now.
390          */
391         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
392             (0 != (*outbuf)[outsize-1])) {
393
394                 if (((size_t)outsize+1) > *outbytesleft) {
395                         debug_out("String conversion: "
396                                   "Output buffer too small\n");
397                         hexdump("UTF16LE->UTF8 (old) input",
398                                 *inbuf, *inbytesleft);
399                         errno = E2BIG;
400                         return -1;
401                 }
402
403                 (*outbuf)[outsize] = 0;
404                 ++outsize;
405         }
406
407         *inbuf += *inbytesleft;
408         *inbytesleft = 0;
409         *outbuf += outsize;
410         *outbytesleft -= outsize;
411
412         return 0;
413 }
414
415 #else /* USE_INTERNAL_API */
416
417 /*
418  * An implementation based on internal code as known from the
419  * OpenDarwin CVS.
420  *
421  * This code doesn't need much memory management because it uses
422  * functions that operate on the raw memory directly.
423  *
424  * The push routine here is faster and more compatible with HFS+ than
425  * the other implementation above.  The pull routine is only faster
426  * for some strings, slightly slower for others.  The pull routine
427  * loses because it has to iterate over the data twice, once to
428  * decode UTF-8 and then to do the character composition required by
429  * Windows.
430  */
431 static size_t macosxfs_encoding_pull(
432         void *cd,                               /* Encoder handle */
433         char **inbuf, size_t *inbytesleft,      /* Script string */
434         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
435 {
436         static const int script_code = kCFStringEncodingUTF8;
437         UInt32 srcCharsUsed = 0;
438         UInt32 dstCharsUsed = 0;
439         UInt32 result;
440         uint32_t dstDecomposedUsed = 0;
441         uint32_t dstPrecomposedUsed = 0;
442
443         (void) cd; /* UNUSED */
444
445         if (0 == *inbytesleft) {
446                 return 0;
447         }
448
449         result = CFStringEncodingBytesToUnicode(
450                 script_code, kCFStringEncodingComposeCombinings,
451                 *inbuf, *inbytesleft, &srcCharsUsed,
452                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
453
454         switch(result) {
455         case kCFStringEncodingConversionSuccess:
456                 if (*inbytesleft == srcCharsUsed)
457                         break;
458                 else
459                         ; /*fall through*/
460         case kCFStringEncodingInsufficientOutputBufferLength:
461                 debug_out("String conversion: "
462                           "Output buffer too small\n");
463                 hexdump("UTF8->UTF16LE (new) input",
464                         *inbuf, *inbytesleft);
465                 errno = E2BIG;
466                 return -1;
467         case kCFStringEncodingInvalidInputStream:
468                 /*
469                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
470                  * errors here.  That function will always pass 2
471                  * characters.  smbd/open.c:check_for_pipe() cuts a
472                  * patchname to 10 characters blindly.  Suppress the
473                  * debug output in those cases.
474                  */
475                 if(2 != *inbytesleft && 10 != *inbytesleft) {
476                         debug_out("String conversion: "
477                                   "Invalid input sequence\n");
478                         hexdump("UTF8->UTF16LE (new) input",
479                                 *inbuf, *inbytesleft);
480                 }
481                 errno = EILSEQ;
482                 return -1;
483         case kCFStringEncodingConverterUnavailable:
484                 debug_out("String conversion: "
485                           "Unknown encoding\n");
486                 hexdump("UTF8->UTF16LE (new) input",
487                         *inbuf, *inbytesleft);
488                 errno = EINVAL;
489                 return -1;
490         }
491
492         /*
493          * It doesn't look like CFStringEncodingBytesToUnicode() can
494          * produce precomposed characters (flags=ComposeCombinings
495          * doesn't do it), so we need another pass over the data here.
496          * We can do this in-place, as the string can only get
497          * shorter.
498          *
499          * (Actually in theory there should be an internal
500          * decomposition and reordering before the actual composition
501          * step.  But we should be able to rely on that we always get
502          * fully decomposed strings for input, so this can't create
503          * problems in reality.)
504          */
505         CFUniCharPrecompose(
506                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
507                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
508
509         native_to_le(*outbuf, dstPrecomposedUsed*2);
510
511         *inbuf += srcCharsUsed;
512         *inbytesleft -= srcCharsUsed;
513         *outbuf += dstPrecomposedUsed*2;
514         *outbytesleft -= dstPrecomposedUsed*2;
515
516         return 0;
517 }
518
519 static size_t macosxfs_encoding_push(
520         void *cd,                               /* Encoder handle */
521         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
522         char **outbuf, size_t *outbytesleft)    /* Script string */
523 {
524         static const int script_code = kCFStringEncodingUTF8;
525         static UniChar *buffer = NULL;
526         static size_t buflen = 0;
527         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
528
529         (void) cd; /* UNUSED */
530
531         if (0 == *inbytesleft) {
532                 return 0;
533         }
534
535         buffer = set_ucbuffer_with_le(
536                 buffer, &buflen, *inbuf, *inbytesleft);
537
538         result = CFStringEncodingUnicodeToBytes(
539                 script_code, kCFStringEncodingUseHFSPlusCanonical,
540                 buffer, *inbytesleft/2, &srcCharsUsed,
541                 *outbuf, *outbytesleft, &dstCharsUsed);
542
543         switch(result) {
544         case kCFStringEncodingConversionSuccess:
545                 if (*inbytesleft/2 == srcCharsUsed)
546                         break;
547                 else
548                         ; /*fall through*/
549         case kCFStringEncodingInsufficientOutputBufferLength:
550                 debug_out("String conversion: "
551                           "Output buffer too small\n");
552                 hexdump("UTF16LE->UTF8 (new) input",
553                         *inbuf, *inbytesleft);
554                 errno = E2BIG;
555                 return -1;
556         case kCFStringEncodingInvalidInputStream:
557                 /*
558                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
559                  * cuts a pathname to 10 characters blindly.  Suppress
560                  * the debug output in those cases.
561                  */
562                 if(10 != *inbytesleft) {
563                         debug_out("String conversion: "
564                                   "Invalid input sequence\n");
565                         hexdump("UTF16LE->UTF8 (new) input",
566                                 *inbuf, *inbytesleft);
567                 }
568                 errno = EILSEQ;
569                 return -1;
570         case kCFStringEncodingConverterUnavailable:
571                 debug_out("String conversion: "
572                           "Unknown encoding\n");
573                 hexdump("UTF16LE->UTF8 (new) input",
574                         *inbuf, *inbytesleft);
575                 errno = EINVAL;
576                 return -1;
577         }
578
579         *inbuf += srcCharsUsed*2;
580         *inbytesleft -= srcCharsUsed*2;
581         *outbuf += dstCharsUsed;
582         *outbytesleft -= dstCharsUsed;
583
584         return 0;
585 }
586
587 #endif /* USE_INTERNAL_API */
588
589 /*
590  * For initialization, actually install the encoding as "macosxfs".
591  */
592 static struct charset_functions macosxfs_encoding_functions = {
593         "MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
594 };
595
596 NTSTATUS charset_macosxfs_init(void)
597 {
598         return smb_register_charset(&macosxfs_encoding_functions);
599 }
600
601 /* eof */