source3/modules/charset_macosxfs.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba charset module for Mac OS X/Darwin
   4    Copyright (C) Benjamin Riefenstahl 2003
   5
   6    This program is free software; you can redistribute it and/or modify
   7    it under the terms of the GNU General Public License as published by
   8    the Free Software Foundation; either version 2 of the License, or
   9    (at your option) any later version.
  10
  11    This program is distributed in the hope that it will be useful,
  12    but WITHOUT ANY WARRANTY; without even the implied warranty of
  13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14    GNU General Public License for more details.
  15
  16    You should have received a copy of the GNU General Public License
  17    along with this program; if not, write to the Free Software
  18    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  19 */
  20
  21 /*
  22  * modules/charset_macosxfs.c
  23  *
  24  * A Samba charset module to use on Mac OS X/Darwin as the filesystem
  25  * and display encoding.
  26  *
  27  * Actually two implementations are provided here.  The default
  28  * implementation is based on the official CFString API.  The other is
  29  * based on internal CFString APIs as defined in the OpenDarwin
  30  * source.
  31  */
  32
  33 #include "includes.h"
  34
  35 /*
  36  * Include OS frameworks.  These are only needed in this module.
  37  */
  38 #include <CoreFoundation/CFString.h>
  39
  40 /*
  41  * See if autoconf has found us the internal headers in some form.
  42  */
  43 #if HAVE_COREFOUNDATION_CFSTRINGENCODINGCONVERTER_H
  44 #       include <Corefoundation/CFStringEncodingConverter.h>
  45 #       include <Corefoundation/CFUnicodePrecomposition.h>
  46 #       define USE_INTERNAL_API 1
  47 #elif HAVE_CFSTRINGENCODINGCONVERTER_H
  48 #       include <CFStringEncodingConverter.h>
  49 #       include <CFUnicodePrecomposition.h>
  50 #       define USE_INTERNAL_API 1
  51 #endif
  52
  53 /*
  54  * Compile time configuration: Do we want debug output?
  55  */
  56 /* #define DEBUG_STRINGS 1 */
  57
  58 /*
  59  * A simple, but efficient memory provider for our buffers.
  60  */
  61 static inline void *resize_buffer (void *buffer, size_t *size, size_t newsize)
  62 {
  63         if (newsize > *size) {
  64                 *size = newsize + 128;
  65                 buffer = realloc(buffer, *size);
  66         }
  67         return buffer;
  68 }
  69
  70 /*
  71  * While there is a version of OpenDarwin for intel, the usual case is
  72  * big-endian PPC.  So we need byte swapping to handle the
  73  * little-endian byte order of the network protocol.  We also need an
  74  * additional dynamic buffer to do this work for incoming data blocks,
  75  * because we have to consider the original data as constant.
  76  *
  77  * We abstract the differences away by providing a simple facade with
  78  * these functions/macros:
  79  *
  80  *      le_to_native(dst,src,len)
  81  *      native_to_le(cp,len)
  82  *      set_ucbuffer_with_le(buffer,bufsize,data,size)
  83  *      set_ucbuffer_with_le_copy(buffer,bufsize,data,size,reserve)
  84  */
  85 #ifdef WORDS_BIGENDIAN
  86
  87 static inline void swap_bytes (char * dst, const char * src, size_t len)
  88 {
  89         const char *srcend = src + len;
  90         while (src < srcend) {
  91                 dst[0] = src[1];
  92                 dst[1] = src[0];
  93                 dst += 2;
  94                 src += 2;
  95         }
  96 }
  97 static inline void swap_bytes_inplace (char * cp, size_t len)
  98 {
  99         char temp;
 100         char *end = cp + len;
 101         while (cp  < end) {
 102                 temp = cp[1];
 103                 cp[1] = cp[0];
 104                 cp[0] = temp;
 105                 cp += 2;
 106         }
 107 }
 108
/*
 * Big-endian variant of the facade:
 *   le_to_native          copies src to dst while swapping byte pairs.
 *   native_to_le          swaps byte pairs in place.
 *   set_ucbuffer_with_le  copies the data into the dynamic buffer via
 *                         set_ucbuffer_with_le_copy() with a reserve
 *                         of 0.
 */
#define le_to_native(dst,src,len)       swap_bytes(dst,src,len)
#define native_to_le(cp,len)            swap_bytes_inplace(cp,len)
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        set_ucbuffer_with_le_copy(buffer,bufsize,data,size,0)
 113
 114 #else   /* ! WORDS_BIGENDIAN */
 115
/*
 * Little-endian variant of the facade:
 *   le_to_native          is a plain memcpy().
 *   native_to_le          expands to nothing.
 *   set_ucbuffer_with_le  makes no copy: it evaluates `bufsize` only
 *                         to discard it and yields `data` itself,
 *                         cast to UniChar*.  `buffer` and `size` do
 *                         not appear in the expansion.
 */
#define le_to_native(dst,src,len)       memcpy(dst,src,len)
#define native_to_le(cp,len)            /* nothing */
#define set_ucbuffer_with_le(buffer,bufsize,data,size) \
        (((void)(bufsize)),(UniChar*)(data))
 120
 121 #endif
 122
 123 static inline UniChar *set_ucbuffer_with_le_copy (
 124         UniChar *buffer, size_t *bufsize,
 125         const void *data, size_t size, size_t reserve)
 126 {
 127         buffer = resize_buffer(buffer, bufsize, size+reserve);
 128         le_to_native((char*)buffer,data,size);
 129         return buffer;
 130 }
 131
 132
 133 /*
 134  * A simple hexdump function for debugging error conditions.
 135  */
 136 #define debug_out(s)    DEBUG(0,(s))
 137
 138 #ifdef DEBUG_STRINGS
 139
 140 static void hexdump( const char * label, const char * s, size_t len )
 141 {
 142         size_t restlen = len;
 143         debug_out("<<<<<<<\n");
 144         debug_out(label);
 145         debug_out("\n");
 146         while (restlen > 0) {
 147                 char line[100];
 148                 size_t i, j;
 149                 char * d = line;
 150 #undef sprintf
 151                 d += sprintf(d, "%04X ", (unsigned)(len-restlen));
 152                 *d++ = ' ';
 153                 for( i = 0; i<restlen && i<8; ++i ) {
 154                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 155                 }
 156                 for( j = i; j<8; ++j ) {
 157                         d += sprintf(d, "   ");
 158                 }
 159                 *d++ = ' ';
 160                 for( i = 8; i<restlen && i<16; ++i ) {
 161                         d += sprintf(d, "%02X ", ((unsigned)s[i]) & 0xFF);
 162                 }
 163                 for( j = i; j<16; ++j ) {
 164                         d += sprintf(d, "   ");
 165                 }
 166                 *d++ = ' ';
 167                 for( i = 0; i<restlen && i<16; ++i ) {
 168                         if(s[i] < ' ' || s[i] >= 0x7F || !isprint(s[i]))
 169                                 *d++ = '.';
 170                         else
 171                                 *d++ = s[i];
 172                 }
 173                 *d++ = '\n';
 174                 *d = 0;
 175                 restlen -= i;
 176                 s += i;
 177                 debug_out(line);
 178         }
 179         debug_out(">>>>>>>\n");
 180 }
 181
 182 #else   /* !DEBUG_STRINGS */
 183
 184 #define hexdump(label,s,len) /* nothing */
 185
 186 #endif
 187
 188
 189 #if !USE_INTERNAL_API
 190
 191 /*
 192  * An implementation based on documented Mac OS X APIs.
 193  *
 194  * This does a certain amount of memory management, creating and
 195  * manipulating CFString objects.  We try to minimize the impact by
 196  * keeping those objects around and re-using them.  We also use
 197  * external backing store for the CFStrings where this is possible and
 198  * beneficial.
 199  *
 200  * The Unicode normalizations forms available at this level are
 201  * generic, not specifically for the file system.  So they may not be
 202  * perfect fits.
 203  */
 204 static size_t macosxfs_encoding_pull(
 205         void *cd,                               /* Encoder handle */
 206         char **inbuf, size_t *inbytesleft,      /* Script string */
 207         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 208 {
 209         static const int script_code = kCFStringEncodingUTF8;
 210         static CFMutableStringRef cfstring = NULL;
 211         size_t outsize;
 212         CFRange range;
 213
 214         (void) cd; /* UNUSED */
 215
 216         if (0 == *inbytesleft) {
 217                 return 0;
 218         }
 219
 220         if (NULL == cfstring) {
 221                 /*
 222                  * A version with an external backing store as in the
 223                  * push function should have been more efficient, but
 224                  * testing shows, that it is actually slower (!).
 225                  * Maybe kCFAllocatorDefault gets shortcut evaluation
 226                  * internally, while kCFAllocatorNull doesn't.
 227                  */
 228                 cfstring = CFStringCreateMutable(kCFAllocatorDefault,0);
 229         }
 230
 231         /*
 232          * Three methods of appending to a CFString, choose the most
 233          * efficient.
 234          */
 235         if (0 == (*inbuf)[*inbytesleft-1]) {
 236                 CFStringAppendCString(cfstring, *inbuf, script_code);
 237         } else if (*inbytesleft <= 255) {
 238                 Str255 buffer;
 239                 buffer[0] = *inbytesleft;
 240                 memcpy(buffer+1, *inbuf, buffer[0]);
 241                 CFStringAppendPascalString(cfstring, buffer, script_code);
 242         } else {
 243                 /*
 244                  * We would like to use a fixed buffer and a loop
 245                  * here, but than we can't garantee that the input is
 246                  * well-formed UTF-8, as we are supposed to do.
 247                  */
 248                 static char *buffer = NULL;
 249                 static size_t buflen = 0;
 250                 buffer = resize_buffer(buffer, &buflen, *inbytesleft+1);
 251                 memcpy(buffer, *inbuf, *inbytesleft);
 252                 buffer[*inbytesleft] = 0;
 253                 CFStringAppendCString(cfstring, *inbuf, script_code);
 254         }
 255
 256         /*
 257          * Compose characters, using the non-canonical composition
 258          * form.
 259          */
 260         CFStringNormalize(cfstring, kCFStringNormalizationFormC);
 261
 262         outsize = CFStringGetLength(cfstring);
 263         range = CFRangeMake(0,outsize);
 264
 265         if (outsize == 0) {
 266                 /*
 267                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 268                  * errors here.  That function will always pass 2
 269                  * characters.  smbd/open.c:check_for_pipe() cuts a
 270                  * patchname to 10 characters blindly.  Suppress the
 271                  * debug output in those cases.
 272                  */
 273                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 274                         debug_out("String conversion: "
 275                                   "An unknown error occurred\n");
 276                         hexdump("UTF8->UTF16LE (old) input",
 277                                 *inbuf, *inbytesleft);
 278                 }
 279                 errno = EILSEQ; /* Not sure, but this is what we have
 280                                  * actually seen. */
 281                 return -1;
 282         }
 283         if (outsize*2 > *outbytesleft) {
 284                 CFStringDelete(cfstring, range);
 285                 debug_out("String conversion: "
 286                           "Output buffer too small\n");
 287                 hexdump("UTF8->UTF16LE (old) input",
 288                         *inbuf, *inbytesleft);
 289                 errno = E2BIG;
 290                 return -1;
 291         }
 292
 293         CFStringGetCharacters(cfstring, range, (UniChar*)*outbuf);
 294         CFStringDelete(cfstring, range);
 295
 296         native_to_le(*outbuf, outsize*2);
 297
 298         /*
 299          * Add a converted null byte, if the CFString conversions
 300          * prevented that until now.
 301          */
 302         if (0 == (*inbuf)[*inbytesleft-1] &&
 303             (0 != (*outbuf)[outsize*2-1] || 0 != (*outbuf)[outsize*2-2])) {
 304
 305                 if ((outsize*2+2) > *outbytesleft) {
 306                         debug_out("String conversion: "
 307                                   "Output buffer too small\n");
 308                         hexdump("UTF8->UTF16LE (old) input",
 309                                 *inbuf, *inbytesleft);
 310                         errno = E2BIG;
 311                         return -1;
 312                 }
 313
 314                 (*outbuf)[outsize*2] = (*outbuf)[outsize*2+1] = 0;
 315                 outsize += 2;
 316         }
 317
 318         *inbuf += *inbytesleft;
 319         *inbytesleft = 0;
 320         *outbuf += outsize*2;
 321         *outbytesleft -= outsize*2;
 322
 323         return 0;
 324 }
 325
 326 static size_t macosxfs_encoding_push(
 327         void *cd,                               /* Encoder handle */
 328         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
 329         char **outbuf, size_t *outbytesleft)    /* Script string */
 330 {
 331         static const int script_code = kCFStringEncodingUTF8;
 332         static CFMutableStringRef cfstring = NULL;
 333         static UniChar *buffer = NULL;
 334         static size_t buflen = 0;
 335         CFIndex outsize, cfsize, charsconverted;
 336
 337         (void) cd; /* UNUSED */
 338
 339         if (0 == *inbytesleft) {
 340                 return 0;
 341         }
 342
 343         /*
 344          * We need a buffer that can hold 4 times the original data,
 345          * because that is the theoretical maximum that decomposition
 346          * can create currently (in Unicode 4.0).
 347          */
 348         buffer = set_ucbuffer_with_le_copy(
 349                 buffer, &buflen, *inbuf, *inbytesleft, 3 * *inbytesleft);
 350
 351         if (NULL == cfstring) {
 352                 cfstring = CFStringCreateMutableWithExternalCharactersNoCopy(
 353                         kCFAllocatorDefault,
 354                         buffer, *inbytesleft/2, buflen/2,
 355                         kCFAllocatorNull);
 356         } else {
 357                 CFStringSetExternalCharactersNoCopy(
 358                         cfstring,
 359                         buffer, *inbytesleft/2, buflen/2);
 360         }
 361
 362         /*
 363          * Decompose characters, using the non-canonical decomposition
 364          * form.
 365          *
 366          * NB: This isn't exactly what HFS+ wants (see note on
 367          * kCFStringEncodingUseHFSPlusCanonical in
 368          * CFStringEncodingConverter.h), but AFAIK it's the best that
 369          * the official API can do.
 370          */
 371         CFStringNormalize(cfstring, kCFStringNormalizationFormD);
 372
 373         cfsize = CFStringGetLength(cfstring);
 374         charsconverted = CFStringGetBytes(
 375                 cfstring, CFRangeMake(0,cfsize),
 376                 script_code, 0, False,
 377                 *outbuf, *outbytesleft, &outsize);
 378
 379         if (0 == charsconverted) {
 380                 debug_out("String conversion: "
 381                           "Buffer too small or not convertable\n");
 382                 hexdump("UTF16LE->UTF8 (old) input",
 383                         *inbuf, *inbytesleft);
 384                 errno = EILSEQ; /* Probably more likely. */
 385                 return -1;
 386         }
 387
 388         /*
 389          * Add a converted null byte, if the CFString conversions
 390          * prevented that until now.
 391          */
 392         if (0 == (*inbuf)[*inbytesleft-1] && 0 == (*inbuf)[*inbytesleft-2] &&
 393             (0 != (*outbuf)[outsize-1])) {
 394
 395                 if (((size_t)outsize+1) > *outbytesleft) {
 396                         debug_out("String conversion: "
 397                                   "Output buffer too small\n");
 398                         hexdump("UTF16LE->UTF8 (old) input",
 399                                 *inbuf, *inbytesleft);
 400                         errno = E2BIG;
 401                         return -1;
 402                 }
 403
 404                 (*outbuf)[outsize] = 0;
 405                 ++outsize;
 406         }
 407
 408         *inbuf += *inbytesleft;
 409         *inbytesleft = 0;
 410         *outbuf += outsize;
 411         *outbytesleft -= outsize;
 412
 413         return 0;
 414 }
 415
 416 #else /* USE_INTERNAL_API */
 417
 418 /*
 419  * An implementation based on internal code as known from the
 420  * OpenDarwin CVS.
 421  *
 422  * This code doesn't need much memory management because it uses
 423  * functions that operate on the raw memory directly.
 424  *
 425  * The push routine here is faster and more compatible with HFS+ than
 426  * the other implementation above.  The pull routine is only faster
 427  * for some strings, slightly slower for others.  The pull routine
 428  * loses because it has to iterate over the data twice, once to
 429  * decode UTF-8 and then to do the character composition required by
 430  * Windows.
 431  */
 432 static size_t macosxfs_encoding_pull(
 433         void *cd,                               /* Encoder handle */
 434         char **inbuf, size_t *inbytesleft,      /* Script string */
 435         char **outbuf, size_t *outbytesleft)    /* UTF-16-LE string */
 436 {
 437         static const int script_code = kCFStringEncodingUTF8;
 438         UInt32 srcCharsUsed = 0;
 439         UInt32 dstCharsUsed = 0;
 440         UInt32 result;
 441         uint32_t dstDecomposedUsed = 0;
 442         uint32_t dstPrecomposedUsed = 0;
 443
 444         (void) cd; /* UNUSED */
 445
 446         if (0 == *inbytesleft) {
 447                 return 0;
 448         }
 449
 450         result = CFStringEncodingBytesToUnicode(
 451                 script_code, kCFStringEncodingComposeCombinings,
 452                 *inbuf, *inbytesleft, &srcCharsUsed,
 453                 (UniChar*)*outbuf, *outbytesleft, &dstCharsUsed);
 454
 455         switch(result) {
 456         case kCFStringEncodingConversionSuccess:
 457                 if (*inbytesleft == srcCharsUsed)
 458                         break;
 459                 else
 460                         ; /*fall through*/
 461         case kCFStringEncodingInsufficientOutputBufferLength:
 462                 debug_out("String conversion: "
 463                           "Output buffer too small\n");
 464                 hexdump("UTF8->UTF16LE (new) input",
 465                         *inbuf, *inbytesleft);
 466                 errno = E2BIG;
 467                 return -1;
 468         case kCFStringEncodingInvalidInputStream:
 469                 /*
 470                  * HACK: smbd/mangle_hash2.c:is_legal_name() expects
 471                  * errors here.  That function will always pass 2
 472                  * characters.  smbd/open.c:check_for_pipe() cuts a
 473                  * patchname to 10 characters blindly.  Suppress the
 474                  * debug output in those cases.
 475                  */
 476                 if(2 != *inbytesleft && 10 != *inbytesleft) {
 477                         debug_out("String conversion: "
 478                                   "Invalid input sequence\n");
 479                         hexdump("UTF8->UTF16LE (new) input",
 480                                 *inbuf, *inbytesleft);
 481                 }
 482                 errno = EILSEQ;
 483                 return -1;
 484         case kCFStringEncodingConverterUnavailable:
 485                 debug_out("String conversion: "
 486                           "Unknown encoding\n");
 487                 hexdump("UTF8->UTF16LE (new) input",
 488                         *inbuf, *inbytesleft);
 489                 errno = EINVAL;
 490                 return -1;
 491         }
 492
 493         /*
 494          * It doesn't look like CFStringEncodingBytesToUnicode() can
 495          * produce precomposed characters (flags=ComposeCombinings
 496          * doesn't do it), so we need another pass over the data here.
 497          * We can do this in-place, as the string can only get
 498          * shorter.
 499          *
 500          * (Actually in theory there should be an internal
 501          * decomposition and reordering before the actual composition
 502          * step.  But we should be able to rely on that we always get
 503          * fully decomposed strings for input, so this can't create
 504          * problems in reality.)
 505          */
 506         CFUniCharPrecompose(
 507                 (const UTF16Char *)*outbuf, dstCharsUsed, &dstDecomposedUsed,
 508                 (UTF16Char *)*outbuf, dstCharsUsed, &dstPrecomposedUsed);
 509
 510         native_to_le(*outbuf, dstPrecomposedUsed*2);
 511
 512         *inbuf += srcCharsUsed;
 513         *inbytesleft -= srcCharsUsed;
 514         *outbuf += dstPrecomposedUsed*2;
 515         *outbytesleft -= dstPrecomposedUsed*2;
 516
 517         return 0;
 518 }
 519
 520 static size_t macosxfs_encoding_push(
 521         void *cd,                               /* Encoder handle */
 522         char **inbuf, size_t *inbytesleft,      /* UTF-16-LE string */
 523         char **outbuf, size_t *outbytesleft)    /* Script string */
 524 {
 525         static const int script_code = kCFStringEncodingUTF8;
 526         static UniChar *buffer = NULL;
 527         static size_t buflen = 0;
 528         UInt32 srcCharsUsed=0, dstCharsUsed=0, result;
 529
 530         (void) cd; /* UNUSED */
 531
 532         if (0 == *inbytesleft) {
 533                 return 0;
 534         }
 535
 536         buffer = set_ucbuffer_with_le(
 537                 buffer, &buflen, *inbuf, *inbytesleft);
 538
 539         result = CFStringEncodingUnicodeToBytes(
 540                 script_code, kCFStringEncodingUseHFSPlusCanonical,
 541                 buffer, *inbytesleft/2, &srcCharsUsed,
 542                 *outbuf, *outbytesleft, &dstCharsUsed);
 543
 544         switch(result) {
 545         case kCFStringEncodingConversionSuccess:
 546                 if (*inbytesleft/2 == srcCharsUsed)
 547                         break;
 548                 else
 549                         ; /*fall through*/
 550         case kCFStringEncodingInsufficientOutputBufferLength:
 551                 debug_out("String conversion: "
 552                           "Output buffer too small\n");
 553                 hexdump("UTF16LE->UTF8 (new) input",
 554                         *inbuf, *inbytesleft);
 555                 errno = E2BIG;
 556                 return -1;
 557         case kCFStringEncodingInvalidInputStream:
 558                 /*
 559                  * HACK: smbd/open.c:check_for_pipe():is_legal_name()
 560                  * cuts a pathname to 10 characters blindly.  Suppress
 561                  * the debug output in those cases.
 562                  */
 563                 if(10 != *inbytesleft) {
 564                         debug_out("String conversion: "
 565                                   "Invalid input sequence\n");
 566                         hexdump("UTF16LE->UTF8 (new) input",
 567                                 *inbuf, *inbytesleft);
 568                 }
 569                 errno = EILSEQ;
 570                 return -1;
 571         case kCFStringEncodingConverterUnavailable:
 572                 debug_out("String conversion: "
 573                           "Unknown encoding\n");
 574                 hexdump("UTF16LE->UTF8 (new) input",
 575                         *inbuf, *inbytesleft);
 576                 errno = EINVAL;
 577                 return -1;
 578         }
 579
 580         *inbuf += srcCharsUsed*2;
 581         *inbytesleft -= srcCharsUsed*2;
 582         *outbuf += dstCharsUsed;
 583         *outbytesleft -= dstCharsUsed;
 584
 585         return 0;
 586 }
 587
 588 #endif /* USE_INTERNAL_API */
 589
 590 /*
 591  * For initialization, actually install the encoding as "macosxfs".
 592  */
 593 static struct charset_functions macosxfs_encoding_functions = {
 594         "MACOSXFS", macosxfs_encoding_pull, macosxfs_encoding_push
 595 };
 596
 597 NTSTATUS charset_macosxfs_init(void)
 598 {
 599         return smb_register_charset(&macosxfs_encoding_functions);
 600 }
 601
 602 /* eof */