finished adding UTF16_MUNGED charset
authorAndrew Tridgell <tridge@samba.org>
Fri, 31 Oct 2008 04:41:34 +0000 (15:41 +1100)
committerAndrew Tridgell <tridge@samba.org>
Fri, 31 Oct 2008 04:41:34 +0000 (15:41 +1100)
Changed the approach for the charset to go via utf16, which makes a
bit more sense to read.

Added a testsuite for UTF16_MUNGED as part of LOCAL-ICONV

lib/util/charset/charcnv.c
lib/util/charset/charset.h
lib/util/charset/iconv.c
lib/util/charset/tests/iconv.c

index 2ae16c3250a9aa88d760452619ad20411aa5751e..9dd68f05eaf0767b36d5bab04397d8470846dc0e 100644 (file)
@@ -57,6 +57,7 @@ static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
        case CH_DOS: return ic->dos_charset;
        case CH_UTF8: return "UTF8";
        case CH_UTF16BE: return "UTF-16BE";
+       case CH_UTF16MUNGED: return "UTF16_MUNGED";
        default:
        return "ASCII";
        }
index 21fc20b8c3d001601eb95fb246d230eec8fbafc4..cace79f94960f215570b5c53db709f743d0830ef 100644 (file)
@@ -28,9 +28,9 @@
 #include <talloc.h>
 
 /* this defines the charset types used in samba */
-typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE} charset_t;
+typedef enum {CH_UTF16=0, CH_UNIX, CH_DOS, CH_UTF8, CH_UTF16BE, CH_UTF16MUNGED} charset_t;
 
-#define NUM_CHARSETS 5
+#define NUM_CHARSETS 6
 
 /*
  *   for each charset we have a function that pulls from that charset to
index 10b3a6488ba225807102287ad181530a62aa8230..b6842a49aa6e474a236231132a517a2747499481 100644 (file)
@@ -51,7 +51,7 @@ static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
-static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *);
+static size_t utf16_munged_pull(void *,const char **, size_t *, char **, size_t *);
 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
@@ -69,7 +69,7 @@ static const struct charset_functions builtin_functions[] = {
        {"UTF-8",   utf8_pull,  utf8_push},
 
        /* this handles the munging needed for String2Key */
-       {"UTF8_MUNGED",   utf8_pull,  utf8_munged_push},
+       {"UTF16_MUNGED",   utf16_munged_pull,  iconv_copy},
 
        {"ASCII", ascii_pull, ascii_push},
        {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
@@ -713,103 +713,74 @@ error:
 
 
 /*
-  this takes a UTF16 sequence, munges it according to the string2key
-  rules, and produces a UTF8 sequence
+  this takes a UTF16 munged sequence, modifies it according to the
+  string2key rules, and produces a UTF16 sequence
 
 The rules are:
 
-    1) convert any instance of 0xD800 - 0xDBFF (high surrogate)
+    1) any 0x0000 characters are mapped to 0x0001
+
+    2) convert any instance of 0xD800 - 0xDBFF (high surrogate)
        without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
        U+FFFD (REPLACEMENT CHARACTER).
 
-    2) the same for any low surrogate that was not preceded by a high surrogate.
+    3) the same for any low surrogate that was not preceded by a high surrogate.
+
  */
-static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft,
+static size_t utf16_munged_pull(void *cd, const char **inbuf, size_t *inbytesleft,
                               char **outbuf, size_t *outbytesleft)
 {
        size_t in_left=*inbytesleft, out_left=*outbytesleft;
        uint8_t *c = (uint8_t *)*outbuf;
        const uint8_t *uc = (const uint8_t *)*inbuf;
 
-       while (in_left >= 2 && out_left >= 1) {
-               unsigned int codepoint;
+       while (in_left >= 2 && out_left >= 2) {
+               unsigned int codepoint = uc[0] | (uc[1]<<8);
 
-               if (uc[1] == 0 && !(uc[0] & 0x80)) {
-                       /* simplest case */
-                       c[0] = uc[0];
-                       in_left  -= 2;
-                       out_left -= 1;
-                       uc += 2;
-                       c  += 1;
-                       continue;
+               if (codepoint == 0) {
+                       codepoint = 1;
                }
 
-               if ((uc[1]&0xf8) == 0) {
-                       /* next simplest case */
-                       if (out_left < 2) {
+               if ((codepoint & 0xfc00) == 0xd800) {
+                       /* a high surrogate */
+                       unsigned int codepoint2;
+                       if (in_left < 4) {
+                               codepoint = 0xfffd;
+                               goto codepoint16;                               
+                       }
+                       codepoint2 = uc[2] | (uc[3]<<8);
+                       if ((codepoint2 & 0xfc00) != 0xdc00) {
+                               /* high surrogate not followed by low
+                                  surrogate: convert to 0xfffd */
+                               codepoint = 0xfffd;
+                               goto codepoint16;
+                       }
+                       if (out_left < 4) {
                                errno = E2BIG;
                                goto error;
                        }
-                       c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
-                       c[1] = 0x80 | (uc[0] & 0x3f);
-                       in_left  -= 2;
-                       out_left -= 2;
-                       uc += 2;
-                       c  += 2;
+                       memcpy(c, uc, 4);
+                       in_left  -= 4;
+                       out_left -= 4;
+                       uc       += 4;
+                       c        += 4;
                        continue;
                }
 
-               if ((uc[1] & 0xfc) == 0xdc) {
-                       /* low surrogate not preceded by high surrogate
-                          convert to 0xfffd */
-                       codepoint = 0xfffd;
-                       goto codepoint16;
-               }
-
-               if ((uc[1] & 0xfc) != 0xd8) {
-                       codepoint = uc[0] | (uc[1]<<8);
-                       goto codepoint16;
-               }
-
-               /* its the first part of a 4 byte sequence */
-               if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) {
-                       /* high surrogate not followed by low surrogate 
-                          convert to 0xfffd */
+               if ((codepoint & 0xfc00) == 0xdc00) {
+                       /* low surrogate not preceded by high
+                          surrogate: convert to 0xfffd */
                        codepoint = 0xfffd;
-                       goto codepoint16;
-               }
-
-               codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | 
-                                      (uc[0]<<10) | ((uc[1] & 0x3)<<18));
-               
-               if (out_left < 4) {
-                       errno = E2BIG;
-                       goto error;
                }
-               c[0] = 0xf0 | (codepoint >> 18);
-               c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
-               c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
-               c[3] = 0x80 | (codepoint & 0x3f);
-               
-               in_left  -= 4;
-               out_left -= 4;
-               uc       += 4;
-               c        += 4;
-               continue;
 
        codepoint16:
-               if (out_left < 3) {
-                       errno = E2BIG;
-                       goto error;
-               }
-               c[0] = 0xe0 | (codepoint >> 12);
-               c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
-               c[2] = 0x80 | (codepoint & 0x3f);
+               c[0] = codepoint & 0xFF;
+               c[1] = (codepoint>>8) & 0xFF;
                
                in_left  -= 2;
-               out_left -= 3;
+               out_left -= 2;
                uc  += 2;
-               c   += 3;
+               c   += 2;
                continue;               
        }
 
index 40e223b28f5f61f22bb8d9cecb055d0e8389a8b8..1facea61368dd2b05066537124efa963b2a2c5dd 100644 (file)
@@ -398,10 +398,65 @@ static bool test_random_5m(struct torture_context *tctx)
        return true;
 }
 
+
+static bool test_string2key(struct torture_context *tctx)
+{
+       uint16_t *buf;
+       char *dest = NULL;
+       TALLOC_CTX *mem_ctx = talloc_new(tctx);
+       ssize_t ret;
+       size_t len = (random()%1000)+1;
+       const uint16_t in1[10] = { 'a', 0xd805, 'b', 0xdcf0, 'c', 0, 'd', 'e', 'f', 'g' };
+       uint8_t le1[20];
+       uint8_t *munged1;
+       uint8_t *out1;
+       int i;
+       const char *correct = "a\357\277\275b\357\277\275c\001defg";
+
+       buf = talloc_size(mem_ctx, len*2);
+       generate_random_buffer((uint8_t *)buf, len*2);
+
+       torture_comment(tctx, "converting random buffer\n");
+
+       ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)buf, len*2, (void**)&dest);
+       if (ret == -1) {
+               torture_fail(tctx, "Failed to convert random buffer\n");
+       }
+
+       for (i=0;i<10;i++) {
+               SSVAL(&le1[2*i], 0, in1[i]);
+       }
+
+       torture_comment(tctx, "converting fixed buffer to UTF16\n");
+
+       ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF16, (void *)le1, 20, (void**)&munged1);
+       if (ret == -1) {
+               torture_fail(tctx, "Failed to convert fixed buffer to UTF16_MUNGED\n");
+       }
+
+       torture_assert(tctx, ret == 20, "conversion should give 20 bytes\n");
+
+       torture_comment(tctx, "converting fixed buffer to UTF8\n");
+
+       ret = convert_string_talloc(mem_ctx, CH_UTF16MUNGED, CH_UTF8, (void *)le1, 20, (void**)&out1);
+       if (ret == -1) {
+               torture_fail(tctx, "Failed to convert fixed buffer to UTF8\n");
+       }
+
+       torture_assert(tctx, strcmp(correct, out1) == 0, "conversion gave incorrect result\n");
+
+       talloc_free(mem_ctx);
+
+       return true;
+}
+
 struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx)
 {
        struct torture_suite *suite = torture_suite_create(mem_ctx, "ICONV");
 
+       torture_suite_add_simple_test(suite, "string2key",
+                                     test_string2key);
+
        torture_suite_add_simple_test(suite, "next_codepoint()",
                                      test_next_codepoint);
 
@@ -410,6 +465,9 @@ struct torture_suite *torture_local_iconv(TALLOC_CTX *mem_ctx)
 
        torture_suite_add_simple_test(suite, "5M random UTF-16LE sequences",
                                      test_random_5m);
+
+       torture_suite_add_simple_test(suite, "string2key",
+                                     test_string2key);
        return suite;
 }