added a new charset for string2key
author Andrew Tridgell <tridge@samba.org>
Fri, 31 Oct 2008 02:51:37 +0000 (13:51 +1100)
committer Andrew Tridgell <tridge@samba.org>
Fri, 31 Oct 2008 02:51:37 +0000 (13:51 +1100)
This charset follows the rules for converting random buffers to UTF-8
strings, matching the way Windows does it. This should allow us to be
compatible for the generation of AES keys.

lib/util/charset/iconv.c

index a01b6a5787f28e31dc34b5bcbb6397c961dba964..10b3a6488ba225807102287ad181530a62aa8230 100644 (file)
@@ -51,6 +51,7 @@ static size_t ascii_pull  (void *,const char **, size_t *, char **, size_t *);
 static size_t ascii_push  (void *,const char **, size_t *, char **, size_t *);
 static size_t utf8_pull   (void *,const char **, size_t *, char **, size_t *);
 static size_t utf8_push   (void *,const char **, size_t *, char **, size_t *);
+static size_t utf8_munged_push(void *,const char **, size_t *, char **, size_t *);
 static size_t ucs2hex_pull(void *,const char **, size_t *, char **, size_t *);
 static size_t ucs2hex_push(void *,const char **, size_t *, char **, size_t *);
 static size_t iconv_copy  (void *,const char **, size_t *, char **, size_t *);
@@ -66,6 +67,10 @@ static const struct charset_functions builtin_functions[] = {
        /* we include the UTF-8 alias to cope with differing locale settings */
        {"UTF8",   utf8_pull,  utf8_push},
        {"UTF-8",   utf8_pull,  utf8_push},
+
+       /* this handles the munging needed for String2Key */
+       {"UTF8_MUNGED",   utf8_pull,  utf8_munged_push},
+
        {"ASCII", ascii_pull, ascii_push},
        {"UCS2-HEX", ucs2hex_pull, ucs2hex_push}
 };
@@ -707,4 +712,131 @@ error:
 }
 
 
+/*
+  this takes a UTF16 sequence, munges it according to the string2key
+  rules, and produces a UTF8 sequence
+
+The rules are:
+
+    1) convert any instance of 0xD800 - 0xDBFF (high surrogate)
+       without an immediately following 0xDC00 - 0xDFFF (low surrogate) to
+       U+FFFD (REPLACEMENT CHARACTER).
+
+    2) the same for any low surrogate that was not preceded by a high surrogate.
+
+  Input layout: the input is consumed as 16 bit units stored low byte
+  first (uc[0] is the low byte, uc[1] the high byte of each unit).
+
+  Output produced per unit:
+
+    - value below 0x80:                       1 byte
+    - value below 0x800:                      2 bytes
+    - any other non-surrogate value:          3 bytes
+    - unpaired high or low surrogate:         3 bytes (the encoding of 0xfffd)
+    - high surrogate followed by a low one:   4 bytes, consuming both units
+
+  The 3 byte cases all share the code at the codepoint16 label, which
+  consumes exactly one 16 bit unit. An unpaired high surrogate therefore
+  consumes only itself; the unit after it is examined again on the next
+  loop iteration.
+
+  The cd argument is not used.
+
+  Return value: 0 when all of the input was converted. On failure -1
+  (converted to size_t) is returned and errno is set to:
+
+    - E2BIG  when there is not enough room left in the output for the
+             next character
+    - EINVAL when a single trailing byte of input is left over
+
+  In both the success and the failure case *inbuf, *inbytesleft, *outbuf
+  and *outbytesleft are updated to describe what has been consumed and
+  produced so far.
+ */
+static size_t utf8_munged_push(void *cd, const char **inbuf, size_t *inbytesleft,
+                              char **outbuf, size_t *outbytesleft)
+{
+       size_t in_left=*inbytesleft, out_left=*outbytesleft;
+       uint8_t *c = (uint8_t *)*outbuf;
+       const uint8_t *uc = (const uint8_t *)*inbuf;
+
+       while (in_left >= 2 && out_left >= 1) {
+               unsigned int codepoint;
+
+               if (uc[1] == 0 && !(uc[0] & 0x80)) {
+                       /* simplest case: a 7 bit value, copied through
+                          as a single byte */
+                       c[0] = uc[0];
+                       in_left  -= 2;
+                       out_left -= 1;
+                       uc += 2;
+                       c  += 1;
+                       continue;
+               }
+
+               if ((uc[1]&0xf8) == 0) {
+                       /* next simplest case: the top 5 bits of the high
+                          byte are clear, so the value is below 0x800
+                          and fits in a 2 byte sequence */
+                       if (out_left < 2) {
+                               errno = E2BIG;
+                               goto error;
+                       }
+                       c[0] = 0xc0 | (uc[0]>>6) | (uc[1]<<2);
+                       c[1] = 0x80 | (uc[0] & 0x3f);
+                       in_left  -= 2;
+                       out_left -= 2;
+                       uc += 2;
+                       c  += 2;
+                       continue;
+               }
+
+               if ((uc[1] & 0xfc) == 0xdc) {
+                       /* low surrogate not preceded by high surrogate
+                          convert to 0xfffd

+                          (a low surrogate that does follow a high
+                          surrogate never reaches this test, as the
+                          4 byte case below consumes both units) */
+                       codepoint = 0xfffd;
+                       goto codepoint16;
+               }
+
+               if ((uc[1] & 0xfc) != 0xd8) {
+                       codepoint = uc[0] | (uc[1]<<8);
+                       goto codepoint16;
+               }
+
+               /* every non-surrogate and low surrogate value has been
+                  handled above, so this unit is a high surrogate:
+                  its the first part of a 4 byte sequence */
+               if (in_left < 4 || (uc[3] & 0xfc) != 0xdc) {
+                       /* high surrogate not followed by low surrogate
+                          (or not followed by a complete 16 bit unit)
+                          convert to 0xfffd */
+                       codepoint = 0xfffd;
+                       goto codepoint16;
+               }
+
+               codepoint = 0x10000 + (uc[2] | ((uc[3] & 0x3)<<8) | 
+                                      (uc[0]<<10) | ((uc[1] & 0x3)<<18));
+               
+               if (out_left < 4) {
+                       errno = E2BIG;
+                       goto error;
+               }
+               c[0] = 0xf0 | (codepoint >> 18);
+               c[1] = 0x80 | ((codepoint >> 12) & 0x3f);
+               c[2] = 0x80 | ((codepoint >> 6) & 0x3f);
+               c[3] = 0x80 | (codepoint & 0x3f);
+               
+               in_left  -= 4;
+               out_left -= 4;
+               uc       += 4;
+               c        += 4;
+               continue;
+
+       codepoint16:
+               if (out_left < 3) {
+                       errno = E2BIG;
+                       goto error;
+               }
+               c[0] = 0xe0 | (codepoint >> 12);
+               c[1] = 0x80 | ((codepoint >> 6) & 0x3f);
+               c[2] = 0x80 | (codepoint & 0x3f);
+               
+               in_left  -= 2;
+               out_left -= 3;
+               uc  += 2;
+               c   += 3;
+               continue;               
+       }
+
+       if (in_left == 1) {
+               errno = EINVAL;
+               goto error;
+       }
+
+       if (in_left > 1) {
+               errno = E2BIG;
+               goto error;
+       }
+
+       *inbytesleft = in_left;
+       *outbytesleft = out_left;
+       *inbuf  = (const char *)uc;
+       *outbuf = (char *)c;
+       
+       return 0;
+
+error:
+       *inbytesleft = in_left;
+       *outbytesleft = out_left;
+       *inbuf  = (const char *)uc;
+       *outbuf = (char *)c;
+       return -1;
+}
+
+