lib: Simplify load_case_tables_library()

[obnox/samba/samba-obnox.git] / lib / util / charset / codepoints.c
diff --git a/lib/util/charset/codepoints.c b/lib/util/charset/codepoints.c

index bc39f1b1a4790f6f347c667531450f4ac9a8bd8d..499cea42f0969efd2bbad46de766941d1ce264c1 100644 (file)
--- a/lib/util/charset/codepoints.c
+++ b/lib/util/charset/codepoints.c
@@ -1,8 +1,10 @@
  /* 
     Unix SMB/CIFS implementation.
-   Samba utility functions
-   Copyright (C) Andrew Tridgell 1992-2001
+   Character set conversion Extensions
+   Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
+   Copyright (C) Andrew Tridgell 2001
     Copyright (C) Simo Sorce 2001
+   Copyright (C) Jelmer Vernooij 2007
     
     This program is free software; you can redistribute it and/or modify
     it under the terms of the GNU General Public License as published by
@@ -16,12 +18,17 @@
     
     You should have received a copy of the GNU General Public License
     along with this program.  If not, see <http://www.gnu.org/licenses/>.
-*/
  
+*/
  #include "includes.h"
+#include "lib/util/charset/charset.h"
  #include "system/locale.h"
  #include "dynconfig/dynconfig.h"
  
+#ifdef strcasecmp
+#undef strcasecmp
+#endif
+
  /**
   * @file
   * @brief Unicode string manipulation
@@ -35,26 +42,49 @@ static void *lowcase_table;
  
  /*******************************************************************
  load the case handling tables
+
+This is the function that should be called from library code.
  ********************************************************************/
-void load_case_tables(void)
+void load_case_tables_library(void)
  {
-       TALLOC_CTX *mem_ctx;
+       const char *codepagedir = get_dyn_CODEPAGEDIR();
+       size_t codepagedir_len = strlen(codepagedir);
+       char buf[codepagedir_len+13];
+
+       snprintf(buf, sizeof(buf), "%s/upcase.dat", codepagedir);
+       upcase_table = map_file(buf, 0x20000);
+
+       snprintf(buf, sizeof(buf), "%s/lowcase.dat", codepagedir);
+       lowcase_table = map_file(buf, 0x20000);
  
-       mem_ctx = talloc_init("load_case_tables");
-       if (!mem_ctx) {
-               smb_panic("No memory for case_tables");
-       }
-       upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", dyn_CODEPAGEDIR), 0x20000);
-       lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", dyn_CODEPAGEDIR), 0x20000);
-       talloc_free(mem_ctx);
         if (upcase_table == NULL) {
+               DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
                 upcase_table = (void *)-1;
         }
         if (lowcase_table == NULL) {
+               DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
                 lowcase_table = (void *)-1;
         }
  }
  
+/*******************************************************************
+load the case handling tables
+
+This MUST only be called from main() in application code, never from a
+library.  We don't know if the calling program has already done
+setlocale() to another value, and can't tell if they have.
+********************************************************************/
+void load_case_tables(void)
+{
+       /* This is a useful global hook where we can ensure that the
+        * locale is set from the environment.  This is needed so that
+        * we can use LOCALE as a codepage */
+#ifdef HAVE_SETLOCALE
+       setlocale(LC_ALL, "");
+#endif
+       load_case_tables_library();
+}
+
  /**
   Convert a codepoint_t to upper case.
  **/
@@ -64,7 +94,7 @@ _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
                 return toupper(val);
         }
         if (upcase_table == NULL) {
-               load_case_tables();
+               load_case_tables_library();
         }
         if (upcase_table == (void *)-1) {
                 return val;
@@ -84,7 +114,7 @@ _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
                 return tolower(val);
         }
         if (lowcase_table == NULL) {
-               load_case_tables();
+               load_case_tables_library();
         }
         if (lowcase_table == (void *)-1) {
                 return val;
@@ -95,6 +125,22 @@ _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
         return SVAL(lowcase_table, val*2);
  }
  
+/**
+ If we upper cased this character, would we get the same character?
+**/
+_PUBLIC_ bool islower_m(codepoint_t val)
+{
+       return (toupper_m(val) != val);
+}
+
+/**
+ If we lower cased this character, would we get the same character?
+**/
+_PUBLIC_ bool isupper_m(codepoint_t val)
+{
+       return (tolower_m(val) != val);
+}
+
  /**
    compare two codepoints case insensitively
  */
@@ -108,3 +154,325 @@ _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
  }
  
  
+struct smb_iconv_handle {
+       TALLOC_CTX *child_ctx;
+       const char *unix_charset;
+       const char *dos_charset;
+       const char *display_charset;
+       bool use_builtin_handlers;
+       smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
+};
+
+struct smb_iconv_handle *global_iconv_handle = NULL;
+
+struct smb_iconv_handle *get_iconv_handle(void)
+{
+       if (global_iconv_handle == NULL)
+               global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
+                                                             "ASCII", "UTF-8", true, NULL);
+       return global_iconv_handle;
+}
+
+struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx, 
+                                                 const char *dos_charset, 
+                                                 const char *unix_charset,
+                                                 bool use_builtin_handlers)
+{
+       return smb_iconv_handle_reinit(mem_ctx,
+                                      dos_charset, unix_charset, use_builtin_handlers, NULL);
+}
+
+/**
+ * Return the name of a charset to give to iconv().
+ **/
+const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
+{
+       switch (ch) {
+       case CH_UTF16: return "UTF-16LE";
+       case CH_UNIX: return ic->unix_charset;
+       case CH_DOS: return ic->dos_charset;
+       case CH_UTF8: return "UTF8";
+       case CH_UTF16BE: return "UTF-16BE";
+       case CH_UTF16MUNGED: return "UTF16_MUNGED";
+       default:
+       return "ASCII";
+       }
+}
+
+/**
+ re-initialize iconv conversion descriptors
+**/
+static int close_iconv_handle(struct smb_iconv_handle *data)
+{
+       unsigned c1, c2;
+       for (c1=0;c1<NUM_CHARSETS;c1++) {
+               for (c2=0;c2<NUM_CHARSETS;c2++) {
+                       if (data->conv_handles[c1][c2] != NULL) {
+                               if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
+                                       smb_iconv_close(data->conv_handles[c1][c2]);
+                               }
+                               data->conv_handles[c1][c2] = NULL;
+                       }
+               }
+       }
+
+       return 0;
+}
+
+/*
+  the old_ic is passed in here as the smb_iconv_handle structure
+  is used as a global pointer in some places (eg. python modules). We
+  don't want to invalidate those global pointers, but we do want to
+  update them with the right charset information when loadparm
+  runs. To do that we need to re-use the structure pointer, but
+  re-fill the elements in the structure with the updated values
+ */
+_PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
+                                                                   const char *dos_charset,
+                                                                   const char *unix_charset,
+                                                                   bool use_builtin_handlers,
+                                                                   struct smb_iconv_handle *old_ic)
+{
+       struct smb_iconv_handle *ret;
+
+       if (old_ic != NULL) {
+               ret = old_ic;
+               close_iconv_handle(ret);
+               talloc_free(ret->child_ctx);
+               ZERO_STRUCTP(ret);
+       } else {
+               ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
+       }
+       if (ret == NULL) {
+               return NULL;
+       }
+
+       /* we use a child context to allow us to free all ptrs without
+          freeing the structure itself */
+       ret->child_ctx = talloc_new(ret);
+       if (ret->child_ctx == NULL) {
+               return NULL;
+       }
+
+       talloc_set_destructor(ret, close_iconv_handle);
+
+       if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
+               DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
+               dos_charset = "CP850";
+       }
+
+       ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
+       ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
+       ret->use_builtin_handlers = use_builtin_handlers;
+
+       return ret;
+}
+
+/*
+  on-demand initialisation of conversion handles
+*/
+smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
+                           charset_t from, charset_t to)
+{
+       const char *n1, *n2;
+
+       if (ic->conv_handles[from][to]) {
+               return ic->conv_handles[from][to];
+       }
+
+       n1 = charset_name(ic, from);
+       n2 = charset_name(ic, to);
+
+       ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
+                                                      ic->use_builtin_handlers);
+
+       if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
+               if ((from == CH_DOS || to == CH_DOS) &&
+                   strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
+                       DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
+                                charset_name(ic, CH_DOS)));
+                       ic->dos_charset = "ASCII";
+
+                       n1 = charset_name(ic, from);
+                       n2 = charset_name(ic, to);
+
+                       ic->conv_handles[from][to] =
+                               smb_iconv_open_ex(ic, n2, n1, ic->use_builtin_handlers);
+               }
+       }
+
+       return ic->conv_handles[from][to];
+}
+
+/**
+ * Return the unicode codepoint for the next character in the input
+ * string in the given src_charset.
+ * The unicode codepoint (codepoint_t) is an unsinged 32 bit value.
+ *
+ * Also return the number of bytes consumed (which tells the caller
+ * how many bytes to skip to get to the next src_charset-character).
+ *
+ * This is implemented (in the non-ascii-case) by first converting the
+ * next character in the input string to UTF16_LE and then calculating
+ * the unicode codepoint from that.
+ *
+ * Return INVALID_CODEPOINT if the next character cannot be converted.
+ */
+_PUBLIC_ codepoint_t next_codepoint_handle_ext(
+                       struct smb_iconv_handle *ic,
+                       const char *str, charset_t src_charset,
+                       size_t *bytes_consumed)
+{
+       /* it cannot occupy more than 4 bytes in UTF16 format */
+       uint8_t buf[4];
+       smb_iconv_t descriptor;
+       size_t ilen_orig;
+       size_t ilen;
+       size_t olen;
+       char *outbuf;
+
+       if ((str[0] & 0x80) == 0) {
+               *bytes_consumed = 1;
+               return (codepoint_t)str[0];
+       }
+
+       /*
+        * we assume that no multi-byte character can take more than 5 bytes.
+        * This is OK as we only support codepoints up to 1M (U+100000)
+        */
+       ilen_orig = strnlen(str, 5);
+       ilen = ilen_orig;
+
+       descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
+       if (descriptor == (smb_iconv_t)-1) {
+               *bytes_consumed = 1;
+               return INVALID_CODEPOINT;
+       }
+
+       /*
+        * this looks a little strange, but it is needed to cope with
+        * codepoints above 64k (U+1000) which are encoded as per RFC2781.
+        */
+       olen = 2;
+       outbuf = (char *)buf;
+       smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
+       if (olen == 2) {
+               olen = 4;
+               outbuf = (char *)buf;
+               smb_iconv(descriptor,  &str, &ilen, &outbuf, &olen);
+               if (olen == 4) {
+                       /* we didn't convert any bytes */
+                       *bytes_consumed = 1;
+                       return INVALID_CODEPOINT;
+               }
+               olen = 4 - olen;
+       } else {
+               olen = 2 - olen;
+       }
+
+       *bytes_consumed = ilen_orig - ilen;
+
+       if (olen == 2) {
+               return (codepoint_t)SVAL(buf, 0);
+       }
+       if (olen == 4) {
+               /* decode a 4 byte UTF16 character manually */
+               return (codepoint_t)0x10000 +
+                       (buf[2] | ((buf[3] & 0x3)<<8) |
+                        (buf[0]<<10) | ((buf[1] & 0x3)<<18));
+       }
+
+       /* no other length is valid */
+       return INVALID_CODEPOINT;
+}
+
+/*
+  return the unicode codepoint for the next multi-byte CH_UNIX character
+  in the string
+
+  also return the number of bytes consumed (which tells the caller
+  how many bytes to skip to get to the next CH_UNIX character)
+
+  return INVALID_CODEPOINT if the next character cannot be converted
+*/
+_PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
+                                   const char *str, size_t *size)
+{
+       return next_codepoint_handle_ext(ic, str, CH_UNIX, size);
+}
+
+/*
+  push a single codepoint into a CH_UNIX string the target string must
+  be able to hold the full character, which is guaranteed if it is at
+  least 5 bytes in size. The caller may pass less than 5 bytes if they
+  are sure the character will fit (for example, you can assume that
+  uppercase/lowercase of a character will not add more than 1 byte)
+
+  return the number of bytes occupied by the CH_UNIX character, or
+  -1 on failure
+*/
+_PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
+                               char *str, codepoint_t c)
+{
+       smb_iconv_t descriptor;
+       uint8_t buf[4];
+       size_t ilen, olen;
+       const char *inbuf;
+
+       if (c < 128) {
+               *str = c;
+               return 1;
+       }
+
+       descriptor = get_conv_handle(ic,
+                                    CH_UTF16, CH_UNIX);
+       if (descriptor == (smb_iconv_t)-1) {
+               return -1;
+       }
+
+       if (c < 0x10000) {
+               ilen = 2;
+               olen = 5;
+               inbuf = (char *)buf;
+               SSVAL(buf, 0, c);
+               smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
+               if (ilen != 0) {
+                       return -1;
+               }
+               return 5 - olen;
+       }
+
+       c -= 0x10000;
+
+       buf[0] = (c>>10) & 0xFF;
+       buf[1] = (c>>18) | 0xd8;
+       buf[2] = c & 0xFF;
+       buf[3] = ((c>>8) & 0x3) | 0xdc;
+
+       ilen = 4;
+       olen = 5;
+       inbuf = (char *)buf;
+
+       smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
+       if (ilen != 0) {
+               return -1;
+       }
+       return 5 - olen;
+}
+
+_PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
+                                       size_t *size)
+{
+       return next_codepoint_handle_ext(get_iconv_handle(), str,
+                                             src_charset, size);
+}
+
+_PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
+{
+       return next_codepoint_handle(get_iconv_handle(), str, size);
+}
+
+_PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
+{
+       return push_codepoint_handle(get_iconv_handle(), str, c);
+}