lib/util/charset/util_unistr: add strlen_m_ext that takes input and output charset

author Michael Adam <obnox@samba.org>

Sat, 30 Oct 2010 00:03:02 +0000 (02:03 +0200)

committer Michael Adam <obnox@samba.org>

Wed, 3 Nov 2010 22:45:20 +0000 (22:45 +0000)
author Michael Adam <obnox@samba.org>
Sat, 30 Oct 2010 00:03:02 +0000 (02:03 +0200)
committer Michael Adam <obnox@samba.org>
Wed, 3 Nov 2010 22:45:20 +0000 (22:45 +0000)
diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h

index 283212dbff7f86ec0cdbef41998d85ab4ec09a7b..8222a0586e0c90bf7275e435138dd39e5f35444f 100644 (file)
--- a/lib/util/charset/charset.h
+++ b/lib/util/charset/charset.h
@@ -120,6 +120,7 @@ struct smb_iconv_convenience;
  #define strupper(s) strupper_m(s)
  
  char *strchr_m(const char *s, char c);
+size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset);
  size_t strlen_m_term(const char *s);
  size_t strlen_m_term_null(const char *s);
  size_t strlen_m(const char *s);
diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c

index 79a9ffe3df9f1904398e72c24f717906b325c624..93fc24da15fa31f845ca471e22774d8a3c4ec554 100644 (file)
--- a/lib/util/charset/util_unistr.c
+++ b/lib/util/charset/util_unistr.c
@@ -249,11 +249,12 @@ _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_
  }
  
  /**
- Count the number of UCS2 characters in a string. Normally this will
- be the same as the number of bytes in a string for single byte strings,
- but will be different for multibyte.
-**/
-_PUBLIC_ size_t strlen_m(const char *s)
+ * Calculate the number of units (8 or 16-bit, depending on the
+ * destination charset), that would be needed to convert the input
+ * string which is expected to be in src_charset encoding to the
+ * destination charset (which should be a unicode charset).
+ */
+_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
  {
         size_t count = 0;
         struct smb_iconv_convenience *ic = get_iconv_convenience();
@@ -273,18 +274,57 @@ _PUBLIC_ size_t strlen_m(const char *s)
  
         while (*s) {
                 size_t c_size;
-               codepoint_t c = next_codepoint_convenience(ic, s, &c_size);
-               if (c < 0x10000) {
+               codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size);
+               s += c_size;
+
+               switch (dst_charset) {
+               case CH_UTF16LE:
+               case CH_UTF16BE:
+               case CH_UTF16MUNGED:
+                       if (c < 0x10000) {
+                               count += 1;
+                       } else {
+                               count += 2;
+                       }
+                       break;
+               case CH_UTF8:
+                       /*
+                        * UTF-8 length by codepoint range only (3 bytes
+                        * up to 0xFFFF); invalid codepoints not checked
+                        */
+                       if (c < 0x80) {
+                               count += 1;
+                       } else if (c < 0x800) {
+                               count += 2;
+                       } else if (c < 0x10000) {
+                               count += 3;
+                       } else {
+                               count += 4;
+                       }
+                       break;
+               default:
+                       /*
+                        * non-unicode encoding:
+                        * assume that each codepoint fits into
+                        * one unit in the destination encoding.
+                        */
                         count += 1;
-               } else {
-                       count += 2;
                 }
-               s += c_size;
         }
  
         return count;
  }
  
+/**
+ Count the number of UTF-16 code units needed for a CH_UNIX string:
+ one per codepoint below 0x10000, two otherwise. Normally this equals
+ the byte count for single byte strings, but differs for multibyte.
+**/
+_PUBLIC_ size_t strlen_m(const char *s)
+{
+       return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
+}
+
  /**
     Work out the number of multibyte chars in a string, including the NULL
     terminator.
author	Michael Adam <obnox@samba.org>
	Sat, 30 Oct 2010 00:03:02 +0000 (02:03 +0200)
committer	Michael Adam <obnox@samba.org>
	Wed, 3 Nov 2010 22:45:20 +0000 (22:45 +0000)
lib/util/charset/charset.h		patch \| blob \| history
lib/util/charset/util_unistr.c		patch \| blob \| history