From: Michael Adam Date: Sat, 30 Oct 2010 00:03:02 +0000 (+0200) Subject: lib/util/charset/util_unistr: add strlen_m_ext that takes input and output charset X-Git-Tag: samba-4.0.0alpha14~1361 X-Git-Url: http://git.samba.org/?p=samba.git;a=commitdiff_plain;h=82c8b31ebce2783e439399f662591b03ab5a1960 lib/util/charset/util_unistr: add strlen_m_ext that takes input and output charset The function calculates the number of units (8 or 16-bit, depending on the destination charset), that would be needed to convert the input string which is expected to be in src_charset encoding to the dst_charset (which should be a unicode charset). --- diff --git a/lib/util/charset/charset.h b/lib/util/charset/charset.h index 283212dbff7..8222a0586e0 100644 --- a/lib/util/charset/charset.h +++ b/lib/util/charset/charset.h @@ -120,6 +120,7 @@ struct smb_iconv_convenience; #define strupper(s) strupper_m(s) char *strchr_m(const char *s, char c); +size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset); size_t strlen_m_term(const char *s); size_t strlen_m_term_null(const char *s); size_t strlen_m(const char *s); diff --git a/lib/util/charset/util_unistr.c b/lib/util/charset/util_unistr.c index 79a9ffe3df9..93fc24da15f 100644 --- a/lib/util/charset/util_unistr.c +++ b/lib/util/charset/util_unistr.c @@ -249,11 +249,12 @@ _PUBLIC_ char *alpha_strcpy(char *dest, const char *src, const char *other_safe_ } /** - Count the number of UCS2 characters in a string. Normally this will - be the same as the number of bytes in a string for single byte strings, - but will be different for multibyte. -**/ -_PUBLIC_ size_t strlen_m(const char *s) + * Calculate the number of units (8 or 16-bit, depending on the + * destination charset), that would be needed to convert the input + * string which is expected to be in src_charset encoding to the + * destination charset (which should be a unicode charset).
+ */ +_PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset) { size_t count = 0; struct smb_iconv_convenience *ic = get_iconv_convenience(); @@ -273,18 +274,57 @@ _PUBLIC_ size_t strlen_m(const char *s) while (*s) { size_t c_size; - codepoint_t c = next_codepoint_convenience(ic, s, &c_size); - if (c < 0x10000) { + codepoint_t c = next_codepoint_convenience_ext(ic, s, src_charset, &c_size); + s += c_size; + + switch (dst_charset) { + case CH_UTF16LE: + case CH_UTF16BE: + case CH_UTF16MUNGED: + if (c < 0x10000) { + count += 1; + } else { + count += 2; + } + break; + case CH_UTF8: + /* + * this only checks ranges, and does not + * check for invalid codepoints + */ + if (c < 0x80) { + count += 1; + } else if (c < 0x800) { + count += 2; + } else if (c < 0x1000) { + count += 3; + } else { + count += 4; + } + break; + default: + /* + * non-unicode encoding: + * assume that each codepoint fits into + * one unit in the destination encoding. + */ count += 1; - } else { - count += 2; } - s += c_size; } return count; } +/** + Count the number of UCS2 characters in a string. Normally this will + be the same as the number of bytes in a string for single byte strings, + but will be different for multibyte. +**/ +_PUBLIC_ size_t strlen_m(const char *s) +{ + return strlen_m_ext(s, CH_UNIX, CH_UTF16LE); +} + /** Work out the number of multibyte chars in a string, including the NULL terminator.