2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "lib/util/charset/charset.h"
25 #include "system/locale.h"
26 #include "dynconfig/dynconfig.h"
34 * @brief Unicode string manipulation
37 /* these 2 tables define the unicode case handling. They are loaded
38 at startup either via mmap() or read() from the lib directory */
/* State of each table pointer as used in this file:
 *   NULL        - no load attempted yet (the converters trigger a load)
 *   (void *)-1  - the loader could not map the corresponding .dat file
 *   otherwise   - base address of the mapped table */
39 static void *upcase_table;
40 static void *lowcase_table;
43 /*******************************************************************
44 load the case handling tables
46 This is the function that should be called from library code.
47 ********************************************************************/
/* Maps upcase.dat and lowcase.dat from the directory reported by
 * get_dyn_CODEPAGEDIR() into upcase_table and lowcase_table. A table that
 * cannot be mapped is replaced by the sentinel (void *)-1 and a level 1
 * debug message is emitted. Takes no arguments and returns nothing. */
48 void load_case_tables_library(void)
/* scratch talloc context used for the path strings built below */
52 mem_ctx = talloc_init("load_case_tables");
/* NOTE(review): the guard for this panic is not visible in this extract;
 * presumably it fires only when talloc_init returned NULL - confirm */
54 smb_panic("No memory for case_tables");
/* 0x20000 is passed as the second argument of map_file for both tables;
 * NOTE(review): presumably the expected file size (0x10000 two-byte
 * entries, matching the val*2 lookups in the converters) - confirm */
56 upcase_table = map_file(talloc_asprintf(mem_ctx, "%s/upcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
57 lowcase_table = map_file(talloc_asprintf(mem_ctx, "%s/lowcase.dat", get_dyn_CODEPAGEDIR()), 0x20000);
/* record the failure with a non-NULL sentinel so the converters do not
 * retry the load on every call */
59 if (upcase_table == NULL) {
60 DEBUG(1, ("Failed to load upcase.dat, will use lame ASCII-only case sensitivity rules\n"));
61 upcase_table = (void *)-1;
63 if (lowcase_table == NULL) {
64 DEBUG(1, ("Failed to load lowcase.dat, will use lame ASCII-only case sensitivity rules\n"));
65 lowcase_table = (void *)-1;
69 /*******************************************************************
70 load the case handling tables
72 This MUST only be called from main() in application code, never from a
73 library. We don't know if the calling program has already done
74 setlocale() to another value, and can't tell if they have.
75 ********************************************************************/
/* Application-level entry point: initialises the locale from the
 * environment and then delegates to load_case_tables_library(). */
76 void load_case_tables(void)
78 /* This is a useful global hook where we can ensure that the
79 * locale is set from the environment. This is needed so that
80 * we can use LOCALE as a codepage */
/* an empty locale name selects the locale configured in the environment */
82 setlocale(LC_ALL, "");
84 load_case_tables_library();
88 Convert a codepoint_t to upper case.
/* Upper-case lookup for a single codepoint. Loads the case tables on first
 * use, then returns the 16-bit entry that SVAL reads from upcase_table at
 * byte offset val*2. */
90 _PUBLIC_ codepoint_t toupper_m(codepoint_t val)
/* first call: no load attempted yet */
95 if (upcase_table == NULL) {
96 load_case_tables_library();
/* sentinel stored by the loader when upcase.dat could not be mapped.
 * NOTE(review): the body of this branch is not visible in this extract;
 * the loader log message suggests an ASCII-only fallback - confirm */
98 if (upcase_table == (void *)-1) {
/* val has bits set above the low 16, so it cannot be used with the
 * val*2 index below. NOTE(review): branch body not visible here */
101 if (val & 0xFFFF0000) {
/* each entry occupies two bytes, hence the val*2 byte offset */
104 return SVAL(upcase_table, val*2);
108 Convert a codepoint_t to lower case.
/* Lower-case lookup for a single codepoint. Loads the case tables on first
 * use, then returns the 16-bit entry that SVAL reads from lowcase_table at
 * byte offset val*2. */
110 _PUBLIC_ codepoint_t tolower_m(codepoint_t val)
/* first call: no load attempted yet */
115 if (lowcase_table == NULL) {
116 load_case_tables_library();
/* sentinel stored by the loader when lowcase.dat could not be mapped.
 * NOTE(review): the body of this branch is not visible in this extract;
 * the loader log message suggests an ASCII-only fallback - confirm */
118 if (lowcase_table == (void *)-1) {
/* val has bits set above the low 16, so it cannot be used with the
 * val*2 index below. NOTE(review): branch body not visible here */
121 if (val & 0xFFFF0000) {
/* each entry occupies two bytes, hence the val*2 byte offset */
124 return SVAL(lowcase_table, val*2);
128 If we upper cased this character, would we get the same character?
/* Returns true when toupper_m maps val to a different codepoint, i.e. val
 * has a distinct upper-case form. */
130 _PUBLIC_ bool islower_m(codepoint_t val)
132 return (toupper_m(val) != val);
136 If we lower cased this character, would we get the same character?
/* Returns true when tolower_m maps val to a different codepoint, i.e. val
 * has a distinct lower-case form. */
138 _PUBLIC_ bool isupper_m(codepoint_t val)
140 return (tolower_m(val) != val);
144 compare two codepoints case insensitively
/* Case-insensitive comparison of two codepoints. The visible part of the
 * equality test compares the toupper_m forms of c1 and c2.
 * NOTE(review): the start of the condition and all return statements are
 * missing from this extract, so the exact return values cannot be
 * documented from here. */
146 _PUBLIC_ int codepoint_cmpi(codepoint_t c1, codepoint_t c2)
149 toupper_m(c1) == toupper_m(c2)) {
/* Charset configuration plus a cache of conversion descriptors. */
156 struct smb_iconv_handle {
/* talloc context holding the charset name copies; it is a child of the
 * struct so the copies can be freed while the struct itself survives */
157 TALLOC_CTX *child_ctx;
/* name reported by charset_name for CH_UNIX */
158 const char *unix_charset;
/* name reported by charset_name for CH_DOS; get_conv_handle may replace
 * it with a string literal when the configured charset cannot be opened */
159 const char *dos_charset;
/* not referenced by any code visible in this extract */
160 const char *display_charset;
/* indexed [from][to]; NULL means not opened yet, (smb_iconv_t)-1 means
 * the open attempt failed */
162 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
/* Shared handle used by the convenience wrappers in this file; it stays
 * NULL until get_iconv_handle creates it. */
165 struct smb_iconv_handle *global_iconv_handle = NULL;
/* Returns the shared handle, creating it on the first call. The initial
 * handle is allocated on talloc_autofree_context() with ASCII passed as
 * the dos charset and UTF-8 as the unix charset, and with no old handle
 * to re-use. */
167 struct smb_iconv_handle *get_iconv_handle(void)
169 if (global_iconv_handle == NULL)
170 global_iconv_handle = smb_iconv_handle_reinit(talloc_autofree_context(),
171 "ASCII", "UTF-8", true, NULL);
172 return global_iconv_handle;
/* Builds a separate handle on mem_ctx with caller-chosen dos and unix
 * charsets. NULL is passed as the old handle, so a new structure is
 * allocated and the shared global handle is left untouched. */
175 struct smb_iconv_handle *get_iconv_testing_handle(TALLOC_CTX *mem_ctx,
176 const char *dos_charset,
177 const char *unix_charset)
179 return smb_iconv_handle_reinit(mem_ctx,
180 dos_charset, unix_charset, true, NULL);
184 * Return the name of a charset to give to iconv().
/* Maps a charset_t value to the charset name string used when opening a
 * conversion. CH_UNIX and CH_DOS are read from the handle; the other
 * visible cases return fixed literals.
 * NOTE(review): the switch header and any default case are not visible in
 * this extract, so the result for other values cannot be stated here. */
186 const char *charset_name(struct smb_iconv_handle *ic, charset_t ch)
189 case CH_UTF16: return "UTF-16LE";
/* configurable names, stored on the handle */
190 case CH_UNIX: return ic->unix_charset;
191 case CH_DOS: return ic->dos_charset;
192 case CH_UTF8: return "UTF8";
193 case CH_UTF16BE: return "UTF-16BE";
194 case CH_UTF16MUNGED: return "UTF16_MUNGED";
201 re-initialize iconv conversion descriptors
/* Walks the whole conv_handles matrix, closes every descriptor that was
 * successfully opened and resets the slot to NULL. Called directly from
 * smb_iconv_handle_reinit and also registered there as the talloc
 * destructor of the handle.
 * NOTE(review): the return statement is not visible in this extract. */
203 static int close_iconv_handle(struct smb_iconv_handle *data)
206 for (c1=0;c1<NUM_CHARSETS;c1++) {
207 for (c2=0;c2<NUM_CHARSETS;c2++) {
/* NULL slots were never opened, nothing to do for them */
208 if (data->conv_handles[c1][c2] != NULL) {
/* the failed-open sentinel is skipped rather than passed to
 * smb_iconv_close */
209 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
210 smb_iconv_close(data->conv_handles[c1][c2]);
/* back to the not-opened state so the slot can be opened again later */
212 data->conv_handles[c1][c2] = NULL;
/* Handles the pseudo charset name LOCALE. When nl_langinfo and CODESET are
 * available, the codeset of the current locale is queried and a trial
 * conversion descriptor involving that name and UCS-2LE is opened, then
 * closed again, to check that the name is usable. The log messages
 * indicate that ASCII is used when the codeset cannot be determined or
 * cannot be opened.
 * NOTE(review): none of the return statements are visible in this
 * extract, so the returned strings are inferred from the log messages
 * only - confirm against the full file. */
220 static const char *map_locale(const char *charset)
/* any name other than LOCALE takes this branch; NOTE(review): its body is
 * not visible here */
222 if (strcmp(charset, "LOCALE") != 0) {
225 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
/* ask the C library for the codeset name of the current locale */
230 ln = nl_langinfo(CODESET);
232 DEBUG(1,("Unable to determine charset for LOCALE - using ASCII\n"));
235 /* Check whether the charset name is supported
237 handle = smb_iconv_open(ln, "UCS-2LE");
238 if (handle == (smb_iconv_t) -1) {
239 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
242 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
243 smb_iconv_close(handle);
252 the old_ic is passed in here as the smb_iconv_handle structure
253 is used as a global pointer in some places (eg. python modules). We
254 don't want to invalidate those global pointers, but we do want to
255 update them with the right charset information when loadparm
256 runs. To do that we need to re-use the structure pointer, but
257 re-fill the elements in the structure with the updated values
/* Creates a conversion handle, or refreshes an existing one in place.
 *
 * mem_ctx       talloc parent used when a new structure is allocated
 * dos_charset   name for CH_DOS; UTF8 and UTF-8 (any letter case) are
 *               rejected with a level 0 message and replaced by CP850
 * unix_charset  name for CH_UNIX
 * old_ic        handle to re-use, or NULL
 *
 * Both charset names are copied onto the child context of the handle, and
 * close_iconv_handle is registered as the talloc destructor.
 * NOTE(review): one parameter line, the error paths and the final return
 * are not visible in this extract. */
259 _PUBLIC_ struct smb_iconv_handle *smb_iconv_handle_reinit(TALLOC_CTX *mem_ctx,
260 const char *dos_charset,
261 const char *unix_charset,
263 struct smb_iconv_handle *old_ic)
265 struct smb_iconv_handle *ret;
/* re-use path: drop the cached descriptors and the old name strings while
 * keeping the structure itself. NOTE(review): the assignment of ret in
 * this branch is not visible here; presumably ret is set to old_ic */
267 if (old_ic != NULL) {
269 close_iconv_handle(ret);
270 talloc_free(ret->child_ctx);
/* NOTE(review): presumably reached only when no old handle was supplied;
 * the surrounding branch structure is not visible here */
273 ret = talloc_zero(mem_ctx, struct smb_iconv_handle);
279 /* we use a child context to allow us to free all ptrs without
280 freeing the structure itself */
281 ret->child_ctx = talloc_new(ret);
282 if (ret->child_ctx == NULL) {
/* make sure open descriptors are closed whenever the handle is freed */
286 talloc_set_destructor(ret, close_iconv_handle);
/* both spellings are matched case-insensitively */
288 if (strcasecmp(dos_charset, "UTF8") == 0 || strcasecmp(dos_charset, "UTF-8") == 0) {
289 DEBUG(0,("ERROR: invalid DOS charset: 'dos charset' must not be UTF8, using (default value) CP850 instead\n"));
290 dos_charset = "CP850";
/* private copies, owned by child_ctx so that the next reinit can release
 * them with a single talloc_free */
293 ret->dos_charset = talloc_strdup(ret->child_ctx, dos_charset);
294 ret->unix_charset = talloc_strdup(ret->child_ctx, unix_charset);
295 ret->native_iconv = native_iconv;
301 on-demand initialisation of conversion handles
/* Returns the conversion descriptor for the pair (from, to), opening it on
 * first request and caching it in ic->conv_handles[from][to]. When the
 * open fails, one side of the pair is CH_DOS and the dos charset is not
 * already ASCII, the dos charset of the handle is switched to ASCII and
 * the open is retried once. The result may be (smb_iconv_t)-1. */
303 smb_iconv_t get_conv_handle(struct smb_iconv_handle *ic,
304 charset_t from, charset_t to)
/* any non-NULL slot is returned as is; this includes a cached
 * (smb_iconv_t)-1 from an earlier failed open */
308 if (ic->conv_handles[from][to]) {
309 return ic->conv_handles[from][to];
312 n1 = charset_name(ic, from);
313 n2 = charset_name(ic, to);
/* note the argument order: the name for to (n2) is passed before the
 * name for from (n1) */
315 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
318 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
319 if ((from == CH_DOS || to == CH_DOS) &&
320 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
321 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
322 charset_name(ic, CH_DOS)));
/* the pointer now refers to a string literal; this changes the dos
 * charset for every later lookup on this handle, not just this pair */
323 ic->dos_charset = "ASCII";
/* names are looked up again because CH_DOS now resolves to ASCII */
325 n1 = charset_name(ic, from);
326 n2 = charset_name(ic, to);
328 ic->conv_handles[from][to] =
329 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
333 return ic->conv_handles[from][to];
337 * Return the unicode codepoint for the next character in the input
338 * string in the given src_charset.
339 * The unicode codepoint (codepoint_t) is an unsigned 32 bit value.
341 * Also return the number of bytes consumed (which tells the caller
342 * how many bytes to skip to get to the next src_charset-character).
344 * This is implemented (in the non-ascii-case) by first converting the
345 * next character in the input string to UTF16_LE and then calculating
346 * the unicode codepoint from that.
348 * Return INVALID_CODEPOINT if the next character cannot be converted.
/* Decodes the next character of str, interpreted in src_charset, into a
 * unicode codepoint.
 *
 * ic              handle supplying the conversion descriptor
 * str             input bytes; at most 5 are examined
 * src_charset     charset of str
 * bytes_consumed  receives the number of input bytes that were used
 *
 * A leading byte with the high bit clear is returned directly. Anything
 * else is converted to UTF-16LE through smb_iconv and decoded from the
 * resulting 2 or 4 bytes. Returns INVALID_CODEPOINT when no descriptor is
 * available, nothing could be converted, or the output length is neither
 * 2 nor 4.
 * NOTE(review): local declarations, the length tests between the steps and
 * some assignments to *bytes_consumed are missing from this extract. */
350 _PUBLIC_ codepoint_t next_codepoint_handle_ext(
351 struct smb_iconv_handle *ic,
352 const char *str, charset_t src_charset,
353 size_t *bytes_consumed)
355 /* it cannot occupy more than 4 bytes in UTF16 format */
357 smb_iconv_t descriptor;
/* fast path: a byte with the top bit clear is returned as its own
 * codepoint without touching iconv */
363 if ((str[0] & 0x80) == 0) {
365 return (codepoint_t)str[0];
369 * we assume that no multi-byte character can take more than 5 bytes.
370 * This is OK as we only support codepoints up to 1M (U+100000)
/* strnlen stops at the terminator, so fewer than 5 bytes are examined
 * when the string ends early */
372 ilen_orig = strnlen(str, 5);
375 descriptor = get_conv_handle(ic, src_charset, CH_UTF16);
376 if (descriptor == (smb_iconv_t)-1) {
378 return INVALID_CODEPOINT;
/* NOTE(review): 64k corresponds to U+10000; the U+1000 in the text below
 * looks like a typo */
382 * this looks a little strange, but it is needed to cope with
383 * codepoints above 64k (U+1000) which are encoded as per RFC2781.
/* two conversion attempts into the same buffer. NOTE(review): the olen
 * values and the condition guarding the second attempt are not visible
 * here; presumably a 2 byte limit is tried first, then 4 - confirm */
386 outbuf = (char *)buf;
387 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
390 outbuf = (char *)buf;
391 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
393 /* we didn't convert any bytes */
395 return INVALID_CODEPOINT;
/* ilen is the unread remainder after conversion, so the difference is
 * what this character occupied in the input */
402 *bytes_consumed = ilen_orig - ilen;
/* single 16-bit unit: the value is the codepoint itself */
405 return (codepoint_t)SVAL(buf, 0);
408 /* decode a 4 byte UTF16 character manually */
/* buf[0..1] is the first (high) unit and buf[2..3] the second (low) unit,
 * both little-endian; the low 10 bits of each are combined and offset by
 * 0x10000 */
409 return (codepoint_t)0x10000 +
410 (buf[2] | ((buf[3] & 0x3)<<8) |
411 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
414 /* no other length is valid */
415 return INVALID_CODEPOINT;
419 return the unicode codepoint for the next multi-byte CH_UNIX character
422 also return the number of bytes consumed (which tells the caller
423 how many bytes to skip to get to the next CH_UNIX character)
425 return INVALID_CODEPOINT if the next character cannot be converted
/* CH_UNIX shorthand: forwards to next_codepoint_handle_ext with CH_UNIX
 * as the source charset; size receives the number of bytes consumed. */
427 _PUBLIC_ codepoint_t next_codepoint_handle(struct smb_iconv_handle *ic,
428 const char *str, size_t *size)
430 return next_codepoint_handle_ext(ic, str, CH_UNIX, size);
434 push a single codepoint into a CH_UNIX string. The target string must
435 be able to hold the full character, which is guaranteed if it is at
436 least 5 bytes in size. The caller may pass less than 5 bytes if they
437 are sure the character will fit (for example, you can assume that
438 uppercase/lowercase of a character will not add more than 1 byte)
440 return the number of bytes occupied by the CH_UNIX character, or
/* Encodes codepoint c into the buffer str by feeding a small UTF-16
 * buffer through an smb_iconv descriptor obtained from ic; the converted
 * bytes are written directly into str.
 * NOTE(review): large parts of this function are missing from this
 * extract (the charset arguments of get_conv_handle, range checks, setup
 * of the single-unit case, one buffer byte and every return statement),
 * so return values are not documented here. */
443 _PUBLIC_ ssize_t push_codepoint_handle(struct smb_iconv_handle *ic,
444 char *str, codepoint_t c)
446 smb_iconv_t descriptor;
456 descriptor = get_conv_handle(ic,
458 if (descriptor == (smb_iconv_t)-1) {
/* first conversion call; NOTE(review): the code that fills the input
 * buffer for this path is not visible here */
467 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
/* surrogate pair layout, mirroring the manual decode in
 * next_codepoint_handle_ext: buf[0] carries bits 10-17 of c and buf[1]
 * the bits from 18 upwards, tagged with 0xd8 */
476 buf[0] = (c>>10) & 0xFF;
477 buf[1] = (c>>18) | 0xd8;
/* bits 8-9 of c tagged with 0xdc; NOTE(review): the assignment of buf[2]
 * is not visible in this extract */
479 buf[3] = ((c>>8) & 0x3) | 0xdc;
485 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
/* Same as next_codepoint_handle_ext, but using the shared handle returned
 * by get_iconv_handle().
 * NOTE(review): the rest of the parameter list and of the call is not
 * visible in this extract. */
492 _PUBLIC_ codepoint_t next_codepoint_ext(const char *str, charset_t src_charset,
495 return next_codepoint_handle_ext(get_iconv_handle(), str,
/* Same as next_codepoint_handle, but using the shared handle returned by
 * get_iconv_handle(). */
499 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
501 return next_codepoint_handle(get_iconv_handle(), str, size);
/* Same as push_codepoint_handle, but using the shared handle returned by
 * get_iconv_handle(). */
504 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
506 return push_codepoint_handle(get_iconv_handle(), str, c);