2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "system/iconv.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
/* Per-context conversion state used by the helpers in this file.
   NOTE(review): smb_iconv_convenience_init() also assigns a native_iconv
   member; its declaration is not visible in this listing. */
41 struct smb_iconv_convenience {
/* charset name that charset_name() returns for CH_UNIX */
42 const char *unix_charset;
/* charset name that charset_name() returns for CH_DOS */
43 const char *dos_charset;
/* conversion handles indexed [from][to]; zero after talloc_zero() and
   filled in on demand by get_conv_handle() */
45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
50 * Return the name of a charset to give to iconv().
/* Translate a charset_t value into the charset name string that is handed
   to the iconv open call. CH_UNIX and CH_DOS are read from the context ic;
   the other values listed below map to fixed names.
   NOTE(review): the switch header and any default arm are not visible in
   this listing. */
52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
55 case CH_UTF16: return "UTF-16LE";
56 case CH_UNIX: return ic->unix_charset;
57 case CH_DOS: return ic->dos_charset;
58 case CH_UTF8: return "UTF8";
59 case CH_UTF16BE: return "UTF-16BE";
60 case CH_UTF16MUNGED: return "UTF16_MUNGED";
67 close all open iconv conversion descriptors and reset the handle table
/* Walk the whole conv_handles table of data, close every handle that was
   really opened and reset each used slot to NULL. This function is
   registered as the talloc destructor of the context by
   smb_iconv_convenience_init(). */
69 static int close_iconv_convenience(struct smb_iconv_convenience *data)
72 for (c1=0;c1<NUM_CHARSETS;c1++) {
73 for (c2=0;c2<NUM_CHARSETS;c2++) {
/* NULL means the slot was never requested */
74 if (data->conv_handles[c1][c2] != NULL) {
/* get_conv_handle() leaves (smb_iconv_t)-1 in the slot when opening
   failed; such a slot has nothing to close and is only reset */
75 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
76 smb_iconv_close(data->conv_handles[c1][c2]);
78 data->conv_handles[c1][c2] = NULL;
/* Create a conversion context as a talloc child of mem_ctx.
   The structure is zero-initialised, so every conversion handle slot
   starts out empty; handles are opened later on demand.
   dos_charset and unix_charset are duplicated with talloc_strdup() as
   children of the new context; native_iconv is stored as given.
   close_iconv_convenience() is installed as destructor so that opened
   handles are closed when the context is freed.
   NOTE(review): no NULL check of the two talloc_strdup() results is
   visible in this listing - confirm whether one is needed. */
86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
87 const char *dos_charset,
88 const char *unix_charset,
91 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
92 struct smb_iconv_convenience);
98 talloc_set_destructor(ret, close_iconv_convenience);
100 ret->dos_charset = talloc_strdup(ret, dos_charset);
101 ret->unix_charset = talloc_strdup(ret, unix_charset);
102 ret->native_iconv = native_iconv;
108 on-demand initialisation of conversion handles
/* Return the conversion handle for the charset pair from -> to, opening
   it on first use and caching it in ic->conv_handles[from][to].

   - A non-zero cached slot is returned as is. That includes
     (smb_iconv_t)-1 from an earlier failed open, so a failed pair is not
     retried on later calls.
   - The handle is opened with smb_iconv_open_ex(); the name of the target
     charset (n2) is passed before the name of the source charset (n1).
   - If opening fails and either side is CH_DOS while the configured dos
     charset is not already "ASCII" (compared case-insensitively), the
     context is switched to "ASCII" for CH_DOS and the open is retried
     once.
   - On the first call in the process (tracked by a function-static flag)
     the locale is set to "C".

   Callers must check the result against (smb_iconv_t)-1.

   NOTE(review): the fallback stores a string literal in ic->dos_charset,
   replacing the pointer obtained from talloc_strdup() in
   smb_iconv_convenience_init(); the duplicated string then stays
   allocated under ic until ic is freed. */
110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
111 charset_t from, charset_t to)
114 static bool initialised;
116 if (initialised == false) {
120 /* we set back the locale to C to get ASCII-compatible
121 toupper/lower functions. For now we do not need
122 any other POSIX localisations anyway. When we
123 should really need localized string functions one
124 day we need to write our own ascii_tolower etc.
126 setlocale(LC_ALL, "C");
130 if (ic->conv_handles[from][to]) {
131 return ic->conv_handles[from][to];
134 n1 = charset_name(ic, from);
135 n2 = charset_name(ic, to);
137 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
140 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
141 if ((from == CH_DOS || to == CH_DOS) &&
142 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
143 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
144 charset_name(ic, CH_DOS)));
145 ic->dos_charset = "ASCII";
147 n1 = charset_name(ic, from);
148 n2 = charset_name(ic, to);
150 ic->conv_handles[from][to] =
151 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
155 return ic->conv_handles[from][to];
159 * Convert string from one encoding to another, making error checking etc
161 * @param ctx talloc memory context the output buffer is allocated on
162 * @param cd Iconv handle
163 * @param src pointer to source string (multibyte or singlebyte)
164 * @param srclen length of the source string in bytes
165 * @param dest pointer to the location that receives the newly allocated destination string
166 * (there is no destlen parameter: the output buffer is sized internally from srclen)
167 * @returns the number of bytes occupied in the destination
/* Convert srclen bytes at src with the already opened handle cd into a
   buffer allocated with talloc_realloc() on ctx.

   What the visible lines establish:
   - the output buffer size is computed as 2 + 3 * destlen before each
     allocation; the two extra bytes are kept back from smb_iconv() so a
     terminator always fits;
   - a failed allocation is logged at debug level 0;
   - when smb_iconv() returns (size_t)-1 a reason string is picked and the
     failure is logged at debug level 0;
   - on success destlen becomes the number of bytes produced, and a
     16-bit zero is stored at that offset with SSVAL so the result is
     terminated for single-byte and UTF-16 targets alike.

   NOTE(review): the handling of the individual errno values (including
   any retry with a larger buffer) and all return statements are not
   visible in this listing.
   NOTE(review): the conversion error message prints inbuf with a string
   format, but inbuf points into a length-delimited source buffer that is
   not shown to be NUL-terminated - confirm this cannot read past srclen. */
169 _PUBLIC_ ssize_t iconv_talloc(TALLOC_CTX *ctx,
171 void const *src, size_t srclen,
174 size_t i_len, o_len, destlen;
176 const char *inbuf = (const char *)src;
181 /* it is _very_ rare that a conversion increases the size by
186 destlen = 2 + (destlen*3);
187 ob = talloc_realloc(ctx, outbuf, char, destlen);
189 DEBUG(0, ("iconv_talloc: realloc failed!\n"));
196 /* we give iconv 2 less bytes to allow us to terminate at the
200 retval = smb_iconv(cd,
203 if(retval == (size_t)-1) {
204 const char *reason="unknown error";
207 reason="Incomplete multibyte sequence";
212 reason="Illegal multibyte sequence";
215 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
220 destlen = (destlen-2) - o_len;
222 /* guarantee null termination in all charsets */
223 SSVAL(ob, destlen, 0);
232 * Convert string from one encoding to another, making error checking etc
234 * @param src pointer to source string (multibyte or singlebyte)
235 * @param srclen length of the source string in bytes
236 * @param dest pointer to destination string (multibyte or singlebyte)
237 * @param destlen maximal length allowed for string
238 * @returns a bool status; the number of bytes occupied in the destination is stored in *converted_size
/* Convert srclen bytes at src from charset from to charset to, writing at
   most destlen bytes into the caller supplied buffer dest.

   - allow_badcharcnv is not implemented.
   - srclen == (size_t)-1 means src is NUL-terminated; the terminator is
     then included in the converted length.
   - If no conversion handle is available the bytes are copied unchanged,
     truncated to the smaller of srclen and destlen.
   - After a conversion, *converted_size receives the number of bytes
     written to dest (destlen minus the space smb_iconv() left unused).

   NOTE(review): the return statements are not visible in this listing.
   NOTE(review): the copy-as-is path stores to *converted_size without a
   NULL check, while the normal path below guards against
   converted_size == NULL - confirm whether NULL is a legal argument. */
240 _PUBLIC_ bool convert_string_convenience(struct smb_iconv_convenience *ic,
241 charset_t from, charset_t to,
242 void const *src, size_t srclen,
243 void *dest, size_t destlen, size_t *converted_size,
244 bool allow_badcharcnv)
248 const char* inbuf = (const char*)src;
249 char* outbuf = (char*)dest;
250 smb_iconv_t descriptor;
252 if (allow_badcharcnv) {
253 /* Not implemented yet */
/* (size_t)-1 is the "measure it yourself" marker; +1 keeps the NUL */
257 if (srclen == (size_t)-1)
258 srclen = strlen(inbuf)+1;
260 descriptor = get_conv_handle(ic, from, to);
/* both the failed-open value and an empty handle count as unusable */
262 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
263 /* conversion not supported, use as is */
264 size_t len = MIN(srclen,destlen);
265 memcpy(dest,src,len);
266 *converted_size = len;
272 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
273 if(retval==(size_t)-1) {
277 reason="Incomplete multibyte sequence";
280 reason="No more room";
/* the longer message with the quoted text is used only when the source
   is in the unix charset */
281 if (from == CH_UNIX) {
282 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
283 charset_name(ic, from), charset_name(ic, to),
284 (int)srclen, (int)destlen,
287 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
288 charset_name(ic, from), charset_name(ic, to),
289 (int)srclen, (int)destlen));
293 reason="Illegal multibyte sequence";
296 /* smb_panic(reason); */
/* o_len is what smb_iconv() left unused of the destlen bytes offered */
298 if (converted_size != NULL)
299 *converted_size = destlen-o_len;
304 * Convert between character sets, allocating a new buffer using talloc for the result.
306 * @param srclen length of source buffer.
307 * @param dest always set at least to NULL
308 * @note -1 is not accepted for srclen.
310 * @returns a bool status (false on error); the size in bytes of the converted string is stored in *converted_size when that pointer is not NULL.
/* Convert srclen bytes at src from charset from to charset to and hand
   the result back through dest in a buffer allocated on ctx; the actual
   conversion and allocation are done by iconv_talloc().

   - allow_badcharcnv is not implemented and makes the call return false.
   - A NULL src, a srclen of (size_t)-1 and a srclen of 0 are all caught
     by one guard before any conversion is attempted.
   - If no conversion handle is available this is logged at debug
     level 3.
   - When converted_size is not NULL it receives the value returned by
     iconv_talloc().

   NOTE(review): the outcome of the input guard, of the unsupported
   conversion branch and of an iconv_talloc() failure is not visible in
   this listing. */
313 _PUBLIC_ bool convert_string_talloc_convenience(TALLOC_CTX *ctx,
314 struct smb_iconv_convenience *ic,
315 charset_t from, charset_t to,
316 void const *src, size_t srclen,
317 void **dest, size_t *converted_size,
318 bool allow_badcharcnv)
320 smb_iconv_t descriptor;
323 if (allow_badcharcnv)
324 return false; /* Not implemented yet */
/* unlike convert_string_convenience(), (size_t)-1 is not accepted here */
328 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
331 descriptor = get_conv_handle(ic, from, to);
333 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
334 /* conversion not supported, return -1*/
335 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
336 charset_name(ic, from),
337 charset_name(ic, to)));
341 ret = iconv_talloc(ctx, descriptor, src, srclen, dest);
344 if (converted_size != NULL)
345 *converted_size = ret;
350 return the unicode codepoint for the next multi-byte CH_UNIX character
353 also return the number of bytes consumed (which tells the caller
354 how many bytes to skip to get to the next CH_UNIX character)
356 return INVALID_CODEPOINT if the next character cannot be converted
/* Decode the character at the start of the CH_UNIX string str and return
   its Unicode codepoint.

   - A first byte with the high bit clear is returned directly as the
     codepoint, without using iconv.
   - Otherwise at most 5 bytes of str are converted from CH_UNIX to
     CH_UTF16 into a small local buffer. smb_iconv() is called up to
     twice on that buffer; NOTE(review): the output sizes used for the two
     attempts are on lines not visible here - presumably 2 bytes first,
     then 4, to cover surrogate pairs.
   - *size receives the number of input bytes consumed
     (ilen_orig - ilen).
   - A 2-byte result is read as one little-endian 16-bit value with SVAL;
     a 4-byte result is decoded as a surrogate pair.
   - INVALID_CODEPOINT is returned when no conversion handle is
     available, when nothing was converted, or when the result has any
     other length.

   NOTE(review): the handle is only compared with (smb_iconv_t)-1 here,
   while convert_string_convenience() also treats (smb_iconv_t)0 as
   unusable - confirm whether the extra check is needed. */
358 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
359 const char *str, size_t *size)
361 /* it cannot occupy more than 4 bytes in UTF16 format */
363 smb_iconv_t descriptor;
/* high bit clear: the byte is taken as the codepoint itself */
369 if ((str[0] & 0x80) == 0) {
371 return (codepoint_t)str[0];
374 /* we assume that no multi-byte character can take
375 more than 5 bytes. This is OK as we only
376 support codepoints up to 1M */
377 ilen_orig = strnlen(str, 5);
380 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
381 if (descriptor == (smb_iconv_t)-1) {
383 return INVALID_CODEPOINT;
386 /* this looks a little strange, but it is needed to cope
387 with codepoints above 64k */
389 outbuf = (char *)buf;
390 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
393 outbuf = (char *)buf;
394 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
396 /* we didn't convert any bytes */
398 return INVALID_CODEPOINT;
/* bytes consumed = bytes offered minus bytes smb_iconv() left over */
405 *size = ilen_orig - ilen;
408 return (codepoint_t)SVAL(buf, 0);
411 /* decode a 4 byte UTF16 character manually */
/* buf[0..1] hold the first 16-bit unit (low byte first) and supply the
   upper 10 bits; buf[2..3] hold the second unit and supply the lower
   10 bits; masking with 0x3 drops the surrogate marker bits */
412 return (codepoint_t)0x10000 +
413 (buf[2] | ((buf[3] & 0x3)<<8) |
414 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
417 /* no other length is valid */
418 return INVALID_CODEPOINT;
422 push a single codepoint into a CH_UNIX string the target string must
423 be able to hold the full character, which is guaranteed if it is at
424 least 5 bytes in size. The caller may pass less than 5 bytes if they
425 are sure the character will fit (for example, you can assume that
426 uppercase/lowercase of a character will not add more than 1 byte)
428 return the number of bytes occupied by the CH_UNIX character, or
/* Encode the single codepoint c as a CH_UNIX character and write it to
   str.

   The codepoint is first laid out as UTF-16 bytes in a local buffer and
   then converted with smb_iconv() straight into str. Two conversion
   calls are visible; NOTE(review): presumably the first handles
   codepoints that fit in one 16-bit unit and the second handles
   surrogate pairs - the branch between them is not visible in this
   listing.

   NOTE(review): only part of the surrogate pair set-up is visible. The
   visible stores write the 0xd8 and 0xdc marker bits into buf[1] and
   buf[3]; the store to buf[2] and any prior adjustment of c (the decoder
   in next_codepoint_convenience() adds 0x10000) are on lines not shown.
   NOTE(review): the handle is only compared with (smb_iconv_t)-1, while
   convert_string_convenience() also treats (smb_iconv_t)0 as unusable.
   NOTE(review): this definition continues past the end of the listing;
   the return value handling is not visible. */
431 _PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic,
432 char *str, codepoint_t c)
434 smb_iconv_t descriptor;
444 descriptor = get_conv_handle(ic,
446 if (descriptor == (smb_iconv_t)-1) {
455 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
/* first 16-bit unit, low byte first: bits taken from c>>10 plus the
   0xd8 marker in the high byte */
464 buf[0] = (c>>10) & 0xFF;
465 buf[1] = (c>>18) | 0xd8;
/* high byte of the second unit: two bits of c plus the 0xdc marker */
467 buf[3] = ((c>>8) & 0x3) | 0xdc;
473 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);