2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "system/iconv.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
/*
 * Per-context conversion state: the configured Unix and DOS charset names
 * and a NUM_CHARSETS x NUM_CHARSETS table of iconv handles that the rest of
 * this file indexes as conv_handles[from][to].
 *
 * Other code in this file treats a NULL (zero) slot as not yet opened and
 * (smb_iconv_t)-1 as the result of a failed open.
 *
 * NOTE(review): original line 44 is missing from this listing. The init
 * function assigns ret->native_iconv, so that member is presumably declared
 * there -- confirm against the complete file.
 */
41 struct smb_iconv_convenience {
42 const char *unix_charset;
43 const char *dos_charset;
45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
50 * Return the name of a charset to give to iconv().
/*
 * Map a charset_t value to the charset name string that get_conv_handle
 * passes on to smb_iconv_open_ex.
 *
 * CH_UNIX and CH_DOS resolve to the names stored in ic; the remaining
 * visible cases return fixed string literals.
 *
 * NOTE(review): the switch header and whatever follows the last case
 * (a default branch, if any) are not present in this listing.
 */
52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
55 case CH_UTF16: return "UTF-16LE";
56 case CH_UNIX: return ic->unix_charset;
57 case CH_DOS: return ic->dos_charset;
58 case CH_UTF8: return "UTF8";
59 case CH_UTF16BE: return "UTF-16BE";
60 case CH_UTF16MUNGED: return "UTF16_MUNGED";
67 close all open iconv conversion descriptors and reset the table entries to NULL
/*
 * Release every conversion handle held by data.
 *
 * Walks the full conv_handles table. A non-NULL slot is closed with
 * smb_iconv_close unless it holds the (smb_iconv_t)-1 failed-open marker,
 * and the slot is then set back to NULL.
 *
 * smb_iconv_convenience_init registers this function as the talloc
 * destructor of the object.
 *
 * NOTE(review): the local declarations, closing braces and the return
 * statement are missing from this listing, so the returned value cannot be
 * confirmed from here.
 */
69 static int close_iconv_convenience(struct smb_iconv_convenience *data)
72 for (c1=0;c1<NUM_CHARSETS;c1++) {
73 for (c2=0;c2<NUM_CHARSETS;c2++) {
74 if (data->conv_handles[c1][c2] != NULL) {
75 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
76 smb_iconv_close(data->conv_handles[c1][c2]);
78 data->conv_handles[c1][c2] = NULL;
/*
 * Create a conversion context on mem_ctx.
 *
 * The object is allocated zero-filled with talloc_zero, so every
 * conv_handles slot starts out as NULL. close_iconv_convenience is
 * registered as its talloc destructor. Both charset names are copied with
 * talloc_strdup using the new object as the talloc context, and the
 * native_iconv value is stored.
 *
 * NOTE(review): the native_iconv parameter declaration, any check of the
 * talloc_zero result, and the return statement are on lines missing from
 * this listing. The visible lines do not check the talloc_strdup results.
 */
86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
87 const char *dos_charset,
88 const char *unix_charset,
91 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
92 struct smb_iconv_convenience);
98 talloc_set_destructor(ret, close_iconv_convenience);
100 ret->dos_charset = talloc_strdup(ret, dos_charset);
101 ret->unix_charset = talloc_strdup(ret, unix_charset);
102 ret->native_iconv = native_iconv;
108 on-demand initialisation of conversion handles
/*
 * Return the handle for converting charset from into charset to, opening
 * and caching it in ic->conv_handles[from][to] on first use.
 *
 * Visible behaviour:
 *  - A function-local static flag guards a one-time setup step that calls
 *    setlocale(LC_ALL, C-locale).
 *  - A non-zero cached slot is returned as is.
 *  - Otherwise smb_iconv_open_ex is called with the target name (n2) before
 *    the source name (n1).
 *  - If that open yields (smb_iconv_t)-1, either side is CH_DOS, and the
 *    DOS charset name is not already ASCII (case-insensitive compare), a
 *    level-0 message is logged, ic->dos_charset is switched to the ASCII
 *    literal, and the open is retried once.
 *  - The slot value, which may still be (smb_iconv_t)-1, is returned.
 *
 * NOTE(review): (smb_iconv_t)-1 is non-zero, so a failed open is served
 * from the cache on later calls and never retried.
 * NOTE(review): the static flag and setlocale affect the whole process;
 * nothing visible here synchronises them -- confirm callers are
 * single-threaded at first use.
 * NOTE(review): the statement that sets the flag and several braces are on
 * lines missing from this listing.
 */
110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
111 charset_t from, charset_t to)
114 static bool initialised;
116 if (initialised == false) {
120 /* we set back the locale to C to get ASCII-compatible
121 toupper/lower functions. For now we do not need
122 any other POSIX localisations anyway. When we
123 should really need localized string functions one
124 day we need to write our own ascii_tolower etc.
126 setlocale(LC_ALL, "C");
130 if (ic->conv_handles[from][to]) {
131 return ic->conv_handles[from][to];
134 n1 = charset_name(ic, from);
135 n2 = charset_name(ic, to);
137 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
140 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
141 if ((from == CH_DOS || to == CH_DOS) &&
142 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
143 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
144 charset_name(ic, CH_DOS)));
145 ic->dos_charset = "ASCII";
147 n1 = charset_name(ic, from);
148 n2 = charset_name(ic, to);
150 ic->conv_handles[from][to] =
151 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
155 return ic->conv_handles[from][to];
159 * Convert string from one encoding to another, making error checking etc
161 * @param ctx Memory context
162 * @param cd Iconv handle
163 * @param src pointer to source string (multibyte or singlebyte)
164 * @param srclen length of the source string in bytes
165 * @param dest pointer to destination string (multibyte or singlebyte)
166 * @param destlen maximal length allowed for string
167 * @returns the number of bytes occupied in the destination
/*
 * Convert srclen bytes at src through an iconv handle into a buffer
 * allocated on ctx.
 *
 * Visible behaviour:
 *  - The output buffer is (re)sized with talloc_realloc to
 *    destlen = 2 + destlen * 3 bytes; a failed realloc is logged at level 0.
 *  - smb_iconv is deliberately given two bytes fewer than the buffer size,
 *    so a terminator always fits.
 *  - After conversion destlen is recomputed as the number of bytes produced,
 *    (destlen - 2) - o_len, and SSVAL(ob, destlen, 0) writes a 16-bit zero
 *    at that offset, terminating the result in any charset.
 *  - On a conversion error a reason string is chosen and logged at level 0.
 *
 * NOTE(review): the error log prints inbuf with a string format. If the
 * input is not a NUL-terminated single-byte string (UTF-16, for example),
 * the output can be truncated or read past srclen -- confirm.
 * NOTE(review): the handle parameter, the dst parameter, the errno switch,
 * any retry loop and every return statement are on lines missing from this
 * listing.
 */
169 _PUBLIC_ ssize_t iconv_talloc(TALLOC_CTX *ctx,
171 void const *src, size_t srclen,
174 size_t i_len, o_len, destlen;
175 void **dest = (void **)dst;
177 const char *inbuf = (const char *)src;
182 /* it is _very_ rare that a conversion increases the size by
187 destlen = 2 + (destlen*3);
188 ob = talloc_realloc(ctx, outbuf, char, destlen);
190 DEBUG(0, ("iconv_talloc: realloc failed!\n"));
197 /* we give iconv 2 less bytes to allow us to terminate at the
201 retval = smb_iconv(cd,
204 if(retval == (size_t)-1) {
205 const char *reason="unknown error";
208 reason="Incomplete multibyte sequence";
213 reason="Illegal multibyte sequence";
216 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
221 destlen = (destlen-2) - o_len;
223 /* guarantee null termination in all charsets */
224 SSVAL(ob, destlen, 0);
233 * Convert string from one encoding to another, making error checking etc
235 * @param src pointer to source string (multibyte or singlebyte)
236 * @param srclen length of the source string in bytes
237 * @param dest pointer to destination string (multibyte or singlebyte)
238 * @param destlen maximal length allowed for string
239 * @returns a bool status; the number of bytes occupied in the destination is stored in *converted_size
/*
 * Convert srclen bytes at src from charset from to charset to, writing
 * into the caller-supplied buffer dest of destlen bytes.
 *
 * Visible behaviour:
 *  - srclen == (size_t)-1 means src is a NUL-terminated string; the length
 *    becomes strlen + 1, so the terminator is converted too.
 *  - If get_conv_handle yields (smb_iconv_t)-1 or 0, the conversion is
 *    treated as unsupported: MIN(srclen, destlen) bytes are copied unchanged
 *    and that count is stored in *converted_size.
 *  - Otherwise smb_iconv is called once. On failure a reason string is
 *    selected, and the no-more-room case logs an E2BIG message at level 0
 *    (with an extra argument when from == CH_UNIX).
 *  - At the end, *converted_size = destlen - o_len when converted_size is
 *    non-NULL.
 *
 * NOTE(review): the unsupported-conversion path writes *converted_size
 * without the NULL check used at the end of the function, so a NULL
 * converted_size would be dereferenced there -- confirm and align.
 * NOTE(review): the body of the allow_badcharcnv branch, the errno switch
 * and every return statement are on lines missing from this listing.
 */
241 _PUBLIC_ bool convert_string_convenience(struct smb_iconv_convenience *ic,
242 charset_t from, charset_t to,
243 void const *src, size_t srclen,
244 void *dest, size_t destlen, size_t *converted_size,
245 bool allow_badcharcnv)
249 const char* inbuf = (const char*)src;
250 char* outbuf = (char*)dest;
251 smb_iconv_t descriptor;
253 if (allow_badcharcnv) {
254 /* Not implemented yet */
258 if (srclen == (size_t)-1)
259 srclen = strlen(inbuf)+1;
261 descriptor = get_conv_handle(ic, from, to);
263 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
264 /* conversion not supported, use as is */
265 size_t len = MIN(srclen,destlen);
266 memcpy(dest,src,len);
267 *converted_size = len;
273 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
274 if(retval==(size_t)-1) {
278 reason="Incomplete multibyte sequence";
281 reason="No more room";
282 if (from == CH_UNIX) {
283 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
284 charset_name(ic, from), charset_name(ic, to),
285 (int)srclen, (int)destlen,
288 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
289 charset_name(ic, from), charset_name(ic, to),
290 (int)srclen, (int)destlen));
294 reason="Illegal multibyte sequence";
297 /* smb_panic(reason); */
299 if (converted_size != NULL)
300 *converted_size = destlen-o_len;
305 * Convert between character sets, allocating a new buffer using talloc for the result.
307 * @param srclen length of source buffer.
308 * @param dest always set at least to NULL
309 * @note -1 is not accepted for srclen.
311 * @returns a bool status (false on failure); the size in bytes of the converted string is stored in *converted_size.
/*
 * Convert srclen bytes at src from charset from to charset to by
 * delegating to iconv_talloc with ctx as the memory context.
 *
 * Visible behaviour:
 *  - allow_badcharcnv is not implemented; the function returns false.
 *  - A NULL src, srclen == (size_t)-1 and srclen == 0 are all caught by a
 *    guard (the statement it executes is on a missing line).
 *  - If get_conv_handle yields (smb_iconv_t)-1 or 0, a level-3 message
 *    naming both charsets is logged.
 *  - Otherwise the result of iconv_talloc is stored in *converted_size when
 *    converted_size is non-NULL.
 *
 * dst is reinterpreted as a void ** and passed on to iconv_talloc.
 *
 * NOTE(review): the declaration of ret, the check of the iconv_talloc
 * result and the remaining return statements are on lines missing from
 * this listing.
 */
314 _PUBLIC_ bool convert_string_talloc_convenience(TALLOC_CTX *ctx,
315 struct smb_iconv_convenience *ic,
316 charset_t from, charset_t to,
317 void const *src, size_t srclen,
318 void *dst, size_t *converted_size,
319 bool allow_badcharcnv)
321 void **dest = (void **)dst;
322 smb_iconv_t descriptor;
325 if (allow_badcharcnv)
326 return false; /* Not implemented yet */
330 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
333 descriptor = get_conv_handle(ic, from, to);
335 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
336 /* conversion not supported, return -1*/
337 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
338 charset_name(ic, from),
339 charset_name(ic, to)));
343 ret = iconv_talloc(ctx, descriptor, src, srclen, dest);
346 if (converted_size != NULL)
347 *converted_size = ret;
352 return the unicode codepoint for the next multi-byte CH_UNIX character
355 also return the number of bytes consumed (which tells the caller
356 how many bytes to skip to get to the next CH_UNIX character)
358 return INVALID_CODEPOINT if the next character cannot be converted
/*
 * Decode the character at the start of the CH_UNIX string str.
 *
 * Visible behaviour:
 *  - A first byte with the high bit clear is returned directly as the
 *    codepoint.
 *  - Otherwise at most 5 input bytes (strnlen(str, 5)) are offered to the
 *    CH_UNIX to CH_UTF16 handle. smb_iconv is invoked twice in the visible
 *    code, each time with outbuf reset to the start of buf; the output
 *    sizes and the condition between the calls are on missing lines.
 *  - *size is set to the number of input bytes consumed (ilen_orig - ilen).
 *  - A 2-byte result is read with SVAL(buf, 0).
 *  - A 4-byte result is decoded as a UTF-16 surrogate pair: 0x10000 plus
 *    the low ten bits taken from buf[2] and buf[3] & 3, and the high ten
 *    bits taken from buf[0] and buf[1] & 3.
 *  - INVALID_CODEPOINT is returned when the handle is (smb_iconv_t)-1, when
 *    no bytes were converted, or for any other output length.
 *
 * NOTE(review): only (smb_iconv_t)-1 is rejected here, while the
 * convert_string functions in this file also treat a 0 handle as unusable
 * -- confirm whether 0 can reach this point.
 * NOTE(review): the buffer declaration, the *size assignments on the early
 * paths and the branch conditions are on lines missing from this listing.
 */
360 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
361 const char *str, size_t *size)
363 /* it cannot occupy more than 4 bytes in UTF16 format */
365 smb_iconv_t descriptor;
371 if ((str[0] & 0x80) == 0) {
373 return (codepoint_t)str[0];
376 /* we assume that no multi-byte character can take
377 more than 5 bytes. This is OK as we only
378 support codepoints up to 1M */
379 ilen_orig = strnlen(str, 5);
382 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
383 if (descriptor == (smb_iconv_t)-1) {
385 return INVALID_CODEPOINT;
388 /* this looks a little strange, but it is needed to cope
389 with codepoints above 64k */
391 outbuf = (char *)buf;
392 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
395 outbuf = (char *)buf;
396 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
398 /* we didn't convert any bytes */
400 return INVALID_CODEPOINT;
407 *size = ilen_orig - ilen;
410 return (codepoint_t)SVAL(buf, 0);
413 /* decode a 4 byte UTF16 character manually */
414 return (codepoint_t)0x10000 +
415 (buf[2] | ((buf[3] & 0x3)<<8) |
416 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
419 /* no other length is valid */
420 return INVALID_CODEPOINT;
424 push a single codepoint into a CH_UNIX string. The target string must
425 be able to hold the full character, which is guaranteed if it is at
426 least 5 bytes in size. The caller may pass less than 5 bytes if they
427 are sure the character will fit (for example, you can assume that
428 uppercase/lowercase of a character will not add more than 1 byte)
430 return the number of bytes occupied by the CH_UNIX character, or
433 _PUBLIC_ ssize_t push_codepoint_convenience(struct smb_iconv_convenience *ic,
434 char *str, codepoint_t c)
436 smb_iconv_t descriptor;
446 descriptor = get_conv_handle(ic,
448 if (descriptor == (smb_iconv_t)-1) {
457 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
466 buf[0] = (c>>10) & 0xFF;
467 buf[1] = (c>>18) | 0xd8;
469 buf[3] = ((c>>8) & 0x3) | 0xdc;
475 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);