2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "system/iconv.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
/*
  Conversion context: the two configurable charset names plus a cache of
  conversion handles.  conv_handles is indexed [from][to] by charset_t and
  is filled in lazily by get_conv_handle(); a zero entry means the pair has
  not been opened yet.
  NOTE(review): smb_iconv_convenience_init() and get_conv_handle() also use
  a native_iconv member whose declaration is not visible in this excerpt.
*/
41 struct smb_iconv_convenience {
/* charset name that charset_name() returns for CH_UNIX */
42 const char *unix_charset;
/* charset name returned for CH_DOS; get_conv_handle() may replace it with "ASCII" */
43 const char *dos_charset;
/* cached handles, one slot per (from, to) pair */
45 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
50 * Return the name of a charset to give to iconv().
/*
  Map a charset_t to the charset name string used when opening a
  conversion.  CH_UNIX and CH_DOS resolve to the names stored in ic;
  the UTF-16LE, UTF8 and UTF-16BE names are fixed strings.
  NOTE(review): the value returned for any other charset_t is not
  visible in this excerpt.
*/
52 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
55 case CH_UTF16: return "UTF-16LE";
56 case CH_UNIX: return ic->unix_charset;
57 case CH_DOS: return ic->dos_charset;
58 case CH_UTF8: return "UTF8";
59 case CH_UTF16BE: return "UTF-16BE";
66 close every open iconv conversion descriptor and reset its slot (talloc destructor)
/*
  Walk the whole conv_handles table of data and release what is open.
  smb_iconv_convenience_init() registers this function as the talloc
  destructor of the context.
*/
68 static int close_iconv_convenience(struct smb_iconv_convenience *data)
71 for (c1=0;c1<NUM_CHARSETS;c1++) {
72 for (c2=0;c2<NUM_CHARSETS;c2++) {
/* NULL slots were never opened and are left alone */
73 if (data->conv_handles[c1][c2] != NULL) {
/* (smb_iconv_t)-1 marks a failed open (see get_conv_handle); there is nothing to close */
74 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
75 smb_iconv_close(data->conv_handles[c1][c2]);
/* reset the slot so it reads as "not opened" afterwards */
77 data->conv_handles[c1][c2] = NULL;
/*
  Create a conversion context as a zero-initialised talloc child of
  mem_ctx, so every conv_handles slot starts out empty.

  dos_charset and unix_charset are the charset names later returned by
  charset_name() for CH_DOS and CH_UNIX.
  NOTE(review): the declaration of the native_iconv parameter and any
  check of the allocation result are not visible in this excerpt.
*/
85 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
86 const char *dos_charset,
87 const char *unix_charset,
90 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
91 struct smb_iconv_convenience);
/* open handles get closed when the context is freed */
97 talloc_set_destructor(ret, close_iconv_convenience);
/* the names are copied as talloc children of ret; NOTE(review): the copies are not checked for NULL in the visible lines */
99 ret->dos_charset = talloc_strdup(ret, dos_charset);
100 ret->unix_charset = talloc_strdup(ret, unix_charset);
101 ret->native_iconv = native_iconv;
107 on-demand initialisation of conversion handles
/*
  Return the conversion handle for the (from, to) pair, opening it on
  first use and caching it in ic->conv_handles[from][to].

  The handle is opened with smb_iconv_open_ex(); note the argument order:
  the destination charset name (n2) comes before the source name (n1).

  If the open fails, a DOS charset is involved and the DOS charset is not
  already "ASCII" (compared case-insensitively), the failure is logged,
  ic->dos_charset is switched to "ASCII" and the open is retried once.

  The returned value is whatever ended up in the cache slot, so it can
  still be (smb_iconv_t)-1; callers are expected to check for that.

  The static flag below gates a first-call setlocale(LC_ALL, "C").
*/
109 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
110 charset_t from, charset_t to)
113 static bool initialised;
115 if (initialised == false) {
119 /* we set back the locale to C to get ASCII-compatible
120 toupper/lower functions. For now we do not need
121 any other POSIX localisations anyway. When we
122 should really need localized string functions one
123 day we need to write our own ascii_tolower etc.
125 setlocale(LC_ALL, "C");
129 if (ic->conv_handles[from][to]) {
130 return ic->conv_handles[from][to];
133 n1 = charset_name(ic, from);
134 n2 = charset_name(ic, to);
136 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
139 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
140 if ((from == CH_DOS || to == CH_DOS) &&
141 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
142 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
143 charset_name(ic, CH_DOS)));
144 ic->dos_charset = "ASCII";
146 n1 = charset_name(ic, from);
147 n2 = charset_name(ic, to);
149 ic->conv_handles[from][to] =
150 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
154 return ic->conv_handles[from][to];
159 * Convert a string from one encoding to another, with error checking.
161 * @param src pointer to source string (multibyte or singlebyte)
162 * @param srclen length of the source string in bytes
163 * @param dest pointer to destination string (multibyte or singlebyte)
164 * @param destlen maximal length allowed for string
165 * @returns the number of bytes occupied in the destination
/*
  Convert srclen bytes at src from charset `from` to charset `to`, writing
  at most destlen bytes into the caller-supplied buffer dest.
  NOTE(review): the errno dispatch and the value returned on conversion
  failure are not visible in this excerpt.
*/
167 _PUBLIC_ ssize_t convert_string_convenience(struct smb_iconv_convenience *ic,
168 charset_t from, charset_t to,
169 void const *src, size_t srclen,
170 void *dest, size_t destlen)
174 const char* inbuf = (const char*)src;
175 char* outbuf = (char*)dest;
176 smb_iconv_t descriptor;
/* srclen of (size_t)-1 means "NUL-terminated": the +1 makes the terminator part of the input */
178 if (srclen == (size_t)-1)
179 srclen = strlen(inbuf)+1;
181 descriptor = get_conv_handle(ic, from, to);
/* no usable handle: copy as many raw bytes as fit, unconverted */
183 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
184 /* conversion not supported, use as is */
185 size_t len = MIN(srclen,destlen);
186 memcpy(dest,src,len);
192 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
193 if(retval==(size_t)-1) {
197 reason="Incomplete multibyte sequence";
200 reason="No more room";
/* only the CH_UNIX variant of the message has a trailing '%s' for extra text */
201 if (from == CH_UNIX) {
202 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
203 charset_name(ic, from), charset_name(ic, to),
204 (int)srclen, (int)destlen,
207 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
208 charset_name(ic, from), charset_name(ic, to),
209 (int)srclen, (int)destlen));
213 reason="Illegal multibyte sequence";
216 /* smb_panic(reason); */
/* bytes written: the capacity minus what smb_iconv left unused in o_len */
218 return destlen-o_len;
/*
  Convert srclen bytes at src through an already-open descriptor into a
  buffer allocated on ctx, handing the buffer back through *dest.

  The output buffer is sized 2 + 3 * destlen and obtained with
  talloc_realloc(); a failed allocation is logged.  Two bytes of the
  buffer are withheld from smb_iconv so that a two-byte zero can be
  stored with SSVAL at the end of the converted data, which terminates
  the result whatever the target charset is.

  NOTE(review): the "Conversion error" message prints inbuf with %s, but
  inbuf is raw source data described by srclen and is not obviously a
  NUL-terminated C string - confirm this is safe for every caller.
  NOTE(review): the initial value of destlen, the errno dispatch and the
  return statements are not visible in this excerpt.
*/
221 _PUBLIC_ ssize_t convert_string_talloc_descriptor(TALLOC_CTX *ctx, smb_iconv_t descriptor, void const *src, size_t srclen, void **dest)
223 size_t i_len, o_len, destlen;
225 const char *inbuf = (const char *)src;
230 /* it is _very_ rare that a conversion increases the size by
235 destlen = 2 + (destlen*3);
236 ob = talloc_realloc(ctx, outbuf, char, destlen);
238 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
245 /* we give iconv 2 less bytes to allow us to terminate at the
249 retval = smb_iconv(descriptor,
252 if(retval == (size_t)-1) {
253 const char *reason="unknown error";
256 reason="Incomplete multibyte sequence";
261 reason="Illegal multibyte sequence";
264 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
269 destlen = (destlen-2) - o_len;
271 /* guarantee null termination in all charsets */
272 SSVAL(ob, destlen, 0);
280 * Convert between character sets, allocating a new buffer using talloc for the result.
282 * @param srclen length of source buffer.
283 * @param dest always set at least to NULL
284 * @note -1 is not accepted for srclen.
286 * @returns Size in bytes of the converted string; or -1 in case of error.
/*
  Front end for convert_string_talloc_descriptor(): validate the input,
  look up (or open) the handle for the from/to pair, then delegate the
  conversion and the allocation on ctx.
  NOTE(review): the declaration of dest and the values returned on the
  early-exit paths are not visible in this excerpt.
*/
289 _PUBLIC_ ssize_t convert_string_talloc_convenience(TALLOC_CTX *ctx,
290 struct smb_iconv_convenience *ic,
291 charset_t from, charset_t to,
292 void const *src, size_t srclen,
295 smb_iconv_t descriptor;
/* unlike convert_string_convenience(), a srclen of (size_t)-1 is not accepted here; empty or NULL input is rejected too */
299 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
302 descriptor = get_conv_handle(ic, from, to);
/* both (smb_iconv_t)-1 and 0 are treated as "no handle available" */
304 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
305 /* conversion not supported, return -1*/
306 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
307 charset_name(ic, from),
308 charset_name(ic, to)));
312 return convert_string_talloc_descriptor(ctx, descriptor, src, srclen, dest);
316 return the unicode codepoint for the next multi-byte CH_UNIX character
319 also return the number of bytes consumed (which tells the caller
320 how many bytes to skip to get to the next CH_UNIX character)
322 return INVALID_CODEPOINT if the next character cannot be converted
/*
  Decode the first character of the CH_UNIX string str into a Unicode
  codepoint by converting it to CH_UTF16, and report through *size how
  many input bytes that character occupied.  Returns INVALID_CODEPOINT
  when no handle is available or nothing could be converted.
  NOTE(review): the output-size setup for the two smb_iconv attempts and
  the tests on the resulting length are not visible in this excerpt.
*/
324 _PUBLIC_ codepoint_t next_codepoint_convenience(struct smb_iconv_convenience *ic,
325 const char *str, size_t *size)
327 /* it cannot occupy more than 4 bytes in UTF16 format */
329 smb_iconv_t descriptor;
/* fast path: a byte with the high bit clear is returned directly as its own codepoint */
335 if ((str[0] & 0x80) == 0) {
337 return (codepoint_t)str[0];
340 /* we assume that no multi-byte character can take
341 more than 5 bytes. This is OK as we only
342 support codepoints up to 1M */
343 ilen_orig = strnlen(str, 5);
346 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
/* NOTE(review): only (smb_iconv_t)-1 is checked here, while the convert_string_* callers also treat a 0 handle as unavailable */
347 if (descriptor == (smb_iconv_t)-1) {
349 return INVALID_CODEPOINT;
352 /* this looks a little strange, but it is needed to cope
353 with codepoints above 64k */
355 outbuf = (char *)buf;
356 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
/* second attempt, restarting at the beginning of buf */
359 outbuf = (char *)buf;
360 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
362 /* we didn't convert any bytes */
364 return INVALID_CODEPOINT;
/* input bytes consumed: the original length minus what smb_iconv left unread */
371 *size = ilen_orig - ilen;
/* a single 16-bit unit is the codepoint itself */
374 return (codepoint_t)SVAL(buf, 0);
/*
  surrogate pair stored low byte first: buf[0] plus the low two bits of
  buf[1] are the upper ten payload bits, buf[2] plus the low two bits of
  buf[3] are the lower ten, and the result is offset by 0x10000
*/
377 /* decode a 4 byte UTF16 character manually */
378 return (codepoint_t)0x10000 +
379 (buf[2] | ((buf[3] & 0x3)<<8) |
380 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
383 /* no other length is valid */
384 return INVALID_CODEPOINT;
388 push a single codepoint into a CH_UNIX string. The target string must
389 be able to hold the full character, which is guaranteed if it is at
390 least 5 bytes in size. The caller may pass less than 5 bytes if they
391 are sure the character will fit (for example, you can assume that
392 uppercase/lowercase of a character will not add more than 1 byte)
394 return the number of bytes occupied by the CH_UNIX character, or
397 _PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic,
398 char *str, codepoint_t c)
400 smb_iconv_t descriptor;
410 descriptor = get_conv_handle(ic,
412 if (descriptor == (smb_iconv_t)-1) {
421 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
430 buf[0] = (c>>10) & 0xFF;
431 buf[1] = (c>>18) | 0xd8;
433 buf[3] = ((c>>8) & 0x3) | 0xdc;
439 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);