2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "system/iconv.h"
24 #include "param/param.h"
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
42 * Return the name of a charset to give to iconv().
/*
 * Map a charset_t value to the charset name string that is handed to the
 * iconv layer.
 *
 * Visible mappings:
 *   CH_UTF16   -> "UTF-16LE"
 *   CH_UTF8    -> "UTF8"
 *   CH_UTF16BE -> "UTF-16BE"
 *   CH_UNIX, CH_DOS, CH_DISPLAY -> looked up on lp_ctx through
 *   lp_unix_charset(), lp_dos_charset() and lp_display_charset().
 *
 * NOTE(review): the switch header, any default/fallback return and the
 * closing braces are not present in this listing; confirm what is returned
 * for values not listed above against the complete file.
 */
44 static const char *charset_name(struct loadparm_context *lp_ctx, charset_t ch)
47 case CH_UTF16: return "UTF-16LE";
48 case CH_UNIX: return lp_unix_charset(lp_ctx);
49 case CH_DOS: return lp_dos_charset(lp_ctx);
50 case CH_DISPLAY: return lp_display_charset(lp_ctx);
51 case CH_UTF8: return "UTF8";
52 case CH_UTF16BE: return "UTF-16BE";
/*
 * File-local table of conversion handles, indexed as [from][to] by
 * get_conv_handle(). Code in this file treats a NULL slot as "not opened
 * yet" and compares slots against (smb_iconv_t)-1 before closing them.
 */
58 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
61 re-initialize iconv conversion descriptors
/*
 * Walk every [c1][c2] slot of conv_handles. A slot that is non-NULL and not
 * (smb_iconv_t)-1 is closed with smb_iconv_close(); slots are reset to NULL
 * so that a later lookup has to open the conversion again.
 *
 * NOTE(review): the declarations of c1/c2 and the closing braces are not
 * present in this listing, so the exact nesting of the NULL reset relative
 * to the two tests cannot be confirmed from here.
 */
63 _PUBLIC_ void init_iconv(void)
66 for (c1=0;c1<NUM_CHARSETS;c1++) {
67 for (c2=0;c2<NUM_CHARSETS;c2++) {
68 if (conv_handles[c1][c2] != NULL) {
69 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
70 smb_iconv_close(conv_handles[c1][c2]);
72 conv_handles[c1][c2] = NULL;
80 on-demand initialisation of conversion handles
/*
 * Return the conversion handle for the (from, to) charset pair, opening it
 * on first use and caching it in conv_handles[from][to].
 *
 * Behaviour visible in this listing:
 *  - A function-local static flag "initialised" guards a one-time setup
 *    path in which setlocale(LC_ALL, "C") is called.
 *  - A non-zero cached slot is returned as-is.
 *  - Otherwise the charset names are resolved with
 *    charset_name(global_loadparm, ...) and the handle is opened with
 *    smb_iconv_open(n2, n1), i.e. the destination name is passed first.
 *  - If the open yields (smb_iconv_t)-1, either side is CH_DOS, and the
 *    configured dos charset name is not "ASCII" (case-insensitive compare),
 *    a level-0 message is logged, "dos charset" is forced to "ASCII" via
 *    lp_set_cmdline(), the names are resolved again and the open is retried.
 *  - The (possibly still (smb_iconv_t)-1) slot value is returned.
 *
 * NOTE(review): the declarations of n1/n2, the remaining statements of the
 * one-time setup path and several closing braces/comment terminators are not
 * present in this listing; confirm against the complete file.
 */
82 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
85 static int initialised;
86 /* auto-free iconv memory on exit so valgrind reports are easier
88 if (initialised == 0) {
92 /* we set back the locale to C to get ASCII-compatible
93 toupper/lower functions. For now we do not need
94 any other POSIX localisations anyway. When we
95 should really need localized string functions one
96 day we need to write our own ascii_tolower etc.
98 setlocale(LC_ALL, "C");
104 if (conv_handles[from][to]) {
105 return conv_handles[from][to];
108 n1 = charset_name(global_loadparm, from);
109 n2 = charset_name(global_loadparm, to);
111 conv_handles[from][to] = smb_iconv_open(n2,n1);
113 if (conv_handles[from][to] == (smb_iconv_t)-1) {
114 if ((from == CH_DOS || to == CH_DOS) &&
115 strcasecmp(charset_name(global_loadparm, CH_DOS), "ASCII") != 0) {
116 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
117 charset_name(global_loadparm, CH_DOS)));
118 lp_set_cmdline(global_loadparm, "dos charset", "ASCII");
120 n1 = charset_name(global_loadparm, from);
121 n2 = charset_name(global_loadparm, to);
123 conv_handles[from][to] = smb_iconv_open(n2,n1);
127 return conv_handles[from][to];
132 * Convert string from one encoding to another, making error checking etc
134 * @param src pointer to source string (multibyte or singlebyte)
135 * @param srclen length of the source string in bytes
136 * @param dest pointer to destination string (multibyte or singlebyte)
137 * @param destlen maximal length allowed for string
138 * @returns the number of bytes occupied in the destination
/*
 * Convert srclen bytes at src from charset "from" to charset "to", writing
 * at most destlen bytes into the caller-supplied buffer dest.
 *
 * Behaviour visible in this listing:
 *  - srclen == (size_t)-1 means "src is a C string": the length becomes
 *    strlen(src) + 1, so the terminator is part of the input.
 *  - If get_conv_handle() yields (smb_iconv_t)-1 or 0, no conversion is
 *    done and MIN(srclen, destlen) bytes are copied verbatim with memcpy().
 *  - Otherwise smb_iconv() is called once. When it returns (size_t)-1 a
 *    reason string is selected ("Incomplete multibyte sequence",
 *    "No more room", "Illegal multibyte sequence") and, for the
 *    "No more room" case, an "E2BIG" message is logged at level 0. The
 *    smb_panic() call is commented out, so the function carries on.
 *  - The value returned on the conversion path is destlen - o_len.
 *
 * NOTE(review): the local declarations (i_len, o_len, retval, reason), the
 * setup of i_len/o_len, the errno dispatch and the return statement of the
 * memcpy() fallback are not present in this listing; confirm against the
 * complete file before relying on the exact return values.
 */
140 _PUBLIC_ ssize_t convert_string(charset_t from, charset_t to,
141 void const *src, size_t srclen,
142 void *dest, size_t destlen)
146 const char* inbuf = (const char*)src;
147 char* outbuf = (char*)dest;
148 smb_iconv_t descriptor;
150 if (srclen == (size_t)-1)
151 srclen = strlen(inbuf)+1;
153 descriptor = get_conv_handle(from, to);
155 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
156 /* conversion not supported, use as is */
157 size_t len = MIN(srclen,destlen);
158 memcpy(dest,src,len);
164 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
165 if(retval==(size_t)-1) {
169 reason="Incomplete multibyte sequence";
172 reason="No more room";
173 if (from == CH_UNIX) {
174 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
175 charset_name(global_loadparm, from), charset_name(global_loadparm, to),
176 (int)srclen, (int)destlen,
179 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
180 charset_name(global_loadparm, from), charset_name(global_loadparm, to),
181 (int)srclen, (int)destlen));
185 reason="Illegal multibyte sequence";
188 /* smb_panic(reason); */
190 return destlen-o_len;
194 * Convert between character sets, allocating a new buffer using talloc for the result.
196 * @param srclen length of source buffer.
197 * @param dest always set at least to NULL
198 * @note -1 is not accepted for srclen.
200 * @returns Size in bytes of the converted string; or -1 in case of error.
/*
 * Convert srclen bytes at src from charset "from" to charset "to" into a
 * buffer obtained from talloc on ctx.
 *
 * Behaviour visible in this listing:
 *  - src == NULL, srclen == (size_t)-1 and srclen == 0 are all rejected up
 *    front (unlike convert_string(), -1 is not a "use strlen" shortcut).
 *  - If get_conv_handle() yields (smb_iconv_t)-1 or 0 the failure is logged
 *    at level 3.
 *  - The output buffer size is computed as destlen = 2 + destlen*3 and the
 *    buffer is obtained with talloc_realloc(ctx, outbuf, char, destlen); an
 *    allocation failure is logged at level 0.
 *  - smb_iconv() is then called; on (size_t)-1 a reason string is chosen and
 *    "Conversion error: ..." is logged at level 0.
 *  - On success destlen is recomputed as (destlen-2) - o_len, and
 *    SSVAL(ob, destlen, 0) is used to terminate the buffer (per the inline
 *    comment, so that the result is terminated in every charset).
 *
 * NOTE(review): the initial value of destlen, any retry loop around the
 * allocation, the assignment to *dest, the return statements and several
 * comment terminators are not present in this listing; confirm the exact
 * error/return contract against the complete file.
 */
203 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
204 void const *src, size_t srclen, void **dest)
206 size_t i_len, o_len, destlen;
208 const char *inbuf = (const char *)src;
210 smb_iconv_t descriptor;
214 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
217 descriptor = get_conv_handle(from, to);
219 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
220 /* conversion not supported, return -1*/
221 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
222 charset_name(global_loadparm, from), charset_name(global_loadparm, to)));
226 /* it is _very_ rare that a conversion increases the size by
231 destlen = 2 + (destlen*3);
232 ob = talloc_realloc(ctx, outbuf, char, destlen);
234 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
241 /* we give iconv 2 less bytes to allow us to terminate at the
245 retval = smb_iconv(descriptor,
248 if(retval == (size_t)-1) {
249 const char *reason="unknown error";
252 reason="Incomplete multibyte sequence";
257 reason="Illegal multibyte sequence";
260 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
265 destlen = (destlen-2) - o_len;
267 /* guarantee null termination in all charsets */
268 SSVAL(ob, destlen, 0);
276 * Copy a string from a char* unix src to a dos codepage string destination.
278 * @return the number of bytes occupied by the string in the destination.
280 * @param flags can include
282 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
283 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
286 * @param dest_len the maximum length in bytes allowed in the
287 * destination. If @p dest_len is -1 then no maximum is used.
/*
 * Convert the C string src (CH_UNIX) into the DOS codepage (CH_DOS) in the
 * caller-supplied buffer dest of at most dest_len bytes.
 *
 * Behaviour visible in this listing:
 *  - With STR_UPPER set, an upper-cased copy of src is made with
 *    strupper_talloc(NULL, src) and the function calls itself on that copy
 *    with STR_UPPER cleared; a NULL copy is checked for.
 *  - Otherwise src_len starts as strlen(src); the STR_TERMINATE /
 *    STR_TERMINATE_ASCII test adjusts it (the adjusting statement is not
 *    shown; presumably it adds the terminator, TODO confirm).
 *  - The result of convert_string(CH_UNIX, CH_DOS, ...) is returned.
 *
 * NOTE(review): local declarations, the value returned when the upper-cased
 * copy cannot be allocated, and the release of tmpbuf are not present in
 * this listing.
 */
289 static ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
294 if (flags & STR_UPPER) {
295 char *tmpbuf = strupper_talloc(NULL, src);
296 if (tmpbuf == NULL) {
299 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
304 src_len = strlen(src);
306 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
309 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
313 * Copy a string from a unix char* src to an ASCII destination,
314 * allocating a buffer using talloc().
316 * @param dest always set at least to NULL
318 * @returns The number of bytes occupied by the string in the destination
319 * or -1 in case of error.
/*
 * Convert the C string src from CH_UNIX to CH_DOS into a talloc'ed buffer.
 * The length passed on is strlen(src)+1, so the terminating NUL is part of
 * the converted input. The result of convert_string_talloc() is returned
 * unchanged.
 */
321 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
323 size_t src_len = strlen(src)+1;
325 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
330 * Copy a string from a dos codepage source to a unix char* destination.
332 * The resulting string in "dest" is always null terminated.
334 * @param flags can have:
336 * <dt>STR_TERMINATE</dt>
337 * <dd>STR_TERMINATE means the string in @p src
338 * is null terminated, and src_len is ignored.</dd>
341 * @param src_len is the length of the source area in bytes.
342 * @returns the number of bytes occupied by the string in @p src.
/*
 * Convert a DOS codepage (CH_DOS) string at src into CH_UNIX in dest.
 *
 * Behaviour visible in this listing:
 *  - With STR_TERMINATE or STR_TERMINATE_ASCII set, src_len is recomputed:
 *    when it is (size_t)-1 it becomes strlen(src) + 1; in the other visible
 *    branch strnlen(src, src_len) is evaluated (how its result is folded
 *    back into src_len is not shown, TODO confirm).
 *  - convert_string(CH_DOS, CH_UNIX, ...) performs the conversion.
 *  - dest is terminated by writing 0 at index MIN(ret, dest_len-1).
 *
 * NOTE(review): local declarations, any guard around the terminator write
 * and the return statement are not present in this listing.
 */
344 static ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
348 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
349 if (src_len == (size_t)-1) {
350 src_len = strlen((const char *)src) + 1;
352 size_t len = strnlen((const char *)src, src_len);
359 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
362 dest[MIN(ret, dest_len-1)] = 0;
368 * Copy a string from a char* src to a unicode destination.
370 * @returns the number of bytes occupied by the string in the destination.
372 * @param flags can have:
375 * <dt>STR_TERMINATE <dd>means include the null termination.
376 * <dt>STR_UPPER <dd>means uppercase in the destination.
377 * <dt>STR_NOALIGN <dd>means don't do alignment.
380 * @param dest_len is the maximum length allowed in the
381 * destination. If dest_len is -1 then no maximum is used.
/*
 * Convert the C string src (CH_UNIX) into CH_UTF16 in the caller-supplied
 * buffer dest of at most dest_len bytes.
 *
 * Behaviour visible in this listing:
 *  - With STR_UPPER set, an upper-cased copy is made with
 *    strupper_talloc(NULL, src) and the function calls itself on that copy
 *    with STR_UPPER cleared; a NULL copy is checked for.
 *  - src_len starts as strlen(src); the STR_TERMINATE test adjusts it (the
 *    adjusting statement is not shown; presumably it adds the terminator,
 *    TODO confirm).
 *  - When ucs2_align(NULL, dest, flags) is non-zero, dest is advanced by one
 *    byte and dest_len is decremented if it is non-zero.
 *  - convert_string(CH_UNIX, CH_UTF16, ...) performs the conversion and its
 *    result is compared against (size_t)-1 to detect failure.
 *
 * NOTE(review): local declarations, the statement under the "multiple of 2
 * bytes" comment, the handling of the failure case and the return statements
 * are not present in this listing.
 */
383 static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
386 size_t src_len = strlen(src);
389 if (flags & STR_UPPER) {
390 char *tmpbuf = strupper_talloc(NULL, src);
391 if (tmpbuf == NULL) {
394 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
399 if (flags & STR_TERMINATE)
402 if (ucs2_align(NULL, dest, flags)) {
404 dest = (void *)((char *)dest + 1);
405 if (dest_len) dest_len--;
409 /* ucs2 is always a multiple of 2 bytes */
412 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
413 if (ret == (size_t)-1) {
424 * Copy a string from a unix char* src to a UCS2 destination,
425 * allocating a buffer using talloc().
427 * @param dest always set at least to NULL
429 * @returns The number of bytes occupied by the string in the destination
430 * or -1 in case of error.
/*
 * Convert the C string src from CH_UNIX to CH_UTF16 into a talloc'ed buffer.
 * The length passed on is strlen(src)+1, so the terminating NUL is part of
 * the converted input. The result of convert_string_talloc() is returned
 * unchanged.
 */
432 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
434 size_t src_len = strlen(src)+1;
436 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
441 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
443 * @param dest always set at least to NULL
445 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the C string src from CH_UNIX to CH_UTF8 into a talloc'ed buffer.
 * The length passed on is strlen(src)+1, so the terminating NUL is part of
 * the converted input. The result of convert_string_talloc() is returned
 * unchanged.
 */
448 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
450 size_t src_len = strlen(src)+1;
452 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
456 Copy a string from a ucs2 source to a unix char* destination.
458 STR_TERMINATE means the string in src is null terminated.
459 STR_NOALIGN means don't try to align.
460 if STR_TERMINATE is set then src_len is ignored if it is -1.
461 src_len is the length of the source area in bytes
462 Return the number of bytes occupied by the string in src.
463 The resulting string in "dest" is always null terminated.
/*
 * Convert a CH_UTF16 string at src into CH_UNIX in dest.
 *
 * Behaviour visible in this listing:
 *  - When ucs2_align(NULL, src, flags) is non-zero, src is advanced by one
 *    byte before conversion.
 *  - With STR_TERMINATE set, src_len is recomputed from the data:
 *    utf16_len(src) when src_len is (size_t)-1, and utf16_len_n(src, src_len)
 *    in the other visible branch.
 *  - A further adjustment is made only when src_len is not (size_t)-1 (the
 *    statement itself is not shown; the comment says the length must be a
 *    multiple of 2 bytes).
 *  - convert_string(CH_UTF16, CH_UNIX, ...) performs the conversion and dest
 *    is terminated by writing 0 at index MIN(ret, dest_len-1).
 *
 * Note that the declared return type is size_t, whereas pull_ascii() is
 * declared to return ssize_t.
 *
 * NOTE(review): local declarations, any src_len adjustment after the
 * alignment step, any guard around the terminator write and the return
 * statement are not present in this listing.
 */
466 static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
470 if (ucs2_align(NULL, src, flags)) {
471 src = (const void *)((const char *)src + 1);
476 if (flags & STR_TERMINATE) {
477 if (src_len == (size_t)-1) {
478 src_len = utf16_len(src);
480 src_len = utf16_len_n(src, src_len);
484 /* ucs2 is always a multiple of 2 bytes */
485 if (src_len != (size_t)-1)
488 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
490 dest[MIN(ret, dest_len-1)] = 0;
496 * Copy a string from an ASCII src to a unix char * destination, allocating a buffer using talloc
498 * @param dest always set at least to NULL
500 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the C string src from CH_DOS to CH_UNIX into a talloc'ed buffer.
 * The length passed on is strlen(src)+1, so the terminating NUL is part of
 * the converted input. The result of convert_string_talloc() is returned
 * unchanged.
 */
503 _PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
505 size_t src_len = strlen(src)+1;
507 return convert_string_talloc(ctx, CH_DOS, CH_UNIX, src, src_len, (void **)dest);
511 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
513 * @param dest always set at least to NULL
515 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert a CH_UTF16 string at src to CH_UNIX into a talloc'ed buffer.
 * The source length is taken from utf16_len(src); the result of
 * convert_string_talloc() is returned unchanged.
 *
 * NOTE(review): whether utf16_len() counts the terminating 16-bit NUL is not
 * visible here; confirm, since convert_string_talloc() rejects a length of 0.
 */
518 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
520 size_t src_len = utf16_len(src);
522 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
526 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
528 * @param dest always set at least to NULL
530 * @returns The number of bytes occupied by the string in the destination
/*
 * Convert the C string src from CH_UTF8 to CH_UNIX into a talloc'ed buffer.
 * The length passed on is strlen(src)+1, so the terminating NUL is part of
 * the converted input. The result of convert_string_talloc() is returned
 * unchanged.
 */
533 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
535 size_t src_len = strlen(src)+1;
537 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
541 Copy a string from a char* src to a unicode or ascii
542 dos codepage destination choosing unicode or ascii based on the
543 flags in the SMB buffer starting at base_ptr.
544 Return the number of bytes occupied by the string in the destination.
546 STR_TERMINATE means include the null termination.
547 STR_UPPER means uppercase in the destination.
548 STR_ASCII use ascii even with unicode packet.
549 STR_NOALIGN means don't do alignment.
550 dest_len is the maximum length allowed in the destination. If dest_len
551 is -1 then no maximum is used.
/*
 * Dispatch a "push" (CH_UNIX string -> wire format) on the encoding flag:
 * STR_ASCII selects push_ascii(), otherwise STR_UNICODE selects push_ucs2().
 * STR_ASCII is tested first, so it wins if both flags are set. If neither
 * flag is set, smb_panic() is called with an explanatory message.
 *
 * NOTE(review): whatever follows the smb_panic() call (for example a
 * fallback return value) is not present in this listing.
 */
554 _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
556 if (flags & STR_ASCII) {
557 return push_ascii(dest, src, dest_len, flags);
558 } else if (flags & STR_UNICODE) {
559 return push_ucs2(dest, src, dest_len, flags);
561 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
568 Copy a string from a unicode or ascii source (depending on
569 the packet flags) to a char* destination.
571 STR_TERMINATE means the string in src is null terminated.
572 STR_UNICODE means to force as unicode.
573 STR_ASCII use ascii even with unicode packet.
574 STR_NOALIGN means don't do alignment.
575 if STR_TERMINATE is set then src_len is ignored if it is -1
576 src_len is the length of the source area in bytes.
577 Return the number of bytes occupied by the string in src.
578 The resulting string in "dest" is always null terminated.
/*
 * Dispatch a "pull" (wire format -> CH_UNIX string) on the encoding flag:
 * STR_ASCII selects pull_ascii(), otherwise STR_UNICODE selects pull_ucs2().
 * STR_ASCII is tested first, so it wins if both flags are set. If neither
 * flag is set, smb_panic() is called with an explanatory message.
 *
 * NOTE(review): whatever follows the smb_panic() call (for example a
 * fallback return value) is not present in this listing.
 */
581 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
583 if (flags & STR_ASCII) {
584 return pull_ascii(dest, src, dest_len, src_len, flags);
585 } else if (flags & STR_UNICODE) {
586 return pull_ucs2(dest, src, dest_len, src_len, flags);
588 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
595 return the unicode codepoint for the next multi-byte CH_UNIX character
598 also return the number of bytes consumed (which tells the caller
599 how many bytes to skip to get to the next CH_UNIX character)
601 return INVALID_CODEPOINT if the next character cannot be converted
/*
 * Decode the next character of the CH_UNIX string str into a Unicode
 * codepoint and report through *size how many input bytes it used.
 *
 * Behaviour visible in this listing:
 *  - Fast path: if the high bit of str[0] is clear, the byte itself is
 *    returned as the codepoint without using iconv.
 *  - Otherwise at most 5 input bytes are considered (strnlen(str, 5)) and
 *    the CH_UNIX -> CH_UTF16 handle is fetched; a handle of (smb_iconv_t)-1
 *    yields INVALID_CODEPOINT.
 *  - smb_iconv() is called up to twice into the local buffer buf (the second
 *    attempt exists to cope with codepoints above 64k, per the inline
 *    comment); if nothing was converted INVALID_CODEPOINT is returned.
 *  - *size is set to ilen_orig - ilen, i.e. the number of input bytes the
 *    conversion consumed.
 *  - A 2-byte result is returned as SVAL(buf, 0); a 4-byte result is decoded
 *    by hand as a UTF-16 surrogate pair and offset by 0x10000; any other
 *    output length yields INVALID_CODEPOINT.
 *
 * NOTE(review): the declarations of buf/ilen/olen/outbuf, the values given
 * to olen before each smb_iconv() call, the conditions selecting the 2-byte
 * and 4-byte branches and the *size assignments on the early-return paths
 * are not present in this listing; confirm against the complete file.
 */
603 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
605 /* it cannot occupy more than 4 bytes in UTF16 format */
607 smb_iconv_t descriptor;
613 if ((str[0] & 0x80) == 0) {
615 return (codepoint_t)str[0];
618 /* we assume that no multi-byte character can take
619 more than 5 bytes. This is OK as we only
620 support codepoints up to 1M */
621 ilen_orig = strnlen(str, 5);
624 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
625 if (descriptor == (smb_iconv_t)-1) {
627 return INVALID_CODEPOINT;
630 /* this looks a little strange, but it is needed to cope
631 with codepoints above 64k */
633 outbuf = (char *)buf;
634 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
637 outbuf = (char *)buf;
638 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
640 /* we didn't convert any bytes */
642 return INVALID_CODEPOINT;
649 *size = ilen_orig - ilen;
652 return (codepoint_t)SVAL(buf, 0);
655 /* decode a 4 byte UTF16 character manually */
656 return (codepoint_t)0x10000 +
657 (buf[2] | ((buf[3] & 0x3)<<8) |
658 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
661 /* no other length is valid */
662 return INVALID_CODEPOINT;
666 push a single codepoint into a CH_UNIX string the target string must
667 be able to hold the full character, which is guaranteed if it is at
668 least 5 bytes in size. The caller may pass less than 5 bytes if they
669 are sure the character will fit (for example, you can assume that
670 uppercase/lowercase of a character will not add more than 1 byte)
672 return the number of bytes occupied by the CH_UNIX character, or
675 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
677 smb_iconv_t descriptor;
687 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
688 if (descriptor == (smb_iconv_t)-1) {
697 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
706 buf[0] = (c>>10) & 0xFF;
707 buf[1] = (c>>18) | 0xd8;
709 buf[3] = ((c>>8) & 0x3) | 0xdc;
715 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);