2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program. If not, see <http://www.gnu.org/licenses/>.
23 #include "system/iconv.h"
28 * @brief Character-set conversion routines built on our iconv.
30 * @note Samba's internal character set (at least in the 3.0 series)
31 * is always the same as the one for the Unix filesystem. It is
32 * <b>not</b> necessarily UTF-8 and may be different on machines that
33 * need i18n filenames to be compatible with Unix software. It does
34 * have to be a superset of ASCII. All multibyte sequences must start
35 * with a byte with the high bit set.
41 * Return the name of a charset to give to iconv().
43 static const char *charset_name(charset_t ch)
46 case CH_UTF16: return "UTF-16LE";
47 case CH_UNIX: return lp_unix_charset();
48 case CH_DOS: return lp_dos_charset();
49 case CH_DISPLAY: return lp_display_charset();
50 case CH_UTF8: return "UTF8";
51 case CH_UTF16BE: return "UTF-16BE";
57 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
60 re-initialize iconv conversion descriptors
62 _PUBLIC_ void init_iconv(void)
65 for (c1=0;c1<NUM_CHARSETS;c1++) {
66 for (c2=0;c2<NUM_CHARSETS;c2++) {
67 if (conv_handles[c1][c2] != NULL) {
68 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
69 smb_iconv_close(conv_handles[c1][c2]);
71 conv_handles[c1][c2] = NULL;
79 on-demand initialisation of conversion handles
81 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
84 static int initialised;
85 /* auto-free iconv memory on exit so valgrind reports are easier
87 if (initialised == 0) {
91 /* we set back the locale to C to get ASCII-compatible
92 toupper/lower functions. For now we do not need
93 any other POSIX localisations anyway. When we
94 should really need localized string functions one
95 day we need to write our own ascii_tolower etc.
97 setlocale(LC_ALL, "C");
103 if (conv_handles[from][to]) {
104 return conv_handles[from][to];
107 n1 = charset_name(from);
108 n2 = charset_name(to);
110 conv_handles[from][to] = smb_iconv_open(n2,n1);
112 if (conv_handles[from][to] == (smb_iconv_t)-1) {
113 if ((from == CH_DOS || to == CH_DOS) &&
114 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
115 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
116 charset_name(CH_DOS)));
117 lp_set_cmdline("dos charset", "ASCII");
119 n1 = charset_name(from);
120 n2 = charset_name(to);
122 conv_handles[from][to] = smb_iconv_open(n2,n1);
126 return conv_handles[from][to];
131 * Convert string from one encoding to another, making error checking etc
133 * @param src pointer to source string (multibyte or singlebyte)
134 * @param srclen length of the source string in bytes
135 * @param dest pointer to destination string (multibyte or singlebyte)
136 * @param destlen maximal length allowed for string
137 * @returns the number of bytes occupied in the destination
139 _PUBLIC_ ssize_t convert_string(charset_t from, charset_t to,
140 void const *src, size_t srclen,
141 void *dest, size_t destlen)
145 const char* inbuf = (const char*)src;
146 char* outbuf = (char*)dest;
147 smb_iconv_t descriptor;
149 if (srclen == (size_t)-1)
150 srclen = strlen(src)+1;
152 descriptor = get_conv_handle(from, to);
154 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
155 /* conversion not supported, use as is */
156 size_t len = MIN(srclen,destlen);
157 memcpy(dest,src,len);
163 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
164 if(retval==(size_t)-1) {
168 reason="Incomplete multibyte sequence";
171 reason="No more room";
172 if (from == CH_UNIX) {
173 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
174 charset_name(from), charset_name(to),
175 (int)srclen, (int)destlen,
178 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
179 charset_name(from), charset_name(to),
180 (int)srclen, (int)destlen));
184 reason="Illegal multibyte sequence";
187 /* smb_panic(reason); */
189 return destlen-o_len;
193 * Convert between character sets, allocating a new buffer using talloc for the result.
195 * @param srclen length of source buffer.
196 * @param dest always set at least to NULL
197 * @note -1 is not accepted for srclen.
199 * @returns Size in bytes of the converted string; or -1 in case of error.
202 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
203 void const *src, size_t srclen, void **dest)
205 size_t i_len, o_len, destlen;
207 const char *inbuf = (const char *)src;
209 smb_iconv_t descriptor;
213 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
216 descriptor = get_conv_handle(from, to);
218 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
219 /* conversion not supported, return -1*/
220 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
221 charset_name(from), charset_name(to)));
225 /* it is _very_ rare that a conversion increases the size by
230 destlen = 2 + (destlen*3);
231 ob = talloc_realloc(ctx, outbuf, char, destlen);
233 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
240 /* we give iconv 2 less bytes to allow us to terminate at the
244 retval = smb_iconv(descriptor,
247 if(retval == (size_t)-1) {
248 const char *reason="unknown error";
251 reason="Incomplete multibyte sequence";
256 reason="Illegal multibyte sequence";
259 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
264 destlen = (destlen-2) - o_len;
266 /* guarantee null termination in all charsets */
267 SSVAL(ob, destlen, 0);
275 * Copy a string from a char* unix src to a dos codepage string destination.
277 * @return the number of bytes occupied by the string in the destination.
279 * @param flags can include
281 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
282 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
285 * @param dest_len the maximum length in bytes allowed in the
286 * destination. If @p dest_len is -1 then no maximum is used.
288 static ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
293 if (flags & STR_UPPER) {
294 char *tmpbuf = strupper_talloc(NULL, src);
295 if (tmpbuf == NULL) {
298 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
303 src_len = strlen(src);
305 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
308 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
312 * Copy a string from a unix char* src to an ASCII destination,
313 * allocating a buffer using talloc().
315 * @param dest always set at least to NULL
317 * @returns The number of bytes occupied by the string in the destination
318 * or -1 in case of error.
320 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
322 size_t src_len = strlen(src)+1;
324 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
329 * Copy a string from a dos codepage source to a unix char* destination.
331 * The resulting string in "dest" is always null terminated.
333 * @param flags can have:
335 * <dt>STR_TERMINATE</dt>
336 * <dd>STR_TERMINATE means the string in @p src
337 * is null terminated, and src_len is ignored.</dd>
340 * @param src_len is the length of the source area in bytes.
341 * @returns the number of bytes occupied by the string in @p src.
343 static ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
347 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
348 if (src_len == (size_t)-1) {
349 src_len = strlen(src) + 1;
351 size_t len = strnlen(src, src_len);
358 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
361 dest[MIN(ret, dest_len-1)] = 0;
367 * Copy a string from a char* src to a unicode destination.
369 * @returns the number of bytes occupied by the string in the destination.
371 * @param flags can have:
374 * <dt>STR_TERMINATE <dd>means include the null termination.
375 * <dt>STR_UPPER <dd>means uppercase in the destination.
376 * <dt>STR_NOALIGN <dd>means don't do alignment.
379 * @param dest_len is the maximum length allowed in the
380 * destination. If dest_len is -1 then no maximum is used.
382 static ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
385 size_t src_len = strlen(src);
388 if (flags & STR_UPPER) {
389 char *tmpbuf = strupper_talloc(NULL, src);
390 if (tmpbuf == NULL) {
393 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
398 if (flags & STR_TERMINATE)
401 if (ucs2_align(NULL, dest, flags)) {
403 dest = (void *)((char *)dest + 1);
404 if (dest_len) dest_len--;
408 /* ucs2 is always a multiple of 2 bytes */
411 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
412 if (ret == (size_t)-1) {
423 * Copy a string from a unix char* src to a UCS2 destination,
424 * allocating a buffer using talloc().
426 * @param dest always set at least to NULL
428 * @returns The number of bytes occupied by the string in the destination
429 * or -1 in case of error.
431 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
433 size_t src_len = strlen(src)+1;
435 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
440 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
442 * @param dest always set at least to NULL
444 * @returns The number of bytes occupied by the string in the destination
447 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
449 size_t src_len = strlen(src)+1;
451 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
455 Copy a string from a ucs2 source to a unix char* destination.
457 STR_TERMINATE means the string in src is null terminated.
458 STR_NOALIGN means don't try to align.
459 if STR_TERMINATE is set then src_len is ignored if it is -1.
460 src_len is the length of the source area in bytes
461 Return the number of bytes occupied by the string in src.
462 The resulting string in "dest" is always null terminated.
465 static size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
469 if (ucs2_align(NULL, src, flags)) {
470 src = (const void *)((const char *)src + 1);
475 if (flags & STR_TERMINATE) {
476 if (src_len == (size_t)-1) {
477 src_len = utf16_len(src);
479 src_len = utf16_len_n(src, src_len);
483 /* ucs2 is always a multiple of 2 bytes */
484 if (src_len != (size_t)-1)
487 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
489 dest[MIN(ret, dest_len-1)] = 0;
495 * Copy a string from an ASCII src to a unix char * destination, allocating a buffer using talloc
497 * @param dest always set at least to NULL
499 * @returns The number of bytes occupied by the string in the destination
502 _PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
504 size_t src_len = strlen(src)+1;
506 return convert_string_talloc(ctx, CH_DOS, CH_UNIX, src, src_len, (void **)dest);
510 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
512 * @param dest always set at least to NULL
514 * @returns The number of bytes occupied by the string in the destination
517 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
519 size_t src_len = utf16_len(src);
521 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
525 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
527 * @param dest always set at least to NULL
529 * @returns The number of bytes occupied by the string in the destination
532 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
534 size_t src_len = strlen(src)+1;
536 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
540 Copy a string from a char* src to a unicode or ascii
541 dos codepage destination choosing unicode or ascii based on the
542 flags in the SMB buffer starting at base_ptr.
543 Return the number of bytes occupied by the string in the destination.
545 STR_TERMINATE means include the null termination.
546 STR_UPPER means uppercase in the destination.
547 STR_ASCII use ascii even with unicode packet.
548 STR_NOALIGN means don't do alignment.
549 dest_len is the maximum length allowed in the destination. If dest_len
550 is -1 then no maximum is used.
553 _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
555 if (flags & STR_ASCII) {
556 return push_ascii(dest, src, dest_len, flags);
557 } else if (flags & STR_UNICODE) {
558 return push_ucs2(dest, src, dest_len, flags);
560 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
567 Copy a string from a unicode or ascii source (depending on
568 the packet flags) to a char* destination.
570 STR_TERMINATE means the string in src is null terminated.
571 STR_UNICODE means to force as unicode.
572 STR_ASCII use ascii even with unicode packet.
573 STR_NOALIGN means don't do alignment.
574 if STR_TERMINATE is set then src_len is ignored if it is -1
575 src_len is the length of the source area in bytes.
576 Return the number of bytes occupied by the string in src.
577 The resulting string in "dest" is always null terminated.
580 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
582 if (flags & STR_ASCII) {
583 return pull_ascii(dest, src, dest_len, src_len, flags);
584 } else if (flags & STR_UNICODE) {
585 return pull_ucs2(dest, src, dest_len, src_len, flags);
587 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
594 return the unicode codepoint for the next multi-byte CH_UNIX character
597 also return the number of bytes consumed (which tells the caller
598 how many bytes to skip to get to the next CH_UNIX character)
600 return INVALID_CODEPOINT if the next character cannot be converted
602 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
604 /* it cannot occupy more than 4 bytes in UTF16 format */
606 smb_iconv_t descriptor;
612 if ((str[0] & 0x80) == 0) {
614 return (codepoint_t)str[0];
617 /* we assume that no multi-byte character can take
618 more than 5 bytes. This is OK as we only
619 support codepoints up to 1M */
620 ilen_orig = strnlen(str, 5);
623 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
624 if (descriptor == (smb_iconv_t)-1) {
626 return INVALID_CODEPOINT;
629 /* this looks a little strange, but it is needed to cope
630 with codepoints above 64k */
632 outbuf = (char *)buf;
633 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
636 outbuf = (char *)buf;
637 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
639 /* we didn't convert any bytes */
641 return INVALID_CODEPOINT;
648 *size = ilen_orig - ilen;
651 return (codepoint_t)SVAL(buf, 0);
654 /* decode a 4 byte UTF16 character manually */
655 return (codepoint_t)0x10000 +
656 (buf[2] | ((buf[3] & 0x3)<<8) |
657 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
660 /* no other length is valid */
661 return INVALID_CODEPOINT;
665 push a single codepoint into a CH_UNIX string. The target string must
666 be able to hold the full character, which is guaranteed if it is at
667 least 5 bytes in size. The caller may pass less than 5 bytes if they
668 are sure the character will fit (for example, you can assume that
669 uppercase/lowercase of a character will not add more than 1 byte)
671 return the number of bytes occupied by the CH_UNIX character, or
674 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
676 smb_iconv_t descriptor;
686 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
687 if (descriptor == (smb_iconv_t)-1) {
696 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
705 buf[0] = (c>>10) & 0xFF;
706 buf[1] = (c>>18) | 0xd8;
708 buf[3] = ((c>>8) & 0x3) | 0xdc;
714 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);