2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Jelmer Vernooij 2007
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 3 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program. If not, see <http://www.gnu.org/licenses/>.
24 #include "system/iconv.h"
25 #include "param/param.h"
30 * @brief Character-set conversion routines built on our iconv.
32 * @note Samba's internal character set (at least in the 3.0 series)
33 * is always the same as the one for the Unix filesystem. It is
34 * <b>not</b> necessarily UTF-8 and may be different on machines that
35 * need i18n filenames to be compatible with Unix software. It does
36 * have to be a superset of ASCII. All multibyte sequences must start
37 * with a byte with the high bit set.
42 struct smb_iconv_convenience {
43 const char *unix_charset;
44 const char *dos_charset;
46 smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
51 * Return the name of a charset to give to iconv().
53 static const char *charset_name(struct smb_iconv_convenience *ic, charset_t ch)
56 case CH_UTF16: return "UTF-16LE";
57 case CH_UNIX: return ic->unix_charset;
58 case CH_DOS: return ic->dos_charset;
59 case CH_UTF8: return "UTF8";
60 case CH_UTF16BE: return "UTF-16BE";
67 close every open iconv conversion descriptor and reset its slot to NULL
69 static int close_iconv(struct smb_iconv_convenience *data)
72 for (c1=0;c1<NUM_CHARSETS;c1++) {
73 for (c2=0;c2<NUM_CHARSETS;c2++) {
74 if (data->conv_handles[c1][c2] != NULL) {
75 if (data->conv_handles[c1][c2] != (smb_iconv_t)-1) {
76 smb_iconv_close(data->conv_handles[c1][c2]);
78 data->conv_handles[c1][c2] = NULL;
86 _PUBLIC_ struct smb_iconv_convenience *smb_iconv_convenience_init(TALLOC_CTX *mem_ctx,
87 const char *dos_charset,
88 const char *unix_charset,
91 struct smb_iconv_convenience *ret = talloc_zero(mem_ctx,
92 struct smb_iconv_convenience);
98 talloc_set_destructor(ret, close_iconv);
100 ret->dos_charset = talloc_strdup(ret, dos_charset);
101 ret->unix_charset = talloc_strdup(ret, unix_charset);
102 ret->native_iconv = native_iconv;
108 on-demand initialisation of conversion handles
110 static smb_iconv_t get_conv_handle(struct smb_iconv_convenience *ic,
111 charset_t from, charset_t to)
114 static bool initialised;
116 if (initialised == false) {
120 /* we set back the locale to C to get ASCII-compatible
121 toupper/lower functions. For now we do not need
122 any other POSIX localisations anyway. When we
123 should really need localized string functions one
124 day we need to write our own ascii_tolower etc.
126 setlocale(LC_ALL, "C");
130 if (ic->conv_handles[from][to]) {
131 return ic->conv_handles[from][to];
134 n1 = charset_name(ic, from);
135 n2 = charset_name(ic, to);
137 ic->conv_handles[from][to] = smb_iconv_open_ex(ic, n2, n1,
140 if (ic->conv_handles[from][to] == (smb_iconv_t)-1) {
141 if ((from == CH_DOS || to == CH_DOS) &&
142 strcasecmp(charset_name(ic, CH_DOS), "ASCII") != 0) {
143 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
144 charset_name(ic, CH_DOS)));
145 ic->dos_charset = "ASCII";
147 n1 = charset_name(ic, from);
148 n2 = charset_name(ic, to);
150 ic->conv_handles[from][to] =
151 smb_iconv_open_ex(ic, n2, n1, ic->native_iconv);
155 return ic->conv_handles[from][to];
160 * Convert a string from one encoding to another, with error checking.
162 * @param src pointer to source string (multibyte or singlebyte)
163 * @param srclen length of the source string in bytes
164 * @param dest pointer to destination string (multibyte or singlebyte)
165 * @param destlen maximal length allowed for string
166 * @returns the number of bytes occupied in the destination
168 _PUBLIC_ ssize_t convert_string(struct smb_iconv_convenience *ic,
169 charset_t from, charset_t to,
170 void const *src, size_t srclen,
171 void *dest, size_t destlen)
175 const char* inbuf = (const char*)src;
176 char* outbuf = (char*)dest;
177 smb_iconv_t descriptor;
179 if (srclen == (size_t)-1)
180 srclen = strlen(inbuf)+1;
182 descriptor = get_conv_handle(ic, from, to);
184 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
185 /* conversion not supported, use as is */
186 size_t len = MIN(srclen,destlen);
187 memcpy(dest,src,len);
193 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
194 if(retval==(size_t)-1) {
198 reason="Incomplete multibyte sequence";
201 reason="No more room";
202 if (from == CH_UNIX) {
203 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
204 charset_name(ic, from), charset_name(ic, to),
205 (int)srclen, (int)destlen,
208 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
209 charset_name(ic, from), charset_name(ic, to),
210 (int)srclen, (int)destlen));
214 reason="Illegal multibyte sequence";
217 /* smb_panic(reason); */
219 return destlen-o_len;
222 _PUBLIC_ ssize_t convert_string_talloc_descriptor(TALLOC_CTX *ctx, smb_iconv_t descriptor, void const *src, size_t srclen, void **dest)
224 size_t i_len, o_len, destlen;
226 const char *inbuf = (const char *)src;
231 /* it is _very_ rare that a conversion increases the size by
236 destlen = 2 + (destlen*3);
237 ob = talloc_realloc(ctx, outbuf, char, destlen);
239 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
246 /* we give iconv 2 less bytes to allow us to terminate at the
250 retval = smb_iconv(descriptor,
253 if(retval == (size_t)-1) {
254 const char *reason="unknown error";
257 reason="Incomplete multibyte sequence";
262 reason="Illegal multibyte sequence";
265 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
270 destlen = (destlen-2) - o_len;
272 /* guarantee null termination in all charsets */
273 SSVAL(ob, destlen, 0);
281 * Convert between character sets, allocating a new buffer using talloc for the result.
283 * @param srclen length of source buffer.
284 * @param dest always set at least to NULL
285 * @note -1 is not accepted for srclen.
287 * @returns Size in bytes of the converted string; or -1 in case of error.
290 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx,
291 struct smb_iconv_convenience *ic,
292 charset_t from, charset_t to,
293 void const *src, size_t srclen,
296 smb_iconv_t descriptor;
300 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
303 descriptor = get_conv_handle(ic, from, to);
305 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
306 /* conversion not supported, return -1*/
307 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
308 charset_name(ic, from),
309 charset_name(ic, to)));
313 return convert_string_talloc_descriptor(ctx, descriptor, src, srclen, dest);
317 * Copy a string from a char* unix src to a dos codepage string destination.
319 * @return the number of bytes occupied by the string in the destination.
321 * @param flags can include
323 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
324 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
327 * @param dest_len the maximum length in bytes allowed in the
328 * destination. If @p dest_len is -1 then no maximum is used.
330 static ssize_t push_ascii(struct smb_iconv_convenience *ic,
331 void *dest, const char *src, size_t dest_len, int flags)
336 if (flags & STR_UPPER) {
337 char *tmpbuf = strupper_talloc(NULL, src);
338 if (tmpbuf == NULL) {
341 ret = push_ascii(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER);
346 src_len = strlen(src);
348 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
351 return convert_string(ic, CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
355 * Copy a string from a unix char* src to an ASCII destination,
356 * allocating a buffer using talloc().
358 * @param dest always set at least to NULL
360 * @returns The number of bytes occupied by the string in the destination
361 * or -1 in case of error.
363 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
365 size_t src_len = strlen(src)+1;
367 return convert_string_talloc(ctx, ic, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
372 * Copy a string from a dos codepage source to a unix char* destination.
374 * The resulting string in "dest" is always null terminated.
376 * @param flags can have:
378 * <dt>STR_TERMINATE</dt>
379 * <dd>STR_TERMINATE means the string in @p src
380 * is null terminated, and src_len is ignored.</dd>
383 * @param src_len is the length of the source area in bytes.
384 * @returns the number of bytes occupied by the string in @p src.
386 static ssize_t pull_ascii(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
390 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
391 if (src_len == (size_t)-1) {
392 src_len = strlen((const char *)src) + 1;
394 size_t len = strnlen((const char *)src, src_len);
401 ret = convert_string(ic, CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
404 dest[MIN(ret, dest_len-1)] = 0;
410 * Copy a string from a char* src to a unicode destination.
412 * @returns the number of bytes occupied by the string in the destination.
414 * @param flags can have:
417 * <dt>STR_TERMINATE</dt> <dd>means include the null termination.</dd>
418 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination.</dd>
419 * <dt>STR_NOALIGN</dt> <dd>means don't do alignment.</dd>
422 * @param dest_len is the maximum length allowed in the
423 * destination. If dest_len is -1 then no maximum is used.
425 static ssize_t push_ucs2(struct smb_iconv_convenience *ic,
426 void *dest, const char *src, size_t dest_len, int flags)
429 size_t src_len = strlen(src);
432 if (flags & STR_UPPER) {
433 char *tmpbuf = strupper_talloc(NULL, src);
434 if (tmpbuf == NULL) {
437 ret = push_ucs2(ic, dest, tmpbuf, dest_len, flags & ~STR_UPPER);
442 if (flags & STR_TERMINATE)
445 if (ucs2_align(NULL, dest, flags)) {
447 dest = (void *)((char *)dest + 1);
448 if (dest_len) dest_len--;
452 /* ucs2 is always a multiple of 2 bytes */
455 ret = convert_string(ic, CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
456 if (ret == (size_t)-1) {
467 * Copy a string from a unix char* src to a UCS2 destination,
468 * allocating a buffer using talloc().
470 * @param dest always set at least to NULL
472 * @returns The number of bytes occupied by the string in the destination
473 * or -1 in case of error.
475 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, void **dest, const char *src)
477 size_t src_len = strlen(src)+1;
479 return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF16, src, src_len, dest);
484 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
486 * @param dest always set at least to NULL
488 * @returns The number of bytes occupied by the string in the destination
491 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
493 size_t src_len = strlen(src)+1;
495 return convert_string_talloc(ctx, ic, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
499 Copy a string from a ucs2 source to a unix char* destination.
501 STR_TERMINATE means the string in src is null terminated.
502 STR_NOALIGN means don't try to align.
503 if STR_TERMINATE is set then src_len is ignored if it is -1.
504 src_len is the length of the source area in bytes
505 Return the number of bytes occupied by the string in src.
506 The resulting string in "dest" is always null terminated.
509 static size_t pull_ucs2(struct smb_iconv_convenience *ic, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
513 if (ucs2_align(NULL, src, flags)) {
514 src = (const void *)((const char *)src + 1);
519 if (flags & STR_TERMINATE) {
520 if (src_len == (size_t)-1) {
521 src_len = utf16_len(src);
523 src_len = utf16_len_n(src, src_len);
527 /* ucs2 is always a multiple of 2 bytes */
528 if (src_len != (size_t)-1)
531 ret = convert_string(ic, CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
533 dest[MIN(ret, dest_len-1)] = 0;
539 * Copy a string from an ASCII src to a unix char * destination, allocating a buffer using talloc
541 * @param dest always set at least to NULL
543 * @returns The number of bytes occupied by the string in the destination
546 _PUBLIC_ ssize_t pull_ascii_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
548 size_t src_len = strlen(src)+1;
550 return convert_string_talloc(ctx, ic, CH_DOS, CH_UNIX, src, src_len, (void **)dest);
554 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
556 * @param dest always set at least to NULL
558 * @returns The number of bytes occupied by the string in the destination
561 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const void *src)
563 size_t src_len = utf16_len(src);
565 return convert_string_talloc(ctx, ic, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
569 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
571 * @param dest always set at least to NULL
573 * @returns The number of bytes occupied by the string in the destination
576 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, struct smb_iconv_convenience *ic, char **dest, const char *src)
578 size_t src_len = strlen(src)+1;
580 return convert_string_talloc(ctx, ic, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
584 Copy a string from a char* src to a unicode or ascii
585 dos codepage destination choosing unicode or ascii based on the
586 flags in the SMB buffer starting at base_ptr.
587 Return the number of bytes occupied by the string in the destination.
589 STR_TERMINATE means include the null termination.
590 STR_UPPER means uppercase in the destination.
591 STR_ASCII use ascii even with unicode packet.
592 STR_NOALIGN means don't do alignment.
593 dest_len is the maximum length allowed in the destination. If dest_len
594 is -1 then no maximum is used.
597 _PUBLIC_ ssize_t push_string(struct smb_iconv_convenience *ic,
598 void *dest, const char *src, size_t dest_len, int flags)
600 if (flags & STR_ASCII) {
601 return push_ascii(ic, dest, src, dest_len, flags);
602 } else if (flags & STR_UNICODE) {
603 return push_ucs2(ic, dest, src, dest_len, flags);
605 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
612 Copy a string from a unicode or ascii source (depending on
613 the packet flags) to a char* destination.
615 STR_TERMINATE means the string in src is null terminated.
616 STR_UNICODE means to force as unicode.
617 STR_ASCII use ascii even with unicode packet.
618 STR_NOALIGN means don't do alignment.
619 if STR_TERMINATE is set then src_len is ignored if it is -1
620 src_len is the length of the source area in bytes.
621 Return the number of bytes occupied by the string in src.
622 The resulting string in "dest" is always null terminated.
625 _PUBLIC_ ssize_t pull_string(struct smb_iconv_convenience *ic,
626 char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
628 if (flags & STR_ASCII) {
629 return pull_ascii(ic, dest, src, dest_len, src_len, flags);
630 } else if (flags & STR_UNICODE) {
631 return pull_ucs2(ic, dest, src, dest_len, src_len, flags);
633 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
640 return the unicode codepoint for the next multi-byte CH_UNIX character
643 also return the number of bytes consumed (which tells the caller
644 how many bytes to skip to get to the next CH_UNIX character)
646 return INVALID_CODEPOINT if the next character cannot be converted
648 _PUBLIC_ codepoint_t next_codepoint(struct smb_iconv_convenience *ic,
649 const char *str, size_t *size)
651 /* it cannot occupy more than 4 bytes in UTF16 format */
653 smb_iconv_t descriptor;
659 if ((str[0] & 0x80) == 0) {
661 return (codepoint_t)str[0];
664 /* we assume that no multi-byte character can take
665 more than 5 bytes. This is OK as we only
666 support codepoints up to 1M */
667 ilen_orig = strnlen(str, 5);
670 descriptor = get_conv_handle(ic, CH_UNIX, CH_UTF16);
671 if (descriptor == (smb_iconv_t)-1) {
673 return INVALID_CODEPOINT;
676 /* this looks a little strange, but it is needed to cope
677 with codepoints above 64k */
679 outbuf = (char *)buf;
680 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
683 outbuf = (char *)buf;
684 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
686 /* we didn't convert any bytes */
688 return INVALID_CODEPOINT;
695 *size = ilen_orig - ilen;
698 return (codepoint_t)SVAL(buf, 0);
701 /* decode a 4 byte UTF16 character manually */
702 return (codepoint_t)0x10000 +
703 (buf[2] | ((buf[3] & 0x3)<<8) |
704 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
707 /* no other length is valid */
708 return INVALID_CODEPOINT;
712 push a single codepoint into a CH_UNIX string. The target string must
713 be able to hold the full character, which is guaranteed if it is at
714 least 5 bytes in size. The caller may pass less than 5 bytes if they
715 are sure the character will fit (for example, you can assume that
716 uppercase/lowercase of a character will not add more than 1 byte)
718 return the number of bytes occupied by the CH_UNIX character, or
721 _PUBLIC_ ssize_t push_codepoint(struct smb_iconv_convenience *ic,
722 char *str, codepoint_t c)
724 smb_iconv_t descriptor;
734 descriptor = get_conv_handle(ic,
736 if (descriptor == (smb_iconv_t)-1) {
745 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
754 buf[0] = (c>>10) & 0xFF;
755 buf[1] = (c>>18) | 0xd8;
757 buf[3] = ((c>>8) & 0x3) | 0xdc;
763 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);