2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "system/iconv.h"
/**
 * @file
 *
 * @brief Character-set conversion routines built on our iconv.
 *
 * @note Samba's internal character set (at least in the 3.0 series)
 * is always the same as the one for the Unix filesystem.  It is
 * <b>not</b> necessarily UTF-8 and may be different on machines that
 * need i18n filenames to be compatible with Unix software.  It does
 * have to be a superset of ASCII.  All multibyte sequences must start
 * with a byte with the high bit set.
 */
43 * Return the name of a charset to give to iconv().
45 static const char *charset_name(charset_t ch)
47 const char *ret = NULL;
49 if (ch == CH_UTF16) ret = "UTF-16LE";
50 else if (ch == CH_UNIX) ret = lp_unix_charset();
51 else if (ch == CH_DOS) ret = lp_dos_charset();
52 else if (ch == CH_DISPLAY) ret = lp_display_charset();
53 else if (ch == CH_UTF8) ret = "UTF8";
54 else if (ch == CH_UTF16BE) ret = "UTF-16BE";
56 if (!ret || !*ret) ret = "ASCII";
60 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
63 re-initialize iconv conversion descriptors
68 for (c1=0;c1<NUM_CHARSETS;c1++) {
69 for (c2=0;c2<NUM_CHARSETS;c2++) {
70 if (conv_handles[c1][c2] != NULL) {
71 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
72 smb_iconv_close(conv_handles[c1][c2]);
74 conv_handles[c1][c2] = NULL;
82 on-demand initialisation of conversion handles
84 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
87 static int initialised;
88 /* auto-free iconv memory on exit so valgrind reports are easier
90 if (initialised == 0) {
94 /* we set back the locale to C to get ASCII-compatible
95 toupper/lower functions. For now we do not need
96 any other POSIX localisations anyway. When we
97 should really need localized string functions one
98 day we need to write our own ascii_tolower etc.
100 setlocale(LC_ALL, "C");
106 if (conv_handles[from][to]) {
107 return conv_handles[from][to];
110 n1 = charset_name(from);
111 n2 = charset_name(to);
113 conv_handles[from][to] = smb_iconv_open(n2,n1);
115 if (conv_handles[from][to] == (smb_iconv_t)-1) {
116 if ((from == CH_DOS || to == CH_DOS) &&
117 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
118 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
119 charset_name(CH_DOS)));
120 lp_set_cmdline("dos charset", "ASCII");
122 n1 = charset_name(from);
123 n2 = charset_name(to);
125 conv_handles[from][to] = smb_iconv_open(n2,n1);
129 return conv_handles[from][to];
134 * Convert string from one encoding to another, making error checking etc
136 * @param src pointer to source string (multibyte or singlebyte)
137 * @param srclen length of the source string in bytes
138 * @param dest pointer to destination string (multibyte or singlebyte)
139 * @param destlen maximal length allowed for string
140 * @returns the number of bytes occupied in the destination
142 ssize_t convert_string(charset_t from, charset_t to,
143 void const *src, size_t srclen,
144 void *dest, size_t destlen)
148 const char* inbuf = (const char*)src;
149 char* outbuf = (char*)dest;
150 smb_iconv_t descriptor;
152 if (srclen == (size_t)-1)
153 srclen = strlen(src)+1;
155 descriptor = get_conv_handle(from, to);
157 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
158 /* conversion not supported, use as is */
159 size_t len = MIN(srclen,destlen);
160 memcpy(dest,src,len);
166 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
167 if(retval==(size_t)-1) {
171 reason="Incomplete multibyte sequence";
174 reason="No more room";
175 if (from == CH_UNIX) {
176 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
177 charset_name(from), charset_name(to),
178 (int)srclen, (int)destlen,
181 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
182 charset_name(from), charset_name(to),
183 (int)srclen, (int)destlen));
187 reason="Illegal multibyte sequence";
190 /* smb_panic(reason); */
192 return destlen-o_len;
196 * Convert between character sets, allocating a new buffer using talloc for the result.
198 * @param srclen length of source buffer.
199 * @param dest always set at least to NULL
200 * @note -1 is not accepted for srclen.
202 * @returns Size in bytes of the converted string; or -1 in case of error.
205 ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
206 void const *src, size_t srclen, void **dest)
208 size_t i_len, o_len, destlen;
210 const char *inbuf = (const char *)src;
212 smb_iconv_t descriptor;
216 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
219 descriptor = get_conv_handle(from, to);
221 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
222 /* conversion not supported, return -1*/
223 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
224 charset_name(from), charset_name(to)));
228 /* it is _very_ rare that a conversion increases the size by
233 destlen = 2 + (destlen*3);
234 ob = talloc_realloc(ctx, outbuf, char, destlen);
236 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
243 /* we give iconv 2 less bytes to allow us to terminate at the
247 retval = smb_iconv(descriptor,
250 if(retval == (size_t)-1) {
251 const char *reason="unknown error";
254 reason="Incomplete multibyte sequence";
259 reason="Illegal multibyte sequence";
262 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
267 destlen = (destlen-2) - o_len;
269 /* guarantee null termination in all charsets */
270 SSVAL(ob, destlen, 0);
278 * Copy a string from a char* unix src to a dos codepage string destination.
280 * @return the number of bytes occupied by the string in the destination.
282 * @param flags can include
284 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
285 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
288 * @param dest_len the maximum length in bytes allowed in the
289 * destination. If @p dest_len is -1 then no maximum is used.
291 ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
296 if (flags & STR_UPPER) {
297 char *tmpbuf = strupper_talloc(NULL, src);
298 if (tmpbuf == NULL) {
301 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
306 /* treat a pstring as "unlimited" length */
307 if (dest_len == (size_t)-1)
308 dest_len = sizeof(pstring);
310 src_len = strlen(src);
312 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
315 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
319 * Copy a string from a unix char* src to an ASCII destination,
320 * allocating a buffer using talloc().
322 * @param dest always set at least to NULL
324 * @returns The number of bytes occupied by the string in the destination
325 * or -1 in case of error.
327 ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
329 size_t src_len = strlen(src)+1;
332 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
337 * Copy a string from a dos codepage source to a unix char* destination.
339 * The resulting string in "dest" is always null terminated.
341 * @param flags can have:
343 * <dt>STR_TERMINATE</dt>
344 * <dd>STR_TERMINATE means the string in @p src
345 * is null terminated, and src_len is ignored.</dd>
348 * @param src_len is the length of the source area in bytes.
349 * @returns the number of bytes occupied by the string in @p src.
351 ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
355 if (dest_len == (size_t)-1)
356 dest_len = sizeof(pstring);
358 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
359 if (src_len == (size_t)-1) {
360 src_len = strlen(src) + 1;
362 size_t len = strnlen(src, src_len);
369 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
372 dest[MIN(ret, dest_len-1)] = 0;
378 * Copy a string from a char* src to a unicode destination.
380 * @returns the number of bytes occupied by the string in the destination.
382 * @param flags can have:
385 * <dt>STR_TERMINATE <dd>means include the null termination.
386 * <dt>STR_UPPER <dd>means uppercase in the destination.
387 * <dt>STR_NOALIGN <dd>means don't do alignment.
390 * @param dest_len is the maximum length allowed in the
391 * destination. If dest_len is -1 then no maxiumum is used.
393 ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
396 size_t src_len = strlen(src);
399 if (flags & STR_UPPER) {
400 char *tmpbuf = strupper_talloc(NULL, src);
401 if (tmpbuf == NULL) {
404 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
409 /* treat a pstring as "unlimited" length */
410 if (dest_len == (size_t)-1)
411 dest_len = sizeof(pstring);
413 if (flags & STR_TERMINATE)
416 if (ucs2_align(NULL, dest, flags)) {
418 dest = (void *)((char *)dest + 1);
419 if (dest_len) dest_len--;
423 /* ucs2 is always a multiple of 2 bytes */
426 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
427 if (ret == (size_t)-1) {
438 * Copy a string from a unix char* src to a UCS2 destination,
439 * allocating a buffer using talloc().
441 * @param dest always set at least to NULL
443 * @returns The number of bytes occupied by the string in the destination
444 * or -1 in case of error.
446 ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
448 size_t src_len = strlen(src)+1;
450 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
455 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
457 * @param dest always set at least to NULL
459 * @returns The number of bytes occupied by the string in the destination
462 ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
464 size_t src_len = strlen(src)+1;
467 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
471 Copy a string from a ucs2 source to a unix char* destination.
473 STR_TERMINATE means the string in src is null terminated.
474 STR_NOALIGN means don't try to align.
475 if STR_TERMINATE is set then src_len is ignored if it is -1.
476 src_len is the length of the source area in bytes
477 Return the number of bytes occupied by the string in src.
478 The resulting string in "dest" is always null terminated.
481 size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
485 if (dest_len == (size_t)-1)
486 dest_len = sizeof(pstring);
488 if (ucs2_align(NULL, src, flags)) {
489 src = (const void *)((const char *)src + 1);
494 if (flags & STR_TERMINATE) {
495 if (src_len == (size_t)-1) {
496 src_len = utf16_len(src);
498 src_len = utf16_len_n(src, src_len);
502 /* ucs2 is always a multiple of 2 bytes */
503 if (src_len != (size_t)-1)
506 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
508 dest[MIN(ret, dest_len-1)] = 0;
514 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
516 * @param dest always set at least to NULL
518 * @returns The number of bytes occupied by the string in the destination
521 ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
523 size_t src_len = utf16_len(src);
525 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
529 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
531 * @param dest always set at least to NULL
533 * @returns The number of bytes occupied by the string in the destination
536 ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
538 size_t src_len = strlen(src)+1;
540 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
544 Copy a string from a char* src to a unicode or ascii
545 dos codepage destination choosing unicode or ascii based on the
546 flags in the SMB buffer starting at base_ptr.
547 Return the number of bytes occupied by the string in the destination.
549 STR_TERMINATE means include the null termination.
550 STR_UPPER means uppercase in the destination.
551 STR_ASCII use ascii even with unicode packet.
552 STR_NOALIGN means don't do alignment.
553 dest_len is the maximum length allowed in the destination. If dest_len
554 is -1 then no maxiumum is used.
557 ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
559 if (flags & STR_ASCII) {
560 return push_ascii(dest, src, dest_len, flags);
561 } else if (flags & STR_UNICODE) {
562 return push_ucs2(dest, src, dest_len, flags);
564 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
571 Copy a string from a unicode or ascii source (depending on
572 the packet flags) to a char* destination.
574 STR_TERMINATE means the string in src is null terminated.
575 STR_UNICODE means to force as unicode.
576 STR_ASCII use ascii even with unicode packet.
577 STR_NOALIGN means don't do alignment.
578 if STR_TERMINATE is set then src_len is ignored is it is -1
579 src_len is the length of the source area in bytes.
580 Return the number of bytes occupied by the string in src.
581 The resulting string in "dest" is always null terminated.
584 ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
586 if (flags & STR_ASCII) {
587 return pull_ascii(dest, src, dest_len, src_len, flags);
588 } else if (flags & STR_UNICODE) {
589 return pull_ucs2(dest, src, dest_len, src_len, flags);
591 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
598 return the unicode codepoint for the next multi-byte CH_UNIX character
601 also return the number of bytes consumed (which tells the caller
602 how many bytes to skip to get to the next CH_UNIX character)
604 return INVALID_CODEPOINT if the next character cannot be converted
606 codepoint_t next_codepoint(const char *str, size_t *size)
608 /* it cannot occupy more than 4 bytes in UTF16 format */
610 smb_iconv_t descriptor;
616 if ((str[0] & 0x80) == 0) {
618 return (codepoint_t)str[0];
621 /* we assume that no multi-byte character can take
622 more than 5 bytes. This is OK as we only
623 support codepoints up to 1M */
624 ilen_orig = strnlen(str, 5);
627 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
628 if (descriptor == (smb_iconv_t)-1) {
630 return INVALID_CODEPOINT;
633 /* this looks a little strange, but it is needed to cope
634 with codepoints above 64k */
637 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
641 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
643 /* we didn't convert any bytes */
645 return INVALID_CODEPOINT;
652 *size = ilen_orig - ilen;
655 return (codepoint_t)SVAL(buf, 0);
658 /* decode a 4 byte UTF16 character manually */
659 return (codepoint_t)0x10000 +
660 (buf[2] | ((buf[3] & 0x3)<<8) |
661 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
664 /* no other length is valid */
665 return INVALID_CODEPOINT;
669 push a single codepoint into a CH_UNIX string the target string must
670 be able to hold the full character, which is guaranteed if it is at
671 least 5 bytes in size. The caller may pass less than 5 bytes if they
672 are sure the character will fit (for example, you can assume that
673 uppercase/lowercase of a character will not add more than 1 byte)
675 return the number of bytes occupied by the CH_UNIX character, or
678 ssize_t push_codepoint(char *str, codepoint_t c)
680 smb_iconv_t descriptor;
690 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
691 if (descriptor == (smb_iconv_t)-1) {
700 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
709 buf[0] = (c>>10) & 0xFF;
710 buf[1] = (c>>18) | 0xd8;
712 buf[3] = ((c>>8) & 0x3) | 0xdc;
718 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);