2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 #include "system/iconv.h"
30 * @brief Character-set conversion routines built on our iconv.
32 * @note Samba's internal character set (at least in the 3.0 series)
33 * is always the same as the one for the Unix filesystem. It is
34 * <b>not</b> necessarily UTF-8 and may be different on machines that
35 * need i18n filenames to be compatible with Unix software. It does
36 * have to be a superset of ASCII. All multibyte sequences must start
37 * with a byte with the high bit set.
43 * Return the name of a charset to give to iconv().
// Translate a charset_t selector into a charset name string.
// CH_UNIX, CH_DOS and CH_DISPLAY take their names from lp_unix_charset(),
// lp_dos_charset() and lp_display_charset(); the UTF-16 and UTF-8
// selectors use fixed names.
45 static const char *charset_name(charset_t ch)
47 const char *ret = NULL;
49 if (ch == CH_UTF16) ret = "UTF-16LE";
50 else if (ch == CH_UNIX) ret = lp_unix_charset();
51 else if (ch == CH_DOS) ret = lp_dos_charset();
52 else if (ch == CH_DISPLAY) ret = lp_display_charset();
53 else if (ch == CH_UTF8) ret = "UTF8";
54 else if (ch == CH_UTF16BE) ret = "UTF-16BE";
// Unrecognised selector (ret still NULL) or an empty name: use ASCII.
56 if (!ret || !*ret) ret = "ASCII";
// Table of conversion descriptors indexed as [from][to]; get_conv_handle()
// fills slots on demand. Static storage, so every slot starts out zero.
60 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
63 re-initialize iconv conversion descriptors
// NOTE(review): the signature line of the function owning this loop is not
// part of this listing.
// Visits every [c1][c2] slot and leaves it NULL. A slot holding the
// (smb_iconv_t)-1 failure marker is not passed to smb_iconv_close().
68 for (c1=0;c1<NUM_CHARSETS;c1++) {
69 for (c2=0;c2<NUM_CHARSETS;c2++) {
70 if (conv_handles[c1][c2] != NULL) {
71 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
72 smb_iconv_close(conv_handles[c1][c2]);
74 conv_handles[c1][c2] = NULL;
82 on-demand initialisation of conversion handles
// Return the conversion descriptor for from -> to, opening it with
// smb_iconv_open() the first time the pair is requested and storing it in
// conv_handles. The result can be (smb_iconv_t)-1 when opening failed.
84 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
// Guards the first-call setup below.
87 static int initialised;
88 /* auto-free iconv memory on exit so valgrind reports are easier
90 if (initialised == 0) {
94 /* we set back the locale to C to get ASCII-compatible
95 toupper/lower functions. For now we do not need
96 any other POSIX localisations anyway. When we
97 should really need localized string functions one
98 day we need to write our own ascii_tolower etc.
100 setlocale(LC_ALL, "C");
// Any non-zero slot is returned as is, which includes a stored
// (smb_iconv_t)-1 from an earlier failed open.
106 if (conv_handles[from][to]) {
107 return conv_handles[from][to];
110 n1 = charset_name(from);
111 n2 = charset_name(to);
// The destination charset name (n2) is passed first, the source name second.
113 conv_handles[from][to] = smb_iconv_open(n2,n1);
// Open failed. If the DOS charset is involved and is not already ASCII,
// log it, set the dos charset parameter to ASCII through lp_set_cmdline()
// and retry the open with freshly looked-up names.
115 if (conv_handles[from][to] == (smb_iconv_t)-1) {
116 if ((from == CH_DOS || to == CH_DOS) &&
117 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
118 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
119 charset_name(CH_DOS)));
120 lp_set_cmdline("dos charset", "ASCII");
122 n1 = charset_name(from);
123 n2 = charset_name(to);
125 conv_handles[from][to] = smb_iconv_open(n2,n1);
129 return conv_handles[from][to];
134 * Convert string from one encoding to another, making error checking etc
136 * @param src pointer to source string (multibyte or singlebyte)
137 * @param srclen length of the source string in bytes
138 * @param dest pointer to destination string (multibyte or singlebyte)
139 * @param destlen maximal length allowed for string
140 * @returns the number of bytes occupied in the destination
// Convert srclen bytes at src from charset from to charset to, writing the
// result into dest, whose capacity is destlen bytes.
142 ssize_t convert_string(charset_t from, charset_t to,
143 void const *src, size_t srclen,
144 void *dest, size_t destlen)
148 const char* inbuf = (const char*)src;
149 char* outbuf = (char*)dest;
150 smb_iconv_t descriptor;
// srclen == (size_t)-1 means src is NUL-terminated; the terminator is
// included in the length.
152 if (srclen == (size_t)-1)
153 srclen = strlen(src)+1;
155 descriptor = get_conv_handle(from, to);
// No usable descriptor: copy the bytes unconverted, truncated to whichever
// of srclen and destlen is smaller.
157 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
158 /* conversion not supported, use as is */
159 size_t len = MIN(srclen,destlen);
160 memcpy(dest,src,len);
// NOTE(review): i_len and o_len are set up on lines missing from this listing.
166 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
167 if(retval==(size_t)-1) {
171 reason="Incomplete multibyte sequence";
174 reason="No more room";
// For a CH_UNIX source the source string itself is logged as well.
// NOTE(review): srclen and destlen are size_t but are formatted with %d.
175 if (from == CH_UNIX) {
176 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
177 charset_name(from), charset_name(to),
178 srclen, destlen, (const char *)src));
180 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
181 charset_name(from), charset_name(to),
186 reason="Illegal multibyte sequence";
189 /* smb_panic(reason); */
// o_len is the output-space counter handed to smb_iconv(); presumably it
// started at destlen, making this the number of bytes written - confirm.
191 return destlen-o_len;
195 * Convert between character sets, allocating a new buffer using talloc for the result.
197 * @param srclen length of source buffer.
198 * @param dest always set at least to NULL
199 * @note -1 is not accepted for srclen.
201 * @returns Size in bytes of the converted string; or -1 in case of error.
// Convert srclen bytes at src from charset from to charset to, with the
// output buffer obtained from talloc_realloc() on ctx.
204 ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
205 void const *src, size_t srclen, void **dest)
207 size_t i_len, o_len, destlen;
209 const char *inbuf = (const char *)src;
211 smb_iconv_t descriptor;
// Reject a NULL src and srclen values of 0 or (size_t)-1.
215 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
218 descriptor = get_conv_handle(from, to);
220 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
221 /* conversion not supported, return -1*/
222 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
223 charset_name(from), charset_name(to)));
227 /* it is _very_ rare that a conversion increases the size by
// New buffer size: three times the previous destlen plus 2 bytes.
232 destlen = 2 + (destlen*3);
// The result goes to ob rather than straight back into outbuf.
233 ob = talloc_realloc(ctx, outbuf, char, destlen);
235 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
242 /* we give iconv 2 less bytes to allow us to terminate at the
246 retval = smb_iconv(descriptor,
249 if(retval == (size_t)-1) {
250 const char *reason="unknown error";
253 reason="Incomplete multibyte sequence";
258 reason="Illegal multibyte sequence";
// NOTE(review): inbuf is printed with %s here.
261 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
// Bytes produced: the space offered to iconv (destlen-2) minus o_len,
// presumably the space left over - confirm.
266 destlen = (destlen-2) - o_len;
268 /* guarantee null termination in all charsets */
269 SSVAL(ob, destlen, 0);
277 * Copy a string from a char* unix src to a dos codepage string destination.
279 * @return the number of bytes occupied by the string in the destination.
281 * @param flags can include
283 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
284 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
287 * @param dest_len the maximum length in bytes allowed in the
288 * destination. If @p dest_len is -1 then no maximum is used.
// Convert the unix string src to the DOS charset in dest via convert_string().
290 ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
// STR_UPPER: make an uppercased copy with strupper_talloc() and recurse on
// it with STR_UPPER cleared.
295 if (flags & STR_UPPER) {
296 char *tmpbuf = strupper_talloc(NULL, src);
297 if (tmpbuf == NULL) {
300 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
305 /* treat a pstring as "unlimited" length */
306 if (dest_len == (size_t)-1)
307 dest_len = sizeof(pstring);
// Length without the terminating NUL.
309 src_len = strlen(src);
// NOTE(review): the statement controlled by this test is missing from the
// listing; presumably it extends src_len to cover the NUL - confirm.
311 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
314 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
318 * Copy a string from a unix char* src to an ASCII destination,
319 * allocating a buffer using talloc().
321 * @param dest always set at least to NULL
323 * @returns The number of bytes occupied by the string in the destination
324 * or -1 in case of error.
// Convert the unix string src, including its terminating NUL, to the DOS
// charset through convert_string_talloc() on ctx.
326 ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
328 size_t src_len = strlen(src)+1;
331 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
336 * Copy a string from a dos codepage source to a unix char* destination.
338 * The resulting string in "dest" is always null terminated.
340 * @param flags can have:
342 * <dt>STR_TERMINATE</dt>
343 * <dd>STR_TERMINATE means the string in @p src
344 * is null terminated, and src_len is ignored.</dd>
347 * @param src_len is the length of the source area in bytes.
348 * @returns the number of bytes occupied by the string in @p src.
// Convert DOS-charset bytes at src into the unix string dest and
// NUL-terminate dest.
350 ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
// dest_len of (size_t)-1 is treated as sizeof(pstring).
354 if (dest_len == (size_t)-1)
355 dest_len = sizeof(pstring);
// With a terminate flag the source length comes from the string itself:
// strlen()+1 when src_len is (size_t)-1, otherwise strnlen() bounded by
// src_len.
357 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
358 if (src_len == (size_t)-1) {
359 src_len = strlen(src) + 1;
361 size_t len = strnlen(src, src_len);
368 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
// Terminate at the converted length, clamped to the last byte of dest.
// NOTE(review): dest_len-1 wraps if dest_len is 0, unless a guard sits on a
// line missing from this listing.
371 dest[MIN(ret, dest_len-1)] = 0;
377 * Copy a string from a char* src to a unicode destination.
379 * @returns the number of bytes occupied by the string in the destination.
381 * @param flags can have:
384 * <dt>STR_TERMINATE <dd>means include the null termination.
385 * <dt>STR_UPPER <dd>means uppercase in the destination.
386 * <dt>STR_NOALIGN <dd>means don't do alignment.
389 * @param dest_len is the maximum length allowed in the
390 * destination. If dest_len is -1 then no maximum is used.
// Convert the unix string src to the CH_UTF16 charset in dest via
// convert_string().
392 ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
// Length without the terminating NUL.
395 size_t src_len = strlen(src);
// STR_UPPER: make an uppercased copy with strupper_talloc() and recurse on
// it with STR_UPPER cleared.
398 if (flags & STR_UPPER) {
399 char *tmpbuf = strupper_talloc(NULL, src);
400 if (tmpbuf == NULL) {
403 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
408 /* treat a pstring as "unlimited" length */
409 if (dest_len == (size_t)-1)
410 dest_len = sizeof(pstring);
// NOTE(review): the statement controlled by this test is missing from the
// listing; presumably it extends src_len to cover the NUL - confirm.
412 if (flags & STR_TERMINATE)
// When ucs2_align() asks for alignment, skip one byte of dest and shrink
// the remaining space to match.
415 if (ucs2_align(NULL, dest, flags)) {
417 dest = (void *)((char *)dest + 1);
418 if (dest_len) dest_len--;
422 /* ucs2 is always a multiple of 2 bytes */
425 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
// Failure check on the convert_string() result.
426 if (ret == (size_t)-1) {
437 * Copy a string from a unix char* src to a UCS2 destination,
438 * allocating a buffer using talloc().
440 * @param dest always set at least to NULL
442 * @returns The number of bytes occupied by the string in the destination
443 * or -1 in case of error.
// Convert the unix string src, including its terminating NUL, to the
// CH_UTF16 charset through convert_string_talloc() on ctx.
445 ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
447 size_t src_len = strlen(src)+1;
449 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
454 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
456 * @param dest always set at least to NULL
458 * @returns The number of bytes occupied by the string in the destination
// Convert the unix string src, including its terminating NUL, to the
// CH_UTF8 charset through convert_string_talloc() on ctx.
461 ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
463 size_t src_len = strlen(src)+1;
466 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
470 Copy a string from a ucs2 source to a unix char* destination.
472 STR_TERMINATE means the string in src is null terminated.
473 STR_NOALIGN means don't try to align.
474 if STR_TERMINATE is set then src_len is ignored if it is -1.
475 src_len is the length of the source area in bytes
476 Return the number of bytes occupied by the string in src.
477 The resulting string in "dest" is always null terminated.
// Convert CH_UTF16 bytes at src into the unix string dest and NUL-terminate
// dest.
// NOTE(review): declared to return size_t, while pull_ascii() returns ssize_t.
480 size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
// dest_len of (size_t)-1 is treated as sizeof(pstring).
484 if (dest_len == (size_t)-1)
485 dest_len = sizeof(pstring);
// When ucs2_align() asks for alignment, step src forward one byte.
487 if (ucs2_align(NULL, src, flags)) {
488 src = (const void *)((const char *)src + 1);
// With STR_TERMINATE the source length is measured: utf16_len() when
// src_len is (size_t)-1, otherwise utf16_len_n() bounded by src_len.
493 if (flags & STR_TERMINATE) {
494 if (src_len == (size_t)-1) {
495 src_len = utf16_len(src);
497 src_len = utf16_len_n(src, src_len);
501 /* ucs2 is always a multiple of 2 bytes */
502 if (src_len != (size_t)-1)
505 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
// Terminate at the converted length, clamped to the last byte of dest.
507 dest[MIN(ret, dest_len-1)] = 0;
513 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
515 * @param dest always set at least to NULL
517 * @returns The number of bytes occupied by the string in the destination
// Convert the CH_UTF16 string at src to the unix charset through
// convert_string_talloc() on ctx; the source length is taken from
// utf16_len().
520 ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
522 size_t src_len = utf16_len(src);
524 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
528 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
530 * @param dest always set at least to NULL
532 * @returns The number of bytes occupied by the string in the destination
// Convert the CH_UTF8 string src, including its terminating NUL, to the
// unix charset through convert_string_talloc() on ctx.
535 ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
537 size_t src_len = strlen(src)+1;
539 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
543 Copy a string from a char* src to a unicode or ascii
544 dos codepage destination choosing unicode or ascii based on the
545 flags in the SMB buffer starting at base_ptr.
546 Return the number of bytes occupied by the string in the destination.
548 STR_TERMINATE means include the null termination.
549 STR_UPPER means uppercase in the destination.
550 STR_ASCII use ascii even with unicode packet.
551 STR_NOALIGN means don't do alignment.
552 dest_len is the maximum length allowed in the destination. If dest_len
553 is -1 then no maximum is used.
// Dispatch on flags: STR_ASCII selects push_ascii() and is tested first,
// so it wins when both flags are set; STR_UNICODE selects push_ucs2().
// With neither flag set smb_panic() is called.
556 ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
558 if (flags & STR_ASCII) {
559 return push_ascii(dest, src, dest_len, flags);
560 } else if (flags & STR_UNICODE) {
561 return push_ucs2(dest, src, dest_len, flags);
563 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
570 Copy a string from a unicode or ascii source (depending on
571 the packet flags) to a char* destination.
573 STR_TERMINATE means the string in src is null terminated.
574 STR_UNICODE means to force as unicode.
575 STR_ASCII use ascii even with unicode packet.
576 STR_NOALIGN means don't do alignment.
577 if STR_TERMINATE is set then src_len is ignored if it is -1
578 src_len is the length of the source area in bytes.
579 Return the number of bytes occupied by the string in src.
580 The resulting string in "dest" is always null terminated.
// Dispatch on flags: STR_ASCII selects pull_ascii() and is tested first,
// so it wins when both flags are set; STR_UNICODE selects pull_ucs2().
// With neither flag set smb_panic() is called.
583 ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
585 if (flags & STR_ASCII) {
586 return pull_ascii(dest, src, dest_len, src_len, flags);
587 } else if (flags & STR_UNICODE) {
588 return pull_ucs2(dest, src, dest_len, src_len, flags);
590 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
597 return the unicode codepoint for the next multi-byte CH_UNIX character
600 also return the number of bytes consumed (which tells the caller
601 how many bytes to skip to get to the next CH_UNIX character)
603 return INVALID_CODEPOINT if the next character cannot be converted
// Decode the character at the start of str (CH_UNIX charset) to a Unicode
// codepoint by converting it to CH_UTF16, and report through size how many
// input bytes it used.
605 codepoint_t next_codepoint(const char *str, size_t *size)
607 /* it cannot occupy more than 4 bytes in UTF16 format */
609 smb_iconv_t descriptor;
// Fast path: a byte with the high bit clear is returned as its own codepoint.
615 if ((str[0] & 0x80) == 0) {
617 return (codepoint_t)str[0];
620 /* we assume that no multi-byte character can take
621 more than 5 bytes. This is OK as we only
622 support codepoints up to 1M */
623 ilen_orig = strnlen(str, 5);
626 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
627 if (descriptor == (smb_iconv_t)-1) {
629 return INVALID_CODEPOINT;
632 /* this looks a little strange, but it is needed to cope
633 with codepoints above 64k */
636 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
640 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
642 /* we didn't convert any bytes */
644 return INVALID_CODEPOINT;
// Input bytes consumed by the conversion.
651 *size = ilen_orig - ilen;
// Presumably the 2-byte result case (its test is on a missing line): the
// single 16-bit unit is read with SVAL().
654 return (codepoint_t)SVAL(buf, 0);
657 /* decode a 4 byte UTF16 character manually */
// buf[2] and buf[3] supply the low ten bits, buf[0] and buf[1] the next
// ten; 0x10000 is then added.
658 return (codepoint_t)0x10000 +
659 (buf[2] | ((buf[3] & 0x3)<<8) |
660 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
663 /* no other length is valid */
664 return INVALID_CODEPOINT;
668 push a single codepoint into a CH_UNIX string the target string must
669 be able to hold the full character, which is guaranteed if it is at
670 least 5 bytes in size. The caller may pass less than 5 bytes if they
671 are sure the character will fit (for example, you can assume that
672 uppercase/lowercase of a character will not add more than 1 byte)
674 return the number of bytes occupied by the CH_UNIX character, or
// Encode codepoint c into str in the CH_UNIX charset by converting from
// CH_UTF16 with smb_iconv().
// NOTE(review): the end of this function is not part of this listing.
677 ssize_t push_codepoint(char *str, codepoint_t c)
679 smb_iconv_t descriptor;
689 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
690 if (descriptor == (smb_iconv_t)-1) {
699 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
// Fill the bytes of a surrogate pair: 0xd8 marks the high unit and 0xdc the
// low unit. NOTE(review): the buf[2] assignment and any adjustment of c
// before these shifts are on lines missing from this listing.
708 buf[0] = (c>>10) & 0xFF;
709 buf[1] = (c>>18) | 0xd8;
711 buf[3] = ((c>>8) & 0x3) | 0xdc;
717 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);