2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#include "system/iconv.h"

#include <errno.h>
#include <locale.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
42 * Return the name of a charset to give to iconv().
44 static const char *charset_name(charset_t ch)
46 const char *ret = NULL;
48 if (ch == CH_UTF16) ret = "UTF-16LE";
49 else if (ch == CH_UNIX) ret = lp_unix_charset();
50 else if (ch == CH_DOS) ret = lp_dos_charset();
51 else if (ch == CH_DISPLAY) ret = lp_display_charset();
52 else if (ch == CH_UTF8) ret = "UTF8";
53 else if (ch == CH_UTF16BE) ret = "UTF-16BE";
55 if (!ret || !*ret) ret = "ASCII";
59 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
62 re-initialize iconv conversion descriptors
64 _PUBLIC_ void init_iconv(void)
67 for (c1=0;c1<NUM_CHARSETS;c1++) {
68 for (c2=0;c2<NUM_CHARSETS;c2++) {
69 if (conv_handles[c1][c2] != NULL) {
70 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
71 smb_iconv_close(conv_handles[c1][c2]);
73 conv_handles[c1][c2] = NULL;
81 on-demand initialisation of conversion handles
83 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
86 static int initialised;
87 /* auto-free iconv memory on exit so valgrind reports are easier
89 if (initialised == 0) {
93 /* we set back the locale to C to get ASCII-compatible
94 toupper/lower functions. For now we do not need
95 any other POSIX localisations anyway. When we
96 should really need localized string functions one
97 day we need to write our own ascii_tolower etc.
99 setlocale(LC_ALL, "C");
105 if (conv_handles[from][to]) {
106 return conv_handles[from][to];
109 n1 = charset_name(from);
110 n2 = charset_name(to);
112 conv_handles[from][to] = smb_iconv_open(n2,n1);
114 if (conv_handles[from][to] == (smb_iconv_t)-1) {
115 if ((from == CH_DOS || to == CH_DOS) &&
116 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
117 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
118 charset_name(CH_DOS)));
119 lp_set_cmdline("dos charset", "ASCII");
121 n1 = charset_name(from);
122 n2 = charset_name(to);
124 conv_handles[from][to] = smb_iconv_open(n2,n1);
128 return conv_handles[from][to];
133 * Convert string from one encoding to another, making error checking etc
135 * @param src pointer to source string (multibyte or singlebyte)
136 * @param srclen length of the source string in bytes
137 * @param dest pointer to destination string (multibyte or singlebyte)
138 * @param destlen maximal length allowed for string
139 * @returns the number of bytes occupied in the destination
141 _PUBLIC_ ssize_t convert_string(charset_t from, charset_t to,
142 void const *src, size_t srclen,
143 void *dest, size_t destlen)
147 const char* inbuf = (const char*)src;
148 char* outbuf = (char*)dest;
149 smb_iconv_t descriptor;
151 if (srclen == (size_t)-1)
152 srclen = strlen(src)+1;
154 descriptor = get_conv_handle(from, to);
156 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
157 /* conversion not supported, use as is */
158 size_t len = MIN(srclen,destlen);
159 memcpy(dest,src,len);
165 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
166 if(retval==(size_t)-1) {
170 reason="Incomplete multibyte sequence";
173 reason="No more room";
174 if (from == CH_UNIX) {
175 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
176 charset_name(from), charset_name(to),
177 (int)srclen, (int)destlen,
180 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
181 charset_name(from), charset_name(to),
182 (int)srclen, (int)destlen));
186 reason="Illegal multibyte sequence";
189 /* smb_panic(reason); */
191 return destlen-o_len;
195 * Convert between character sets, allocating a new buffer using talloc for the result.
197 * @param srclen length of source buffer.
198 * @param dest always set at least to NULL
199 * @note -1 is not accepted for srclen.
201 * @returns Size in bytes of the converted string; or -1 in case of error.
204 _PUBLIC_ ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
205 void const *src, size_t srclen, void **dest)
207 size_t i_len, o_len, destlen;
209 const char *inbuf = (const char *)src;
211 smb_iconv_t descriptor;
215 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
218 descriptor = get_conv_handle(from, to);
220 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
221 /* conversion not supported, return -1*/
222 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
223 charset_name(from), charset_name(to)));
227 /* it is _very_ rare that a conversion increases the size by
232 destlen = 2 + (destlen*3);
233 ob = talloc_realloc(ctx, outbuf, char, destlen);
235 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
242 /* we give iconv 2 less bytes to allow us to terminate at the
246 retval = smb_iconv(descriptor,
249 if(retval == (size_t)-1) {
250 const char *reason="unknown error";
253 reason="Incomplete multibyte sequence";
258 reason="Illegal multibyte sequence";
261 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
266 destlen = (destlen-2) - o_len;
268 /* guarantee null termination in all charsets */
269 SSVAL(ob, destlen, 0);
277 * Copy a string from a char* unix src to a dos codepage string destination.
279 * @return the number of bytes occupied by the string in the destination.
281 * @param flags can include
283 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
284 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
287 * @param dest_len the maximum length in bytes allowed in the
288 * destination. If @p dest_len is -1 then no maximum is used.
290 _PUBLIC_ ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
295 if (flags & STR_UPPER) {
296 char *tmpbuf = strupper_talloc(NULL, src);
297 if (tmpbuf == NULL) {
300 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
305 src_len = strlen(src);
307 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
310 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
314 * Copy a string from a unix char* src to an ASCII destination,
315 * allocating a buffer using talloc().
317 * @param dest always set at least to NULL
319 * @returns The number of bytes occupied by the string in the destination
320 * or -1 in case of error.
322 _PUBLIC_ ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
324 size_t src_len = strlen(src)+1;
327 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
332 * Copy a string from a dos codepage source to a unix char* destination.
334 * The resulting string in "dest" is always null terminated.
336 * @param flags can have:
338 * <dt>STR_TERMINATE</dt>
339 * <dd>STR_TERMINATE means the string in @p src
340 * is null terminated, and src_len is ignored.</dd>
343 * @param src_len is the length of the source area in bytes.
344 * @returns the number of bytes occupied by the string in @p src.
346 _PUBLIC_ ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
350 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
351 if (src_len == (size_t)-1) {
352 src_len = strlen(src) + 1;
354 size_t len = strnlen(src, src_len);
361 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
364 dest[MIN(ret, dest_len-1)] = 0;
370 * Copy a string from a char* src to a unicode destination.
372 * @returns the number of bytes occupied by the string in the destination.
374 * @param flags can have:
377 * <dt>STR_TERMINATE <dd>means include the null termination.
378 * <dt>STR_UPPER <dd>means uppercase in the destination.
379 * <dt>STR_NOALIGN <dd>means don't do alignment.
382 * @param dest_len is the maximum length allowed in the
383 * destination. If dest_len is -1 then no maxiumum is used.
385 _PUBLIC_ ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
388 size_t src_len = strlen(src);
391 if (flags & STR_UPPER) {
392 char *tmpbuf = strupper_talloc(NULL, src);
393 if (tmpbuf == NULL) {
396 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
401 if (flags & STR_TERMINATE)
404 if (ucs2_align(NULL, dest, flags)) {
406 dest = (void *)((char *)dest + 1);
407 if (dest_len) dest_len--;
411 /* ucs2 is always a multiple of 2 bytes */
414 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
415 if (ret == (size_t)-1) {
426 * Copy a string from a unix char* src to a UCS2 destination,
427 * allocating a buffer using talloc().
429 * @param dest always set at least to NULL
431 * @returns The number of bytes occupied by the string in the destination
432 * or -1 in case of error.
434 _PUBLIC_ ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
436 size_t src_len = strlen(src)+1;
438 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
443 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
445 * @param dest always set at least to NULL
447 * @returns The number of bytes occupied by the string in the destination
450 _PUBLIC_ ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
452 size_t src_len = strlen(src)+1;
455 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
459 Copy a string from a ucs2 source to a unix char* destination.
461 STR_TERMINATE means the string in src is null terminated.
462 STR_NOALIGN means don't try to align.
463 if STR_TERMINATE is set then src_len is ignored if it is -1.
464 src_len is the length of the source area in bytes
465 Return the number of bytes occupied by the string in src.
466 The resulting string in "dest" is always null terminated.
469 _PUBLIC_ size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
473 if (ucs2_align(NULL, src, flags)) {
474 src = (const void *)((const char *)src + 1);
479 if (flags & STR_TERMINATE) {
480 if (src_len == (size_t)-1) {
481 src_len = utf16_len(src);
483 src_len = utf16_len_n(src, src_len);
487 /* ucs2 is always a multiple of 2 bytes */
488 if (src_len != (size_t)-1)
491 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
493 dest[MIN(ret, dest_len-1)] = 0;
499 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
501 * @param dest always set at least to NULL
503 * @returns The number of bytes occupied by the string in the destination
506 _PUBLIC_ ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
508 size_t src_len = utf16_len(src);
510 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
514 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
516 * @param dest always set at least to NULL
518 * @returns The number of bytes occupied by the string in the destination
521 _PUBLIC_ ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
523 size_t src_len = strlen(src)+1;
525 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
529 Copy a string from a char* src to a unicode or ascii
530 dos codepage destination choosing unicode or ascii based on the
531 flags in the SMB buffer starting at base_ptr.
532 Return the number of bytes occupied by the string in the destination.
534 STR_TERMINATE means include the null termination.
535 STR_UPPER means uppercase in the destination.
536 STR_ASCII use ascii even with unicode packet.
537 STR_NOALIGN means don't do alignment.
538 dest_len is the maximum length allowed in the destination. If dest_len
539 is -1 then no maxiumum is used.
542 _PUBLIC_ ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
544 if (flags & STR_ASCII) {
545 return push_ascii(dest, src, dest_len, flags);
546 } else if (flags & STR_UNICODE) {
547 return push_ucs2(dest, src, dest_len, flags);
549 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
556 Copy a string from a unicode or ascii source (depending on
557 the packet flags) to a char* destination.
559 STR_TERMINATE means the string in src is null terminated.
560 STR_UNICODE means to force as unicode.
561 STR_ASCII use ascii even with unicode packet.
562 STR_NOALIGN means don't do alignment.
563 if STR_TERMINATE is set then src_len is ignored is it is -1
564 src_len is the length of the source area in bytes.
565 Return the number of bytes occupied by the string in src.
566 The resulting string in "dest" is always null terminated.
569 _PUBLIC_ ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
571 if (flags & STR_ASCII) {
572 return pull_ascii(dest, src, dest_len, src_len, flags);
573 } else if (flags & STR_UNICODE) {
574 return pull_ucs2(dest, src, dest_len, src_len, flags);
576 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
583 return the unicode codepoint for the next multi-byte CH_UNIX character
586 also return the number of bytes consumed (which tells the caller
587 how many bytes to skip to get to the next CH_UNIX character)
589 return INVALID_CODEPOINT if the next character cannot be converted
591 _PUBLIC_ codepoint_t next_codepoint(const char *str, size_t *size)
593 /* it cannot occupy more than 4 bytes in UTF16 format */
595 smb_iconv_t descriptor;
601 if ((str[0] & 0x80) == 0) {
603 return (codepoint_t)str[0];
606 /* we assume that no multi-byte character can take
607 more than 5 bytes. This is OK as we only
608 support codepoints up to 1M */
609 ilen_orig = strnlen(str, 5);
612 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
613 if (descriptor == (smb_iconv_t)-1) {
615 return INVALID_CODEPOINT;
618 /* this looks a little strange, but it is needed to cope
619 with codepoints above 64k */
622 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
626 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
628 /* we didn't convert any bytes */
630 return INVALID_CODEPOINT;
637 *size = ilen_orig - ilen;
640 return (codepoint_t)SVAL(buf, 0);
643 /* decode a 4 byte UTF16 character manually */
644 return (codepoint_t)0x10000 +
645 (buf[2] | ((buf[3] & 0x3)<<8) |
646 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
649 /* no other length is valid */
650 return INVALID_CODEPOINT;
654 push a single codepoint into a CH_UNIX string the target string must
655 be able to hold the full character, which is guaranteed if it is at
656 least 5 bytes in size. The caller may pass less than 5 bytes if they
657 are sure the character will fit (for example, you can assume that
658 uppercase/lowercase of a character will not add more than 1 byte)
660 return the number of bytes occupied by the CH_UNIX character, or
663 _PUBLIC_ ssize_t push_codepoint(char *str, codepoint_t c)
665 smb_iconv_t descriptor;
675 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
676 if (descriptor == (smb_iconv_t)-1) {
685 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
694 buf[0] = (c>>10) & 0xFF;
695 buf[1] = (c>>18) | 0xd8;
697 buf[3] = ((c>>8) & 0x3) | 0xdc;
703 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);