2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
28 * @brief Character-set conversion routines built on our iconv.
30 * @note Samba's internal character set (at least in the 3.0 series)
31 * is always the same as the one for the Unix filesystem. It is
32 * <b>not</b> necessarily UTF-8 and may be different on machines that
33 * need i18n filenames to be compatible with Unix software. It does
34 * have to be a superset of ASCII. All multibyte sequences must start
35 * with a byte with the high bit set.
41 * Return the name of a charset to give to iconv().
43 static const char *charset_name(charset_t ch)
45 const char *ret = NULL;
47 if (ch == CH_UTF16) ret = "UTF-16LE";
48 else if (ch == CH_UNIX) ret = lp_unix_charset();
49 else if (ch == CH_DOS) ret = lp_dos_charset();
50 else if (ch == CH_DISPLAY) ret = lp_display_charset();
51 else if (ch == CH_UTF8) ret = "UTF8";
52 else if (ch == CH_UTF16BE) ret = "UTF-16BE";
54 if (!ret || !*ret) ret = "ASCII";
58 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
61 re-initialize iconv conversion descriptors
66 for (c1=0;c1<NUM_CHARSETS;c1++) {
67 for (c2=0;c2<NUM_CHARSETS;c2++) {
68 if (conv_handles[c1][c2] != NULL) {
69 if (conv_handles[c1][c2] != (smb_iconv_t)-1) {
70 smb_iconv_close(conv_handles[c1][c2]);
72 conv_handles[c1][c2] = NULL;
80 on-demand initialisation of conversion handles
82 static smb_iconv_t get_conv_handle(charset_t from, charset_t to)
85 static int initialised;
86 /* auto-free iconv memory on exit so valgrind reports are easier
88 if (initialised == 0) {
93 if (conv_handles[from][to]) {
94 return conv_handles[from][to];
97 n1 = charset_name(from);
98 n2 = charset_name(to);
100 conv_handles[from][to] = smb_iconv_open(n2,n1);
102 if (conv_handles[from][to] == (smb_iconv_t)-1) {
103 if ((from == CH_DOS || to == CH_DOS) &&
104 strcasecmp(charset_name(CH_DOS), "ASCII") != 0) {
105 DEBUG(0,("dos charset '%s' unavailable - using ASCII\n",
106 charset_name(CH_DOS)));
107 lp_set_cmdline("dos charset", "ASCII");
109 n1 = charset_name(from);
110 n2 = charset_name(to);
112 conv_handles[from][to] = smb_iconv_open(n2,n1);
116 return conv_handles[from][to];
121 * Convert string from one encoding to another, making error checking etc
123 * @param src pointer to source string (multibyte or singlebyte)
124 * @param srclen length of the source string in bytes
125 * @param dest pointer to destination string (multibyte or singlebyte)
126 * @param destlen maximal length allowed for string
127 * @returns the number of bytes occupied in the destination
129 ssize_t convert_string(charset_t from, charset_t to,
130 void const *src, size_t srclen,
131 void *dest, size_t destlen)
135 const char* inbuf = (const char*)src;
136 char* outbuf = (char*)dest;
137 smb_iconv_t descriptor;
139 if (srclen == (size_t)-1)
140 srclen = strlen(src)+1;
142 descriptor = get_conv_handle(from, to);
144 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
145 /* conversion not supported, use as is */
146 size_t len = MIN(srclen,destlen);
147 memcpy(dest,src,len);
153 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
154 if(retval==(size_t)-1) {
158 reason="Incomplete multibyte sequence";
161 reason="No more room";
162 if (from == CH_UNIX) {
163 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d - '%s'\n",
164 charset_name(from), charset_name(to),
165 srclen, destlen, (const char *)src));
167 DEBUG(0,("E2BIG: convert_string(%s,%s): srclen=%d destlen=%d\n",
168 charset_name(from), charset_name(to),
173 reason="Illegal multibyte sequence";
176 /* smb_panic(reason); */
178 return destlen-o_len;
182 * Convert between character sets, allocating a new buffer using talloc for the result.
184 * @param srclen length of source buffer.
185 * @param dest always set at least to NULL
186 * @note -1 is not accepted for srclen.
188 * @returns Size in bytes of the converted string; or -1 in case of error.
191 ssize_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
192 void const *src, size_t srclen, void **dest)
194 size_t i_len, o_len, destlen;
196 const char *inbuf = (const char *)src;
198 smb_iconv_t descriptor;
202 if (src == NULL || srclen == (size_t)-1 || srclen == 0)
205 descriptor = get_conv_handle(from, to);
207 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
208 /* conversion not supported, return -1*/
209 DEBUG(3, ("convert_string_talloc: conversion from %s to %s not supported!\n",
210 charset_name(from), charset_name(to)));
214 /* it is _very_ rare that a conversion increases the size by
219 destlen = 2 + (destlen*3);
220 ob = (char *)talloc_realloc(ctx, outbuf, destlen);
222 DEBUG(0, ("convert_string_talloc: realloc failed!\n"));
229 /* we give iconv 2 less bytes to allow us to terminate at the
233 retval = smb_iconv(descriptor,
236 if(retval == (size_t)-1) {
237 const char *reason="unknown error";
240 reason="Incomplete multibyte sequence";
245 reason="Illegal multibyte sequence";
248 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
253 destlen = (destlen-2) - o_len;
255 /* guarantee null termination in all charsets */
256 SSVAL(ob, destlen, 0);
264 * Copy a string from a char* unix src to a dos codepage string destination.
266 * @return the number of bytes occupied by the string in the destination.
268 * @param flags can include
270 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
271 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
274 * @param dest_len the maximum length in bytes allowed in the
275 * destination. If @p dest_len is -1 then no maximum is used.
277 ssize_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
282 if (flags & STR_UPPER) {
283 char *tmpbuf = strupper_talloc(NULL, src);
284 if (tmpbuf == NULL) {
287 ret = push_ascii(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
292 /* treat a pstring as "unlimited" length */
293 if (dest_len == (size_t)-1)
294 dest_len = sizeof(pstring);
296 src_len = strlen(src);
298 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
301 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
305 * Copy a string from a unix char* src to an ASCII destination,
306 * allocating a buffer using talloc().
308 * @param dest always set at least to NULL
310 * @returns The number of bytes occupied by the string in the destination
311 * or -1 in case of error.
313 ssize_t push_ascii_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
315 size_t src_len = strlen(src)+1;
318 return convert_string_talloc(ctx, CH_UNIX, CH_DOS, src, src_len, (void **)dest);
323 * Copy a string from a dos codepage source to a unix char* destination.
325 * The resulting string in "dest" is always null terminated.
327 * @param flags can have:
329 * <dt>STR_TERMINATE</dt>
330 * <dd>STR_TERMINATE means the string in @p src
331 * is null terminated, and src_len is ignored.</dd>
334 * @param src_len is the length of the source area in bytes.
335 * @returns the number of bytes occupied by the string in @p src.
337 ssize_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
341 if (dest_len == (size_t)-1)
342 dest_len = sizeof(pstring);
344 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII)) {
345 if (src_len == (size_t)-1) {
346 src_len = strlen(src) + 1;
348 size_t len = strnlen(src, src_len);
355 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
358 dest[MIN(ret, dest_len-1)] = 0;
364 * Copy a string from a char* src to a unicode destination.
366 * @returns the number of bytes occupied by the string in the destination.
368 * @param flags can have:
371 * <dt>STR_TERMINATE <dd>means include the null termination.
372 * <dt>STR_UPPER <dd>means uppercase in the destination.
373 * <dt>STR_NOALIGN <dd>means don't do alignment.
376 * @param dest_len is the maximum length allowed in the
377 * destination. If dest_len is -1 then no maximum is used.
379 ssize_t push_ucs2(void *dest, const char *src, size_t dest_len, int flags)
382 size_t src_len = strlen(src);
385 if (flags & STR_UPPER) {
386 char *tmpbuf = strupper_talloc(NULL, src);
387 if (tmpbuf == NULL) {
390 ret = push_ucs2(dest, tmpbuf, dest_len, flags & ~STR_UPPER);
395 /* treat a pstring as "unlimited" length */
396 if (dest_len == (size_t)-1)
397 dest_len = sizeof(pstring);
399 if (flags & STR_TERMINATE)
402 if (ucs2_align(NULL, dest, flags)) {
404 dest = (void *)((char *)dest + 1);
405 if (dest_len) dest_len--;
409 /* ucs2 is always a multiple of 2 bytes */
412 ret = convert_string(CH_UNIX, CH_UTF16, src, src_len, dest, dest_len);
413 if (ret == (size_t)-1) {
424 * Copy a string from a unix char* src to a UCS2 destination,
425 * allocating a buffer using talloc().
427 * @param dest always set at least to NULL
429 * @returns The number of bytes occupied by the string in the destination
430 * or -1 in case of error.
432 ssize_t push_ucs2_talloc(TALLOC_CTX *ctx, void **dest, const char *src)
434 size_t src_len = strlen(src)+1;
436 return convert_string_talloc(ctx, CH_UNIX, CH_UTF16, src, src_len, dest);
441 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
443 * @param dest always set at least to NULL
445 * @returns The number of bytes occupied by the string in the destination
448 ssize_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
450 size_t src_len = strlen(src)+1;
453 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
457 Copy a string from a ucs2 source to a unix char* destination.
459 STR_TERMINATE means the string in src is null terminated.
460 STR_NOALIGN means don't try to align.
461 if STR_TERMINATE is set then src_len is ignored if it is -1.
462 src_len is the length of the source area in bytes
463 Return the number of bytes occupied by the string in src.
464 The resulting string in "dest" is always null terminated.
467 size_t pull_ucs2(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
471 if (dest_len == (size_t)-1)
472 dest_len = sizeof(pstring);
474 if (ucs2_align(NULL, src, flags)) {
475 src = (const void *)((const char *)src + 1);
480 if (flags & STR_TERMINATE) {
481 if (src_len == (size_t)-1) {
482 src_len = utf16_len(src);
484 src_len = utf16_len_n(src, src_len);
488 /* ucs2 is always a multiple of 2 bytes */
489 if (src_len != (size_t)-1)
492 ret = convert_string(CH_UTF16, CH_UNIX, src, src_len, dest, dest_len);
494 dest[MIN(ret, dest_len-1)] = 0;
499 ssize_t pull_ucs2_pstring(char *dest, const void *src)
501 return pull_ucs2(dest, src, sizeof(pstring), -1, STR_TERMINATE);
505 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
507 * @param dest always set at least to NULL
509 * @returns The number of bytes occupied by the string in the destination
512 ssize_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const void *src)
514 size_t src_len = utf16_len(src);
516 return convert_string_talloc(ctx, CH_UTF16, CH_UNIX, src, src_len, (void **)dest);
520 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
522 * @param dest always set at least to NULL
524 * @returns The number of bytes occupied by the string in the destination
527 ssize_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
529 size_t src_len = strlen(src)+1;
531 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
535 Copy a string from a char* src to a unicode or ascii
536 dos codepage destination choosing unicode or ascii based on the
537 flags in the SMB buffer starting at base_ptr.
538 Return the number of bytes occupied by the string in the destination.
540 STR_TERMINATE means include the null termination.
541 STR_UPPER means uppercase in the destination.
542 STR_ASCII use ascii even with unicode packet.
543 STR_NOALIGN means don't do alignment.
544 dest_len is the maximum length allowed in the destination. If dest_len
545 is -1 then no maximum is used.
548 ssize_t push_string(void *dest, const char *src, size_t dest_len, int flags)
550 if (flags & STR_ASCII) {
551 return push_ascii(dest, src, dest_len, flags);
552 } else if (flags & STR_UNICODE) {
553 return push_ucs2(dest, src, dest_len, flags);
555 smb_panic("push_string requires either STR_ASCII or STR_UNICODE flag to be set");
562 Copy a string from a unicode or ascii source (depending on
563 the packet flags) to a char* destination.
565 STR_TERMINATE means the string in src is null terminated.
566 STR_UNICODE means to force as unicode.
567 STR_ASCII use ascii even with unicode packet.
568 STR_NOALIGN means don't do alignment.
569 if STR_TERMINATE is set then src_len is ignored if it is -1
570 src_len is the length of the source area in bytes.
571 Return the number of bytes occupied by the string in src.
572 The resulting string in "dest" is always null terminated.
575 ssize_t pull_string(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
577 if (flags & STR_ASCII) {
578 return pull_ascii(dest, src, dest_len, src_len, flags);
579 } else if (flags & STR_UNICODE) {
580 return pull_ucs2(dest, src, dest_len, src_len, flags);
582 smb_panic("pull_string requires either STR_ASCII or STR_UNICODE flag to be set");
589 return the unicode codepoint for the next multi-byte CH_UNIX character
592 also return the number of bytes consumed (which tells the caller
593 how many bytes to skip to get to the next CH_UNIX character)
595 return INVALID_CODEPOINT if the next character cannot be converted
597 codepoint_t next_codepoint(const char *str, size_t *size)
599 /* it cannot occupy more than 4 bytes in UTF16 format */
601 smb_iconv_t descriptor;
607 if ((str[0] & 0x80) == 0) {
609 return (codepoint_t)str[0];
612 /* we assume that no multi-byte character can take
613 more than 5 bytes. This is OK as we only
614 support codepoints up to 1M */
615 ilen_orig = strnlen(str, 5);
618 descriptor = get_conv_handle(CH_UNIX, CH_UTF16);
619 if (descriptor == (smb_iconv_t)-1) {
621 return INVALID_CODEPOINT;
624 /* this looks a little strange, but it is needed to cope
625 with codepoints above 64k */
628 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
632 smb_iconv(descriptor, &str, &ilen, &outbuf, &olen);
634 /* we didn't convert any bytes */
636 return INVALID_CODEPOINT;
643 *size = ilen_orig - ilen;
646 return (codepoint_t)SVAL(buf, 0);
649 /* decode a 4 byte UTF16 character manually */
650 return (codepoint_t)0x10000 +
651 (buf[2] | ((buf[3] & 0x3)<<8) |
652 (buf[0]<<10) | ((buf[1] & 0x3)<<18));
655 /* no other length is valid */
656 return INVALID_CODEPOINT;
660 push a single codepoint into a CH_UNIX string. The target string must
661 be able to hold the full character, which is guaranteed if it is at
662 least 5 bytes in size. The caller may pass less than 5 bytes if they
663 are sure the character will fit (for example, you can assume that
664 uppercase/lowercase of a character will not add more than 1 byte)
666 return the number of bytes occupied by the CH_UNIX character, or
669 ssize_t push_codepoint(char *str, codepoint_t c)
671 smb_iconv_t descriptor;
681 descriptor = get_conv_handle(CH_UTF16, CH_UNIX);
682 if (descriptor == (smb_iconv_t)-1) {
691 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);
700 buf[0] = (c>>10) & 0xFF;
701 buf[1] = (c>>18) | 0xd8;
703 buf[3] = ((c>>8) & 0x3) | 0xdc;
709 smb_iconv(descriptor, &inbuf, &ilen, &str, &olen);