2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Martin Pool 2003
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
42 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
46 * Return the name of a charset to give to iconv().
48 static const char *charset_name(charset_t ch)
50 const char *ret = NULL;
52 if (ch == CH_UCS2) ret = "UCS-2LE";
53 else if (ch == CH_UNIX) ret = lp_unix_charset();
54 else if (ch == CH_DOS) ret = lp_dos_charset();
55 else if (ch == CH_DISPLAY) ret = lp_display_charset();
56 else if (ch == CH_UTF8) ret = "UTF8";
58 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
59 if (ret && strcasecmp(ret, "LOCALE") == 0) {
60 const char *ln = NULL;
63 setlocale(LC_ALL, "");
65 ln = nl_langinfo(CODESET);
67 /* Check whether the charset name is supported
69 smb_iconv_t handle = smb_iconv_open(ln,"UCS-2LE");
70 if (handle == (smb_iconv_t) -1) {
71 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
74 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
75 smb_iconv_close(handle);
82 if (!ret || !*ret) ret = "ASCII";
86 void lazy_initialize_conv(void)
88 static int initialized = False;
98 * Initialize iconv conversion descriptors.
100 * This is called the first time it is needed, and also called again
101 * every time the configuration is reloaded, because the charset or
102 * codepage might have changed.
104 void init_iconv(void)
107 BOOL did_reload = False;
109 /* so that charset_name() works we need to get the UNIX<->UCS2 going
111 if (!conv_handles[CH_UNIX][CH_UCS2])
112 conv_handles[CH_UNIX][CH_UCS2] = smb_iconv_open("UCS-2LE", "ASCII");
114 if (!conv_handles[CH_UCS2][CH_UNIX])
115 conv_handles[CH_UCS2][CH_UNIX] = smb_iconv_open("ASCII", "UCS-2LE");
117 for (c1=0;c1<NUM_CHARSETS;c1++) {
118 for (c2=0;c2<NUM_CHARSETS;c2++) {
119 const char *n1 = charset_name((charset_t)c1);
120 const char *n2 = charset_name((charset_t)c2);
121 if (conv_handles[c1][c2] &&
122 strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
123 strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
128 if (conv_handles[c1][c2])
129 smb_iconv_close(conv_handles[c1][c2]);
131 conv_handles[c1][c2] = smb_iconv_open(n2,n1);
132 if (conv_handles[c1][c2] == (smb_iconv_t)-1) {
133 DEBUG(0,("Conversion from %s to %s not supported\n",
134 charset_name((charset_t)c1), charset_name((charset_t)c2)));
135 conv_handles[c1][c2] = NULL;
141 /* XXX: Does this really get called every time the dos
142 * codepage changes? */
143 /* XXX: Is the did_reload test too strict? */
144 init_doschar_table();
150 * Convert string from one encoding to another, performing error checking etc
152 * @param src pointer to source string (multibyte or singlebyte)
153 * @param srclen length of the source string in bytes
154 * @param dest pointer to destination string (multibyte or singlebyte)
155 * @param destlen maximal length allowed for string
156 * @returns the number of bytes occupied in the destination
158 size_t convert_string(charset_t from, charset_t to,
159 void const *src, size_t srclen,
160 void *dest, size_t destlen)
164 const char* inbuf = (const char*)src;
165 char* outbuf = (char*)dest;
166 smb_iconv_t descriptor;
168 if (srclen == (size_t)-1)
169 srclen = strlen(src)+1;
171 lazy_initialize_conv();
173 descriptor = conv_handles[from][to];
175 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
176 /* conversion not supported, use as is */
177 size_t len = MIN(srclen,destlen);
178 memcpy(dest,src,len);
184 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
185 if(retval==(size_t)-1) {
186 const char *reason="unknown error";
189 reason="Incomplete multibyte sequence";
192 reason="No more room";
193 DEBUG(0, ("convert_string: Required %d, available %d\n",
195 /* we are not sure we need srclen bytes,
196 may be more, may be less.
197 We only know we need more than destlen
201 reason="Illegal multibyte sequence";
204 /* smb_panic(reason); */
206 return destlen-o_len;
210 * Convert between character sets, allocating a new buffer for the result.
212 * @param srclen length of source buffer.
213 * @param dest always set at least to NULL
214 * @note -1 is not accepted for srclen.
216 * @returns Size in bytes of the converted string; or -1 in case of error.
219 size_t convert_string_allocate(charset_t from, charset_t to,
220 void const *src, size_t srclen, void **dest)
222 size_t i_len, o_len, destlen;
224 const char *inbuf = (const char *)src;
226 smb_iconv_t descriptor;
230 if (src == NULL || srclen == (size_t)-1)
233 lazy_initialize_conv();
235 descriptor = conv_handles[from][to];
237 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
238 /* conversion not supported, return -1*/
239 DEBUG(3, ("convert_string_allocate: conversion not supported!\n"));
243 destlen = MAX(srclen, 512);
246 destlen = destlen * 2;
247 ob = (char *)Realloc(outbuf, destlen);
249 DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
257 retval = smb_iconv(descriptor,
260 if(retval == (size_t)-1) {
261 const char *reason="unknown error";
264 reason="Incomplete multibyte sequence";
269 reason="Illegal multibyte sequence";
272 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
273 /* smb_panic(reason); */
277 destlen = destlen - o_len;
278 *dest = (char *)Realloc(ob,destlen);
279 if (destlen && !*dest) {
280 DEBUG(0, ("convert_string_allocate: out of memory!\n"));
290 * Convert between character sets, allocating a new buffer using talloc for the result.
292 * @param srclen length of source buffer.
293 * @param dest always set at least to NULL
294 * @note -1 is not accepted for srclen.
296 * @returns Size in bytes of the converted string; or -1 in case of error.
298 static size_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
299 void const *src, size_t srclen, void **dest)
301 void *alloced_string;
304 /* FIXME: Ridiculous to allocate two buffers and then copy the string! */
307 dest_len=convert_string_allocate(from, to, src, srclen, &alloced_string);
308 if (dest_len == (size_t)-1)
310 *dest = talloc_memdup(ctx, alloced_string, dest_len);
311 SAFE_FREE(alloced_string);
317 size_t unix_strupper(const char *src, size_t srclen, char *dest, size_t destlen)
322 size = convert_string_allocate(CH_UNIX, CH_UCS2, src, srclen,
325 smb_panic("failed to create UCS2 buffer");
327 if (!strupper_w(buffer) && (dest == src)) {
332 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
337 size_t unix_strlower(const char *src, size_t srclen, char *dest, size_t destlen)
342 size = convert_string_allocate(CH_UNIX, CH_UCS2, src, srclen,
345 smb_panic("failed to create UCS2 buffer");
347 if (!strlower_w(buffer) && (dest == src)) {
351 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
357 static size_t ucs2_align(const void *base_ptr, const void *p, int flags)
359 if (flags & (STR_NOALIGN|STR_ASCII))
361 return PTR_DIFF(p, base_ptr) & 1;
366 * Copy a string from a char* unix src to a dos codepage string destination.
368 * @return the number of bytes occupied by the string in the destination.
370 * @param flags can include
372 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
373 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
376 * @param dest_len the maximum length in bytes allowed in the
377 * destination. If @p dest_len is -1 then no maximum is used.
379 size_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
381 size_t src_len = strlen(src);
384 /* treat a pstring as "unlimited" length */
385 if (dest_len == (size_t)-1)
386 dest_len = sizeof(pstring);
388 if (flags & STR_UPPER) {
389 pstrcpy(tmpbuf, src);
394 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
397 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
400 size_t push_ascii_fstring(void *dest, const char *src)
402 return push_ascii(dest, src, sizeof(fstring), STR_TERMINATE);
405 size_t push_ascii_pstring(void *dest, const char *src)
407 return push_ascii(dest, src, sizeof(pstring), STR_TERMINATE);
411 * Copy a string from a dos codepage source to a unix char* destination.
413 * The resulting string in "dest" is always null terminated.
415 * @param flags can have:
417 * <dt>STR_TERMINATE</dt>
418 * <dd>STR_TERMINATE means the string in @p src
419 * is null terminated, and src_len is ignored.</dd>
422 * @param src_len is the length of the source area in bytes.
423 * @returns the number of bytes occupied by the string in @p src.
425 size_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
429 if (dest_len == (size_t)-1)
430 dest_len = sizeof(pstring);
432 if (flags & STR_TERMINATE) {
433 if (src_len == (size_t)-1) {
434 src_len = strlen(src) + 1;
436 size_t len = strnlen(src, src_len);
443 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
446 dest[MIN(ret, dest_len-1)] = 0;
453 size_t pull_ascii_pstring(char *dest, const void *src)
455 return pull_ascii(dest, src, sizeof(pstring), -1, STR_TERMINATE);
458 size_t pull_ascii_fstring(char *dest, const void *src)
460 return pull_ascii(dest, src, sizeof(fstring), -1, STR_TERMINATE);
464 * Copy a string from a char* src to a unicode destination.
466 * @returns the number of bytes occupied by the string in the destination.
468 * @param flags can have:
471 * <dt>STR_TERMINATE <dd>means include the null termination.
472 * <dt>STR_UPPER <dd>means uppercase in the destination.
473 * <dt>STR_NOALIGN <dd>means don't do alignment.
476 * @param dest_len is the maximum length allowed in the
477 * destination. If dest_len is -1 then no maximum is used.
479 size_t push_ucs2(const void *base_ptr, void *dest, const char *src, size_t dest_len, int flags)
482 size_t src_len = strlen(src);
485 /* treat a pstring as "unlimited" length */
486 if (dest_len == (size_t)-1)
487 dest_len = sizeof(pstring);
489 if (flags & STR_UPPER) {
490 pstrcpy(tmpbuf, src);
495 if (flags & STR_TERMINATE)
498 if (ucs2_align(base_ptr, dest, flags)) {
500 dest = (void *)((char *)dest + 1);
501 if (dest_len) dest_len--;
505 /* ucs2 is always a multiple of 2 bytes */
508 len += convert_string(CH_UNIX, CH_UCS2, src, src_len, dest, dest_len);
514 * Copy a string from a unix char* src to a UCS2 destination,
515 * allocating a buffer using talloc().
517 * @param dest always set at least to NULL
519 * @returns The number of bytes occupied by the string in the destination
520 * or -1 in case of error.
522 size_t push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src)
524 size_t src_len = strlen(src)+1;
527 return convert_string_talloc(ctx, CH_UNIX, CH_UCS2, src, src_len, (void **)dest);
532 * Copy a string from a unix char* src to a UCS2 destination, allocating a buffer
534 * @param dest always set at least to NULL
536 * @returns The number of bytes occupied by the string in the destination
537 * or -1 in case of error.
540 size_t push_ucs2_allocate(smb_ucs2_t **dest, const char *src)
542 size_t src_len = strlen(src)+1;
545 return convert_string_allocate(CH_UNIX, CH_UCS2, src, src_len, (void **)dest);
549 Copy a string from a char* src to a UTF-8 destination.
550 Return the number of bytes occupied by the string in the destination
552 STR_TERMINATE means include the null termination
553 STR_UPPER means uppercase in the destination
554 dest_len is the maximum length allowed in the destination. If dest_len
555 is -1 then no maximum is used.
558 static size_t push_utf8(void *dest, const char *src, size_t dest_len, int flags)
560 size_t src_len = strlen(src);
563 /* treat a pstring as "unlimited" length */
564 if (dest_len == (size_t)-1)
565 dest_len = sizeof(pstring);
567 if (flags & STR_UPPER) {
568 pstrcpy(tmpbuf, src);
573 if (flags & STR_TERMINATE)
576 return convert_string(CH_UNIX, CH_UTF8, src, src_len, dest, dest_len);
579 size_t push_utf8_fstring(void *dest, const char *src)
581 return push_utf8(dest, src, sizeof(fstring), STR_TERMINATE);
585 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
587 * @param dest always set at least to NULL
589 * @returns The number of bytes occupied by the string in the destination
592 size_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
594 size_t src_len = strlen(src)+1;
597 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void**)dest);
601 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer
603 * @param dest always set at least to NULL
605 * @returns The number of bytes occupied by the string in the destination
608 size_t push_utf8_allocate(char **dest, const char *src)
610 size_t src_len = strlen(src)+1;
613 return convert_string_allocate(CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
617 Copy a string from a ucs2 source to a unix char* destination.
619 STR_TERMINATE means the string in src is null terminated.
620 STR_NOALIGN means don't try to align.
621 if STR_TERMINATE is set then src_len is ignored if it is -1.
622 src_len is the length of the source area in bytes
623 Return the number of bytes occupied by the string in src.
624 The resulting string in "dest" is always null terminated.
627 size_t pull_ucs2(const void *base_ptr, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
631 if (dest_len == (size_t)-1)
632 dest_len = sizeof(pstring);
634 if (ucs2_align(base_ptr, src, flags)) {
635 src = (const void *)((const char *)src + 1);
640 if (flags & STR_TERMINATE) {
641 if (src_len == (size_t)-1) {
642 src_len = strlen_w(src)*2 + 2;
644 size_t len = strnlen_w(src, src_len/2);
651 /* ucs2 is always a multiple of 2 bytes */
652 if (src_len != (size_t)-1)
655 ret = convert_string(CH_UCS2, CH_UNIX, src, src_len, dest, dest_len);
657 dest[MIN(ret, dest_len-1)] = 0;
664 size_t pull_ucs2_pstring(char *dest, const void *src)
666 return pull_ucs2(NULL, dest, src, sizeof(pstring), -1, STR_TERMINATE);
669 size_t pull_ucs2_fstring(char *dest, const void *src)
671 return pull_ucs2(NULL, dest, src, sizeof(fstring), -1, STR_TERMINATE);
675 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
677 * @param dest always set at least to NULL
679 * @returns The number of bytes occupied by the string in the destination
682 size_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const smb_ucs2_t *src)
684 size_t src_len = (strlen_w(src)+1) * sizeof(smb_ucs2_t);
686 return convert_string_talloc(ctx, CH_UCS2, CH_UNIX, src, src_len, (void **)dest);
690 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer
692 * @param dest always set at least to NULL
694 * @returns The number of bytes occupied by the string in the destination
697 size_t pull_ucs2_allocate(char **dest, const smb_ucs2_t *src)
699 size_t src_len = (strlen_w(src)+1) * sizeof(smb_ucs2_t);
701 return convert_string_allocate(CH_UCS2, CH_UNIX, src, src_len, (void **)dest);
705 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
707 * @param dest always set at least to NULL
709 * @returns The number of bytes occupied by the string in the destination
712 size_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
714 size_t src_len = strlen(src)+1;
716 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
720 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer
722 * @param dest always set at least to NULL
724 * @returns The number of bytes occupied by the string in the destination
727 size_t pull_utf8_allocate(void **dest, const char *src)
729 size_t src_len = strlen(src)+1;
731 return convert_string_allocate(CH_UTF8, CH_UNIX, src, src_len, dest);
735 Copy a string from a char* src to a unicode or ascii
736 dos codepage destination choosing unicode or ascii based on the
737 flags in the SMB buffer starting at base_ptr.
738 Return the number of bytes occupied by the string in the destination.
740 STR_TERMINATE means include the null termination.
741 STR_UPPER means uppercase in the destination.
742 STR_ASCII use ascii even with unicode packet.
743 STR_NOALIGN means don't do alignment.
744 dest_len is the maximum length allowed in the destination. If dest_len
745 is -1 then no maximum is used.
748 size_t push_string_fn(const char *function, unsigned int line, const void *base_ptr, void *dest, const char *src, size_t dest_len, int flags)
751 /* We really need to zero fill here, not clobber
752 * region, as we want to ensure that valgrind thinks
753 * all of the outgoing buffer has been written to
754 * so a send() or write() won't trap an error.
758 if (dest_len != (size_t)-1)
759 clobber_region(function, line, dest, dest_len);
761 if (dest_len != (size_t)-1)
762 memset(dest, '\0', dest_len);
766 if (!(flags & STR_ASCII) && \
767 ((flags & STR_UNICODE || \
768 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
769 return push_ucs2(base_ptr, dest, src, dest_len, flags);
771 return push_ascii(dest, src, dest_len, flags);
776 Copy a string from a unicode or ascii source (depending on
777 the packet flags) to a char* destination.
779 STR_TERMINATE means the string in src is null terminated.
780 STR_UNICODE means to force as unicode.
781 STR_ASCII use ascii even with unicode packet.
782 STR_NOALIGN means don't do alignment.
783 if STR_TERMINATE is set then src_len is ignored if it is -1
784 src_len is the length of the source area in bytes.
785 Return the number of bytes occupied by the string in src.
786 The resulting string in "dest" is always null terminated.
789 size_t pull_string_fn(const char *function, unsigned int line, const void *base_ptr, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
791 if (dest_len != (size_t)-1)
792 clobber_region(function, line, dest, dest_len);
794 if (!(flags & STR_ASCII) && \
795 ((flags & STR_UNICODE || \
796 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
797 return pull_ucs2(base_ptr, dest, src, dest_len, src_len, flags);
799 return pull_ascii(dest, src, dest_len, src_len, flags);
802 size_t align_string(const void *base_ptr, const char *p, int flags)
804 if (!(flags & STR_ASCII) && \
805 ((flags & STR_UNICODE || \
806 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
807 return ucs2_align(base_ptr, p, flags);
813 Convert from unix to ucs2 charset and return the
814 allocated and converted string or NULL if an error occurred.
815 You must provide a zero terminated string.
816 The returning string will be zero terminated.
819 smb_ucs2_t *acnv_uxu2(const char *src)
825 slen = strlen(src) + 1;
826 dlen = convert_string_allocate(CH_UNIX, CH_UCS2, src, slen, &dest);
827 if (dlen == (size_t)-1)
834 Convert from dos to ucs2 charset and return the
835 allocated and converted string or NULL if an error occurred.
836 You must provide a zero terminated string.
837 The returning string will be zero terminated.
840 smb_ucs2_t *acnv_dosu2(const char *src)
846 slen = strlen(src) + 1;
847 dlen = convert_string_allocate(CH_DOS, CH_UCS2, src, slen, &dest);
848 if (dlen == (size_t)-1)