2 Unix SMB/CIFS implementation.
3 Character set conversion Extensions
4 Copyright (C) Igor Vergeichik <iverg@mail.ru> 2001
5 Copyright (C) Andrew Tridgell 2001
6 Copyright (C) Simo Sorce 2001
7 Copyright (C) Martin Pool 2003
9 This program is free software; you can redistribute it and/or modify
10 it under the terms of the GNU General Public License as published by
11 the Free Software Foundation; either version 2 of the License, or
12 (at your option) any later version.
14 This program is distributed in the hope that it will be useful,
15 but WITHOUT ANY WARRANTY; without even the implied warranty of
16 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 GNU General Public License for more details.
19 You should have received a copy of the GNU General Public License
20 along with this program; if not, write to the Free Software
21 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
29 * @brief Character-set conversion routines built on our iconv.
31 * @note Samba's internal character set (at least in the 3.0 series)
32 * is always the same as the one for the Unix filesystem. It is
33 * <b>not</b> necessarily UTF-8 and may be different on machines that
34 * need i18n filenames to be compatible with Unix software. It does
35 * have to be a superset of ASCII. All multibyte sequences must start
36 * with a byte with the high bit set.
42 static smb_iconv_t conv_handles[NUM_CHARSETS][NUM_CHARSETS];
43 static BOOL conv_silent; /* Should we do a debug if the conversion fails ? */
46 * Return the name of a charset to give to iconv().
/*
 * charset_name: map a charset_t onto the charset name string that is later
 * handed to smb_iconv_open().  CH_UCS2 and CH_UTF8 use fixed names, while
 * CH_UNIX, CH_DOS and CH_DISPLAY come from the lp_*_charset() getters.
 * When nl_langinfo() and CODESET are available and the configured name is
 * LOCALE (compared case-insensitively), the codeset reported by the locale
 * is probed by opening a conversion to UCS-2LE.  A NULL or empty name is
 * finally replaced by ASCII.
 * NOTE(review): lines are missing from this listing (the embedded numbers
 * are not contiguous), so the braces, the branch taken after a failed
 * probe, the assignment of the locale name to ret and the final return are
 * not visible here -- confirm against the complete file.
 */
48 static const char *charset_name(charset_t ch)
50 const char *ret = NULL;
52 if (ch == CH_UCS2) ret = "UCS-2LE";
53 else if (ch == CH_UNIX) ret = lp_unix_charset();
54 else if (ch == CH_DOS) ret = lp_dos_charset();
55 else if (ch == CH_DISPLAY) ret = lp_display_charset();
56 else if (ch == CH_UTF8) ret = "UTF8";
58 #if defined(HAVE_NL_LANGINFO) && defined(CODESET)
59 if (ret && strcasecmp(ret, "LOCALE") == 0) {
60 const char *ln = NULL;
63 setlocale(LC_ALL, "");
65 ln = nl_langinfo(CODESET);
67 /* Check whether the charset name is supported
69 smb_iconv_t handle = smb_iconv_open(ln,"UCS-2LE");
70 if (handle == (smb_iconv_t) -1) {
71 DEBUG(5,("Locale charset '%s' unsupported, using ASCII instead\n", ln));
74 DEBUG(5,("Substituting charset '%s' for LOCALE\n", ln));
75 smb_iconv_close(handle);
82 if (!ret || !*ret) ret = "ASCII";
/*
 * lazy_initialize_conv: initialisation guard called by the conversion
 * routines before they read conv_handles.  The only visible state is a
 * function-local static flag that starts out False.
 * NOTE(review): the statements that test and set the flag were dropped
 * from this listing (embedded numbers jump from 88 to 98); presumably the
 * first call triggers init_iconv() -- confirm against the complete file.
 */
86 void lazy_initialize_conv(void)
88 static int initialized = False;
98 * Initialize iconv conversion descriptors.
100 * This is called the first time it is needed, and also called again
101 * every time the configuration is reloaded, because the charset or
102 * codepage might have changed.
/*
 * init_iconv: (re)build the conv_handles table.
 *  - The CH_UNIX/CH_UCS2 pair is first seeded with ASCII <-> UCS-2LE
 *    descriptors if empty, because charset_name() itself opens a
 *    conversion.
 *  - For every (c1, c2) pair the wanted names n1/n2 are compared with the
 *    from_name/to_name stored in the existing descriptor.
 *  - The visible code then closes any existing descriptor and reopens it
 *    with smb_iconv_open(n2, n1), i.e. destination name first.
 *  - A failed open is logged at level 0 and stored as NULL rather than -1.
 *  - init_doschar_table() is called near the end.
 * NOTE(review): lines are missing from this listing: the declarations of
 * c1 and c2, the statement taken when the names already match, every
 * update of did_reload and the condition guarding init_doschar_table()
 * are not visible -- confirm against the complete file.
 */
104 void init_iconv(void)
107 BOOL did_reload = False;
109 /* so that charset_name() works we need to get the UNIX<->UCS2 going
111 if (!conv_handles[CH_UNIX][CH_UCS2])
112 conv_handles[CH_UNIX][CH_UCS2] = smb_iconv_open("UCS-2LE", "ASCII");
114 if (!conv_handles[CH_UCS2][CH_UNIX])
115 conv_handles[CH_UCS2][CH_UNIX] = smb_iconv_open("ASCII", "UCS-2LE");
117 for (c1=0;c1<NUM_CHARSETS;c1++) {
118 for (c2=0;c2<NUM_CHARSETS;c2++) {
119 const char *n1 = charset_name((charset_t)c1);
120 const char *n2 = charset_name((charset_t)c2);
121 if (conv_handles[c1][c2] &&
122 strcmp(n1, conv_handles[c1][c2]->from_name) == 0 &&
123 strcmp(n2, conv_handles[c1][c2]->to_name) == 0)
128 if (conv_handles[c1][c2])
129 smb_iconv_close(conv_handles[c1][c2]);
131 conv_handles[c1][c2] = smb_iconv_open(n2,n1);
132 if (conv_handles[c1][c2] == (smb_iconv_t)-1) {
133 DEBUG(0,("Conversion from %s to %s not supported\n",
134 charset_name((charset_t)c1), charset_name((charset_t)c2)));
135 conv_handles[c1][c2] = NULL;
141 /* XXX: Does this really get called every time the dos
142 * codepage changes? */
143 /* XXX: Is the did_reload test too strict? */
145 init_doschar_table();
152 * Convert string from one encoding to another, making error checking etc
154 * @param src pointer to source string (multibyte or singlebyte)
155 * @param srclen length of the source string in bytes
156 * @param dest pointer to destination string (multibyte or singlebyte)
157 * @param destlen maximal length allowed for string
158 * @returns the number of bytes occupied in the destination
160 * Ensure the srclen contains the terminating zero.
/*
 * convert_string: convert srclen bytes at src from charset from to charset
 * to, writing at most destlen bytes into the caller-supplied dest.
 *  - A srclen of (size_t)-1 is replaced by strlen(src)+1, so the
 *    terminator is converted as well.
 *  - The descriptor is taken from conv_handles[from][to] after
 *    lazy_initialize_conv(); a descriptor of -1 or 0 is reported as an
 *    unsupported conversion.
 *  - After smb_iconv() the normal result is destlen - o_len, the number of
 *    bytes written.
 *  - The tail of the function is a raw memcpy of MIN(srclen, destlen)
 *    bytes, commented as the use-as-is path for unsupported conversions.
 * NOTE(review): lines are missing from this listing: the declarations of
 * i_len, o_len and retval, their initialisation, the selector that picks
 * each reason string, the tests that guard the DEBUG calls and the jumps
 * into the use-as-is tail are not visible -- confirm against the complete
 * file before relying on the error-path description.
 */
163 size_t convert_string(charset_t from, charset_t to,
164 void const *src, size_t srclen,
165 void *dest, size_t destlen)
169 const char* inbuf = (const char*)src;
170 char* outbuf = (char*)dest;
171 smb_iconv_t descriptor;
173 if (srclen == (size_t)-1)
174 srclen = strlen(src)+1;
178 lazy_initialize_conv();
180 descriptor = conv_handles[from][to];
182 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
184 DEBUG(0,("convert_string: Conversion not supported.\n"));
190 retval = smb_iconv(descriptor, &inbuf, &i_len, &outbuf, &o_len);
191 if(retval==(size_t)-1) {
192 const char *reason="unknown error";
195 reason="Incomplete multibyte sequence";
197 DEBUG(3,("convert_string: Conversion error: %s(%s)\n",reason,inbuf));
200 reason="No more room";
202 DEBUG(3, ("convert_string: Required %lu, available %lu\n",
203 (unsigned long)srclen, (unsigned long)destlen));
204 /* we are not sure we need srclen bytes,
205 may be more, may be less.
206 We only know we need more than destlen
210 reason="Illegal multibyte sequence";
212 DEBUG(3,("convert_string: Conversion error: %s(%s)\n",reason,inbuf));
216 DEBUG(0,("convert_string: Conversion error: %s(%s)\n",reason,inbuf));
219 /* smb_panic(reason); */
221 return destlen-o_len;
225 /* conversion not supported, use as is */
227 size_t len = MIN(srclen,destlen);
229 memcpy(dest,src,len);
235 * Convert between character sets, allocating a new buffer for the result.
237 * @param ctx TALLOC_CTX to use to allocate with. If NULL use malloc.
238 * @param srclen length of source buffer.
239 * @param dest always set at least to NULL
240 * @note -1 is not accepted for srclen.
242 * @returns Size in bytes of the converted string; or -1 in case of error.
244 * Ensure the srclen contains the terminating zero.
/*
 * convert_string_allocate: like convert_string() but the output buffer is
 * allocated here and handed back through *dest.
 *  - The working size starts at MAX(srclen, 512) and is doubled before each
 *    allocation; a doubling that wraps around size_t is reported and
 *    treated as an error.
 *  - Both talloc_realloc(ctx, ...) and Realloc(...) calls are visible;
 *    going by the ctx parameter documentation, talloc is presumably used
 *    when ctx is non-NULL.
 *  - After a successful smb_iconv() the buffer is shrunk to the bytes
 *    actually produced (destlen - o_len) and stored in *dest.
 *  - The tail of the function is the use-as-is path: the buffer is resized
 *    when srclen is non-zero and differs from destlen, and the source
 *    bytes are copied verbatim.
 *  - src == NULL and srclen == (size_t)-1 are rejected up front.
 * NOTE(review): lines are missing from this listing: the retval
 * declaration, every return statement, the ctx tests that choose between
 * talloc_realloc and Realloc, the retry taken when the output buffer is
 * too small, the cleanup on failure and the labels joining the paths are
 * not visible -- confirm against the complete file.
 */
247 size_t convert_string_allocate(TALLOC_CTX *ctx, charset_t from, charset_t to,
248 void const *src, size_t srclen, void **dest)
250 size_t i_len, o_len, destlen = MAX(srclen, 512);
252 const char *inbuf = (const char *)src;
253 char *outbuf = NULL, *ob = NULL;
254 smb_iconv_t descriptor;
258 if (src == NULL || srclen == (size_t)-1)
263 lazy_initialize_conv();
265 descriptor = conv_handles[from][to];
267 if (descriptor == (smb_iconv_t)-1 || descriptor == (smb_iconv_t)0) {
269 DEBUG(0,("convert_string_allocate: Conversion not supported.\n"));
274 if ((destlen*2) < destlen) {
275 /* wrapped ! abort. */
277 DEBUG(0, ("convert_string_allocate: destlen wrapped !\n"));
282 destlen = destlen * 2;
286 ob = (char *)talloc_realloc(ctx, ob, destlen);
288 ob = (char *)Realloc(ob, destlen);
291 DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
300 retval = smb_iconv(descriptor,
303 if(retval == (size_t)-1) {
304 const char *reason="unknown error";
307 reason="Incomplete multibyte sequence";
309 DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
314 reason="Illegal multibyte sequence";
316 DEBUG(3,("convert_string_allocate: Conversion error: %s(%s)\n",reason,inbuf));
320 DEBUG(0,("Conversion error: %s(%s)\n",reason,inbuf));
321 /* smb_panic(reason); */
325 destlen = destlen - o_len;
327 *dest = (char *)talloc_realloc(ctx,ob,destlen);
329 *dest = (char *)Realloc(ob,destlen);
330 if (destlen && !*dest) {
331 DEBUG(0, ("convert_string_allocate: out of memory!\n"));
341 /* conversion not supported, use as is */
343 if (srclen && (destlen != srclen)) {
346 ob = (char *)talloc_realloc(ctx, ob, destlen);
348 ob = (char *)Realloc(ob, destlen);
350 DEBUG(0, ("convert_string_allocate: realloc failed!\n"));
357 memcpy(ob,(const char *)src,srclen);
365 * Convert between character sets, allocating a new buffer using talloc for the result.
367 * @param srclen length of source buffer.
368 * @param dest always set at least to NULL
369 * @note -1 is not accepted for srclen.
371 * @returns Size in bytes of the converted string; or -1 in case of error.
/*
 * convert_string_talloc: file-local wrapper that forwards all of its
 * arguments to convert_string_allocate() and then checks the result
 * against (size_t)-1.
 * NOTE(review): the declaration of dest_len and both return statements
 * were dropped from this listing -- what is returned on each side of the
 * -1 test cannot be confirmed from here.
 */
373 static size_t convert_string_talloc(TALLOC_CTX *ctx, charset_t from, charset_t to,
374 void const *src, size_t srclen, void **dest)
379 dest_len=convert_string_allocate(ctx, from, to, src, srclen, dest);
380 if (dest_len == (size_t)-1)
/*
 * unix_strupper: upper-case a unix string via UCS2.  src is pushed into a
 * freshly allocated UCS2 buffer, strupper_w() is applied to it, and the
 * result is converted back from CH_UCS2 to CH_UNIX into dest (at most
 * destlen bytes).  smb_panic() is called with a buffer-creation message on
 * the failure path.  When strupper_w() returns zero and dest aliases src a
 * separate branch is taken.
 * NOTE(review): the local declarations, the failure test in front of
 * smb_panic(), the body of the aliasing branch, the release of the UCS2
 * buffer and the return statements are missing from this listing --
 * confirm against the complete file.
 */
387 size_t unix_strupper(const char *src, size_t srclen, char *dest, size_t destlen)
392 size = push_ucs2_allocate(&buffer, src);
394 smb_panic("failed to create UCS2 buffer");
396 if (!strupper_w(buffer) && (dest == src)) {
401 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
407 strdup() a unix string to upper case.
/*
 * strdup_upper: returns a newly allocated copy of s; the visible steps are
 * a push of s into an allocated UCS2 buffer followed by a pull of that
 * buffer back into an allocated unix string.
 * NOTE(review): the case-mapping call between the two conversions, the
 * error checks, the release of the intermediate buffer and the return
 * statements are missing from this listing -- confirm against the
 * complete file.
 */
410 char *strdup_upper(const char *s)
416 size = push_ucs2_allocate(&buffer, s);
423 size = pull_ucs2_allocate(&out_buffer, buffer);
/*
 * unix_strlower: lower-case a unix string via UCS2.  srclen bytes of src
 * are converted from CH_UNIX to CH_UCS2 into an allocated buffer (malloc
 * flavour, ctx is NULL), strlower_w() is applied, and the result is
 * converted back into dest (at most destlen bytes).  smb_panic() is called
 * with a buffer-creation message on the failure path.  When strlower_w()
 * returns zero and dest aliases src a separate branch is taken.
 * NOTE(review): the local declarations, the continuation of the
 * convert_string_allocate() call, the failure test, the body of the
 * aliasing branch, the release of the buffer and the return statements are
 * missing from this listing -- confirm against the complete file.
 */
433 size_t unix_strlower(const char *src, size_t srclen, char *dest, size_t destlen)
438 size = convert_string_allocate(NULL, CH_UNIX, CH_UCS2, src, srclen,
441 smb_panic("failed to create UCS2 buffer");
443 if (!strlower_w(buffer) && (dest == src)) {
447 size = convert_string(CH_UCS2, CH_UNIX, buffer, size, dest, destlen);
453 strdup() a unix string to lower case.
/*
 * strdup_lower: returns a newly allocated copy of s; the visible steps are
 * a push of s into an allocated UCS2 buffer followed by a pull of that
 * buffer back into an allocated unix string.
 * NOTE(review): the case-mapping call between the two conversions, the
 * error checks, the release of the intermediate buffer and the return
 * statements are missing from this listing -- confirm against the
 * complete file.
 */
456 char *strdup_lower(const char *s)
462 size = push_ucs2_allocate(&buffer, s);
469 size = pull_ucs2_allocate(&out_buffer, buffer);
479 static size_t ucs2_align(const void *base_ptr, const void *p, int flags)
481 if (flags & (STR_NOALIGN|STR_ASCII))
483 return PTR_DIFF(p, base_ptr) & 1;
488 * Copy a string from a char* unix src to a dos codepage string destination.
490 * @return the number of bytes occupied by the string in the destination.
492 * @param flags can include
494 * <dt>STR_TERMINATE</dt> <dd>means include the null termination</dd>
495 * <dt>STR_UPPER</dt> <dd>means uppercase in the destination</dd>
498 * @param dest_len the maximum length in bytes allowed in the
499 * destination. If @p dest_len is -1 then sizeof(pstring) is used as the maximum.
/*
 * push_ascii: convert the unix string src into the DOS codepage at dest.
 *  - A dest_len of (size_t)-1 is replaced by sizeof(pstring).
 *  - With STR_UPPER the source is first copied into tmpbuf with pstrcpy().
 *  - The result is whatever convert_string(CH_UNIX, CH_DOS, ...) returns
 *    for src_len source bytes, where src_len starts as strlen(src).
 * NOTE(review): lines are missing from this listing: the declaration of
 * tmpbuf, the upper-casing call and the re-pointing of src, and the
 * statement guarded by the STR_TERMINATE / STR_TERMINATE_ASCII test
 * (presumably it extends src_len to cover the terminator) are not visible
 * -- confirm against the complete file.
 */
501 size_t push_ascii(void *dest, const char *src, size_t dest_len, int flags)
503 size_t src_len = strlen(src);
506 /* treat a pstring as "unlimited" length */
507 if (dest_len == (size_t)-1)
508 dest_len = sizeof(pstring);
510 if (flags & STR_UPPER) {
511 pstrcpy(tmpbuf, src);
516 if (flags & (STR_TERMINATE | STR_TERMINATE_ASCII))
519 return convert_string(CH_UNIX, CH_DOS, src, src_len, dest, dest_len);
522 size_t push_ascii_fstring(void *dest, const char *src)
524 return push_ascii(dest, src, sizeof(fstring), STR_TERMINATE);
527 size_t push_ascii_pstring(void *dest, const char *src)
529 return push_ascii(dest, src, sizeof(pstring), STR_TERMINATE);
532 size_t push_ascii_nstring(void *dest, const char *src)
534 return push_ascii(dest, src, sizeof(nstring), STR_TERMINATE);
538 * Copy a string from a dos codepage source to a unix char* destination.
540 * The resulting string in "dest" is always null terminated.
542 * @param flags can have:
544 * <dt>STR_TERMINATE</dt>
545 * <dd>STR_TERMINATE means the string in @p src
546 * is null terminated, and src_len is ignored.</dd>
549 * @param src_len is the length of the source area in bytes.
550 * @returns the number of bytes occupied by the string in @p src.
/*
 * pull_ascii: convert a DOS codepage string at src into the unix charset
 * at dest and terminate the result.
 *  - A dest_len of (size_t)-1 is replaced by sizeof(pstring).
 *  - With STR_TERMINATE and src_len == (size_t)-1 the source length
 *    becomes strlen(src) + 1; with any other src_len the scan is bounded
 *    by strnlen(src, src_len).
 *  - After convert_string(CH_DOS, CH_UNIX, ...) a zero byte is stored at
 *    index MIN(ret, dest_len-1).
 * NOTE(review): lines are missing from this listing: the declaration of
 * ret, how the strnlen() result adjusts src_len, any guard for a zero
 * dest_len in front of the terminator store and the return statement are
 * not visible -- confirm against the complete file.
 */
552 size_t pull_ascii(char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
556 if (dest_len == (size_t)-1)
557 dest_len = sizeof(pstring);
559 if (flags & STR_TERMINATE) {
560 if (src_len == (size_t)-1) {
561 src_len = strlen(src) + 1;
563 size_t len = strnlen(src, src_len);
570 ret = convert_string(CH_DOS, CH_UNIX, src, src_len, dest, dest_len);
573 dest[MIN(ret, dest_len-1)] = 0;
580 size_t pull_ascii_pstring(char *dest, const void *src)
582 return pull_ascii(dest, src, sizeof(pstring), -1, STR_TERMINATE);
585 size_t pull_ascii_fstring(char *dest, const void *src)
587 return pull_ascii(dest, src, sizeof(fstring), -1, STR_TERMINATE);
590 size_t pull_ascii_nstring(char *dest, const void *src)
592 return pull_ascii(dest, src, sizeof(nstring), sizeof(nstring), STR_TERMINATE);
596 * Copy a string from a char* src to a unicode destination.
598 * @returns the number of bytes occupied by the string in the destination.
600 * @param flags can have:
603 * <dt>STR_TERMINATE <dd>means include the null termination.
604 * <dt>STR_UPPER <dd>means uppercase in the destination.
605 * <dt>STR_NOALIGN <dd>means don't do alignment.
608 * @param dest_len is the maximum length allowed in the
609 * destination. If dest_len is -1 then sizeof(pstring) is used as the maximum.
/*
 * push_ucs2: convert the unix string src into UCS2 at dest.
 *  - A dest_len of (size_t)-1 is replaced by sizeof(pstring).
 *  - When ucs2_align() reports an odd offset from base_ptr, dest is
 *    advanced by one byte and dest_len reduced by one (if non-zero).
 *  - The bytes written by convert_string(CH_UNIX, CH_UCS2, ...) are added
 *    to len.
 *  - With STR_UPPER the converted output is then walked as smb_ucs2_t
 *    units, up to dest_len / 2 units or the first zero unit, comparing
 *    each unit with toupper_w() of itself.
 * NOTE(review): lines are missing from this listing: the declarations of
 * len and i, the statement guarded by the STR_TERMINATE test, what the
 * alignment branch does to len and to the skipped byte, the rounding that
 * belongs to the multiple-of-2 comment, the store inside the upper-casing
 * loop and the return statement are not visible -- confirm against the
 * complete file.
 */
611 size_t push_ucs2(const void *base_ptr, void *dest, const char *src, size_t dest_len, int flags)
614 size_t src_len = strlen(src);
616 /* treat a pstring as "unlimited" length */
617 if (dest_len == (size_t)-1)
618 dest_len = sizeof(pstring);
620 if (flags & STR_TERMINATE)
623 if (ucs2_align(base_ptr, dest, flags)) {
625 dest = (void *)((char *)dest + 1);
626 if (dest_len) dest_len--;
630 /* ucs2 is always a multiple of 2 bytes */
633 len += convert_string(CH_UNIX, CH_UCS2, src, src_len, dest, dest_len);
635 if (flags & STR_UPPER) {
636 smb_ucs2_t *dest_ucs2 = dest;
638 for (i = 0; i < (dest_len / 2) && dest_ucs2[i]; i++) {
639 smb_ucs2_t v = toupper_w(dest_ucs2[i]);
640 if (v != dest_ucs2[i]) {
651 * Copy a string from a unix char* src to a UCS2 destination,
652 * allocating a buffer using talloc().
654 * @param dest always set at least to NULL
656 * @returns The number of bytes occupied by the string in the destination
657 * or -1 in case of error.
659 size_t push_ucs2_talloc(TALLOC_CTX *ctx, smb_ucs2_t **dest, const char *src)
661 size_t src_len = strlen(src)+1;
664 return convert_string_talloc(ctx, CH_UNIX, CH_UCS2, src, src_len, (void **)dest);
669 * Copy a string from a unix char* src to a UCS2 destination, allocating a buffer
671 * @param dest always set at least to NULL
673 * @returns The number of bytes occupied by the string in the destination
674 * or -1 in case of error.
677 size_t push_ucs2_allocate(smb_ucs2_t **dest, const char *src)
679 size_t src_len = strlen(src)+1;
682 return convert_string_allocate(NULL, CH_UNIX, CH_UCS2, src, src_len, (void **)dest);
686 Copy a string from a char* src to a UTF-8 destination.
687 Return the number of bytes occupied by the string in the destination
689 STR_TERMINATE means include the null termination
690 STR_UPPER means uppercase in the destination
691 dest_len is the maximum length allowed in the destination. If dest_len
692 is -1 then sizeof(pstring) is used as the maximum.
/*
 * push_utf8: convert the unix string src into UTF-8 at dest.
 *  - A dest_len of (size_t)-1 is replaced by sizeof(pstring).
 *  - With STR_UPPER the source is first copied into tmpbuf with pstrcpy().
 *  - The result is whatever convert_string(CH_UNIX, CH_UTF8, ...) returns
 *    for src_len source bytes, where src_len starts as strlen(src).
 * NOTE(review): lines are missing from this listing: the declaration of
 * tmpbuf, the upper-casing call and the re-pointing of src, and the
 * statement guarded by the STR_TERMINATE test (presumably it extends
 * src_len to cover the terminator) are not visible -- confirm against the
 * complete file.
 */
695 static size_t push_utf8(void *dest, const char *src, size_t dest_len, int flags)
697 size_t src_len = strlen(src);
700 /* treat a pstring as "unlimited" length */
701 if (dest_len == (size_t)-1)
702 dest_len = sizeof(pstring);
704 if (flags & STR_UPPER) {
705 pstrcpy(tmpbuf, src);
710 if (flags & STR_TERMINATE)
713 return convert_string(CH_UNIX, CH_UTF8, src, src_len, dest, dest_len);
716 size_t push_utf8_fstring(void *dest, const char *src)
718 return push_utf8(dest, src, sizeof(fstring), STR_TERMINATE);
722 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer using talloc
724 * @param dest always set at least to NULL
726 * @returns The number of bytes occupied by the string in the destination
729 size_t push_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
731 size_t src_len = strlen(src)+1;
734 return convert_string_talloc(ctx, CH_UNIX, CH_UTF8, src, src_len, (void**)dest);
738 * Copy a string from a unix char* src to a UTF-8 destination, allocating a buffer
740 * @param dest always set at least to NULL
742 * @returns The number of bytes occupied by the string in the destination
745 size_t push_utf8_allocate(char **dest, const char *src)
747 size_t src_len = strlen(src)+1;
750 return convert_string_allocate(NULL, CH_UNIX, CH_UTF8, src, src_len, (void **)dest);
754 Copy a string from a ucs2 source to a unix char* destination.
756 STR_TERMINATE means the string in src is null terminated.
757 STR_NOALIGN means don't try to align.
758 if STR_TERMINATE is set then src_len is ignored if it is -1.
759 src_len is the length of the source area in bytes
760 Return the number of bytes occupied by the string in src.
761 The resulting string in "dest" is always null terminated.
/*
 * pull_ucs2: convert a UCS2 string at src into the unix charset at dest
 * and terminate the result.
 *  - A dest_len of (size_t)-1 is replaced by sizeof(pstring).
 *  - When ucs2_align() reports an odd offset from base_ptr, src is
 *    advanced by one byte.
 *  - With STR_TERMINATE and src_len == (size_t)-1 the source length
 *    becomes strlen_w(src)*2 + 2 bytes; with any other src_len the scan is
 *    bounded by strnlen_w(src, src_len/2).
 *  - After convert_string(CH_UCS2, CH_UNIX, ...) a zero byte is stored at
 *    index MIN(ret, dest_len-1).
 * NOTE(review): lines are missing from this listing: the declaration of
 * ret, whether the alignment branch also shortens src_len, how the
 * strnlen_w() result adjusts src_len, the rounding guarded by the
 * src_len != -1 test, any guard for a zero dest_len in front of the
 * terminator store and the return statement are not visible -- confirm
 * against the complete file.
 */
764 size_t pull_ucs2(const void *base_ptr, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
768 if (dest_len == (size_t)-1)
769 dest_len = sizeof(pstring);
771 if (ucs2_align(base_ptr, src, flags)) {
772 src = (const void *)((const char *)src + 1);
777 if (flags & STR_TERMINATE) {
778 if (src_len == (size_t)-1) {
779 src_len = strlen_w(src)*2 + 2;
781 size_t len = strnlen_w(src, src_len/2);
788 /* ucs2 is always a multiple of 2 bytes */
789 if (src_len != (size_t)-1)
792 ret = convert_string(CH_UCS2, CH_UNIX, src, src_len, dest, dest_len);
794 dest[MIN(ret, dest_len-1)] = 0;
801 size_t pull_ucs2_pstring(char *dest, const void *src)
803 return pull_ucs2(NULL, dest, src, sizeof(pstring), -1, STR_TERMINATE);
806 size_t pull_ucs2_fstring(char *dest, const void *src)
808 return pull_ucs2(NULL, dest, src, sizeof(fstring), -1, STR_TERMINATE);
812 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer using talloc
814 * @param dest always set at least to NULL
816 * @returns The number of bytes occupied by the string in the destination
819 size_t pull_ucs2_talloc(TALLOC_CTX *ctx, char **dest, const smb_ucs2_t *src)
821 size_t src_len = (strlen_w(src)+1) * sizeof(smb_ucs2_t);
823 return convert_string_talloc(ctx, CH_UCS2, CH_UNIX, src, src_len, (void **)dest);
827 * Copy a string from a UCS2 src to a unix char * destination, allocating a buffer
829 * @param dest always set at least to NULL
831 * @returns The number of bytes occupied by the string in the destination
834 size_t pull_ucs2_allocate(char **dest, const smb_ucs2_t *src)
836 size_t src_len = (strlen_w(src)+1) * sizeof(smb_ucs2_t);
838 return convert_string_allocate(NULL, CH_UCS2, CH_UNIX, src, src_len, (void **)dest);
842 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer using talloc
844 * @param dest always set at least to NULL
846 * @returns The number of bytes occupied by the string in the destination
849 size_t pull_utf8_talloc(TALLOC_CTX *ctx, char **dest, const char *src)
851 size_t src_len = strlen(src)+1;
853 return convert_string_talloc(ctx, CH_UTF8, CH_UNIX, src, src_len, (void **)dest);
857 * Copy a string from a UTF-8 src to a unix char * destination, allocating a buffer
859 * @param dest always set at least to NULL
861 * @returns The number of bytes occupied by the string in the destination
864 size_t pull_utf8_allocate(void **dest, const char *src)
866 size_t src_len = strlen(src)+1;
868 return convert_string_allocate(NULL, CH_UTF8, CH_UNIX, src, src_len, dest);
872 Copy a string from a char* src to a unicode or ascii
873 dos codepage destination choosing unicode or ascii based on the
874 flags in the SMB buffer starting at base_ptr.
875 Return the number of bytes occupied by the string in the destination.
877 STR_TERMINATE means include the null termination.
878 STR_UPPER means uppercase in the destination.
879 STR_ASCII use ascii even with unicode packet.
880 STR_NOALIGN means don't do alignment.
881 dest_len is the maximum length allowed in the destination. If dest_len
882 is -1 then sizeof(pstring) is used as the maximum.
/*
 * push_string_fn: write src into dest as either UCS2 or DOS codepage text.
 * push_ucs2() is chosen when STR_ASCII is not set and either STR_UNICODE
 * is set or the flags2 word read from base_ptr with SVAL() has
 * FLAGS2_UNICODE_STRINGS; otherwise push_ascii() is used.  Before that,
 * when dest_len is not (size_t)-1, the destination is either handed to
 * clobber_region() or zero filled with memset().
 * NOTE(review): the preprocessor lines that select between the
 * clobber_region() and memset() variants, the end of the explanatory
 * comment and the braces are missing from this listing, so which variant
 * is compiled in cannot be confirmed from here.
 */
885 size_t push_string_fn(const char *function, unsigned int line, const void *base_ptr, void *dest, const char *src, size_t dest_len, int flags)
888 /* We really need to zero fill here, not clobber
889 * region, as we want to ensure that valgrind thinks
890 * all of the outgoing buffer has been written to
891 * so a send() or write() won't trap an error.
895 if (dest_len != (size_t)-1)
896 clobber_region(function, line, dest, dest_len);
898 if (dest_len != (size_t)-1)
899 memset(dest, '\0', dest_len);
903 if (!(flags & STR_ASCII) && \
904 ((flags & STR_UNICODE || \
905 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
906 return push_ucs2(base_ptr, dest, src, dest_len, flags);
908 return push_ascii(dest, src, dest_len, flags);
913 Copy a string from a unicode or ascii source (depending on
914 the packet flags) to a char* destination.
916 STR_TERMINATE means the string in src is null terminated.
917 STR_UNICODE means to force as unicode.
918 STR_ASCII use ascii even with unicode packet.
919 STR_NOALIGN means don't do alignment.
920 if STR_TERMINATE is set then src_len is ignored if it is -1
921 src_len is the length of the source area in bytes.
922 Return the number of bytes occupied by the string in src.
923 The resulting string in "dest" is always null terminated.
926 size_t pull_string_fn(const char *function, unsigned int line, const void *base_ptr, char *dest, const void *src, size_t dest_len, size_t src_len, int flags)
928 if (dest_len != (size_t)-1)
929 clobber_region(function, line, dest, dest_len);
931 if (!(flags & STR_ASCII) && \
932 ((flags & STR_UNICODE || \
933 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
934 return pull_ucs2(base_ptr, dest, src, dest_len, src_len, flags);
936 return pull_ascii(dest, src, dest_len, src_len, flags);
939 size_t align_string(const void *base_ptr, const char *p, int flags)
941 if (!(flags & STR_ASCII) && \
942 ((flags & STR_UNICODE || \
943 (SVAL(base_ptr, smb_flg2) & FLAGS2_UNICODE_STRINGS)))) {
944 return ucs2_align(base_ptr, p, flags);