lib/util/charset/util_str.c

   1 /*
   2    Unix SMB/CIFS implementation.
   3    Samba utility functions
   4    Copyright (C) Andrew Tridgell 1992-2001
   5    Copyright (C) Simo Sorce 2001
   6    Copyright (C) Andrew Bartlett 2011
   7    Copyright (C) Jeremy Allison  1992-2007
   8
   9    This program is free software; you can redistribute it and/or modify
  10    it under the terms of the GNU General Public License as published by
  11    the Free Software Foundation; either version 3 of the License, or
  12    (at your option) any later version.
  13
  14    This program is distributed in the hope that it will be useful,
  15    but WITHOUT ANY WARRANTY; without even the implied warranty of
  16    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17    GNU General Public License for more details.
  18
  19    You should have received a copy of the GNU General Public License
  20    along with this program.  If not, see <http://www.gnu.org/licenses/>.
  21 */
  22
#include "includes.h"
#include "system/locale.h"

#ifdef strcasecmp
#undef strcasecmp
#endif

#ifdef strncasecmp
#undef strncasecmp
#endif
  29
  30 /**
  31  Case insensitive string compararison
  32 **/
  33 _PUBLIC_ int strcasecmp_m(const char *s1, const char *s2)
  34 {
  35         codepoint_t c1=0, c2=0;
  36         size_t size1, size2;
  37         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  38
  39         /* handle null ptr comparisons to simplify the use in qsort */
  40         if (s1 == s2) return 0;
  41         if (s1 == NULL) return -1;
  42         if (s2 == NULL) return 1;
  43
  44         while (*s1 && *s2) {
  45                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  46                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  47
  48                 s1 += size1;
  49                 s2 += size2;
  50
  51                 if (c1 == c2) {
  52                         continue;
  53                 }
  54
  55                 if (c1 == INVALID_CODEPOINT ||
  56                     c2 == INVALID_CODEPOINT) {
  57                         /* what else can we do?? */
  58                         return strcasecmp(s1, s2);
  59                 }
  60
  61                 if (toupper_m(c1) != toupper_m(c2)) {
  62                         return c1 - c2;
  63                 }
  64         }
  65
  66         return *s1 - *s2;
  67 }
  68
  69 /**
  70  Case insensitive string compararison, length limited
  71 **/
  72 _PUBLIC_ int strncasecmp_m(const char *s1, const char *s2, size_t n)
  73 {
  74         codepoint_t c1=0, c2=0;
  75         size_t size1, size2;
  76         struct smb_iconv_handle *iconv_handle = get_iconv_handle();
  77
  78         /* handle null ptr comparisons to simplify the use in qsort */
  79         if (s1 == s2) return 0;
  80         if (s1 == NULL) return -1;
  81         if (s2 == NULL) return 1;
  82
  83         while (*s1 && *s2 && n) {
  84                 n--;
  85
  86                 c1 = next_codepoint_handle(iconv_handle, s1, &size1);
  87                 c2 = next_codepoint_handle(iconv_handle, s2, &size2);
  88
  89                 s1 += size1;
  90                 s2 += size2;
  91
  92                 if (c1 == c2) {
  93                         continue;
  94                 }
  95
  96                 if (c1 == INVALID_CODEPOINT ||
  97                     c2 == INVALID_CODEPOINT) {
  98                         /* what else can we do?? */
  99                         return strcasecmp(s1, s2);
 100                 }
 101
 102                 if (toupper_m(c1) != toupper_m(c2)) {
 103                         return c1 - c2;
 104                 }
 105         }
 106
 107         if (n == 0) {
 108                 return 0;
 109         }
 110
 111         return *s1 - *s2;
 112 }
 113
 114 /**
 115  * Compare 2 strings.
 116  *
 117  * @note The comparison is case-insensitive.
 118  **/
 119 _PUBLIC_ bool strequal_m(const char *s1, const char *s2)
 120 {
 121         return strcasecmp_m(s1,s2) == 0;
 122 }
 123
 124 /**
 125  Compare 2 strings (case sensitive).
 126 **/
 127 _PUBLIC_ bool strcsequal(const char *s1,const char *s2)
 128 {
 129         if (s1 == s2)
 130                 return true;
 131         if (!s1 || !s2)
 132                 return false;
 133
 134         return strcmp(s1,s2) == 0;
 135 }
 136
 137 /**
 138  * Calculate the number of units (8 or 16-bit, depending on the
 139  * destination charset), that would be needed to convert the input
 140  * string which is expected to be in in src_charset encoding to the
 141  * destination charset (which should be a unicode charset).
 142  */
 143 _PUBLIC_ size_t strlen_m_ext_handle(struct smb_iconv_handle *ic,
 144                                     const char *s, charset_t src_charset, charset_t dst_charset)
 145 {
 146         size_t count = 0;
 147
 148         if (!s) {
 149                 return 0;
 150         }
 151
 152         while (*s && !(((uint8_t)*s) & 0x80)) {
 153                 s++;
 154                 count++;
 155         }
 156
 157         if (!*s) {
 158                 return count;
 159         }
 160
 161         while (*s) {
 162                 size_t c_size;
 163                 codepoint_t c = next_codepoint_handle_ext(ic, s, src_charset, &c_size);
 164                 s += c_size;
 165
 166                 switch (dst_charset) {
 167                 case CH_UTF16LE:
 168                 case CH_UTF16BE:
 169                 case CH_UTF16MUNGED:
 170                         if (c < 0x10000) {
 171                                 /* Unicode char fits into 16 bits. */
 172                                 count += 1;
 173                         } else {
 174                                 /* Double-width unicode char - 32 bits. */
 175                                 count += 2;
 176                         }
 177                         break;
 178                 case CH_UTF8:
 179                         /*
 180                          * this only checks ranges, and does not
 181                          * check for invalid codepoints
 182                          */
 183                         if (c < 0x80) {
 184                                 count += 1;
 185                         } else if (c < 0x800) {
 186                                 count += 2;
 187                         } else if (c < 0x10000) {
 188                                 count += 3;
 189                         } else {
 190                                 count += 4;
 191                         }
 192                         break;
 193                 default:
 194                         /*
 195                          * non-unicode encoding:
 196                          * assume that each codepoint fits into
 197                          * one unit in the destination encoding.
 198                          */
 199                         count += 1;
 200                 }
 201         }
 202
 203         return count;
 204 }
 205
 206 /**
 207  * Calculate the number of units (8 or 16-bit, depending on the
 208  * destination charset), that would be needed to convert the input
 209  * string which is expected to be in in src_charset encoding to the
 210  * destination charset (which should be a unicode charset).
 211  */
 212 _PUBLIC_ size_t strlen_m_ext(const char *s, charset_t src_charset, charset_t dst_charset)
 213 {
 214         struct smb_iconv_handle *ic = get_iconv_handle();
 215         return strlen_m_ext_handle(ic, s, src_charset, dst_charset);
 216 }
 217
 218 _PUBLIC_ size_t strlen_m_ext_term(const char *s, const charset_t src_charset,
 219                                   const charset_t dst_charset)
 220 {
 221         if (!s) {
 222                 return 0;
 223         }
 224         return strlen_m_ext(s, src_charset, dst_charset) + 1;
 225 }
 226
 227 /**
 228  * Calculate the number of 16-bit units that would be needed to convert
 229  * the input string which is expected to be in CH_UNIX encoding to UTF16.
 230  *
 231  * This will be the same as the number of bytes in a string for single
 232  * byte strings, but will be different for multibyte.
 233  */
 234 _PUBLIC_ size_t strlen_m(const char *s)
 235 {
 236         return strlen_m_ext(s, CH_UNIX, CH_UTF16LE);
 237 }
 238
 239 /**
 240    Work out the number of multibyte chars in a string, including the NULL
 241    terminator.
 242 **/
 243 _PUBLIC_ size_t strlen_m_term(const char *s)
 244 {
 245         if (!s) {
 246                 return 0;
 247         }
 248
 249         return strlen_m(s) + 1;
 250 }
 251
 252 /*
 253  * Weird helper routine for the winreg pipe: If nothing is around, return 0,
 254  * if a string is there, include the terminator.
 255  */
 256
 257 _PUBLIC_ size_t strlen_m_term_null(const char *s)
 258 {
 259         size_t len;
 260         if (!s) {
 261                 return 0;
 262         }
 263         len = strlen_m(s);
 264         if (len == 0) {
 265                 return 0;
 266         }
 267
 268         return len+1;
 269 }
 270
 271 /**
 272  Strchr and strrchr_m are a bit complex on general multi-byte strings.
 273 **/
 274 _PUBLIC_ char *strchr_m(const char *src, char c)
 275 {
 276         const char *s;
 277         struct smb_iconv_handle *ic = get_iconv_handle();
 278         if (src == NULL) {
 279                 return NULL;
 280         }
 281         /* characters below 0x3F are guaranteed to not appear in
 282            non-initial position in multi-byte charsets */
 283         if ((c & 0xC0) == 0) {
 284                 return strchr(src, c);
 285         }
 286
 287         /* this is quite a common operation, so we want it to be
 288            fast. We optimise for the ascii case, knowing that all our
 289            supported multi-byte character sets are ascii-compatible
 290            (ie. they match for the first 128 chars) */
 291
 292         for (s = src; *s && !(((unsigned char)s[0]) & 0x80); s++) {
 293                 if (*s == c)
 294                         return (char *)s;
 295         }
 296
 297         if (!*s)
 298                 return NULL;
 299
 300 #ifdef BROKEN_UNICODE_COMPOSE_CHARACTERS
 301         /* With compose characters we must restart from the beginning. JRA. */
 302         s = src;
 303 #endif
 304
 305         while (*s) {
 306                 size_t size;
 307                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 308                 if (c2 == c) {
 309                         return discard_const_p(char, s);
 310                 }
 311                 s += size;
 312         }
 313
 314         return NULL;
 315 }
 316
 317 /**
 318  * Multibyte-character version of strrchr
 319  */
 320 _PUBLIC_ char *strrchr_m(const char *s, char c)
 321 {
 322         struct smb_iconv_handle *ic = get_iconv_handle();
 323         char *ret = NULL;
 324
 325         if (s == NULL) {
 326                 return NULL;
 327         }
 328
 329         /* characters below 0x3F are guaranteed to not appear in
 330            non-initial position in multi-byte charsets */
 331         if ((c & 0xC0) == 0) {
 332                 return strrchr(s, c);
 333         }
 334
 335         /* this is quite a common operation, so we want it to be
 336            fast. We optimise for the ascii case, knowing that all our
 337            supported multi-byte character sets are ascii-compatible
 338            (ie. they match for the first 128 chars). Also, in Samba
 339            we only search for ascii characters in 'c' and that
 340            in all mb character sets with a compound character
 341            containing c, if 'c' is not a match at position
 342            p, then p[-1] > 0x7f. JRA. */
 343
 344         {
 345                 size_t len = strlen(s);
 346                 const char *cp = s;
 347                 bool got_mb = false;
 348
 349                 if (len == 0)
 350                         return NULL;
 351                 cp += (len - 1);
 352                 do {
 353                         if (c == *cp) {
 354                                 /* Could be a match. Part of a multibyte ? */
 355                                 if ((cp > s) &&
 356                                         (((unsigned char)cp[-1]) & 0x80)) {
 357                                         /* Yep - go slow :-( */
 358                                         got_mb = true;
 359                                         break;
 360                                 }
 361                                 /* No - we have a match ! */
 362                                 return (char *)cp;
 363                         }
 364                 } while (cp-- != s);
 365                 if (!got_mb)
 366                         return NULL;
 367         }
 368
 369         while (*s) {
 370                 size_t size;
 371                 codepoint_t c2 = next_codepoint_handle(ic, s, &size);
 372                 if (c2 == c) {
 373                         ret = discard_const_p(char, s);
 374                 }
 375                 s += size;
 376         }
 377
 378         return ret;
 379 }
 380
 381 /**
 382   return True if any (multi-byte) character is lower case
 383 */
 384 _PUBLIC_ bool strhaslower(const char *string)
 385 {
 386         struct smb_iconv_handle *ic = get_iconv_handle();
 387         while (*string) {
 388                 size_t c_size;
 389                 codepoint_t s;
 390                 codepoint_t t;
 391
 392                 s = next_codepoint_handle(ic, string, &c_size);
 393                 string += c_size;
 394
 395                 t = toupper_m(s);
 396
 397                 if (s != t) {
 398                         return true; /* that means it has lower case chars */
 399                 }
 400         }
 401
 402         return false;
 403 }
 404
 405 /**
 406   return True if any (multi-byte) character is upper case
 407 */
 408 _PUBLIC_ bool strhasupper(const char *string)
 409 {
 410         struct smb_iconv_handle *ic = get_iconv_handle();
 411         while (*string) {
 412                 size_t c_size;
 413                 codepoint_t s;
 414                 codepoint_t t;
 415
 416                 s = next_codepoint_handle(ic, string, &c_size);
 417                 string += c_size;
 418
 419                 t = tolower_m(s);
 420
 421                 if (s != t) {
 422                         return true; /* that means it has upper case chars */
 423                 }
 424         }
 425
 426         return false;
 427 }
 428