source/lib/charset.c

   1 /*
   2    Unix SMB/Netbios implementation.
   3    Version 1.9.
   4    Character set handling
   5    Copyright (C) Andrew Tridgell 1992-1998
   6
   7    This program is free software; you can redistribute it and/or modify
   8    it under the terms of the GNU General Public License as published by
   9    the Free Software Foundation; either version 2 of the License, or
  10    (at your option) any later version.
  11
  12    This program is distributed in the hope that it will be useful,
  13    but WITHOUT ANY WARRANTY; without even the implied warranty of
  14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15    GNU General Public License for more details.
  16
  17    You should have received a copy of the GNU General Public License
  18    along with this program; if not, write to the Free Software
  19    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  20 */
  21
  22 #define CHARSET_C
  23 #include "includes.h"
  24
  25 extern int DEBUGLEVEL;
  26
  27 /*
  28  * Codepage definitions.
  29  */
  30
  31 #if !defined(KANJI)
  32 /* lower->upper mapping for IBM Code Page 850 - MS-DOS Latin 1 */
  33 unsigned char cp_850[][4] = {
  34 /* dec col/row oct hex  description */
  35 /* 133  08/05  205  85  a grave */
  36 /* 183  11/07  267  B7  A grave */      {0x85,0xB7,1,1},
  37 /* 160  10/00  240  A0  a acute */
  38 /* 181  11/05  265  B5  A acute */      {0xA0,0xB5,1,1},
  39 /* 131  08/03  203  83  a circumflex */
  40 /* 182  11/06  266  B6  A circumflex */ {0x83,0xB6,1,1},
  41 /* 198  12/06  306  C6  a tilde */
  42 /* 199  12/07  307  C7  A tilde */      {0xC6,0xC7,1,1},
  43 /* 132  08/04  204  84  a diaeresis */
  44 /* 142  08/14  216  8E  A diaeresis */  {0x84,0x8E,1,1},
  45 /* 134  08/06  206  86  a ring */
  46 /* 143  08/15  217  8F  A ring */       {0x86,0x8F,1,1},
  47 /* 145  09/01  221  91  ae diphthong */
  48 /* 146  09/02  222  92  AE diphthong */ {0x91,0x92,1,1},
  49 /* 135  08/07  207  87  c cedilla */
  50 /* 128  08/00  200  80  C cedilla */    {0x87,0x80,1,1},
  51 /* 138  08/10  212  8A  e grave */
  52 /* 212  13/04  324  D4  E grave */      {0x8A,0xD4,1,1},
  53 /* 130  08/02  202  82  e acute */
  54 /* 144  09/00  220  90  E acute */      {0x82,0x90,1,1},
  55 /* 136  08/08  210  88  e circumflex */
  56 /* 210  13/02  322  D2  E circumflex */ {0x88,0xD2,1,1},
  57 /* 137  08/09  211  89  e diaeresis */
  58 /* 211  13/03  323  D3  E diaeresis */  {0x89,0xD3,1,1},
  59 /* 141  08/13  215  8D  i grave */
  60 /* 222  13/14  336  DE  I grave */      {0x8D,0xDE,1,1},
  61 /* 161  10/01  241  A1  i acute */
  62 /* 214  13/06  326  D6  I acute */      {0xA1,0xD6,1,1},
  63 /* 140  08/12  214  8C  i circumflex */
  64 /* 215  13/07  327  D7  I circumflex */ {0x8C,0xD7,1,1},
  65 /* 139  08/11  213  8B  i diaeresis */
  66 /* 216  13/08  330  D8  I diaeresis */  {0x8B,0xD8,1,1},
  67 /* 208  13/00  320  D0  Icelandic eth */
  68 /* 209  13/01  321  D1  Icelandic Eth */ {0xD0,0xD1,1,1},
  69 /* 164  10/04  244  A4  n tilde */
  70 /* 165  10/05  245  A5  N tilde */      {0xA4,0xA5,1,1},
  71 /* 149  09/05  225  95  o grave */
  72 /* 227  14/03  343  E3  O grave */      {0x95,0xE3,1,1},
  73 /* 162  10/02  242  A2  o acute */
  74 /* 224  14/00  340  E0  O acute */      {0xA2,0xE0,1,1},
  75 /* 147  09/03  223  93  o circumflex */
  76 /* 226  14/02  342  E2  O circumflex */ {0x93,0xE2,1,1},
  77 /* 228  14/04  344  E4  o tilde */
  78 /* 229  14/05  345  E5  O tilde */      {0xE4,0xE5,1,1},
  79 /* 148  09/04  224  94  o diaeresis */
  80 /* 153  09/09  231  99  O diaeresis */  {0x94,0x99,1,1},
  81 /* 155  09/11  233  9B  o slash */
  82 /* 157  09/13  235  9D  O slash */      {0x9B,0x9D,1,1},
  83 /* 151  09/07  227  97  u grave */
  84 /* 235  14/11  353  EB  U grave */      {0x97,0xEB,1,1},
  85 /* 163  10/03  243  A3  u acute */
  86 /* 233  14/09  351  E9  U acute */      {0xA3,0xE9,1,1},
  87 /* 150  09/06  226  96  u circumflex */
  88 /* 234  14/10  352  EA  U circumflex */ {0x96,0xEA,1,1},
  89 /* 129  08/01  201  81  u diaeresis */
  90 /* 154  09/10  232  9A  U diaeresis */  {0x81,0x9A,1,1},
  91 /* 236  14/12  354  EC  y acute */
  92 /* 237  14/13  355  ED  Y acute */      {0xEC,0xED,1,1},
  93 /* 231  14/07  347  E7  Icelandic thorn */
  94 /* 232  14/08  350  E8  Icelandic Thorn */ {0xE7,0xE8,1,1},
  95
  96   {0x9C,0,0,0},     /* Pound        */
  97   {0,0,0,0}
  98 };
  99 #else /* KANJI */
 100 /* lower->upper mapping for IBM Code Page 932 - MS-DOS Japanese SJIS */
 101 unsigned char cp_932[][4] = {
 102   {0,0,0,0}
 103 };
 104 #endif /* KANJI */
 105
 106 char xx_dos_char_map[256];
 107 char xx_upper_char_map[256];
 108 char xx_lower_char_map[256];
 109
 110 char *dos_char_map = xx_dos_char_map;
 111 char *upper_char_map = xx_upper_char_map;
 112 char *lower_char_map = xx_lower_char_map;
 113
 114 /*
 115  * This code has been extended to deal with ascynchronous mappings
 116  * like MS-DOS Latin US (Code page 437) where things like :
 117  * a acute are capitalized to 'A', but the reverse mapping
 118  * must not hold true. This allows the filename case insensitive
 119  * matching in do_match() to work, as the DOS/Win95/NT client
 120  * uses 'A' as a mask to match against characters like a acute.
 121  * This is the meaning behind the parameters that allow a
 122  * mapping from lower to upper, but not upper to lower.
 123  */
 124
 125 static void add_dos_char(int lower, BOOL map_lower_to_upper,
 126                          int upper, BOOL map_upper_to_lower)
 127 {
 128   lower &= 0xff;
 129   upper &= 0xff;
 130   DEBUGADD( 6, ( "Adding chars 0x%x 0x%x (l->u = %s) (u->l = %s)\n",
 131                  lower, upper,
 132                  map_lower_to_upper ? "True" : "False",
 133                  map_upper_to_lower ? "True" : "False" ) );
 134   if (lower) dos_char_map[lower] = 1;
 135   if (upper) dos_char_map[upper] = 1;
 136   lower_char_map[lower] = (char)lower; /* Define tolower(lower) */
 137   upper_char_map[upper] = (char)upper; /* Define toupper(upper) */
 138   if (lower && upper) {
 139     if(map_upper_to_lower)
 140     lower_char_map[upper] = (char)lower;
 141     if(map_lower_to_upper)
 142     upper_char_map[lower] = (char)upper;
 143   }
 144 }
 145
 146 /****************************************************************************
 147 initialise the charset arrays
 148 ****************************************************************************/
 149 void charset_initialise(void)
 150 {
 151   int i;
 152
 153 #ifdef LC_ALL
 154   /* include <locale.h> in includes.h if available for OS                  */
 155   /* we take only standard 7-bit ASCII definitions from ctype              */
 156   setlocale(LC_ALL,"C");
 157 #endif
 158
 159   for (i= 0;i<=255;i++) {
 160     dos_char_map[i] = 0;
 161   }
 162
 163   for (i=0;i<=127;i++) {
 164     if (isalnum(i) || strchr("._^$~!#%&-{}()@'`",(char)i))
 165       add_dos_char(i,False,0,False);
 166   }
 167
 168   for (i=0; i<=255; i++) {
 169     char c = (char)i;
 170     upper_char_map[i] = lower_char_map[i] = c;
 171
 172     /* Some systems have buggy isupper/islower for characters
 173        above 127. Best not to rely on them. */
 174     if(i < 128) {
 175       if (isupper((int)c)) lower_char_map[i] = tolower(c);
 176       if (islower((int)c)) upper_char_map[i] = toupper(c);
 177     }
 178   }
 179 }
 180
 181 /****************************************************************************
 182 load the client codepage.
 183 ****************************************************************************/
 184
 185 typedef unsigned char (*codepage_p)[4];
 186
 187 static codepage_p load_client_codepage( int client_codepage )
 188 {
 189   pstring codepage_file_name;
 190   unsigned char buf[8];
 191   FILE *fp = NULL;
 192   SMB_OFF_T size;
 193   codepage_p cp_p = NULL;
 194   SMB_STRUCT_STAT st;
 195
 196   DEBUG(5, ("load_client_codepage: loading codepage %d.\n", client_codepage));
 197
 198   if(strlen(CODEPAGEDIR) + 14 > sizeof(codepage_file_name))
 199   {
 200     DEBUG(0,("load_client_codepage: filename too long to load\n"));
 201     return NULL;
 202   }
 203
 204   pstrcpy(codepage_file_name, CODEPAGEDIR);
 205   pstrcat(codepage_file_name, "/");
 206   pstrcat(codepage_file_name, "codepage.");
 207   slprintf(&codepage_file_name[strlen(codepage_file_name)],
 208            sizeof(pstring)-(strlen(codepage_file_name)+1),
 209            "%03d",
 210            client_codepage);
 211
 212   if(sys_stat(codepage_file_name,&st)!=0)
 213   {
 214     DEBUG(0,("load_client_codepage: filename %s does not exist.\n",
 215               codepage_file_name));
 216     return NULL;
 217   }
 218
 219   /* Check if it is at least big enough to hold the required
 220      data. Should be 2 byte version, 2 byte codepage, 4 byte length,
 221      plus zero or more bytes of data. Note that the data cannot be more
 222      than 4 * MAXCODEPAGELINES bytes.
 223    */
 224   size = st.st_size;
 225
 226   if( size < CODEPAGE_HEADER_SIZE || size > (CODEPAGE_HEADER_SIZE + 4 * MAXCODEPAGELINES))
 227   {
 228     DEBUG(0,("load_client_codepage: file %s is an incorrect size for a \
 229 code page file (size=%d).\n", codepage_file_name, (int)size));
 230     return NULL;
 231   }
 232
 233   /* Read the first 8 bytes of the codepage file - check
 234      the version number and code page number. All the data
 235      is held in little endian format.
 236    */
 237
 238   if((fp = sys_fopen( codepage_file_name, "r")) == NULL)
 239   {
 240     DEBUG(0,("load_client_codepage: cannot open file %s. Error was %s\n",
 241               codepage_file_name, strerror(errno)));
 242     return NULL;
 243   }
 244
 245   if(fread( buf, 1, CODEPAGE_HEADER_SIZE, fp)!=CODEPAGE_HEADER_SIZE)
 246   {
 247     DEBUG(0,("load_client_codepage: cannot read header from file %s. Error was %s\n",
 248               codepage_file_name, strerror(errno)));
 249     goto clean_and_exit;
 250   }
 251
 252   /* Check the version value */
 253   if(SVAL(buf,CODEPAGE_VERSION_OFFSET) != CODEPAGE_FILE_VERSION_ID)
 254   {
 255     DEBUG(0,("load_client_codepage: filename %s has incorrect version id. \
 256 Needed %hu, got %hu.\n",
 257           codepage_file_name, (uint16)CODEPAGE_FILE_VERSION_ID,
 258           SVAL(buf,CODEPAGE_VERSION_OFFSET)));
 259     goto clean_and_exit;
 260   }
 261
 262   /* Check the codepage matches */
 263   if(SVAL(buf,CODEPAGE_CLIENT_CODEPAGE_OFFSET) != (uint16)client_codepage)
 264   {
 265     DEBUG(0,("load_client_codepage: filename %s has incorrect codepage. \
 266 Needed %hu, got %hu.\n",
 267            codepage_file_name, (uint16)client_codepage,
 268            SVAL(buf,CODEPAGE_CLIENT_CODEPAGE_OFFSET)));
 269     goto clean_and_exit;
 270   }
 271
 272   /* Check the length is correct. */
 273   if(IVAL(buf,CODEPAGE_LENGTH_OFFSET) != (size - CODEPAGE_HEADER_SIZE))
 274   {
 275     DEBUG(0,("load_client_codepage: filename %s has incorrect size headers. \
 276 Needed %u, got %u.\n", codepage_file_name, (uint32)(size - CODEPAGE_HEADER_SIZE),
 277                IVAL(buf,CODEPAGE_LENGTH_OFFSET)));
 278     goto clean_and_exit;
 279   }
 280
 281   size -= CODEPAGE_HEADER_SIZE; /* Remove header */
 282
 283   /* Make sure the size is a multiple of 4. */
 284   if((size % 4 ) != 0)
 285   {
 286     DEBUG(0,("load_client_codepage: filename %s has a codepage size not a \
 287 multiple of 4.\n", codepage_file_name));
 288     goto clean_and_exit;
 289   }
 290
 291   /* Allocate space for the code page file and read it all in. */
 292   if((cp_p = (codepage_p)malloc( size  + 4 )) == NULL)
 293   {
 294     DEBUG(0,("load_client_codepage: malloc fail.\n"));
 295     goto clean_and_exit;
 296   }
 297
 298   if(fread( (char *)cp_p, 1, size, fp)!=size)
 299   {
 300     DEBUG(0,("load_client_codepage: read fail on file %s. Error was %s.\n",
 301               codepage_file_name, strerror(errno)));
 302     goto clean_and_exit;
 303   }
 304
 305   /* Ensure array is correctly terminated. */
 306   memset(((char *)cp_p) + size, '\0', 4);
 307
 308   fclose(fp);
 309   return cp_p;
 310
 311 clean_and_exit:
 312
 313   /* pseudo destructor :-) */
 314
 315   if(fp != NULL)
 316     fclose(fp);
 317   if(cp_p)
 318     free((char *)cp_p);
 319   return NULL;
 320 }
 321
 322 /****************************************************************************
 323  Initialise the client codepage.
 324 ****************************************************************************/
 325
 326 void codepage_initialise(int client_codepage)
 327 {
 328   int i;
 329   static codepage_p cp = NULL;
 330
 331   if(cp != NULL)
 332   {
 333     DEBUG(6,
 334       ("codepage_initialise: called twice - ignoring second client code page = %d\n",
 335       client_codepage));
 336     return;
 337   }
 338
 339   DEBUG(6,("codepage_initialise: client code page = %d\n", client_codepage));
 340
 341   /*
 342    * Known client codepages - these can be added to.
 343    */
 344   cp = load_client_codepage( client_codepage );
 345
 346   if(cp == NULL)
 347   {
 348 #ifdef KANJI
 349     DEBUG(6,("codepage_initialise: loading dynamic codepage file %s/codepage.%d \
 350 for code page %d failed. Using default client codepage 932\n",
 351              CODEPAGEDIR, client_codepage, client_codepage));
 352     cp = cp_932;
 353     client_codepage = KANJI_CODEPAGE;
 354 #else /* KANJI */
 355     DEBUG(6,("codepage_initialise: loading dynamic codepage file %s/codepage.%d \
 356 for code page %d failed. Using default client codepage 850\n",
 357              CODEPAGEDIR, client_codepage, client_codepage));
 358     cp = cp_850;
 359     client_codepage = MSDOS_LATIN_1_CODEPAGE;
 360 #endif /* KANJI */
 361   }
 362
 363   /*
 364    * Setup the function pointers for the loaded codepage.
 365    */
 366   initialize_multibyte_vectors( client_codepage );
 367
 368   if(cp)
 369   {
 370     for(i = 0; !((cp[i][0] == '\0') && (cp[i][1] == '\0')); i++)
 371       add_dos_char(cp[i][0], (BOOL)cp[i][2], cp[i][1], (BOOL)cp[i][3]);
 372   }
 373
 374   /* Try and load the unicode map. */
 375   load_dos_unicode_map(client_codepage);
 376 }
 377
 378 /*******************************************************************
 379 add characters depending on a string passed by the user
 380 ********************************************************************/
 381 void add_char_string(char *s)
 382 {
 383   char *extra_chars = (char *)strdup(s);
 384   char *t;
 385   if (!extra_chars) return;
 386
 387   for (t=strtok(extra_chars," \t\r\n"); t; t=strtok(NULL," \t\r\n")) {
 388     char c1=0,c2=0;
 389     int i1=0,i2=0;
 390     if (isdigit((unsigned char)*t) || (*t)=='-') {
 391       sscanf(t,"%i:%i",&i1,&i2);
 392       add_dos_char(i1,True,i2,True);
 393     } else {
 394       sscanf(t,"%c:%c",&c1,&c2);
 395       add_dos_char((unsigned char)c1,True,(unsigned char)c2, True);
 396     }
 397   }
 398
 399   free(extra_chars);
 400 }