[BZ #4896, BZ #4936]
[jlayton/glibc.git] / iconvdata / johab.c
1 /* Mapping tables for JOHAB handling.
2    Copyright (C) 1998, 1999, 2000-2002, 2007 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Jungshik Shin <jshin@pantheon.yale.edu>
5    and Ulrich Drepper <drepper@cygnus.com>, 1998.
6
7    The GNU C Library is free software; you can redistribute it and/or
8    modify it under the terms of the GNU Lesser General Public
9    License as published by the Free Software Foundation; either
10    version 2.1 of the License, or (at your option) any later version.
11
12    The GNU C Library is distributed in the hope that it will be useful,
13    but WITHOUT ANY WARRANTY; without even the implied warranty of
14    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15    Lesser General Public License for more details.
16
17    You should have received a copy of the GNU Lesser General Public
18    License along with the GNU C Library; if not, write to the Free
19    Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
20    02111-1307 USA.  */
21
22 #include <dlfcn.h>
23 #include <stdint.h>
24 #include <ksc5601.h>
25
/* Tables mapping a Johab bit pattern to a Hangul Jamo index.
   Five bits each are used to encode the leading consonant
   (19 + 1 filler), the medial vowel (21 + 1 filler) and the
   trailing consonant (27 + 1 filler).

   See KS C 5601-1992 Annex 3 Table 2.
   Table values: 0 = filler, -1 = invalid, >= 1 = valid Jamo.  */
/* Leading-consonant field (bits 14..10 of the two-byte code) to Jamo
   index: 0 is the filler, 1..19 are the consonants, -1 is invalid.
   All 32 slots are written out explicitly; a missing initializer would
   be zero-initialized and therefore read as "filler" instead of
   "invalid".  */
static const int init[32] =
{
  -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
  19, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
};
/* Medial-vowel field (bits 9..5 of the two-byte code) to Jamo index:
   0 is the filler, 1..21 are the vowels, -1 is invalid.  Each row
   covers eight consecutive bit patterns.  */
static const int mid[32] =
{
  -1, -1,  0,  1,  2,  3,  4,  5,       /* 0x00 .. 0x07 */
  -1, -1,  6,  7,  8,  9, 10, 11,       /* 0x08 .. 0x0f */
  -1, -1, 12, 13, 14, 15, 16, 17,       /* 0x10 .. 0x17 */
  -1, -1, 18, 19, 20, 21, -1, -1        /* 0x18 .. 0x1f */
};
/* Trailing-consonant field (bits 4..0 of the two-byte code) to Jamo
   index: 0 is the filler, 1..27 are the consonants, -1 is invalid.
   Note the invalid pattern 0x12 in the middle of the valid range.  */
static const int final[32] =
{
  -1,                                                   /* 0x00 */
  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
                                                        /* 0x01 .. 0x11 */
  -1,                                                   /* 0x12 */
  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,           /* 0x13 .. 0x1d */
  -1, -1                                                /* 0x1e, 0x1f */
};
52
/* Hangul Jamo in Johab to Unicode 2.0: Unicode 2.0 defines 51 Hangul
   Compatibility Jamos in the block [0x3131,0x3163] (30 consonants in
   [0x3131,0x314e] followed by 21 vowels in [0x314f,0x3163]).

   It is to be considered later which Jamo block to use: the
   Compatibility block [0x3131,0x3163] or the Hangul Conjoining Jamo
   block [0x1100,0x11ff].  */
/* UCS4 Hangul Compatibility Jamo for each of the 19 leading
   consonants, indexed by the leading-consonant Jamo index minus 1.  */
static const uint32_t init_to_ucs[19] =
{
  0x3131, 0x3132, 0x3134, 0x3137, 0x3138, 0x3139, 0x3141, 0x3142,
  0x3143, 0x3145, 0x3146, 0x3147, 0x3148, 0x3149, 0x314a, 0x314b,
  0x314c, 0x314d, 0x314e
};
67
/* UCS4 Hangul Compatibility Jamo for a trailing consonant standing
   alone, indexed by the trailing-consonant Jamo index minus 1.  A zero
   entry means there is no mapping from this table; the JOHAB-to-UCS4
   conversion rejects a resulting value of 0 as an illegal character.
   Only the compound consonants have non-zero entries.  */
static const uint32_t final_to_ucs[31] =
{
  0, 0, 0x3133, 0, 0x3135, 0x3136, 0, 0,
  0x313a, 0x313b, 0x313c, 0x313d, 0x313e, 0x313f,
  0x3140, 0, 0, 0x3144, 0, 0, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 0, 0
};
75
/* The following three arrays are used to convert precomposed Hangul
   syllables in [0xac00,0xd7a3] to Jamo bit patterns for the Johab
   encoding.

   cf.: KS C 5601-1992, Annex 3 Table 2

   Arrays are used to speed things up, although it is possible to get
   the same result arithmetically.  */
/* Johab bit pattern contributed by each of the 19 leading consonants,
   indexed by (syllable - 0xac00) / 588.  Each value already contains
   the 0x8000 bit, so the first byte of the result has its high bit
   set.  */
static const int init_to_bit[19] =
{
  0x8800, 0x8c00, 0x9000, 0x9400, 0x9800, 0x9c00,
  0xa000, 0xa400, 0xa800, 0xac00, 0xb000, 0xb400,
  0xb800, 0xbc00, 0xc000, 0xc400, 0xc800, 0xcc00,
  0xd000
};
93
/* Johab bit pattern contributed by each of the 21 medial vowels,
   indexed by ((syllable - 0xac00) / 28) % 21.  The values skip the
   medial patterns that are not assigned to a vowel, hence the gaps
   between the rows.  */
static const int mid_to_bit[21] =
{
          0x0060, 0x0080, 0x00a0, 0x00c0, 0x00e0,
  0x0140, 0x0160, 0x0180, 0x01a0, 0x01c0, 0x01e0,
  0x0240, 0x0260, 0x0280, 0x02a0, 0x02c0, 0x02e0,
  0x0340, 0x0360, 0x0380, 0x03a0
};
101
/* Johab bit pattern contributed by the trailing consonant, indexed by
   (syllable - 0xac00) % 28.  Index 0 (no trailing consonant) yields
   pattern 1; pattern 0x12 is skipped.  */
static const int final_to_bit[28] =
{
  0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
  0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11,
  0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d
};
107
/* The conversion table from UCS4 Hangul Compatibility Jamo in
   [0x3131,0x3163] to Johab.

   cf. 1. KS C 5601-1992 Annex 3 Table 2
       2. Unicode 2.0 manual  */
/* Johab code for each UCS4 Hangul Compatibility Jamo, indexed by the
   code point minus 0x3131.  The first 30 entries are the consonants
   U+3131..U+314E, the remaining 21 the vowels U+314F..U+3163.  */
static const uint16_t jamo_from_ucs_table[51] =
{
  /* U+3131 .. U+314E: consonants.  */
  0x8841, 0x8c41,
  0x8444,
  0x9041,
  0x8446, 0x8447,
  0x9441, 0x9841, 0x9c41,
  0x844a, 0x844b, 0x844c, 0x844d, 0x844e, 0x844f, 0x8450,
  0xa041, 0xa441, 0xa841,
  0x8454,
  0xac41, 0xb041, 0xb441, 0xb841, 0xbc41,
  0xc041, 0xc441, 0xc841, 0xcc41, 0xd041,
  /* U+314F .. U+3163: vowels.  */
  0x8461, 0x8481, 0x84a1, 0x84c1, 0x84e1,
  0x8541, 0x8561, 0x8581, 0x85a1, 0x85c1, 0x85e1,
  0x8641, 0x8661, 0x8681, 0x86a1, 0x86c1, 0x86e1,
  0x8741, 0x8761, 0x8781, 0x87a1
};
133
134
135 static inline uint32_t
136 johab_sym_hanja_to_ucs (uint_fast32_t idx, uint_fast32_t c1, uint_fast32_t c2)
137 {
138   if (idx <= 0xdefe)
139     return (uint32_t) __ksc5601_sym_to_ucs[(c1 - 0xd9) * 188 + c2
140                                            - (c2 > 0x90 ? 0x43 : 0x31)];
141   else
142     return (uint32_t) __ksc5601_hanja_to_ucs[(c1 - 0xe0) * 188 + c2
143                                              - (c2 > 0x90 ? 0x43 : 0x31)];
144 }
/* Definitions used in the body of the `gconv' function.
   A JOHAB character occupies one or two bytes; the other side of the
   conversion uses four bytes per character.  */
#define CHARSET_NAME            "JOHAB//"
#define FROM_LOOP               from_johab
#define TO_LOOP                 to_johab
#define DEFINE_INIT             1
#define DEFINE_FINI             1
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         2
#define MIN_NEEDED_TO           4
154
155
/* First define the conversion function from JOHAB to UCS4.

   BODY converts one character per expansion: it reads one or two
   bytes at inptr, stores the resulting UCS4 value at outptr with
   put32, and advances inptr by 1 or 2 and outptr by 4.  An illegal
   sequence is passed to STANDARD_FROM_LOOP_ERR_HANDLER.  If the
   second byte of a two-byte character is not available, result is set
   to __GCONV_INCOMPLETE_INPUT and BODY breaks out without consuming
   anything.  inptr, inend, outptr and result are not defined in this
   file; presumably they are provided by <iconv/loop.c>, which is
   included right after these definitions.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
#define BODY \
  {                                                                           \
    uint32_t ch = *inptr;                                                     \
                                                                              \
    if (ch <= 0x7f)                                                           \
      {                                                                       \
        /* Plain ISO646-KR.  */                                               \
        if (ch == 0x5c)                                                       \
          ch = 0x20a9; /* half-width Korean Currency WON sign */              \
        ++inptr;                                                              \
      }                                                                       \
    /* Johab : 1. Hangul                                                      \
       1st byte : 0x84-0xd3                                                   \
       2nd byte : 0x41-0x7e, 0x81-0xfe                                        \
       2. Hanja & Symbol  :                                                   \
       1st byte : 0xd8-0xde, 0xe0-0xf9                                        \
       2nd byte : 0x31-0x7e, 0x91-0xfe                                        \
       0xd831-0xd87e and 0xd891-0xd8fe are user-defined area */               \
    else                                                                      \
      {                                                                       \
        if (__builtin_expect (ch > 0xf9, 0)                                   \
            || __builtin_expect (ch == 0xdf, 0)                               \
            || (__builtin_expect (ch > 0x7e, 0) && ch < 0x84)                 \
            || (__builtin_expect (ch > 0xd3, 0) && ch < 0xd9))                \
          {                                                                   \
            /* These are illegal.  The user-defined row 0xd8 is               \
               rejected here as well.  */                                     \
            STANDARD_FROM_LOOP_ERR_HANDLER (1);                               \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            /* Two-byte character.  First test whether the next               \
               character is also available.  */                               \
            uint32_t ch2;                                                     \
            uint_fast32_t idx;                                                \
                                                                              \
            if (__builtin_expect (inptr + 1 >= inend, 0))                     \
              {                                                               \
                /* The second character is not available.  Store the          \
                   intermediate result.  */                                   \
                result = __GCONV_INCOMPLETE_INPUT;                            \
                break;                                                        \
              }                                                               \
                                                                              \
            ch2 = inptr[1];                                                   \
            idx = ch * 256 + ch2;                                             \
            if (__builtin_expect (ch <= 0xd3, 1))                             \
              {                                                               \
                /* Hangul: split the code into its three 5-bit fields         \
                   and translate each to a Jamo index.  */                    \
                int_fast32_t i, m, f;                                         \
                                                                              \
                i = init[(idx & 0x7c00) >> 10];                               \
                m = mid[(idx & 0x03e0) >> 5];                                 \
                f = final[idx & 0x001f];                                      \
                                                                              \
                if (__builtin_expect (i == -1, 0)                             \
                    || __builtin_expect (m == -1, 0)                          \
                    || __builtin_expect (f == -1, 0))                         \
                  {                                                           \
                    /* This is illegal.  */                                   \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
                  }                                                           \
                else if (i > 0 && m > 0)                                      \
                  /* Precomposed syllable.  */                                \
                  ch = ((i - 1) * 21 + (m - 1)) * 28 + f + 0xac00;            \
                else if (i > 0 && m == 0 && f == 0)                           \
                  /* Leading consonant only.  */                              \
                  ch = init_to_ucs[i - 1];                                    \
                else if (i == 0 && m > 0 && f == 0)                           \
                  /* Vowel only.  */                                          \
                  ch = 0x314e + m;      /* 0x314f + m - 1 */                  \
                else if (__builtin_expect ((i | m) == 0, 1)                   \
                         && __builtin_expect (f > 0, 1))                      \
                  /* Trailing consonant only; may yield 0.  */                \
                  ch = final_to_ucs[f - 1];     /* round trip?? */            \
                else                                                          \
                  {                                                           \
                    /* This is illegal.  */                                   \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
                  }                                                           \
              }                                                               \
            else                                                              \
              {                                                               \
                /* Symbol or Hanja: validate the second byte and the          \
                   unassigned parts of rows 0xd9, 0xda and 0xde.  */          \
                if (__builtin_expect (ch2 < 0x31, 0)                          \
                    || (__builtin_expect (ch2 > 0x7e, 0) && ch2 < 0x91)       \
                    || __builtin_expect (ch2 == 0xff, 0)                      \
                    || (__builtin_expect (ch == 0xd9, 0) && ch2 > 0xe8)       \
                    || (__builtin_expect (ch == 0xda, 0)                      \
                        && ch2 > 0xa0 && ch2 < 0xd4)                          \
                    || (__builtin_expect (ch == 0xde, 0) && ch2 > 0xf1))      \
                  {                                                           \
                    /* This is illegal.  */                                   \
                    STANDARD_FROM_LOOP_ERR_HANDLER (1);                       \
                  }                                                           \
                else                                                          \
                  ch = johab_sym_hanja_to_ucs (idx, ch, ch2);                 \
              }                                                               \
          }                                                                   \
                                                                              \
        if (__builtin_expect (ch == 0, 0))                                    \
          {                                                                   \
            /* This is an illegal character.  */                              \
            STANDARD_FROM_LOOP_ERR_HANDLER (2);                               \
          }                                                                   \
                                                                              \
        inptr += 2;                                                           \
      }                                                                       \
                                                                              \
    put32 (outptr, ch);                                                       \
    outptr += 4;                                                              \
  }
#define LOOP_NEED_FLAGS
/* Single-byte shortcut: for a byte value c up to 0x7f yield the same
   value as BODY does (0x5c becomes the WON sign), otherwise WEOF.  */
#define ONEBYTE_BODY \
  {                                                                           \
    if (c <= 0x7f)                                                            \
      return (c == 0x5c ? 0x20a9 : c);                                        \
    else                                                                      \
      return WEOF;                                                            \
  }
286 #include <iconv/loop.c>
287
288
/* Next, define the other direction.

   BODY converts one UCS4 character, fetched with get32 from inptr, to
   one or two JOHAB bytes written at outptr, and advances inptr by 4.
   If a two-byte result does not fit before outend, or one of the
   ucs4_to_ksc5601_* helpers returns 0, result is set to
   __GCONV_FULL_OUTPUT and BODY breaks out without consuming the
   input.  Characters that have no JOHAB representation are passed to
   STANDARD_TO_LOOP_ERR_HANDLER.  inptr, outptr, outend and result are
   not defined in this file; presumably they are provided by
   <iconv/loop.c>, which is included right after these definitions.  */
#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
#define BODY \
  {                                                                           \
    uint32_t ch = get32 (inptr);                                              \
                                                                              \
    if (ch <= 0x7f && ch != 0x5c)                                             \
      /* Plain ISO646-KR character.  */                                       \
      *outptr++ = ch;                                                         \
    else                                                                      \
      {                                                                       \
        if (ch >= 0xac00 && ch <= 0xd7a3)                                     \
          {                                                                   \
            /* Precomposed Hangul syllable: combine the bit patterns          \
               of its three Jamo.  */                                         \
            if (__builtin_expect (outptr + 2 > outend, 0))                    \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
            ch -= 0xac00;                                                     \
                                                                              \
            ch = (init_to_bit[ch / 588]   /* 21 * 28 = 588 */                 \
                  + mid_to_bit[(ch / 28) % 21]/* (ch % (21 * 28)) / 28 */     \
                  + final_to_bit[ch %  28]);  /* (ch % (21 * 28)) % 28 */     \
                                                                              \
            *outptr++ = ch / 256;                                             \
            *outptr++ = ch % 256;                                             \
          }                                                                   \
        /* KS C 5601-1992 Annex 3 regards  0xA4DA(Hangul Filler : U3164)      \
           as symbol */                                                       \
        else if (ch >= 0x3131 && ch <= 0x3163)                                \
          {                                                                   \
            /* Hangul Compatibility Jamo.  */                                 \
            ch = jamo_from_ucs_table[ch - 0x3131];                            \
                                                                              \
            if (__builtin_expect (outptr + 2 > outend, 0))                    \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
            *outptr++ = ch / 256;                                             \
            *outptr++ = ch % 256;                                             \
          }                                                                   \
        else if ((ch >= 0x4e00 && ch <= 0x9fa5)                               \
                 || (ch >= 0xf900 && ch <= 0xfa0b))                           \
          {                                                                   \
            /* Hanja: obtain the KS C 5601 code in outptr[0..1] and           \
               rearrange it in place into JOHAB rows of 188 starting          \
               at first byte 0xe0.  */                                        \
            size_t written;                                                   \
            uint32_t temp;                                                    \
                                                                              \
            written = ucs4_to_ksc5601_hanja (ch, outptr, outend - outptr);    \
            if (__builtin_expect (written == 0, 0))                           \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0))        \
              {                                                               \
                STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
              }                                                               \
                                                                              \
            outptr[0] -= 0x4a;                                                \
            outptr[1] -= 0x21;                                                \
                                                                              \
            temp = outptr[0] * 94 + outptr[1];                                \
                                                                              \
            outptr[0] = 0xe0 + temp / 188;                                    \
            outptr[1] = temp % 188;                                           \
            outptr[1] += outptr[1] >= 78 ? 0x43 : 0x31;                       \
                                                                              \
            outptr += 2;                                                      \
          }                                                                   \
        else if (ch == 0x20a9)                                                \
          /* WON sign; the reverse of the 0x5c mapping.  */                   \
          *outptr++ = 0x5c;                                                   \
        else                                                                  \
          {                                                                   \
            /* Everything else is tried as a KS C 5601 symbol.  */            \
            size_t written;                                                   \
            uint32_t temp;                                                    \
                                                                              \
            written = ucs4_to_ksc5601_sym (ch, outptr, outend - outptr);      \
            if (__builtin_expect (written == 0, 0))                           \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
            if (__builtin_expect (written == __UNKNOWN_10646_CHAR, 0)         \
                || (outptr[0] == 0x22 && outptr[1] > 0x68))                   \
              {                                                               \
                UNICODE_TAG_HANDLER (ch, 4);                                  \
                STANDARD_TO_LOOP_ERR_HANDLER (4);                             \
              }                                                               \
                                                                              \
            /* Fold two KS C 5601 rows into one JOHAB row.  */                \
            temp = (outptr[0] < 0x4a ? outptr[0] + 0x191 : outptr[0] + 0x176);\
            outptr[1] += (temp % 2 ? 0x5e : 0);                               \
            outptr[1] += (outptr[1] < 0x6f ? 0x10 : 0x22);                    \
            outptr[0] = temp / 2;                                             \
                                                                              \
            outptr += 2;                                                      \
          }                                                                   \
      }                                                                       \
                                                                              \
    inptr += 4;                                                               \
  }
#define LOOP_NEED_FLAGS
407 #include <iconv/loop.c>
408
409
410 /* Now define the toplevel functions.  */
411 #include <iconv/skeleton.c>