17a9fea65633c5c69dea836a5f6ddc328fb0c7e3
[jlayton/glibc.git] / iconvdata / utf-7.c
1 /* Conversion module for UTF-7.
2    Copyright (C) 2000-2014 Free Software Foundation, Inc.
3    This file is part of the GNU C Library.
4    Contributed by Bruno Haible <haible@clisp.cons.org>, 2000.
5
6    The GNU C Library is free software; you can redistribute it and/or
7    modify it under the terms of the GNU Lesser General Public
8    License as published by the Free Software Foundation; either
9    version 2.1 of the License, or (at your option) any later version.
10
11    The GNU C Library is distributed in the hope that it will be useful,
12    but WITHOUT ANY WARRANTY; without even the implied warranty of
13    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14    Lesser General Public License for more details.
15
16    You should have received a copy of the GNU Lesser General Public
17    License along with the GNU C Library; if not, see
18    <http://www.gnu.org/licenses/>.  */
19
20 /* UTF-7 is a legacy encoding used for transmitting Unicode within the
21    ASCII character set, used primarily by mail agents.  New programs
22    are encouraged to use UTF-8 instead.
23
24    UTF-7 is specified in RFC 2152 (and old RFC 1641, RFC 1642).  The
25    original Base64 encoding is defined in RFC 2045.  */
26
27 #include <dlfcn.h>
28 #include <gconv.h>
29 #include <stdint.h>
30 #include <stdlib.h>
31
32
/* Controls how the "to UTF-7" direction treats the so-called "optional
   direct" characters
      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   Define to 1 to have them encoded in base64.  Define to 0 to have them
   passed straight through, like the so-called "direct" characters.
   We set this to 1 because it's safer.  */
#define UTF7_ENCODE_OPTIONAL_CHARS 1
40
41
/* The set of "direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   stored as a bitmap over the ASCII range: bit (c & 7) of byte (c >> 3)
   is set when the character with code c belongs to the set.  */
static const unsigned char direct_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0x81, 0xf3, 0xff, 0x87,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is a "direct character" and 0 otherwise.  Values of 128
   and above are never direct; they are rejected before the table is
   indexed.  */
static int
isdirect (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (direct_tab[ch >> 3] >> (ch & 7)) & 1;
}
57
58
/* The set of "direct and optional direct characters":
   A-Z a-z 0-9 ' ( ) , - . / : ? space tab lf cr
   ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
   stored as a bitmap over the ASCII range: bit (c & 7) of byte (c >> 3)
   is set when the character with code c belongs to the set.  */
static const unsigned char xdirect_tab[128 / 8] =
  {
    0x00, 0x26, 0x00, 0x00, 0xff, 0xf7, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xef, 0xff, 0xff, 0xff, 0x3f
  };

/* Return 1 if CH is a "direct" or "optional direct" character and 0
   otherwise.  Values of 128 and above are never members; they are
   rejected before the table is indexed.  */
static int
isxdirect (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (xdirect_tab[ch >> 3] >> (ch & 7)) & 1;
}
75
76
/* The set of "extended base64 characters":
   A-Z a-z 0-9 + / -
   stored as a bitmap over the ASCII range: bit (c & 7) of byte (c >> 3)
   is set when the character with code c belongs to the set.  */
static const unsigned char xbase64_tab[128 / 8] =
  {
    0x00, 0x00, 0x00, 0x00, 0x00, 0xa8, 0xff, 0x03,
    0xfe, 0xff, 0xff, 0x07, 0xfe, 0xff, 0xff, 0x07
  };

/* Return 1 if CH is an "extended base64 character" and 0 otherwise.
   The result is exactly 0 or 1 because the encoder adds it to a byte
   count.  Values of 128 and above are never members.  */
static int
isxbase64 (uint32_t ch)
{
  if (ch >= 128)
    return 0;

  return (xbase64_tab[ch >> 3] >> (ch & 7)) & 1;
}
92
93
/* Converts a value in the range 0..63 to a base64 encoded char:
   0..25 -> 'A'..'Z', 26..51 -> 'a'..'z', 52..61 -> '0'..'9',
   62 -> '+', 63 -> '/'.  Any larger value indicates a bug in the caller
   and aborts the process.  */
static unsigned char
base64 (unsigned int i)
{
  static const char digits[] =
    "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";

  if (i > 63)
    abort ();

  return (unsigned char) digits[i];
}
111
112
/* Definitions used in the body of the `gconv' function.  */

/* Name of the character set and the names of the two conversion loops
   defined in this file.  */
#define CHARSET_NAME            "UTF-7//"
#define DEFINE_INIT             1
#define DEFINE_FINI             1
#define FROM_LOOP               from_utf7_loop
#define TO_LOOP                 to_utf7_loop

/* Bytes per character on the UTF-7 side (FROM) and on the UCS4 side (TO).
   The encoder in this file writes at most 6 bytes for one character and
   always reads 4.  */
#define MIN_NEEDED_FROM         1
#define MAX_NEEDED_FROM         6
#define MIN_NEEDED_TO           4
#define MAX_NEEDED_TO           4

/* Local variables of the conversion function: a copy of the state for
   SAVE_RESET_STATE and a pointer to the live state, which is also handed
   to both loops as an extra argument.  */
#define PREPARE_LOOP \
  mbstate_t saved_state;                                                      \
  mbstate_t *statep = data->__statep;
#define EXTRA_LOOP_ARGS         , statep
127
128
/* Since we might have to reset input pointer we must be able to save
   and restore the state.  With a nonzero SAVE the current state *STATEP
   is copied to SAVED_STATE; with a zero SAVE it is restored from there.
   Both variables are declared by PREPARE_LOOP.  The body is wrapped in
   do/while so that the expansion is a single statement and cannot
   capture an `else' that follows it.  */
#define SAVE_RESET_STATE(Save) \
  do                                                                          \
    {                                                                         \
      if (Save)                                                               \
        saved_state = *statep;                                                \
      else                                                                    \
        *statep = saved_state;                                                \
    }                                                                         \
  while (0)
136
137
/* First define the conversion function from UTF-7 to UCS4.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 8..3: shift
     __wch: data
   Precise meaning:
     shift      data
       0         --          not inside base64 encoding
     1..32  XX..XX00..00     inside base64, (32 - shift) bits pending
   That is, the pending bits are kept left-aligned in the 32-bit __wch
   and SHIFT counts the low-order bit positions that are still unused.
   This state layout is simpler than relying on STORE_REST/UNPACK_BYTES.

   When shift = 0, __wch needs to store at most one lookahead byte (see
   __GCONV_INCOMPLETE_INPUT below).

   Each execution of BODY looks at one input byte:
   - Outside base64, a direct or optional direct character is copied to
     the output as one UCS4 value, "+-" yields a literal '+', and any
     other '+' switches to base64 mode.  Every other byte is rejected as
     invalid input.
   - Inside base64, each digit contributes 6 bits to the accumulator.
     Whenever 16 bits are complete they form a UTF-16 code unit; a High
     Surrogate is held back until the matching Low Surrogate has been
     read, and the pair is emitted as one UCS4 value.  A byte that is no
     base64 digit ends the base64 run; this is an error if nonzero bits
     are left over or a code unit is incomplete.  A terminating '-' is
     swallowed, any other terminator is processed again outside base64
     mode.
*/
#define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
#define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_TO
#define LOOPFCT                 FROM_LOOP
#define BODY \
  {                                                                           \
    uint_fast8_t ch = *inptr;                                                 \
                                                                              \
    if ((statep->__count >> 3) == 0)                                          \
      {                                                                       \
        /* base64 encoding inactive.  */                                      \
        if (isxdirect (ch))                                                   \
          {                                                                   \
            inptr++;                                                          \
            put32 (outptr, ch);                                               \
            outptr += 4;                                                      \
          }                                                                   \
        else if (__builtin_expect (ch == '+', 1))                             \
          {                                                                   \
            if (__builtin_expect (inptr + 2 > inend, 0))                      \
              {                                                               \
                /* Not enough input available.  */                            \
                result = __GCONV_INCOMPLETE_INPUT;                            \
                break;                                                        \
              }                                                               \
            /* "+-" is the escape sequence for a literal '+'.  */             \
            if (inptr[1] == '-')                                              \
              {                                                               \
                inptr += 2;                                                   \
                put32 (outptr, ch);                                           \
                outptr += 4;                                                  \
              }                                                               \
            else                                                              \
              {                                                               \
                /* Switch into base64 mode.  */                               \
                inptr++;                                                      \
                statep->__count = (32 << 3);                                  \
                statep->__value.__wch = 0;                                    \
              }                                                               \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            /* The input is invalid.  */                                      \
            STANDARD_FROM_LOOP_ERR_HANDLER (1);                               \
          }                                                                   \
      }                                                                       \
    else                                                                      \
      {                                                                       \
        /* base64 encoding active.  */                                        \
        uint32_t i;                                                           \
        int shift;                                                            \
                                                                              \
        if (ch >= 'A' && ch <= 'Z')                                           \
          i = ch - 'A';                                                       \
        else if (ch >= 'a' && ch <= 'z')                                      \
          i = ch - 'a' + 26;                                                  \
        else if (ch >= '0' && ch <= '9')                                      \
          i = ch - '0' + 52;                                                  \
        else if (ch == '+')                                                   \
          i = 62;                                                             \
        else if (ch == '/')                                                   \
          i = 63;                                                             \
        else                                                                  \
          {                                                                   \
            /* Terminate base64 encoding.  */                                 \
                                                                              \
            /* If accumulated data is nonzero, the input is invalid.  */      \
            /* Also, partial UTF-16 characters are invalid.  */               \
            if (__builtin_expect (statep->__value.__wch != 0, 0)              \
                || __builtin_expect ((statep->__count >> 3) <= 26, 0))        \
              {                                                               \
                STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));    \
              }                                                               \
                                                                              \
            /* Any terminator other than '-' is processed again.  */          \
            if (ch == '-')                                                    \
              inptr++;                                                        \
                                                                              \
            statep->__count = 0;                                              \
            continue;                                                         \
          }                                                                   \
                                                                              \
        /* Concatenate the base64 integer i to the accumulator.  */           \
        shift = (statep->__count >> 3);                                       \
        if (shift > 6)                                                        \
          {                                                                   \
            /* Not yet 32 bits after adding these 6 bits.  */                 \
            uint32_t wch;                                                     \
                                                                              \
            shift -= 6;                                                       \
            wch = statep->__value.__wch | (i << shift);                       \
                                                                              \
            if (shift <= 16 && shift > 10)                                    \
              {                                                               \
                /* An UTF-16 character has just been completed.  */           \
                uint32_t wc1 = wch >> 16;                                     \
                                                                              \
                /* UTF-16: When we see a High Surrogate, we must also decode  \
                   the following Low Surrogate. */                            \
                if (!(wc1 >= 0xd800 && wc1 < 0xdc00))                         \
                  {                                                           \
                    wch = wch << 16;                                          \
                    shift += 16;                                              \
                    put32 (outptr, wc1);                                      \
                    outptr += 4;                                              \
                  }                                                           \
              }                                                               \
            else if (shift <= 10 && shift > 4)                                \
              {                                                               \
                /* After a High Surrogate, verify that the next 16 bit        \
                   indeed form a Low Surrogate.  */                           \
                uint32_t wc2 = wch & 0xffff;                                  \
                                                                              \
                if (! __builtin_expect (wc2 >= 0xdc00 && wc2 < 0xe000, 1))    \
                  {                                                           \
                    STANDARD_FROM_LOOP_ERR_HANDLER ((statep->__count = 0, 1));\
                  }                                                           \
              }                                                               \
                                                                              \
            statep->__value.__wch = wch;                                      \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            /* An UTF-16 surrogate pair has just been completed.  */          \
            uint32_t wc1 = (uint32_t) statep->__value.__wch >> 16;            \
            uint32_t wc2 = ((uint32_t) statep->__value.__wch & 0xffff)        \
                           | (i >> (6 - shift));                              \
                                                                              \
            statep->__value.__wch = (i << shift) << 26;                       \
            shift += 26;                                                      \
                                                                              \
            assert (wc1 >= 0xd800 && wc1 < 0xdc00);                           \
            assert (wc2 >= 0xdc00 && wc2 < 0xe000);                           \
            put32 (outptr,                                                    \
                   0x10000 + ((wc1 - 0xd800) << 10) + (wc2 - 0xdc00));        \
            outptr += 4;                                                      \
          }                                                                   \
                                                                              \
        statep->__count = shift << 3;                                         \
                                                                              \
        /* Now that we digested the input increment the input pointer.  */    \
        inptr++;                                                              \
      }                                                                       \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , mbstate_t *statep
296 #include <iconv/loop.c>
297
298
/* Next, define the conversion from UCS4 to UTF-7.
   The state is structured as follows:
     __count bit 2..0: zero
     __count bit 4..3: shift
     __count bit 8..5: data
   Precise meaning:
     shift      data
       0         0           not inside base64 encoding
       1         0           inside base64, no pending bits
       2       XX00          inside base64, 2 bits known for next byte
       3       XXXX          inside base64, 4 bits known for next byte

   __count bit 2..0 and __wch are always zero, because this direction
   never returns __GCONV_INCOMPLETE_INPUT.

   Each execution of BODY consumes one UCS4 character (4 input bytes):
   - A character that may be written directly (which set applies is
     selected by UTF7_ENCODE_OPTIONAL_CHARS) is copied as a single byte.
     If base64 mode is active it is left first: pending bits, if any, are
     flushed as one base64 digit, and an explicit '-' is written when the
     character itself is an extended base64 character.
   - '+' outside base64 mode is written as "+-".
   - Every other character below 0x110000 is written as UTF-16 (one code
     unit, or a surrogate pair from 0x10000 on) packed into base64
     digits, preceded by '+' when base64 mode was not yet active.  Bits
     that do not fill a whole digit stay in the state.
   - Values of 0x110000 and above are handed to
     STANDARD_TO_LOOP_ERR_HANDLER.
   The number of output bytes is computed before anything is written; if
   the space is not available, __GCONV_FULL_OUTPUT is reported and the
   character is not consumed.
*/
#define MIN_NEEDED_INPUT        MIN_NEEDED_TO
#define MAX_NEEDED_INPUT        MAX_NEEDED_TO
#define MIN_NEEDED_OUTPUT       MIN_NEEDED_FROM
#define MAX_NEEDED_OUTPUT       MAX_NEEDED_FROM
#define LOOPFCT                 TO_LOOP
#define BODY \
  {                                                                           \
    uint32_t ch = get32 (inptr);                                              \
                                                                              \
    if ((statep->__count & 0x18) == 0)                                        \
      {                                                                       \
        /* base64 encoding inactive */                                        \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
          {                                                                   \
            *outptr++ = (unsigned char) ch;                                   \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            /* Number of output bytes needed, including the '+'.  */          \
            size_t count;                                                     \
                                                                              \
            if (ch == '+')                                                    \
              count = 2;                                                      \
            else if (ch < 0x10000)                                            \
              count = 3;                                                      \
            else if (ch < 0x110000)                                           \
              count = 6;                                                      \
            else                                                              \
              STANDARD_TO_LOOP_ERR_HANDLER (4);                               \
                                                                              \
            if (__builtin_expect (outptr + count > outend, 0))                \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
            *outptr++ = '+';                                                  \
            if (ch == '+')                                                    \
              *outptr++ = '-';                                                \
            else if (ch < 0x10000)                                            \
              {                                                               \
                *outptr++ = base64 (ch >> 10);                                \
                *outptr++ = base64 ((ch >> 4) & 0x3f);                        \
                statep->__count = ((ch & 15) << 5) | (3 << 3);                \
              }                                                               \
            else if (ch < 0x110000)                                           \
              {                                                               \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);               \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);             \
                                                                              \
                ch = (ch1 << 16) | ch2;                                       \
                *outptr++ = base64 (ch >> 26);                                \
                *outptr++ = base64 ((ch >> 20) & 0x3f);                       \
                *outptr++ = base64 ((ch >> 14) & 0x3f);                       \
                *outptr++ = base64 ((ch >> 8) & 0x3f);                        \
                *outptr++ = base64 ((ch >> 2) & 0x3f);                        \
                statep->__count = ((ch & 3) << 7) | (2 << 3);                 \
              }                                                               \
            else                                                              \
              abort ();                                                       \
          }                                                                   \
      }                                                                       \
    else                                                                      \
      {                                                                       \
        /* base64 encoding active */                                          \
        if (UTF7_ENCODE_OPTIONAL_CHARS ? isdirect (ch) : isxdirect (ch))      \
          {                                                                   \
            /* deactivate base64 encoding */                                  \
            size_t count;                                                     \
                                                                              \
            /* Optional flush digit, optional '-', then CH itself.  */        \
            count = ((statep->__count & 0x18) >= 0x10) + isxbase64 (ch) + 1;  \
            if (__builtin_expect (outptr + count > outend, 0))                \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
            if ((statep->__count & 0x18) >= 0x10)                             \
              *outptr++ = base64 ((statep->__count >> 3) & ~3);               \
            if (isxbase64 (ch))                                               \
              *outptr++ = '-';                                                \
            *outptr++ = (unsigned char) ch;                                   \
            statep->__count = 0;                                              \
          }                                                                   \
        else                                                                  \
          {                                                                   \
            size_t count;                                                     \
                                                                              \
            if (ch < 0x10000)                                                 \
              count = ((statep->__count & 0x18) >= 0x10 ? 3 : 2);             \
            else if (ch < 0x110000)                                           \
              count = ((statep->__count & 0x18) >= 0x18 ? 6 : 5);             \
            else                                                              \
              STANDARD_TO_LOOP_ERR_HANDLER (4);                               \
                                                                              \
            if (__builtin_expect (outptr + count > outend, 0))                \
              {                                                               \
                result = __GCONV_FULL_OUTPUT;                                 \
                break;                                                        \
              }                                                               \
                                                                              \
            if (ch < 0x10000)                                                 \
              {                                                               \
                switch ((statep->__count >> 3) & 3)                           \
                  {                                                           \
                  case 1:                                                     \
                    *outptr++ = base64 (ch >> 10);                            \
                    *outptr++ = base64 ((ch >> 4) & 0x3f);                    \
                    statep->__count = ((ch & 15) << 5) | (3 << 3);            \
                    break;                                                    \
                  case 2:                                                     \
                    *outptr++ =                                               \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 12));    \
                    *outptr++ = base64 ((ch >> 6) & 0x3f);                    \
                    *outptr++ = base64 (ch & 0x3f);                           \
                    statep->__count = (1 << 3);                               \
                    break;                                                    \
                  case 3:                                                     \
                    *outptr++ =                                               \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 14));    \
                    *outptr++ = base64 ((ch >> 8) & 0x3f);                    \
                    *outptr++ = base64 ((ch >> 2) & 0x3f);                    \
                    statep->__count = ((ch & 3) << 7) | (2 << 3);             \
                    break;                                                    \
                  default:                                                    \
                    abort ();                                                 \
                  }                                                           \
              }                                                               \
            else if (ch < 0x110000)                                           \
              {                                                               \
                uint32_t ch1 = 0xd800 + ((ch - 0x10000) >> 10);               \
                uint32_t ch2 = 0xdc00 + ((ch - 0x10000) & 0x3ff);             \
                                                                              \
                ch = (ch1 << 16) | ch2;                                       \
                switch ((statep->__count >> 3) & 3)                           \
                  {                                                           \
                  case 1:                                                     \
                    *outptr++ = base64 (ch >> 26);                            \
                    *outptr++ = base64 ((ch >> 20) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 14) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 8) & 0x3f);                    \
                    *outptr++ = base64 ((ch >> 2) & 0x3f);                    \
                    statep->__count = ((ch & 3) << 7) | (2 << 3);             \
                    break;                                                    \
                  case 2:                                                     \
                    *outptr++ =                                               \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 28));    \
                    *outptr++ = base64 ((ch >> 22) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 16) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 10) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 4) & 0x3f);                    \
                    statep->__count = ((ch & 15) << 5) | (3 << 3);            \
                    break;                                                    \
                  case 3:                                                     \
                    *outptr++ =                                               \
                      base64 (((statep->__count >> 3) & ~3) | (ch >> 30));    \
                    *outptr++ = base64 ((ch >> 24) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 18) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 12) & 0x3f);                   \
                    *outptr++ = base64 ((ch >> 6) & 0x3f);                    \
                    *outptr++ = base64 (ch & 0x3f);                           \
                    statep->__count = (1 << 3);                               \
                    break;                                                    \
                  default:                                                    \
                    abort ();                                                 \
                  }                                                           \
              }                                                               \
            else                                                              \
              abort ();                                                       \
          }                                                                   \
      }                                                                       \
                                                                              \
    /* Now that we wrote the output increment the input pointer.  */          \
    inptr += 4;                                                               \
  }
#define LOOP_NEED_FLAGS
#define EXTRA_LOOP_DECLS        , mbstate_t *statep
490 #include <iconv/loop.c>
491
492
/* Since this is a stateful encoding we have to provide code which resets
   the output state to the initial state.  This has to be done during the
   flushing.

   In the "from UTF-7" direction nothing is written; the state object is
   simply cleared.  In the "to UTF-7" direction an active base64 run is
   closed: pending bits (shift >= 2) are flushed as one base64 digit and
   a '-' is appended.  If the output buffer is too small for that, STATUS
   is set to __GCONV_FULL_OUTPUT and the state is left untouched.  The
   macro uses DATA, OUTBUF, OUTEND and STATUS from the scope in which it
   is expanded.  */
#define EMIT_SHIFT_TO_INIT \
  if (FROM_DIRECTION)                                                         \
    /* Nothing to emit.  */                                                   \
    memset (data->__statep, '\0', sizeof (mbstate_t));                        \
  else                                                                        \
    {                                                                         \
      /* The "to UTF-7" direction.  Flush the remaining bits and terminate    \
         with a '-' byte.  This will guarantee correct decoding if more       \
         UTF-7 encoded text is added afterwards.  */                          \
      int state = data->__statep->__count;                                    \
                                                                              \
      if (state & 0x18)                                                       \
        {                                                                     \
          /* Deactivate base64 encoding.  */                                  \
          size_t count = ((state & 0x18) >= 0x10) + 1;                        \
                                                                              \
          if (__builtin_expect (outbuf + count > outend, 0))                  \
            /* We don't have enough room in the output buffer.  */            \
            status = __GCONV_FULL_OUTPUT;                                     \
          else                                                                \
            {                                                                 \
              /* Write out the shift sequence.  */                            \
              if ((state & 0x18) >= 0x10)                                     \
                *outbuf++ = base64 ((state >> 3) & ~3);                       \
              *outbuf++ = '-';                                                \
                                                                              \
              data->__statep->__count = 0;                                    \
            }                                                                 \
        }                                                                     \
      else                                                                    \
        data->__statep->__count = 0;                                          \
    }
528
529
530 /* Now define the toplevel functions.  */
531 #include <iconv/skeleton.c>