2 * Routines for handling character sets
4 * Wireshark - Network traffic analyzer
5 * By Gerald Combs <gerald@wireshark.org>
6 * Copyright 1998 Gerald Combs
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version 2
11 * of the License, or (at your option) any later version.
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
27 #include <epan/proto.h>
28 #include <epan/wmem/wmem.h>
30 #include <wsutil/pint.h>
31 #include <wsutil/unicode-utils.h>
35 /* REPLACEMENT CHARACTER */
/*
 * Wikipedia's "Character encoding" template, giving a pile of character
 * encodings and Wikipedia pages for them:
 *
 *    http://en.wikipedia.org/wiki/Template:Character_encoding
 *
 * Unicode character encoding model:
 *
 *    http://www.unicode.org/reports/tr17/
 *
 * International Components for Unicode character set mapping tables:
 *
 *    http://site.icu-project.org/charts/charset
 *
 * MSDN information on code pages:
 *
 *    http://msdn.microsoft.com/en-us/library/dd317752(v=VS.85).aspx
 *
 * ASCII-based code pages, from IBM:
 *
 *    http://www-01.ibm.com/software/globalization/cp/cp_cpgid.html
 *
 * EBCDIC code pages, from IBM:
 *
 *    http://www-03.ibm.com/systems/i/software/globalization/codepages.html
 */
66 * Given a wmem scope, a pointer, and a length, treat the string of bytes
67 * referred to by the pointer and length as an ASCII string, with all bytes
68 * with the high-order bit set being invalid, and return a pointer to a
69 * UTF-8 string, allocated using the wmem scope.
71 * Octets with the highest bit set will be converted to the Unicode
72 * REPLACEMENT CHARACTER.
75 get_ascii_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
79 str = wmem_strbuf_sized_new(scope, length+1, 0);
85 wmem_strbuf_append_c(str, ch);
87 wmem_strbuf_append_unichar(str, UNREPL);
92 return (guint8 *) wmem_strbuf_finalize(str);
96 * Given a wmem scope, a pointer, and a length, treat the string of bytes
97 * referred to by the pointer and length as an ISO 8859/1 string, and
98 * return a pointer to a UTF-8 string, allocated using the wmem scope.
101 get_8859_1_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
105 str = wmem_strbuf_sized_new(scope, length+1, 0);
111 wmem_strbuf_append_c(str, ch);
114 * Note: we assume here that the code points
115 * 0x80-0x9F are used for C1 control characters,
116 * and thus have the same value as the corresponding
117 * Unicode code points.
119 wmem_strbuf_append_unichar(str, ch);
125 return (guint8 *) wmem_strbuf_finalize(str);
/*
 * Translation tables that map the upper 128 code points in single-byte
 * "extended ASCII" character encodings to Unicode code points in the
 * Basic Multilingual Plane.
 */
134 /* ISO-8859-2 (http://en.wikipedia.org/wiki/ISO/IEC_8859-2#Code_page_layout) */
135 const gunichar2 charset_table_iso_8859_2[0x80] = {
136 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
137 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
138 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
139 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
140 0x00a0, 0x0104, 0x02d8, 0x0141, 0x00a4, 0x013d, 0x015a, 0x00a7, /* 0xA0 - */
141 0x00a8, 0x0160, 0x015e, 0x0164, 0x0179, 0x00ad, 0x017d, 0x017b, /* - 0xAF */
142 0x00b0, 0x0105, 0x02db, 0x0142, 0x00b4, 0x013e, 0x015b, 0x02c7, /* 0xB0 - */
143 0x00b8, 0x0161, 0x015f, 0x0165, 0x017a, 0x02dd, 0x017e, 0x017c, /* - 0xBF */
144 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
145 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
146 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
147 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
148 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
149 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
150 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
151 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9 /* - 0xFF */
154 /* generated by ../tools/make_charset_ISO-8859-3 */
155 const gunichar2 charset_table_iso_8859_3[0x80] = {
156 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
157 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
158 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
159 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
160 0x00a0, 0x0126, 0x02d8, 0x00a3, 0x00a4, UNREPL, 0x0124, 0x00a7, /* 0xA0 - */
161 0x00a8, 0x0130, 0x015e, 0x011e, 0x0134, 0x00ad, UNREPL, 0x017b, /* - 0xAF */
162 0x00b0, 0x0127, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x0125, 0x00b7, /* 0xB0 - */
163 0x00b8, 0x0131, 0x015f, 0x011f, 0x0135, 0x00bd, UNREPL, 0x017c, /* - 0xBF */
164 0x00c0, 0x00c1, 0x00c2, UNREPL, 0x00c4, 0x010a, 0x0108, 0x00c7, /* 0xC0 - */
165 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
166 UNREPL, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x0120, 0x00d6, 0x00d7, /* 0xD0 - */
167 0x011c, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x016c, 0x015c, 0x00df, /* - 0xDF */
168 0x00e0, 0x00e1, 0x00e2, UNREPL, 0x00e4, 0x010b, 0x0109, 0x00e7, /* 0xE0 - */
169 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
170 UNREPL, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x0121, 0x00f6, 0x00f7, /* 0xF0 - */
171 0x011d, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x016d, 0x015d, 0x02d9, /* - 0xFF */
174 /* generated by ../tools/make_charset_ISO-8859-4 */
175 const gunichar2 charset_table_iso_8859_4[0x80] = {
176 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
177 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
178 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
179 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
180 0x00a0, 0x0104, 0x0138, 0x0156, 0x00a4, 0x0128, 0x013b, 0x00a7, /* 0xA0 - */
181 0x00a8, 0x0160, 0x0112, 0x0122, 0x0166, 0x00ad, 0x017d, 0x00af, /* - 0xAF */
182 0x00b0, 0x0105, 0x02db, 0x0157, 0x00b4, 0x0129, 0x013c, 0x02c7, /* 0xB0 - */
183 0x00b8, 0x0161, 0x0113, 0x0123, 0x0167, 0x014a, 0x017e, 0x014b, /* - 0xBF */
184 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
185 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x012a, /* - 0xCF */
186 0x0110, 0x0145, 0x014c, 0x0136, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
187 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x0168, 0x016a, 0x00df, /* - 0xDF */
188 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
189 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x012b, /* - 0xEF */
190 0x0111, 0x0146, 0x014d, 0x0137, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
191 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x0169, 0x016b, 0x02d9, /* - 0xFF */
194 /* ISO-8859-5 (http://en.wikipedia.org/wiki/ISO/IEC_8859-5#Code_page_layout) */
195 const gunichar2 charset_table_iso_8859_5[0x80] = {
196 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
197 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
198 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
199 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
200 0x00a0, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407, /* 0xA0 - */
201 0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f, /* - 0xAF */
202 0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417, /* 0xB0 - */
203 0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f, /* - 0xBF */
204 0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427, /* 0xC0 - */
205 0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f, /* - 0xCF */
206 0x0430, 0x0431, 0x0432, 0x0433, 0x0434, 0x0435, 0x0436, 0x0437, /* 0xD0 - */
207 0x0438, 0x0439, 0x043a, 0x043b, 0x043c, 0x043d, 0x043e, 0x043f, /* - 0xDF */
208 0x0440, 0x0441, 0x0442, 0x0443, 0x0444, 0x0445, 0x0446, 0x0447, /* 0xE0 - */
209 0x0448, 0x0449, 0x044a, 0x044b, 0x044c, 0x044d, 0x044e, 0x044f, /* - 0xEF */
210 0x2116, 0x0451, 0x0452, 0x0453, 0x0454, 0x0455, 0x0456, 0x0457, /* 0xF0 - */
211 0x0458, 0x0459, 0x045a, 0x045b, 0x045c, 0x00a7, 0x045e, 0x045f /* - 0xFF */
214 /* generated by ../tools/make_charset_ISO-8859-6 */
215 const gunichar2 charset_table_iso_8859_6[0x80] = {
216 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
217 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
218 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
219 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
220 0x00a0, UNREPL, UNREPL, UNREPL, 0x00a4, UNREPL, UNREPL, UNREPL, /* 0xA0 - */
221 UNREPL, UNREPL, UNREPL, UNREPL, 0x060c, 0x00ad, UNREPL, UNREPL, /* - 0xAF */
222 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xB0 - */
223 UNREPL, UNREPL, UNREPL, 0x061b, UNREPL, UNREPL, UNREPL, 0x061f, /* - 0xBF */
224 UNREPL, 0x0621, 0x0622, 0x0623, 0x0624, 0x0625, 0x0626, 0x0627, /* 0xC0 - */
225 0x0628, 0x0629, 0x062a, 0x062b, 0x062c, 0x062d, 0x062e, 0x062f, /* - 0xCF */
226 0x0630, 0x0631, 0x0632, 0x0633, 0x0634, 0x0635, 0x0636, 0x0637, /* 0xD0 - */
227 0x0638, 0x0639, 0x063a, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xDF */
228 0x0640, 0x0641, 0x0642, 0x0643, 0x0644, 0x0645, 0x0646, 0x0647, /* 0xE0 - */
229 0x0648, 0x0649, 0x064a, 0x064b, 0x064c, 0x064d, 0x064e, 0x064f, /* - 0xEF */
230 0x0650, 0x0651, 0x0652, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xF0 - */
231 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
234 /* generated by ../tools/make_charset_ISO-8859-7 */
235 const gunichar2 charset_table_iso_8859_7[0x80] = {
236 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
237 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
238 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
239 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
240 0x00a0, 0x2018, 0x2019, 0x00a3, 0x20ac, 0x20af, 0x00a6, 0x00a7, /* 0xA0 - */
241 0x00a8, 0x00a9, 0x037a, 0x00ab, 0x00ac, 0x00ad, UNREPL, 0x2015, /* - 0xAF */
242 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x0384, 0x0385, 0x0386, 0x00b7, /* 0xB0 - */
243 0x0388, 0x0389, 0x038a, 0x00bb, 0x038c, 0x00bd, 0x038e, 0x038f, /* - 0xBF */
244 0x0390, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397, /* 0xC0 - */
245 0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f, /* - 0xCF */
246 0x03a0, 0x03a1, UNREPL, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7, /* 0xD0 - */
247 0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x03ac, 0x03ad, 0x03ae, 0x03af, /* - 0xDF */
248 0x03b0, 0x03b1, 0x03b2, 0x03b3, 0x03b4, 0x03b5, 0x03b6, 0x03b7, /* 0xE0 - */
249 0x03b8, 0x03b9, 0x03ba, 0x03bb, 0x03bc, 0x03bd, 0x03be, 0x03bf, /* - 0xEF */
250 0x03c0, 0x03c1, 0x03c2, 0x03c3, 0x03c4, 0x03c5, 0x03c6, 0x03c7, /* 0xF0 - */
251 0x03c8, 0x03c9, 0x03ca, 0x03cb, 0x03cc, 0x03cd, 0x03ce, UNREPL, /* - 0xFF */
254 /* generated by ../tools/make_charset_ISO-8859-8 */
255 const gunichar2 charset_table_iso_8859_8[0x80] = {
256 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
257 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
258 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
259 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
260 0x00a0, UNREPL, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
261 0x00a8, 0x00a9, 0x00d7, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
262 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
263 0x00b8, 0x00b9, 0x00f7, 0x00bb, 0x00bc, 0x00bd, 0x00be, UNREPL, /* - 0xBF */
264 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xC0 - */
265 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xCF */
266 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, /* 0xD0 - */
267 UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, UNREPL, 0x2017, /* - 0xDF */
268 0x05d0, 0x05d1, 0x05d2, 0x05d3, 0x05d4, 0x05d5, 0x05d6, 0x05d7, /* 0xE0 - */
269 0x05d8, 0x05d9, 0x05da, 0x05db, 0x05dc, 0x05dd, 0x05de, 0x05df, /* - 0xEF */
270 0x05e0, 0x05e1, 0x05e2, 0x05e3, 0x05e4, 0x05e5, 0x05e6, 0x05e7, /* 0xF0 - */
271 0x05e8, 0x05e9, 0x05ea, UNREPL, UNREPL, 0x200e, 0x200f, UNREPL, /* - 0xFF */
274 /* ISO-8859-9 (http://en.wikipedia.org/wiki/ISO/IEC_8859-9#Code_page_layout) */
275 const gunichar2 charset_table_iso_8859_9[0x80] = {
276 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
277 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
278 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
279 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
280 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x00a4, 0x00a5, 0x00a6, 0x00a7, /* 0xA0 - */
281 0x00a8, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
282 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
283 0x00b8, 0x00b9, 0x00ba, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00bf, /* - 0xBF */
284 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
285 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
286 0x011e, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
287 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0130, 0x015e, 0x00df, /* - 0xDF */
288 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
289 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
290 0x011f, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
291 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0131, 0x015f, 0x00ff /* - 0xFF */
294 /* generated by ../tools/make_charset_ISO-8859-10 */
295 const gunichar2 charset_table_iso_8859_10[0x80] = {
296 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
297 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
298 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
299 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
300 0x00a0, 0x0104, 0x0112, 0x0122, 0x012a, 0x0128, 0x0136, 0x00a7, /* 0xA0 - */
301 0x013b, 0x0110, 0x0160, 0x0166, 0x017d, 0x00ad, 0x016a, 0x014a, /* - 0xAF */
302 0x00b0, 0x0105, 0x0113, 0x0123, 0x012b, 0x0129, 0x0137, 0x00b7, /* 0xB0 - */
303 0x013c, 0x0111, 0x0161, 0x0167, 0x017e, 0x2015, 0x016b, 0x014b, /* - 0xBF */
304 0x0100, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x012e, /* 0xC0 - */
305 0x010c, 0x00c9, 0x0118, 0x00cb, 0x0116, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
306 0x00d0, 0x0145, 0x014c, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0168, /* 0xD0 - */
307 0x00d8, 0x0172, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
308 0x0101, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x012f, /* 0xE0 - */
309 0x010d, 0x00e9, 0x0119, 0x00eb, 0x0117, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
310 0x00f0, 0x0146, 0x014d, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x0169, /* 0xF0 - */
311 0x00f8, 0x0173, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x0138, /* - 0xFF */
314 /* generated by ../tools/make_charset_ISO-8859-11 */
315 const gunichar2 charset_table_iso_8859_11[0x80] = {
316 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
317 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
318 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
319 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
320 0x00a0, 0x0e01, 0x0e02, 0x0e03, 0x0e04, 0x0e05, 0x0e06, 0x0e07, /* 0xA0 - */
321 0x0e08, 0x0e09, 0x0e0a, 0x0e0b, 0x0e0c, 0x0e0d, 0x0e0e, 0x0e0f, /* - 0xAF */
322 0x0e10, 0x0e11, 0x0e12, 0x0e13, 0x0e14, 0x0e15, 0x0e16, 0x0e17, /* 0xB0 - */
323 0x0e18, 0x0e19, 0x0e1a, 0x0e1b, 0x0e1c, 0x0e1d, 0x0e1e, 0x0e1f, /* - 0xBF */
324 0x0e20, 0x0e21, 0x0e22, 0x0e23, 0x0e24, 0x0e25, 0x0e26, 0x0e27, /* 0xC0 - */
325 0x0e28, 0x0e29, 0x0e2a, 0x0e2b, 0x0e2c, 0x0e2d, 0x0e2e, 0x0e2f, /* - 0xCF */
326 0x0e30, 0x0e31, 0x0e32, 0x0e33, 0x0e34, 0x0e35, 0x0e36, 0x0e37, /* 0xD0 - */
327 0x0e38, 0x0e39, 0x0e3a, UNREPL, UNREPL, UNREPL, UNREPL, 0x0e3f, /* - 0xDF */
328 0x0e40, 0x0e41, 0x0e42, 0x0e43, 0x0e44, 0x0e45, 0x0e46, 0x0e47, /* 0xE0 - */
329 0x0e48, 0x0e49, 0x0e4a, 0x0e4b, 0x0e4c, 0x0e4d, 0x0e4e, 0x0e4f, /* - 0xEF */
330 0x0e50, 0x0e51, 0x0e52, 0x0e53, 0x0e54, 0x0e55, 0x0e56, 0x0e57, /* 0xF0 - */
331 0x0e58, 0x0e59, 0x0e5a, 0x0e5b, UNREPL, UNREPL, UNREPL, UNREPL, /* - 0xFF */
334 /* generated by ../tools/make_charset_ISO-8859-13 */
335 const gunichar2 charset_table_iso_8859_13[0x80] = {
336 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
337 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
338 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
339 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
340 0x00a0, 0x201d, 0x00a2, 0x00a3, 0x00a4, 0x201e, 0x00a6, 0x00a7, /* 0xA0 - */
341 0x00d8, 0x00a9, 0x0156, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00c6, /* - 0xAF */
342 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x201c, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
343 0x00f8, 0x00b9, 0x0157, 0x00bb, 0x00bc, 0x00bd, 0x00be, 0x00e6, /* - 0xBF */
344 0x0104, 0x012e, 0x0100, 0x0106, 0x00c4, 0x00c5, 0x0118, 0x0112, /* 0xC0 - */
345 0x010c, 0x00c9, 0x0179, 0x0116, 0x0122, 0x0136, 0x012a, 0x013b, /* - 0xCF */
346 0x0160, 0x0143, 0x0145, 0x00d3, 0x014c, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
347 0x0172, 0x0141, 0x015a, 0x016a, 0x00dc, 0x017b, 0x017d, 0x00df, /* - 0xDF */
348 0x0105, 0x012f, 0x0101, 0x0107, 0x00e4, 0x00e5, 0x0119, 0x0113, /* 0xE0 - */
349 0x010d, 0x00e9, 0x017a, 0x0117, 0x0123, 0x0137, 0x012b, 0x013c, /* - 0xEF */
350 0x0161, 0x0144, 0x0146, 0x00f3, 0x014d, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
351 0x0173, 0x0142, 0x015b, 0x016b, 0x00fc, 0x017c, 0x017e, 0x2019, /* - 0xFF */
354 /* generated by ../tools/make_charset_ISO-8859-14 */
355 const gunichar2 charset_table_iso_8859_14[0x80] = {
356 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
357 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
358 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
359 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
360 0x00a0, 0x1e02, 0x1e03, 0x00a3, 0x010a, 0x010b, 0x1e0a, 0x00a7, /* 0xA0 - */
361 0x1e80, 0x00a9, 0x1e82, 0x1e0b, 0x1ef2, 0x00ad, 0x00ae, 0x0178, /* - 0xAF */
362 0x1e1e, 0x1e1f, 0x0120, 0x0121, 0x1e40, 0x1e41, 0x00b6, 0x1e56, /* 0xB0 - */
363 0x1e81, 0x1e57, 0x1e83, 0x1e60, 0x1ef3, 0x1e84, 0x1e85, 0x1e61, /* - 0xBF */
364 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
365 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
366 0x0174, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x1e6a, /* 0xD0 - */
367 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x0176, 0x00df, /* - 0xDF */
368 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
369 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
370 0x0175, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x1e6b, /* 0xF0 - */
371 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x0177, 0x00ff, /* - 0xFF */
374 /* generated by ../tools/make_charset_ISO-8859-15 */
375 const gunichar2 charset_table_iso_8859_15[0x80] = {
376 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
377 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
378 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
379 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
380 0x00a0, 0x00a1, 0x00a2, 0x00a3, 0x20ac, 0x00a5, 0x0160, 0x00a7, /* 0xA0 - */
381 0x0161, 0x00a9, 0x00aa, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x00af, /* - 0xAF */
382 0x00b0, 0x00b1, 0x00b2, 0x00b3, 0x017d, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
383 0x017e, 0x00b9, 0x00ba, 0x00bb, 0x0152, 0x0153, 0x0178, 0x00bf, /* - 0xBF */
384 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7, /* 0xC0 - */
385 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
386 0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x00d7, /* 0xD0 - */
387 0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x00df, /* - 0xDF */
388 0x00e0, 0x00e1, 0x00e2, 0x00e3, 0x00e4, 0x00e5, 0x00e6, 0x00e7, /* 0xE0 - */
389 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
390 0x00f0, 0x00f1, 0x00f2, 0x00f3, 0x00f4, 0x00f5, 0x00f6, 0x00f7, /* 0xF0 - */
391 0x00f8, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x00fd, 0x00fe, 0x00ff, /* - 0xFF */
394 /* generated by ../tools/make_charset_ISO-8859-16 */
395 const gunichar2 charset_table_iso_8859_16[0x80] = {
396 0x0080, 0x0081, 0x0082, 0x0083, 0x0084, 0x0085, 0x0086, 0x0087, /* 0x80 - */
397 0x0088, 0x0089, 0x008a, 0x008b, 0x008c, 0x008d, 0x008e, 0x008f, /* - 0x8F */
398 0x0090, 0x0091, 0x0092, 0x0093, 0x0094, 0x0095, 0x0096, 0x0097, /* 0x90 - */
399 0x0098, 0x0099, 0x009a, 0x009b, 0x009c, 0x009d, 0x009e, 0x009f, /* - 0x9F */
400 0x00a0, 0x0104, 0x0105, 0x0141, 0x20ac, 0x201e, 0x0160, 0x00a7, /* 0xA0 - */
401 0x0161, 0x00a9, 0x0218, 0x00ab, 0x0179, 0x00ad, 0x017a, 0x017b, /* - 0xAF */
402 0x00b0, 0x00b1, 0x010c, 0x0142, 0x017d, 0x201d, 0x00b6, 0x00b7, /* 0xB0 - */
403 0x017e, 0x010d, 0x0219, 0x00bb, 0x0152, 0x0153, 0x0178, 0x017c, /* - 0xBF */
404 0x00c0, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0106, 0x00c6, 0x00c7, /* 0xC0 - */
405 0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf, /* - 0xCF */
406 0x0110, 0x0143, 0x00d2, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x015a, /* 0xD0 - */
407 0x0170, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x0118, 0x021a, 0x00df, /* - 0xDF */
408 0x00e0, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x0107, 0x00e6, 0x00e7, /* 0xE0 - */
409 0x00e8, 0x00e9, 0x00ea, 0x00eb, 0x00ec, 0x00ed, 0x00ee, 0x00ef, /* - 0xEF */
410 0x0111, 0x0144, 0x00f2, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x015b, /* 0xF0 - */
411 0x0171, 0x00f9, 0x00fa, 0x00fb, 0x00fc, 0x0119, 0x021b, 0x00ff, /* - 0xFF */
414 /* Windows-1250 (http://en.wikipedia.org/wiki/Windows-1250) */
415 const gunichar2 charset_table_cp1250[0x80] = {
416 0x20ac, UNREPL, 0x201a, UNREPL, 0x201e, 0x2026, 0x2020, 0x2021, /* 0x80 - */
417 UNREPL, 0x2030, 0x0160, 0x2039, 0x015a, 0x0164, 0x017d, 0x0179, /* - 0x8F */
418 UNREPL, 0x2018, 0x2019, 0x201c, 0x201d, 0x2022, 0x2013, 0x2014, /* 0x90 - */
419 UNREPL, 0x2122, 0x0161, 0x203a, 0x015b, 0x0165, 0x017e, 0x017a, /* - 0x9F */
420 0x00a0, 0x02c7, 0x02d8, 0x0141, 0x00a4, 0x0104, 0x00a6, 0x00a7, /* 0xA0 - */
421 0x00a8, 0x00a9, 0x015e, 0x00ab, 0x00ac, 0x00ad, 0x00ae, 0x017b, /* - 0xAF */
422 0x00b0, 0x00b1, 0x02db, 0x0142, 0x00b4, 0x00b5, 0x00b6, 0x00b7, /* 0xB0 - */
423 0x00b8, 0x0105, 0x015f, 0x00bb, 0x013d, 0x02dd, 0x013e, 0x017c, /* - 0xBF */
424 0x0154, 0x00c1, 0x00c2, 0x0102, 0x00c4, 0x0139, 0x0106, 0x00c7, /* 0xC0 - */
425 0x010c, 0x00c9, 0x0118, 0x00cb, 0x011a, 0x00cd, 0x00ce, 0x010e, /* - 0xCF */
426 0x0110, 0x0143, 0x0147, 0x00d3, 0x00d4, 0x0150, 0x00d6, 0x00d7, /* 0xD0 - */
427 0x0158, 0x016e, 0x00da, 0x0170, 0x00dc, 0x00dd, 0x0162, 0x00df, /* - 0xDF */
428 0x0155, 0x00e1, 0x00e2, 0x0103, 0x00e4, 0x013a, 0x0107, 0x00e7, /* 0xE0 - */
429 0x010d, 0x00e9, 0x0119, 0x00eb, 0x011b, 0x00ed, 0x00ee, 0x010f, /* - 0xEF */
430 0x0111, 0x0144, 0x0148, 0x00f3, 0x00f4, 0x0151, 0x00f6, 0x00f7, /* 0xF0 - */
431 0x0159, 0x016f, 0x00fa, 0x0171, 0x00fc, 0x00fd, 0x0163, 0x02d9, /* - 0xFF */
434 /* generated by ./make_charset_table MACROMAN */
435 /* That's "MacRoman", not "Macro Man" (faster than a speeding recursive expansion!) */
436 const gunichar2 charset_table_mac_roman[0x80] = {
437 0x00c4, 0x00c5, 0x00c7, 0x00c9, 0x00d1, 0x00d6, 0x00dc, 0x00e1, /* 0x80 - */
438 0x00e0, 0x00e2, 0x00e4, 0x00e3, 0x00e5, 0x00e7, 0x00e9, 0x00e8, /* - 0x8F */
439 0x00ea, 0x00eb, 0x00ed, 0x00ec, 0x00ee, 0x00ef, 0x00f1, 0x00f3, /* 0x90 - */
440 0x00f2, 0x00f4, 0x00f6, 0x00f5, 0x00fa, 0x00f9, 0x00fb, 0x00fc, /* - 0x9F */
441 0x2020, 0x00b0, 0x00a2, 0x00a3, 0x00a7, 0x2022, 0x00b6, 0x00df, /* 0xA0 - */
442 0x00ae, 0x00a9, 0x2122, 0x00b4, 0x00a8, 0x2260, 0x00c6, 0x00d8, /* - 0xAF */
443 0x221e, 0x00b1, 0x2264, 0x2265, 0x00a5, 0x00b5, 0x2202, 0x2211, /* 0xB0 - */
444 0x220f, 0x03c0, 0x222b, 0x00aa, 0x00ba, 0x03a9, 0x00e6, 0x00f8, /* - 0xBF */
445 0x00bf, 0x00a1, 0x00ac, 0x221a, 0x0192, 0x2248, 0x2206, 0x00ab, /* 0xC0 - */
446 0x00bb, 0x2026, 0x00a0, 0x00c0, 0x00c3, 0x00d5, 0x0152, 0x0153, /* - 0xCF */
447 0x2013, 0x2014, 0x201c, 0x201d, 0x2018, 0x2019, 0x00f7, 0x25ca, /* 0xD0 - */
448 0x00ff, 0x0178, 0x2044, 0x20ac, 0x2039, 0x203a, 0xfb01, 0xfb02, /* - 0xDF */
449 0x2021, 0x00b7, 0x201a, 0x201e, 0x2030, 0x00c2, 0x00ca, 0x00c1, /* 0xE0 - */
450 0x00cb, 0x00c8, 0x00cd, 0x00ce, 0x00cf, 0x00cc, 0x00d3, 0x00d4, /* - 0xEF */
451 0xf8ff, 0x00d2, 0x00da, 0x00db, 0x00d9, 0x0131, 0x02c6, 0x02dc, /* 0xF0 - */
452 0x00af, 0x02d8, 0x02d9, 0x02da, 0x00b8, 0x02dd, 0x02db, 0x02c7, /* - 0xFF */
455 /* generated by ./make_charset_table CP437 */
456 const gunichar2 charset_table_cp437[0x80] = {
457 0x00c7, 0x00fc, 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, /* 0x80 - */
458 0x00ea, 0x00eb, 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, /* - 0x8F */
459 0x00c9, 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, /* 0x90 - */
460 0x00ff, 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, /* - 0x9F */
461 0x00e1, 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, /* 0xA0 - */
462 0x00bf, 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, /* - 0xAF */
463 0x2591, 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, /* 0xB0 - */
464 0x2555, 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, /* - 0xBF */
465 0x2514, 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, /* 0xC0 - */
466 0x255a, 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, /* - 0xCF */
467 0x2568, 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, /* 0xD0 - */
468 0x256a, 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, /* - 0xDF */
469 0x03b1, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03c4, /* 0xE0 - */
470 0x03a6, 0x0398, 0x03a9, 0x03b4, 0x221e, 0x03c6, 0x03b5, 0x2229, /* - 0xEF */
471 0x2261, 0x00b1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, /* 0xF0 - */
472 0x00b0, 0x2219, 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0x00a0, /* - 0xFF */
476 * Given a wmem scope, a pointer, and a length, and a translation table,
477 * treat the string of bytes referred to by the pointer and length as a
478 * string encoded using one octet per character, with octets with the
479 * high-order bit clear being ASCII and octets with the high-order bit
480 * set being mapped by the translation table to 2-byte Unicode Basic
481 * Multilingual Plane characters (including REPLACEMENT CHARACTER), and
482 * return a pointer to a UTF-8 string, allocated using the wmem scope.
485 get_unichar2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const gunichar2 table[0x80])
489 str = wmem_strbuf_sized_new(scope, length+1, 0);
495 wmem_strbuf_append_c(str, ch);
497 wmem_strbuf_append_unichar(str, table[ch-0x80]);
502 return (guint8 *) wmem_strbuf_finalize(str);
506 * Given a wmem scope, a pointer, and a length, treat the string of bytes
507 * referred to by the pointer and length as a UCS-2 encoded string
508 * containing characters from the Basic Multilingual Plane (plane 0) of
509 * Unicode, and return a pointer to a UTF-8 string, allocated with the
512 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
514 * Specify length in bytes.
516 * XXX - should map lead and trail surrogate values to REPLACEMENT
517 * CHARACTERs (0xFFFD)?
518 * XXX - if there are an odd number of bytes, should put a
519 * REPLACEMENT CHARACTER at the end.
522 get_ucs_2_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
525 gint i; /* Byte counter for string */
526 wmem_strbuf_t *strbuf;
528 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
530 for(i = 0; i + 1 < length; i += 2) {
531 if (encoding == ENC_BIG_ENDIAN){
532 uchar = pntoh16(ptr + i);
534 uchar = pletoh16(ptr + i);
536 wmem_strbuf_append_unichar(strbuf, uchar);
540 * XXX - if i < length, this means we were handed an odd
541 * number of bytes, so we're not a valid UCS-2 string.
543 return (guint8 *) wmem_strbuf_finalize(strbuf);
547 * Given a wmem scope, a pointer, and a length, treat the string of bytes
548 * referred to by the pointer and length as a UTF-16 encoded string, and
549 * return a pointer to a UTF-8 string, allocated with the wmem scope.
551 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN.
553 * Specify length in bytes.
555 * XXX - should map surrogate errors to REPLACEMENT CHARACTERs (0xFFFD).
556 * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
557 * XXX - if there are an odd number of bytes, should put a
558 * REPLACEMENT CHARACTER at the end.
561 get_utf_16_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
563 wmem_strbuf_t *strbuf;
564 gunichar2 uchar2, lead_surrogate;
566 gint i; /* Byte counter for string */
568 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
570 for(i = 0; i + 1 < length; i += 2) {
571 if (encoding == ENC_BIG_ENDIAN)
572 uchar2 = pntoh16(ptr + i);
574 uchar2 = pletoh16(ptr + i);
576 if (IS_LEAD_SURROGATE(uchar2)) {
578 * Lead surrogate. Must be followed by
582 if (i + 1 >= length) {
584 * Oops, string ends with a lead surrogate.
585 * Ignore this for now.
586 * XXX - insert "substitute" character?
587 * Report the error in some other
592 lead_surrogate = uchar2;
593 if (encoding == ENC_BIG_ENDIAN)
594 uchar2 = pntoh16(ptr + i);
596 uchar2 = pletoh16(ptr + i);
597 if (IS_TRAIL_SURROGATE(uchar2)) {
598 /* Trail surrogate. */
599 uchar = SURROGATE_VALUE(lead_surrogate, uchar2);
600 wmem_strbuf_append_unichar(strbuf, uchar);
603 * Not a trail surrogate.
604 * Ignore the entire pair.
605 * XXX - insert "substitute" character?
606 * Report the error in some other
612 if (IS_TRAIL_SURROGATE(uchar2)) {
614 * Trail surrogate without a preceding
615 * lead surrogate. Ignore it.
616 * XXX - insert "substitute" character?
617 * Report the error in some other
623 * Non-surrogate; just append it.
625 wmem_strbuf_append_unichar(strbuf, uchar2);
631 * XXX - if i < length, this means we were handed an odd
632 * number of bytes, so we're not a valid UTF-16 string.
634 return (guint8 *) wmem_strbuf_finalize(strbuf);
638 * Given a wmem scope, a pointer, and a length, treat the string of bytes
639 * referred to by the pointer and length as a UCS-4 encoded string, and
640 * return a pointer to a UTF-8 string, allocated with the wmem scope.
642 * Encoding parameter should be ENC_BIG_ENDIAN or ENC_LITTLE_ENDIAN
644 * Specify length in bytes
646 * XXX - should map lead and trail surrogate values to a "substitute"
648 * XXX - should map code points > 10FFFF to REPLACEMENT CHARACTERs.
649 * XXX - if the number of bytes isn't a multiple of 4, should put a
650 * REPLACEMENT CHARACTER at the end.
653 get_ucs_4_string(wmem_allocator_t *scope, const guint8 *ptr, gint length, const guint encoding)
656 gint i; /* Byte counter for string */
657 wmem_strbuf_t *strbuf;
659 strbuf = wmem_strbuf_sized_new(scope, length+1, 0);
661 for(i = 0; i + 3 < length; i += 4) {
662 if (encoding == ENC_BIG_ENDIAN)
663 uchar = pntoh32(ptr + i);
665 uchar = pletoh32(ptr + i);
667 wmem_strbuf_append_unichar(strbuf, uchar);
671 * XXX - if i < length, this means we were handed a number
672 * of bytes that's not a multiple of 4, so we're not a valid
675 return (gchar *)wmem_strbuf_finalize(strbuf);
684 /* ETSI GSM 03.38, version 6.0.1, section 6.2.1; Default alphabet */
685 static const gunichar2 gsm_default_alphabet[0x80] = {
686 '@', 0xa3, '$', 0xa5, 0xe8, 0xe9, 0xf9, 0xec,
687 0xf2, 0xc7, '\n', 0xd8, 0xf8, '\r', 0xc5, 0xe5,
688 0x394, '_', 0x3a6, 0x393, 0x39b, 0x3a9, 0x3a0, 0x3a8,
689 0x3a3, 0x398, 0x39e, 0xa0, 0xc6, 0xe6, 0xdf, 0xc9,
690 ' ', '!', '\"', '#', 0xa4, '%', '&', '\'',
691 '(', ')', '*', '+', ',', '-', '.', '/',
692 '0', '1', '2', '3', '4', '5', '6', '7',
693 '8', '9', ':', ';', '<', '=', '>', '?',
694 0xa1, 'A', 'B', 'C', 'D', 'E', 'F', 'G',
695 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O',
696 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W',
697 'X', 'Y', 'Z', 0xc4, 0xd6, 0xd1, 0xdc, 0xa7,
698 0xbf, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
699 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
700 'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
701 'x', 'y', 'z', 0xe4, 0xf6, 0xf1, 0xfc, 0xe0
705 GSM_to_UNICHAR(guint8 c)
707 if (c < G_N_ELEMENTS(gsm_default_alphabet))
708 return gsm_default_alphabet[c];
714 GSMext_to_UNICHAR(guint8 c)
718 case 0x0a: return 0x0c; /* form feed */
719 case 0x14: return '^';
720 case 0x28: return '{';
721 case 0x29: return '}';
722 case 0x2f: return '\\';
723 case 0x3c: return '[';
724 case 0x3d: return '~';
725 case 0x3e: return ']';
726 case 0x40: return '|';
727 case 0x65: return 0x20ac; /* euro */
730 return UNREPL; /* invalid character */
733 #define GN_BYTE_MASK ((1 << bits) - 1)
735 #define GN_CHAR_ESCAPE 0x1b
738 char_is_escape(unsigned char value)
740 return (value == GN_CHAR_ESCAPE);
744 handle_ts_23_038_char(wmem_strbuf_t *strbuf, guint8 code_point,
749 if (char_is_escape(code_point)) {
751 * XXX - if saw_escape is TRUE here, then this is
752 * the case where we escape to "another extension table",
753 * but TS 128 038 V11.0 doesn't specify such an extension
759 * Have we seen an escape?
763 uchar = GSMext_to_UNICHAR(code_point);
765 uchar = GSM_to_UNICHAR(code_point);
767 wmem_strbuf_append_unichar(strbuf, uchar);
773 get_ts_23_038_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
774 const gint bit_offset, gint no_of_chars)
776 wmem_strbuf_t *strbuf;
777 gint char_count; /* character counter for string */
778 guint8 in_byte, out_byte, rest = 0x00;
779 const guint8 *start_ptr = ptr;
780 gboolean saw_escape = FALSE;
783 strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);
785 bits = bit_offset & 0x07;
790 for(char_count = 0; char_count < no_of_chars; ptr++) {
791 /* Get the next byte from the string. */
795 * Combine the bits we've accumulated with bits from
796 * that byte to make a 7-bit code point.
798 out_byte = ((in_byte & GN_BYTE_MASK) << (7 - bits)) | rest;
801 * Leftover bits used in that code point.
803 rest = in_byte >> bits;
806 * If we don't start from 0th bit, we shouldn't go to the
807 * next char. Under *out_num we have now 0 and under Rest -
808 * _first_ part of the char.
810 if ((start_ptr != ptr) || (bits == 7)) {
811 saw_escape = handle_ts_23_038_char(strbuf, out_byte,
817 * After reading 7 octets we have read 7 full characters
818 * but we have 7 bits as well. This is the next character.
820 if ((bits == 1) && (char_count < no_of_chars)) {
821 saw_escape = handle_ts_23_038_char(strbuf, rest,
833 * Escape not followed by anything.
835 * XXX - for now, show the escape as a REPLACEMENT
838 wmem_strbuf_append_unichar(strbuf, UNREPL);
841 return (guint8 *)wmem_strbuf_finalize(strbuf);
845 get_ascii_7bits_string(wmem_allocator_t *scope, const guint8 *ptr,
846 const gint bit_offset, gint no_of_chars)
848 wmem_strbuf_t *strbuf;
849 gint char_count; /* character counter for string */
850 guint8 in_byte, out_byte, rest = 0x00;
851 const guint8 *start_ptr = ptr;
854 bits = bit_offset & 0x07;
859 strbuf = wmem_strbuf_sized_new(scope, no_of_chars+1, 0);
860 for(char_count = 0; char_count < no_of_chars; ptr++) {
861 /* Get the next byte from the string. */
865 * Combine the bits we've accumulated with bits from
866 * that byte to make a 7-bit code point.
868 out_byte = (in_byte >> (8 - bits)) | rest;
871 * Leftover bits used in that code point.
873 rest = (in_byte << (bits - 1)) & 0x7f;
876 * If we don't start from 0th bit, we shouldn't go to the
877 * next char. Under *out_num we have now 0 and under Rest -
878 * _first_ part of the char.
880 if ((start_ptr != ptr) || (bits == 7)) {
881 wmem_strbuf_append_c(strbuf, out_byte);
886 * After reading 7 octets we have read 7 full characters
887 * but we have 7 bits as well. This is the next character.
889 if ((bits == 1) && (char_count < no_of_chars)) {
890 wmem_strbuf_append_c(strbuf, rest);
899 return (guint8 *)wmem_strbuf_finalize(strbuf);
902 /* ASCII/EBCDIC conversion tables from
903 * http://www.room42.com/store/computer_center/code_tables.shtml
906 static const guint8 ASCII_translate_EBCDIC [ 256 ] = {
907 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
908 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
909 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
910 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
911 0x40, 0x5A, 0x7F, 0x7B, 0x5B, 0x6C, 0x50, 0x7D, 0x4D,
912 0x5D, 0x5C, 0x4E, 0x6B, 0x60, 0x4B, 0x61,
913 0xF0, 0xF1, 0xF2, 0xF3, 0xF4, 0xF5, 0xF6, 0xF7, 0xF8,
914 0xF9, 0x7A, 0x5E, 0x4C, 0x7E, 0x6E, 0x6F,
915 0x7C, 0xC1, 0xC2, 0xC3, 0xC4, 0xC5, 0xC6, 0xC7, 0xC8,
916 0xC9, 0xD1, 0xD2, 0xD3, 0xD4, 0xD5, 0xD6,
917 0xD7, 0xD8, 0xD9, 0xE2, 0xE3, 0xE4, 0xE5, 0xE6, 0xE7,
918 0xE8, 0xE9, 0xAD, 0xE0, 0xBD, 0x5F, 0x6D,
919 0x7D, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88,
920 0x89, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96,
921 0x97, 0x98, 0x99, 0xA2, 0xA3, 0xA4, 0xA5, 0xA6, 0xA7,
922 0xA8, 0xA9, 0xC0, 0x6A, 0xD0, 0xA1, 0x4B,
923 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
924 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
925 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
926 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
927 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
928 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
929 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
930 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
931 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
932 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
933 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
934 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
935 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
936 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
937 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B,
938 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B, 0x4B
942 ASCII_to_EBCDIC(guint8 *buf, guint bytes)
949 for (i = 0; i < bytes; i++, bufptr++) {
950 *bufptr = ASCII_translate_EBCDIC[*bufptr];
955 ASCII_to_EBCDIC1(guint8 c)
957 return ASCII_translate_EBCDIC[c];
961 static const guint8 EBCDIC_translate_ASCII [ 256 ] = {
962 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
963 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
964 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18,
965 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F,
966 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28,
967 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, 0x2F,
968 0x2E, 0x2E, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
969 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x2E, 0x3F,
970 0x20, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
971 0x2E, 0x2E, 0x2E, 0x3C, 0x28, 0x2B, 0x7C,
972 0x26, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
973 0x2E, 0x21, 0x24, 0x2A, 0x29, 0x3B, 0x5E,
974 0x2D, 0x2F, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
975 0x2E, 0x7C, 0x2C, 0x25, 0x5F, 0x3E, 0x3F,
976 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
977 0x2E, 0x3A, 0x23, 0x40, 0x27, 0x3D, 0x22,
978 0x2E, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
979 0x69, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
980 0x2E, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71,
981 0x72, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
982 0x2E, 0x7E, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
983 0x7A, 0x2E, 0x2E, 0x2E, 0x5B, 0x2E, 0x2E,
984 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
985 0x2E, 0x2E, 0x2E, 0x2E, 0x5D, 0x2E, 0x2E,
986 0x7B, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
987 0x49, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
988 0x7D, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51,
989 0x52, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
990 0x5C, 0x2E, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
991 0x5A, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E,
992 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
993 0x39, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E
997 EBCDIC_to_ASCII(guint8 *buf, guint bytes)
1004 for (i = 0; i < bytes; i++, bufptr++) {
1005 *bufptr = EBCDIC_translate_ASCII[*bufptr];
1010 EBCDIC_to_ASCII1(guint8 c)
1012 return EBCDIC_translate_ASCII[c];
1016 * Given a wmem scope, a pointer, and a length, and a translation table,
1017 * treat the string of bytes referred to by the pointer and length as a
1018 * string encoded in EBCDIC using one octet per character, and return a
1019 * pointer to a UTF-8 string, allocated using the wmem scope.
1022 get_ebcdic_string(wmem_allocator_t *scope, const guint8 *ptr, gint length)
1026 str = wmem_strbuf_sized_new(scope, length+1, 0);
1028 while (length > 0) {
1031 wmem_strbuf_append_unichar(str, EBCDIC_translate_ASCII[ch]);
1036 return (guint8 *) wmem_strbuf_finalize(str);
1040 * Editor modelines - http://www.wireshark.org/tools/modelines.html
1045 * indent-tabs-mode: nil
1048 * vi: set shiftwidth=4 tabstop=8 expandtab:
1049 * :indentSize=4:tabSize=8:noTabs=true: