Line data Source code
1 : /*
2 : * Copyright (c) 2004, 2006, 2007, 2008 Kungliga Tekniska Högskolan
3 : * (Royal Institute of Technology, Stockholm, Sweden).
4 : * All rights reserved.
5 : *
6 : * Redistribution and use in source and binary forms, with or without
7 : * modification, are permitted provided that the following conditions
8 : * are met:
9 : *
10 : * 1. Redistributions of source code must retain the above copyright
11 : * notice, this list of conditions and the following disclaimer.
12 : *
13 : * 2. Redistributions in binary form must reproduce the above copyright
14 : * notice, this list of conditions and the following disclaimer in the
15 : * documentation and/or other materials provided with the distribution.
16 : *
17 : * 3. Neither the name of the Institute nor the names of its contributors
18 : * may be used to endorse or promote products derived from this software
19 : * without specific prior written permission.
20 : *
21 : * THIS SOFTWARE IS PROVIDED BY THE INSTITUTE AND CONTRIBUTORS ``AS IS'' AND
22 : * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23 : * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24 : * ARE DISCLAIMED. IN NO EVENT SHALL THE INSTITUTE OR CONTRIBUTORS BE LIABLE
25 : * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26 : * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27 : * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 : * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29 : * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30 : * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31 : * SUCH DAMAGE.
32 : */
33 :
34 : #include <config.h>
35 : #include "windlocl.h"
36 :
37 : static int
38 3901584 : utf8toutf32(const unsigned char **pp, uint32_t *out)
39 : {
40 3901584 : const unsigned char *p = *pp;
41 3901584 : uint32_t c = *p;
42 143942 : uint32_t out_val;
43 :
44 3901584 : if (c & 0x80) {
45 1452 : if ((c & 0xE0) == 0xC0) {
46 44 : const uint32_t c2 = *++p;
47 44 : if ((c2 & 0xC0) == 0x80) {
48 44 : out_val = ((c & 0x1F) << 6)
49 44 : | (c2 & 0x3F);
50 44 : if (out_val < 0x80) {
51 0 : return WIND_ERR_INVALID_UTF8;
52 : }
53 : } else {
54 0 : return WIND_ERR_INVALID_UTF8;
55 : }
56 1408 : } else if ((c & 0xF0) == 0xE0) {
57 1396 : const uint32_t c2 = *++p;
58 1396 : if ((c2 & 0xC0) == 0x80) {
59 1396 : const uint32_t c3 = *++p;
60 1396 : if ((c3 & 0xC0) == 0x80) {
61 1396 : out_val = ((c & 0x0F) << 12)
62 1396 : | ((c2 & 0x3F) << 6)
63 1396 : | (c3 & 0x3F);
64 1396 : if (out_val < 0x800) {
65 0 : return WIND_ERR_INVALID_UTF8;
66 : }
67 : } else {
68 0 : return WIND_ERR_INVALID_UTF8;
69 : }
70 : } else {
71 0 : return WIND_ERR_INVALID_UTF8;
72 : }
73 12 : } else if ((c & 0xF8) == 0xF0) {
74 12 : const uint32_t c2 = *++p;
75 12 : if ((c2 & 0xC0) == 0x80) {
76 12 : const uint32_t c3 = *++p;
77 12 : if ((c3 & 0xC0) == 0x80) {
78 12 : const uint32_t c4 = *++p;
79 12 : if ((c4 & 0xC0) == 0x80) {
80 12 : out_val = ((c & 0x07) << 18)
81 12 : | ((c2 & 0x3F) << 12)
82 12 : | ((c3 & 0x3F) << 6)
83 12 : | (c4 & 0x3F);
84 12 : if (out_val < 0x10000) {
85 0 : return WIND_ERR_INVALID_UTF8;
86 : }
87 : } else {
88 0 : return WIND_ERR_INVALID_UTF8;
89 : }
90 : } else {
91 0 : return WIND_ERR_INVALID_UTF8;
92 : }
93 : } else {
94 0 : return WIND_ERR_INVALID_UTF8;
95 : }
96 : } else {
97 0 : return WIND_ERR_INVALID_UTF8;
98 : }
99 : } else {
100 3756190 : out_val = c;
101 : }
102 :
103 : /* Allow unpaired surrogates (in the range 0xd800–0xdfff). */
104 :
105 3901584 : if (out_val > 0x10ffff) {
106 0 : return WIND_ERR_INVALID_UTF8;
107 : }
108 :
109 3901584 : *out = out_val;
110 3901584 : *pp = p;
111 :
112 3901584 : return 0;
113 : }
114 :
115 : /**
116 : * Convert an UTF-8 string to an UCS4 string.
117 : *
118 : * @param in an UTF-8 string to convert.
119 : * @param out the resulting UCS4 string, must be at least
120 : * wind_utf8ucs4_length() long. If out is NULL, the function will
121 : * calculate the needed space for the out variable (just like
122 : * wind_utf8ucs4_length()).
123 : * @param out_len before processing out_len should be the length of
124 : * the out variable, after processing it will be the length of the out
125 : * string.
126 : *
127 : * @return returns 0 on success, an wind error code otherwise
128 : * @ingroup wind
129 : */
130 :
131 : int
132 5620 : wind_utf8ucs4(const char *in, uint32_t *out, size_t *out_len)
133 : {
134 0 : const unsigned char *p;
135 5620 : size_t o = 0;
136 0 : int ret;
137 :
138 89920 : for (p = (const unsigned char *)in; *p != '\0'; ++p) {
139 0 : uint32_t u;
140 :
141 84300 : ret = utf8toutf32(&p, &u);
142 84300 : if (ret)
143 0 : return ret;
144 :
145 84300 : if (out) {
146 42150 : if (o >= *out_len)
147 0 : return WIND_ERR_OVERRUN;
148 42150 : out[o] = u;
149 : }
150 84300 : o++;
151 : }
152 5620 : *out_len = o;
153 5620 : return 0;
154 : }
155 :
/**
 * Calculate the number of UCS4 code points produced by converting a
 * UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len on success, the length of the resulting UCS4 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs4_length(const char *in, size_t *out_len)
{
    /* A NULL output buffer makes wind_utf8ucs4() count without storing. */
    const int status = wind_utf8ucs4(in, NULL, out_len);

    return status;
}
172 :
/*
 * Marker bits OR'ed into the lead byte of a UTF-8 sequence, indexed by
 * (sequence length - 1).  The element type is unsigned char because
 * 0xC0, 0xE0 and 0xF0 do not fit in a signed char.
 */
static const unsigned char first_char[4] =
    { 0x00, 0xC0, 0xE0, 0xF0 };
175 :
176 : /**
177 : * Convert an UCS4 string to a UTF-8 string.
178 : *
179 : * @param in an UCS4 string to convert.
180 : * @param in_len the length input array.
181 :
182 : * @param out the resulting UTF-8 string, must be at least
183 : * wind_ucs4utf8_length() + 1 long (the extra char for the NUL). If
184 : * out is NULL, the function will calculate the needed space for the
185 : * out variable (just like wind_ucs4utf8_length()).
186 :
187 : * @param out_len before processing out_len should be the length of
188 : * the out variable, after processing it will be the length of the out
189 : * string.
190 : *
191 : * @return returns 0 on success, an wind error code otherwise
192 : * @ingroup wind
193 : */
194 :
195 : int
196 0 : wind_ucs4utf8(const uint32_t *in, size_t in_len, char *out, size_t *out_len)
197 : {
198 0 : uint32_t ch;
199 0 : size_t i, len, o;
200 :
201 0 : for (o = 0, i = 0; i < in_len; i++) {
202 0 : ch = in[i];
203 :
204 0 : if (ch < 0x80) {
205 0 : len = 1;
206 0 : } else if (ch < 0x800) {
207 0 : len = 2;
208 0 : } else if (ch < 0x10000) {
209 0 : len = 3;
210 0 : } else if (ch <= 0x10FFFF) {
211 0 : len = 4;
212 : } else
213 0 : return WIND_ERR_INVALID_UTF32;
214 :
215 0 : o += len;
216 :
217 0 : if (out) {
218 0 : if (o >= *out_len)
219 0 : return WIND_ERR_OVERRUN;
220 :
221 0 : switch(len) {
222 0 : case 4:
223 0 : out[3] = (ch | 0x80) & 0xbf;
224 0 : ch = ch >> 6;
225 0 : HEIM_FALLTHROUGH;
226 0 : case 3:
227 0 : out[2] = (ch | 0x80) & 0xbf;
228 0 : ch = ch >> 6;
229 0 : HEIM_FALLTHROUGH;
230 0 : case 2:
231 0 : out[1] = (ch | 0x80) & 0xbf;
232 0 : ch = ch >> 6;
233 0 : HEIM_FALLTHROUGH;
234 0 : case 1:
235 0 : out[0] = ch | first_char[len - 1];
236 0 : HEIM_FALLTHROUGH;
237 0 : default:
238 0 : break;
239 : }
240 0 : out += len;
241 : }
242 : }
243 0 : if (out) {
244 0 : if (o + 1 >= *out_len)
245 0 : return WIND_ERR_OVERRUN;
246 0 : *out = '\0';
247 : }
248 0 : *out_len = o;
249 0 : return 0;
250 : }
251 :
/**
 * Calculate the length of the UTF-8 string produced by converting a
 * UCS4 string.
 *
 * @param in the UCS4 string to measure.
 * @param in_len the number of elements in the UCS4 string.
 * @param out_len on success, the length in bytes of the resulting
 * UTF-8 string, not counting the terminating NUL.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs4utf8_length(const uint32_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL output buffer makes wind_ucs4utf8() count without storing. */
    const int status = wind_ucs4utf8(in, in_len, NULL, out_len);

    return status;
}
268 :
269 : /**
270 : * Read in an UCS2 from a buffer.
271 : *
272 : * @param ptr The input buffer to read from.
273 : * @param len the length of the input buffer.
274 : * @param flags Flags to control the behavior of the function.
275 : * @param out the output UCS2, the array must be at least out/2 long.
276 : * @param out_len the output length
277 : *
278 : * @return returns 0 on success, an wind error code otherwise.
279 : * @ingroup wind
280 : */
281 :
282 : int
283 414536 : wind_ucs2read(const void *ptr, size_t len, unsigned int *flags,
284 : uint16_t *out, size_t *out_len)
285 : {
286 414536 : const unsigned char *p = ptr;
287 414536 : int little = ((*flags) & WIND_RW_LE);
288 414536 : size_t olen = *out_len;
289 :
290 : /** if len is zero, flags are unchanged */
291 414536 : if (len == 0) {
292 0 : *out_len = 0;
293 0 : return 0;
294 : }
295 :
296 : /** if len is odd, WIND_ERR_LENGTH_NOT_MOD2 is returned */
297 414536 : if (len & 1)
298 0 : return WIND_ERR_LENGTH_NOT_MOD2;
299 :
300 : /**
301 : * If the flags WIND_RW_BOM is set, check for BOM. If not BOM is
302 : * found, check is LE/BE flag is already and use that otherwise
303 : * fail with WIND_ERR_NO_BOM. When done, clear WIND_RW_BOM and
304 : * the LE/BE flag and set the resulting LE/BE flag.
305 : */
306 414536 : if ((*flags) & WIND_RW_BOM) {
307 0 : uint16_t bom = (p[0] << 8) + p[1];
308 0 : if (bom == 0xfffe || bom == 0xfeff) {
309 0 : little = (bom == 0xfffe);
310 0 : p += 2;
311 0 : len -= 2;
312 0 : } else if (((*flags) & (WIND_RW_LE|WIND_RW_BE)) != 0) {
313 : /* little already set */
314 : } else
315 0 : return WIND_ERR_NO_BOM;
316 0 : *flags = ((*flags) & ~(WIND_RW_BOM|WIND_RW_LE|WIND_RW_BE));
317 0 : *flags |= little ? WIND_RW_LE : WIND_RW_BE;
318 : }
319 :
320 9028403 : while (len) {
321 8613867 : if (olen < 1)
322 0 : return WIND_ERR_OVERRUN;
323 8613867 : if (little)
324 8613867 : *out = (p[1] << 8) + p[0];
325 : else
326 0 : *out = (p[0] << 8) + p[1];
327 8613867 : out++; p += 2; len -= 2; olen--;
328 : }
329 414536 : *out_len -= olen;
330 414536 : return 0;
331 : }
332 :
333 : /**
334 : * Write an UCS2 string to a buffer.
335 : *
336 : * @param in The input UCS2 string.
337 : * @param in_len the length of the input buffer.
338 : * @param flags Flags to control the behavior of the function.
339 : * @param ptr The input buffer to write to, the array must be at least
340 : * (in + 1) * 2 bytes long.
341 : * @param out_len the output length
342 : *
343 : * @return returns 0 on success, an wind error code otherwise.
344 : * @ingroup wind
345 : */
346 :
347 : int
348 103179 : wind_ucs2write(const uint16_t *in, size_t in_len, unsigned int *flags,
349 : void *ptr, size_t *out_len)
350 : {
351 103179 : unsigned char *p = ptr;
352 103179 : size_t len = *out_len;
353 :
354 : /** If in buffer is not of length be mod 2, WIND_ERR_LENGTH_NOT_MOD2 is returned*/
355 103179 : if (len & 1)
356 0 : return WIND_ERR_LENGTH_NOT_MOD2;
357 :
358 : /** On zero input length, flags are preserved */
359 103179 : if (in_len == 0) {
360 0 : *out_len = 0;
361 0 : return 0;
362 : }
363 : /** If flags have WIND_RW_BOM set, the byte order mark is written
364 : * first to the output data */
365 103179 : if ((*flags) & WIND_RW_BOM) {
366 0 : uint16_t bom = 0xfffe;
367 :
368 0 : if (len < 2)
369 0 : return WIND_ERR_OVERRUN;
370 :
371 0 : if ((*flags) & WIND_RW_LE) {
372 0 : p[0] = (bom ) & 0xff;
373 0 : p[1] = (bom >> 8) & 0xff;
374 : } else {
375 0 : p[1] = (bom ) & 0xff;
376 0 : p[0] = (bom >> 8) & 0xff;
377 : }
378 0 : len -= 2;
379 : }
380 :
381 1984732 : while (in_len) {
382 : /** If the output wont fit into out_len, WIND_ERR_OVERRUN is returned */
383 1881553 : if (len < 2)
384 0 : return WIND_ERR_OVERRUN;
385 1881553 : if ((*flags) & WIND_RW_LE) {
386 1881553 : p[0] = (in[0] ) & 0xff;
387 1881553 : p[1] = (in[0] >> 8) & 0xff;
388 : } else {
389 0 : p[1] = (in[0] ) & 0xff;
390 0 : p[0] = (in[0] >> 8) & 0xff;
391 : }
392 1881553 : len -= 2;
393 1881553 : in_len--;
394 1881553 : p += 2;
395 1881553 : in++;
396 : }
397 103179 : *out_len -= len;
398 103179 : return 0;
399 : }
400 :
401 :
402 : /**
403 : * Convert an UTF-8 string to an UCS2 string.
404 : *
405 : * @param in an UTF-8 string to convert.
406 : * @param out the resulting UCS2 string, must be at least
407 : * wind_utf8ucs2_length() long. If out is NULL, the function will
408 : * calculate the needed space for the out variable (just like
409 : * wind_utf8ucs2_length()).
410 : * @param out_len before processing out_len should be the length of
411 : * the out variable, after processing it will be the length of the out
412 : * string.
413 : *
414 : * @return returns 0 on success, an wind error code otherwise
415 : * @ingroup wind
416 : */
417 :
418 : int
419 208790 : wind_utf8ucs2(const char *in, uint16_t *out, size_t *out_len)
420 : {
421 7808 : const unsigned char *p;
422 208790 : size_t o = 0;
423 7808 : int ret;
424 :
425 4026074 : for (p = (const unsigned char *)in; *p != '\0'; ++p) {
426 143942 : uint32_t u;
427 :
428 3817284 : ret = utf8toutf32(&p, &u);
429 3817284 : if (ret)
430 0 : return ret;
431 :
432 3817284 : if (u >= 0x10000) {
433 12 : if (out) {
434 0 : uint16_t high_ten_bits;
435 0 : uint16_t low_ten_bits;
436 :
437 6 : if (o + 2 > *out_len)
438 0 : return WIND_ERR_OVERRUN;
439 :
440 6 : u -= 0x10000;
441 6 : high_ten_bits = (u >> 10) & 0x3ff;
442 6 : low_ten_bits = u & 0x3ff;
443 :
444 6 : out[o] = 0xd800 | high_ten_bits;
445 6 : out[o+1] = 0xdc00 | low_ten_bits;
446 : }
447 12 : o += 2;
448 : } else {
449 3817272 : if (out) {
450 1908636 : if (o >= *out_len)
451 0 : return WIND_ERR_OVERRUN;
452 1908636 : out[o] = u;
453 : }
454 3817272 : o++;
455 : }
456 : }
457 208790 : *out_len = o;
458 208790 : return 0;
459 : }
460 :
/**
 * Calculate the number of UCS2 code units produced by converting a
 * UTF-8 string.
 *
 * @param in the UTF-8 string to measure.
 * @param out_len on success, the length of the resulting UCS2 string.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_utf8ucs2_length(const char *in, size_t *out_len)
{
    /* A NULL output buffer makes wind_utf8ucs2() count without storing. */
    const int status = wind_utf8ucs2(in, NULL, out_len);

    return status;
}
477 :
478 : /**
479 : * Convert an UCS2 string to a UTF-8 string.
480 : *
481 : * @param in an UCS2 string to convert.
482 : * @param in_len the length of the in UCS2 string.
483 : * @param out the resulting UTF-8 string, must be at least
484 : * wind_ucs2utf8_length() long. If out is NULL, the function will
485 : * calculate the needed space for the out variable (just like
486 : * wind_ucs2utf8_length()).
487 : * @param out_len before processing out_len should be the length of
488 : * the out variable, after processing it will be the length of the out
489 : * string.
490 : *
491 : * @return returns 0 on success, an wind error code otherwise
492 : * @ingroup wind
493 : */
494 :
495 : int
496 829072 : wind_ucs2utf8(const uint16_t *in, size_t in_len, char *out, size_t *out_len)
497 : {
498 20480 : uint32_t ch;
499 20480 : size_t i, len, o;
500 :
501 18056764 : for (o = 0, i = 0; i < in_len; i++) {
502 17227692 : ch = in[i];
503 :
504 17227692 : if (ch < 0x80) {
505 16794050 : len = 1;
506 42 : } else if (ch < 0x800) {
507 0 : len = 2;
508 42 : } else if (ch < 0xd800 || ch >= 0xe000) {
509 0 : len = 3;
510 42 : } else if (ch < 0xdc00) {
511 : /* A high surrogate. */
512 42 : if (i < in_len - 1) {
513 42 : uint16_t ch2 = in[i + 1];
514 :
515 42 : if (ch2 >= 0xdc00 && ch2 < 0xe000) {
516 0 : uint16_t high_ten_bits;
517 0 : uint16_t low_ten_bits;
518 :
519 : /* A surrogate pair. */
520 42 : high_ten_bits = ch & 0x3ff;
521 42 : low_ten_bits = ch2 & 0x3ff;
522 :
523 42 : ch = 0x10000 + ((uint32_t)high_ten_bits << 10 | low_ten_bits);
524 42 : len = 4;
525 42 : ++i;
526 : } else {
527 : /* An unpaired high surrogate. */
528 0 : len = 3;
529 : }
530 : } else {
531 : /* An unpaired high surrogate. */
532 0 : len = 3;
533 : }
534 : } else {
535 : /* An unpaired low surrogate. */
536 0 : len = 3;
537 : }
538 :
539 17227692 : o += len;
540 :
541 17227692 : if (out) {
542 8613846 : if (o >= *out_len)
543 0 : return WIND_ERR_OVERRUN;
544 :
545 8613846 : switch(len) {
546 21 : case 4:
547 21 : out[3] = (ch | 0x80) & 0xbf;
548 21 : ch = ch >> 6;
549 0 : HEIM_FALLTHROUGH;
550 21 : case 3:
551 21 : out[2] = (ch | 0x80) & 0xbf;
552 21 : ch = ch >> 6;
553 0 : HEIM_FALLTHROUGH;
554 21 : case 2:
555 21 : out[1] = (ch | 0x80) & 0xbf;
556 21 : ch = ch >> 6;
557 216800 : HEIM_FALLTHROUGH;
558 8613846 : case 1:
559 8613846 : out[0] = ch | first_char[len - 1];
560 216800 : HEIM_FALLTHROUGH;
561 8397046 : default:
562 8613846 : break;
563 : }
564 8613846 : out += len;
565 : }
566 : }
567 829072 : if (out) {
568 414536 : if (o >= *out_len)
569 0 : return WIND_ERR_OVERRUN;
570 414536 : *out = '\0';
571 : }
572 829072 : *out_len = o;
573 829072 : return 0;
574 : }
575 :
/**
 * Calculate the length of the UTF-8 string produced by converting a
 * UCS2 string.
 *
 * @param in the UCS2 string to measure.
 * @param in_len the number of elements in the UCS2 string.
 * @param out_len on success, the length in bytes of the resulting
 * UTF-8 string, not counting the terminating NUL.
 *
 * @return returns 0 on success, a wind error code otherwise
 * @ingroup wind
 */

int
wind_ucs2utf8_length(const uint16_t *in, size_t in_len, size_t *out_len)
{
    /* A NULL output buffer makes wind_ucs2utf8() count without storing. */
    const int status = wind_ucs2utf8(in, in_len, NULL, out_len);

    return status;
}
|