14a16efa3SDimitry Andric /*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
24a16efa3SDimitry Andric *
3e6d15924SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4e6d15924SDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
5e6d15924SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
64a16efa3SDimitry Andric *
74a16efa3SDimitry Andric *===------------------------------------------------------------------------=*/
84a16efa3SDimitry Andric /*
9e3b55780SDimitry Andric * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
10e3b55780SDimitry Andric * Distributed under the Terms of Use in
11e3b55780SDimitry Andric * http://www.unicode.org/copyright.html.
124a16efa3SDimitry Andric *
13e3b55780SDimitry Andric * Permission is hereby granted, free of charge, to any person obtaining
14e3b55780SDimitry Andric * a copy of the Unicode data files and any associated documentation
15e3b55780SDimitry Andric * (the "Data Files") or Unicode software and any associated documentation
16e3b55780SDimitry Andric * (the "Software") to deal in the Data Files or Software
17e3b55780SDimitry Andric * without restriction, including without limitation the rights to use,
18e3b55780SDimitry Andric * copy, modify, merge, publish, distribute, and/or sell copies of
19e3b55780SDimitry Andric * the Data Files or Software, and to permit persons to whom the Data Files
20e3b55780SDimitry Andric * or Software are furnished to do so, provided that
21e3b55780SDimitry Andric * (a) this copyright and permission notice appear with all copies
22e3b55780SDimitry Andric * of the Data Files or Software,
23e3b55780SDimitry Andric * (b) this copyright and permission notice appear in associated
24e3b55780SDimitry Andric * documentation, and
25e3b55780SDimitry Andric * (c) there is clear notice in each modified Data File or in the Software
26e3b55780SDimitry Andric * as well as in the documentation associated with the Data File(s) or
27e3b55780SDimitry Andric * Software that the data or software has been modified.
284a16efa3SDimitry Andric *
29e3b55780SDimitry Andric * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
30e3b55780SDimitry Andric * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
31e3b55780SDimitry Andric * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
32e3b55780SDimitry Andric * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
33e3b55780SDimitry Andric * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
34e3b55780SDimitry Andric * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
35e3b55780SDimitry Andric * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
36e3b55780SDimitry Andric * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
37e3b55780SDimitry Andric * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
38e3b55780SDimitry Andric * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
394a16efa3SDimitry Andric *
40e3b55780SDimitry Andric * Except as contained in this notice, the name of a copyright holder
41e3b55780SDimitry Andric * shall not be used in advertising or otherwise to promote the sale,
42e3b55780SDimitry Andric * use or other dealings in these Data Files or Software without prior
43e3b55780SDimitry Andric * written authorization of the copyright holder.
444a16efa3SDimitry Andric */
454a16efa3SDimitry Andric
464a16efa3SDimitry Andric /* ---------------------------------------------------------------------
474a16efa3SDimitry Andric
484a16efa3SDimitry Andric Conversions between UTF32, UTF-16, and UTF-8. Source code file.
494a16efa3SDimitry Andric Author: Mark E. Davis, 1994.
504a16efa3SDimitry Andric Rev History: Rick McGowan, fixes & updates May 2001.
514a16efa3SDimitry Andric Sept 2001: fixed const & error conditions per
524a16efa3SDimitry Andric mods suggested by S. Parent & A. Lillich.
534a16efa3SDimitry Andric June 2002: Tim Dodd added detection and handling of incomplete
544a16efa3SDimitry Andric source sequences, enhanced error detection, added casts
554a16efa3SDimitry Andric to eliminate compiler warnings.
564a16efa3SDimitry Andric July 2003: slight mods to back out aggressive FFFE detection.
574a16efa3SDimitry Andric Jan 2004: updated switches in from-UTF8 conversions.
584a16efa3SDimitry Andric Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
594a16efa3SDimitry Andric
604a16efa3SDimitry Andric See the header file "ConvertUTF.h" for complete documentation.
614a16efa3SDimitry Andric
624a16efa3SDimitry Andric ------------------------------------------------------------------------ */
634a16efa3SDimitry Andric
644a16efa3SDimitry Andric #include "llvm/Support/ConvertUTF.h"
654a16efa3SDimitry Andric #ifdef CVTUTF_DEBUG
664a16efa3SDimitry Andric #include <stdio.h>
674a16efa3SDimitry Andric #endif
685ca98fd9SDimitry Andric #include <assert.h>
694a16efa3SDimitry Andric
/*
 * The converters in this file rely on deliberate switch fall-through.
 * Silence -Wimplicit-fallthrough from here on for compilers that know the
 * diagnostic. ConvertUTF_RESTORE_WARNINGS is the matching "pop" and is
 * expected to be expanded later in the file.
 */
#if defined(__clang__) && defined(__has_warning)
/* __has_warning must sit on its own #if line: other preprocessors would
 * fail to parse it if it shared an expression with the defined() test. */
#  if __has_warning("-Wimplicit-fallthrough")
#    define ConvertUTF_DISABLE_WARNINGS \
       _Pragma("clang diagnostic push") \
       _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#    define ConvertUTF_RESTORE_WARNINGS \
       _Pragma("clang diagnostic pop")
#  endif
#elif defined(__GNUC__) && __GNUC__ > 6
#  define ConvertUTF_DISABLE_WARNINGS \
     _Pragma("GCC diagnostic push") \
     _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
     _Pragma("GCC diagnostic pop")
#endif

/* Any compiler not handled above gets empty expansions. */
#ifndef ConvertUTF_DISABLE_WARNINGS
#  define ConvertUTF_DISABLE_WARNINGS
#endif
#ifndef ConvertUTF_RESTORE_WARNINGS
#  define ConvertUTF_RESTORE_WARNINGS
#endif

ConvertUTF_DISABLE_WARNINGS
97ab44ce3dSDimitry Andric
98b915e9e0SDimitry Andric namespace llvm {
99b915e9e0SDimitry Andric
1004a16efa3SDimitry Andric static const int halfShift = 10; /* used for shifting by 10 bits */
1014a16efa3SDimitry Andric
1024a16efa3SDimitry Andric static const UTF32 halfBase = 0x0010000UL;
1034a16efa3SDimitry Andric static const UTF32 halfMask = 0x3FFUL;
1044a16efa3SDimitry Andric
1054a16efa3SDimitry Andric #define UNI_SUR_HIGH_START (UTF32)0xD800
1064a16efa3SDimitry Andric #define UNI_SUR_HIGH_END (UTF32)0xDBFF
1074a16efa3SDimitry Andric #define UNI_SUR_LOW_START (UTF32)0xDC00
1084a16efa3SDimitry Andric #define UNI_SUR_LOW_END (UTF32)0xDFFF
1094a16efa3SDimitry Andric
1104a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
1114a16efa3SDimitry Andric
/*
 * Index into the table below with the first byte of a UTF-8 sequence to
 * get the number of trailing bytes that are supposed to follow it.
 * Note that *legal* UTF-8 never has 4 or 5 trailing bytes (that is, 5- or
 * 6-byte sequences). Those entries are left as-is for anyone who may want
 * to do such conversion, which was allowed in earlier algorithms.
 * Bytes 0x80..0xBF (continuation bytes) map to 0 as well; the table alone
 * does not reject them as lead bytes.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 - 0x1F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 - 0x3F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 - 0x5F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 - 0x7F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 - 0x9F */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 - 0xBF */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 - 0xDF */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 - 0xFF */
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
1294a16efa3SDimitry Andric
1304a16efa3SDimitry Andric /*
1314a16efa3SDimitry Andric * Magic values subtracted from a buffer value during UTF8 conversion.
1324a16efa3SDimitry Andric * This table contains as many values as there might be trailing bytes
1334a16efa3SDimitry Andric * in a UTF-8 sequence.
1344a16efa3SDimitry Andric */
1354a16efa3SDimitry Andric static const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
1364a16efa3SDimitry Andric 0x03C82080UL, 0xFA082080UL, 0x82082080UL };
1374a16efa3SDimitry Andric
1384a16efa3SDimitry Andric /*
1394a16efa3SDimitry Andric * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
1404a16efa3SDimitry Andric * into the first byte, depending on how many bytes follow. There are
1414a16efa3SDimitry Andric * as many entries in this table as there are UTF-8 sequence types.
1424a16efa3SDimitry Andric * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
1434a16efa3SDimitry Andric * for *legal* UTF-8 will be 4 or fewer bytes total.
1444a16efa3SDimitry Andric */
1454a16efa3SDimitry Andric static const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
1464a16efa3SDimitry Andric
1474a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
1484a16efa3SDimitry Andric
1494a16efa3SDimitry Andric /* The interface converts a whole buffer to avoid function-call overhead.
1504a16efa3SDimitry Andric * Constants have been gathered. Loops & conditionals have been removed as
1514a16efa3SDimitry Andric * much as possible for efficiency, in favor of drop-through switches.
1524a16efa3SDimitry Andric * (See "Note A" at the bottom of the file for equivalent code.)
1534a16efa3SDimitry Andric * If your compiler supports it, the "isLegalUTF8" call can be turned
1544a16efa3SDimitry Andric * into an inline function.
1554a16efa3SDimitry Andric */
1564a16efa3SDimitry Andric
1574a16efa3SDimitry Andric
1584a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
1594a16efa3SDimitry Andric
ConvertUTF32toUTF16(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)1604a16efa3SDimitry Andric ConversionResult ConvertUTF32toUTF16 (
1614a16efa3SDimitry Andric const UTF32** sourceStart, const UTF32* sourceEnd,
1624a16efa3SDimitry Andric UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
1634a16efa3SDimitry Andric ConversionResult result = conversionOK;
1644a16efa3SDimitry Andric const UTF32* source = *sourceStart;
1654a16efa3SDimitry Andric UTF16* target = *targetStart;
1664a16efa3SDimitry Andric while (source < sourceEnd) {
1674a16efa3SDimitry Andric UTF32 ch;
1684a16efa3SDimitry Andric if (target >= targetEnd) {
1694a16efa3SDimitry Andric result = targetExhausted; break;
1704a16efa3SDimitry Andric }
1714a16efa3SDimitry Andric ch = *source++;
1724a16efa3SDimitry Andric if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
1734a16efa3SDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
1744a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
1754a16efa3SDimitry Andric if (flags == strictConversion) {
1764a16efa3SDimitry Andric --source; /* return to the illegal value itself */
1774a16efa3SDimitry Andric result = sourceIllegal;
1784a16efa3SDimitry Andric break;
1794a16efa3SDimitry Andric } else {
1804a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
1814a16efa3SDimitry Andric }
1824a16efa3SDimitry Andric } else {
1834a16efa3SDimitry Andric *target++ = (UTF16)ch; /* normal case */
1844a16efa3SDimitry Andric }
1854a16efa3SDimitry Andric } else if (ch > UNI_MAX_LEGAL_UTF32) {
1864a16efa3SDimitry Andric if (flags == strictConversion) {
1874a16efa3SDimitry Andric result = sourceIllegal;
1884a16efa3SDimitry Andric } else {
1894a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
1904a16efa3SDimitry Andric }
1914a16efa3SDimitry Andric } else {
1924a16efa3SDimitry Andric /* target is a character in range 0xFFFF - 0x10FFFF. */
1934a16efa3SDimitry Andric if (target + 1 >= targetEnd) {
1944a16efa3SDimitry Andric --source; /* Back up source pointer! */
1954a16efa3SDimitry Andric result = targetExhausted; break;
1964a16efa3SDimitry Andric }
1974a16efa3SDimitry Andric ch -= halfBase;
1984a16efa3SDimitry Andric *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
1994a16efa3SDimitry Andric *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
2004a16efa3SDimitry Andric }
2014a16efa3SDimitry Andric }
2024a16efa3SDimitry Andric *sourceStart = source;
2034a16efa3SDimitry Andric *targetStart = target;
2044a16efa3SDimitry Andric return result;
2054a16efa3SDimitry Andric }
2064a16efa3SDimitry Andric
2074a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
2084a16efa3SDimitry Andric
ConvertUTF16toUTF32(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)2094a16efa3SDimitry Andric ConversionResult ConvertUTF16toUTF32 (
2104a16efa3SDimitry Andric const UTF16** sourceStart, const UTF16* sourceEnd,
2114a16efa3SDimitry Andric UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
2124a16efa3SDimitry Andric ConversionResult result = conversionOK;
2134a16efa3SDimitry Andric const UTF16* source = *sourceStart;
2144a16efa3SDimitry Andric UTF32* target = *targetStart;
2154a16efa3SDimitry Andric UTF32 ch, ch2;
2164a16efa3SDimitry Andric while (source < sourceEnd) {
2174a16efa3SDimitry Andric const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2184a16efa3SDimitry Andric ch = *source++;
2194a16efa3SDimitry Andric /* If we have a surrogate pair, convert to UTF32 first. */
2204a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2214a16efa3SDimitry Andric /* If the 16 bits following the high surrogate are in the source buffer... */
2224a16efa3SDimitry Andric if (source < sourceEnd) {
2234a16efa3SDimitry Andric ch2 = *source;
2244a16efa3SDimitry Andric /* If it's a low surrogate, convert to UTF32. */
2254a16efa3SDimitry Andric if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2264a16efa3SDimitry Andric ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2274a16efa3SDimitry Andric + (ch2 - UNI_SUR_LOW_START) + halfBase;
2284a16efa3SDimitry Andric ++source;
2294a16efa3SDimitry Andric } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2304a16efa3SDimitry Andric --source; /* return to the illegal value itself */
2314a16efa3SDimitry Andric result = sourceIllegal;
2324a16efa3SDimitry Andric break;
2334a16efa3SDimitry Andric }
2344a16efa3SDimitry Andric } else { /* We don't have the 16 bits following the high surrogate. */
2354a16efa3SDimitry Andric --source; /* return to the high surrogate */
2364a16efa3SDimitry Andric result = sourceExhausted;
2374a16efa3SDimitry Andric break;
2384a16efa3SDimitry Andric }
2394a16efa3SDimitry Andric } else if (flags == strictConversion) {
2404a16efa3SDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
2414a16efa3SDimitry Andric if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2424a16efa3SDimitry Andric --source; /* return to the illegal value itself */
2434a16efa3SDimitry Andric result = sourceIllegal;
2444a16efa3SDimitry Andric break;
2454a16efa3SDimitry Andric }
2464a16efa3SDimitry Andric }
2474a16efa3SDimitry Andric if (target >= targetEnd) {
2484a16efa3SDimitry Andric source = oldSource; /* Back up source pointer! */
2494a16efa3SDimitry Andric result = targetExhausted; break;
2504a16efa3SDimitry Andric }
2514a16efa3SDimitry Andric *target++ = ch;
2524a16efa3SDimitry Andric }
2534a16efa3SDimitry Andric *sourceStart = source;
2544a16efa3SDimitry Andric *targetStart = target;
2554a16efa3SDimitry Andric #ifdef CVTUTF_DEBUG
2564a16efa3SDimitry Andric if (result == sourceIllegal) {
2574a16efa3SDimitry Andric fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
2584a16efa3SDimitry Andric fflush(stderr);
2594a16efa3SDimitry Andric }
2604a16efa3SDimitry Andric #endif
2614a16efa3SDimitry Andric return result;
2624a16efa3SDimitry Andric }
ConvertUTF16toUTF8(const UTF16 ** sourceStart,const UTF16 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)2634a16efa3SDimitry Andric ConversionResult ConvertUTF16toUTF8 (
2644a16efa3SDimitry Andric const UTF16** sourceStart, const UTF16* sourceEnd,
2654a16efa3SDimitry Andric UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
2664a16efa3SDimitry Andric ConversionResult result = conversionOK;
2674a16efa3SDimitry Andric const UTF16* source = *sourceStart;
2684a16efa3SDimitry Andric UTF8* target = *targetStart;
2694a16efa3SDimitry Andric while (source < sourceEnd) {
2704a16efa3SDimitry Andric UTF32 ch;
2714a16efa3SDimitry Andric unsigned short bytesToWrite = 0;
2724a16efa3SDimitry Andric const UTF32 byteMask = 0xBF;
2734a16efa3SDimitry Andric const UTF32 byteMark = 0x80;
2744a16efa3SDimitry Andric const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
2754a16efa3SDimitry Andric ch = *source++;
2764a16efa3SDimitry Andric /* If we have a surrogate pair, convert to UTF32 first. */
2774a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
2784a16efa3SDimitry Andric /* If the 16 bits following the high surrogate are in the source buffer... */
2794a16efa3SDimitry Andric if (source < sourceEnd) {
2804a16efa3SDimitry Andric UTF32 ch2 = *source;
2814a16efa3SDimitry Andric /* If it's a low surrogate, convert to UTF32. */
2824a16efa3SDimitry Andric if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
2834a16efa3SDimitry Andric ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
2844a16efa3SDimitry Andric + (ch2 - UNI_SUR_LOW_START) + halfBase;
2854a16efa3SDimitry Andric ++source;
2864a16efa3SDimitry Andric } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
2874a16efa3SDimitry Andric --source; /* return to the illegal value itself */
2884a16efa3SDimitry Andric result = sourceIllegal;
2894a16efa3SDimitry Andric break;
2904a16efa3SDimitry Andric }
2914a16efa3SDimitry Andric } else { /* We don't have the 16 bits following the high surrogate. */
2924a16efa3SDimitry Andric --source; /* return to the high surrogate */
2934a16efa3SDimitry Andric result = sourceExhausted;
2944a16efa3SDimitry Andric break;
2954a16efa3SDimitry Andric }
2964a16efa3SDimitry Andric } else if (flags == strictConversion) {
2974a16efa3SDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
2984a16efa3SDimitry Andric if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
2994a16efa3SDimitry Andric --source; /* return to the illegal value itself */
3004a16efa3SDimitry Andric result = sourceIllegal;
3014a16efa3SDimitry Andric break;
3024a16efa3SDimitry Andric }
3034a16efa3SDimitry Andric }
3044a16efa3SDimitry Andric /* Figure out how many bytes the result will require */
3054a16efa3SDimitry Andric if (ch < (UTF32)0x80) { bytesToWrite = 1;
3064a16efa3SDimitry Andric } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
3074a16efa3SDimitry Andric } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
3084a16efa3SDimitry Andric } else if (ch < (UTF32)0x110000) { bytesToWrite = 4;
3094a16efa3SDimitry Andric } else { bytesToWrite = 3;
3104a16efa3SDimitry Andric ch = UNI_REPLACEMENT_CHAR;
3114a16efa3SDimitry Andric }
3124a16efa3SDimitry Andric
3134a16efa3SDimitry Andric target += bytesToWrite;
3144a16efa3SDimitry Andric if (target > targetEnd) {
3154a16efa3SDimitry Andric source = oldSource; /* Back up source pointer! */
3164a16efa3SDimitry Andric target -= bytesToWrite; result = targetExhausted; break;
3174a16efa3SDimitry Andric }
3184a16efa3SDimitry Andric switch (bytesToWrite) { /* note: everything falls through. */
3194a16efa3SDimitry Andric case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3204a16efa3SDimitry Andric case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3214a16efa3SDimitry Andric case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3224a16efa3SDimitry Andric case 1: *--target = (UTF8)(ch | firstByteMark[bytesToWrite]);
3234a16efa3SDimitry Andric }
3244a16efa3SDimitry Andric target += bytesToWrite;
3254a16efa3SDimitry Andric }
3264a16efa3SDimitry Andric *sourceStart = source;
3274a16efa3SDimitry Andric *targetStart = target;
3284a16efa3SDimitry Andric return result;
3294a16efa3SDimitry Andric }
3304a16efa3SDimitry Andric
3314a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
3324a16efa3SDimitry Andric
ConvertUTF32toUTF8(const UTF32 ** sourceStart,const UTF32 * sourceEnd,UTF8 ** targetStart,UTF8 * targetEnd,ConversionFlags flags)3334a16efa3SDimitry Andric ConversionResult ConvertUTF32toUTF8 (
3344a16efa3SDimitry Andric const UTF32** sourceStart, const UTF32* sourceEnd,
3354a16efa3SDimitry Andric UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
3364a16efa3SDimitry Andric ConversionResult result = conversionOK;
3374a16efa3SDimitry Andric const UTF32* source = *sourceStart;
3384a16efa3SDimitry Andric UTF8* target = *targetStart;
3394a16efa3SDimitry Andric while (source < sourceEnd) {
3404a16efa3SDimitry Andric UTF32 ch;
3414a16efa3SDimitry Andric unsigned short bytesToWrite = 0;
3424a16efa3SDimitry Andric const UTF32 byteMask = 0xBF;
3434a16efa3SDimitry Andric const UTF32 byteMark = 0x80;
3444a16efa3SDimitry Andric ch = *source++;
3454a16efa3SDimitry Andric if (flags == strictConversion ) {
3464a16efa3SDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
3474a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
3484a16efa3SDimitry Andric --source; /* return to the illegal value itself */
3494a16efa3SDimitry Andric result = sourceIllegal;
3504a16efa3SDimitry Andric break;
3514a16efa3SDimitry Andric }
3524a16efa3SDimitry Andric }
3534a16efa3SDimitry Andric /*
3544a16efa3SDimitry Andric * Figure out how many bytes the result will require. Turn any
3554a16efa3SDimitry Andric * illegally large UTF32 things (> Plane 17) into replacement chars.
3564a16efa3SDimitry Andric */
3574a16efa3SDimitry Andric if (ch < (UTF32)0x80) { bytesToWrite = 1;
3584a16efa3SDimitry Andric } else if (ch < (UTF32)0x800) { bytesToWrite = 2;
3594a16efa3SDimitry Andric } else if (ch < (UTF32)0x10000) { bytesToWrite = 3;
3604a16efa3SDimitry Andric } else if (ch <= UNI_MAX_LEGAL_UTF32) { bytesToWrite = 4;
3614a16efa3SDimitry Andric } else { bytesToWrite = 3;
3624a16efa3SDimitry Andric ch = UNI_REPLACEMENT_CHAR;
3634a16efa3SDimitry Andric result = sourceIllegal;
3644a16efa3SDimitry Andric }
3654a16efa3SDimitry Andric
3664a16efa3SDimitry Andric target += bytesToWrite;
3674a16efa3SDimitry Andric if (target > targetEnd) {
3684a16efa3SDimitry Andric --source; /* Back up source pointer! */
3694a16efa3SDimitry Andric target -= bytesToWrite; result = targetExhausted; break;
3704a16efa3SDimitry Andric }
3714a16efa3SDimitry Andric switch (bytesToWrite) { /* note: everything falls through. */
3724a16efa3SDimitry Andric case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3734a16efa3SDimitry Andric case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3744a16efa3SDimitry Andric case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
3754a16efa3SDimitry Andric case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
3764a16efa3SDimitry Andric }
3774a16efa3SDimitry Andric target += bytesToWrite;
3784a16efa3SDimitry Andric }
3794a16efa3SDimitry Andric *sourceStart = source;
3804a16efa3SDimitry Andric *targetStart = target;
3814a16efa3SDimitry Andric return result;
3824a16efa3SDimitry Andric }
3834a16efa3SDimitry Andric
3844a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
3854a16efa3SDimitry Andric
3864a16efa3SDimitry Andric /*
3874a16efa3SDimitry Andric * Utility routine to tell whether a sequence of bytes is legal UTF-8.
3884a16efa3SDimitry Andric * This must be called with the length pre-determined by the first byte.
3894a16efa3SDimitry Andric * If not calling this from ConvertUTF8to*, then the length can be set by:
3904a16efa3SDimitry Andric * length = trailingBytesForUTF8[*source]+1;
3914a16efa3SDimitry Andric * and the sequence is illegal right away if there aren't that many bytes
3924a16efa3SDimitry Andric * available.
3934a16efa3SDimitry Andric * If presented with a length > 4, this returns false. The Unicode
3944a16efa3SDimitry Andric * definition of UTF-8 goes up to 4-byte sequences.
3954a16efa3SDimitry Andric */
3964a16efa3SDimitry Andric
isLegalUTF8(const UTF8 * source,int length)3974a16efa3SDimitry Andric static Boolean isLegalUTF8(const UTF8 *source, int length) {
3984a16efa3SDimitry Andric UTF8 a;
3994a16efa3SDimitry Andric const UTF8 *srcptr = source+length;
4004a16efa3SDimitry Andric switch (length) {
4014a16efa3SDimitry Andric default: return false;
4024a16efa3SDimitry Andric /* Everything else falls through when "true"... */
4034a16efa3SDimitry Andric case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4044a16efa3SDimitry Andric case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4054a16efa3SDimitry Andric case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4064a16efa3SDimitry Andric
4074a16efa3SDimitry Andric switch (*source) {
4084a16efa3SDimitry Andric /* no fall-through in this inner switch */
4094a16efa3SDimitry Andric case 0xE0: if (a < 0xA0) return false; break;
4104a16efa3SDimitry Andric case 0xED: if (a > 0x9F) return false; break;
4114a16efa3SDimitry Andric case 0xF0: if (a < 0x90) return false; break;
4124a16efa3SDimitry Andric case 0xF4: if (a > 0x8F) return false; break;
4134a16efa3SDimitry Andric default: if (a < 0x80) return false;
4144a16efa3SDimitry Andric }
4154a16efa3SDimitry Andric
4164a16efa3SDimitry Andric case 1: if (*source >= 0x80 && *source < 0xC2) return false;
4174a16efa3SDimitry Andric }
4184a16efa3SDimitry Andric if (*source > 0xF4) return false;
4194a16efa3SDimitry Andric return true;
4204a16efa3SDimitry Andric }
4214a16efa3SDimitry Andric
4224a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
4234a16efa3SDimitry Andric
4244a16efa3SDimitry Andric /*
4254a16efa3SDimitry Andric * Exported function to return whether a UTF-8 sequence is legal or not.
4264a16efa3SDimitry Andric * This is not used here; it's just exported.
4274a16efa3SDimitry Andric */
isLegalUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4284a16efa3SDimitry Andric Boolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
4294a16efa3SDimitry Andric int length = trailingBytesForUTF8[*source]+1;
4304a16efa3SDimitry Andric if (length > sourceEnd - source) {
4314a16efa3SDimitry Andric return false;
4324a16efa3SDimitry Andric }
4334a16efa3SDimitry Andric return isLegalUTF8(source, length);
4344a16efa3SDimitry Andric }
4354a16efa3SDimitry Andric
4361f917f69SDimitry Andric /*
4371f917f69SDimitry Andric * Exported function to return the size of the first utf-8 code unit sequence,
4381f917f69SDimitry Andric * Or 0 if the sequence is not valid;
4391f917f69SDimitry Andric */
getUTF8SequenceSize(const UTF8 * source,const UTF8 * sourceEnd)4401f917f69SDimitry Andric unsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
4411f917f69SDimitry Andric int length = trailingBytesForUTF8[*source] + 1;
4421f917f69SDimitry Andric return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
4431f917f69SDimitry Andric : 0;
4441f917f69SDimitry Andric }
4451f917f69SDimitry Andric
4464a16efa3SDimitry Andric /* --------------------------------------------------------------------- */
4474a16efa3SDimitry Andric
4485ca98fd9SDimitry Andric static unsigned
findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 * source,const UTF8 * sourceEnd)4495ca98fd9SDimitry Andric findMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
4505ca98fd9SDimitry Andric const UTF8 *sourceEnd) {
4515ca98fd9SDimitry Andric UTF8 b1, b2, b3;
4525ca98fd9SDimitry Andric
4535ca98fd9SDimitry Andric assert(!isLegalUTF8Sequence(source, sourceEnd));
4545ca98fd9SDimitry Andric
4555ca98fd9SDimitry Andric /*
4565ca98fd9SDimitry Andric * Unicode 6.3.0, D93b:
4575ca98fd9SDimitry Andric *
4585ca98fd9SDimitry Andric * Maximal subpart of an ill-formed subsequence: The longest code unit
4595ca98fd9SDimitry Andric * subsequence starting at an unconvertible offset that is either:
4605ca98fd9SDimitry Andric * a. the initial subsequence of a well-formed code unit sequence, or
4615ca98fd9SDimitry Andric * b. a subsequence of length one.
4625ca98fd9SDimitry Andric */
4635ca98fd9SDimitry Andric
4645ca98fd9SDimitry Andric if (source == sourceEnd)
4655ca98fd9SDimitry Andric return 0;
4665ca98fd9SDimitry Andric
4675ca98fd9SDimitry Andric /*
4685ca98fd9SDimitry Andric * Perform case analysis. See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
4695ca98fd9SDimitry Andric * Byte Sequences.
4705ca98fd9SDimitry Andric */
4715ca98fd9SDimitry Andric
4725ca98fd9SDimitry Andric b1 = *source;
4735ca98fd9SDimitry Andric ++source;
4745ca98fd9SDimitry Andric if (b1 >= 0xC2 && b1 <= 0xDF) {
4755ca98fd9SDimitry Andric /*
4765ca98fd9SDimitry Andric * First byte is valid, but we know that this code unit sequence is
4775ca98fd9SDimitry Andric * invalid, so the maximal subpart has to end after the first byte.
4785ca98fd9SDimitry Andric */
4795ca98fd9SDimitry Andric return 1;
4805ca98fd9SDimitry Andric }
4815ca98fd9SDimitry Andric
4825ca98fd9SDimitry Andric if (source == sourceEnd)
4835ca98fd9SDimitry Andric return 1;
4845ca98fd9SDimitry Andric
4855ca98fd9SDimitry Andric b2 = *source;
4865ca98fd9SDimitry Andric ++source;
4875ca98fd9SDimitry Andric
4885ca98fd9SDimitry Andric if (b1 == 0xE0) {
4895ca98fd9SDimitry Andric return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
4905ca98fd9SDimitry Andric }
4915ca98fd9SDimitry Andric if (b1 >= 0xE1 && b1 <= 0xEC) {
4925ca98fd9SDimitry Andric return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4935ca98fd9SDimitry Andric }
4945ca98fd9SDimitry Andric if (b1 == 0xED) {
4955ca98fd9SDimitry Andric return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
4965ca98fd9SDimitry Andric }
4975ca98fd9SDimitry Andric if (b1 >= 0xEE && b1 <= 0xEF) {
4985ca98fd9SDimitry Andric return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
4995ca98fd9SDimitry Andric }
5005ca98fd9SDimitry Andric if (b1 == 0xF0) {
5015ca98fd9SDimitry Andric if (b2 >= 0x90 && b2 <= 0xBF) {
5025ca98fd9SDimitry Andric if (source == sourceEnd)
5035ca98fd9SDimitry Andric return 2;
5045ca98fd9SDimitry Andric
5055ca98fd9SDimitry Andric b3 = *source;
5065ca98fd9SDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5075ca98fd9SDimitry Andric }
5085ca98fd9SDimitry Andric return 1;
5095ca98fd9SDimitry Andric }
5105ca98fd9SDimitry Andric if (b1 >= 0xF1 && b1 <= 0xF3) {
5115ca98fd9SDimitry Andric if (b2 >= 0x80 && b2 <= 0xBF) {
5125ca98fd9SDimitry Andric if (source == sourceEnd)
5135ca98fd9SDimitry Andric return 2;
5145ca98fd9SDimitry Andric
5155ca98fd9SDimitry Andric b3 = *source;
5165ca98fd9SDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5175ca98fd9SDimitry Andric }
5185ca98fd9SDimitry Andric return 1;
5195ca98fd9SDimitry Andric }
5205ca98fd9SDimitry Andric if (b1 == 0xF4) {
5215ca98fd9SDimitry Andric if (b2 >= 0x80 && b2 <= 0x8F) {
5225ca98fd9SDimitry Andric if (source == sourceEnd)
5235ca98fd9SDimitry Andric return 2;
5245ca98fd9SDimitry Andric
5255ca98fd9SDimitry Andric b3 = *source;
5265ca98fd9SDimitry Andric return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5275ca98fd9SDimitry Andric }
5285ca98fd9SDimitry Andric return 1;
5295ca98fd9SDimitry Andric }
5305ca98fd9SDimitry Andric
5315ca98fd9SDimitry Andric assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
5325ca98fd9SDimitry Andric /*
5335ca98fd9SDimitry Andric * There are no valid sequences that start with these bytes. Maximal subpart
5345ca98fd9SDimitry Andric * is defined to have length 1 in these cases.
5355ca98fd9SDimitry Andric */
5365ca98fd9SDimitry Andric return 1;
5375ca98fd9SDimitry Andric }
5385ca98fd9SDimitry Andric
/* --------------------------------------------------------------------- */
5405ca98fd9SDimitry Andric
5414a16efa3SDimitry Andric /*
5424a16efa3SDimitry Andric * Exported function to return the total number of bytes in a codepoint
5434a16efa3SDimitry Andric * represented in UTF-8, given the value of the first byte.
5444a16efa3SDimitry Andric */
getNumBytesForUTF8(UTF8 first)5454a16efa3SDimitry Andric unsigned getNumBytesForUTF8(UTF8 first) {
5464a16efa3SDimitry Andric return trailingBytesForUTF8[first] + 1;
5474a16efa3SDimitry Andric }
5484a16efa3SDimitry Andric
/* --------------------------------------------------------------------- */
5504a16efa3SDimitry Andric
5514a16efa3SDimitry Andric /*
5524a16efa3SDimitry Andric * Exported function to return whether a UTF-8 string is legal or not.
5534a16efa3SDimitry Andric * This is not used here; it's just exported.
5544a16efa3SDimitry Andric */
isLegalUTF8String(const UTF8 ** source,const UTF8 * sourceEnd)5554a16efa3SDimitry Andric Boolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
5564a16efa3SDimitry Andric while (*source != sourceEnd) {
5574a16efa3SDimitry Andric int length = trailingBytesForUTF8[**source] + 1;
5584a16efa3SDimitry Andric if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
5594a16efa3SDimitry Andric return false;
5604a16efa3SDimitry Andric *source += length;
5614a16efa3SDimitry Andric }
5624a16efa3SDimitry Andric return true;
5634a16efa3SDimitry Andric }
5644a16efa3SDimitry Andric
/* --------------------------------------------------------------------- */
5664a16efa3SDimitry Andric
ConvertUTF8toUTF16(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF16 ** targetStart,UTF16 * targetEnd,ConversionFlags flags)5674a16efa3SDimitry Andric ConversionResult ConvertUTF8toUTF16 (
5684a16efa3SDimitry Andric const UTF8** sourceStart, const UTF8* sourceEnd,
5694a16efa3SDimitry Andric UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
5704a16efa3SDimitry Andric ConversionResult result = conversionOK;
5714a16efa3SDimitry Andric const UTF8* source = *sourceStart;
5724a16efa3SDimitry Andric UTF16* target = *targetStart;
5734a16efa3SDimitry Andric while (source < sourceEnd) {
5744a16efa3SDimitry Andric UTF32 ch = 0;
5754a16efa3SDimitry Andric unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
5764a16efa3SDimitry Andric if (extraBytesToRead >= sourceEnd - source) {
5774a16efa3SDimitry Andric result = sourceExhausted; break;
5784a16efa3SDimitry Andric }
5794a16efa3SDimitry Andric /* Do this check whether lenient or strict */
5804a16efa3SDimitry Andric if (!isLegalUTF8(source, extraBytesToRead+1)) {
5814a16efa3SDimitry Andric result = sourceIllegal;
5824a16efa3SDimitry Andric break;
5834a16efa3SDimitry Andric }
5844a16efa3SDimitry Andric /*
5854a16efa3SDimitry Andric * The cases all fall through. See "Note A" below.
5864a16efa3SDimitry Andric */
5874a16efa3SDimitry Andric switch (extraBytesToRead) {
5884a16efa3SDimitry Andric case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5894a16efa3SDimitry Andric case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
5904a16efa3SDimitry Andric case 3: ch += *source++; ch <<= 6;
5914a16efa3SDimitry Andric case 2: ch += *source++; ch <<= 6;
5924a16efa3SDimitry Andric case 1: ch += *source++; ch <<= 6;
5934a16efa3SDimitry Andric case 0: ch += *source++;
5944a16efa3SDimitry Andric }
5954a16efa3SDimitry Andric ch -= offsetsFromUTF8[extraBytesToRead];
5964a16efa3SDimitry Andric
5974a16efa3SDimitry Andric if (target >= targetEnd) {
5984a16efa3SDimitry Andric source -= (extraBytesToRead+1); /* Back up source pointer! */
5994a16efa3SDimitry Andric result = targetExhausted; break;
6004a16efa3SDimitry Andric }
6014a16efa3SDimitry Andric if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
6024a16efa3SDimitry Andric /* UTF-16 surrogate values are illegal in UTF-32 */
6034a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
6044a16efa3SDimitry Andric if (flags == strictConversion) {
6054a16efa3SDimitry Andric source -= (extraBytesToRead+1); /* return to the illegal value itself */
6064a16efa3SDimitry Andric result = sourceIllegal;
6074a16efa3SDimitry Andric break;
6084a16efa3SDimitry Andric } else {
6094a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
6104a16efa3SDimitry Andric }
6114a16efa3SDimitry Andric } else {
6124a16efa3SDimitry Andric *target++ = (UTF16)ch; /* normal case */
6134a16efa3SDimitry Andric }
6144a16efa3SDimitry Andric } else if (ch > UNI_MAX_UTF16) {
6154a16efa3SDimitry Andric if (flags == strictConversion) {
6164a16efa3SDimitry Andric result = sourceIllegal;
6174a16efa3SDimitry Andric source -= (extraBytesToRead+1); /* return to the start */
6184a16efa3SDimitry Andric break; /* Bail out; shouldn't continue */
6194a16efa3SDimitry Andric } else {
6204a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
6214a16efa3SDimitry Andric }
6224a16efa3SDimitry Andric } else {
6234a16efa3SDimitry Andric /* target is a character in range 0xFFFF - 0x10FFFF. */
6244a16efa3SDimitry Andric if (target + 1 >= targetEnd) {
6254a16efa3SDimitry Andric source -= (extraBytesToRead+1); /* Back up source pointer! */
6264a16efa3SDimitry Andric result = targetExhausted; break;
6274a16efa3SDimitry Andric }
6284a16efa3SDimitry Andric ch -= halfBase;
6294a16efa3SDimitry Andric *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
6304a16efa3SDimitry Andric *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
6314a16efa3SDimitry Andric }
6324a16efa3SDimitry Andric }
6334a16efa3SDimitry Andric *sourceStart = source;
6344a16efa3SDimitry Andric *targetStart = target;
6354a16efa3SDimitry Andric return result;
6364a16efa3SDimitry Andric }
6374a16efa3SDimitry Andric
/* --------------------------------------------------------------------- */
6394a16efa3SDimitry Andric
ConvertUTF8toUTF32Impl(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags,Boolean InputIsPartial)6405ca98fd9SDimitry Andric static ConversionResult ConvertUTF8toUTF32Impl(
6414a16efa3SDimitry Andric const UTF8** sourceStart, const UTF8* sourceEnd,
6425ca98fd9SDimitry Andric UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
6435ca98fd9SDimitry Andric Boolean InputIsPartial) {
6444a16efa3SDimitry Andric ConversionResult result = conversionOK;
6454a16efa3SDimitry Andric const UTF8* source = *sourceStart;
6464a16efa3SDimitry Andric UTF32* target = *targetStart;
6474a16efa3SDimitry Andric while (source < sourceEnd) {
6484a16efa3SDimitry Andric UTF32 ch = 0;
6494a16efa3SDimitry Andric unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
6504a16efa3SDimitry Andric if (extraBytesToRead >= sourceEnd - source) {
6515ca98fd9SDimitry Andric if (flags == strictConversion || InputIsPartial) {
6525ca98fd9SDimitry Andric result = sourceExhausted;
6535ca98fd9SDimitry Andric break;
6545ca98fd9SDimitry Andric } else {
6555ca98fd9SDimitry Andric result = sourceIllegal;
6565ca98fd9SDimitry Andric
6575ca98fd9SDimitry Andric /*
6585ca98fd9SDimitry Andric * Replace the maximal subpart of ill-formed sequence with
6595ca98fd9SDimitry Andric * replacement character.
6605ca98fd9SDimitry Andric */
6615ca98fd9SDimitry Andric source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6625ca98fd9SDimitry Andric sourceEnd);
6635ca98fd9SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
6645ca98fd9SDimitry Andric continue;
6654a16efa3SDimitry Andric }
6665ca98fd9SDimitry Andric }
6675ca98fd9SDimitry Andric if (target >= targetEnd) {
6685ca98fd9SDimitry Andric result = targetExhausted; break;
6695ca98fd9SDimitry Andric }
6705ca98fd9SDimitry Andric
6714a16efa3SDimitry Andric /* Do this check whether lenient or strict */
6724a16efa3SDimitry Andric if (!isLegalUTF8(source, extraBytesToRead+1)) {
6734a16efa3SDimitry Andric result = sourceIllegal;
6745ca98fd9SDimitry Andric if (flags == strictConversion) {
6755ca98fd9SDimitry Andric /* Abort conversion. */
6764a16efa3SDimitry Andric break;
6775ca98fd9SDimitry Andric } else {
6785ca98fd9SDimitry Andric /*
6795ca98fd9SDimitry Andric * Replace the maximal subpart of ill-formed sequence with
6805ca98fd9SDimitry Andric * replacement character.
6815ca98fd9SDimitry Andric */
6825ca98fd9SDimitry Andric source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
6835ca98fd9SDimitry Andric sourceEnd);
6845ca98fd9SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
6855ca98fd9SDimitry Andric continue;
6865ca98fd9SDimitry Andric }
6874a16efa3SDimitry Andric }
6884a16efa3SDimitry Andric /*
6894a16efa3SDimitry Andric * The cases all fall through. See "Note A" below.
6904a16efa3SDimitry Andric */
6914a16efa3SDimitry Andric switch (extraBytesToRead) {
6924a16efa3SDimitry Andric case 5: ch += *source++; ch <<= 6;
6934a16efa3SDimitry Andric case 4: ch += *source++; ch <<= 6;
6944a16efa3SDimitry Andric case 3: ch += *source++; ch <<= 6;
6954a16efa3SDimitry Andric case 2: ch += *source++; ch <<= 6;
6964a16efa3SDimitry Andric case 1: ch += *source++; ch <<= 6;
6974a16efa3SDimitry Andric case 0: ch += *source++;
6984a16efa3SDimitry Andric }
6994a16efa3SDimitry Andric ch -= offsetsFromUTF8[extraBytesToRead];
7004a16efa3SDimitry Andric
7014a16efa3SDimitry Andric if (ch <= UNI_MAX_LEGAL_UTF32) {
7024a16efa3SDimitry Andric /*
7034a16efa3SDimitry Andric * UTF-16 surrogate values are illegal in UTF-32, and anything
7044a16efa3SDimitry Andric * over Plane 17 (> 0x10FFFF) is illegal.
7054a16efa3SDimitry Andric */
7064a16efa3SDimitry Andric if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
7074a16efa3SDimitry Andric if (flags == strictConversion) {
7084a16efa3SDimitry Andric source -= (extraBytesToRead+1); /* return to the illegal value itself */
7094a16efa3SDimitry Andric result = sourceIllegal;
7104a16efa3SDimitry Andric break;
7114a16efa3SDimitry Andric } else {
7124a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
7134a16efa3SDimitry Andric }
7144a16efa3SDimitry Andric } else {
7154a16efa3SDimitry Andric *target++ = ch;
7164a16efa3SDimitry Andric }
7174a16efa3SDimitry Andric } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
7184a16efa3SDimitry Andric result = sourceIllegal;
7194a16efa3SDimitry Andric *target++ = UNI_REPLACEMENT_CHAR;
7204a16efa3SDimitry Andric }
7214a16efa3SDimitry Andric }
7224a16efa3SDimitry Andric *sourceStart = source;
7234a16efa3SDimitry Andric *targetStart = target;
7244a16efa3SDimitry Andric return result;
7254a16efa3SDimitry Andric }
7264a16efa3SDimitry Andric
ConvertUTF8toUTF32Partial(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7275ca98fd9SDimitry Andric ConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
7285ca98fd9SDimitry Andric const UTF8 *sourceEnd,
7295ca98fd9SDimitry Andric UTF32 **targetStart,
7305ca98fd9SDimitry Andric UTF32 *targetEnd,
7315ca98fd9SDimitry Andric ConversionFlags flags) {
7325ca98fd9SDimitry Andric return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7335ca98fd9SDimitry Andric flags, /*InputIsPartial=*/true);
7345ca98fd9SDimitry Andric }
7355ca98fd9SDimitry Andric
ConvertUTF8toUTF32(const UTF8 ** sourceStart,const UTF8 * sourceEnd,UTF32 ** targetStart,UTF32 * targetEnd,ConversionFlags flags)7365ca98fd9SDimitry Andric ConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
7375ca98fd9SDimitry Andric const UTF8 *sourceEnd, UTF32 **targetStart,
7385ca98fd9SDimitry Andric UTF32 *targetEnd, ConversionFlags flags) {
7395ca98fd9SDimitry Andric return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
7405ca98fd9SDimitry Andric flags, /*InputIsPartial=*/false);
7415ca98fd9SDimitry Andric }
7425ca98fd9SDimitry Andric
/* ---------------------------------------------------------------------

    Note A.
    The fall-through switches in UTF-8 reading code save a
    temp variable, some decrements & conditionals.  The switches
    are equivalent to the following loop:
        {
            int tmpBytesToRead = extraBytesToRead+1;
            do {
                ch += *source++;
                --tmpBytesToRead;
                if (tmpBytesToRead) ch <<= 6;
            } while (tmpBytesToRead > 0);
        }
    In UTF-8 writing code, the switches on "bytesToWrite" are
    similarly unrolled loops.

   --------------------------------------------------------------------- */
761b915e9e0SDimitry Andric
762b915e9e0SDimitry Andric } // namespace llvm
763ab44ce3dSDimitry Andric
764ab44ce3dSDimitry Andric ConvertUTF_RESTORE_WARNINGS
765