lib/Support/ConvertUTF.cpp

4a16efa3SDimitry Andric/*===--- ConvertUTF.c - Universal Character Names conversions ---------------===
4a16efa3SDimitry Andric *
e6d15924SDimitry Andric * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
e6d15924SDimitry Andric * See https://llvm.org/LICENSE.txt for license information.
e6d15924SDimitry Andric * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
4a16efa3SDimitry Andric *
4a16efa3SDimitry Andric *===------------------------------------------------------------------------=*/
4a16efa3SDimitry Andric/*
e3b55780SDimitry Andric * Copyright © 1991-2015 Unicode, Inc. All rights reserved.
e3b55780SDimitry Andric * Distributed under the Terms of Use in
e3b55780SDimitry Andric * http://www.unicode.org/copyright.html.
4a16efa3SDimitry Andric *
e3b55780SDimitry Andric * Permission is hereby granted, free of charge, to any person obtaining
e3b55780SDimitry Andric * a copy of the Unicode data files and any associated documentation
e3b55780SDimitry Andric * (the "Data Files") or Unicode software and any associated documentation
e3b55780SDimitry Andric * (the "Software") to deal in the Data Files or Software
e3b55780SDimitry Andric * without restriction, including without limitation the rights to use,
e3b55780SDimitry Andric * copy, modify, merge, publish, distribute, and/or sell copies of
e3b55780SDimitry Andric * the Data Files or Software, and to permit persons to whom the Data Files
e3b55780SDimitry Andric * or Software are furnished to do so, provided that
e3b55780SDimitry Andric * (a) this copyright and permission notice appear with all copies
e3b55780SDimitry Andric * of the Data Files or Software,
e3b55780SDimitry Andric * (b) this copyright and permission notice appear in associated
e3b55780SDimitry Andric * documentation, and
e3b55780SDimitry Andric * (c) there is clear notice in each modified Data File or in the Software
e3b55780SDimitry Andric * as well as in the documentation associated with the Data File(s) or
e3b55780SDimitry Andric * Software that the data or software has been modified.
4a16efa3SDimitry Andric *
e3b55780SDimitry Andric * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF
e3b55780SDimitry Andric * ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE
e3b55780SDimitry Andric * WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
e3b55780SDimitry Andric * NONINFRINGEMENT OF THIRD PARTY RIGHTS.
e3b55780SDimitry Andric * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS
e3b55780SDimitry Andric * NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL
e3b55780SDimitry Andric * DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
e3b55780SDimitry Andric * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
e3b55780SDimitry Andric * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
e3b55780SDimitry Andric * PERFORMANCE OF THE DATA FILES OR SOFTWARE.
4a16efa3SDimitry Andric *
e3b55780SDimitry Andric * Except as contained in this notice, the name of a copyright holder
e3b55780SDimitry Andric * shall not be used in advertising or otherwise to promote the sale,
e3b55780SDimitry Andric * use or other dealings in these Data Files or Software without prior
e3b55780SDimitry Andric * written authorization of the copyright holder.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* ---------------------------------------------------------------------
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric    Conversions between UTF32, UTF-16, and UTF-8. Source code file.
4a16efa3SDimitry Andric    Author: Mark E. Davis, 1994.
4a16efa3SDimitry Andric    Rev History: Rick McGowan, fixes & updates May 2001.
4a16efa3SDimitry Andric    Sept 2001: fixed const & error conditions per
4a16efa3SDimitry Andric        mods suggested by S. Parent & A. Lillich.
4a16efa3SDimitry Andric    June 2002: Tim Dodd added detection and handling of incomplete
4a16efa3SDimitry Andric        source sequences, enhanced error detection, added casts
4a16efa3SDimitry Andric        to eliminate compiler warnings.
4a16efa3SDimitry Andric    July 2003: slight mods to back out aggressive FFFE detection.
4a16efa3SDimitry Andric    Jan 2004: updated switches in from-UTF8 conversions.
4a16efa3SDimitry Andric    Oct 2004: updated to use UNI_MAX_LEGAL_UTF32 in UTF-32 conversions.
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric    See the header file "ConvertUTF.h" for complete documentation.
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric------------------------------------------------------------------------ */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric#include "llvm/Support/ConvertUTF.h"
4a16efa3SDimitry Andric#ifdef CVTUTF_DEBUG
4a16efa3SDimitry Andric#include <stdio.h>
4a16efa3SDimitry Andric#endif
5ca98fd9SDimitry Andric#include <assert.h>
4a16efa3SDimitry Andric
/*
 * The conversion routines in this file lean heavily on switch statements
 * whose cases deliberately fall through.  Define a pair of macros that
 * suppress (and later undo the suppression of) -Wimplicit-fallthrough on
 * compilers known to support the pragma; on any other compiler both macros
 * expand to nothing.
 */
#if defined(__clang__) && defined(__has_warning)
#  if __has_warning("-Wimplicit-fallthrough")
#    define ConvertUTF_DISABLE_WARNINGS \
       _Pragma("clang diagnostic push")  \
       _Pragma("clang diagnostic ignored \"-Wimplicit-fallthrough\"")
#    define ConvertUTF_RESTORE_WARNINGS \
       _Pragma("clang diagnostic pop")
#  endif
#elif defined(__GNUC__) && (__GNUC__ >= 7)
#  define ConvertUTF_DISABLE_WARNINGS \
     _Pragma("GCC diagnostic push")    \
     _Pragma("GCC diagnostic ignored \"-Wimplicit-fallthrough\"")
#  define ConvertUTF_RESTORE_WARNINGS \
     _Pragma("GCC diagnostic pop")
#endif

/* Fallbacks: nothing to disable or restore on this compiler. */
#if !defined(ConvertUTF_DISABLE_WARNINGS)
#  define ConvertUTF_DISABLE_WARNINGS
#endif
#if !defined(ConvertUTF_RESTORE_WARNINGS)
#  define ConvertUTF_RESTORE_WARNINGS
#endif

ConvertUTF_DISABLE_WARNINGS
ab44ce3dSDimitry Andric
b915e9e0SDimitry Andricnamespace llvm {
b915e9e0SDimitry Andric
4a16efa3SDimitry Andricstatic const int halfShift  = 10; /* used for shifting by 10 bits */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andricstatic const UTF32 halfBase = 0x0010000UL;
4a16efa3SDimitry Andricstatic const UTF32 halfMask = 0x3FFUL;
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric#define UNI_SUR_HIGH_START  (UTF32)0xD800
4a16efa3SDimitry Andric#define UNI_SUR_HIGH_END    (UTF32)0xDBFF
4a16efa3SDimitry Andric#define UNI_SUR_LOW_START   (UTF32)0xDC00
4a16efa3SDimitry Andric#define UNI_SUR_LOW_END     (UTF32)0xDFFF
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
/*
 * Lookup table indexed by the first byte of a UTF-8 sequence.  The entry is
 * the number of trailing bytes that are supposed to follow that first byte.
 * First bytes announcing 4 or 5 trailing bytes cannot occur in *legal*
 * UTF-8; those entries are kept as-is for anyone who wants to convert the
 * longer forms that earlier algorithms allowed.
 */
static const char trailingBytesForUTF8[256] = {
    /* 0x00 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x10 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x20 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x30 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x60 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x90 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xA0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xB0 */ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0xC0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    /* 0xE0 */ 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xF0 */ 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
};
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Magic values subtracted from a buffer value during UTF8 conversion.
4a16efa3SDimitry Andric * This table contains as many values as there might be trailing bytes
4a16efa3SDimitry Andric * in a UTF-8 sequence.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andricstatic const UTF32 offsetsFromUTF8[6] = { 0x00000000UL, 0x00003080UL, 0x000E2080UL,
4a16efa3SDimitry Andric                     0x03C82080UL, 0xFA082080UL, 0x82082080UL };
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Once the bits are split out into bytes of UTF-8, this is a mask OR-ed
4a16efa3SDimitry Andric * into the first byte, depending on how many bytes follow.  There are
4a16efa3SDimitry Andric * as many entries in this table as there are UTF-8 sequence types.
4a16efa3SDimitry Andric * (I.e., one byte sequence, two byte... etc.). Remember that sequencs
4a16efa3SDimitry Andric * for *legal* UTF-8 will be 4 or fewer bytes total.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andricstatic const UTF8 firstByteMark[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* The interface converts a whole buffer to avoid function-call overhead.
4a16efa3SDimitry Andric * Constants have been gathered. Loops & conditionals have been removed as
4a16efa3SDimitry Andric * much as possible for efficiency, in favor of drop-through switches.
4a16efa3SDimitry Andric * (See "Note A" at the bottom of the file for equivalent code.)
4a16efa3SDimitry Andric * If your compiler supports it, the "isLegalUTF8" call can be turned
4a16efa3SDimitry Andric * into an inline function.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry AndricConversionResult ConvertUTF32toUTF16 (
4a16efa3SDimitry Andric        const UTF32** sourceStart, const UTF32* sourceEnd,
4a16efa3SDimitry Andric        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF32* source = *sourceStart;
4a16efa3SDimitry Andric    UTF16* target = *targetStart;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        UTF32 ch;
4a16efa3SDimitry Andric        if (target >= targetEnd) {
4a16efa3SDimitry Andric            result = targetExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        ch = *source++;
4a16efa3SDimitry Andric        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
4a16efa3SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32; 0xffff or 0xfffe are both reserved values */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                if (flags == strictConversion) {
4a16efa3SDimitry Andric                    --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                    result = sourceIllegal;
4a16efa3SDimitry Andric                    break;
4a16efa3SDimitry Andric                } else {
4a16efa3SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric                }
4a16efa3SDimitry Andric            } else {
4a16efa3SDimitry Andric                *target++ = (UTF16)ch; /* normal case */
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else if (ch > UNI_MAX_LEGAL_UTF32) {
4a16efa3SDimitry Andric            if (flags == strictConversion) {
4a16efa3SDimitry Andric                result = sourceIllegal;
4a16efa3SDimitry Andric            } else {
4a16efa3SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else {
4a16efa3SDimitry Andric            /* target is a character in range 0xFFFF - 0x10FFFF. */
4a16efa3SDimitry Andric            if (target + 1 >= targetEnd) {
4a16efa3SDimitry Andric                --source; /* Back up source pointer! */
4a16efa3SDimitry Andric                result = targetExhausted; break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric            ch -= halfBase;
4a16efa3SDimitry Andric            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
4a16efa3SDimitry Andric            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry AndricConversionResult ConvertUTF16toUTF32 (
4a16efa3SDimitry Andric        const UTF16** sourceStart, const UTF16* sourceEnd,
4a16efa3SDimitry Andric        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF16* source = *sourceStart;
4a16efa3SDimitry Andric    UTF32* target = *targetStart;
4a16efa3SDimitry Andric    UTF32 ch, ch2;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        const UTF16* oldSource = source; /*  In case we have to back up because of target overflow. */
4a16efa3SDimitry Andric        ch = *source++;
4a16efa3SDimitry Andric        /* If we have a surrogate pair, convert to UTF32 first. */
4a16efa3SDimitry Andric        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
4a16efa3SDimitry Andric            /* If the 16 bits following the high surrogate are in the source buffer... */
4a16efa3SDimitry Andric            if (source < sourceEnd) {
4a16efa3SDimitry Andric                ch2 = *source;
4a16efa3SDimitry Andric                /* If it's a low surrogate, convert to UTF32. */
4a16efa3SDimitry Andric                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
4a16efa3SDimitry Andric                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
4a16efa3SDimitry Andric                    ++source;
4a16efa3SDimitry Andric                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
4a16efa3SDimitry Andric                    --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                    result = sourceIllegal;
4a16efa3SDimitry Andric                    break;
4a16efa3SDimitry Andric                }
4a16efa3SDimitry Andric            } else { /* We don't have the 16 bits following the high surrogate. */
4a16efa3SDimitry Andric                --source; /* return to the high surrogate */
4a16efa3SDimitry Andric                result = sourceExhausted;
4a16efa3SDimitry Andric                break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else if (flags == strictConversion) {
4a16efa3SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                result = sourceIllegal;
4a16efa3SDimitry Andric                break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        if (target >= targetEnd) {
4a16efa3SDimitry Andric            source = oldSource; /* Back up source pointer! */
4a16efa3SDimitry Andric            result = targetExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        *target++ = ch;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric#ifdef CVTUTF_DEBUG
4a16efa3SDimitry Andricif (result == sourceIllegal) {
4a16efa3SDimitry Andric    fprintf(stderr, "ConvertUTF16toUTF32 illegal seq 0x%04x,%04x\n", ch, ch2);
4a16efa3SDimitry Andric    fflush(stderr);
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric#endif
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry AndricConversionResult ConvertUTF16toUTF8 (
4a16efa3SDimitry Andric        const UTF16** sourceStart, const UTF16* sourceEnd,
4a16efa3SDimitry Andric        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF16* source = *sourceStart;
4a16efa3SDimitry Andric    UTF8* target = *targetStart;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        UTF32 ch;
4a16efa3SDimitry Andric        unsigned short bytesToWrite = 0;
4a16efa3SDimitry Andric        const UTF32 byteMask = 0xBF;
4a16efa3SDimitry Andric        const UTF32 byteMark = 0x80;
4a16efa3SDimitry Andric        const UTF16* oldSource = source; /* In case we have to back up because of target overflow. */
4a16efa3SDimitry Andric        ch = *source++;
4a16efa3SDimitry Andric        /* If we have a surrogate pair, convert to UTF32 first. */
4a16efa3SDimitry Andric        if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_HIGH_END) {
4a16efa3SDimitry Andric            /* If the 16 bits following the high surrogate are in the source buffer... */
4a16efa3SDimitry Andric            if (source < sourceEnd) {
4a16efa3SDimitry Andric                UTF32 ch2 = *source;
4a16efa3SDimitry Andric                /* If it's a low surrogate, convert to UTF32. */
4a16efa3SDimitry Andric                if (ch2 >= UNI_SUR_LOW_START && ch2 <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                    ch = ((ch - UNI_SUR_HIGH_START) << halfShift)
4a16efa3SDimitry Andric                        + (ch2 - UNI_SUR_LOW_START) + halfBase;
4a16efa3SDimitry Andric                    ++source;
4a16efa3SDimitry Andric                } else if (flags == strictConversion) { /* it's an unpaired high surrogate */
4a16efa3SDimitry Andric                    --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                    result = sourceIllegal;
4a16efa3SDimitry Andric                    break;
4a16efa3SDimitry Andric                }
4a16efa3SDimitry Andric            } else { /* We don't have the 16 bits following the high surrogate. */
4a16efa3SDimitry Andric                --source; /* return to the high surrogate */
4a16efa3SDimitry Andric                result = sourceExhausted;
4a16efa3SDimitry Andric                break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else if (flags == strictConversion) {
4a16efa3SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_LOW_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                result = sourceIllegal;
4a16efa3SDimitry Andric                break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        /* Figure out how many bytes the result will require */
4a16efa3SDimitry Andric        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
4a16efa3SDimitry Andric        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
4a16efa3SDimitry Andric        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
4a16efa3SDimitry Andric        } else if (ch < (UTF32)0x110000) {  bytesToWrite = 4;
4a16efa3SDimitry Andric        } else {                            bytesToWrite = 3;
4a16efa3SDimitry Andric                                            ch = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric        target += bytesToWrite;
4a16efa3SDimitry Andric        if (target > targetEnd) {
4a16efa3SDimitry Andric            source = oldSource; /* Back up source pointer! */
4a16efa3SDimitry Andric            target -= bytesToWrite; result = targetExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        switch (bytesToWrite) { /* note: everything falls through. */
4a16efa3SDimitry Andric            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 1: *--target =  (UTF8)(ch | firstByteMark[bytesToWrite]);
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        target += bytesToWrite;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry AndricConversionResult ConvertUTF32toUTF8 (
4a16efa3SDimitry Andric        const UTF32** sourceStart, const UTF32* sourceEnd,
4a16efa3SDimitry Andric        UTF8** targetStart, UTF8* targetEnd, ConversionFlags flags) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF32* source = *sourceStart;
4a16efa3SDimitry Andric    UTF8* target = *targetStart;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        UTF32 ch;
4a16efa3SDimitry Andric        unsigned short bytesToWrite = 0;
4a16efa3SDimitry Andric        const UTF32 byteMask = 0xBF;
4a16efa3SDimitry Andric        const UTF32 byteMark = 0x80;
4a16efa3SDimitry Andric        ch = *source++;
4a16efa3SDimitry Andric        if (flags == strictConversion ) {
4a16efa3SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                --source; /* return to the illegal value itself */
4a16efa3SDimitry Andric                result = sourceIllegal;
4a16efa3SDimitry Andric                break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        /*
4a16efa3SDimitry Andric         * Figure out how many bytes the result will require. Turn any
4a16efa3SDimitry Andric         * illegally large UTF32 things (> Plane 17) into replacement chars.
4a16efa3SDimitry Andric         */
4a16efa3SDimitry Andric        if (ch < (UTF32)0x80) {      bytesToWrite = 1;
4a16efa3SDimitry Andric        } else if (ch < (UTF32)0x800) {     bytesToWrite = 2;
4a16efa3SDimitry Andric        } else if (ch < (UTF32)0x10000) {   bytesToWrite = 3;
4a16efa3SDimitry Andric        } else if (ch <= UNI_MAX_LEGAL_UTF32) {  bytesToWrite = 4;
4a16efa3SDimitry Andric        } else {                            bytesToWrite = 3;
4a16efa3SDimitry Andric                                            ch = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric                                            result = sourceIllegal;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric        target += bytesToWrite;
4a16efa3SDimitry Andric        if (target > targetEnd) {
4a16efa3SDimitry Andric            --source; /* Back up source pointer! */
4a16efa3SDimitry Andric            target -= bytesToWrite; result = targetExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        switch (bytesToWrite) { /* note: everything falls through. */
4a16efa3SDimitry Andric            case 4: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 3: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 2: *--target = (UTF8)((ch | byteMark) & byteMask); ch >>= 6;
4a16efa3SDimitry Andric            case 1: *--target = (UTF8) (ch | firstByteMark[bytesToWrite]);
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        target += bytesToWrite;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Utility routine to tell whether a sequence of bytes is legal UTF-8.
4a16efa3SDimitry Andric * This must be called with the length pre-determined by the first byte.
4a16efa3SDimitry Andric * If not calling this from ConvertUTF8to*, then the length can be set by:
4a16efa3SDimitry Andric *  length = trailingBytesForUTF8[*source]+1;
4a16efa3SDimitry Andric * and the sequence is illegal right away if there aren't that many bytes
4a16efa3SDimitry Andric * available.
4a16efa3SDimitry Andric * If presented with a length > 4, this returns false.  The Unicode
4a16efa3SDimitry Andric * definition of UTF-8 goes up to 4-byte sequences.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andricstatic Boolean isLegalUTF8(const UTF8 *source, int length) {
4a16efa3SDimitry Andric    UTF8 a;
4a16efa3SDimitry Andric    const UTF8 *srcptr = source+length;
4a16efa3SDimitry Andric    switch (length) {
4a16efa3SDimitry Andric    default: return false;
4a16efa3SDimitry Andric        /* Everything else falls through when "true"... */
4a16efa3SDimitry Andric    case 4: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4a16efa3SDimitry Andric    case 3: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4a16efa3SDimitry Andric    case 2: if ((a = (*--srcptr)) < 0x80 || a > 0xBF) return false;
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric        switch (*source) {
4a16efa3SDimitry Andric            /* no fall-through in this inner switch */
4a16efa3SDimitry Andric            case 0xE0: if (a < 0xA0) return false; break;
4a16efa3SDimitry Andric            case 0xED: if (a > 0x9F) return false; break;
4a16efa3SDimitry Andric            case 0xF0: if (a < 0x90) return false; break;
4a16efa3SDimitry Andric            case 0xF4: if (a > 0x8F) return false; break;
4a16efa3SDimitry Andric            default:   if (a < 0x80) return false;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric    case 1: if (*source >= 0x80 && *source < 0xC2) return false;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    if (*source > 0xF4) return false;
4a16efa3SDimitry Andric    return true;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Exported function to return whether a UTF-8 sequence is legal or not.
4a16efa3SDimitry Andric * This is not used here; it's just exported.
4a16efa3SDimitry Andric */
4a16efa3SDimitry AndricBoolean isLegalUTF8Sequence(const UTF8 *source, const UTF8 *sourceEnd) {
4a16efa3SDimitry Andric    int length = trailingBytesForUTF8[*source]+1;
4a16efa3SDimitry Andric    if (length > sourceEnd - source) {
4a16efa3SDimitry Andric        return false;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    return isLegalUTF8(source, length);
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
1f917f69SDimitry Andric/*
1f917f69SDimitry Andric * Exported function to return the size of the first utf-8 code unit sequence,
1f917f69SDimitry Andric * Or 0 if the sequence is not valid;
1f917f69SDimitry Andric */
1f917f69SDimitry Andricunsigned getUTF8SequenceSize(const UTF8 *source, const UTF8 *sourceEnd) {
1f917f69SDimitry Andric  int length = trailingBytesForUTF8[*source] + 1;
1f917f69SDimitry Andric  return (length <= sourceEnd - source && isLegalUTF8(source, length)) ? length
1f917f69SDimitry Andric                                                                       : 0;
1f917f69SDimitry Andric}
1f917f69SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
5ca98fd9SDimitry Andricstatic unsigned
5ca98fd9SDimitry AndricfindMaximalSubpartOfIllFormedUTF8Sequence(const UTF8 *source,
5ca98fd9SDimitry Andric                                          const UTF8 *sourceEnd) {
5ca98fd9SDimitry Andric  UTF8 b1, b2, b3;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  assert(!isLegalUTF8Sequence(source, sourceEnd));
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  /*
5ca98fd9SDimitry Andric   * Unicode 6.3.0, D93b:
5ca98fd9SDimitry Andric   *
5ca98fd9SDimitry Andric   *   Maximal subpart of an ill-formed subsequence: The longest code unit
5ca98fd9SDimitry Andric   *   subsequence starting at an unconvertible offset that is either:
5ca98fd9SDimitry Andric   *   a. the initial subsequence of a well-formed code unit sequence, or
5ca98fd9SDimitry Andric   *   b. a subsequence of length one.
5ca98fd9SDimitry Andric   */
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  if (source == sourceEnd)
5ca98fd9SDimitry Andric    return 0;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  /*
5ca98fd9SDimitry Andric   * Perform case analysis.  See Unicode 6.3.0, Table 3-7. Well-Formed UTF-8
5ca98fd9SDimitry Andric   * Byte Sequences.
5ca98fd9SDimitry Andric   */
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  b1 = *source;
5ca98fd9SDimitry Andric  ++source;
5ca98fd9SDimitry Andric  if (b1 >= 0xC2 && b1 <= 0xDF) {
5ca98fd9SDimitry Andric    /*
5ca98fd9SDimitry Andric     * First byte is valid, but we know that this code unit sequence is
5ca98fd9SDimitry Andric     * invalid, so the maximal subpart has to end after the first byte.
5ca98fd9SDimitry Andric     */
5ca98fd9SDimitry Andric    return 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  if (source == sourceEnd)
5ca98fd9SDimitry Andric    return 1;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  b2 = *source;
5ca98fd9SDimitry Andric  ++source;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  if (b1 == 0xE0) {
5ca98fd9SDimitry Andric    return (b2 >= 0xA0 && b2 <= 0xBF) ? 2 : 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 >= 0xE1 && b1 <= 0xEC) {
5ca98fd9SDimitry Andric    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 == 0xED) {
5ca98fd9SDimitry Andric    return (b2 >= 0x80 && b2 <= 0x9F) ? 2 : 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 >= 0xEE && b1 <= 0xEF) {
5ca98fd9SDimitry Andric    return (b2 >= 0x80 && b2 <= 0xBF) ? 2 : 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 == 0xF0) {
5ca98fd9SDimitry Andric    if (b2 >= 0x90 && b2 <= 0xBF) {
5ca98fd9SDimitry Andric      if (source == sourceEnd)
5ca98fd9SDimitry Andric        return 2;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric      b3 = *source;
5ca98fd9SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5ca98fd9SDimitry Andric    }
5ca98fd9SDimitry Andric    return 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 >= 0xF1 && b1 <= 0xF3) {
5ca98fd9SDimitry Andric    if (b2 >= 0x80 && b2 <= 0xBF) {
5ca98fd9SDimitry Andric      if (source == sourceEnd)
5ca98fd9SDimitry Andric        return 2;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric      b3 = *source;
5ca98fd9SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5ca98fd9SDimitry Andric    }
5ca98fd9SDimitry Andric    return 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric  if (b1 == 0xF4) {
5ca98fd9SDimitry Andric    if (b2 >= 0x80 && b2 <= 0x8F) {
5ca98fd9SDimitry Andric      if (source == sourceEnd)
5ca98fd9SDimitry Andric        return 2;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric      b3 = *source;
5ca98fd9SDimitry Andric      return (b3 >= 0x80 && b3 <= 0xBF) ? 3 : 2;
5ca98fd9SDimitry Andric    }
5ca98fd9SDimitry Andric    return 1;
5ca98fd9SDimitry Andric  }
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric  assert((b1 >= 0x80 && b1 <= 0xC1) || b1 >= 0xF5);
5ca98fd9SDimitry Andric  /*
5ca98fd9SDimitry Andric   * There are no valid sequences that start with these bytes.  Maximal subpart
5ca98fd9SDimitry Andric   * is defined to have length 1 in these cases.
5ca98fd9SDimitry Andric   */
5ca98fd9SDimitry Andric  return 1;
5ca98fd9SDimitry Andric}
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric/* --------------------------------------------------------------------- */
5ca98fd9SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Exported function to return the total number of bytes in a codepoint
4a16efa3SDimitry Andric * represented in UTF-8, given the value of the first byte.
4a16efa3SDimitry Andric */
4a16efa3SDimitry Andricunsigned getNumBytesForUTF8(UTF8 first) {
4a16efa3SDimitry Andric  return trailingBytesForUTF8[first] + 1;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/*
4a16efa3SDimitry Andric * Exported function to return whether a UTF-8 string is legal or not.
4a16efa3SDimitry Andric * This is not used here; it's just exported.
4a16efa3SDimitry Andric */
4a16efa3SDimitry AndricBoolean isLegalUTF8String(const UTF8 **source, const UTF8 *sourceEnd) {
4a16efa3SDimitry Andric    while (*source != sourceEnd) {
4a16efa3SDimitry Andric        int length = trailingBytesForUTF8[**source] + 1;
4a16efa3SDimitry Andric        if (length > sourceEnd - *source || !isLegalUTF8(*source, length))
4a16efa3SDimitry Andric            return false;
4a16efa3SDimitry Andric        *source += length;
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    return true;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
4a16efa3SDimitry AndricConversionResult ConvertUTF8toUTF16 (
4a16efa3SDimitry Andric        const UTF8** sourceStart, const UTF8* sourceEnd,
4a16efa3SDimitry Andric        UTF16** targetStart, UTF16* targetEnd, ConversionFlags flags) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF8* source = *sourceStart;
4a16efa3SDimitry Andric    UTF16* target = *targetStart;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        UTF32 ch = 0;
4a16efa3SDimitry Andric        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
4a16efa3SDimitry Andric        if (extraBytesToRead >= sourceEnd - source) {
4a16efa3SDimitry Andric            result = sourceExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        /* Do this check whether lenient or strict */
4a16efa3SDimitry Andric        if (!isLegalUTF8(source, extraBytesToRead+1)) {
4a16efa3SDimitry Andric            result = sourceIllegal;
4a16efa3SDimitry Andric            break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        /*
4a16efa3SDimitry Andric         * The cases all fall through. See "Note A" below.
4a16efa3SDimitry Andric         */
4a16efa3SDimitry Andric        switch (extraBytesToRead) {
4a16efa3SDimitry Andric            case 5: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
4a16efa3SDimitry Andric            case 4: ch += *source++; ch <<= 6; /* remember, illegal UTF-8 */
4a16efa3SDimitry Andric            case 3: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 2: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 1: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 0: ch += *source++;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        ch -= offsetsFromUTF8[extraBytesToRead];
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric        if (target >= targetEnd) {
4a16efa3SDimitry Andric            source -= (extraBytesToRead+1); /* Back up source pointer! */
4a16efa3SDimitry Andric            result = targetExhausted; break;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        if (ch <= UNI_MAX_BMP) { /* Target is a character <= 0xFFFF */
4a16efa3SDimitry Andric            /* UTF-16 surrogate values are illegal in UTF-32 */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                if (flags == strictConversion) {
4a16efa3SDimitry Andric                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
4a16efa3SDimitry Andric                    result = sourceIllegal;
4a16efa3SDimitry Andric                    break;
4a16efa3SDimitry Andric                } else {
4a16efa3SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric                }
4a16efa3SDimitry Andric            } else {
4a16efa3SDimitry Andric                *target++ = (UTF16)ch; /* normal case */
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else if (ch > UNI_MAX_UTF16) {
4a16efa3SDimitry Andric            if (flags == strictConversion) {
4a16efa3SDimitry Andric                result = sourceIllegal;
4a16efa3SDimitry Andric                source -= (extraBytesToRead+1); /* return to the start */
4a16efa3SDimitry Andric                break; /* Bail out; shouldn't continue */
4a16efa3SDimitry Andric            } else {
4a16efa3SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else {
4a16efa3SDimitry Andric            /* target is a character in range 0xFFFF - 0x10FFFF. */
4a16efa3SDimitry Andric            if (target + 1 >= targetEnd) {
4a16efa3SDimitry Andric                source -= (extraBytesToRead+1); /* Back up source pointer! */
4a16efa3SDimitry Andric                result = targetExhausted; break;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric            ch -= halfBase;
4a16efa3SDimitry Andric            *target++ = (UTF16)((ch >> halfShift) + UNI_SUR_HIGH_START);
4a16efa3SDimitry Andric            *target++ = (UTF16)((ch & halfMask) + UNI_SUR_LOW_START);
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric/* --------------------------------------------------------------------- */
4a16efa3SDimitry Andric
5ca98fd9SDimitry Andricstatic ConversionResult ConvertUTF8toUTF32Impl(
4a16efa3SDimitry Andric        const UTF8** sourceStart, const UTF8* sourceEnd,
5ca98fd9SDimitry Andric        UTF32** targetStart, UTF32* targetEnd, ConversionFlags flags,
5ca98fd9SDimitry Andric        Boolean InputIsPartial) {
4a16efa3SDimitry Andric    ConversionResult result = conversionOK;
4a16efa3SDimitry Andric    const UTF8* source = *sourceStart;
4a16efa3SDimitry Andric    UTF32* target = *targetStart;
4a16efa3SDimitry Andric    while (source < sourceEnd) {
4a16efa3SDimitry Andric        UTF32 ch = 0;
4a16efa3SDimitry Andric        unsigned short extraBytesToRead = trailingBytesForUTF8[*source];
4a16efa3SDimitry Andric        if (extraBytesToRead >= sourceEnd - source) {
5ca98fd9SDimitry Andric            if (flags == strictConversion || InputIsPartial) {
5ca98fd9SDimitry Andric                result = sourceExhausted;
5ca98fd9SDimitry Andric                break;
5ca98fd9SDimitry Andric            } else {
5ca98fd9SDimitry Andric                result = sourceIllegal;
5ca98fd9SDimitry Andric
5ca98fd9SDimitry Andric                /*
5ca98fd9SDimitry Andric                 * Replace the maximal subpart of ill-formed sequence with
5ca98fd9SDimitry Andric                 * replacement character.
5ca98fd9SDimitry Andric                 */
5ca98fd9SDimitry Andric                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
5ca98fd9SDimitry Andric                                                                    sourceEnd);
5ca98fd9SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
5ca98fd9SDimitry Andric                continue;
4a16efa3SDimitry Andric            }
5ca98fd9SDimitry Andric        }
5ca98fd9SDimitry Andric        if (target >= targetEnd) {
5ca98fd9SDimitry Andric            result = targetExhausted; break;
5ca98fd9SDimitry Andric        }
5ca98fd9SDimitry Andric
4a16efa3SDimitry Andric        /* Do this check whether lenient or strict */
4a16efa3SDimitry Andric        if (!isLegalUTF8(source, extraBytesToRead+1)) {
4a16efa3SDimitry Andric            result = sourceIllegal;
5ca98fd9SDimitry Andric            if (flags == strictConversion) {
5ca98fd9SDimitry Andric                /* Abort conversion. */
4a16efa3SDimitry Andric                break;
5ca98fd9SDimitry Andric            } else {
5ca98fd9SDimitry Andric                /*
5ca98fd9SDimitry Andric                 * Replace the maximal subpart of ill-formed sequence with
5ca98fd9SDimitry Andric                 * replacement character.
5ca98fd9SDimitry Andric                 */
5ca98fd9SDimitry Andric                source += findMaximalSubpartOfIllFormedUTF8Sequence(source,
5ca98fd9SDimitry Andric                                                                    sourceEnd);
5ca98fd9SDimitry Andric                *target++ = UNI_REPLACEMENT_CHAR;
5ca98fd9SDimitry Andric                continue;
5ca98fd9SDimitry Andric            }
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        /*
4a16efa3SDimitry Andric         * The cases all fall through. See "Note A" below.
4a16efa3SDimitry Andric         */
4a16efa3SDimitry Andric        switch (extraBytesToRead) {
4a16efa3SDimitry Andric            case 5: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 4: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 3: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 2: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 1: ch += *source++; ch <<= 6;
4a16efa3SDimitry Andric            case 0: ch += *source++;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric        ch -= offsetsFromUTF8[extraBytesToRead];
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric        if (ch <= UNI_MAX_LEGAL_UTF32) {
4a16efa3SDimitry Andric            /*
4a16efa3SDimitry Andric             * UTF-16 surrogate values are illegal in UTF-32, and anything
4a16efa3SDimitry Andric             * over Plane 17 (> 0x10FFFF) is illegal.
4a16efa3SDimitry Andric             */
4a16efa3SDimitry Andric            if (ch >= UNI_SUR_HIGH_START && ch <= UNI_SUR_LOW_END) {
4a16efa3SDimitry Andric                if (flags == strictConversion) {
4a16efa3SDimitry Andric                    source -= (extraBytesToRead+1); /* return to the illegal value itself */
4a16efa3SDimitry Andric                    result = sourceIllegal;
4a16efa3SDimitry Andric                    break;
4a16efa3SDimitry Andric                } else {
4a16efa3SDimitry Andric                    *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric                }
4a16efa3SDimitry Andric            } else {
4a16efa3SDimitry Andric                *target++ = ch;
4a16efa3SDimitry Andric            }
4a16efa3SDimitry Andric        } else { /* i.e., ch > UNI_MAX_LEGAL_UTF32 */
4a16efa3SDimitry Andric            result = sourceIllegal;
4a16efa3SDimitry Andric            *target++ = UNI_REPLACEMENT_CHAR;
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric    }
4a16efa3SDimitry Andric    *sourceStart = source;
4a16efa3SDimitry Andric    *targetStart = target;
4a16efa3SDimitry Andric    return result;
4a16efa3SDimitry Andric}
4a16efa3SDimitry Andric
5ca98fd9SDimitry AndricConversionResult ConvertUTF8toUTF32Partial(const UTF8 **sourceStart,
5ca98fd9SDimitry Andric                                           const UTF8 *sourceEnd,
5ca98fd9SDimitry Andric                                           UTF32 **targetStart,
5ca98fd9SDimitry Andric                                           UTF32 *targetEnd,
5ca98fd9SDimitry Andric                                           ConversionFlags flags) {
5ca98fd9SDimitry Andric  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
5ca98fd9SDimitry Andric                                flags, /*InputIsPartial=*/true);
5ca98fd9SDimitry Andric}
5ca98fd9SDimitry Andric
5ca98fd9SDimitry AndricConversionResult ConvertUTF8toUTF32(const UTF8 **sourceStart,
5ca98fd9SDimitry Andric                                    const UTF8 *sourceEnd, UTF32 **targetStart,
5ca98fd9SDimitry Andric                                    UTF32 *targetEnd, ConversionFlags flags) {
5ca98fd9SDimitry Andric  return ConvertUTF8toUTF32Impl(sourceStart, sourceEnd, targetStart, targetEnd,
5ca98fd9SDimitry Andric                                flags, /*InputIsPartial=*/false);
5ca98fd9SDimitry Andric}
5ca98fd9SDimitry Andric
4a16efa3SDimitry Andric/* ---------------------------------------------------------------------
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric    Note A.
4a16efa3SDimitry Andric    The fall-through switches in UTF-8 reading code save a
4a16efa3SDimitry Andric    temp variable, some decrements & conditionals.  The switches
4a16efa3SDimitry Andric    are equivalent to the following loop:
4a16efa3SDimitry Andric        {
4a16efa3SDimitry Andric            int tmpBytesToRead = extraBytesToRead+1;
4a16efa3SDimitry Andric            do {
4a16efa3SDimitry Andric                ch += *source++;
4a16efa3SDimitry Andric                --tmpBytesToRead;
4a16efa3SDimitry Andric                if (tmpBytesToRead) ch <<= 6;
4a16efa3SDimitry Andric            } while (tmpBytesToRead > 0);
4a16efa3SDimitry Andric        }
4a16efa3SDimitry Andric    In UTF-8 writing code, the switches on "bytesToWrite" are
4a16efa3SDimitry Andric    similarly unrolled loops.
4a16efa3SDimitry Andric
4a16efa3SDimitry Andric   --------------------------------------------------------------------- */
b915e9e0SDimitry Andric
b915e9e0SDimitry Andric} // namespace llvm
ab44ce3dSDimitry Andric
ab44ce3dSDimitry AndricConvertUTF_RESTORE_WARNINGS