1145449b1SDimitry Andric //===- llvm/Support/UnicodeNameToCodepoint.cpp - Unicode character properties
2145449b1SDimitry Andric //-*- C++ -*-===//
3145449b1SDimitry Andric //
4145449b1SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
5145449b1SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
6145449b1SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
7145449b1SDimitry Andric //
8145449b1SDimitry Andric //===----------------------------------------------------------------------===//
9145449b1SDimitry Andric //
10145449b1SDimitry Andric // This file implements functions to map the name or alias of a unicode
11145449b1SDimitry Andric // character to its codepoint.
12145449b1SDimitry Andric //
13145449b1SDimitry Andric //===----------------------------------------------------------------------===//
14145449b1SDimitry Andric
15145449b1SDimitry Andric #include "llvm/ADT/STLExtras.h"
16145449b1SDimitry Andric #include "llvm/ADT/StringExtras.h"
17145449b1SDimitry Andric #include "llvm/ADT/StringRef.h"
18145449b1SDimitry Andric #include "llvm/Support/Unicode.h"
19145449b1SDimitry Andric
20145449b1SDimitry Andric namespace llvm {
21145449b1SDimitry Andric namespace sys {
22145449b1SDimitry Andric namespace unicode {
23145449b1SDimitry Andric
24145449b1SDimitry Andric extern const char *UnicodeNameToCodepointDict;
25145449b1SDimitry Andric extern const uint8_t *UnicodeNameToCodepointIndex;
26145449b1SDimitry Andric extern const std::size_t UnicodeNameToCodepointIndexSize;
27145449b1SDimitry Andric extern const std::size_t UnicodeNameToCodepointLargestNameSize;
28145449b1SDimitry Andric
29145449b1SDimitry Andric using BufferType = SmallString<64>;
30145449b1SDimitry Andric
31145449b1SDimitry Andric struct Node {
32145449b1SDimitry Andric bool IsRoot = false;
33145449b1SDimitry Andric char32_t Value = 0xFFFFFFFF;
34145449b1SDimitry Andric uint32_t ChildrenOffset = 0;
35145449b1SDimitry Andric bool HasSibling = false;
36145449b1SDimitry Andric uint32_t Size = 0;
37145449b1SDimitry Andric StringRef Name;
38145449b1SDimitry Andric const Node *Parent = nullptr;
39145449b1SDimitry Andric
isValidllvm::sys::unicode::Node40145449b1SDimitry Andric constexpr bool isValid() const {
41145449b1SDimitry Andric return !Name.empty() || Value == 0xFFFFFFFF;
42145449b1SDimitry Andric }
hasChildrenllvm::sys::unicode::Node43145449b1SDimitry Andric constexpr bool hasChildren() const { return ChildrenOffset != 0 || IsRoot; }
44145449b1SDimitry Andric
fullNamellvm::sys::unicode::Node45145449b1SDimitry Andric std::string fullName() const {
46145449b1SDimitry Andric std::string S;
47145449b1SDimitry Andric // Reserve enough space for most unicode code points.
48145449b1SDimitry Andric // The chosen value represent the 99th percentile of name size as of
49e3b55780SDimitry Andric // Unicode 15.0.
50145449b1SDimitry Andric S.reserve(46);
51145449b1SDimitry Andric const Node *N = this;
52145449b1SDimitry Andric while (N) {
53145449b1SDimitry Andric std::reverse_copy(N->Name.begin(), N->Name.end(), std::back_inserter(S));
54145449b1SDimitry Andric N = N->Parent;
55145449b1SDimitry Andric }
56145449b1SDimitry Andric std::reverse(S.begin(), S.end());
57145449b1SDimitry Andric return S;
58145449b1SDimitry Andric }
59145449b1SDimitry Andric };
60145449b1SDimitry Andric
createRoot()61145449b1SDimitry Andric static Node createRoot() {
62145449b1SDimitry Andric Node N;
63145449b1SDimitry Andric N.IsRoot = true;
64145449b1SDimitry Andric N.ChildrenOffset = 1;
65145449b1SDimitry Andric N.Size = 1;
66145449b1SDimitry Andric return N;
67145449b1SDimitry Andric }
68145449b1SDimitry Andric
readNode(uint32_t Offset,const Node * Parent=nullptr)69145449b1SDimitry Andric static Node readNode(uint32_t Offset, const Node *Parent = nullptr) {
70145449b1SDimitry Andric if (Offset == 0)
71145449b1SDimitry Andric return createRoot();
72145449b1SDimitry Andric
73145449b1SDimitry Andric uint32_t Origin = Offset;
74145449b1SDimitry Andric Node N;
75145449b1SDimitry Andric N.Parent = Parent;
76145449b1SDimitry Andric uint8_t NameInfo = UnicodeNameToCodepointIndex[Offset++];
77145449b1SDimitry Andric if (Offset + 6 >= UnicodeNameToCodepointIndexSize)
78145449b1SDimitry Andric return N;
79145449b1SDimitry Andric
80145449b1SDimitry Andric bool LongName = NameInfo & 0x40;
81145449b1SDimitry Andric bool HasValue = NameInfo & 0x80;
82145449b1SDimitry Andric std::size_t Size = NameInfo & ~0xC0;
83145449b1SDimitry Andric if (LongName) {
84145449b1SDimitry Andric uint32_t NameOffset = (UnicodeNameToCodepointIndex[Offset++] << 8);
85145449b1SDimitry Andric NameOffset |= UnicodeNameToCodepointIndex[Offset++];
86145449b1SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + NameOffset, Size);
87145449b1SDimitry Andric } else {
88145449b1SDimitry Andric N.Name = StringRef(UnicodeNameToCodepointDict + Size, 1);
89145449b1SDimitry Andric }
90145449b1SDimitry Andric if (HasValue) {
91145449b1SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++];
92145449b1SDimitry Andric uint8_t M = UnicodeNameToCodepointIndex[Offset++];
93145449b1SDimitry Andric uint8_t L = UnicodeNameToCodepointIndex[Offset++];
94145449b1SDimitry Andric N.Value = ((H << 16) | (M << 8) | L) >> 3;
95145449b1SDimitry Andric
96145449b1SDimitry Andric bool HasChildren = L & 0x02;
97145449b1SDimitry Andric N.HasSibling = L & 0x01;
98145449b1SDimitry Andric
99145449b1SDimitry Andric if (HasChildren) {
100145449b1SDimitry Andric N.ChildrenOffset = UnicodeNameToCodepointIndex[Offset++] << 16;
101145449b1SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++] << 8;
102145449b1SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
103145449b1SDimitry Andric }
104145449b1SDimitry Andric } else {
105145449b1SDimitry Andric uint8_t H = UnicodeNameToCodepointIndex[Offset++];
106145449b1SDimitry Andric N.HasSibling = H & 0x80;
107145449b1SDimitry Andric bool HasChildren = H & 0x40;
108e3b55780SDimitry Andric H &= uint8_t(~0xC0);
109145449b1SDimitry Andric if (HasChildren) {
110145449b1SDimitry Andric N.ChildrenOffset = (H << 16);
111145449b1SDimitry Andric N.ChildrenOffset |=
112145449b1SDimitry Andric (uint32_t(UnicodeNameToCodepointIndex[Offset++]) << 8);
113145449b1SDimitry Andric N.ChildrenOffset |= UnicodeNameToCodepointIndex[Offset++];
114145449b1SDimitry Andric }
115145449b1SDimitry Andric }
116145449b1SDimitry Andric N.Size = Offset - Origin;
117145449b1SDimitry Andric return N;
118145449b1SDimitry Andric }
119145449b1SDimitry Andric
startsWith(StringRef Name,StringRef Needle,bool Strict,std::size_t & Consummed,char & PreviousCharInName,bool IsPrefix=false)120145449b1SDimitry Andric static bool startsWith(StringRef Name, StringRef Needle, bool Strict,
121145449b1SDimitry Andric std::size_t &Consummed, char &PreviousCharInName,
122b1c73532SDimitry Andric bool IsPrefix = false) {
123145449b1SDimitry Andric
124145449b1SDimitry Andric Consummed = 0;
125145449b1SDimitry Andric if (Strict) {
126b1c73532SDimitry Andric if (!Name.starts_with(Needle))
127145449b1SDimitry Andric return false;
128145449b1SDimitry Andric Consummed = Needle.size();
129145449b1SDimitry Andric return true;
130145449b1SDimitry Andric }
131145449b1SDimitry Andric if (Needle.empty())
132145449b1SDimitry Andric return true;
133145449b1SDimitry Andric
134145449b1SDimitry Andric auto NamePos = Name.begin();
135145449b1SDimitry Andric auto NeedlePos = Needle.begin();
136145449b1SDimitry Andric
137145449b1SDimitry Andric char PreviousCharInNameOrigin = PreviousCharInName;
138b1c73532SDimitry Andric char PreviousCharInNeedle = *Needle.begin();
139145449b1SDimitry Andric auto IgnoreSpaces = [](auto It, auto End, char &PreviousChar,
140b1c73532SDimitry Andric bool IsPrefix = false) {
141145449b1SDimitry Andric while (It != End) {
142145449b1SDimitry Andric const auto Next = std::next(It);
143145449b1SDimitry Andric // Ignore spaces, underscore, medial hyphens
144b1c73532SDimitry Andric // The generator ensures a needle never ends (or starts) by a medial
145b1c73532SDimitry Andric // hyphen https://unicode.org/reports/tr44/#UAX44-LM2.
146145449b1SDimitry Andric bool Ignore =
147145449b1SDimitry Andric *It == ' ' || *It == '_' ||
148145449b1SDimitry Andric (*It == '-' && isAlnum(PreviousChar) &&
149b1c73532SDimitry Andric ((Next != End && isAlnum(*Next)) || (Next == End && IsPrefix)));
150145449b1SDimitry Andric PreviousChar = *It;
151145449b1SDimitry Andric if (!Ignore)
152145449b1SDimitry Andric break;
153145449b1SDimitry Andric ++It;
154145449b1SDimitry Andric }
155145449b1SDimitry Andric return It;
156145449b1SDimitry Andric };
157145449b1SDimitry Andric
158145449b1SDimitry Andric while (true) {
159145449b1SDimitry Andric NamePos = IgnoreSpaces(NamePos, Name.end(), PreviousCharInName);
160145449b1SDimitry Andric NeedlePos =
161145449b1SDimitry Andric IgnoreSpaces(NeedlePos, Needle.end(), PreviousCharInNeedle, IsPrefix);
162145449b1SDimitry Andric if (NeedlePos == Needle.end())
163145449b1SDimitry Andric break;
164145449b1SDimitry Andric if (NamePos == Name.end())
165145449b1SDimitry Andric break;
166145449b1SDimitry Andric if (toUpper(*NeedlePos) != toUpper(*NamePos))
167145449b1SDimitry Andric break;
168145449b1SDimitry Andric NeedlePos++;
169145449b1SDimitry Andric NamePos++;
170145449b1SDimitry Andric }
171145449b1SDimitry Andric Consummed = std::distance(Name.begin(), NamePos);
172145449b1SDimitry Andric if (NeedlePos != Needle.end()) {
173145449b1SDimitry Andric PreviousCharInName = PreviousCharInNameOrigin;
174145449b1SDimitry Andric }
175145449b1SDimitry Andric return NeedlePos == Needle.end();
176145449b1SDimitry Andric }
177145449b1SDimitry Andric
178145449b1SDimitry Andric static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,char PreviousCharInName,BufferType & Buffer,const Node * Parent=nullptr)179145449b1SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict,
180b1c73532SDimitry Andric char PreviousCharInName, BufferType &Buffer,
181b1c73532SDimitry Andric const Node *Parent = nullptr) {
182145449b1SDimitry Andric Node N = readNode(Offset, Parent);
183145449b1SDimitry Andric std::size_t Consummed = 0;
184b1c73532SDimitry Andric bool DoesStartWith = N.IsRoot || startsWith(Name, N.Name, Strict, Consummed,
185b1c73532SDimitry Andric PreviousCharInName);
186145449b1SDimitry Andric if (!DoesStartWith)
187145449b1SDimitry Andric return std::make_tuple(N, false, 0);
188145449b1SDimitry Andric
189145449b1SDimitry Andric if (Name.size() - Consummed == 0 && N.Value != 0xFFFFFFFF)
190145449b1SDimitry Andric return std::make_tuple(N, true, N.Value);
191145449b1SDimitry Andric
192145449b1SDimitry Andric if (N.hasChildren()) {
193145449b1SDimitry Andric uint32_t ChildOffset = N.ChildrenOffset;
194145449b1SDimitry Andric for (;;) {
195145449b1SDimitry Andric Node C;
196145449b1SDimitry Andric bool Matches;
197145449b1SDimitry Andric uint32_t Value;
198145449b1SDimitry Andric std::tie(C, Matches, Value) =
199145449b1SDimitry Andric compareNode(ChildOffset, Name.substr(Consummed), Strict,
200b1c73532SDimitry Andric PreviousCharInName, Buffer, &N);
201145449b1SDimitry Andric if (Matches) {
202145449b1SDimitry Andric std::reverse_copy(C.Name.begin(), C.Name.end(),
203145449b1SDimitry Andric std::back_inserter(Buffer));
204145449b1SDimitry Andric return std::make_tuple(N, true, Value);
205145449b1SDimitry Andric }
206145449b1SDimitry Andric ChildOffset += C.Size;
207145449b1SDimitry Andric if (!C.HasSibling)
208145449b1SDimitry Andric break;
209145449b1SDimitry Andric }
210145449b1SDimitry Andric }
211145449b1SDimitry Andric return std::make_tuple(N, false, 0);
212145449b1SDimitry Andric }
213145449b1SDimitry Andric
214145449b1SDimitry Andric static std::tuple<Node, bool, uint32_t>
compareNode(uint32_t Offset,StringRef Name,bool Strict,BufferType & Buffer)215145449b1SDimitry Andric compareNode(uint32_t Offset, StringRef Name, bool Strict, BufferType &Buffer) {
216b1c73532SDimitry Andric return compareNode(Offset, Name, Strict, 0, Buffer);
217145449b1SDimitry Andric }
218145449b1SDimitry Andric
219145449b1SDimitry Andric // clang-format off
220145449b1SDimitry Andric constexpr const char *const HangulSyllables[][3] = {
221145449b1SDimitry Andric { "G", "A", "" },
222145449b1SDimitry Andric { "GG", "AE", "G" },
223145449b1SDimitry Andric { "N", "YA", "GG" },
224145449b1SDimitry Andric { "D", "YAE", "GS" },
225145449b1SDimitry Andric { "DD", "EO", "N", },
226145449b1SDimitry Andric { "R", "E", "NJ" },
227145449b1SDimitry Andric { "M", "YEO", "NH" },
228145449b1SDimitry Andric { "B", "YE", "D" },
229145449b1SDimitry Andric { "BB", "O", "L" },
230145449b1SDimitry Andric { "S", "WA", "LG" },
231145449b1SDimitry Andric { "SS", "WAE", "LM" },
232145449b1SDimitry Andric { "", "OE", "LB" },
233145449b1SDimitry Andric { "J", "YO", "LS" },
234145449b1SDimitry Andric { "JJ", "U", "LT" },
235145449b1SDimitry Andric { "C", "WEO", "LP" },
236145449b1SDimitry Andric { "K", "WE", "LH" },
237145449b1SDimitry Andric { "T", "WI", "M" },
238145449b1SDimitry Andric { "P", "YU", "B" },
239145449b1SDimitry Andric { "H", "EU", "BS" },
240145449b1SDimitry Andric { 0, "YI", "S" },
241145449b1SDimitry Andric { 0, "I", "SS" },
242145449b1SDimitry Andric { 0, 0, "NG" },
243145449b1SDimitry Andric { 0, 0, "J" },
244145449b1SDimitry Andric { 0, 0, "C" },
245145449b1SDimitry Andric { 0, 0, "K" },
246145449b1SDimitry Andric { 0, 0, "T" },
247145449b1SDimitry Andric { 0, 0, "P" },
248145449b1SDimitry Andric { 0, 0, "H" }
249145449b1SDimitry Andric };
250145449b1SDimitry Andric // clang-format on
251145449b1SDimitry Andric
252e3b55780SDimitry Andric // Unicode 15.0
253145449b1SDimitry Andric // 3.12 Conjoining Jamo Behavior Common constants
254145449b1SDimitry Andric constexpr const char32_t SBase = 0xAC00;
255145449b1SDimitry Andric constexpr const uint32_t LCount = 19;
256145449b1SDimitry Andric constexpr const uint32_t VCount = 21;
257145449b1SDimitry Andric constexpr const uint32_t TCount = 28;
258145449b1SDimitry Andric
findSyllable(StringRef Name,bool Strict,char & PreviousInName,int & Pos,int Column)259145449b1SDimitry Andric static std::size_t findSyllable(StringRef Name, bool Strict,
260145449b1SDimitry Andric char &PreviousInName, int &Pos, int Column) {
261145449b1SDimitry Andric assert(Column == 0 || Column == 1 || Column == 2);
262145449b1SDimitry Andric static std::size_t CountPerColumn[] = {LCount, VCount, TCount};
263145449b1SDimitry Andric int Len = -1;
264145449b1SDimitry Andric int Prev = PreviousInName;
265145449b1SDimitry Andric for (std::size_t I = 0; I < CountPerColumn[Column]; I++) {
266145449b1SDimitry Andric StringRef Syllable(HangulSyllables[I][Column]);
267145449b1SDimitry Andric if (int(Syllable.size()) <= Len)
268145449b1SDimitry Andric continue;
269145449b1SDimitry Andric std::size_t Consummed = 0;
270145449b1SDimitry Andric char PreviousInNameCopy = PreviousInName;
271b1c73532SDimitry Andric bool DoesStartWith =
272b1c73532SDimitry Andric startsWith(Name, Syllable, Strict, Consummed, PreviousInNameCopy);
273145449b1SDimitry Andric if (!DoesStartWith)
274145449b1SDimitry Andric continue;
275145449b1SDimitry Andric Len = Consummed;
276145449b1SDimitry Andric Pos = I;
277145449b1SDimitry Andric Prev = PreviousInNameCopy;
278145449b1SDimitry Andric }
279145449b1SDimitry Andric if (Len == -1)
280145449b1SDimitry Andric return 0;
281145449b1SDimitry Andric PreviousInName = Prev;
282145449b1SDimitry Andric return size_t(Len);
283145449b1SDimitry Andric }
284145449b1SDimitry Andric
285e3b55780SDimitry Andric static std::optional<char32_t>
nameToHangulCodePoint(StringRef Name,bool Strict,BufferType & Buffer)286145449b1SDimitry Andric nameToHangulCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
287145449b1SDimitry Andric Buffer.clear();
288145449b1SDimitry Andric // Hangul Syllable Decomposition
289145449b1SDimitry Andric std::size_t Consummed = 0;
290b1c73532SDimitry Andric char NameStart = 0;
291b1c73532SDimitry Andric bool DoesStartWith =
292b1c73532SDimitry Andric startsWith(Name, "HANGUL SYLLABLE ", Strict, Consummed, NameStart);
293145449b1SDimitry Andric if (!DoesStartWith)
294e3b55780SDimitry Andric return std::nullopt;
295145449b1SDimitry Andric Name = Name.substr(Consummed);
296145449b1SDimitry Andric int L = -1, V = -1, T = -1;
297145449b1SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, L, 0));
298145449b1SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, V, 1));
299145449b1SDimitry Andric Name = Name.substr(findSyllable(Name, Strict, NameStart, T, 2));
300145449b1SDimitry Andric if (L != -1 && V != -1 && T != -1 && Name.empty()) {
301145449b1SDimitry Andric if (!Strict) {
302145449b1SDimitry Andric Buffer.append("HANGUL SYLLABLE ");
303145449b1SDimitry Andric if (L != -1)
304145449b1SDimitry Andric Buffer.append(HangulSyllables[L][0]);
305145449b1SDimitry Andric if (V != -1)
306145449b1SDimitry Andric Buffer.append(HangulSyllables[V][1]);
307145449b1SDimitry Andric if (T != -1)
308145449b1SDimitry Andric Buffer.append(HangulSyllables[T][2]);
309145449b1SDimitry Andric }
310145449b1SDimitry Andric return SBase + (std::uint32_t(L) * VCount + std::uint32_t(V)) * TCount +
311145449b1SDimitry Andric std::uint32_t(T);
312145449b1SDimitry Andric }
313145449b1SDimitry Andric // Otherwise, it's an illegal syllable name.
314e3b55780SDimitry Andric return std::nullopt;
315145449b1SDimitry Andric }
316145449b1SDimitry Andric
317145449b1SDimitry Andric struct GeneratedNamesData {
318145449b1SDimitry Andric StringRef Prefix;
319145449b1SDimitry Andric uint32_t Start;
320145449b1SDimitry Andric uint32_t End;
321145449b1SDimitry Andric };
322145449b1SDimitry Andric
3234df029ccSDimitry Andric // Unicode 15.1 Table 4-8. Name Derivation Rule Prefix Strings
324145449b1SDimitry Andric static const GeneratedNamesData GeneratedNamesDataTable[] = {
325145449b1SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x3400, 0x4DBF},
326e3b55780SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x4E00, 0x9FFF},
327e3b55780SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x20000, 0x2A6DF},
328e3b55780SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2A700, 0x2B739},
329145449b1SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B740, 0x2B81D},
330145449b1SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2B820, 0x2CEA1},
331145449b1SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2CEB0, 0x2EBE0},
3324df029ccSDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x2EBF0, 0x2EE5D},
333145449b1SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x30000, 0x3134A},
334e3b55780SDimitry Andric {"CJK UNIFIED IDEOGRAPH-", 0x31350, 0x323AF},
335145449b1SDimitry Andric {"TANGUT IDEOGRAPH-", 0x17000, 0x187F7},
336145449b1SDimitry Andric {"TANGUT IDEOGRAPH-", 0x18D00, 0x18D08},
337145449b1SDimitry Andric {"KHITAN SMALL SCRIPT CHARACTER-", 0x18B00, 0x18CD5},
338145449b1SDimitry Andric {"NUSHU CHARACTER-", 0x1B170, 0x1B2FB},
339145449b1SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xF900, 0xFA6D},
340145449b1SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0xFA70, 0xFAD9},
341145449b1SDimitry Andric {"CJK COMPATIBILITY IDEOGRAPH-", 0x2F800, 0x2FA1D},
342145449b1SDimitry Andric };
343145449b1SDimitry Andric
344e3b55780SDimitry Andric static std::optional<char32_t>
nameToGeneratedCodePoint(StringRef Name,bool Strict,BufferType & Buffer)345145449b1SDimitry Andric nameToGeneratedCodePoint(StringRef Name, bool Strict, BufferType &Buffer) {
346145449b1SDimitry Andric for (auto &&Item : GeneratedNamesDataTable) {
347145449b1SDimitry Andric Buffer.clear();
348145449b1SDimitry Andric std::size_t Consummed = 0;
349b1c73532SDimitry Andric char NameStart = 0;
350145449b1SDimitry Andric bool DoesStartWith = startsWith(Name, Item.Prefix, Strict, Consummed,
351b1c73532SDimitry Andric NameStart, /*IsPrefix=*/true);
352145449b1SDimitry Andric if (!DoesStartWith)
353145449b1SDimitry Andric continue;
354145449b1SDimitry Andric auto Number = Name.substr(Consummed);
355145449b1SDimitry Andric unsigned long long V = 0;
356145449b1SDimitry Andric // Be consistent about mandating upper casing.
357145449b1SDimitry Andric if (Strict &&
358145449b1SDimitry Andric llvm::any_of(Number, [](char C) { return C >= 'a' && C <= 'f'; }))
359145449b1SDimitry Andric return {};
360145449b1SDimitry Andric if (getAsUnsignedInteger(Number, 16, V) || V < Item.Start || V > Item.End)
361145449b1SDimitry Andric continue;
362145449b1SDimitry Andric if (!Strict) {
363145449b1SDimitry Andric Buffer.append(Item.Prefix);
364145449b1SDimitry Andric Buffer.append(utohexstr(V, true));
365145449b1SDimitry Andric }
366145449b1SDimitry Andric return V;
367145449b1SDimitry Andric }
368e3b55780SDimitry Andric return std::nullopt;
369145449b1SDimitry Andric }
370145449b1SDimitry Andric
nameToCodepoint(StringRef Name,bool Strict,BufferType & Buffer)371e3b55780SDimitry Andric static std::optional<char32_t> nameToCodepoint(StringRef Name, bool Strict,
372145449b1SDimitry Andric BufferType &Buffer) {
373145449b1SDimitry Andric if (Name.empty())
374e3b55780SDimitry Andric return std::nullopt;
375145449b1SDimitry Andric
376e3b55780SDimitry Andric std::optional<char32_t> Res = nameToHangulCodePoint(Name, Strict, Buffer);
377145449b1SDimitry Andric if (!Res)
378145449b1SDimitry Andric Res = nameToGeneratedCodePoint(Name, Strict, Buffer);
379145449b1SDimitry Andric if (Res)
380145449b1SDimitry Andric return *Res;
381145449b1SDimitry Andric
382145449b1SDimitry Andric Buffer.clear();
383145449b1SDimitry Andric Node Node;
384145449b1SDimitry Andric bool Matches;
385145449b1SDimitry Andric uint32_t Value;
386145449b1SDimitry Andric std::tie(Node, Matches, Value) = compareNode(0, Name, Strict, Buffer);
387145449b1SDimitry Andric if (Matches) {
388145449b1SDimitry Andric std::reverse(Buffer.begin(), Buffer.end());
389145449b1SDimitry Andric // UAX44-LM2. Ignore case, whitespace, underscore ('_'), and all medial
390145449b1SDimitry Andric // hyphens except the hyphen in U+1180 HANGUL JUNGSEONG O-E.
391b1c73532SDimitry Andric if (!Strict && Value == 0x116c && Name.contains_insensitive("O-E")) {
392145449b1SDimitry Andric Buffer = "HANGUL JUNGSEONG O-E";
393145449b1SDimitry Andric Value = 0x1180;
394145449b1SDimitry Andric }
395145449b1SDimitry Andric return Value;
396145449b1SDimitry Andric }
397e3b55780SDimitry Andric return std::nullopt;
398145449b1SDimitry Andric }
399145449b1SDimitry Andric
nameToCodepointStrict(StringRef Name)400e3b55780SDimitry Andric std::optional<char32_t> nameToCodepointStrict(StringRef Name) {
401145449b1SDimitry Andric
402145449b1SDimitry Andric BufferType Buffer;
403145449b1SDimitry Andric auto Opt = nameToCodepoint(Name, true, Buffer);
404145449b1SDimitry Andric return Opt;
405145449b1SDimitry Andric }
406145449b1SDimitry Andric
407e3b55780SDimitry Andric std::optional<LooseMatchingResult>
nameToCodepointLooseMatching(StringRef Name)408145449b1SDimitry Andric nameToCodepointLooseMatching(StringRef Name) {
409145449b1SDimitry Andric BufferType Buffer;
410145449b1SDimitry Andric auto Opt = nameToCodepoint(Name, false, Buffer);
411145449b1SDimitry Andric if (!Opt)
412e3b55780SDimitry Andric return std::nullopt;
413145449b1SDimitry Andric return LooseMatchingResult{*Opt, Buffer};
414145449b1SDimitry Andric }
415145449b1SDimitry Andric
416145449b1SDimitry Andric // Find the unicode character whose editing distance to Pattern
417145449b1SDimitry Andric // is shortest, using the Wagner–Fischer algorithm.
418145449b1SDimitry Andric llvm::SmallVector<MatchForCodepointName>
nearestMatchesForCodepointName(StringRef Pattern,std::size_t MaxMatchesCount)419145449b1SDimitry Andric nearestMatchesForCodepointName(StringRef Pattern, std::size_t MaxMatchesCount) {
420145449b1SDimitry Andric // We maintain a fixed size vector of matches,
421145449b1SDimitry Andric // sorted by distance
422145449b1SDimitry Andric // The worst match (with the biggest distance) are discarded when new elements
423145449b1SDimitry Andric // are added.
424145449b1SDimitry Andric std::size_t LargestEditDistance = 0;
425145449b1SDimitry Andric llvm::SmallVector<MatchForCodepointName> Matches;
426145449b1SDimitry Andric Matches.reserve(MaxMatchesCount + 1);
427145449b1SDimitry Andric
428145449b1SDimitry Andric auto Insert = [&](const Node &Node, uint32_t Distance,
429145449b1SDimitry Andric char32_t Value) -> bool {
430145449b1SDimitry Andric if (Distance > LargestEditDistance) {
431145449b1SDimitry Andric if (Matches.size() == MaxMatchesCount)
432145449b1SDimitry Andric return false;
433145449b1SDimitry Andric LargestEditDistance = Distance;
434145449b1SDimitry Andric }
435145449b1SDimitry Andric // To avoid allocations, the creation of the name is delayed
436145449b1SDimitry Andric // as much as possible.
437145449b1SDimitry Andric std::string Name;
438145449b1SDimitry Andric auto GetName = [&] {
439145449b1SDimitry Andric if (Name.empty())
440145449b1SDimitry Andric Name = Node.fullName();
441145449b1SDimitry Andric return Name;
442145449b1SDimitry Andric };
443145449b1SDimitry Andric
444e3b55780SDimitry Andric auto It = llvm::lower_bound(
445e3b55780SDimitry Andric Matches, Distance,
446145449b1SDimitry Andric [&](const MatchForCodepointName &a, std::size_t Distance) {
447145449b1SDimitry Andric if (Distance == a.Distance)
448145449b1SDimitry Andric return a.Name < GetName();
449145449b1SDimitry Andric return a.Distance < Distance;
450145449b1SDimitry Andric });
451145449b1SDimitry Andric if (It == Matches.end() && Matches.size() == MaxMatchesCount)
452145449b1SDimitry Andric return false;
453145449b1SDimitry Andric
454145449b1SDimitry Andric MatchForCodepointName M{GetName(), Distance, Value};
455145449b1SDimitry Andric Matches.insert(It, std::move(M));
456145449b1SDimitry Andric if (Matches.size() > MaxMatchesCount)
457145449b1SDimitry Andric Matches.pop_back();
458145449b1SDimitry Andric return true;
459145449b1SDimitry Andric };
460145449b1SDimitry Andric
461145449b1SDimitry Andric // We ignore case, space, hyphens, etc,
462145449b1SDimitry Andric // in both the search pattern and the prospective names.
463145449b1SDimitry Andric auto Normalize = [](StringRef Name) {
464145449b1SDimitry Andric std::string Out;
465145449b1SDimitry Andric Out.reserve(Name.size());
466145449b1SDimitry Andric for (char C : Name) {
467145449b1SDimitry Andric if (isAlnum(C))
468145449b1SDimitry Andric Out.push_back(toUpper(C));
469145449b1SDimitry Andric }
470145449b1SDimitry Andric return Out;
471145449b1SDimitry Andric };
472145449b1SDimitry Andric std::string NormalizedName = Normalize(Pattern);
473145449b1SDimitry Andric
474145449b1SDimitry Andric // Allocate a matrix big enough for longest names.
475145449b1SDimitry Andric const std::size_t Columns =
476145449b1SDimitry Andric std::min(NormalizedName.size(), UnicodeNameToCodepointLargestNameSize) +
477145449b1SDimitry Andric 1;
478145449b1SDimitry Andric
479145449b1SDimitry Andric LLVM_ATTRIBUTE_UNUSED static std::size_t Rows =
480145449b1SDimitry Andric UnicodeNameToCodepointLargestNameSize + 1;
481145449b1SDimitry Andric
482145449b1SDimitry Andric std::vector<char> Distances(
483145449b1SDimitry Andric Columns * (UnicodeNameToCodepointLargestNameSize + 1), 0);
484145449b1SDimitry Andric
485145449b1SDimitry Andric auto Get = [&Distances, Columns](size_t Column, std::size_t Row) -> char & {
486145449b1SDimitry Andric assert(Column < Columns);
487145449b1SDimitry Andric assert(Row < Rows);
488145449b1SDimitry Andric return Distances[Row * Columns + Column];
489145449b1SDimitry Andric };
490145449b1SDimitry Andric
491145449b1SDimitry Andric for (std::size_t I = 0; I < Columns; I++)
492145449b1SDimitry Andric Get(I, 0) = I;
493145449b1SDimitry Andric
494145449b1SDimitry Andric // Visit the childrens,
495145449b1SDimitry Andric // Filling (and overriding) the matrix for the name fragment of each node
496145449b1SDimitry Andric // iteratively. CompleteName is used to collect the actual name of potential
497145449b1SDimitry Andric // match, respecting case and spacing.
498145449b1SDimitry Andric auto VisitNode = [&](const Node &N, std::size_t Row,
499145449b1SDimitry Andric auto &VisitNode) -> void {
500145449b1SDimitry Andric std::size_t J = 0;
501145449b1SDimitry Andric for (; J < N.Name.size(); J++) {
502145449b1SDimitry Andric if (!isAlnum(N.Name[J]))
503145449b1SDimitry Andric continue;
504145449b1SDimitry Andric
505145449b1SDimitry Andric Get(0, Row) = Row;
506145449b1SDimitry Andric
507145449b1SDimitry Andric for (std::size_t I = 1; I < Columns; I++) {
508145449b1SDimitry Andric const int Delete = Get(I - 1, Row) + 1;
509145449b1SDimitry Andric const int Insert = Get(I, Row - 1) + 1;
510145449b1SDimitry Andric
511145449b1SDimitry Andric const int Replace =
512145449b1SDimitry Andric Get(I - 1, Row - 1) + (NormalizedName[I - 1] != N.Name[J] ? 1 : 0);
513145449b1SDimitry Andric
514145449b1SDimitry Andric Get(I, Row) = std::min(Insert, std::min(Delete, Replace));
515145449b1SDimitry Andric }
516145449b1SDimitry Andric
517145449b1SDimitry Andric Row++;
518145449b1SDimitry Andric }
519145449b1SDimitry Andric
520145449b1SDimitry Andric unsigned Cost = Get(Columns - 1, Row - 1);
521145449b1SDimitry Andric if (N.Value != 0xFFFFFFFF) {
522145449b1SDimitry Andric Insert(N, Cost, N.Value);
523145449b1SDimitry Andric }
524145449b1SDimitry Andric
525145449b1SDimitry Andric if (N.hasChildren()) {
526145449b1SDimitry Andric auto ChildOffset = N.ChildrenOffset;
527145449b1SDimitry Andric for (;;) {
528145449b1SDimitry Andric Node C = readNode(ChildOffset, &N);
529145449b1SDimitry Andric ChildOffset += C.Size;
530145449b1SDimitry Andric if (!C.isValid())
531145449b1SDimitry Andric break;
532145449b1SDimitry Andric VisitNode(C, Row, VisitNode);
533145449b1SDimitry Andric if (!C.HasSibling)
534145449b1SDimitry Andric break;
535145449b1SDimitry Andric }
536145449b1SDimitry Andric }
537145449b1SDimitry Andric };
538145449b1SDimitry Andric
539145449b1SDimitry Andric Node Root = createRoot();
540145449b1SDimitry Andric VisitNode(Root, 1, VisitNode);
541145449b1SDimitry Andric return Matches;
542145449b1SDimitry Andric }
543145449b1SDimitry Andric
544145449b1SDimitry Andric } // namespace unicode
545145449b1SDimitry Andric
546145449b1SDimitry Andric } // namespace sys
547145449b1SDimitry Andric } // namespace llvm
548