1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3 * KUnit tests for utf-8 support.
4 *
5 * Copyright 2017 Collabora Ltd.
6 */
7
8 #include <linux/unicode.h>
9 #include <kunit/test.h>
10
11 #include "../utf8n.h"
12
/*
 * Test vectors for the UTF8_NFDI checks: normalizing @str is expected to
 * yield exactly the bytes in @dec.
 */
static const struct {
	/* Every UTF-8 string in this table _must_ be NUL-terminated. */
	unsigned char str[10];
	unsigned char dec[10];
} nfdi_test_data[] = {
	/* Trivial sequence */
	{
		/* "aBba" decomposes to itself */
		.str = {0x61, 0x42, 0x62, 0x61, 0x00},
		.dec = {0x61, 0x42, 0x62, 0x61, 0x00},
	},
	/* Simple equivalent sequences */
	{
		/*
		 * 'VULGAR FRACTION ONE QUARTER' cannot decompose to
		 * 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
		 * canonical decomposition
		 */
		.str = {0xc2, 0xbc, 0x00},
		.dec = {0xc2, 0xbc, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
		 * 'LETTER A' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0x00},
		.dec = {0x61, 0xcc, 0x88, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER LJ' can't decompose to
		 * 'LETTER L' + 'LETTER J' on canonical decomposition
		 */
		.str = {0xc7, 0x89, 0x00},
		.dec = {0xc7, 0x89, 0x00},
	},
	{
		/* GREEK ANO TELEIA decomposes to MIDDLE DOT */
		.str = {0xce, 0x87, 0x00},
		.dec = {0xc2, 0xb7, 0x00},
	},
	/* Canonical ordering */
	{
		/*
		 * A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
		 * to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT'
		 */
		.str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x00},
		.dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x00},
	},
	{
		/*
		 * 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
		 * decomposes to
		 * 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS'
		 */
		.str = {0xc3, 0xa4, 0xcc, 0xa8, 0x00},
		.dec = {0x61, 0xcc, 0xa8, 0xcc, 0x88, 0x00},
	},
};
66
/*
 * Test vectors for the UTF8_NFDICF checks: normalizing and casefolding @str
 * is expected to yield exactly the bytes in @ncf.
 */
static const struct {
	/* Every UTF-8 string in this table _must_ be NUL-terminated. */
	unsigned char str[30];
	unsigned char ncf[30];
} nfdicf_test_data[] = {
	/* Trivial sequences */
	{
		/* "ABba" folds to lowercase */
		.str = "ABba",
		.ncf = "abba",
	},
	{
		/* All ASCII folds to lower-case */
		.str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
		.ncf = "abcdefghijklmnopqrstuvwxyz0.1",
	},
	{
		/*
		 * LATIN SMALL LETTER SHARP S folds to
		 * LATIN SMALL LETTER S + LATIN SMALL LETTER S
		 */
		.str = {0xc3, 0x9f, 0x00},
		.ncf = {0x73, 0x73, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER A WITH RING ABOVE folds to
		 * LATIN SMALL LETTER A + COMBINING RING ABOVE
		 */
		.str = {0xc3, 0x85, 0x00},
		.ncf = {0x61, 0xcc, 0x8a, 0x00},
	},
	/* Introduced by Unicode 8.0.0. */
	/*
	 * Cherokee letters are interesting test-cases because they fold
	 * to upper-case.  Before 8.0.0, Cherokee lowercase were
	 * undefined, thus, the folding from LC is not stable between
	 * 7.0.0 -> 8.0.0, but it is from UC.
	 */
	{
		/* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
		.str = {0xea, 0xad, 0xb0, 0x00},
		.ncf = {0xe1, 0x8e, 0xa0, 0x00},
	},
	{
		/* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
		.str = {0xe1, 0x8f, 0xb8, 0x00},
		.ncf = {0xe1, 0x8f, 0xb0, 0x00},
	},
	{
		/*
		 * OLD HUNGARIAN CAPITAL LETTER AMB folds to
		 * OLD HUNGARIAN SMALL LETTER AMB
		 */
		.str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
		.ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
	},
	/* Introduced by Unicode 9.0.0. */
	{
		/*
		 * OSAGE CAPITAL LETTER CHA folds to
		 * OSAGE SMALL LETTER CHA
		 */
		.str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
		.ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
	},
	{
		/*
		 * LATIN CAPITAL LETTER SMALL CAPITAL I folds to
		 * LATIN LETTER SMALL CAPITAL I
		 */
		.str = {0xea, 0x9e, 0xae, 0x00},
		.ncf = {0xc9, 0xaa, 0x00},
	},
	/* Introduced by Unicode 11.0.0. */
	{
		/*
		 * GEORGIAN MTAVRULI CAPITAL LETTER AN (U+1C90) folds to
		 * GEORGIAN LETTER AN (U+10D0)
		 */
		.str = {0xe1, 0xb2, 0x90, 0x00},
		.ncf = {0xe1, 0x83, 0x90, 0x00},
	},
};
137
/*
 * Convenience wrapper around utf8nlen(): forwards @um, @n and @s unchanged
 * and passes (size_t)-1 as the length bound, so the caller does not have to
 * supply one.  Returns whatever utf8nlen() returns.
 */
static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
		       const char *s)
{
	const size_t no_bound = (size_t)-1;

	return utf8nlen(um, n, s, no_bound);
}
143
/*
 * Convenience wrapper around utf8ncursor(): initialises @u8c for @s with
 * (unsigned int)-1 as the length bound, so the caller does not have to
 * supply one.  Returns whatever utf8ncursor() returns.
 */
static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
		      enum utf8_normalization n, const char *s)
{
	const unsigned int no_bound = (unsigned int)-1;

	return utf8ncursor(u8c, um, n, s, no_bound);
}
149
check_utf8_nfdi(struct kunit * test)150 static void check_utf8_nfdi(struct kunit *test)
151 {
152 int i;
153 struct utf8cursor u8c;
154 struct unicode_map *um = test->priv;
155
156 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
157 int len = strlen(nfdi_test_data[i].str);
158 int nlen = strlen(nfdi_test_data[i].dec);
159 int j = 0;
160 unsigned char c;
161 int ret;
162
163 KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
164 KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
165 nlen);
166
167
168 ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str);
169 KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
170
171 while ((c = utf8byte(&u8c)) > 0) {
172 KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
173 "Unexpected byte 0x%x should be 0x%x\n",
174 c, nfdi_test_data[i].dec[j]);
175 j++;
176 }
177
178 KUNIT_EXPECT_EQ(test, j, nlen);
179 }
180 }
181
check_utf8_nfdicf(struct kunit * test)182 static void check_utf8_nfdicf(struct kunit *test)
183 {
184 int i;
185 struct utf8cursor u8c;
186 struct unicode_map *um = test->priv;
187
188 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
189 int len = strlen(nfdicf_test_data[i].str);
190 int nlen = strlen(nfdicf_test_data[i].ncf);
191 int j = 0;
192 int ret;
193 unsigned char c;
194
195 KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
196 nlen);
197 KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
198 nlen);
199
200 ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str);
201 KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
202
203 while ((c = utf8byte(&u8c)) > 0) {
204 KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
205 "Unexpected byte 0x%x should be 0x%x\n",
206 c, nfdicf_test_data[i].ncf[j]);
207 j++;
208 }
209
210 KUNIT_EXPECT_EQ(test, j, nlen);
211 }
212 }
213
check_utf8_comparisons(struct kunit * test)214 static void check_utf8_comparisons(struct kunit *test)
215 {
216 int i;
217 struct unicode_map *um = test->priv;
218
219 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
220 const struct qstr s1 = {.name = nfdi_test_data[i].str,
221 .len = sizeof(nfdi_test_data[i].str)};
222 const struct qstr s2 = {.name = nfdi_test_data[i].dec,
223 .len = sizeof(nfdi_test_data[i].dec)};
224
225 /* strncmp returns 0 when strings are equal */
226 KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0,
227 "%s %s comparison mismatch\n", s1.name, s2.name);
228 }
229
230 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
231 const struct qstr s1 = {.name = nfdicf_test_data[i].str,
232 .len = sizeof(nfdicf_test_data[i].str)};
233 const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
234 .len = sizeof(nfdicf_test_data[i].ncf)};
235
236 /* strncasecmp returns 0 when strings are equal */
237 KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0,
238 "%s %s comparison mismatch\n", s1.name, s2.name);
239 }
240 }
241
check_supported_versions(struct kunit * test)242 static void check_supported_versions(struct kunit *test)
243 {
244 struct unicode_map *um = test->priv;
245 /* Unicode 7.0.0 should be supported. */
246 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
247
248 /* Unicode 9.0.0 should be supported. */
249 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
250
251 /* Unicode 1x.0.0 (the latest version) should be supported. */
252 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));
253
254 /* Next versions don't exist. */
255 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
256 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
257 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
258 }
259
260 static struct kunit_case unicode_normalization_test_cases[] = {
261 KUNIT_CASE(check_supported_versions),
262 KUNIT_CASE(check_utf8_comparisons),
263 KUNIT_CASE(check_utf8_nfdicf),
264 KUNIT_CASE(check_utf8_nfdi),
265 {}
266 };
267
init_test_ucd(struct kunit * test)268 static int init_test_ucd(struct kunit *test)
269 {
270 struct unicode_map *um = utf8_load(UTF8_LATEST);
271
272 test->priv = um;
273
274 KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0,
275 "%s: Unable to load utf8 table.\n", __func__);
276
277 return 0;
278 }
279
exit_test_ucd(struct kunit * test)280 static void exit_test_ucd(struct kunit *test)
281 {
282 utf8_unload(test->priv);
283 }
284
285 static struct kunit_suite unicode_normalization_test_suite = {
286 .name = "unicode_normalization",
287 .test_cases = unicode_normalization_test_cases,
288 .init = init_test_ucd,
289 .exit = exit_test_ucd,
290 };
291
292 kunit_test_suite(unicode_normalization_test_suite);
293
294
295 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
296 MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
297 MODULE_LICENSE("GPL");
298