1 // SPDX-License-Identifier: GPL-2.0-only 2 /* 3 * KUnit tests for utf-8 support. 4 * 5 * Copyright 2017 Collabora Ltd. 6 */ 7 8 #include <linux/unicode.h> 9 #include <kunit/test.h> 10 11 #include "../utf8n.h" 12 13 static const struct { 14 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 15 unsigned char str[10]; 16 unsigned char dec[10]; 17 } nfdi_test_data[] = { 18 /* Trivial sequence */ 19 { 20 /* "ABba" decomposes to itself */ 21 .str = "aBba", 22 .dec = "aBba", 23 }, 24 /* Simple equivalent sequences */ 25 { 26 /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to 27 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on 28 canonical decomposition */ 29 .str = {0xc2, 0xbc, 0x00}, 30 .dec = {0xc2, 0xbc, 0x00}, 31 }, 32 { 33 /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to 34 'LETTER A' + 'COMBINING DIAERESIS' */ 35 .str = {0xc3, 0xa4, 0x00}, 36 .dec = {0x61, 0xcc, 0x88, 0x00}, 37 }, 38 { 39 /* 'LATIN SMALL LETTER LJ' can't decompose to 40 'LETTER L' + 'LETTER J' on canonical decomposition */ 41 .str = {0xC7, 0x89, 0x00}, 42 .dec = {0xC7, 0x89, 0x00}, 43 }, 44 { 45 /* GREEK ANO TELEIA decomposes to MIDDLE DOT */ 46 .str = {0xCE, 0x87, 0x00}, 47 .dec = {0xC2, 0xB7, 0x00} 48 }, 49 /* Canonical ordering */ 50 { 51 /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes 52 to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */ 53 .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0}, 54 .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0}, 55 }, 56 { 57 /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK' 58 decomposes to 59 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */ 60 .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00}, 61 62 .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00}, 63 }, 64 65 }; 66 67 static const struct { 68 /* UTF-8 strings in this vector _must_ be NULL-terminated. */ 69 unsigned char str[30]; 70 unsigned char ncf[30]; 71 } nfdicf_test_data[] = { 72 /* Trivial sequences */ 73 { 74 /* "ABba" folds to lowercase */ 75 .str = {0x41, 0x42, 0x62, 0x61, 0x00}, 76 .ncf = {0x61, 0x62, 0x62, 0x61, 0x00}, 77 }, 78 { 79 /* All ASCII folds to lower-case */ 80 .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1", 81 .ncf = "abcdefghijklmnopqrstuvwxyz0.1", 82 }, 83 { 84 /* LATIN SMALL LETTER SHARP S folds to 85 LATIN SMALL LETTER S + LATIN SMALL LETTER S */ 86 .str = {0xc3, 0x9f, 0x00}, 87 .ncf = {0x73, 0x73, 0x00}, 88 }, 89 { 90 /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to 91 LATIN SMALL LETTER A + COMBINING RING ABOVE */ 92 .str = {0xC3, 0x85, 0x00}, 93 .ncf = {0x61, 0xcc, 0x8a, 0x00}, 94 }, 95 /* Introduced by UTF-8.0.0. */ 96 /* Cherokee letters are interesting test-cases because they fold 97 to upper-case. Before 8.0.0, Cherokee lowercase were 98 undefined, thus, the folding from LC is not stable between 99 7.0.0 -> 8.0.0, but it is from UC. */ 100 { 101 /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */ 102 .str = {0xea, 0xad, 0xb0, 0x00}, 103 .ncf = {0xe1, 0x8e, 0xa0, 0x00}, 104 }, 105 { 106 /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */ 107 .str = {0xe1, 0x8f, 0xb8, 0x00}, 108 .ncf = {0xe1, 0x8f, 0xb0, 0x00}, 109 }, 110 { 111 /* OLD HUNGARIAN CAPITAL LETTER AMB folds to 112 OLD HUNGARIAN SMALL LETTER AMB */ 113 .str = {0xf0, 0x90, 0xb2, 0x83, 0x00}, 114 .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00}, 115 }, 116 /* Introduced by UTF-9.0.0. */ 117 { 118 /* OSAGE CAPITAL LETTER CHA folds to 119 OSAGE SMALL LETTER CHA */ 120 .str = {0xf0, 0x90, 0x92, 0xb5, 0x00}, 121 .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00}, 122 }, 123 { 124 /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to 125 LATIN LETTER SMALL CAPITAL I */ 126 .str = {0xea, 0x9e, 0xae, 0x00}, 127 .ncf = {0xc9, 0xaa, 0x00}, 128 }, 129 /* Introduced by UTF-11.0.0. */ 130 { 131 /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI 132 CAPITAL LETTER AN */ 133 .str = {0xe1, 0xb2, 0x90, 0x00}, 134 .ncf = {0xe1, 0x83, 0x90, 0x00}, 135 } 136 }; 137 138 static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n, 139 const char *s) 140 { 141 return utf8nlen(um, n, s, (size_t)-1); 142 } 143 144 static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, 145 enum utf8_normalization n, const char *s) 146 { 147 return utf8ncursor(u8c, um, n, s, (unsigned int)-1); 148 } 149 150 static void check_utf8_nfdi(struct kunit *test) 151 { 152 int i; 153 struct utf8cursor u8c; 154 struct unicode_map *um = test->priv; 155 156 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 157 int len = strlen(nfdi_test_data[i].str); 158 int nlen = strlen(nfdi_test_data[i].dec); 159 int j = 0; 160 unsigned char c; 161 int ret; 162 163 KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); 164 KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), 165 nlen); 166 167 168 ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); 169 KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); 170 171 while ((c = utf8byte(&u8c)) > 0) { 172 KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j], 173 "Unexpected byte 0x%x should be 0x%x\n", 174 c, nfdi_test_data[i].dec[j]); 175 j++; 176 } 177 178 KUNIT_EXPECT_EQ(test, j, nlen); 179 } 180 } 181 182 static void check_utf8_nfdicf(struct kunit *test) 183 { 184 int i; 185 struct utf8cursor u8c; 186 struct unicode_map *um = test->priv; 187 188 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 189 int len = strlen(nfdicf_test_data[i].str); 190 int nlen = strlen(nfdicf_test_data[i].ncf); 191 int j = 0; 192 int ret; 193 unsigned char c; 194 195 KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), 196 nlen); 197 KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), 198 nlen); 199 200 ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); 201 KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); 202 203 while ((c = utf8byte(&u8c)) > 0) { 204 KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], 205 "Unexpected byte 0x%x should be 0x%x\n", 206 c, nfdicf_test_data[i].ncf[j]); 207 j++; 208 } 209 210 KUNIT_EXPECT_EQ(test, j, nlen); 211 } 212 } 213 214 static void check_utf8_comparisons(struct kunit *test) 215 { 216 int i; 217 struct unicode_map *um = test->priv; 218 219 for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { 220 const struct qstr s1 = {.name = nfdi_test_data[i].str, 221 .len = sizeof(nfdi_test_data[i].str)}; 222 const struct qstr s2 = {.name = nfdi_test_data[i].dec, 223 .len = sizeof(nfdi_test_data[i].dec)}; 224 225 /* strncmp returns 0 when strings are equal */ 226 KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, 227 "%s %s comparison mismatch\n", s1.name, s2.name); 228 } 229 230 for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { 231 const struct qstr s1 = {.name = nfdicf_test_data[i].str, 232 .len = sizeof(nfdicf_test_data[i].str)}; 233 const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, 234 .len = sizeof(nfdicf_test_data[i].ncf)}; 235 236 /* strncasecmp returns 0 when strings are equal */ 237 KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, 238 "%s %s comparison mismatch\n", s1.name, s2.name); 239 } 240 } 241 242 static void check_supported_versions(struct kunit *test) 243 { 244 struct unicode_map *um = test->priv; 245 /* Unicode 7.0.0 should be supported. */ 246 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); 247 248 /* Unicode 9.0.0 should be supported. */ 249 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); 250 251 /* Unicode 1x.0.0 (the latest version) should be supported. */ 252 KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); 253 254 /* Next versions don't exist. */ 255 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); 256 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); 257 KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); 258 } 259 260 static struct kunit_case unicode_normalization_test_cases[] = { 261 KUNIT_CASE(check_supported_versions), 262 KUNIT_CASE(check_utf8_comparisons), 263 KUNIT_CASE(check_utf8_nfdicf), 264 KUNIT_CASE(check_utf8_nfdi), 265 {} 266 }; 267 268 static int init_test_ucd(struct kunit *test) 269 { 270 struct unicode_map *um = utf8_load(UTF8_LATEST); 271 272 test->priv = um; 273 274 KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, 275 "%s: Unable to load utf8 table.\n", __func__); 276 277 return 0; 278 } 279 280 static void exit_test_ucd(struct kunit *test) 281 { 282 utf8_unload(test->priv); 283 } 284 285 static struct kunit_suite unicode_normalization_test_suite = { 286 .name = "unicode_normalization", 287 .test_cases = unicode_normalization_test_cases, 288 .init = init_test_ucd, 289 .exit = exit_test_ucd, 290 }; 291 292 kunit_test_suite(unicode_normalization_test_suite); 293 294 295 MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); 296 MODULE_DESCRIPTION("KUnit tests for utf-8 support."); 297 MODULE_LICENSE("GPL"); 298