xref: /linux/fs/hfsplus/unicode.c (revision 4d9981429aa61c31e67371ac09e7dbba6b59de14)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  *  linux/fs/hfsplus/unicode.c
4  *
5  * Copyright (C) 2001
6  * Brad Boyer (flar@allandria.com)
7  * (C) 2003 Ardis Technologies <roman@ardistech.com>
8  *
9  * Handler routines for unicode strings
10  */
11 
12 #include <linux/types.h>
13 #include <linux/nls.h>
14 
15 #include <kunit/visibility.h>
16 
17 #include "hfsplus_fs.h"
18 #include "hfsplus_raw.h"
19 
20 /* Fold the case of a unicode char, given the 16 bit value */
21 /* Returns folded char, or 0 if ignorable */
case_fold(u16 c)22 static inline u16 case_fold(u16 c)
23 {
24 	u16 tmp;
25 
26 	tmp = hfsplus_case_fold_table[c >> 8];
27 	if (tmp)
28 		tmp = hfsplus_case_fold_table[tmp + (c & 0xff)];
29 	else
30 		tmp = c;
31 	return tmp;
32 }
33 
34 /* Compare unicode strings, return values like normal strcmp */
hfsplus_strcasecmp(const struct hfsplus_unistr * s1,const struct hfsplus_unistr * s2)35 int hfsplus_strcasecmp(const struct hfsplus_unistr *s1,
36 		       const struct hfsplus_unistr *s2)
37 {
38 	u16 len1, len2, c1, c2;
39 	const hfsplus_unichr *p1, *p2;
40 
41 	len1 = be16_to_cpu(s1->length);
42 	len2 = be16_to_cpu(s2->length);
43 	p1 = s1->unicode;
44 	p2 = s2->unicode;
45 
46 	if (len1 > HFSPLUS_MAX_STRLEN) {
47 		len1 = HFSPLUS_MAX_STRLEN;
48 		pr_err("invalid length %u has been corrected to %d\n",
49 			be16_to_cpu(s1->length), len1);
50 	}
51 
52 	if (len2 > HFSPLUS_MAX_STRLEN) {
53 		len2 = HFSPLUS_MAX_STRLEN;
54 		pr_err("invalid length %u has been corrected to %d\n",
55 			be16_to_cpu(s2->length), len2);
56 	}
57 
58 	while (1) {
59 		c1 = c2 = 0;
60 
61 		while (len1 && !c1) {
62 			c1 = case_fold(be16_to_cpu(*p1));
63 			p1++;
64 			len1--;
65 		}
66 		while (len2 && !c2) {
67 			c2 = case_fold(be16_to_cpu(*p2));
68 			p2++;
69 			len2--;
70 		}
71 
72 		if (c1 != c2)
73 			return (c1 < c2) ? -1 : 1;
74 		if (!c1 && !c2)
75 			return 0;
76 	}
77 }
78 EXPORT_SYMBOL_IF_KUNIT(hfsplus_strcasecmp);
79 
80 /* Compare names as a sequence of 16-bit unsigned integers */
hfsplus_strcmp(const struct hfsplus_unistr * s1,const struct hfsplus_unistr * s2)81 int hfsplus_strcmp(const struct hfsplus_unistr *s1,
82 		   const struct hfsplus_unistr *s2)
83 {
84 	u16 len1, len2, c1, c2;
85 	const hfsplus_unichr *p1, *p2;
86 	int len;
87 
88 	len1 = be16_to_cpu(s1->length);
89 	len2 = be16_to_cpu(s2->length);
90 	p1 = s1->unicode;
91 	p2 = s2->unicode;
92 
93 	if (len1 > HFSPLUS_MAX_STRLEN) {
94 		len1 = HFSPLUS_MAX_STRLEN;
95 		pr_err("invalid length %u has been corrected to %d\n",
96 			be16_to_cpu(s1->length), len1);
97 	}
98 
99 	if (len2 > HFSPLUS_MAX_STRLEN) {
100 		len2 = HFSPLUS_MAX_STRLEN;
101 		pr_err("invalid length %u has been corrected to %d\n",
102 			be16_to_cpu(s2->length), len2);
103 	}
104 
105 	for (len = min(len1, len2); len > 0; len--) {
106 		c1 = be16_to_cpu(*p1);
107 		c2 = be16_to_cpu(*p2);
108 		if (c1 != c2)
109 			return c1 < c2 ? -1 : 1;
110 		p1++;
111 		p2++;
112 	}
113 
114 	return len1 < len2 ? -1 :
115 	       len1 > len2 ? 1 : 0;
116 }
117 EXPORT_SYMBOL_IF_KUNIT(hfsplus_strcmp);
118 
/*
 * Constants for the arithmetic Hangul syllable composition/decomposition
 * used below: SBase is the first precomposed syllable, LBase/VBase/TBase
 * are the bases of the three jamo ranges a syllable is built from.
 */
#define Hangul_SBase	0xac00
#define Hangul_LBase	0x1100
#define Hangul_VBase	0x1161
#define Hangul_TBase	0x11a7
#define Hangul_LCount	19
#define Hangul_VCount	21
#define Hangul_TCount	28
/* 21 * 28 = 588 */
#define Hangul_NCount	(Hangul_VCount * Hangul_TCount)
/* 19 * 588 = 11172 */
#define Hangul_SCount	(Hangul_LCount * Hangul_NCount)
128 
129 
/*
 * Binary-search one node of the composition table for the code unit @cc.
 *
 * @p points at the node: p[1] is the number of entries, and entry i
 * (counted from 1) keeps its key in p[2 * i] and an offset into
 * hfsplus_compose_table in p[2 * i + 1].
 *
 * Returns a pointer into hfsplus_compose_table for the matching entry, or
 * NULL if the node is empty or does not contain @cc.
 */
static u16 *hfsplus_compose_lookup(u16 *p, u16 cc)
{
	int lo = 1;
	int hi = p[1];
	int mid;

	/* empty node, or @cc outside the range of stored keys */
	if (!hi || cc < p[lo * 2] || cc > p[hi * 2])
		return NULL;

	while (lo <= hi) {
		mid = (lo + hi) / 2;
		if (cc == p[mid * 2])
			return hfsplus_compose_table + p[mid * 2 + 1];
		if (cc > p[mid * 2])
			lo = mid + 1;
		else
			hi = mid - 1;
	}
	return NULL;
}
149 
150 /*
151  * In HFS+, a filename can contain / because : is the separator.
152  * The slash is a valid filename character on macOS.
153  * But on Linux, / is the path separator and
154  * it cannot appear in a filename component.
155  * There's a parallel mapping for the NUL character (0 -> U+2400).
156  * NUL terminates strings in C/POSIX but is valid in HFS+ filenames.
157  */
158 static inline
hfsplus_mac2linux_compatibility_check(u16 symbol,u16 * conversion,int name_type)159 void hfsplus_mac2linux_compatibility_check(u16 symbol, u16 *conversion,
160 					   int name_type)
161 {
162 	*conversion = symbol;
163 
164 	switch (name_type) {
165 	case HFS_XATTR_NAME:
166 		/* ignore conversion */
167 		return;
168 
169 	default:
170 		/* continue logic */
171 		break;
172 	}
173 
174 	switch (symbol) {
175 	case 0:
176 		*conversion = 0x2400;
177 		break;
178 	case '/':
179 		*conversion = ':';
180 		break;
181 	}
182 }
183 
hfsplus_uni2asc(struct super_block * sb,const struct hfsplus_unistr * ustr,int max_len,char * astr,int * len_p,int name_type)184 static int hfsplus_uni2asc(struct super_block *sb,
185 			   const struct hfsplus_unistr *ustr,
186 			   int max_len, char *astr, int *len_p,
187 			   int name_type)
188 {
189 	const hfsplus_unichr *ip;
190 	struct nls_table *nls = HFSPLUS_SB(sb)->nls;
191 	u8 *op;
192 	u16 cc, c0, c1;
193 	u16 *ce1, *ce2;
194 	int i, len, ustrlen, res, compose;
195 
196 	op = astr;
197 	ip = ustr->unicode;
198 
199 	ustrlen = be16_to_cpu(ustr->length);
200 	if (ustrlen > max_len) {
201 		ustrlen = max_len;
202 		pr_err("invalid length %u has been corrected to %d\n",
203 			be16_to_cpu(ustr->length), ustrlen);
204 	}
205 
206 	len = *len_p;
207 	ce1 = NULL;
208 	compose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
209 
210 	while (ustrlen > 0) {
211 		c0 = be16_to_cpu(*ip++);
212 		ustrlen--;
213 		/* search for single decomposed char */
214 		if (likely(compose))
215 			ce1 = hfsplus_compose_lookup(hfsplus_compose_table, c0);
216 		if (ce1)
217 			cc = ce1[0];
218 		else
219 			cc = 0;
220 		if (cc) {
221 			/* start of a possibly decomposed Hangul char */
222 			if (cc != 0xffff)
223 				goto done;
224 			if (!ustrlen)
225 				goto same;
226 			c1 = be16_to_cpu(*ip) - Hangul_VBase;
227 			if (c1 < Hangul_VCount) {
228 				/* compose the Hangul char */
229 				cc = (c0 - Hangul_LBase) * Hangul_VCount;
230 				cc = (cc + c1) * Hangul_TCount;
231 				cc += Hangul_SBase;
232 				ip++;
233 				ustrlen--;
234 				if (!ustrlen)
235 					goto done;
236 				c1 = be16_to_cpu(*ip) - Hangul_TBase;
237 				if (c1 > 0 && c1 < Hangul_TCount) {
238 					cc += c1;
239 					ip++;
240 					ustrlen--;
241 				}
242 				goto done;
243 			}
244 		}
245 		while (1) {
246 			/* main loop for common case of not composed chars */
247 			if (!ustrlen)
248 				goto same;
249 			c1 = be16_to_cpu(*ip);
250 			if (likely(compose))
251 				ce1 = hfsplus_compose_lookup(
252 					hfsplus_compose_table, c1);
253 			if (ce1)
254 				break;
255 			hfsplus_mac2linux_compatibility_check(c0, &c0,
256 							      name_type);
257 			res = nls->uni2char(c0, op, len);
258 			if (res < 0) {
259 				if (res == -ENAMETOOLONG)
260 					goto out;
261 				*op = '?';
262 				res = 1;
263 			}
264 			op += res;
265 			len -= res;
266 			c0 = c1;
267 			ip++;
268 			ustrlen--;
269 		}
270 		ce2 = hfsplus_compose_lookup(ce1, c0);
271 		if (ce2) {
272 			i = 1;
273 			while (i < ustrlen) {
274 				ce1 = hfsplus_compose_lookup(ce2,
275 					be16_to_cpu(ip[i]));
276 				if (!ce1)
277 					break;
278 				i++;
279 				ce2 = ce1;
280 			}
281 			cc = ce2[0];
282 			if (cc) {
283 				ip += i;
284 				ustrlen -= i;
285 				goto done;
286 			}
287 		}
288 same:
289 		hfsplus_mac2linux_compatibility_check(c0, &cc,
290 						      name_type);
291 done:
292 		res = nls->uni2char(cc, op, len);
293 		if (res < 0) {
294 			if (res == -ENAMETOOLONG)
295 				goto out;
296 			*op = '?';
297 			res = 1;
298 		}
299 		op += res;
300 		len -= res;
301 	}
302 	res = 0;
303 out:
304 	*len_p = (char *)op - astr;
305 	return res;
306 }
307 
hfsplus_uni2asc_str(struct super_block * sb,const struct hfsplus_unistr * ustr,char * astr,int * len_p)308 inline int hfsplus_uni2asc_str(struct super_block *sb,
309 			       const struct hfsplus_unistr *ustr, char *astr,
310 			       int *len_p)
311 {
312 	return hfsplus_uni2asc(sb,
313 				ustr, HFSPLUS_MAX_STRLEN,
314 				astr, len_p,
315 				HFS_REGULAR_NAME);
316 }
317 EXPORT_SYMBOL_IF_KUNIT(hfsplus_uni2asc_str);
318 
hfsplus_uni2asc_xattr_str(struct super_block * sb,const struct hfsplus_attr_unistr * ustr,char * astr,int * len_p)319 inline int hfsplus_uni2asc_xattr_str(struct super_block *sb,
320 				     const struct hfsplus_attr_unistr *ustr,
321 				     char *astr, int *len_p)
322 {
323 	return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr,
324 				HFSPLUS_ATTR_MAX_STRLEN, astr, len_p,
325 				HFS_XATTR_NAME);
326 }
327 EXPORT_SYMBOL_IF_KUNIT(hfsplus_uni2asc_xattr_str);
328 
329 /*
330  * In HFS+, a filename can contain / because : is the separator.
331  * The slash is a valid filename character on macOS.
332  * But on Linux, / is the path separator and
333  * it cannot appear in a filename component.
334  * There's a parallel mapping for the NUL character (0 -> U+2400).
335  * NUL terminates strings in C/POSIX but is valid in HFS+ filenames.
336  */
337 static inline
hfsplus_linux2mac_compatibility_check(wchar_t * uc,int name_type)338 void hfsplus_linux2mac_compatibility_check(wchar_t *uc, int name_type)
339 {
340 	switch (name_type) {
341 	case HFS_XATTR_NAME:
342 		/* ignore conversion */
343 		return;
344 
345 	default:
346 		/* continue logic */
347 		break;
348 	}
349 
350 	switch (*uc) {
351 	case 0x2400:
352 		*uc = 0;
353 		break;
354 	case ':':
355 		*uc = '/';
356 		break;
357 	}
358 }
359 
360 /*
361  * Convert one or more ASCII characters into a single unicode character.
362  * Returns the number of ASCII characters corresponding to the unicode char.
363  */
asc2unichar(struct super_block * sb,const char * astr,int len,wchar_t * uc,int name_type)364 static inline int asc2unichar(struct super_block *sb, const char *astr, int len,
365 			      wchar_t *uc, int name_type)
366 {
367 	int size = HFSPLUS_SB(sb)->nls->char2uni(astr, len, uc);
368 
369 	if (size <= 0) {
370 		*uc = '?';
371 		size = 1;
372 	}
373 
374 	hfsplus_linux2mac_compatibility_check(uc, name_type);
375 	return size;
376 }
377 
378 /* Decomposes a non-Hangul unicode character. */
hfsplus_decompose_nonhangul(wchar_t uc,int * size)379 static u16 *hfsplus_decompose_nonhangul(wchar_t uc, int *size)
380 {
381 	int off;
382 
383 	off = hfsplus_decompose_table[(uc >> 12) & 0xf];
384 	if (off == 0 || off == 0xffff)
385 		return NULL;
386 
387 	off = hfsplus_decompose_table[off + ((uc >> 8) & 0xf)];
388 	if (!off)
389 		return NULL;
390 
391 	off = hfsplus_decompose_table[off + ((uc >> 4) & 0xf)];
392 	if (!off)
393 		return NULL;
394 
395 	off = hfsplus_decompose_table[off + (uc & 0xf)];
396 	*size = off & 3;
397 	if (*size == 0)
398 		return NULL;
399 	return hfsplus_decompose_table + (off / 4);
400 }
401 
402 /*
403  * Try to decompose a unicode character as Hangul. Return 0 if @uc is not
404  * precomposed Hangul, otherwise return the length of the decomposition.
405  *
406  * This function was adapted from sample code from the Unicode Standard
407  * Annex #15: Unicode Normalization Forms, version 3.2.0.
408  *
409  * Copyright (C) 1991-2018 Unicode, Inc.  All rights reserved.  Distributed
410  * under the Terms of Use in http://www.unicode.org/copyright.html.
411  */
hfsplus_try_decompose_hangul(wchar_t uc,u16 * result)412 static int hfsplus_try_decompose_hangul(wchar_t uc, u16 *result)
413 {
414 	int index;
415 	int l, v, t;
416 
417 	index = uc - Hangul_SBase;
418 	if (index < 0 || index >= Hangul_SCount)
419 		return 0;
420 
421 	l = Hangul_LBase + index / Hangul_NCount;
422 	v = Hangul_VBase + (index % Hangul_NCount) / Hangul_TCount;
423 	t = Hangul_TBase + index % Hangul_TCount;
424 
425 	result[0] = l;
426 	result[1] = v;
427 	if (t != Hangul_TBase) {
428 		result[2] = t;
429 		return 3;
430 	}
431 	return 2;
432 }
433 
434 /* Decomposes a single unicode character. */
decompose_unichar(wchar_t uc,int * size,u16 * hangul_buffer)435 static u16 *decompose_unichar(wchar_t uc, int *size, u16 *hangul_buffer)
436 {
437 	u16 *result;
438 
439 	/* Hangul is handled separately */
440 	result = hangul_buffer;
441 	*size = hfsplus_try_decompose_hangul(uc, result);
442 	if (*size == 0)
443 		result = hfsplus_decompose_nonhangul(uc, size);
444 	return result;
445 }
446 
hfsplus_asc2uni(struct super_block * sb,struct hfsplus_unistr * ustr,int max_unistr_len,const char * astr,int len,int name_type)447 int hfsplus_asc2uni(struct super_block *sb,
448 		    struct hfsplus_unistr *ustr, int max_unistr_len,
449 		    const char *astr, int len, int name_type)
450 {
451 	int size, dsize, decompose;
452 	u16 *dstr, outlen = 0;
453 	wchar_t c;
454 	u16 dhangul[3];
455 
456 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
457 	while (outlen < max_unistr_len && len > 0) {
458 		size = asc2unichar(sb, astr, len, &c, name_type);
459 
460 		if (decompose)
461 			dstr = decompose_unichar(c, &dsize, dhangul);
462 		else
463 			dstr = NULL;
464 		if (dstr) {
465 			if (outlen + dsize > max_unistr_len)
466 				break;
467 			do {
468 				ustr->unicode[outlen++] = cpu_to_be16(*dstr++);
469 			} while (--dsize > 0);
470 		} else
471 			ustr->unicode[outlen++] = cpu_to_be16(c);
472 
473 		astr += size;
474 		len -= size;
475 	}
476 	ustr->length = cpu_to_be16(outlen);
477 	if (len > 0)
478 		return -ENAMETOOLONG;
479 	return 0;
480 }
481 EXPORT_SYMBOL_IF_KUNIT(hfsplus_asc2uni);
482 
483 /*
484  * Hash a string to an integer as appropriate for the HFS+ filesystem.
485  * Composed unicode characters are decomposed and case-folding is performed
486  * if the appropriate bits are (un)set on the superblock.
487  */
hfsplus_hash_dentry(const struct dentry * dentry,struct qstr * str)488 int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str)
489 {
490 	struct super_block *sb = dentry->d_sb;
491 	const char *astr;
492 	const u16 *dstr;
493 	int casefold, decompose, size, len;
494 	unsigned long hash;
495 	wchar_t c;
496 	u16 c2;
497 	u16 dhangul[3];
498 
499 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
500 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
501 	hash = init_name_hash(dentry);
502 	astr = str->name;
503 	len = str->len;
504 	while (len > 0) {
505 		int dsize;
506 		size = asc2unichar(sb, astr, len, &c, HFS_REGULAR_NAME);
507 		astr += size;
508 		len -= size;
509 
510 		if (decompose)
511 			dstr = decompose_unichar(c, &dsize, dhangul);
512 		else
513 			dstr = NULL;
514 		if (dstr) {
515 			do {
516 				c2 = *dstr++;
517 				if (casefold)
518 					c2 = case_fold(c2);
519 				if (!casefold || c2)
520 					hash = partial_name_hash(c2, hash);
521 			} while (--dsize > 0);
522 		} else {
523 			c2 = c;
524 			if (casefold)
525 				c2 = case_fold(c2);
526 			if (!casefold || c2)
527 				hash = partial_name_hash(c2, hash);
528 		}
529 	}
530 	str->hash = end_name_hash(hash);
531 
532 	return 0;
533 }
534 EXPORT_SYMBOL_IF_KUNIT(hfsplus_hash_dentry);
535 
536 /*
537  * Compare strings with HFS+ filename ordering.
538  * Composed unicode characters are decomposed and case-folding is performed
539  * if the appropriate bits are (un)set on the superblock.
540  */
hfsplus_compare_dentry(const struct dentry * dentry,unsigned int len,const char * str,const struct qstr * name)541 int hfsplus_compare_dentry(const struct dentry *dentry,
542 		unsigned int len, const char *str, const struct qstr *name)
543 {
544 	struct super_block *sb = dentry->d_sb;
545 	int casefold, decompose, size;
546 	int dsize1, dsize2, len1, len2;
547 	const u16 *dstr1, *dstr2;
548 	const char *astr1, *astr2;
549 	u16 c1, c2;
550 	wchar_t c;
551 	u16 dhangul_1[3], dhangul_2[3];
552 
553 	casefold = test_bit(HFSPLUS_SB_CASEFOLD, &HFSPLUS_SB(sb)->flags);
554 	decompose = !test_bit(HFSPLUS_SB_NODECOMPOSE, &HFSPLUS_SB(sb)->flags);
555 	astr1 = str;
556 	len1 = len;
557 	astr2 = name->name;
558 	len2 = name->len;
559 	dsize1 = dsize2 = 0;
560 	dstr1 = dstr2 = NULL;
561 
562 	while (len1 > 0 && len2 > 0) {
563 		if (!dsize1) {
564 			size = asc2unichar(sb, astr1, len1, &c,
565 					   HFS_REGULAR_NAME);
566 			astr1 += size;
567 			len1 -= size;
568 
569 			if (decompose)
570 				dstr1 = decompose_unichar(c, &dsize1,
571 							  dhangul_1);
572 			if (!decompose || !dstr1) {
573 				c1 = c;
574 				dstr1 = &c1;
575 				dsize1 = 1;
576 			}
577 		}
578 
579 		if (!dsize2) {
580 			size = asc2unichar(sb, astr2, len2, &c,
581 					   HFS_REGULAR_NAME);
582 			astr2 += size;
583 			len2 -= size;
584 
585 			if (decompose)
586 				dstr2 = decompose_unichar(c, &dsize2,
587 							  dhangul_2);
588 			if (!decompose || !dstr2) {
589 				c2 = c;
590 				dstr2 = &c2;
591 				dsize2 = 1;
592 			}
593 		}
594 
595 		c1 = *dstr1;
596 		c2 = *dstr2;
597 		if (casefold) {
598 			c1 = case_fold(c1);
599 			if (!c1) {
600 				dstr1++;
601 				dsize1--;
602 				continue;
603 			}
604 			c2 = case_fold(c2);
605 			if (!c2) {
606 				dstr2++;
607 				dsize2--;
608 				continue;
609 			}
610 		}
611 		if (c1 < c2)
612 			return -1;
613 		else if (c1 > c2)
614 			return 1;
615 
616 		dstr1++;
617 		dsize1--;
618 		dstr2++;
619 		dsize2--;
620 	}
621 
622 	if (len1 < len2)
623 		return -1;
624 	if (len1 > len2)
625 		return 1;
626 	return 0;
627 }
628 EXPORT_SYMBOL_IF_KUNIT(hfsplus_compare_dentry);
629