1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * ucs.c - Universal Character Set processing 4 */ 5 6 #include <linux/array_size.h> 7 #include <linux/bsearch.h> 8 #include <linux/consolemap.h> 9 #include <linux/minmax.h> 10 11 struct ucs_interval16 { 12 u16 first; 13 u16 last; 14 }; 15 16 struct ucs_interval32 { 17 u32 first; 18 u32 last; 19 }; 20 21 #include "ucs_width_table.h" 22 23 static int interval16_cmp(const void *key, const void *element) 24 { 25 u16 cp = *(u16 *)key; 26 const struct ucs_interval16 *entry = element; 27 28 if (cp < entry->first) 29 return -1; 30 if (cp > entry->last) 31 return 1; 32 return 0; 33 } 34 35 static int interval32_cmp(const void *key, const void *element) 36 { 37 u32 cp = *(u32 *)key; 38 const struct ucs_interval32 *entry = element; 39 40 if (cp < entry->first) 41 return -1; 42 if (cp > entry->last) 43 return 1; 44 return 0; 45 } 46 47 static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size) 48 { 49 if (cp < ranges[0].first || cp > ranges[size - 1].last) 50 return false; 51 52 return __inline_bsearch(&cp, ranges, size, sizeof(*ranges), 53 interval16_cmp) != NULL; 54 } 55 56 static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size) 57 { 58 if (cp < ranges[0].first || cp > ranges[size - 1].last) 59 return false; 60 61 return __inline_bsearch(&cp, ranges, size, sizeof(*ranges), 62 interval32_cmp) != NULL; 63 } 64 65 #define UCS_IS_BMP(cp) ((cp) <= 0xffff) 66 67 /** 68 * ucs_is_zero_width() - Determine if a Unicode code point is zero-width. 69 * @cp: Unicode code point (UCS-4) 70 * 71 * Return: true if the character is zero-width, false otherwise 72 */ 73 bool ucs_is_zero_width(u32 cp) 74 { 75 if (UCS_IS_BMP(cp)) 76 return cp_in_range16(cp, ucs_zero_width_bmp_ranges, 77 ARRAY_SIZE(ucs_zero_width_bmp_ranges)); 78 else 79 return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges, 80 ARRAY_SIZE(ucs_zero_width_non_bmp_ranges)); 81 } 82 83 /** 84 * ucs_is_double_width() - Determine if a Unicode code point is double-width. 85 * @cp: Unicode code point (UCS-4) 86 * 87 * Return: true if the character is double-width, false otherwise 88 */ 89 bool ucs_is_double_width(u32 cp) 90 { 91 if (UCS_IS_BMP(cp)) 92 return cp_in_range16(cp, ucs_double_width_bmp_ranges, 93 ARRAY_SIZE(ucs_double_width_bmp_ranges)); 94 else 95 return cp_in_range32(cp, ucs_double_width_non_bmp_ranges, 96 ARRAY_SIZE(ucs_double_width_non_bmp_ranges)); 97 } 98 99 /* 100 * Structure for base with combining mark pairs and resulting recompositions. 101 * Using u16 to save space since all values are within BMP range. 102 */ 103 struct ucs_recomposition { 104 u16 base; /* base character */ 105 u16 mark; /* combining mark */ 106 u16 recomposed; /* corresponding recomposed character */ 107 }; 108 109 #include "ucs_recompose_table.h" 110 111 struct compare_key { 112 u16 base; 113 u16 mark; 114 }; 115 116 static int recomposition_cmp(const void *key, const void *element) 117 { 118 const struct compare_key *search_key = key; 119 const struct ucs_recomposition *entry = element; 120 121 /* Compare base character first */ 122 if (search_key->base < entry->base) 123 return -1; 124 if (search_key->base > entry->base) 125 return 1; 126 127 /* Base characters match, now compare combining character */ 128 if (search_key->mark < entry->mark) 129 return -1; 130 if (search_key->mark > entry->mark) 131 return 1; 132 133 /* Both match */ 134 return 0; 135 } 136 137 /** 138 * ucs_recompose() - Attempt to recompose two Unicode characters into a single character. 139 * @base: Base Unicode code point (UCS-4) 140 * @mark: Combining mark Unicode code point (UCS-4) 141 * 142 * Return: Recomposed Unicode code point, or 0 if no recomposition is possible 143 */ 144 u32 ucs_recompose(u32 base, u32 mark) 145 { 146 /* Check if characters are within the range of our table */ 147 if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE || 148 mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK) 149 return 0; 150 151 struct compare_key key = { base, mark }; 152 struct ucs_recomposition *result = 153 __inline_bsearch(&key, ucs_recomposition_table, 154 ARRAY_SIZE(ucs_recomposition_table), 155 sizeof(*ucs_recomposition_table), 156 recomposition_cmp); 157 158 return result ? result->recomposed : 0; 159 } 160 161 /* 162 * The fallback table structures implement a 2-level lookup. 163 */ 164 165 struct ucs_page_desc { 166 u8 page; /* Page index (high byte of code points) */ 167 u8 count; /* Number of entries in this page */ 168 u16 start; /* Start index in entries array */ 169 }; 170 171 struct ucs_page_entry { 172 u8 offset; /* Offset within page (0-255) */ 173 u8 fallback; /* Fallback character or range start marker */ 174 }; 175 176 #include "ucs_fallback_table.h" 177 178 static int ucs_page_desc_cmp(const void *key, const void *element) 179 { 180 u8 page = *(u8 *)key; 181 const struct ucs_page_desc *entry = element; 182 183 if (page < entry->page) 184 return -1; 185 if (page > entry->page) 186 return 1; 187 return 0; 188 } 189 190 static int ucs_page_entry_cmp(const void *key, const void *element) 191 { 192 u8 offset = *(u8 *)key; 193 const struct ucs_page_entry *entry = element; 194 195 if (offset < entry->offset) 196 return -1; 197 if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) { 198 if (offset > entry[1].offset) 199 return 1; 200 } else { 201 if (offset > entry->offset) 202 return 1; 203 } 204 return 0; 205 } 206 207 /** 208 * ucs_get_fallback() - Get a substitution for the provided Unicode character 209 * @cp: Unicode code point (UCS-4) 210 * 211 * Get a simpler fallback character for the provided Unicode character. 212 * This is used for terminal display when corresponding glyph is unavailable. 213 * The substitution may not be as good as the actual glyph for the original 214 * character but still way more helpful than a squared question mark. 215 * 216 * Return: Fallback Unicode code point, or 0 if none is available 217 */ 218 u32 ucs_get_fallback(u32 cp) 219 { 220 const struct ucs_page_desc *page; 221 const struct ucs_page_entry *entry; 222 u8 page_idx = cp >> 8, offset = cp; 223 224 if (!UCS_IS_BMP(cp)) 225 return 0; 226 227 /* 228 * Full-width to ASCII mapping (covering all printable ASCII 33-126) 229 * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~) 230 * We process them programmatically to reduce the table size. 231 */ 232 if (cp >= 0xFF01 && cp <= 0xFF5E) 233 return cp - 0xFF01 + 33; 234 235 page = __inline_bsearch(&page_idx, ucs_fallback_pages, 236 ARRAY_SIZE(ucs_fallback_pages), 237 sizeof(*ucs_fallback_pages), 238 ucs_page_desc_cmp); 239 if (!page) 240 return 0; 241 242 entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start, 243 page->count, sizeof(*ucs_fallback_entries), 244 ucs_page_entry_cmp); 245 if (!entry) 246 return 0; 247 248 if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) 249 entry++; 250 return entry->fallback; 251 } 252