#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for regular expressions
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows
    concatenating regular expressions with the '+' operator and
    handles substitutions via sub().

    Regular expressions can be cached via an argument, helping to
    speed up searches.
    """

    def _add_regex(self, string, flags):
        """
        Compiles a new regex, or re-uses it from the cache.
        """

        # Note: the cache is keyed by the pattern string alone, so the
        # flags of the first compilation win.
        if string in re_cache:
            self.regex = re_cache[string]
        else:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[string] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allows concatenating two regular expressions into one.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match, storing its result.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search, storing its result.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the given group of the last match.
        """

        return self.last_match.group(num)
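
# A minimal usage sketch for KernRe. The patterns and strings below are
# hypothetical examples for illustration only; they are not part of the
# kernel-doc parser itself:
#
#     func = KernRe(r'(\w+)') + KernRe(r'\s*\(')   # compiles r'(\w+)\s*\('
#     if func.search('static int foo(void)'):
#         name = func.group(1)                     # -> 'foo'
#
#     # sub() forwards to the compiled pattern, like re.sub():
#     KernRe(r'\s+').sub(' ', 'a   b')             # -> 'a b'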


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regular expression features are missing.

    This is the case for this pattern:

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which would be needed to properly match the open/close parentheses
    when searching for STRUCT_GROUP().

    This class instead counts pairs of delimiters, using them to match
    and replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match
    three types of delimiters. The logic checks whether delimiters are
    paired. If they are not, the search string is ignored.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    # the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside the parentheses is converted into a single
    # replace group (e.g. r'\1').
    #
    # It would be nice to change such a definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # It is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
            https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation to handle all
        three types of delimiters and to seek for an initial regular
        expression.

        The algorithm seeks for open/close paired delimiters, placing
        them on a stack and yielding the start/stop positions of each
        match once the stack is emptied.

        The algorithm should work fine for properly paired lines, but it
        will silently ignore end delimiters that precede a start
        delimiter. This should be OK for the kernel-doc parser, as
        unpaired delimiters would cause compilation errors. So, we don't
        need to raise exceptions to cover such issues.
        """

        for match_re in regex.finditer(line):
            # Each match starts with a fresh stack, so one unpaired
            # occurrence doesn't poison later matches
            stack = []

            start = match_re.start()
            offset = match_re.end()

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):
            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If r'\1' is used in sub, it works just like re: the matched
        paired data, with the delimiters stripped, is placed there.

        If count is nonzero, at most count occurrences are replaced.
        """

        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replace \1 in the substitution string, if present
            new_sub = sub.replace(r'\1', value)

            out += new_sub

            # Drop the end ';', if any
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Honor the count limit, replacing at most count occurrences
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
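

# A minimal, runnable sketch of how NestedMatch can be used. The input
# line below is a hypothetical example for illustration only; it is not
# part of the kernel-doc parser itself.
if __name__ == "__main__":
    nested = NestedMatch()

    line = "struct foo { STRUCT_GROUP(int a; int b;); int c; };"
    regex = re.compile(r"\bSTRUCT_GROUP\(")

    # search() yields each full match, including the paired parentheses
    for found in nested.search(regex, line):
        print(found)                # STRUCT_GROUP(int a; int b;)

    # sub() replaces each match with its contents, stripping the
    # delimiters and any trailing ';' after the closing one
    print(nested.sub(regex, r"\1", line))
    # struct foo { int a; int b; int c; };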