#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.
"""

import re

# Local cache for regular expressions, keyed by (pattern, flags) so that
# the same pattern compiled with different flags never shares an entry.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache is always consulted; a newly compiled regex is stored
        there only when self.cache is true.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        string is the pattern, cache tells whether the compiled regex
        should be stored at the module-level cache, and flags are the
        re flags handed to re.compile.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Return a re.compile()-like representation of the pattern.
        """
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated, the flags of both operands are
        or-ed and the result is cached if either operand is cached.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It uses the result stored by the last match() or search() call.
        If that call didn't match (or none was done yet), last_match is
        None and an AttributeError is raised.
        """

        return self.last_match.group(num)


class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported start delimiter to its end delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single start or end delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.

        regex must provide finditer(), like a pattern returned by
        re.compile().

        Each yielded item is a (start, offset, end) tuple, where start is
        the position where the regex match begins, offset is the position
        just after the start delimiter and end is the position just after
        the paired end delimiter.
        """

        # The stack is shared by all regex matches on this line: a match
        # whose delimiters are not paired leaves its pending end delimiters
        # here, so later matches on the same line are not reported.
        stack = []

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end with a start delimiter
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            end = self.DELIMITER_PAIRS[d]
            stack.append(end)

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    end = self.DELIMITER_PAIRS[d]

                    stack.append(end)
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        This is a generator: it yields the text of each occurrence, from
        the start of the regex match up to (and including) the paired end
        delimiter.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.

        A ';' placed right after the end delimiter is dropped together
        with the replaced text. Occurrences that start inside a text
        which was already replaced are skipped, as re.sub does for
        overlapping matches.

        Returns the resulting string.
        """
        parts = []

        cur_pos = 0
        n = 0
        line_len = len(line)

        for start, end, pos in self._search(regex, line):
            # _search() also reports occurrences nested inside a previous
            # one. Their text was already consumed, so skip them instead
            # of emitting them a second time.
            if start < cur_pos:
                continue

            parts.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            parts.append(sub.replace(r'\1', value))

            # Drop end ';' if any. The end delimiter may be the last
            # character of the line, so check the bounds first.
            if pos < line_len and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Stop once the requested amount of replacements was done
            if count and n >= count:
                break

        # Append the remaining string
        parts.append(line[cur_pos:])

        return "".join(parts)