#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

These help cache regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, e.g. problems in the replace pattern.

Other errors are logged via the log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to set a prefix on all message logs.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})
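
#
# A minimal usage sketch (hypothetical prefix, for illustration only):
#
#     import logging
#     tokenizer_set_log(logging.getLogger("kdoc"), prefix="myfile.c: ")
#
# From this point on, every message emitted via this module's "log"
# instance is prefixed with "myfile.c: ".
#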

class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including newlines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used in sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert an integer value from the CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH

    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
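
#
# A minimal usage sketch (hypothetical values, for illustration only):
#
#     tok = CToken(CToken.NAME, "foo", pos=10, brace_level=1)
#     repr(tok)                    # 'CToken(CToken.NAME, "foo", 10, (0, 0, 1))'
#     CToken.to_name(CToken.NAME)  # "NAME"
#     CToken.from_name("bogus")    # CToken.MISMATCH
#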

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that [\s\S] is different from .*, as it also matches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

RE_COMMENT_START = KernRe(r'/\*\s*')
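
#
# A minimal sketch of what fill_re_scanner() builds (abridged, for
# illustration only):
#
#     (?P<COMMENT>//[^\n]*|/\*[\s\S]*?\*/)|(?P<STRING>"(?:\\.|[^"\\])*")|...
#
# When RE_SCANNER.finditer() matches, match.lastgroup holds the group
# name, which CToken.from_name() maps back to the enum value.
#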

#: Tokenizer regex, built from RE_SCANNER_LIST when the module is loaded.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to a string, it drops comments and handles
    public/private markers, respecting depth.
    """

    # This class is inspired by, and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source`` using the regex built from RE_SCANNER_LIST.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching code via RE_SCANNER.
        """

        #
        # Store the logger to allow parser classes to re-use it
        #
        global log
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly as an iterator,
        # we'll need to run the tokenizer several times inside kernel-doc
        # to handle macro transforms. So, cache the results in a list:
        # re-using it is cheaper than parsing the source every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)
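
    #
    # A minimal usage sketch (hypothetical input, for illustration only):
    #
    #     t = CTokenizer("struct foo { int a; };")
    #     for tok in t.tokens:
    #         print(tok)
    #
    # The cached token list can then be re-used by multiple transforms
    # without re-parsing the source.
    #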

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as
        defined by ``RE_SCANNER_LIST``.

        The iterator yields CToken objects.
        """

        # Handle continuation lines. Note that kdoc_parser already has
        # logic to do that. Still, let's keep it here for completeness, as
        # we might end up re-using this tokenizer outside kernel-doc some
        # day - or we may eventually remove it from there as a future
        # cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' at pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:  # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)
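
    #
    # Illustrative sketch of the level tracking (hypothetical input,
    # for illustration only):
    #
    #     for tok in CTokenizer("f(a, { b })").tokens:
    #         print(tok.value, tok.level)  # level is (bracket, paren, brace)
    #
    # "a" is reported at level (0, 1, 0), while "b" is at (0, 1, 1).
    #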

    def __str__(self):
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indentation
                    #
                    out += "\t" * (len(show_stack) - 1)

                out += str(tok.value)
                continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
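
    #
    # A minimal sketch of the private:/public: handling (hypothetical
    # struct, for illustration only):
    #
    #     src = "struct s { /* private: */ int a; /* public: */ int b; };"
    #     str(CTokenizer(src))   # drops "int a;" but keeps "int b;"
    #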


class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a ``+`` as its last element,
    the logic will be greedy, picking all remaining delimited arguments.

    This is needed to parse struct_group macros which end with
    ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                if self.greedy and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokenizer = CTokenizer(sub_str)
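
    #
    # A minimal sketch of the greedy ``+`` syntax (hypothetical macro,
    # for illustration only):
    #
    #     args = CTokenArgs(r"\1 \2+")
    #
    # When applied to "GROUP(a, b, c)" via groups() below, backref \1
    # matches "a", while the greedy \2+ collects "b, c", including the
    # delimiter.
    #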

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character on the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all
        remaining arguments.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it logs
        an error. Please notice that, in C, square brackets don't have
        any argument separator inside them, so trying to use
        ``\1``..``\n`` with brackets also logs an error.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard the first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of BEGIN gives the
        # delimiter to be used for the matches.
        #
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list
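
    #
    # Illustrative sketch of groups() (hypothetical input, for
    # illustration only):
    #
    #     args = CTokenArgs(r"\1;")
    #     level, groups = args.groups(CTokenizer("M(x, y)"))
    #
    # Here, level is the (bracket, paren, brace) level of the opening
    # delimiter, and groups[1] holds the tokens of "x".
    #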

    def tokens(self, new_tokenizer):
        """
        Build a new token list from the sub pattern, expanding each
        backref with the corresponding group matched at ``new_tokenizer``.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokenizer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens.append(new_tok)
            else:
                new.tokens.append(tok)

        return new.tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder in Python with its standard re module, as several
    advanced regular expression features are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match the open/close parentheses of a
    STRUCT_GROUP() string search.

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

    https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    although I re-implemented it to make it more generic and to match 3
    types of delimiters. The logic checks if delimiters are paired. If
    not, it will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim
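
    #
    # A minimal usage sketch (hypothetical macro, for illustration only):
    #
    #     cm = CMatch(r"STRUCT_GROUP")
    #     for m in cm.search("STRUCT_GROUP(int a; int b;) more;"):
    #         print(m)   # the matched "STRUCT_GROUP(...)" block, as a string
    #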

    def _search(self, tokenizer):
        """
        Find paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended up using a different implementation, to align all three
        types of delimiters and to seek for an initial regular expression.

        The algorithm seeks open/close paired delimiters and places them
        on a stack, yielding the start/stop position of each match when
        the stack is zeroed.

        The algorithm should work fine for properly paired lines, but it
        will silently ignore end delimiters that precede a start
        delimiter. This should be OK for the kernel-doc parser, as
        unpaired delimiters would cause compilation errors, so we don't
        need to raise exceptions to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name-only token without BEGIN/END: yield it and drop
                # the stack entry pushed for this match
                if i > start:
                    i -= 1
                yield start, i
                start = None
                stack.pop()
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, level = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing the levels is not there, return the remaining
        # tokens. This is meant to solve cases where the caller logic
        # might be picking an incomplete block.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)
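
    #
    # Illustrative sketch (hypothetical input, for illustration only):
    # passing a plain string yields matched strings, while passing a
    # CTokenizer yields CTokenizer objects.
    #
    #     list(CMatch(r"FOO").search("FOO(a) FOO(b)"))
    #     # -> ["FOO(a)", "FOO(b)"]
    #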

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter,
        returning occurrences only if all delimiters are paired.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)
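
    #
    # A minimal sketch of sub(), defined below (hypothetical macro, for
    # illustration only):
    #
    #     CMatch(r"WRAP").sub(r"\1", "int x; WRAP(y); int z;")
    #     # -> "int x; y; int z;"
    #
    # The WRAP(...) block is replaced by its first argument.
    #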

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        If the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired
        data with the delimiters stripped.

        If count is nonzero, it will replace at most count occurrences.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlaps at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing with "", this should not
        # be a problem, but other transformations could be problematic.
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Return a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'