#!/usr/bin/env python3
# SPDX-License-Identifier: GPL-2.0
# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.

"""
Regular expression ancillary classes.

Those help caching regular expressions and do matching for kernel-doc.

Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc to indicate problems at the replace pattern.

Other errors are logged via log instance.
"""

import logging
import re

from copy import copy

from .kdoc_re import KernRe

log = logging.getLogger(__name__)

def tokenizer_set_log(logger, prefix = ""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary class to prepend a prefix on all message logs.
        """
        def process(self, msg, kwargs):
            return f"{prefix}{msg}", kwargs

    # Wrap the provided logger in our adapter
    log = PrefixAdapter(logger, {"prefix": prefix})

class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiters.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5      #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: A ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines.
    ENDSTMT = 16    #: End of a statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: An error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Built from the class namespace itself: every int attribute above
    # is taken as an enum member.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from a string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """Convert a string into a CToken enum value"""
        if name in CToken._name_to_val:
            return CToken._name_to_val[name]

        return CToken.MISMATCH


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        self.kind = kind
        self.value = value
        self.pos = pos
        # Delimiter nesting depth at this token: (bracket, paren, brace)
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Displayable form that can be pasted back as a constructor call."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"

#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING, r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR, r"'(?:\\.|[^'\\])'"),

    (CToken.NUMBER, r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                    r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC, r"[,\.]"),

    (CToken.BEGIN, r"[\[\(\{]"),

    (CToken.END, r"[\]\)\}]"),

    (CToken.CPP, r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH, r"#"),

    (CToken.OP, r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT, r"\bstruct\b"),
    (CToken.UNION, r"\bunion\b"),
    (CToken.ENUM, r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME, r"[A-Za-z_]\w*"),

    (CToken.SPACE, r"\s+"),

    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH, r"."),
]

def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""
    re_tokens = []

    for kind, pattern in token_list:
        # Use the token name as a regex group name, so that
        # Match.lastgroup directly identifies the token kind
        name = CToken.to_name(kind)
        re_tokens.append(f"(?P<{name}>{pattern})")

    return KernRe("|".join(re_tokens), re.MULTILINE | re.DOTALL)

#: Handle C continuation lines.
RE_CONT = KernRe(r"\\\n")

#: Matches the start of a C comment, including trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex. Will be filled at the first CTokenizer usage.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)


class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    # https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source``, which may be:

        - None/empty: create an empty tokenizer;
        - a list of CToken: adopt it as-is (no copy);
        - a string: parse it with RE_SCANNER.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.
        """

        #
        # Store logger to allow parser classes to re-use it.
        # Note: this captures the module logger at construction time.
        #
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        self.tokens = list(self._tokenize(source))

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at ``RE_SCANNER_LIST``.

        The iterator yields CToken class objects, each carrying the
        (bracket, paren, brace) nesting level at the token's position.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                # Check the value on every branch: an unbalanced ')' or ']'
                # must not decrement brace_level by falling through
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif value == '}' and brace_level > 0:
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Emit the C code for the cached tokens, dropping comments and
        honoring ``private:`` / ``public:`` comment markers per brace depth.
        """
        # Build via list + join instead of quadratic string +=
        parts = []

        # Visibility per nesting depth; top level is always shown
        show_stack = [True]

        last = len(self.tokens) - 1

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # New region inherits the visibility of the enclosing one
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                # If both the region being closed and the enclosing one are
                # hidden, drop the END token too: its matching BEGIN was
                # already suppressed, so emitting it would leave a stray "}"
                if not prev and not show_stack[-1]:
                    continue

                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    parts.append("\t" * (len(show_stack) - 1))

                parts.append(str(tok.value))
                continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                # Comments are always dropped, but private:/public:
                # markers toggle visibility at the current depth
                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                continue

            if not show_stack[-1]:
                continue

            if i < last:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            parts.append(str(tok.value))

        return "".join(parts)


class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with
    ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # Compare against None: backref 0 is a valid greedy group
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # Strip the "+" markers, keeping only the plain backrefs
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character to the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        raise an exception. Please notice that, on C, square brackets
        don't have any separator on it. Trying to use ``\1``..``\n`` for
        brackets also raise an exception.

        Returns a ``(level, groups_list)`` tuple, where ``level`` is the
        nesting level of the opening delimiter and ``groups_list[n]`` has
        the tokens for backref ``\n``.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [[]]

        if 0 in self.sub_groups:
            inner_level = 0

            for tok in tokens:
                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        # Track the BEGIN index explicitly: this also copes with a token
        # list that has no BEGIN at all (the fill loop below becomes empty)
        #
        begin_idx = len(tokens)
        for i, tok in enumerate(tokens):
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    # Square brackets have no argument separator in C
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                begin_idx = i
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(begin_idx + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                if self.greedy is not None and pos > self.max_group:
                    # Greedy: keep appending to the last group
                    pos -= 1
                else:
                    groups_list.append([])

                if pos > self.max_group:
                    break

                continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Build the replacement token list for ``new_tokenizer``, expanding
        each BACKREF token of the sub pattern into the tokens of the
        corresponding group, rebasing their nesting levels on the match
        level.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind != CToken.BACKREF:
                new.tokens.append(tok)
                continue

            group = int(tok.value[1:])

            for group_tok in groups[group]:
                new_tok = copy(group_tok)

                # Rebase the token's delimiter levels on the match level
                new_tok.level = tuple(a + b for a, b in
                                      zip(new_tok.level, level))

                new.tokens.append(new_tok)

        return new.tokens


class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

        '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    def __init__(self, regex, delim="("):
        # Anchor the regex at the match start and require a word boundary,
        # so "FOO" won't match "FOOBAR"
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter,
        yielding (start, end) token index pairs.

        The suggestion of using finditer to match pairs came from:
            https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name only token without BEGIN/END: yield the tokens up to
                # (but not including) the current one, drop the pending
                # stack entry and restart scanning. Restarting here avoids
                # a double yield if the current token happens to be an END
                # at the same level.
                stack.pop()
                yield start, i - 1
                start = None
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, _ = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # Compare against None: a match starting at token 0 is valid
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        Accepts either a CTokenizer or a string; yields matches of the
        same type as the input.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlays at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            matched = CTokenizer(tokenizer.tokens[start:end + 1])
            new_tokenizer.tokens += args_match.tokens(matched)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'