xref: /linux/tools/lib/python/kdoc/c_lex.py (revision 5181afcdf99527dd92a88f80fc4d0d8013e1b510)
1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9
Please notice that the code here may raise exceptions to indicate bad
usage inside kdoc, pointing at problems on the replace pattern.
12
13Other errors are logged via log instance.
14"""
15
16import logging
17import re
18
19from copy import copy
20
21from .kdoc_re import KernRe
22
23log = logging.getLogger(__name__)
24
def tokenizer_set_log(logger, prefix=""):
    """
    Replace the module-level logger with a LoggerAdapter that
    prepends *prefix* to every message.
    """
    global log

    class PrefixAdapter(logging.LoggerAdapter):
        """
        Ancillary adapter that prepends a fixed prefix to every log message.
        """
        def process(self, msg, kwargs):
            # str.format handles non-string msg objects, like f-strings do
            return "{}{}".format(prefix, msg), kwargs

    # From now on, all module logging goes through the adapter
    log = PrefixAdapter(logger, {"prefix": prefix})
41
class CToken():
    """
    Data class to define a C token.
    """

    # Tokens that can be used by the parser. Works like a C enum.

    COMMENT = 0     #: A standard C or C99 comment, including delimiter.
    STRING = 1      #: A string, including quotation marks.
    CHAR = 2        #: A character, including apostrophes.
    NUMBER = 3      #: A number.
    PUNC = 4        #: A punctuation mark: ``,`` / ``.``.
    BEGIN = 5       #: A begin character: ``{`` / ``[`` / ``(``.
    END = 6         #: An end character: ``}`` / ``]`` / ``)``.
    CPP = 7         #: A preprocessor macro.
    HASH = 8        #: The hash character - useful to handle other macros.
    OP = 9          #: A C operator (add, subtract, ...).
    STRUCT = 10     #: A ``struct`` keyword.
    UNION = 11      #: An ``union`` keyword.
    ENUM = 12       #: An ``enum`` keyword.
    TYPEDEF = 13    #: A ``typedef`` keyword.
    NAME = 14       #: A name. Can be an ID or a type.
    SPACE = 15      #: Any space characters, including new lines
    ENDSTMT = 16    #: End of an statement (``;``).

    BACKREF = 17    #: Not a valid C sequence, but used at sub regex patterns.

    MISMATCH = 255  #: an error indicator: should never happen in practice.

    # Dict to convert from an enum integer into a string.
    # Inside a class body, vars() returns the namespace being built, so
    # this picks up all int attributes defined above.
    _name_by_val = {v: k for k, v in dict(vars()).items() if isinstance(v, int)}

    # Dict to convert from string to an enum-like integer value.
    _name_to_val = {k: v for v, k in _name_by_val.items()}

    @staticmethod
    def to_name(val):
        """Convert from an integer value from CToken enum into a string"""

        return CToken._name_by_val.get(val, f"UNKNOWN({val})")

    @staticmethod
    def from_name(name):
        """
        Convert a string into a CToken enum value.

        Returns CToken.MISMATCH if the name is not a known token.
        """
        return CToken._name_to_val.get(name, CToken.MISMATCH)


    def __init__(self, kind, value=None, pos=0,
                 brace_level=0, paren_level=0, bracket_level=0):
        """
        Store one token: its kind (a CToken enum value), the matched text,
        the position inside the source and the delimiter nesting levels.
        """
        self.kind = kind
        self.value = value
        self.pos = pos
        # Note the order: (bracket, paren, brace)
        self.level = (bracket_level, paren_level, brace_level)

    def __repr__(self):
        """Displayable version, mirroring the constructor call."""
        name = self.to_name(self.kind)
        if isinstance(self.value, str):
            value = '"' + self.value + '"'
        else:
            value = self.value

        return f"CToken(CToken.{name}, {value}, {self.pos}, {self.level})"
107
#: Regexes to parse C code, transforming it into tokens.
RE_SCANNER_LIST = [
    #
    # Note that \s\S is different than .*, as it also catches \n
    #
    # Ordering matters: the joined regex tries each alternative left to
    # right, so more specific patterns (CPP before HASH, ENDSTMT before
    # the OP ":") must come first.  MISMATCH has to stay last, as it
    # matches any single leftover character.
    #
    (CToken.COMMENT, r"//[^\n]*|/\*[\s\S]*?\*/"),

    (CToken.STRING,  r'"(?:\\.|[^"\\])*"'),
    (CToken.CHAR,    r"'(?:\\.|[^'\\])'"),

    # Hex, octal, then decimal/float constants, with optional suffixes
    (CToken.NUMBER,  r"0[xX][\da-fA-F]+[uUlL]*|0[0-7]+[uUlL]*|"
                     r"\d+(?:\.\d*)?(?:[eE][+-]?\d+)?[fFlL]*"),

    # ";" including any whitespace right before it
    (CToken.ENDSTMT, r"(?:\s+;|;)"),

    (CToken.PUNC,    r"[,\.]"),

    (CToken.BEGIN,   r"[\[\(\{]"),

    (CToken.END,     r"[\]\)\}]"),

    (CToken.CPP,     r"#\s*(?:define|include|ifdef|ifndef|if|else|elif|endif|undef|pragma)\b"),

    (CToken.HASH,    r"#"),

    # Multi-char operators first, then single-char ones
    (CToken.OP,      r"\+\+|\-\-|\->|==|\!=|<=|>=|&&|\|\||<<|>>|\+=|\-=|\*=|/=|%="
                     r"|&=|\|=|\^=|[=\+\-\*/%<>&\|\^~!\?\:]"),

    (CToken.STRUCT,  r"\bstruct\b"),
    (CToken.UNION,   r"\bunion\b"),
    (CToken.ENUM,    r"\benum\b"),
    (CToken.TYPEDEF, r"\btypedef\b"),

    (CToken.NAME,    r"[A-Za-z_]\w*"),

    (CToken.SPACE,   r"\s+"),

    # Backrefs like \1: not C, used by CTokenArgs substitution patterns
    (CToken.BACKREF, r"\\\d+"),

    (CToken.MISMATCH,r"."),
]
149
def fill_re_scanner(token_list):
    """Ancillary routine to convert RE_SCANNER_LIST into a finditer regex"""

    # One named group per token kind, so match.lastgroup identifies
    # which token matched.
    pattern = "|".join(f"(?P<{CToken.to_name(kind)}>{expr})"
                       for kind, expr in token_list)

    return KernRe(pattern, re.MULTILINE | re.DOTALL)
159
#: Handle C continuation lines (a backslash followed by a newline).
RE_CONT = KernRe(r"\\\n")

#: Matches the opening marker of a C comment, plus trailing whitespace.
RE_COMMENT_START = KernRe(r'/\*\s*')

#: tokenizer regex, built right here at import time from RE_SCANNER_LIST.
RE_SCANNER = fill_re_scanner(RE_SCANNER_LIST)
167
168
class CTokenizer():
    """
    Scan C statements and definitions and produce tokens.

    When converted to string, it drops comments and handles public/private
    markers, respecting depth.
    """

    # This class is inspired and follows the basic concepts of:
    #   https://docs.python.org/3/library/re.html#writing-a-tokenizer

    def __init__(self, source=None):
        """
        Tokenize ``source``, caching the results at ``self.tokens``.

        ``source`` may be:

        - ``None`` or empty: creates an empty tokenizer;
        - a list of CToken: re-used directly as the token cache;
        - a string: parsed into CToken objects via RE_SCANNER.

        While I generally don't like using regex group naming via:
            (?P<name>...)

        in this particular case, it makes sense, as we can pick the name
        when matching a code via RE_SCANNER.
        """

        #
        # Store logger to allow parser classes to re-use it
        #
        self.log = log

        self.tokens = []

        if not source:
            return

        if isinstance(source, list):
            self.tokens = source
            return

        #
        # While we could just use _tokenize directly via iterator,
        # as we'll need to use the tokenizer several times inside kernel-doc
        # to handle macro transforms, cache the results on a list, as
        # re-using it is cheaper than having to parse every time.
        #
        for tok in self._tokenize(source):
            self.tokens.append(tok)

    def _tokenize(self, source):
        """
        Iterator that parses ``source``, splitting it into tokens, as defined
        at the module-level ``RE_SCANNER_LIST``.

        Yields CToken objects, annotated with the brace/paren/bracket
        nesting level where each token appears.
        """

        # Handle continuation lines. Note that kdoc_parser already has a
        # logic to do that. Still, let's keep it for completeness, as we might
        # end re-using this tokenizer outside kernel-doc some day - or we may
        # eventually remove from there as a future cleanup.
        source = RE_CONT.sub("", source)

        brace_level = 0
        paren_level = 0
        bracket_level = 0

        for match in RE_SCANNER.finditer(source):
            kind = CToken.from_name(match.lastgroup)
            pos = match.start()
            value = match.group()

            if kind == CToken.MISMATCH:
                log.error(f"Unexpected token '{value}' on pos {pos}:\n\t'{source}'")
            elif kind == CToken.BEGIN:
                if value == '(':
                    paren_level += 1
                elif value == '[':
                    bracket_level += 1
                else:  # value == '{'
                    brace_level += 1

            elif kind == CToken.END:
                if value == ')' and paren_level > 0:
                    paren_level -= 1
                elif value == ']' and bracket_level > 0:
                    bracket_level -= 1
                elif brace_level > 0:    # value == '}'
                    brace_level -= 1

            yield CToken(kind, value, pos,
                         brace_level, paren_level, bracket_level)

    def __str__(self):
        """
        Return the C code for the cached tokens, dropping comments and
        honoring "private:" / "public:" comment markers per nesting depth.
        """
        out = ""
        show_stack = [True]

        for i, tok in enumerate(self.tokens):
            if tok.kind == CToken.BEGIN:
                # Inner blocks inherit the current visibility
                show_stack.append(show_stack[-1])

            elif tok.kind == CToken.END:
                prev = show_stack[-1]
                if len(show_stack) > 1:
                    show_stack.pop()

                # Always emit the closing delimiter of a hidden block
                # that lives inside a visible one
                if not prev and show_stack[-1]:
                    #
                    # Try to preserve indent
                    #
                    out += "\t" * (len(show_stack) - 1)

                    out += str(tok.value)
                    continue

            elif tok.kind == CToken.COMMENT:
                comment = RE_COMMENT_START.sub("", tok.value)

                if comment.startswith("private:"):
                    show_stack[-1] = False
                elif comment.startswith("public:"):
                    show_stack[-1] = True

                # Comments themselves are never emitted
                continue

            if not show_stack[-1]:
                continue

            if i < len(self.tokens) - 1:
                next_tok = self.tokens[i + 1]

                # Do some cleanups before ";"

                if tok.kind == CToken.SPACE and next_tok.kind == CToken.ENDSTMT:
                    continue

                if tok.kind == CToken.ENDSTMT and next_tok.kind == tok.kind:
                    continue

            out += str(tok.value)

        return out
309
310
class CTokenArgs:
    """
    Ancillary class to help using backrefs from sub matches.

    If the highest backref contains a "+" at the last element,
    the logic will be greedy, picking all other delims.

    This is needed to parse struct_group macros which end with ``MEMBERS...``.
    """
    def __init__(self, sub_str):
        r"""
        Parse ``sub_str`` looking for backrefs (``\1``, ``\2+``, ...),
        recording which groups are used and whether the highest one
        is greedy.

        Raises ValueError if more than one greedy backref is used, or
        if the greedy backref is not the highest one.
        """
        self.sub_groups = set()
        self.max_group = -1
        self.greedy = None

        for m in KernRe(r'\\(\d+)([+]?)').finditer(sub_str):
            group = int(m.group(1))
            if m.group(2) == "+":
                # "is not None": a greedy group 0 would be falsy otherwise
                if self.greedy is not None and self.greedy != group:
                    raise ValueError("There are multiple greedy patterns!")
                self.greedy = group

            self.sub_groups.add(group)
            self.max_group = max(self.max_group, group)

        if self.greedy is not None:
            if self.greedy != self.max_group:
                raise ValueError("Greedy pattern is not the last one!")

            # Strip the "+" markers before tokenizing the pattern
            sub_str = KernRe(r'(\\\d+)[+]').sub(r"\1", sub_str)

        self.sub_str = sub_str
        # NOTE: attribute name keeps the original (misspelled) spelling,
        # to preserve the public interface of this class.
        self.sub_tokeninzer = CTokenizer(sub_str)

    def groups(self, new_tokenizer):
        r"""
        Create replacement arguments for backrefs like:

        ``\0``, ``\1``, ``\2``, ... ``\{number}``

        It also accepts a ``+`` character to the highest backref, like
        ``\4+``. When used, the backref will be greedy, picking all other
        arguments afterwards.

        The logic is smart enough to only go up to the maximum required
        argument, even if there are more.

        If there is a backref for an argument above the limit, it will
        raise an exception. Please notice that, on C, square brackets
        don't have any separator on it. Trying to use ``\1``..``\n`` for
        brackets also raise an exception.
        """

        level = (0, 0, 0)

        if self.max_group < 0:
            return level, []

        tokens = new_tokenizer.tokens

        #
        # Fill \0 with the full token contents
        #
        groups_list = [ [] ]

        if 0 in self.sub_groups:
            inner_level = 0

            for i in range(0, len(tokens)):
                tok = tokens[i]

                if tok.kind == CToken.BEGIN:
                    inner_level += 1

                    #
                    # Discard first begin
                    #
                    if not groups_list[0]:
                        continue
                elif tok.kind == CToken.END:
                    inner_level -= 1
                    if inner_level < 0:
                        break

                if inner_level:
                    groups_list[0].append(tok)

        if not self.max_group:
            return level, groups_list

        delim = None

        #
        # Ignore everything before BEGIN. The value of begin gives the
        # delimiter to be used for the matches
        #
        # Initialize i so an empty token list doesn't break the loop
        # over the remaining tokens below.
        #
        i = len(tokens) - 1
        for i in range(0, len(tokens)):
            tok = tokens[i]
            if tok.kind == CToken.BEGIN:
                if tok.value == "{":
                    delim = ";"
                elif tok.value == "(":
                    delim = ","
                else:
                    # Square brackets have no argument separator in C
                    log.error(fr"Can't handle \1..\n on {self.sub_str}")

                level = tok.level
                break

        pos = 1
        groups_list.append([])

        inner_level = 0
        for i in range(i + 1, len(tokens)):
            tok = tokens[i]

            if tok.kind == CToken.BEGIN:
                inner_level += 1
            if tok.kind == CToken.END:
                inner_level -= 1
                if inner_level < 0:
                    break

            if tok.kind in [CToken.PUNC, CToken.ENDSTMT] and delim == tok.value:
                pos += 1
                # On greedy mode, the last group absorbs the delimiters
                # and everything after them
                if self.greedy is not None and pos > self.max_group:
                    pos -= 1
                else:
                    groups_list.append([])

                    if pos > self.max_group:
                        break

                    continue

            groups_list[pos].append(tok)

        if pos < self.max_group:
            log.error(fr"{self.sub_str} groups are up to {pos} instead of {self.max_group}")

        return level, groups_list

    def tokens(self, new_tokenizer):
        """
        Return a new token list where each BACKREF token of the sub
        pattern is replaced by the corresponding group picked from
        ``new_tokenizer``, shifting nesting levels accordingly.
        """
        level, groups = self.groups(new_tokenizer)

        new = CTokenizer()

        for tok in self.sub_tokeninzer.tokens:
            if tok.kind == CToken.BACKREF:
                group = int(tok.value[1:])

                for group_tok in groups[group]:
                    new_tok = copy(group_tok)

                    # Shift the copied token by the base level of the
                    # matched block
                    new_level = [0, 0, 0]

                    for i in range(0, len(level)):
                        new_level[i] = new_tok.level[i] + level[i]

                    new_tok.level = tuple(new_level)

                    new.tokens += [ new_tok ]
            else:
                new.tokens += [ tok ]

        return new.tokens
476
477
class CMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP(),

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """


    def __init__(self, regex, delim="("):
        """
        Store the compiled search regex (anchored at the string start and
        ending at a word boundary) and the start delimiter that is
        expected to follow the match.
        """
        self.regex = KernRe("^" + regex + r"\b")
        self.start_delim = delim

    def _search(self, tokenizer):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        start = None
        started = False

        stack = []

        for i, tok in enumerate(tokenizer.tokens):
            if start is None:
                if tok.kind == CToken.NAME and self.regex.match(tok.value):
                    start = i
                    stack.append((start, tok.level))
                    started = False

                continue

            if not started:
                if tok.kind == CToken.SPACE:
                    continue

                if tok.kind == CToken.BEGIN and tok.value == self.start_delim:
                    started = True
                    continue

                # Name only token without BEGIN/END: yield it alone and
                # drop the pending stack entry, otherwise its stale level
                # could wrongly pair with a later END token.
                stack.pop()
                if i > start:
                    i -= 1
                yield start, i
                start = None
                continue

            if tok.kind == CToken.END and tok.level == stack[-1][1]:
                start, _level = stack.pop()

                yield start, i
                start = None

        #
        # If an END zeroing levels is not there, return remaining stuff
        # This is meant to solve cases where the caller logic might be
        # picking an incomplete block.
        #
        # Use "is not None", as a match may begin at token index 0.
        #
        if start is not None and stack:
            if started:
                s = str(tokenizer)
                log.warning(f"can't find a final end at {s}")

            yield start, len(tokenizer.tokens)

    def search(self, source):
        """
        This is similar to re.search, but yields every occurrence
        (like re.finditer):

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        Yields CTokenizer objects when ``source`` is a CTokenizer,
        strings otherwise.
        """

        if isinstance(source, CTokenizer):
            tokenizer = source
            is_token = True
        else:
            tokenizer = CTokenizer(source)
            is_token = False

        for start, end in self._search(tokenizer):
            new_tokenizer = CTokenizer(tokenizer.tokens[start:end + 1])

            if is_token:
                yield new_tokenizer
            else:
                yield str(new_tokenizer)

    def sub(self, sub_str, source, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\0'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        if isinstance(source, CTokenizer):
            is_token = True
            tokenizer = source
        else:
            is_token = False
            tokenizer = CTokenizer(source)

        # Detect if sub_str contains sub arguments

        args_match = CTokenArgs(sub_str)

        new_tokenizer = CTokenizer()
        pos = 0
        n = 0

        #
        # NOTE: the code below doesn't consider overlays at sub.
        # We may need to add some extra unit tests to check if those
        # would cause problems. When replacing by "", this should not
        # be a problem, but other transformations could be problematic
        #
        for start, end in self._search(tokenizer):
            new_tokenizer.tokens += tokenizer.tokens[pos:start]

            new = CTokenizer(tokenizer.tokens[start:end + 1])

            new_tokenizer.tokens += args_match.tokens(new)

            pos = end + 1

            n += 1
            if count and n >= count:
                break

        new_tokenizer.tokens += tokenizer.tokens[pos:]

        if not is_token:
            return str(new_tokenizer)

        return new_tokenizer

    def __repr__(self):
        """
        Returns a displayable version of the class init.
        """

        return f'CMatch("{self.regex.regex.pattern}")'
663