xref: /linux/tools/lib/python/kdoc/kdoc_re.py (revision 72c395024dac5e215136cbff793455f065603b06)
1e31fd36dSMauro Carvalho Chehab#!/usr/bin/env python3
2e31fd36dSMauro Carvalho Chehab# SPDX-License-Identifier: GPL-2.0
3e31fd36dSMauro Carvalho Chehab# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4e31fd36dSMauro Carvalho Chehab
5e31fd36dSMauro Carvalho Chehab"""
6e31fd36dSMauro Carvalho ChehabRegular expression ancillary classes.
7e31fd36dSMauro Carvalho Chehab
8e31fd36dSMauro Carvalho ChehabThose help caching regular expressions and do matching for kernel-doc.
9e31fd36dSMauro Carvalho Chehab"""
10e31fd36dSMauro Carvalho Chehab
11e31fd36dSMauro Carvalho Chehabimport re
12e31fd36dSMauro Carvalho Chehab
# Module-level cache of compiled regular expressions. It is shared by
# every KernRe instance: KernRe._add_regex() looks patterns up here and,
# when caching is enabled, stores newly compiled ones.
re_cache = {}
15e31fd36dSMauro Carvalho Chehab
16e31fd36dSMauro Carvalho Chehab
class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions and define sub at class init time.

    Regular expressions can be cached via an argument, helping to speedup
    searches.

    The result of the last match()/search() call is kept at
    ``self.last_match``, so that group() can be called afterwards.
    """

    def _add_regex(self, string, flags):
        """
        Adds a new regex or reuses it from the cache.

        The cache is indexed by both the pattern and its flags: the same
        pattern compiled with different flags (for instance, with and
        without re.IGNORECASE) behaves differently, so such expressions
        must not share a cache entry.
        """
        key = (string, flags)

        self.regex = re_cache.get(key)
        if self.regex is None:
            self.regex = re.compile(string, flags=flags)
            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        ``string`` is the pattern, ``cache`` tells if the compiled regex
        shall be stored at the module cache and ``flags`` are the usual
        re module flags.
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __repr__(self):
        """
        Return a re.compile()-like representation of the pattern.
        """
        return f're.compile("{self.regex.pattern}")'

    def __add__(self, other):
        """
        Allows adding two regular expressions into one.

        The patterns are concatenated and the flags of both expressions
        are OR-ed. The result is cached if either operand is cached.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handles a re.match storing its results.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handles a re.search storing its results.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Returns the group results of the last match.

        It must only be called after a successful match() or search():
        when the last attempt didn't match, ``self.last_match`` is None
        and this raises AttributeError.
        """

        return self.last_match.group(num)
108e31fd36dSMauro Carvalho Chehab
109e31fd36dSMauro Carvalho Chehab
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern::

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parentheses of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:

        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter-aligned.
    #
    # The content inside parentheses is converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to:
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each supported start delimiter to its end delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any start or end delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Finds paired blocks for a regex that ends with a delimiter.

        ``regex`` must provide a finditer() method, like a compiled
        re pattern does. For each occurrence whose delimiters are paired,
        yields a ``(start, offset, end)`` tuple, where ``start`` is the
        position where the regex matched, ``offset`` is the position just
        after the start delimiter and ``end`` is the position just after
        the paired end delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a start/stop position of each match when the
        stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # An empty match at the beginning of the line has no
            # delimiter before it
            if offset == 0:
                continue

            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Each occurrence has its own stack: an unpaired occurrence
            # shall not prevent the next ones from being detected
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line, offset):
                pos = match.start()

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that it is followed by a delimiter,
        returning occurrences only if all delimiters are paired.

        Yields each matched string, from the beginning of the regex match
        up to (and including) the paired end delimiter.
        """

        for start, _, end in self._search(regex, line):
            yield line[start:end]

    def sub(self, regex, sub, line, count=0):
        r"""
        This is similar to re.sub:

        It matches a regex that it is followed by a delimiter,
        replacing occurrences only if all delimiters are paired.

        if the sub argument contains::

            r'\1'

        it will work just like re: it places there the matched paired data
        with the delimiter stripped.

        A ';' that immediately follows the end delimiter is dropped
        together with the match.

        Like re.sub, only non-overlapping occurrences are replaced: a match
        nested inside an already replaced block is left untouched.

        If count is different than zero, it will replace at most count
        items.
        """
        out = []

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            # Skip occurrences nested inside an already-replaced block
            if start < cur_pos:
                continue

            out.append(line[cur_pos:start])

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # replaces \1 at the sub string, if \1 is used there
            out.append(sub.replace(r'\1', value))

            # Drop end ';' if any. The match may end at the end of the line
            if line.startswith(';', pos):
                pos += 1

            cur_pos = pos
            n += 1

            # Stop once the requested number of replacements is reached
            if count and n >= count:
                break

        # Append the remaining string
        out.append(line[cur_pos:])

        return "".join(out)
279