1#!/usr/bin/env python3
2# SPDX-License-Identifier: GPL-2.0
3# Copyright(c) 2025: Mauro Carvalho Chehab <mchehab@kernel.org>.
4
5"""
6Regular expression ancillary classes.
7
8Those help caching regular expressions and do matching for kernel-doc.
9"""
10
11import re
12
# Local cache for compiled regular expressions, keyed by (pattern, flags)
# so that the same pattern compiled with different flags never collides.
re_cache = {}


class KernRe:
    """
    Helper class to simplify regex declaration and usage.

    It calls re.compile for a given pattern. It also allows adding
    regular expressions together with "+" and defining sub at class
    init time.

    Compiled expressions can be cached via an argument (the default),
    helping to speed up repeated searches.
    """

    def _add_regex(self, string, flags):
        """
        Compile a new regex or re-use a compiled one from the cache.

        The cache key includes the flags: the same pattern compiled
        with different flags produces different matchers, so they
        must not share a cache slot.
        """

        key = (string, flags)

        if key in re_cache:
            self.regex = re_cache[key]
        else:
            self.regex = re.compile(string, flags=flags)

            if self.cache:
                re_cache[key] = self.regex

    def __init__(self, string, cache=True, flags=0):
        """
        Compile a regular expression and initialize internal vars.

        string: regular expression pattern.
        cache:  if True, store the compiled pattern for later re-use.
        flags:  flags passed to re.compile().
        """

        self.cache = cache
        self.last_match = None

        self._add_regex(string, flags)

    def __str__(self):
        """
        Return the regular expression pattern.
        """
        return self.regex.pattern

    def __add__(self, other):
        """
        Allow adding two regular expressions into one.

        The result is cached if either operand requested caching, and
        carries the union of both operands' flags.
        """

        return KernRe(str(self) + str(other), cache=self.cache or other.cache,
                      flags=self.regex.flags | other.regex.flags)

    def match(self, string):
        """
        Handle a re.match, storing its result in self.last_match.
        """

        self.last_match = self.regex.match(string)
        return self.last_match

    def search(self, string):
        """
        Handle a re.search, storing its result in self.last_match.
        """

        self.last_match = self.regex.search(string)
        return self.last_match

    def findall(self, string):
        """
        Alias to re.findall.
        """

        return self.regex.findall(string)

    def split(self, string):
        """
        Alias to re.split.
        """

        return self.regex.split(string)

    def sub(self, sub, string, count=0):
        """
        Alias to re.sub.
        """

        return self.regex.sub(sub, string, count=count)

    def group(self, num):
        """
        Return the group results of the last stored match.
        """

        return self.last_match.group(num)
108
109
class NestedMatch:
    """
    Finding nested delimiters is hard with regular expressions. It is
    even harder on Python with its normal re module, as there are several
    advanced regular expressions that are missing.

    This is the case of this pattern:

            '\\bSTRUCT_GROUP(\\(((?:(?>[^)(]+)|(?1))*)\\))[^;]*;'

    which is used to properly match open/close parenthesis of the
    string search STRUCT_GROUP().

    Add a class that counts pairs of delimiters, using it to match and
    replace nested expressions.

    The original approach was suggested by:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex

    Although I re-implemented it to make it more generic and match 3 types
    of delimiters. The logic checks if delimiters are paired. If not, it
    will ignore the search string.
    """

    # TODO: make NestedMatch handle multiple match groups
    #
    # Right now, regular expressions to match it are defined only up to
    #       the start delimiter, e.g.:
    #
    #       \bSTRUCT_GROUP\(
    #
    # is similar to: STRUCT_GROUP\((.*)\)
    # except that the content inside the match group is delimiter's aligned.
    #
    # The content inside parenthesis are converted into a single replace
    # group (e.g. r`\1').
    #
    # It would be nice to change such definition to support multiple
    # match groups, allowing a regex equivalent to.
    #
    #   FOO\((.*), (.*), (.*)\)
    #
    # it is probably easier to define it not as a regular expression, but
    # with some lexical definition like:
    #
    #   FOO(arg1, arg2, arg3)

    # Maps each open delimiter to its matching close delimiter
    DELIMITER_PAIRS = {
        '{': '}',
        '(': ')',
        '[': ']',
    }

    # Matches any single open or close delimiter
    RE_DELIM = re.compile(r'[\{\}\[\]\(\)]')

    def _search(self, regex, line):
        """
        Find paired blocks for a regex that ends with a delimiter.

        The suggestion of using finditer to match pairs came from:
        https://stackoverflow.com/questions/5454322/python-how-to-match-nested-parentheses-with-regex
        but I ended using a different implementation to align all three types
        of delimiters and seek for an initial regular expression.

        The algorithm seeks for open/close paired delimiters and places them
        into a stack, yielding a (start, offset, end) position tuple of each
        match when the stack is zeroed.

        The algorithm should work fine for properly paired lines, but will
        silently ignore end delimiters that precede a start delimiter.
        This should be OK for the kernel-doc parser, as unaligned delimiters
        would cause compilation errors. So, we don't need to raise exceptions
        to cover such issues.
        """

        for match_re in regex.finditer(line):
            start = match_re.start()
            offset = match_re.end()

            # The regex is expected to end at an open delimiter;
            # skip matches that don't
            d = line[offset - 1]
            if d not in self.DELIMITER_PAIRS:
                continue

            # Start a fresh stack per match so that an earlier unclosed
            # match can't leave stale entries that corrupt this one
            stack = [self.DELIMITER_PAIRS[d]]

            for match in self.RE_DELIM.finditer(line[offset:]):
                pos = match.start() + offset

                d = line[pos]

                if d in self.DELIMITER_PAIRS:
                    stack.append(self.DELIMITER_PAIRS[d])
                    continue

                # Does the end delimiter match what is expected?
                if stack and d == stack[-1]:
                    stack.pop()

                    if not stack:
                        yield start, offset, pos + 1
                        break

    def search(self, regex, line):
        """
        This is similar to re.search:

        It matches a regex that is followed by a delimiter, yielding
        occurrences only if all delimiters are paired.
        """

        for t in self._search(regex, line):

            yield line[t[0]:t[2]]

    def sub(self, regex, sub, line, count=0):
        """
        This is similar to re.sub:

        It matches a regex that is followed by a delimiter, replacing
        occurrences only if all delimiters are paired.

        If r'\\1' is used, it works just like re: it places there the
        matched paired data with the delimiter stripped.

        If count is different than zero, it will replace at most count
        items.
        """
        out = ""

        cur_pos = 0
        n = 0

        for start, end, pos in self._search(regex, line):
            out += line[cur_pos:start]

            # Value, ignoring start/end delimiters
            value = line[end:pos - 1]

            # Replace \1 in the substitution string, if used there
            out += sub.replace(r'\1', value)

            # Drop an end ';' if any, guarding against a match that
            # finishes exactly at the end of the line
            if pos < len(line) and line[pos] == ';':
                pos += 1

            cur_pos = pos
            n += 1

            # Honor the count limit: stop once n replacements were done
            if count and n >= count:
                break

        # Append the remaining string
        out += line[cur_pos:]

        return out
274