1 /*
2 * regcomp and regexec -- regsub and regerror are elsewhere
3 *
4 * Copyright (c) 1986 by University of Toronto.
5 * Written by Henry Spencer. Not derived from licensed software.
6 *
7 * Permission is granted to anyone to use this software for any
8 * purpose on any computer system, and to redistribute it freely,
9 * subject to the following restrictions:
10 *
11 * 1. The author is not responsible for the consequences of use of
12 * this software, no matter how awful, even if they arise
13 * from defects in it.
14 *
15 * 2. The origin of this software must not be misrepresented, either
16 * by explicit claim or by omission.
17 *
18 * 3. Altered versions must be plainly marked as such, and must not
19 * be misrepresented as being the original software.
20 *
21 * Beware that some of this code is subtly aware of the way operator
22 * precedence is structured in regular expressions. Serious changes in
23 * regular-expression syntax might require a total rethink.
24 *
25 * *** NOTE: this code has been altered slightly for use in Tcl. ***
26 * Slightly modified by David MacKenzie to undo most of the changes for TCL.
27 * Added regexec2 with notbol parameter. -- 4/19/99 Mark Nudelman
28 */
29
30 #include "less.h"
31 #if HAVE_STDIO_H
32 #include <stdio.h>
33 #endif
34 #if HAVE_STDLIB_H
35 #include <stdlib.h>
36 #endif
37 #if HAVE_STRING_H
38 #include <string.h>
39 #endif
40 #include "regexp.h"
41
42 /*
43 * The "internal use only" fields in regexp.h are present to pass info from
44 * compile to execute that permits the execute phase to run lots faster on
45 * simple cases. They are:
46 *
47 * regstart char that must begin a match; '\0' if none obvious
48 * reganch is the match anchored (at beginning-of-line only)?
49 * regmust string (pointer into program) that match must include, or NULL
50 * regmlen length of regmust string
51 *
52 * Regstart and reganch permit very fast decisions on suitable starting points
53 * for a match, cutting down the work a lot. Regmust permits fast rejection
54 * of lines that cannot possibly match. The regmust tests are costly enough
55 * that regcomp() supplies a regmust only if the r.e. contains something
56 * potentially expensive (at present, the only such thing detected is * or +
57 * at the start of the r.e., which can involve a lot of backup). Regmlen is
58 * supplied because the test in regexec() needs it and regcomp() is
59 * computing it anyway.
60 */
61
62 /*
63 * Structure for regexp "program". This is essentially a linear encoding
64 * of a nondeterministic finite-state machine (aka syntax charts or
65 * "railroad normal form" in parsing technology). Each node is an opcode
66 * plus a "next" pointer, possibly plus an operand. "Next" pointers of
67 * all nodes except BRANCH implement concatenation; a "next" pointer with
68 * a BRANCH on both ends of it is connecting two alternatives. (Here we
69 * have one of the subtle syntax dependencies: an individual BRANCH (as
70 * opposed to a collection of them) is never concatenated with anything
71 * because of operator precedence.) The operand of some types of node is
72 * a literal string; for others, it is a node leading into a sub-FSM. In
73 * particular, the operand of a BRANCH node is the first node of the branch.
74 * (NB this is *not* a tree structure: the tail of the branch connects
75 * to the thing following the set of BRANCHes.) The opcodes are:
76 */
77
78 /* definition number opnd? meaning */
79 #undef EOL
80 #define END 0 /* no End of program. */
81 #define BOL 1 /* no Match "" at beginning of line. */
82 #define EOL 2 /* no Match "" at end of line. */
83 #define ANY 3 /* no Match any one character. */
84 #define ANYOF 4 /* str Match any character in this string. */
85 #define ANYBUT 5 /* str Match any character not in this string. */
86 #define BRANCH 6 /* node Match this alternative, or the next... */
87 #define BACK 7 /* no Match "", "next" ptr points backward. */
88 #define EXACTLY 8 /* str Match this string. */
89 #define NOTHING 9 /* no Match empty string. */
90 #define STAR 10 /* node Match this (simple) thing 0 or more times. */
91 #define PLUS 11 /* node Match this (simple) thing 1 or more times. */
92 #define OPEN 20 /* no Mark this point in input as start of #n. */
93 /* OPEN+1 is number 1, etc. */
94 #define CLOSE 30 /* no Analogous to OPEN. */
95
96 /*
97 * Opcode notes:
98 *
99 * BRANCH The set of branches constituting a single choice are hooked
100 * together with their "next" pointers, since precedence prevents
101 * anything being concatenated to any individual branch. The
102 * "next" pointer of the last BRANCH in a choice points to the
103 * thing following the whole choice. This is also where the
104 * final "next" pointer of each individual branch points; each
105 * branch starts with the operand node of a BRANCH node.
106 *
107 * BACK Normal "next" pointers all implicitly point forward; BACK
108 * exists to make loop structures possible.
109 *
110 * STAR,PLUS '?', and complex '*' and '+', are implemented as circular
111 * BRANCH structures using BACK. Simple cases (one character
112 * per match) are implemented with STAR and PLUS for speed
113 * and to minimize recursive plunges.
114 *
115 * OPEN,CLOSE ...are numbered at compile time.
116 */
117
118 /*
119 * A node is one char of opcode followed by two chars of "next" pointer.
120 * "Next" pointers are stored as two 8-bit pieces, high order first. The
121 * value is a positive offset from the opcode of the node containing it.
122 * An operand, if any, simply follows the node. (Note that much of the
123 * code generation knows about this implicit relationship.)
124 *
125 * Using two bytes for the "next" pointer is vast overkill for most things,
126 * but allows patterns to get big without disasters.
127 */
128 #define OP(p) (*(p))
129 #define NEXT(p) (((*((p)+1)&0377)<<8) + (*((p)+2)&0377))
130 #define OPERAND(p) ((p) + 3)
131
132 /*
133 * See regmagic.h for one further detail of program structure.
134 */
135
136
137 /*
138 * Utility definitions.
139 */
140 #ifndef CHARBITS
141 #define UCHARAT(p) ((int)*(unsigned char *)(p))
142 #else
143 #define UCHARAT(p) ((int)*(p)&CHARBITS)
144 #endif
145
146 #define FAIL(m) { regerror(m); return(NULL); }
147 #define ISMULT(c) ((c) == '*' || (c) == '+' || (c) == '?')
148 #define META "^$.[()|?+*\\"
149
150 /*
151 * Flags to be passed up and down.
152 */
153 #define HASWIDTH 01 /* Known never to match null string. */
154 #define SIMPLE 02 /* Simple enough to be STAR/PLUS operand. */
155 #define SPSTART 04 /* Starts with * or +. */
156 #define WORST 0 /* Worst case. */
157
158 /*
159 * Global work variables for regcomp().
160 */
161 static constant char *regparse; /* Input-scan pointer. */
162 static int regnpar; /* () count. */
163 static char regdummy;
164 static char *regcode; /* Code-emit pointer; ®dummy = don't. */
165 static long regsize; /* Code size. */
166
167 /*
168 * The first byte of the regexp internal "program" is actually this magic
169 * number; the start node begins in the second byte.
170 */
171 #define MAGIC 0234
172
173
174 /*
175 * Forward declarations for regcomp()'s friends.
176 */
177 #ifndef STATIC
178 #define STATIC static
179 #endif
180 STATIC char *reg(int, int *);
181 STATIC char *regbranch(int *);
182 STATIC char *regpiece(int *);
183 STATIC char *regatom(int *);
184 STATIC char *regnode(char);
185 STATIC char *regnext(register char *);
186 STATIC void regc(char);
187 STATIC void reginsert(char, char *);
188 STATIC void regtail(char *, char *);
189 STATIC void regoptail(char *, char *);
190 #ifdef STRCSPN
191 STATIC int strcspn(constant char *, constant char *);
192 #endif
193
194 /*
195 - regcomp - compile a regular expression into internal code
196 *
197 * We can't allocate space until we know how big the compiled form will be,
198 * but we can't compile it (and thus know how big it is) until we've got a
199 * place to put the code. So we cheat: we compile it twice, once with code
200 * generation turned off and size counting turned on, and once "for real".
201 * This also means that we don't allocate space until we are sure that the
202 * thing really will compile successfully, and we never have to move the
203 * code and thus invalidate pointers into it. (Note that it has to be in
204 * one piece because free() must be able to free it all.)
205 *
206 * Beware that the optimization-preparation code in here knows about some
207 * of the structure of the compiled regexp.
208 */
209 regexp *
regcomp(constant char * exp)210 regcomp(constant char *exp)
211 {
212 register regexp *r;
213 register char *scan;
214 register char *longest;
215 register int len;
216 int flags;
217
218 if (exp == NULL)
219 FAIL("NULL argument");
220
221 /* First pass: determine size, legality. */
222 regparse = exp;
223 regnpar = 1;
224 regsize = 0L;
225 regcode = ®dummy;
226 regc(MAGIC);
227 if (reg(0, &flags) == NULL)
228 return(NULL);
229
230 /* Small enough for pointer-storage convention? */
231 if (regsize >= 32767L) /* Probably could be 65535L. */
232 FAIL("regexp too big");
233
234 /* Allocate space. */
235 r = (regexp *)malloc(sizeof(regexp) + (unsigned)regsize);
236 if (r == NULL)
237 FAIL("out of space");
238
239 /* Second pass: emit code. */
240 regparse = exp;
241 regnpar = 1;
242 regcode = r->program;
243 regc(MAGIC);
244 if (reg(0, &flags) == NULL)
245 {
246 free(r);
247 return(NULL);
248 }
249
250 /* Dig out information for optimizations. */
251 r->regstart = '\0'; /* Worst-case defaults. */
252 r->reganch = 0;
253 r->regmust = NULL;
254 r->regmlen = 0;
255 scan = r->program+1; /* First BRANCH. */
256 if (OP(regnext(scan)) == END) { /* Only one top-level choice. */
257 scan = OPERAND(scan);
258
259 /* Starting-point info. */
260 if (OP(scan) == EXACTLY)
261 r->regstart = *OPERAND(scan);
262 else if (OP(scan) == BOL)
263 r->reganch++;
264
265 /*
266 * If there's something expensive in the r.e., find the
267 * longest literal string that must appear and make it the
268 * regmust. Resolve ties in favor of later strings, since
269 * the regstart check works with the beginning of the r.e.
270 * and avoiding duplication strengthens checking. Not a
271 * strong reason, but sufficient in the absence of others.
272 */
273 if (flags&SPSTART) {
274 longest = NULL;
275 len = 0;
276 for (; scan != NULL; scan = regnext(scan))
277 if (OP(scan) == EXACTLY && ((int) strlen(OPERAND(scan))) >= len) {
278 longest = OPERAND(scan);
279 len = (int) strlen(OPERAND(scan));
280 }
281 r->regmust = longest;
282 r->regmlen = len;
283 }
284 }
285
286 return(r);
287 }
288
289 /*
290 - reg - regular expression, i.e. main body or parenthesized thing
291 *
292 * Caller must absorb opening parenthesis.
293 *
294 * Combining parenthesis handling with the base level of regular expression
295 * is a trifle forced, but the need to tie the tails of the branches to what
296 * follows makes it hard to avoid.
297 */
298 static char *
reg(int paren,int * flagp)299 reg(int paren, int *flagp)
300 {
301 register char *ret;
302 register char *br;
303 register char *ender;
304 register int parno = 0;
305 int flags;
306
307 *flagp = HASWIDTH; /* Tentatively. */
308
309 /* Make an OPEN node, if parenthesized. */
310 if (paren) {
311 if (regnpar >= NSUBEXP)
312 FAIL("too many ()");
313 parno = regnpar;
314 regnpar++;
315 ret = regnode(OPEN+parno);
316 } else
317 ret = NULL;
318
319 /* Pick up the branches, linking them together. */
320 br = regbranch(&flags);
321 if (br == NULL)
322 return(NULL);
323 if (ret != NULL)
324 regtail(ret, br); /* OPEN -> first. */
325 else
326 ret = br;
327 if (!(flags&HASWIDTH))
328 *flagp &= ~HASWIDTH;
329 *flagp |= flags&SPSTART;
330 while (*regparse == '|') {
331 regparse++;
332 br = regbranch(&flags);
333 if (br == NULL)
334 return(NULL);
335 regtail(ret, br); /* BRANCH -> BRANCH. */
336 if (!(flags&HASWIDTH))
337 *flagp &= ~HASWIDTH;
338 *flagp |= flags&SPSTART;
339 }
340
341 /* Make a closing node, and hook it on the end. */
342 ender = regnode((paren) ? CLOSE+parno : END);
343 regtail(ret, ender);
344
345 /* Hook the tails of the branches to the closing node. */
346 for (br = ret; br != NULL; br = regnext(br))
347 regoptail(br, ender);
348
349 /* Check for proper termination. */
350 if (paren && *regparse++ != ')') {
351 FAIL("unmatched ()");
352 } else if (!paren && *regparse != '\0') {
353 if (*regparse == ')') {
354 FAIL("unmatched ()");
355 } else
356 FAIL("junk on end"); /* "Can't happen". */
357 /* NOTREACHED */
358 }
359
360 return(ret);
361 }
362
363 /*
364 - regbranch - one alternative of an | operator
365 *
366 * Implements the concatenation operator.
367 */
368 static char *
regbranch(int * flagp)369 regbranch(int *flagp)
370 {
371 register char *ret;
372 register char *chain;
373 register char *latest;
374 int flags;
375
376 *flagp = WORST; /* Tentatively. */
377
378 ret = regnode(BRANCH);
379 chain = NULL;
380 while (*regparse != '\0' && *regparse != '|' && *regparse != ')') {
381 latest = regpiece(&flags);
382 if (latest == NULL)
383 return(NULL);
384 *flagp |= flags&HASWIDTH;
385 if (chain == NULL) /* First piece. */
386 *flagp |= flags&SPSTART;
387 else
388 regtail(chain, latest);
389 chain = latest;
390 }
391 if (chain == NULL) /* Loop ran zero times. */
392 (void) regnode(NOTHING);
393
394 return(ret);
395 }
396
397 /*
398 - regpiece - something followed by possible [*+?]
399 *
400 * Note that the branching code sequences used for ? and the general cases
401 * of * and + are somewhat optimized: they use the same NOTHING node as
402 * both the endmarker for their branch list and the body of the last branch.
403 * It might seem that this node could be dispensed with entirely, but the
404 * endmarker role is not redundant.
405 */
406 static char *
regpiece(int * flagp)407 regpiece(int *flagp)
408 {
409 register char *ret;
410 register char op;
411 register char *next;
412 int flags;
413
414 ret = regatom(&flags);
415 if (ret == NULL)
416 return(NULL);
417
418 op = *regparse;
419 if (!ISMULT(op)) {
420 *flagp = flags;
421 return(ret);
422 }
423
424 if (!(flags&HASWIDTH) && op != '?')
425 FAIL("*+ operand could be empty");
426 *flagp = (op != '+') ? (WORST|SPSTART) : (WORST|HASWIDTH);
427
428 if (op == '*' && (flags&SIMPLE))
429 reginsert(STAR, ret);
430 else if (op == '*') {
431 /* Emit x* as (x&|), where & means "self". */
432 reginsert(BRANCH, ret); /* Either x */
433 regoptail(ret, regnode(BACK)); /* and loop */
434 regoptail(ret, ret); /* back */
435 regtail(ret, regnode(BRANCH)); /* or */
436 regtail(ret, regnode(NOTHING)); /* null. */
437 } else if (op == '+' && (flags&SIMPLE))
438 reginsert(PLUS, ret);
439 else if (op == '+') {
440 /* Emit x+ as x(&|), where & means "self". */
441 next = regnode(BRANCH); /* Either */
442 regtail(ret, next);
443 regtail(regnode(BACK), ret); /* loop back */
444 regtail(next, regnode(BRANCH)); /* or */
445 regtail(ret, regnode(NOTHING)); /* null. */
446 } else if (op == '?') {
447 /* Emit x? as (x|) */
448 reginsert(BRANCH, ret); /* Either x */
449 regtail(ret, regnode(BRANCH)); /* or */
450 next = regnode(NOTHING); /* null. */
451 regtail(ret, next);
452 regoptail(ret, next);
453 }
454 regparse++;
455 if (ISMULT(*regparse))
456 FAIL("nested *?+");
457
458 return(ret);
459 }
460
461 /*
462 - regatom - the lowest level
463 *
464 * Optimization: gobbles an entire sequence of ordinary characters so that
465 * it can turn them into a single node, which is smaller to store and
466 * faster to run. Backslashed characters are exceptions, each becoming a
467 * separate node; the code is simpler that way and it's not worth fixing.
468 */
469 static char *
regatom(int * flagp)470 regatom(int *flagp)
471 {
472 register char *ret;
473 int flags;
474
475 *flagp = WORST; /* Tentatively. */
476
477 switch (*regparse++) {
478 case '^':
479 ret = regnode(BOL);
480 break;
481 case '$':
482 ret = regnode(EOL);
483 break;
484 case '.':
485 ret = regnode(ANY);
486 *flagp |= HASWIDTH|SIMPLE;
487 break;
488 case '[': {
489 register int clss;
490 register int classend;
491
492 if (*regparse == '^') { /* Complement of range. */
493 ret = regnode(ANYBUT);
494 regparse++;
495 } else
496 ret = regnode(ANYOF);
497 if (*regparse == ']' || *regparse == '-')
498 regc(*regparse++);
499 while (*regparse != '\0' && *regparse != ']') {
500 if (*regparse == '-') {
501 regparse++;
502 if (*regparse == ']' || *regparse == '\0')
503 regc('-');
504 else {
505 clss = UCHARAT(regparse-2)+1;
506 classend = UCHARAT(regparse);
507 if (clss > classend+1)
508 FAIL("invalid [] range");
509 for (; clss <= classend; clss++)
510 regc(clss);
511 regparse++;
512 }
513 } else
514 regc(*regparse++);
515 }
516 regc('\0');
517 if (*regparse != ']')
518 FAIL("unmatched []");
519 regparse++;
520 *flagp |= HASWIDTH|SIMPLE;
521 }
522 break;
523 case '(':
524 ret = reg(1, &flags);
525 if (ret == NULL)
526 return(NULL);
527 *flagp |= flags&(HASWIDTH|SPSTART);
528 break;
529 case '\0':
530 case '|':
531 case ')':
532 FAIL("internal urp"); /* Supposed to be caught earlier. */
533 /* NOTREACHED */
534 break;
535 case '?':
536 case '+':
537 case '*':
538 FAIL("?+* follows nothing");
539 /* NOTREACHED */
540 break;
541 case '\\':
542 if (*regparse == '\0')
543 FAIL("trailing \\");
544 ret = regnode(EXACTLY);
545 regc(*regparse++);
546 regc('\0');
547 *flagp |= HASWIDTH|SIMPLE;
548 break;
549 default: {
550 register int len;
551 register char ender;
552
553 regparse--;
554 len = (int) strcspn(regparse, META);
555 if (len <= 0)
556 FAIL("internal disaster");
557 ender = *(regparse+len);
558 if (len > 1 && ISMULT(ender))
559 len--; /* Back off clear of ?+* operand. */
560 *flagp |= HASWIDTH;
561 if (len == 1)
562 *flagp |= SIMPLE;
563 ret = regnode(EXACTLY);
564 while (len > 0) {
565 regc(*regparse++);
566 len--;
567 }
568 regc('\0');
569 }
570 break;
571 }
572
573 return(ret);
574 }
575
576 /*
577 - regnode - emit a node
578 */
579 static char * /* Location. */
regnode(char op)580 regnode(char op)
581 {
582 register char *ret;
583 register char *ptr;
584
585 ret = regcode;
586 if (ret == ®dummy) {
587 regsize += 3;
588 return(ret);
589 }
590
591 ptr = ret;
592 *ptr++ = op;
593 *ptr++ = '\0'; /* Null "next" pointer. */
594 *ptr++ = '\0';
595 regcode = ptr;
596
597 return(ret);
598 }
599
600 /*
601 - regc - emit (if appropriate) a byte of code
602 */
603 static void
regc(char b)604 regc(char b)
605 {
606 if (regcode != ®dummy)
607 *regcode++ = b;
608 else
609 regsize++;
610 }
611
612 /*
613 - reginsert - insert an operator in front of already-emitted operand
614 *
615 * Means relocating the operand.
616 */
617 static void
reginsert(char op,char * opnd)618 reginsert(char op, char *opnd)
619 {
620 register constant char *src;
621 register char *dst;
622 register char *place;
623
624 if (regcode == ®dummy) {
625 regsize += 3;
626 return;
627 }
628
629 src = regcode;
630 regcode += 3;
631 dst = regcode;
632 while (src > opnd)
633 *--dst = *--src;
634
635 place = opnd; /* Op node, where operand used to be. */
636 *place++ = op;
637 *place++ = '\0';
638 *place++ = '\0';
639 }
640
641 /*
642 - regtail - set the next-pointer at the end of a node chain
643 */
644 static void
regtail(char * p,char * val)645 regtail(char *p, char *val)
646 {
647 register char *scan;
648 register char *temp;
649 register int offset;
650
651 if (p == ®dummy)
652 return;
653
654 /* Find last node. */
655 scan = p;
656 for (;;) {
657 temp = regnext(scan);
658 if (temp == NULL)
659 break;
660 scan = temp;
661 }
662
663 if (OP(scan) == BACK)
664 offset = (int) (scan - val);
665 else
666 offset = (int) (val - scan);
667 *(scan+1) = (offset>>8)&0377;
668 *(scan+2) = offset&0377;
669 }
670
671 /*
672 - regoptail - regtail on operand of first argument; nop if operandless
673 */
674 static void
regoptail(char * p,char * val)675 regoptail(char *p, char *val)
676 {
677 /* "Operandless" and "op != BRANCH" are synonymous in practice. */
678 if (p == NULL || p == ®dummy || OP(p) != BRANCH)
679 return;
680 regtail(OPERAND(p), val);
681 }
682
683 /*
684 * regexec and friends
685 */
686
687 /*
688 * Global work variables for regexec().
689 */
690 static constant char *reginput; /* String-input pointer. */
691 static constant char *regbol; /* Beginning of input, for ^ check. */
692 static constant char **regstartp; /* Pointer to startp array. */
693 static constant char **regendp; /* Ditto for endp. */
694
695 /*
696 * Forwards.
697 */
698 STATIC int regtry(regexp *, constant char *);
699 STATIC int regmatch(char *);
700 STATIC int regrepeat(char *);
701
702 #ifdef DEBUG
703 int regnarrate = 0;
704 void regdump();
705 STATIC char *regprop();
706 #endif
707
708 /*
709 - regexec - match a regexp against a string
710 */
711 int
regexec2(register regexp * prog,register constant char * string,int notbol)712 regexec2(register regexp *prog, register constant char *string, int notbol)
713 {
714 register constant char *s;
715
716 /* Be paranoid... */
717 if (prog == NULL || string == NULL) {
718 regerror("NULL parameter");
719 return(0);
720 }
721
722 /* Check validity of program. */
723 if (UCHARAT(prog->program) != MAGIC) {
724 regerror("corrupted program");
725 return(0);
726 }
727
728 /* If there is a "must appear" string, look for it. */
729 if (prog->regmust != NULL) {
730 s = string;
731 while ((s = strchr(s, prog->regmust[0])) != NULL) {
732 if (strncmp(s, prog->regmust, prog->regmlen) == 0)
733 break; /* Found it. */
734 s++;
735 }
736 if (s == NULL) /* Not present. */
737 return(0);
738 }
739
740 /* Mark beginning of line for ^ . */
741 if (notbol)
742 regbol = NULL;
743 else
744 regbol = string;
745
746 /* Simplest case: anchored match need be tried only once. */
747 if (prog->reganch)
748 return(regtry(prog, string));
749
750 /* Messy cases: unanchored match. */
751 s = string;
752 if (prog->regstart != '\0')
753 /* We know what char it must start with. */
754 while ((s = strchr(s, prog->regstart)) != NULL) {
755 if (regtry(prog, s))
756 return(1);
757 s++;
758 }
759 else
760 /* We don't -- general case. */
761 do {
762 if (regtry(prog, s))
763 return(1);
764 } while (*s++ != '\0');
765
766 /* Failure. */
767 return(0);
768 }
769
770 int
regexec(register regexp * prog,register constant char * string)771 regexec(register regexp *prog, register constant char *string)
772 {
773 return regexec2(prog, string, 0);
774 }
775
776 /*
777 - regtry - try match at specific point
778 */
779 static int /* 0 failure, 1 success */
regtry(regexp * prog,constant char * string)780 regtry(regexp *prog, constant char *string)
781 {
782 register int i;
783 register constant char **sp;
784 register constant char **ep;
785
786 reginput = string;
787 regstartp = prog->startp;
788 regendp = prog->endp;
789
790 sp = prog->startp;
791 ep = prog->endp;
792 for (i = NSUBEXP; i > 0; i--) {
793 *sp++ = NULL;
794 *ep++ = NULL;
795 }
796 if (regmatch(prog->program + 1)) {
797 prog->startp[0] = string;
798 prog->endp[0] = reginput;
799 return(1);
800 } else
801 return(0);
802 }
803
804 /*
805 - regmatch - main matching routine
806 *
807 * Conceptually the strategy is simple: check to see whether the current
808 * node matches, call self recursively to see whether the rest matches,
809 * and then act accordingly. In practice we make some effort to avoid
810 * recursion, in particular by going through "ordinary" nodes (that don't
811 * need to know whether the rest of the match failed) by a loop instead of
812 * by recursion.
813 */
814 static int /* 0 failure, 1 success */
regmatch(char * prog)815 regmatch(char *prog)
816 {
817 register char *scan; /* Current node. */
818 char *next; /* Next node. */
819
820 scan = prog;
821 #ifdef DEBUG
822 if (scan != NULL && regnarrate)
823 fprintf(stderr, "%s(\n", regprop(scan));
824 #endif
825 while (scan != NULL) {
826 #ifdef DEBUG
827 if (regnarrate)
828 fprintf(stderr, "%s...\n", regprop(scan));
829 #endif
830 next = regnext(scan);
831
832 switch (OP(scan)) {
833 case BOL:
834 if (reginput != regbol)
835 return(0);
836 break;
837 case EOL:
838 if (*reginput != '\0')
839 return(0);
840 break;
841 case ANY:
842 if (*reginput == '\0')
843 return(0);
844 reginput++;
845 break;
846 case EXACTLY: {
847 register int len;
848 register char *opnd;
849
850 opnd = OPERAND(scan);
851 /* Inline the first character, for speed. */
852 if (*opnd != *reginput)
853 return(0);
854 len = (int) strlen(opnd);
855 if (len > 1 && strncmp(opnd, reginput, len) != 0)
856 return(0);
857 reginput += len;
858 }
859 break;
860 case ANYOF:
861 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) == NULL)
862 return(0);
863 reginput++;
864 break;
865 case ANYBUT:
866 if (*reginput == '\0' || strchr(OPERAND(scan), *reginput) != NULL)
867 return(0);
868 reginput++;
869 break;
870 case NOTHING:
871 break;
872 case BACK:
873 break;
874 case OPEN+1:
875 case OPEN+2:
876 case OPEN+3:
877 case OPEN+4:
878 case OPEN+5:
879 case OPEN+6:
880 case OPEN+7:
881 case OPEN+8:
882 case OPEN+9: {
883 register int no;
884 register constant char *save;
885
886 no = OP(scan) - OPEN;
887 save = reginput;
888
889 if (regmatch(next)) {
890 /*
891 * Don't set startp if some later
892 * invocation of the same parentheses
893 * already has.
894 */
895 if (regstartp[no] == NULL)
896 regstartp[no] = save;
897 return(1);
898 } else
899 return(0);
900 }
901 /* NOTREACHED */
902 break;
903 case CLOSE+1:
904 case CLOSE+2:
905 case CLOSE+3:
906 case CLOSE+4:
907 case CLOSE+5:
908 case CLOSE+6:
909 case CLOSE+7:
910 case CLOSE+8:
911 case CLOSE+9: {
912 register int no;
913 register constant char *save;
914
915 no = OP(scan) - CLOSE;
916 save = reginput;
917
918 if (regmatch(next)) {
919 /*
920 * Don't set endp if some later
921 * invocation of the same parentheses
922 * already has.
923 */
924 if (regendp[no] == NULL)
925 regendp[no] = save;
926 return(1);
927 } else
928 return(0);
929 }
930 /* NOTREACHED */
931 break;
932 case BRANCH: {
933 register constant char *save;
934
935 if (OP(next) != BRANCH) /* No choice. */
936 next = OPERAND(scan); /* Avoid recursion. */
937 else {
938 do {
939 save = reginput;
940 if (regmatch(OPERAND(scan)))
941 return(1);
942 reginput = save;
943 scan = regnext(scan);
944 } while (scan != NULL && OP(scan) == BRANCH);
945 return(0);
946 /* NOTREACHED */
947 }
948 }
949 /* NOTREACHED */
950 break;
951 case STAR:
952 case PLUS: {
953 register char nextch;
954 register int no;
955 register constant char *save;
956 register int min;
957
958 /*
959 * Lookahead to avoid useless match attempts
960 * when we know what character comes next.
961 */
962 nextch = '\0';
963 if (OP(next) == EXACTLY)
964 nextch = *OPERAND(next);
965 min = (OP(scan) == STAR) ? 0 : 1;
966 save = reginput;
967 no = regrepeat(OPERAND(scan));
968 while (no >= min) {
969 /* If it could work, try it. */
970 if (nextch == '\0' || *reginput == nextch)
971 if (regmatch(next))
972 return(1);
973 /* Couldn't or didn't -- back up. */
974 no--;
975 reginput = save + no;
976 }
977 return(0);
978 }
979 /* NOTREACHED */
980 break;
981 case END:
982 return(1); /* Success! */
983 /* NOTREACHED */
984 break;
985 default:
986 regerror("memory corruption");
987 return(0);
988 /* NOTREACHED */
989 break;
990 }
991
992 scan = next;
993 }
994
995 /*
996 * We get here only if there's trouble -- normally "case END" is
997 * the terminating point.
998 */
999 regerror("corrupted pointers");
1000 return(0);
1001 }
1002
1003 /*
1004 - regrepeat - repeatedly match something simple, report how many
1005 */
1006 static int
regrepeat(char * p)1007 regrepeat(char *p)
1008 {
1009 register int count = 0;
1010 register constant char *scan;
1011 register char *opnd;
1012
1013 scan = reginput;
1014 opnd = OPERAND(p);
1015 switch (OP(p)) {
1016 case ANY:
1017 count = (int) strlen(scan);
1018 scan += count;
1019 break;
1020 case EXACTLY:
1021 while (*opnd == *scan) {
1022 count++;
1023 scan++;
1024 }
1025 break;
1026 case ANYOF:
1027 while (*scan != '\0' && strchr(opnd, *scan) != NULL) {
1028 count++;
1029 scan++;
1030 }
1031 break;
1032 case ANYBUT:
1033 while (*scan != '\0' && strchr(opnd, *scan) == NULL) {
1034 count++;
1035 scan++;
1036 }
1037 break;
1038 default: /* Oh dear. Called inappropriately. */
1039 regerror("internal foulup");
1040 count = 0; /* Best compromise. */
1041 break;
1042 }
1043 reginput = scan;
1044
1045 return(count);
1046 }
1047
1048 /*
1049 - regnext - dig the "next" pointer out of a node
1050 */
1051 static char *
regnext(register char * p)1052 regnext(register char *p)
1053 {
1054 register int offset;
1055
1056 if (p == ®dummy)
1057 return(NULL);
1058
1059 offset = NEXT(p);
1060 if (offset == 0)
1061 return(NULL);
1062
1063 if (OP(p) == BACK)
1064 return(p-offset);
1065 else
1066 return(p+offset);
1067 }
1068
1069 #ifdef DEBUG
1070
1071 STATIC char *regprop();
1072
1073 /*
1074 - regdump - dump a regexp onto stdout in vaguely comprehensible form
1075 */
1076 void
regdump(r)1077 regdump(r)
1078 regexp *r;
1079 {
1080 register char *s;
1081 register char op = EXACTLY; /* Arbitrary non-END op. */
1082 register char *next;
1083
1084
1085 s = r->program + 1;
1086 while (op != END) { /* While that wasn't END last time... */
1087 op = OP(s);
1088 printf("%2d%s", s-r->program, regprop(s)); /* Where, what. */
1089 next = regnext(s);
1090 if (next == NULL) /* Next ptr. */
1091 printf("(0)");
1092 else
1093 printf("(%d)", (s-r->program)+(next-s));
1094 s += 3;
1095 if (op == ANYOF || op == ANYBUT || op == EXACTLY) {
1096 /* Literal string, where present. */
1097 while (*s != '\0') {
1098 putchar(*s);
1099 s++;
1100 }
1101 s++;
1102 }
1103 putchar('\n');
1104 }
1105
1106 /* Header fields of interest. */
1107 if (r->regstart != '\0')
1108 printf("start `%c' ", r->regstart);
1109 if (r->reganch)
1110 printf("anchored ");
1111 if (r->regmust != NULL)
1112 printf("must have \"%s\"", r->regmust);
1113 printf("\n");
1114 }
1115
1116 /*
1117 - regprop - printable representation of opcode
1118 */
1119 static constant char *
regprop(op)1120 regprop(op)
1121 constant char *op;
1122 {
1123 register char *p;
1124 static char buf[50];
1125
1126 (void) strcpy(buf, ":");
1127
1128 switch (OP(op)) {
1129 case BOL:
1130 p = "BOL";
1131 break;
1132 case EOL:
1133 p = "EOL";
1134 break;
1135 case ANY:
1136 p = "ANY";
1137 break;
1138 case ANYOF:
1139 p = "ANYOF";
1140 break;
1141 case ANYBUT:
1142 p = "ANYBUT";
1143 break;
1144 case BRANCH:
1145 p = "BRANCH";
1146 break;
1147 case EXACTLY:
1148 p = "EXACTLY";
1149 break;
1150 case NOTHING:
1151 p = "NOTHING";
1152 break;
1153 case BACK:
1154 p = "BACK";
1155 break;
1156 case END:
1157 p = "END";
1158 break;
1159 case OPEN+1:
1160 case OPEN+2:
1161 case OPEN+3:
1162 case OPEN+4:
1163 case OPEN+5:
1164 case OPEN+6:
1165 case OPEN+7:
1166 case OPEN+8:
1167 case OPEN+9:
1168 sprintf(buf+strlen(buf), "OPEN%d", OP(op)-OPEN);
1169 p = NULL;
1170 break;
1171 case CLOSE+1:
1172 case CLOSE+2:
1173 case CLOSE+3:
1174 case CLOSE+4:
1175 case CLOSE+5:
1176 case CLOSE+6:
1177 case CLOSE+7:
1178 case CLOSE+8:
1179 case CLOSE+9:
1180 sprintf(buf+strlen(buf), "CLOSE%d", OP(op)-CLOSE);
1181 p = NULL;
1182 break;
1183 case STAR:
1184 p = "STAR";
1185 break;
1186 case PLUS:
1187 p = "PLUS";
1188 break;
1189 default:
1190 regerror("corrupted opcode");
1191 break;
1192 }
1193 if (p != NULL)
1194 (void) strcat(buf, p);
1195 return(buf);
1196 }
1197 #endif
1198
1199 /*
1200 * The following is provided for those people who do not have strcspn() in
1201 * their C libraries. They should get off their butts and do something
1202 * about it; at least one public-domain implementation of those (highly
1203 * useful) string routines has been published on Usenet.
1204 */
1205 #ifdef STRCSPN
1206 /*
1207 * strcspn - find length of initial segment of s1 consisting entirely
1208 * of characters not from s2
1209 */
1210
1211 static int
strcspn(constant char * s1,constant char * s2)1212 strcspn(constant char *s1, constant char *s2)
1213 {
1214 register char *scan1;
1215 register char *scan2;
1216 register int count;
1217
1218 count = 0;
1219 for (scan1 = s1; *scan1 != '\0'; scan1++) {
1220 for (scan2 = s2; *scan2 != '\0';) /* ++ moved down. */
1221 if (*scan1 == *scan2++)
1222 return(count);
1223 count++;
1224 }
1225 return(count);
1226 }
1227 #endif
1228