|
NCBI Home IEB Home C Toolkit docs C++ Toolkit source browser C Toolkit source browser (2) |
NCBI C Toolkit Cross ReferenceC/regexp/pcre.c |
source navigation diff markup identifier search freetext search file search |
1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4
5 /*
6 This is a library of functions to support regular expressions whose syntax
7 and semantics are as close as possible to those of the Perl 5 language. See
8 the file Tech.Notes for some information on the internals.
9
10 Written by: Philip Hazel <ph10@cam.ac.uk>
11
12 Copyright (c) 1997-2001 University of Cambridge
13
14 -----------------------------------------------------------------------------
15 Permission is granted to anyone to use this software for any purpose on any
16 computer system, and to redistribute it freely, subject to the following
17 restrictions:
18
19 1. This software is distributed in the hope that it will be useful,
20 but WITHOUT ANY WARRANTY; without even the implied warranty of
21 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
22
23 2. The origin of this software must not be misrepresented, either by
24 explicit claim or by omission.
25
26 3. Altered versions must be plainly marked as such, and must not be
27 misrepresented as being the original software.
28
29 4. If PCRE is embedded in any software that is released under the GNU
30 General Purpose Licence (GPL), then the terms of that licence shall
31 supersede any condition above with which it is incompatible.
32 -----------------------------------------------------------------------------
33 */
34
/* Use a macro for debugging printing. The macro pastes its argument directly
after "printf", so the argument must itself be a complete, parenthesized
printf() argument list, for example DPRINTF(("count=%d\n", count)). When
_DEBUG is not defined the macro expands to nothing, so the argument list is
discarded and never evaluated.

NOTE(review): this macro is keyed on _DEBUG, whereas the OP_names[] table and
the pchars() function in this file are guarded by DEBUG. Confirm that debug
builds define both symbols together. */

#if defined(_DEBUG)
# define DPRINTF(p) printf p
#else
# define DPRINTF(p) /*nothing*/
#endif
42
43 /* Include the internals header, which itself includes Standard C headers plus
44 the external pcre header. */
45
#include "pcre_internal.h"
#include <limits.h>
47
48 /* Allow compilation as C++ source code, should anybody want to do that. */
49
50 #ifdef __cplusplus
51 #define class pcre_class
52 #endif
53
54
55 /* Maximum number of items on the nested bracket stacks at compile time. This
56 applies to the nesting of all kinds of parentheses. It does not limit
57 un-nested, non-capturing parentheses. This number can be made bigger if
58 necessary - it is used to dimension one int and one unsigned char vector at
59 compile time. */
60
61 #define BRASTACK_SIZE 200
62
63
64 /* The number of bytes in a literal character string above which we can't add
65 any more is different when UTF-8 characters may be encountered. */
66
67 #ifdef SUPPORT_UTF8
68 #define MAXLIT 250
69 #else
70 #define MAXLIT 255
71 #endif
72
73
74 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
75
76 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
77 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
78
79 /* Text forms of OP_ values and things, for debugging (not all used) */
80
81 #ifdef DEBUG
82 static const char *OP_names[] = {
83 "End", "\\A", "\\B", "\\b", "\\D", "\\d",
84 "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
85 "Opt", "^", "$", "Any", "chars", "not",
86 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
87 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
88 "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
89 "*", "*?", "+", "+?", "?", "??", "{", "{",
90 "class", "Ref", "Recurse",
91 "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
92 "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
93 "Brazero", "Braminzero", "Branumber", "Bra"
94 };
95 #endif
96
97 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
98 are simple data values; negative values are for special things like \d and so
99 on. Zero means further processing is needed (for things like \x), or the escape
100 is invalid. */
101
102 static const short int escapes[] = {
103 0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
104 0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
105 '@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
106 0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
107 0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
108 0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
109 '`', 7, -ESC_b, 0, -ESC_d, ESC_E, ESC_F, 0, /* ` - g */
110 0, 0, 0, 0, 0, 0, ESC_N, 0, /* h - o */
111 0, 0, ESC_R, -ESC_s, ESC_T, 0, 0, -ESC_w, /* p - w */
112 0, 0, -ESC_z /* x - z */
113 };
114
115 /* Tables of names of POSIX character classes and their lengths. The list is
116 terminated by a zero length entry. The first three must be alpha, lower, upper,
117 as this is assumed for handling case independence. */
118
119 static const char *posix_names[] = {
120 "alpha", "lower", "upper",
121 "alnum", "ascii", "cntrl", "digit", "graph",
122 "print", "punct", "space", "word", "xdigit" };
123
124 static const uschar posix_name_lengths[] = {
125 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
126
127 /* Table of class bit maps for each POSIX class; up to three may be combined
128 to form the class. */
129
130 static const int posix_class_maps[] = {
131 cbit_lower, cbit_upper, -1, /* alpha */
132 cbit_lower, -1, -1, /* lower */
133 cbit_upper, -1, -1, /* upper */
134 cbit_digit, cbit_lower, cbit_upper, /* alnum */
135 cbit_print, cbit_cntrl, -1, /* ascii */
136 cbit_cntrl, -1, -1, /* cntrl */
137 cbit_digit, -1, -1, /* digit */
138 cbit_graph, -1, -1, /* graph */
139 cbit_print, -1, -1, /* print */
140 cbit_punct, -1, -1, /* punct */
141 cbit_space, -1, -1, /* space */
142 cbit_word, -1, -1, /* word */
143 cbit_xdigit,-1, -1 /* xdigit */
144 };
145
146
147 /* Definition to allow mutual recursion */
148
149 static BOOL
150 compile_regex(int, int, int *, uschar **, const uschar **, const char **,
151 BOOL, int, int *, int *, compile_data *);
152
153 /* Structure for building a chain of data that actually lives on the
154 stack, for holding the values of the subject pointer at the start of each
155 subpattern, so as to detect when an empty string has been matched by a
156 subpattern - to break infinite loops. */
157
158 typedef struct eptrblock {
159 struct eptrblock *prev;
160 const uschar *saved_eptr;
161 } eptrblock;
162
163 /* Flag bits for the match() function */
164
165 #define match_condassert 0x01 /* Called to check a condition assertion */
166 #define match_isgroup 0x02 /* Set if start of bracketed group */
167
168
169
170 /*************************************************
171 * Global variables *
172 *************************************************/
173
174 /* PCRE is thread-clean and doesn't use any global variables in the normal
175 sense. However, it calls memory allocation and free functions via the two
176 indirections below, which can be changed by the caller, but are shared
177 between all threads. */
178
179 void *(*pcre_malloc)(size_t) = malloc;
180 void (*pcre_free)(void *) = free;
181
182
183
184 /*************************************************
185 * Macros and tables for character handling *
186 *************************************************/
187
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Notes on using these macros:

  . The expansions are complete statements (they already end in ';' or '}'),
    and the arguments are not parenthesized and may be evaluated more than
    once, so pass plain variables only.

  . The UTF-8 versions of GETCHARINC and GETCHARLEN refer to a variable called
    "md" that is not a macro argument; it must be in scope where the macro is
    used and have a "utf8" member.

  . The macros themselves do no bounds checking on the bytes they read.

  . In byte-mode, GETCHARLEN leaves "len" unmodified and BACKCHAR expands to
    nothing. */

#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)

#else   /* SUPPORT_UTF8 */

/* Get the next UTF-8 character, advancing the pointer. If md->utf8 is unset,
or the first byte does not have its top two bits set, the byte itself is the
character. Otherwise utf8_table4 gives the number of additional bytes, and
each contributes its low 6 bits to the value. */

#define GETCHARINC(c, eptr) \
  c = *eptr++; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    while (a-- > 0) \
      { \
      s -= 6; \
      c |= (*eptr++ & 0x3f) << s; \
      } \
    }

/* Get the next UTF-8 character, not advancing the pointer, setting length.
"len" is set to 1 for a single-byte character, and to 1 plus the number of
additional bytes otherwise. */

#define GETCHARLEN(c, eptr, len) \
  c = *eptr; \
  len = 1; \
  if (md->utf8 && (c & 0xc0) == 0xc0) \
    { \
    int i; \
    int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
    int s = 6*a; \
    c = (c & utf8_table3[a]) << s; \
    for (i = 1; i <= a; i++) \
      { \
      s -= 6; \
      c |= (eptr[i] & 0x3f) << s; \
      } \
    len += a; \
    }

/* If the pointer is not at the start of a character, move it back until
it is. Bytes of the form 10xxxxxx are treated as continuation bytes. Note that
this version does not test md->utf8. */

#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;

#endif
240
241
242
243 /*************************************************
244 * Default character tables *
245 *************************************************/
246
247 /* A default set of character tables is included in the PCRE binary. Its source
248 is built by the maketables auxiliary program, which uses the default C ctypes
249 functions, and put in the file chartables.c. These tables are used by PCRE
250 whenever the caller of pcre_compile() does not provide an alternate set of
251 tables. */
252
253 #include "chartables.c"
254
255
256
257 #ifdef SUPPORT_UTF8
258 /*************************************************
259 * Tables for UTF-8 support *
260 *************************************************/
261
262 /* These are the breakpoints for different numbers of bytes in a UTF-8
263 character. */
264
265 static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
266
267 /* These are the indicator bits and the mask for the data bits to set in the
268 first byte of a character, indexed by the number of additional bytes. */
269
270 static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
271 static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
272
273 /* Table of the number of extra bytes, indexed by the first byte of the
274 character masked with 0x3f. The highest number for a valid UTF-8 character
275 is in fact 0x3d. */
276
277 static uschar utf8_table4[] = {
278 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
279 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
280 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
281 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
282
283
284 /*************************************************
285 * Convert character value to UTF-8 *
286 *************************************************/
287
288 /* This function takes an integer value in the range 0 - 0x7fffffff
289 and encodes it as a UTF-8 character in 0 to 6 bytes.
290
291 Arguments:
292 cvalue the character value
293 buffer pointer to buffer for result - at least 6 bytes long
294
295 Returns: number of characters placed in the buffer
296 */
297
298 static int
299 ord2utf8(int cvalue, uschar *buffer)
300 {
301 register int i, j;
302 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
303 if (cvalue <= utf8_table1[i]) break;
304 buffer += i;
305 for (j = i; j > 0; j--)
306 {
307 *buffer-- = 0x80 | (cvalue & 0x3f);
308 cvalue >>= 6;
309 }
310 *buffer = utf8_table2[i] | cvalue;
311 return i + 1;
312 }
313 #endif
314
315
316
317 /*************************************************
318 * Return version string *
319 *************************************************/
320
/* Two levels of macro are needed so that the argument is macro-expanded
before it is turned into a string. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Return the version of the library as a string of the form
"major.minor date", built at compile time from PCRE_MAJOR, PCRE_MINOR and
PCRE_DATE. The string has static storage and must not be freed. */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
329
330
331
332
333 /*************************************************
334 * (Obsolete) Return info about compiled pattern *
335 *************************************************/
336
337 /* This is the original "info" function. It picks potentially useful data out
338 of the private structure, but its interface was too rigid. It remains for
339 backwards compatibility. The public options are passed back in an int - though
340 the re->options field has been expanded to a long int, all the public options
341 at the low end of it, and so even on 16-bit systems this will still be OK.
342 Therefore, I haven't changed the API for pcre_info().
343
344 Arguments:
345 external_re points to compiled code
346 optptr where to pass back the options
347 first_char where to pass back the first character,
348 or -1 if multiline and all branches start ^,
349 or -2 otherwise
350
351 Returns: number of capturing subpatterns
352 or negative values on error
353 */
354
355 int
356 pcre_info(const pcre *external_re, int *optptr, int *first_char)
357 {
358 const real_pcre *re = (const real_pcre *)external_re;
359 if (re == NULL) return PCRE_ERROR_NULL;
360 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
361 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
362 if (first_char != NULL)
363 *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
364 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
365 return re->top_bracket;
366 }
367
368
369
370 /*************************************************
371 * Return info about compiled pattern *
372 *************************************************/
373
374 /* This is a newer "info" function which has an extensible interface so
375 that additional items can be added compatibly.
376
377 Arguments:
378 external_re points to compiled code
379 external_study points to study data, or NULL
380 what what information is required
381 where where to put the information
382
383 Returns: 0 if data returned, negative on error
384 */
385
386 int
387 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
388 void *where)
389 {
390 const real_pcre *re = (const real_pcre *)external_re;
391 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
392
393 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
394 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
395
396 switch (what)
397 {
398 case PCRE_INFO_OPTIONS:
399 *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
400 break;
401
402 case PCRE_INFO_SIZE:
403 *((size_t *)where) = re->size;
404 break;
405
406 case PCRE_INFO_CAPTURECOUNT:
407 *((int *)where) = re->top_bracket;
408 break;
409
410 case PCRE_INFO_BACKREFMAX:
411 *((int *)where) = re->top_backref;
412 break;
413
414 case PCRE_INFO_FIRSTCHAR:
415 *((int *)where) =
416 ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
417 ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
418 break;
419
420 case PCRE_INFO_FIRSTTABLE:
421 *((const uschar **)where) =
422 (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
423 study->start_bits : NULL;
424 break;
425
426 case PCRE_INFO_LASTLITERAL:
427 *((int *)where) =
428 ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
429 break;
430
431 default: return PCRE_ERROR_BADOPTION;
432 }
433
434 return 0;
435 }
436
437
438
#ifdef DEBUG
/*************************************************
*        Debugging function to print chars       *
*************************************************/

/* Print a sequence of chars in printable format. Printable characters are
output as themselves; anything else is output as \x followed by two hex
digits. If is_subject is TRUE, the output stops at the end of the subject even
if more characters were requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE
              (not referenced otherwise)

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
/* Never run off the end of the subject string */

if (is_subject && length > md->end_subject - p) length = md->end_subject - p;

for (; length > 0; length--)
  {
  int ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
465
466
467
468
469 /*************************************************
470 * Handle escapes *
471 *************************************************/
472
473 /* This function is called when a \ has been encountered. It either returns a
474 positive value for a simple escape such as \n, or a negative value which
475 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
476 a positive value greater than 255 may be returned. On entry, ptr is pointing at
477 the \. On exit, it is on the final character of the escape sequence.
478
479 Arguments:
480 ptrptr points to the pattern position pointer
481 errorptr points to the pointer to the error message
482 bracount number of previous extracting brackets
483 options the options bits
484 isclass TRUE if inside a character class
485 cd pointer to char tables block
486
487 Returns: zero or positive => a data character
488 negative => a special escape sequence
489 on error, errorptr is set
490 */
491
492 static int
493 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
494 int options, BOOL isclass, compile_data *cd)
495 {
496 const uschar *ptr = *ptrptr;
497 int c, i;
498
499 /* If backslash is at the end of the pattern, it's an error. */
500
501 c = *(++ptr);
502 if (c == 0) *errorptr = ERR1;
503
504 /* Digits or letters may have special meaning; all others are literals. */
505
506 else if (c < '0' || c > 'z') {}
507
508 /* Do an initial lookup in a table. A non-zero result is something that can be
509 returned immediately. Otherwise further processing may be required. */
510
511 else if ((i = escapes[c - '0']) != 0) c = i;
512
513 /* Escapes that need further processing, or are illegal. */
514
515 else
516 {
517 const uschar *oldptr;
518 switch (c)
519 {
520 /* The handling of escape sequences consisting of a string of digits
521 starting with one that is not zero is not straightforward. By experiment,
522 the way Perl works seems to be as follows:
523
524 Outside a character class, the digits are read as a decimal number. If the
525 number is less than 10, or if there are that many previous extracting
526 left brackets, then it is a back reference. Otherwise, up to three octal
527 digits are read to form an escaped byte. Thus \123 is likely to be octal
528 123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
529 value is greater than 377, the least significant 8 bits are taken. Inside a
530 character class, \ followed by a digit is always an octal number. */
531
532 case '1': case '2': case '3': case '4': case '5':
533 case '6': case '7': case '8': case '9':
534
535 if (!isclass)
536 {
537 oldptr = ptr;
538 c -= '0';
539 while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
540 c = c * 10 + *(++ptr) - '0';
541 if (c < 10 || c <= bracount)
542 {
543 c = -(ESC_REF + c);
544 break;
545 }
546 ptr = oldptr; /* Put the pointer back and fall through */
547 }
548
549 /* Handle an octal number following \. If the first digit is 8 or 9, Perl
550 generates a binary zero byte and treats the digit as a following literal.
551 Thus we have to pull back the pointer by one. */
552
553 if ((c = *ptr) >= '8')
554 {
555 ptr--;
556 c = 0;
557 break;
558 }
559
560 /* \0 always starts an octal number, but we may drop through to here with a
561 larger first octal digit. */
562
563 case '0':
564 c -= '0';
565 while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
566 ptr[1] != '8' && ptr[1] != '9')
567 c = c * 8 + *(++ptr) - '0';
568 c &= 255; /* Take least significant 8 bits */
569 break;
570
571 /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
572 which can be greater than 0xff, but only if the ddd are hex digits. */
573
574 case 'x':
575 #ifdef SUPPORT_UTF8
576 if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
577 {
578 const uschar *pt = ptr + 2;
579 register int count = 0;
580 c = 0;
581 while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
582 {
583 count++;
584 c = c * 16 + cd->lcc[*pt] -
585 (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
586 pt++;
587 }
588 if (*pt == '}')
589 {
590 if (c < 0 || count > 8) *errorptr = ERR34;
591 ptr = pt;
592 break;
593 }
594 /* If the sequence of hex digits does not end with '}', then we don't
595 recognize this construct; fall through to the normal \x handling. */
596 }
597 #endif
598
599 /* Read just a single hex char */
600
601 c = 0;
602 while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
603 {
604 ptr++;
605 c = c * 16 + cd->lcc[*ptr] -
606 (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
607 }
608 break;
609
610 /* Other special escapes not starting with a digit are straightforward */
611
612 case 'c':
613 c = *(++ptr);
614 if (c == 0)
615 {
616 *errorptr = ERR2;
617 return 0;
618 }
619
620 /* A letter is upper-cased; then the 0x40 bit is flipped */
621
622 if (c >= 'a' && c <= 'z') c = cd->fcc[c];
623 c ^= 0x40;
624 break;
625
626 /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
627 other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
628 for Perl compatibility, it is a literal. This code looks a bit odd, but
629 there used to be some cases other than the default, and there may be again
630 in future, so I haven't "optimized" it. */
631
632 default:
633 if ((options & PCRE_EXTRA) != 0) switch(c)
634 {
635 default:
636 *errorptr = ERR3;
637 break;
638 }
639 break;
640 }
641 }
642
643 *ptrptr = ptr;
644 return c;
645 }
646
647
648
649 /*************************************************
650 * Check for counted repeat *
651 *************************************************/
652
653 /* This function is called when a '{' is encountered in a place where it might
654 start a quantifier. It looks ahead to see if it really is a quantifier or not.
655 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
656 where the ddds are digits.
657
658 Arguments:
659 p pointer to the first char after '{'
660 cd pointer to char tables block
661
662 Returns: TRUE or FALSE
663 */
664
665 static BOOL
666 is_counted_repeat(const uschar *p, compile_data *cd)
667 {
668 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
669 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
670 if (*p == '}') return TRUE;
671
672 if (*p++ != ',') return FALSE;
673 if (*p == '}') return TRUE;
674
675 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
676 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
677 return (*p == '}');
678 }
679
680
681
682 /*************************************************
683 * Read repeat counts *
684 *************************************************/
685
686 /* Read an item of the form {n,m} and return the values. This is called only
687 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
688 so the syntax is guaranteed to be correct, but we need to check the values.
689
690 Arguments:
691 p pointer to first char after '{'
692 minp pointer to int for min
693 maxp pointer to int for max
694 returned as -1 if no max
695 errorptr points to pointer to error message
696 cd pointer to character tables clock
697
698 Returns: pointer to '}' on success;
699 current ptr on error, with errorptr set
700 */
701
702 static const uschar *
703 read_repeat_counts(const uschar *p, int *minp, int *maxp,
704 const char **errorptr, compile_data *cd)
705 {
706 int min = 0;
707 int max = -1;
708
709 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
710
711 if (*p == '}') max = min; else
712 {
713 if (*(++p) != '}')
714 {
715 max = 0;
716 while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
717 if (max < min)
718 {
719 *errorptr = ERR4;
720 return p;
721 }
722 }
723 }
724
725 /* Do paranoid checks, then fill in the required variables, and pass back the
726 pointer to the terminating '}'. */
727
728 if (min > 65535 || max > 65535)
729 *errorptr = ERR5;
730 else
731 {
732 *minp = min;
733 *maxp = max;
734 }
735 return p;
736 }
737
738
739
740 /*************************************************
741 * Find the fixed length of a pattern *
742 *************************************************/
743
744 /* Scan a pattern and compute the fixed length of subject that will match it,
745 if the length is fixed. This is needed for dealing with backward assertions.
746
747 Arguments:
748 code points to the start of the pattern (the bracket)
749 options the compiling options
750
751 Returns: the fixed length, or -1 if there is no fixed length
752 */
753
754 static int
755 find_fixedlength(uschar *code, int options)
756 {
757 int length = -1;
758
759 register int branchlength = 0;
760 register uschar *cc = code + 3;
761
762 /* Scan along the opcodes for this branch. If we get to the end of the
763 branch, check the length against that of the other branches. */
764
765 for (;;)
766 {
767 int d;
768 register int op = *cc;
769 if (op >= OP_BRA) op = OP_BRA;
770
771 switch (op)
772 {
773 case OP_BRA:
774 case OP_ONCE:
775 case OP_COND:
776 d = find_fixedlength(cc, options);
777 if (d < 0) return -1;
778 branchlength += d;
779 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
780 cc += 3;
781 break;
782
783 /* Reached end of a branch; if it's a ket it is the end of a nested
784 call. If it's ALT it is an alternation in a nested call. If it is
785 END it's the end of the outer call. All can be handled by the same code. */
786
787 case OP_ALT:
788 case OP_KET:
789 case OP_KETRMAX:
790 case OP_KETRMIN:
791 case OP_END:
792 if (length < 0) length = branchlength;
793 else if (length != branchlength) return -1;
794 if (*cc != OP_ALT) return length;
795 cc += 3;
796 branchlength = 0;
797 break;
798
799 /* Skip over assertive subpatterns */
800
801 case OP_ASSERT:
802 case OP_ASSERT_NOT:
803 case OP_ASSERTBACK:
804 case OP_ASSERTBACK_NOT:
805 do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
806 cc += 3;
807 break;
808
809 /* Skip over things that don't match chars */
810
811 case OP_REVERSE:
812 case OP_BRANUMBER:
813 case OP_CREF:
814 cc++;
815 /* Fall through */
816
817 case OP_OPT:
818 cc++;
819 /* Fall through */
820
821 case OP_SOD:
822 case OP_EOD:
823 case OP_EODN:
824 case OP_CIRC:
825 case OP_DOLL:
826 case OP_NOT_WORD_BOUNDARY:
827 case OP_WORD_BOUNDARY:
828 cc++;
829 break;
830
831 /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
832 This requires a scan of the string, unfortunately. We assume valid UTF-8
833 strings, so all we do is reduce the length by one for byte whose bits are
834 10xxxxxx. */
835
836 case OP_CHARS:
837 branchlength += *(++cc);
838 #ifdef SUPPORT_UTF8
839 for (d = 1; d <= *cc; d++)
840 if ((cc[d] & 0xc0) == 0x80) branchlength--;
841 #endif
842 cc += *cc + 1;
843 break;
844
845 /* Handle exact repetitions */
846
847 case OP_EXACT:
848 case OP_TYPEEXACT:
849 branchlength += (cc[1] << 8) + cc[2];
850 cc += 4;
851 break;
852
853 /* Handle single-char matchers */
854
855 case OP_NOT_DIGIT:
856 case OP_DIGIT:
857 case OP_NOT_WHITESPACE:
858 case OP_WHITESPACE:
859 case OP_NOT_WORDCHAR:
860 case OP_WORDCHAR:
861 case OP_ANY:
862 branchlength++;
863 cc++;
864 break;
865
866
867 /* Check a class for variable quantification */
868
869 case OP_CLASS:
870 cc += 33;
871
872 switch (*cc)
873 {
874 case OP_CRSTAR:
875 case OP_CRMINSTAR:
876 case OP_CRQUERY:
877 case OP_CRMINQUERY:
878 return -1;
879
880 case OP_CRRANGE:
881 case OP_CRMINRANGE:
882 if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
883 branchlength += (cc[1] << 8) + cc[2];
884 cc += 5;
885 break;
886
887 default:
888 branchlength++;
889 }
890 break;
891
892 /* Anything else is variable length */
893
894 default:
895 return -1;
896 }
897 }
898 /* Control never gets here */
899 }
900
901
902
903
904 /*************************************************
905 * Check for POSIX class syntax *
906 *************************************************/
907
908 /* This function is called when the sequence "[:" or "[." or "[=" is
909 encountered in a character class. It checks whether this is followed by an
910 optional ^ and then a sequence of letters, terminated by a matching ":]" or
911 ".]" or "=]".
912
913 Argument:
914 ptr pointer to the initial [
915 endptr where to return the end pointer
916 cd pointer to compile data
917
918 Returns: TRUE or FALSE
919 */
920
921 static BOOL
922 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
923 {
924 int terminator; /* Don't combine these lines; the Solaris cc */
925 terminator = *(++ptr); /* compiler warns about "non-constant" initializer. */
926 if (*(++ptr) == '^') ptr++;
927 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
928 if (*ptr == terminator && ptr[1] == ']')
929 {
930 *endptr = ptr;
931 return TRUE;
932 }
933 return FALSE;
934 }
935
936
937
938
939 /*************************************************
940 * Check POSIX class name *
941 *************************************************/
942
943 /* This function is called to check the name given in a POSIX-style class entry
944 such as [:alnum:].
945
946 Arguments:
947 ptr points to the first letter
948 len the length of the name
949
950 Returns: a value representing the name, or -1 if unknown
951 */
952
953 static int
954 check_posix_name(const uschar *ptr, int len)
955 {
956 register int yield = 0;
957 while (posix_name_lengths[yield] != 0)
958 {
959 if (len == posix_name_lengths[yield] &&
960 strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
961 yield++;
962 }
963 return -1;
964 }
965
966
967
968
969 /*************************************************
970 * Compile one branch *
971 *************************************************/
972
973 /* Scan the pattern, compiling it into the code vector.
974
975 Arguments:
976 options the option bits
977 brackets points to number of extracting brackets used
978 code points to the pointer to the current code point
979 ptrptr points to the current pattern pointer
980 errorptr points to pointer to error message
981 optchanged set to the value of the last OP_OPT item compiled
982 reqchar set to the last literal character required, else -1
983 countlits set to count of mandatory literal characters
984 cd contains pointers to tables
985
986 Returns: TRUE on success
987 FALSE, with *errorptr set on error
988 */
989
990 static BOOL
991 compile_branch(int options, int *brackets, uschar **codeptr,
992 const uschar **ptrptr, const char **errorptr, int *optchanged,
993 int *reqchar, int *countlits, compile_data *cd)
994 {
995 int repeat_type, op_type;
996 int repeat_min, repeat_max;
997 int bravalue, length;
998 int greedy_default, greedy_non_default;
999 int prevreqchar;
1000 int condcount = 0;
1001 int subcountlits = 0;
1002 register int c;
1003 register uschar *code = *codeptr;
1004 uschar *tempcode;
1005 const uschar *ptr = *ptrptr;
1006 const uschar *tempptr;
1007 uschar *previous = NULL;
1008 uschar class[32];
1009
1010 /* Set up the default and non-default settings for greediness */
1011
1012 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1013 greedy_non_default = greedy_default ^ 1;
1014
1015 /* Initialize no required char, and count of literals */
1016
1017 *reqchar = prevreqchar = -1;
1018 *countlits = 0;
1019
1020 /* Switch on next character until the end of the branch */
1021
1022 for (;; ptr++)
1023 {
1024 BOOL negate_class;
1025 int class_charcount;
1026 int class_lastchar;
1027 int newoptions;
1028 int skipbytes;
1029 int subreqchar;
1030
1031 c = *ptr;
1032 if ((options & PCRE_EXTENDED) != 0)
1033 {
1034 if ((cd->ctypes[c] & ctype_space) != 0) continue;
1035 if (c == '#')
1036 {
1037 /* The space before the ; is to avoid a warning on a silly compiler
1038 on the Macintosh. */
1039 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1040 continue;
1041 }
1042 }
1043
1044 switch(c)
1045 {
1046 /* The branch terminates at end of string, |, or ). */
1047
1048 case 0:
1049 case '|':
1050 case ')':
1051 *codeptr = code;
1052 *ptrptr = ptr;
1053 return TRUE;
1054
1055 /* Handle single-character metacharacters */
1056
1057 case '^':
1058 previous = NULL;
1059 *code++ = OP_CIRC;
1060 break;
1061
1062 case '$':
1063 previous = NULL;
1064 *code++ = OP_DOLL;
1065 break;
1066
1067 case '.':
1068 previous = code;
1069 *code++ = OP_ANY;
1070 break;
1071
1072 /* Character classes. These always build a 32-byte bitmap of the permitted
1073 characters, except in the special case where there is only one character.
1074 For negated classes, we build the map as usual, then invert it at the end.
1075 */
1076
1077 case '[':
1078 previous = code;
1079 *code++ = OP_CLASS;
1080
1081 /* If the first character is '^', set the negation flag and skip it. */
1082
1083 if ((c = *(++ptr)) == '^')
1084 {
1085 negate_class = TRUE;
1086 c = *(++ptr);
1087 }
1088 else negate_class = FALSE;
1089
1090 /* Keep a count of chars so that we can optimize the case of just a single
1091 character. */
1092
1093 class_charcount = 0;
1094 class_lastchar = -1;
1095
1096 /* Initialize the 32-char bit map to all zeros. We have to build the
1097 map in a temporary bit of store, in case the class contains only 1
1098 character, because in that case the compiled code doesn't use the
1099 bit map. */
1100
1101 memset(class, 0, 32 * sizeof(uschar));
1102
1103 /* Process characters until ] is reached. By writing this as a "do" it
1104 means that an initial ] is taken as a data character. */
1105
1106 do
1107 {
1108 if (c == 0)
1109 {
1110 *errorptr = ERR6;
1111 goto FAILED;
1112 }
1113
1114 /* Handle POSIX class names. Perl allows a negation extension of the
1115 form [:^name]. A square bracket that doesn't match the syntax is
1116 treated as a literal. We also recognize the POSIX constructions
1117 [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1118 5.6 does. */
1119
1120 if (c == '[' &&
1121 (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1122 check_posix_syntax(ptr, &tempptr, cd))
1123 {
1124 BOOL local_negate = FALSE;
1125 int posix_class, i;
1126 register const uschar *cbits = cd->cbits;
1127
1128 if (ptr[1] != ':')
1129 {
1130 *errorptr = ERR31;
1131 goto FAILED;
1132 }
1133
1134 ptr += 2;
1135 if (*ptr == '^')
1136 {
1137 local_negate = TRUE;
1138 ptr++;
1139 }
1140
1141 posix_class = check_posix_name(ptr, tempptr - ptr);
1142 if (posix_class < 0)
1143 {
1144 *errorptr = ERR30;
1145 goto FAILED;
1146 }
1147
1148 /* If matching is caseless, upper and lower are converted to
1149 alpha. This relies on the fact that the class table starts with
1150 alpha, lower, upper as the first 3 entries. */
1151
1152 if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1153 posix_class = 0;
1154
1155 /* Or into the map we are building up to 3 of the static class
1156 tables, or their negations. */
1157
1158 posix_class *= 3;
1159 for (i = 0; i < 3; i++)
1160 {
1161 int taboffset = posix_class_maps[posix_class + i];
1162 if (taboffset < 0) break;
1163 if (local_negate)
1164 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1165 else
1166 for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1167 }
1168
1169 ptr = tempptr + 1;
1170 class_charcount = 10; /* Set > 1; assumes more than 1 per class */
1171 continue;
1172 }
1173
1174 /* Backslash may introduce a single character, or it may introduce one
1175 of the specials, which just set a flag. Escaped items are checked for
1176 validity in the pre-compiling pass. The sequence \b is a special case.
1177 Inside a class (and only there) it is treated as backspace. Elsewhere
1178 it marks a word boundary. Other escapes have preset maps ready to
1179 or into the one we are building. We assume they have more than one
1180 character in them, so set class_charcount bigger than one. */
1181
1182 if (c == '\\')
1183 {
1184 c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1185 if (-c == ESC_b) c = '\b';
1186 else if (c < 0)
1187 {
1188 register const uschar *cbits = cd->cbits;
1189 class_charcount = 10;
1190 switch (-c)
1191 {
1192 case ESC_d:
1193 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1194 continue;
1195
1196 case ESC_D:
1197 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1198 continue;
1199
1200 case ESC_w:
1201 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1202 continue;
1203
1204 case ESC_W:
1205 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1206 continue;
1207
1208 case ESC_s:
1209 for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1210 continue;
1211
1212 case ESC_S:
1213 for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1214 continue;
1215
1216 default:
1217 *errorptr = ERR7;
1218 goto FAILED;
1219 }
1220 }
1221
1222 /* Fall through if single character, but don't at present allow
1223 chars > 255 in UTF-8 mode. */
1224
1225 #ifdef SUPPORT_UTF8
1226 if (c > 255)
1227 {
1228 *errorptr = ERR33;
1229 goto FAILED;
1230 }
1231 #endif
1232 }
1233
1234 /* A single character may be followed by '-' to form a range. However,
1235 Perl does not permit ']' to be the end of the range. A '-' character
1236 here is treated as a literal. */
1237
1238 if (ptr[1] == '-' && ptr[2] != ']')
1239 {
1240 int d;
1241 ptr += 2;
1242 d = *ptr;
1243
1244 if (d == 0)
1245 {
1246 *errorptr = ERR6;
1247 goto FAILED;
1248 }
1249
1250 /* The second part of a range can be a single-character escape, but
1251 not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1252 in such circumstances. */
1253
1254 if (d == '\\')
1255 {
1256 const uschar *oldptr = ptr;
1257 d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1258
1259 #ifdef SUPPORT_UTF8
1260 if (d > 255)
1261 {
1262 *errorptr = ERR33;
1263 goto FAILED;
1264 }
1265 #endif
1266 /* \b is backspace; any other special means the '-' was literal */
1267
1268 if (d < 0)
1269 {
1270 if (d == -ESC_b) d = '\b'; else
1271 {
1272 ptr = oldptr - 2;
1273 goto SINGLE_CHARACTER; /* A few lines below */
1274 }
1275 }
1276 }
1277
1278 if (d < c)
1279 {
1280 *errorptr = ERR8;
1281 goto FAILED;
1282 }
1283
1284 for (; c <= d; c++)
1285 {
1286 class[c/8] |= (1 << (c&7));
1287 if ((options & PCRE_CASELESS) != 0)
1288 {
1289 int uc = cd->fcc[c]; /* flip case */
1290 class[uc/8] |= (1 << (uc&7));
1291 }
1292 class_charcount++; /* in case a one-char range */
1293 class_lastchar = c;
1294 }
1295 continue; /* Go get the next char in the class */
1296 }
1297
1298 /* Handle a lone single character - we can get here for a normal
1299 non-escape char, or after \ that introduces a single character. */
1300
1301 SINGLE_CHARACTER:
1302
1303 class [c/8] |= (1 << (c&7));
1304 if ((options & PCRE_CASELESS) != 0)
1305 {
1306 c = cd->fcc[c]; /* flip case */
1307 class[c/8] |= (1 << (c&7));
1308 }
1309 class_charcount++;
1310 class_lastchar = c;
1311 }
1312
1313 /* Loop until ']' reached; the check for end of string happens inside the
1314 loop. This "while" is the end of the "do" above. */
1315
1316 while ((c = *(++ptr)) != ']');
1317
1318 /* If class_charcount is 1 and class_lastchar is not negative, we saw
1319 precisely one character. This doesn't need the whole 32-byte bit map.
1320 We turn it into a 1-character OP_CHARS if it's positive, or OP_NOT if
1321 it's negative. */
1322
1323 if (class_charcount == 1 && class_lastchar >= 0)
1324 {
1325 if (negate_class)
1326 {
1327 code[-1] = OP_NOT;
1328 }
1329 else
1330 {
1331 code[-1] = OP_CHARS;
1332 *code++ = 1;
1333 }
1334 *code++ = class_lastchar;
1335 }
1336
1337 /* Otherwise, negate the 32-byte map if necessary, and copy it into
1338 the code vector. */
1339
1340 else
1341 {
1342 if (negate_class)
1343 for (c = 0; c < 32; c++) code[c] = ~class[c];
1344 else
1345 memcpy(code, class, 32);
1346 code += 32;
1347 }
1348 break;
1349
1350 /* Various kinds of repeat */
1351
1352 case '{':
1353 if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1354 ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1355 if (*errorptr != NULL) goto FAILED;
1356 goto REPEAT;
1357
1358 case '*':
1359 repeat_min = 0;
1360 repeat_max = -1;
1361 goto REPEAT;
1362
1363 case '+':
1364 repeat_min = 1;
1365 repeat_max = -1;
1366 goto REPEAT;
1367
1368 case '?':
1369 repeat_min = 0;
1370 repeat_max = 1;
1371
1372 REPEAT:
1373 if (previous == NULL)
1374 {
1375 *errorptr = ERR9;
1376 goto FAILED;
1377 }
1378
1379 /* If the next character is '?' this is a minimizing repeat, by default,
1380 but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1381 next character. */
1382
1383 if (ptr[1] == '?')
1384 { repeat_type = greedy_non_default; ptr++; }
1385 else repeat_type = greedy_default;
1386
1387 /* If previous was a string of characters, chop off the last one and use it
1388 as the subject of the repeat. If there was only one character, we can
1389 abolish the previous item altogether. A repeat with a zero minimum wipes
1390 out any reqchar setting, backing up to the previous value. We must also
1391 adjust the countlits value. */
1392
1393 if (*previous == OP_CHARS)
1394 {
1395 int len = previous[1];
1396
1397 if (repeat_min == 0) *reqchar = prevreqchar;
1398 *countlits += repeat_min - 1;
1399
1400 if (len == 1)
1401 {
1402 c = previous[2];
1403 code = previous;
1404 }
1405 else
1406 {
1407 c = previous[len+1];
1408 previous[1]--;
1409 code--;
1410 }
1411 op_type = 0; /* Use single-char op codes */
1412 goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
1413 }
1414
1415 /* If previous was a single negated character ([^a] or similar), we use
1416 one of the special opcodes, replacing it. The code is shared with single-
1417 character repeats by adding a suitable offset into repeat_type. */
1418
1419 else if ((int)*previous == OP_NOT)
1420 {
1421 op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
1422 c = previous[1];
1423 code = previous;
1424 goto OUTPUT_SINGLE_REPEAT;
1425 }
1426
1427 /* If previous was a character type match (\d or similar), abolish it and
1428 create a suitable repeat item. The code is shared with single-character
1429 repeats by adding a suitable offset into repeat_type. */
1430
1431 else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1432 {
1433 op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
1434 c = *previous;
1435 code = previous;
1436
1437 OUTPUT_SINGLE_REPEAT:
1438
1439 /* If the maximum is zero then the minimum must also be zero; Perl allows
1440 this case, so we do too - by simply omitting the item altogether. */
1441
1442 if (repeat_max == 0) goto END_REPEAT;
1443
1444 /* Combine the op_type with the repeat_type */
1445
1446 repeat_type += op_type;
1447
1448 /* A minimum of zero is handled either as the special case * or ?, or as
1449 an UPTO, with the maximum given. */
1450
1451 if (repeat_min == 0)
1452 {
1453 if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1454 else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1455 else
1456 {
1457 *code++ = OP_UPTO + repeat_type;
1458 *code++ = repeat_max >> 8;
1459 *code++ = (repeat_max & 255);
1460 }
1461 }
1462
1463 /* The case {1,} is handled as the special case + */
1464
1465 else if (repeat_min == 1 && repeat_max == -1)
1466 *code++ = OP_PLUS + repeat_type;
1467
1468 /* The case {n,n} is just an EXACT, while the general case {n,m} is
1469 handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1470
1471 else
1472 {
1473 if (repeat_min != 1)
1474 {
1475 *code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
1476 *code++ = repeat_min >> 8;
1477 *code++ = (repeat_min & 255);
1478 }
1479
1480 /* If the minimum is 1 and the previous item was a character string,
1481 we either have to put back the item that got cancelled if the string
1482 length was 1, or add the character back onto the end of a longer
1483 string. For a character type nothing need be done; it will just get
1484 put back naturally. Note that the final character is always going to
1485 get added below. */
1486
1487 else if (*previous == OP_CHARS)
1488 {
1489 if (code == previous) code += 2; else previous[1]++;
1490 }
1491
1492 /* For a single negated character we also have to put back the
1493 item that got cancelled. */
1494
1495 else if (*previous == OP_NOT) code++;
1496
1497 /* If the maximum is unlimited, insert an OP_STAR. */
1498
1499 if (repeat_max < 0)
1500 {
1501 *code++ = c;
1502 *code++ = OP_STAR + repeat_type;
1503 }
1504
1505 /* Else insert an UPTO if the max is greater than the min. */
1506
1507 else if (repeat_max != repeat_min)
1508 {
1509 *code++ = c;
1510 repeat_max -= repeat_min;
1511 *code++ = OP_UPTO + repeat_type;
1512 *code++ = repeat_max >> 8;
1513 *code++ = (repeat_max & 255);
1514 }
1515 }
1516
1517 /* The character or character type itself comes last in all cases. */
1518
1519 *code++ = c;
1520 }
1521
1522 /* If previous was a character class or a back reference, we put the repeat
1523 stuff after it, but just skip the item if the repeat was {0,0}. */
1524
1525 else if (*previous == OP_CLASS || *previous == OP_REF)
1526 {
1527 if (repeat_max == 0)
1528 {
1529 code = previous;
1530 goto END_REPEAT;
1531 }
1532 if (repeat_min == 0 && repeat_max == -1)
1533 *code++ = OP_CRSTAR + repeat_type;
1534 else if (repeat_min == 1 && repeat_max == -1)
1535 *code++ = OP_CRPLUS + repeat_type;
1536 else if (repeat_min == 0 && repeat_max == 1)
1537 *code++ = OP_CRQUERY + repeat_type;
1538 else
1539 {
1540 *code++ = OP_CRRANGE + repeat_type;
1541 *code++ = repeat_min >> 8;
1542 *code++ = repeat_min & 255;
1543 if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
1544 *code++ = repeat_max >> 8;
1545 *code++ = repeat_max & 255;
1546 }
1547 }
1548
1549 /* If previous was a bracket group, we may have to replicate it in certain
1550 cases. */
1551
1552 else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1553 (int)*previous == OP_COND)
1554 {
1555 register int i;
1556 int ketoffset = 0;
1557 int len = code - previous;
1558 uschar *bralink = NULL;
1559
1560 /* If the maximum repeat count is unlimited, find the end of the bracket
1561 by scanning through from the start, and compute the offset back to it
1562 from the current code pointer. There may be an OP_OPT setting following
1563 the final KET, so we can't find the end just by going back from the code
1564 pointer. */
1565
1566 if (repeat_max == -1)
1567 {
1568 register uschar *ket = previous;
1569 do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1570 ketoffset = code - ket;
1571 }
1572
1573 /* The case of a zero minimum is special because of the need to stick
1574 OP_BRAZERO in front of it, and because the group appears once in the
1575 data, whereas in other cases it appears the minimum number of times. For
1576 this reason, it is simplest to treat this case separately, as otherwise
1577 the code gets far too messy. There are several special subcases when the
1578 minimum is zero. */
1579
1580 if (repeat_min == 0)
1581 {
1582 /* If we set up a required char from the bracket, we must back off
1583 to the previous value and reset the countlits value too. */
1584
1585 if (subcountlits > 0)
1586 {
1587 *reqchar = prevreqchar;
1588 *countlits -= subcountlits;
1589 }
1590
1591 /* If the maximum is also zero, we just omit the group from the output
1592 altogether. */
1593
1594 if (repeat_max == 0)
1595 {
1596 code = previous;
1597 goto END_REPEAT;
1598 }
1599
1600 /* If the maximum is 1 or unlimited, we just have to stick in the
1601 BRAZERO and do no more at this point. */
1602
1603 if (repeat_max <= 1)
1604 {
1605 memmove(previous+1, previous, len);
1606 code++;
1607 *previous++ = OP_BRAZERO + repeat_type;
1608 }
1609
1610 /* If the maximum is greater than 1 and limited, we have to replicate
1611 in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1612 The first one has to be handled carefully because it's the original
1613 copy, which has to be moved up. The remainder can be handled by code
1614 that is common with the non-zero minimum case below. We just have to
1615 adjust the value of repeat_max, since one less copy is required. */
1616
1617 else
1618 {
1619 int offset;
1620 memmove(previous+4, previous, len);
1621 code += 4;
1622 *previous++ = OP_BRAZERO + repeat_type;
1623 *previous++ = OP_BRA;
1624
1625 /* We chain together the bracket offset fields that have to be
1626 filled in later when the ends of the brackets are reached. */
1627
1628 offset = (bralink == NULL)? 0 : previous - bralink;
1629 bralink = previous;
1630 *previous++ = offset >> 8;
1631 *previous++ = offset & 255;
1632 }
1633
1634 repeat_max--;
1635 }
1636
1637 /* If the minimum is greater than zero, replicate the group as many
1638 times as necessary, and adjust the maximum to the number of subsequent
1639 copies that we need. */
1640
1641 else
1642 {
1643 for (i = 1; i < repeat_min; i++)
1644 {
1645 memcpy(code, previous, len);
1646 code += len;
1647 }
1648 if (repeat_max > 0) repeat_max -= repeat_min;
1649 }
1650
1651 /* This code is common to both the zero and non-zero minimum cases. If
1652 the maximum is limited, it replicates the group in a nested fashion,
1653 remembering the bracket starts on a stack. In the case of a zero minimum,
1654 the first one was set up above. In all cases the repeat_max now specifies
1655 the number of additional copies needed. */
1656
1657 if (repeat_max >= 0)
1658 {
1659 for (i = repeat_max - 1; i >= 0; i--)
1660 {
1661 *code++ = OP_BRAZERO + repeat_type;
1662
1663 /* All but the final copy start a new nesting, maintaining the
1664 chain of brackets outstanding. */
1665
1666 if (i != 0)
1667 {
1668 int offset;
1669 *code++ = OP_BRA;
1670 offset = (bralink == NULL)? 0 : code - bralink;
1671 bralink = code;
1672 *code++ = offset >> 8;
1673 *code++ = offset & 255;
1674 }
1675
1676 memcpy(code, previous, len);
1677 code += len;
1678 }
1679
1680 /* Now chain through the pending brackets, and fill in their length
1681 fields (which are holding the chain links pro tem). */
1682
1683 while (bralink != NULL)
1684 {
1685 int oldlinkoffset;
1686 int offset = code - bralink + 1;
1687 uschar *bra = code - offset;
1688 oldlinkoffset = (bra[1] << 8) + bra[2];
1689 bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1690 *code++ = OP_KET;
1691 *code++ = bra[1] = offset >> 8;
1692 *code++ = bra[2] = (offset & 255);
1693 }
1694 }
1695
1696 /* If the maximum is unlimited, set a repeater in the final copy. We
1697 can't just offset backwards from the current code point, because we
1698 don't know if there's been an options resetting after the ket. The
1699 correct offset was computed above. */
1700
1701 else code[-ketoffset] = OP_KETRMAX + repeat_type;
1702 }
1703
1704 /* Else there's some kind of shambles */
1705
1706 else
1707 {
1708 *errorptr = ERR11;
1709 goto FAILED;
1710 }
1711
1712 /* In all cases we no longer have a previous item. */
1713
1714 END_REPEAT:
1715 previous = NULL;
1716 break;
1717
1718
1719 /* Start of nested bracket sub-expression, or comment or lookahead or
1720 lookbehind or option setting or condition. First deal with special things
1721 that can come after a bracket; all are introduced by ?, and the appearance
1722 of any of them means that this is not a referencing group. They were
1723 checked for validity in the first pass over the string, so we don't have to
1724 check for syntax errors here. */
1725
1726 case '(':
1727 newoptions = options;
1728 skipbytes = 0;
1729
1730 if (*(++ptr) == '?')
1731 {
1732 int set, unset;
1733 int *optset;
1734
1735 switch (*(++ptr))
1736 {
1737 case '#': /* Comment; skip to ket */
1738 ptr++;
1739 while (*ptr != ')') ptr++;
1740 continue;
1741
1742 case ':': /* Non-extracting bracket */
1743 bravalue = OP_BRA;
1744 ptr++;
1745 break;
1746
1747 case '(':
1748 bravalue = OP_COND; /* Conditional group */
1749 if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1750 {
1751 int condref = *ptr - '0';
1752 while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1753 if (condref == 0)
1754 {
1755 *errorptr = ERR35;
1756 goto FAILED;
1757 }
1758 ptr++;
1759 code[3] = OP_CREF;
1760 code[4] = condref >> 8;
1761 code[5] = condref & 255;
1762 skipbytes = 3;
1763 }
1764 else ptr--;
1765 break;
1766
1767 case '=': /* Positive lookahead */
1768 bravalue = OP_ASSERT;
1769 ptr++;
1770 break;
1771
1772 case '!': /* Negative lookahead */
1773 bravalue = OP_ASSERT_NOT;
1774 ptr++;
1775 break;
1776
1777 case '<': /* Lookbehinds */
1778 switch (*(++ptr))
1779 {
1780 case '=': /* Positive lookbehind */
1781 bravalue = OP_ASSERTBACK;
1782 ptr++;
1783 break;
1784
1785 case '!': /* Negative lookbehind */
1786 bravalue = OP_ASSERTBACK_NOT;
1787 ptr++;
1788 break;
1789
1790 default: /* Syntax error */
1791 *errorptr = ERR24;
1792 goto FAILED;
1793 }
1794 break;
1795
1796 case '>': /* One-time brackets */
1797 bravalue = OP_ONCE;
1798 ptr++;
1799 break;
1800
1801 case 'R': /* Pattern recursion */
1802 *code++ = OP_RECURSE;
1803 ptr++;
1804 continue;
1805
1806 default: /* Option setting */
1807 set = unset = 0;
1808 optset = &set;
1809
1810 while (*ptr != ')' && *ptr != ':')
1811 {
1812 switch (*ptr++)
1813 {
1814 case '-': optset = &unset; break;
1815
1816 case 'i': *optset |= PCRE_CASELESS; break;
1817 case 'm': *optset |= PCRE_MULTILINE; break;
1818 case 's': *optset |= PCRE_DOTALL; break;
1819 case 'x': *optset |= PCRE_EXTENDED; break;
1820 case 'U': *optset |= PCRE_UNGREEDY; break;
1821 case 'X': *optset |= PCRE_EXTRA; break;
1822
1823 default:
1824 *errorptr = ERR12;
1825 goto FAILED;
1826 }
1827 }
1828
1829 /* Set up the changed option bits, but don't change anything yet. */
1830
1831 newoptions = (options | set) & (~unset);
1832
1833 /* If the options ended with ')' this is not the start of a nested
1834 group with option changes, so the options change at this level. At top
1835 level there is nothing else to be done (the options will in fact have
1836 been set from the start of compiling as a result of the first pass) but
1837 at an inner level we must compile code to change the ims options if
1838 necessary, and pass the new setting back so that it can be put at the
1839 start of any following branches, and when this group ends, a resetting
1840 item can be compiled. */
1841
1842 if (*ptr == ')')
1843 {
1844 if ((options & PCRE_INGROUP) != 0 &&
1845 (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846 {
1847 *code++ = OP_OPT;
1848 *code++ = *optchanged = newoptions & PCRE_IMS;
1849 }
1850 options = newoptions; /* Change options at this level */
1851 previous = NULL; /* This item can't be repeated */
1852 continue; /* It is complete */
1853 }
1854
1855 /* If the options ended with ':' we are heading into a nested group
1856 with possible change of options. Such groups are non-capturing and are
1857 not assertions of any kind. All we need to do is skip over the ':';
1858 the newoptions value is handled below. */
1859
1860 bravalue = OP_BRA;
1861 ptr++;
1862 }
1863 }
1864
1865 /* Else we have a referencing group; adjust the opcode. If the bracket
1866 number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1867 arrange for the true number to follow later, in an OP_BRANUMBER item. */
1868
1869 else
1870 {
1871 if (++(*brackets) > EXTRACT_BASIC_MAX)
1872 {
1873 bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1874 code[3] = OP_BRANUMBER;
1875 code[4] = *brackets >> 8;
1876 code[5] = *brackets & 255;
1877 skipbytes = 3;
1878 }
1879 else bravalue = OP_BRA + *brackets;
1880 }
1881
1882 /* Process nested bracketed re. Assertions may not be repeated, but other
1883 kinds can be. We copy code into a non-register variable in order to be able
1884 to pass its address because some compilers complain otherwise. Pass in a
1885 new setting for the ims options if they have changed. */
1886
1887 previous = (bravalue >= OP_ONCE)? code : NULL;
1888 *code = bravalue;
1889 tempcode = code;
1890
1891 if (!compile_regex(
1892 options | PCRE_INGROUP, /* Set for all nested groups */
1893 ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1894 newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1895 brackets, /* Extracting bracket count */
1896 &tempcode, /* Where to put code (updated) */
1897 &ptr, /* Input pointer (updated) */
1898 errorptr, /* Where to put an error message */
1899 (bravalue == OP_ASSERTBACK ||
1900 bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1901 skipbytes, /* Skip over OP_COND/OP_BRANUMBER */
1902 &subreqchar, /* For possible last char */
1903 &subcountlits, /* For literal count */
1904 cd)) /* Tables block */
1905 goto FAILED;
1906
1907 /* At the end of compiling, code is still pointing to the start of the
1908 group, while tempcode has been updated to point past the end of the group
1909 and any option resetting that may follow it. The pattern pointer (ptr)
1910 is on the bracket. */
1911
1912 /* If this is a conditional bracket, check that there are no more than
1913 two branches in the group. */
1914
1915 else if (bravalue == OP_COND)
1916 {
1917 uschar *tc = code;
1918 condcount = 0;
1919
1920 do {
1921 condcount++;
1922 tc += (tc[1] << 8) | tc[2];
1923 }
1924 while (*tc != OP_KET);
1925
1926 if (condcount > 2)
1927 {
1928 *errorptr = ERR27;
1929 goto FAILED;
1930 }
1931 }
1932
1933 /* Handle updating of the required character. If the subpattern didn't
1934 set one, leave it as it was. Otherwise, update it for normal brackets of
1935 all kinds, forward assertions, and conditions with two branches. Don't
1936 update the literal count for forward assertions, however. If the bracket
1937 is followed by a quantifier with zero repeat, we have to back off. Hence
1938 the definition of prevreqchar and subcountlits outside the main loop so
1939 that they can be accessed for the back off. */
1940
1941 if (subreqchar > 0 &&
1942 (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1943 (bravalue == OP_COND && condcount == 2)))
1944 {
1945 prevreqchar = *reqchar;
1946 *reqchar = subreqchar;
1947 if (bravalue != OP_ASSERT) *countlits += subcountlits;
1948 }
1949
1950 /* Now update the main code pointer to the end of the group. */
1951
1952 code = tempcode;
1953
1954 /* Error if hit end of pattern */
1955
1956 if (*ptr != ')')
1957 {
1958 *errorptr = ERR14;
1959 goto FAILED;
1960 }
1961 break;
1962
1963 /* Check \ for being a real metacharacter; if not, fall through and handle
1964 it as a data character at the start of a string. Escape items are checked
1965 for validity in the pre-compiling pass. */
1966
1967 case '\\':
1968 tempptr = ptr;
1969 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1970
1971 /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1972 are arranged to be the negation of the corresponding OP_values. For the
1973 back references, the values are ESC_REF plus the reference number. Only
1974 back references and those types that consume a character may be repeated.
1975 We can test for values between ESC_b and ESC_Z for the latter; this may
1976 have to change if any new ones are ever created. */
1977
1978 if (c < 0)
1979 {
1980 if (-c >= ESC_REF)
1981 {
1982 int number = -c - ESC_REF;
1983 previous = code;
1984 *code++ = OP_REF;
1985 *code++ = number >> 8;
1986 *code++ = number & 255;
1987 }
1988 else
1989 {
1990 previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1991 *code++ = -c;
1992 }
1993 continue;
1994 }
1995
1996 /* Data character: reset and fall through */
1997
1998 ptr = tempptr;
1999 c = '\\';
2000
2001 /* Handle a run of data characters until a metacharacter is encountered.
2002 The first character is guaranteed not to be whitespace or # when the
2003 extended flag is set. */
2004
2005 NORMAL_CHAR:
2006 default:
2007 previous = code;
2008 *code = OP_CHARS;
2009 code += 2;
2010 length = 0;
2011
2012 do
2013 {
2014 if ((options & PCRE_EXTENDED) != 0)
2015 {
2016 if ((cd->ctypes[c] & ctype_space) != 0) continue;
2017 if (c == '#')
2018 {
2019 /* The space before the ; is to avoid a warning on a silly compiler
2020 on the Macintosh. */
2021 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2022 if (c == 0) break;
2023 continue;
2024 }
2025 }
2026
2027 /* Backslash may introduce a data char or a metacharacter. Escaped items
2028 are checked for validity in the pre-compiling pass. Stop the string
2029 before a metaitem. */
2030
2031 if (c == '\\')
2032 {
2033 tempptr = ptr;
2034 c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2035 if (c < 0) { ptr = tempptr; break; }
2036
2037 /* If a character is > 127 in UTF-8 mode, we have to turn it into
2038 two or more characters in the UTF-8 encoding. */
2039
2040 #ifdef SUPPORT_UTF8
2041 if (c > 127 && (options & PCRE_UTF8) != 0)
2042 {
2043 uschar buffer[8];
2044 int len = ord2utf8(c, buffer);
2045 for (c = 0; c < len; c++) *code++ = buffer[c];
2046 length += len;
2047 continue;
2048 }
2049 #endif
2050 }
2051
2052 /* Ordinary character or single-char escape */
2053
2054 *code++ = c;
2055 length++;
2056 }
2057
2058 /* This "while" is the end of the "do" above. */
2059
2060 while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2061
2062 /* Update the last character and the count of literals */
2063
2064 prevreqchar = (length > 1)? code[-2] : *reqchar;
2065 *reqchar = code[-1];
2066 *countlits += length;
2067
2068 /* Compute the length and set it in the data vector, and advance to
2069 the next state. */
2070
2071 previous[1] = length;
2072 if (length < MAXLIT) ptr--;
2073 break;
2074 }
2075 } /* end of big loop */
2076
2077 /* Control never reaches here by falling through, only by a goto for all the
2078 error states. Pass back the position in the pattern so that it can be displayed
2079 to the user for diagnosing the error. */
2080
2081 FAILED:
2082 *ptrptr = ptr;
2083 return FALSE;
2084 }
2085
2086
2087
2088
2089 /*************************************************
2090 * Compile sequence of alternatives *
2091 *************************************************/
2092
2093 /* On entry, ptr is pointing past the bracket character, but on return
2094 it points to the closing bracket, or vertical bar, or end of string.
2095 The code variable is pointing at the byte into which the BRA operator has been
2096 stored. If the ims options are changed at the start (for a (?ims: group) or
2097 during any branch, we need to insert an OP_OPT item at the start of every
2098 following branch to ensure they get set correctly at run time, and also pass
2099 the new options into every subsequent branch compile.
2100
2101 Argument:
2102 options the option bits
2103 optchanged new ims options to set as if (?ims) were at the start, or -1
2104 for no change
2105 brackets -> int containing the number of extracting brackets used
2106 codeptr -> the address of the current code pointer
2107 ptrptr -> the address of the current pattern pointer
2108 errorptr -> pointer to error message
2109 lookbehind TRUE if this is a lookbehind assertion
2110 skipbytes skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2111 reqchar -> place to put the last required character, or a negative number
2112 countlits -> place to put the shortest literal count of any branch
2113 cd points to the data block with tables pointers
2114
2115 Returns: TRUE on success
2116 */
2117
2118 static BOOL
2119 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2120 const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2121 int *reqchar, int *countlits, compile_data *cd)
2122 {
2123 const uschar *ptr = *ptrptr;
2124 uschar *code = *codeptr;
2125 uschar *last_branch = code;
2126 uschar *start_bracket = code;
2127 uschar *reverse_count = NULL;
2128 int oldoptions = options & PCRE_IMS;
2129 int branchreqchar, branchcountlits;
2130
2131 *reqchar = -1;
2132 *countlits = INT_MAX;
2133 code += 3 + skipbytes;
2134
2135 /* Loop for each alternative branch */
2136
2137 for (;;)
2138 {
2139 int length;
2140
2141 /* Handle change of options */
2142
2143 if (optchanged >= 0)
2144 {
2145 *code++ = OP_OPT;
2146 *code++ = optchanged;
2147 options = (options & ~PCRE_IMS) | optchanged;
2148 }
2149
2150 /* Set up dummy OP_REVERSE if lookbehind assertion */
2151
2152 if (lookbehind)
2153 {
2154 *code++ = OP_REVERSE;
2155 reverse_count = code;
2156 *code++ = 0;
2157 *code++ = 0;
2158 }
2159
2160 /* Now compile the branch */
2161
2162 if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2163 &branchreqchar, &branchcountlits, cd))
2164 {
2165 *ptrptr = ptr;
2166 return FALSE;
2167 }
2168
2169 /* Fill in the length of the last branch */
2170
2171 length = code - last_branch;
2172 last_branch[1] = length >> 8;
2173 last_branch[2] = length & 255;
2174
2175 /* Save the last required character if all branches have the same; a current
2176 value of -1 means unset, while -2 means "previous branch had no last required
2177 char". */
2178
2179 if (*reqchar != -2)
2180 {
2181 if (branchreqchar >= 0)
2182 {
2183 if (*reqchar == -1) *reqchar = branchreqchar;
2184 else if (*reqchar != branchreqchar) *reqchar = -2;
2185 }
2186 else *reqchar = -2;
2187 }
2188
2189 /* Keep the shortest literal count */
2190
2191 if (branchcountlits < *countlits) *countlits = branchcountlits;
2192 DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2193
2194 /* If lookbehind, check that this branch matches a fixed-length string,
2195 and put the length into the OP_REVERSE item. Temporarily mark the end of
2196 the branch with OP_END. */
2197
2198 if (lookbehind)
2199 {
2200 *code = OP_END;
2201 length = find_fixedlength(last_branch, options);
2202 DPRINTF(("fixed length = %d\n", length));
2203 if (length < 0)
2204 {
2205 *errorptr = ERR25;
2206 *ptrptr = ptr;
2207 return FALSE;
2208 }
2209 reverse_count[0] = (length >> 8);
2210 reverse_count[1] = length & 255;
2211 }
2212
2213 /* Reached end of expression, either ')' or end of pattern. Insert a
2214 terminating ket and the length of the whole bracketed item, and return,
2215 leaving the pointer at the terminating char. If any of the ims options
2216 were changed inside the group, compile a resetting op-code following. */
2217
2218 if (*ptr != '|')
2219 {
2220 length = code - start_bracket;
2221 *code++ = OP_KET;
2222 *code++ = length >> 8;
2223 *code++ = length & 255;
2224 if (optchanged >= 0)
2225 {
2226 *code++ = OP_OPT;
2227 *code++ = oldoptions;
2228 }
2229 *codeptr = code;
2230 *ptrptr = ptr;
2231 return TRUE;
2232 }
2233
2234 /* Another branch follows; insert an "or" node and advance the pointer. */
2235
2236 *code = OP_ALT;
2237 last_branch = code;
2238 code += 3;
2239 ptr++;
2240 }
2241 /* Control never reaches here */
2242 }
2243
2244
2245
2246
2247 /*************************************************
2248 * Find first significant op code *
2249 *************************************************/
2250
2251 /* This is called by several functions that scan a compiled expression looking
2252 for a fixed first character, or an anchoring op code etc. It skips over things
2253 that do not influence this. For one application, a change of caseless option is
2254 important.
2255
2256 Arguments:
2257 code pointer to the start of the group
2258 options pointer to external options
2259 optbit the option bit whose changing is significant, or
2260 zero if none are
2261 optstop TRUE to return on option change, otherwise change the options
2262 value and continue
2263
2264 Returns: pointer to the first significant opcode
2265 */
2266
2267 static const uschar*
2268 first_significant_code(const uschar *code, int *options, int optbit,
2269 BOOL optstop)
2270 {
2271 for (;;)
2272 {
2273 switch ((int)*code)
2274 {
2275 case OP_OPT:
2276 if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
2277 {
2278 if (optstop) return code;
2279 *options = (int)code[1];
2280 }
2281 code += 2;
2282 break;
2283
2284 case OP_CREF:
2285 case OP_BRANUMBER:
2286 code += 3;
2287 break;
2288
2289 case OP_WORD_BOUNDARY:
2290 case OP_NOT_WORD_BOUNDARY:
2291 code++;
2292 break;
2293
2294 case OP_ASSERT_NOT:
2295 case OP_ASSERTBACK:
2296 case OP_ASSERTBACK_NOT:
2297 do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);
2298 code += 3;
2299 break;
2300
2301 default:
2302 return code;
2303 }
2304 }
2305 /* Control never reaches here */
2306 }
2307
2308
2309
2310
2311 /*************************************************
2312 * Check for anchored expression *
2313 *************************************************/
2314
2315 /* Try to find out if this is an anchored regular expression. Consider each
2316 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2317 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2318 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2319 counts, since OP_CIRC can match in the middle.
2320
2321 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
2322 because that will try the rest of the pattern at all possible matching points,
2323 so there is no point trying them again.
2324
2325 Arguments:
2326 code points to start of expression (the bracket)
2327 options points to the options setting
2328
2329 Returns: TRUE or FALSE
2330 */
2331
2332 static BOOL
2333 is_anchored(register const uschar *code, int *options)
2334 {
2335 do {
2336 const uschar *scode = first_significant_code(code + 3, options,
2337 PCRE_MULTILINE, FALSE);
2338 register int op = *scode;
2339 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2340 { if (!is_anchored(scode, options)) return FALSE; }
2341 else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2342 (*options & PCRE_DOTALL) != 0)
2343 { if (scode[1] != OP_ANY) return FALSE; }
2344 else if (op != OP_SOD &&
2345 ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2346 return FALSE;
2347 code += (code[1] << 8) + code[2];
2348 }
2349 while (*code == OP_ALT);
2350 return TRUE;
2351 }
2352
2353
2354
2355 /*************************************************
2356 * Check for starting with ^ or .* *
2357 *************************************************/
2358
2359 /* This is called to find out if every branch starts with ^ or .* so that
2360 "first char" processing can be done to speed things up in multiline
2361 matching and for non-DOTALL patterns that start with .* (which must start at
2362 the beginning or after \n).
2363
2364 Argument: points to start of expression (the bracket)
2365 Returns: TRUE or FALSE
2366 */
2367
2368 static BOOL
2369 is_startline(const uschar *code)
2370 {
2371 do {
2372 const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);
2373 register int op = *scode;
2374 if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2375 { if (!is_startline(scode)) return FALSE; }
2376 else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2377 { if (scode[1] != OP_ANY) return FALSE; }
2378 else if (op != OP_CIRC) return FALSE;
2379 code += (code[1] << 8) + code[2];
2380 }
2381 while (*code == OP_ALT);
2382 return TRUE;
2383 }
2384
2385
2386
2387 /*************************************************
2388 * Check for fixed first char *
2389 *************************************************/
2390
2391 /* Try to find out if there is a fixed first character. This is called for
2392 unanchored expressions, as it speeds up their processing quite considerably.
2393 Consider each alternative branch. If they all start with the same char, or with
2394 a bracket all of whose alternatives start with the same char (recurse ad lib),
2395 then we return that char, otherwise -1.
2396
2397 Arguments:
2398 code points to start of expression (the bracket)
2399 options pointer to the options (used to check casing changes)
2400
2401 Returns: -1 or the fixed first char
2402 */
2403
2404 static int
2405 find_firstchar(const uschar *code, int *options)
2406 {
2407 register int c = -1;
2408 do {
2409 int d;
2410 const uschar *scode = first_significant_code(code + 3, options,
2411 PCRE_CASELESS, TRUE);
2412 register int op = *scode;
2413
2414 if (op >= OP_BRA) op = OP_BRA;
2415
2416 switch(op)
2417 {
2418 default:
2419 return -1;
2420
2421 case OP_BRA:
2422 case OP_ASSERT:
2423 case OP_ONCE:
2424 case OP_COND:
2425 if ((d = find_firstchar(scode, options)) < 0) return -1;
2426 if (c < 0) c = d; else if (c != d) return -1;
2427 break;
2428
2429 case OP_EXACT: /* Fall through */
2430 scode++;
2431
2432 case OP_CHARS: /* Fall through */
2433 scode++;
2434
2435 case OP_PLUS:
2436 case OP_MINPLUS:
2437 if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;
2438 break;
2439 }
2440
2441 code += (code[1] << 8) + code[2];
2442 }
2443 while (*code == OP_ALT);
2444 return c;
2445 }
2446
2447
2448
2449
2450
2451 /*************************************************
2452 * Compile a Regular Expression *
2453 *************************************************/
2454
2455 /* This function takes a string and returns a pointer to a block of store
2456 holding a compiled version of the expression.
2457
2458 Arguments:
2459 pattern the regular expression
2460 options various option bits
2461 errorptr pointer to pointer to error text
2462 erroroffset ptr offset in pattern where error was detected
2463 tables pointer to character tables or NULL
2464
2465 Returns: pointer to compiled data block, or NULL on error,
2466 with errorptr and erroroffset set
2467 */
2468
2469 pcre *
2470 pcre_compile(const char *pattern, int options, const char **errorptr,
2471 int *erroroffset, const unsigned char *tables)
2472 {
2473 real_pcre *re;
2474 int length = 3; /* For initial BRA plus length */
2475 int runlength;
2476 int c, reqchar, countlits;
2477 int bracount = 0;
2478 int top_backref = 0;
2479 int branch_extra = 0;
2480 int branch_newextra;
2481 unsigned int brastackptr = 0;
2482 size_t size;
2483 uschar *code;
2484 const uschar *ptr;
2485 compile_data compile_block;
2486 int brastack[BRASTACK_SIZE];
2487 uschar bralenstack[BRASTACK_SIZE];
2488
2489 #ifdef DEBUG
2490 uschar *code_base, *code_end;
2491 #endif
2492
2493 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2494
2495 #ifndef SUPPORT_UTF8
2496 if ((options & PCRE_UTF8) != 0)
2497 {
2498 *errorptr = ERR32;
2499 return NULL;
2500 }
2501 #endif
2502
2503 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2504 can do is just return NULL. */
2505
2506 if (errorptr == NULL) return NULL;
2507 *errorptr = NULL;
2508
2509 /* However, we can give a message for this error */
2510
2511 if (erroroffset == NULL)
2512 {
2513 *errorptr = ERR16;
2514 return NULL;
2515 }
2516 *erroroffset = 0;
2517
2518 if ((options & ~PUBLIC_OPTIONS) != 0)
2519 {
2520 *errorptr = ERR17;
2521 return NULL;
2522 }
2523
2524 /* Set up pointers to the individual character tables */
2525
2526 if (tables == NULL) tables = pcre_default_tables;
2527 compile_block.lcc = tables + lcc_offset;
2528 compile_block.fcc = tables + fcc_offset;
2529 compile_block.cbits = tables + cbits_offset;
2530 compile_block.ctypes = tables + ctypes_offset;
2531
2532 /* Reflect pattern for debugging output */
2533
2534 DPRINTF(("------------------------------------------------------------------\n"));
2535 DPRINTF(("%s\n", pattern));
2536
2537 /* The first thing to do is to make a pass over the pattern to compute the
2538 amount of store required to hold the compiled code. This does not have to be
2539 perfect as long as errors are overestimates. At the same time we can detect any
2540 internal flag settings. Make an attempt to correct for any counted white space
2541 if an "extended" flag setting appears late in the pattern. We can't be so
2542 clever for #-comments. */
2543
2544 ptr = (const uschar *)(pattern - 1);
2545 while ((c = *(++ptr)) != 0)
2546 {
2547 int min, max;
2548 int class_charcount;
2549 int bracket_length;
2550
2551 if ((options & PCRE_EXTENDED) != 0)
2552 {
2553 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554 if (c == '#')
2555 {
2556 /* The space before the ; is to avoid a warning on a silly compiler
2557 on the Macintosh. */
2558 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2559 continue;
2560 }
2561 }
2562
2563 switch(c)
2564 {
2565 /* A backslashed item may be an escaped "normal" character or a
2566 character type. For a "normal" character, put the pointers and
2567 character back so that tests for whitespace etc. in the input
2568 are done correctly. */
2569
2570 case '\\':
2571 {
2572 const uschar *save_ptr = ptr;
2573 c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575 if (c >= 0)
2576 {
2577 ptr = save_ptr;
2578 c = '\\';
2579 goto NORMAL_CHAR;
2580 }
2581 }
2582 length++;
2583
2584 /* A back reference needs an additional 2 bytes, plus either one or 5
2585 bytes for a repeat. We also need to keep the value of the highest
2586 back reference. */
2587
2588 if (c <= -ESC_REF)
2589 {
2590 int refnum = -c - ESC_REF;
2591 if (refnum > top_backref) top_backref = refnum;
2592 length += 2; /* For single back reference */
2593 if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2594 {
2595 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597 if ((min == 0 && (max == 1 || max == -1)) ||
2598 (min == 1 && max == -1))
2599 length++;
2600 else length += 5;
2601 if (ptr[1] == '?') ptr++;
2602 }
2603 }
2604 continue;
2605
2606 case '^':
2607 case '.':
2608 case '$':
2609 case '*': /* These repeats won't be after brackets; */
2610 case '+': /* those are handled separately */
2611 case '?':
2612 length++;
2613 continue;
2614
2615 /* This covers the cases of repeats after a single char, metachar, class,
2616 or back reference. */
2617
2618 case '{':
2619 if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620 ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622 if ((min == 0 && (max == 1 || max == -1)) ||
2623 (min == 1 && max == -1))
2624 length++;
2625 else
2626 {
2627 length--; /* Uncount the original char or metachar */
2628 if (min == 1) length++; else if (min > 0) length += 4;
2629 if (max > 0) length += 4; else length += 2;
2630 }
2631 if (ptr[1] == '?') ptr++;
2632 continue;
2633
2634 /* An alternation contains an offset to the next branch or ket. If any ims
2635 options changed in the previous branch(es), and/or if we are in a
2636 lookbehind assertion, extra space will be needed at the start of the
2637 branch. This is handled by branch_extra. */
2638
2639 case '|':
2640 length += 3 + branch_extra;
2641 continue;
2642
2643 /* A character class uses 33 characters. Don't worry about character types
2644 that aren't allowed in classes - they'll get picked up during the compile.
2645 A character class that contains only one character uses 2 or 3 bytes,
2646 depending on whether it is negated or not. Notice this where we can. */
2647
2648 case '[':
2649 class_charcount = 0;
2650 if (*(++ptr) == '^') ptr++;
2651 do
2652 {
2653 if (*ptr == '\\')
2654 {
2655 int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2656 &compile_block);
2657 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658 if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2659 }
2660 else class_charcount++;
2661 ptr++;
2662 }
2663 while (*ptr != 0 && *ptr != ']');
2664
2665 /* Repeats for negated single chars are handled by the general code */
2666
2667 if (class_charcount == 1) length += 3; else
2668 {
2669 length += 33;
2670
2671 /* A repeat needs either 1 or 5 bytes. */
2672
2673 if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2674 {
2675 ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677 if ((min == 0 && (max == 1 || max == -1)) ||
2678 (min == 1 && max == -1))
2679 length++;
2680 else length += 5;
2681 if (ptr[1] == '?') ptr++;
2682 }
2683 }
2684 continue;
2685
2686 /* Brackets may be genuine groups or special things */
2687
2688 case '(':
2689 branch_newextra = 0;
2690 bracket_length = 3;
2691
2692 /* Handle special forms of bracket, which all start (? */
2693
2694 if (ptr[1] == '?')
2695 {
2696 int set, unset;
2697 int *optset;
2698
2699 switch (c = ptr[2])
2700 {
2701 /* Skip over comments entirely */
2702 case '#':
2703 ptr += 3;
2704 while (*ptr != 0 && *ptr != ')') ptr++;
2705 if (*ptr == 0)
2706 {
2707 *errorptr = ERR18;
2708 goto PCRE_ERROR_RETURN;
2709 }
2710 continue;
2711
2712 /* Non-referencing groups and lookaheads just move the pointer on, and
2713 then behave like a non-special bracket, except that they don't increment
2714 the count of extracting brackets. Ditto for the "once only" bracket,
2715 which is in Perl from version 5.005. */
2716
2717 case ':':
2718 case '=':
2719 case '!':
2720 case '>':
2721 ptr += 2;
2722 break;
2723
2724 /* A recursive call to the regex is an extension, to provide the
2725 facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2726
2727 case 'R':
2728 if (ptr[3] != ')')
2729 {
2730 *errorptr = ERR29;
2731 goto PCRE_ERROR_RETURN;
2732 }
2733 ptr += 3;
2734 length += 1;
2735 break;
2736
2737 /* Lookbehinds are in Perl from version 5.005 */
2738
2739 case '<':
2740 if (ptr[3] == '=' || ptr[3] == '!')
2741 {
2742 ptr += 3;
2743 branch_newextra = 3;
2744 length += 3; /* For the first branch */
2745 break;
2746 }
2747 *errorptr = ERR24;
2748 goto PCRE_ERROR_RETURN;
2749
2750 /* Conditionals are in Perl from version 5.005. The bracket must either
2751 be followed by a number (for bracket reference) or by an assertion
2752 group. */
2753
2754 case '(':
2755 if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2756 {
2757 ptr += 4;
2758 length += 3;
2759 while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2760 if (*ptr != ')')
2761 {
2762 *errorptr = ERR26;
2763 goto PCRE_ERROR_RETURN;
2764 }
2765 }
2766 else /* An assertion must follow */
2767 {
2768 ptr++; /* Can treat like ':' as far as spacing is concerned */
2769 if (ptr[2] != '?' ||
2770 (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2771 {
2772 ptr += 2; /* To get right offset in message */
2773 *errorptr = ERR28;
2774 goto PCRE_ERROR_RETURN;
2775 }
2776 }
2777 break;
2778
2779 /* Else loop checking valid options until ) is met. Anything else is an
2780 error. If we are without any brackets, i.e. at top level, the settings
2781 act as if specified in the options, so massage the options immediately.
2782 This is for backward compatibility with Perl 5.004. */
2783
2784 default:
2785 set = unset = 0;
2786 optset = &set;
2787 ptr += 2;
2788
2789 for (;; ptr++)
2790 {
2791 c = *ptr;
2792 switch (c)
2793 {
2794 case 'i':
2795 *optset |= PCRE_CASELESS;
2796 continue;
2797
2798 case 'm':
2799 *optset |= PCRE_MULTILINE;
2800 continue;
2801
2802 case 's':
2803 *optset |= PCRE_DOTALL;
2804 continue;
2805
2806 case 'x':
2807 *optset |= PCRE_EXTENDED;
2808 continue;
2809
2810 case 'X':
2811 *optset |= PCRE_EXTRA;
2812 continue;
2813
2814 case 'U':
2815 *optset |= PCRE_UNGREEDY;
2816 continue;
2817
2818 case '-':
2819 optset = &unset;
2820 continue;
2821
2822 /* A termination by ')' indicates an options-setting-only item;
2823 this is global at top level; otherwise nothing is done here and
2824 it is handled during the compiling process on a per-bracket-group
2825 basis. */
2826
2827 case ')':
2828 if (brastackptr == 0)
2829 {
2830 options = (options | set) & (~unset);
2831 set = unset = 0; /* To save length */
2832 }
2833 /* Fall through */
2834
2835 /* A termination by ':' indicates the start of a nested group with
2836 the given options set. This is again handled at compile time, but
2837 we must allow for compiled space if any of the ims options are
2838 set. We also have to allow for resetting space at the end of
2839 the group, which is why 4 is added to the length and not just 2.
2840 If there are several changes of options within the same group, this
2841 will lead to an over-estimate on the length, but this shouldn't
2842 matter very much. We also have to allow for resetting options at
2843 the start of any alternations, which we do by setting
2844 branch_newextra to 2. Finally, we record whether the case-dependent
2845 flag ever changes within the regex. This is used by the "required
2846 character" code. */
2847
2848 case ':':
2849 if (((set|unset) & PCRE_IMS) != 0)
2850 {
2851 length += 4;
2852 branch_newextra = 2;
2853 if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2854 }
2855 goto END_OPTIONS;
2856
2857 /* Unrecognized option character */
2858
2859 default:
2860 *errorptr = ERR12;
2861 goto PCRE_ERROR_RETURN;
2862 }
2863 }
2864
2865 /* If we hit a closing bracket, that's it - this is a freestanding
2866 option-setting. We need to ensure that branch_extra is updated if
2867 necessary. The only values branch_newextra can have here are 0 or 2.
2868 If the value is 2, then branch_extra must either be 2 or 5, depending
2869 on whether this is a lookbehind group or not. */
2870
2871 END_OPTIONS:
2872 if (c == ')')
2873 {
2874 if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2875 branch_extra += branch_newextra;
2876 continue;
2877 }
2878
2879 /* If options were terminated by ':' control comes here. Fall through
2880 to handle the group below. */
2881 }
2882 }
2883
2884 /* Extracting brackets must be counted so we can process escapes in a
2885 Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2886 need an additional 3 bytes of store per extracting bracket. */
2887
2888 else
2889 {
2890 bracount++;
2891 if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2892 }
2893
2894 /* Save length for computing whole length at end if there's a repeat that
2895 requires duplication of the group. Also save the current value of
2896 branch_extra, and start the new group with the new value. If non-zero, this
2897 will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2898
2899 if (brastackptr >= sizeof(brastack)/sizeof(int))
2900 {
2901 *errorptr = ERR19;
2902 goto PCRE_ERROR_RETURN;
2903 }
2904
2905 bralenstack[brastackptr] = branch_extra;
2906 branch_extra = branch_newextra;
2907
2908 brastack[brastackptr++] = length;
2909 length += bracket_length;
2910 continue;
2911
2912 /* Handle ket. Look for subsequent max/min; for certain sets of values we
2913 have to replicate this bracket up to that many times. If brastackptr is
2914 0 this is an unmatched bracket which will generate an error, but take care
2915 not to try to access brastack[-1] when computing the length and restoring
2916 the branch_extra value. */
2917
2918 case ')':
2919 length += 3;
2920 {
2921 int minval = 1;
2922 int maxval = 1;
2923 int duplength;
2924
2925 if (brastackptr > 0)
2926 {
2927 duplength = length - brastack[--brastackptr];
2928 branch_extra = bralenstack[brastackptr];
2929 }
2930 else duplength = 0;
2931
2932 /* Leave ptr at the final char; for read_repeat_counts this happens
2933 automatically; for the others we need an increment. */
2934
2935 if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2936 {
2937 ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2938 &compile_block);
2939 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2940 }
2941 else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2942 else if (c == '+') { maxval = -1; ptr++; }
2943 else if (c == '?') { minval = 0; ptr++; }
2944
2945 /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2946 group, and if the maximum is greater than zero, we have to replicate
2947 maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2948 bracket set - hence the 7. */
2949
2950 if (minval == 0)
2951 {
2952 length++;
2953 if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2954 }
2955
2956 /* When the minimum is greater than zero, 1 we have to replicate up to
2957 minval-1 times, with no additions required in the copies. Then, if
2958 there is a limited maximum we have to replicate up to maxval-1 times
2959 allowing for a BRAZERO item before each optional copy and nesting
2960 brackets for all but one of the optional copies. */
2961
2962 else
2963 {
2964 length += (minval - 1) * duplength;
2965 if (maxval > minval) /* Need this test as maxval=-1 means no limit */
2966 length += (maxval - minval) * (duplength + 7) - 6;
2967 }
2968 }
2969 continue;
2970
2971 /* Non-special character. For a run of such characters the length required
2972 is the number of characters + 2, except that the maximum run length is 255.
2973 We won't get a skipped space or a non-data escape or the start of a #
2974 comment as the first character, so the length can't be zero. */
2975
2976 NORMAL_CHAR:
2977 default:
2978 length += 2;
2979 runlength = 0;
2980 do
2981 {
2982 if ((options & PCRE_EXTENDED) != 0)
2983 {
2984 if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2985 if (c == '#')
2986 {
2987 /* The space before the ; is to avoid a warning on a silly compiler
2988 on the Macintosh. */
2989 while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2990 continue;
2991 }
2992 }
2993
2994 /* Backslash may introduce a data char or a metacharacter; stop the
2995 string before the latter. */
2996
2997 if (c == '\\')
2998 {
2999 const uschar *saveptr = ptr;
3000 c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3001 &compile_block);
3002 if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3003 if (c < 0) { ptr = saveptr; break; }
3004
3005 #ifdef SUPPORT_UTF8
3006 if (c > 127 && (options & PCRE_UTF8) != 0)
3007 {
3008 int i;
3009 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3010 if (c <= utf8_table1[i]) break;
3011 runlength += i;
3012 }
3013 #endif
3014 }
3015
3016 /* Ordinary character or single-char escape */
3017
3018 runlength++;
3019 }
3020
3021 /* This "while" is the end of the "do" above. */
3022
3023 while (runlength < MAXLIT &&
3024 (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3025
3026 ptr--;
3027 length += runlength;
3028 continue;
3029 }
3030 }
3031
3032 length += 4; /* For final KET and END */
3033
3034 if (length > 65539)
3035 {
3036 *errorptr = ERR20;
3037 return NULL;
3038 }
3039
3040 /* Compute the size of data block needed and get it, either from malloc or
3041 externally provided function. We specify "code[0]" in the offsetof() expression
3042 rather than just "code", because it has been reported that one broken compiler
3043 fails on "code" because it is also an independent variable. It should make no
3044 difference to the value of the offsetof(). */
3045
3046 size = length + offsetof(real_pcre, code[0]);
3047 re = (real_pcre *)(pcre_malloc)(size);
3048
3049 if (re == NULL)
3050 {
3051 *errorptr = ERR21;
3052 return NULL;
3053 }
3054
3055 /* Put in the magic number, and save the size, options, and table pointer */
3056
3057 re->magic_number = MAGIC_NUMBER;
3058 re->size = size;
3059 re->options = options;
3060 re->tables = tables;
3061
3062 /* Set up a starting, non-extracting bracket, then compile the expression. On
3063 error, *errorptr will be set non-NULL, so we don't need to look at the result
3064 of the function here. */
3065
3066 ptr = (const uschar *)pattern;
3067 code = re->code;
3068 *code = OP_BRA;
3069 bracount = 0;
3070 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3071 &reqchar, &countlits, &compile_block);
3072 re->top_bracket = bracount;
3073 re->top_backref = top_backref;
3074
3075 /* If not reached end of pattern on success, there's an excess bracket. */
3076
3077 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3078
3079 /* Fill in the terminating state and check for disastrous overflow, but
3080 if debugging, leave the test till after things are printed out. */
3081
3082 *code++ = OP_END;
3083
3084 #ifndef DEBUG
3085 if (code - re->code > length) *errorptr = ERR23;
3086 #endif
3087
3088 /* Give an error if there's back reference to a non-existent capturing
3089 subpattern. */
3090
3091 if (top_backref > re->top_bracket) *errorptr = ERR15;
3092
3093 /* Failed to compile */
3094
3095 if (*errorptr != NULL)
3096 {
3097 (pcre_free)(re);
3098 PCRE_ERROR_RETURN:
3099 *erroroffset = ptr - (const uschar *)pattern;
3100 return NULL;
3101 }
3102
3103 /* If the anchored option was not passed, set flag if we can determine that the
3104 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3105 starting with .* when DOTALL is set).
3106
3107 Otherwise, see if we can determine what the first character has to be, because
3108 that speeds up unanchored matches no end. If not, see if we can set the
3109 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3110 start with ^. and also when all branches start with .* for non-DOTALL matches.
3111 */
3112
3113 if ((options & PCRE_ANCHORED) == 0)
3114 {
3115 int temp_options = options;
3116 if (is_anchored(re->code, &temp_options))
3117 re->options |= PCRE_ANCHORED;
3118 else
3119 {
3120 int ch = find_firstchar(re->code, &temp_options);
3121 if (ch >= 0)
3122 {
3123 re->first_char = ch;
3124 re->options |= PCRE_FIRSTSET;
3125 }
3126 else if (is_startline(re->code))
3127 re->options |= PCRE_STARTLINE;
3128 }
3129 }
3130
3131 /* Save the last required character if there are at least two literal
3132 characters on all paths, or if there is no first character setting. */
3133
3134 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3135 {
3136 re->req_char = reqchar;
3137 re->options |= PCRE_REQCHSET;
3138 }
3139
3140 /* Print out the compiled data for debugging */
3141
3142 #ifdef DEBUG
3143
3144 printf("Length = %d top_bracket = %d top_backref = %d\n",
3145 length, re->top_bracket, re->top_backref);
3146
3147 if (re->options != 0)
3148 {
3149 printf("%s%s%s%s%s%s%s%s%s\n",
3150 ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3151 ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3152 ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3153 ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3154 ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3155 ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3156 ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3157 ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3158 ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3159 }
3160
3161 if ((re->options & PCRE_FIRSTSET) != 0)
3162 {
3163 if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3164 else printf("First char = \\x%02x\n", re->first_char);
3165 }
3166
3167 if ((re->options & PCRE_REQCHSET) != 0)
3168 {
3169 if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3170 else printf("Req char = \\x%02x\n", re->req_char);
3171 }
3172
3173 code_end = code;
3174 code_base = code = re->code;
3175
3176 while (code < code_end)
3177 {
3178 int charlength;
3179
3180 printf("%3d ", code - code_base);
3181
3182 if (*code >= OP_BRA)
3183 {
3184 if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3185 printf("%3d Bra extra", (code[1] << 8) + code[2]);
3186 else
3187 printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3188 code += 2;
3189 }
3190
3191 else switch(*code)
3192 {
3193 case OP_OPT:
3194 printf(" %.2x %s", code[1], OP_names[*code]);
3195 code++;
3196 break;
3197
3198 case OP_CHARS:
3199 charlength = *(++code);
3200 printf("%3d ", charlength);
3201 while (charlength-- > 0)
3202 if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3203 break;
3204
3205 case OP_KETRMAX:
3206 case OP_KETRMIN:
3207 case OP_ALT:
3208 case OP_KET:
3209 case OP_ASSERT:
3210 case OP_ASSERT_NOT:
3211 case OP_ASSERTBACK:
3212 case OP_ASSERTBACK_NOT:
3213 case OP_ONCE:
3214 case OP_REVERSE:
3215 case OP_BRANUMBER:
3216 case OP_COND:
3217 case OP_CREF:
3218 printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3219 code += 2;
3220 break;
3221
3222 case OP_STAR:
3223 case OP_MINSTAR:
3224 case OP_PLUS:
3225 case OP_MINPLUS:
3226 case OP_QUERY:
3227 case OP_MINQUERY:
3228 case OP_TYPESTAR:
3229 case OP_TYPEMINSTAR:
3230 case OP_TYPEPLUS:
3231 case OP_TYPEMINPLUS:
3232 case OP_TYPEQUERY:
3233 case OP_TYPEMINQUERY:
3234 if (*code >= OP_TYPESTAR)
3235 printf(" %s", OP_names[code[1]]);
3236 else if (isprint(c = code[1])) printf(" %c", c);
3237 else printf(" \\x%02x", c);
3238 printf("%s", OP_names[*code++]);
3239 break;
3240
3241 case OP_EXACT:
3242 case OP_UPTO:
3243 case OP_MINUPTO:
3244 if (isprint(c = code[3])) printf(" %c{", c);
3245 else printf(" \\x%02x{", c);
3246 if (*code != OP_EXACT) printf("0,");
3247 printf("%d}", (code[1] << 8) + code[2]);
3248 if (*code == OP_MINUPTO) printf("?");
3249 code += 3;
3250 break;
3251
3252 case OP_TYPEEXACT:
3253 case OP_TYPEUPTO:
3254 case OP_TYPEMINUPTO:
3255 printf(" %s{", OP_names[code[3]]);
3256 if (*code != OP_TYPEEXACT) printf(",");
3257 printf("%d}", (code[1] << 8) + code[2]);
3258 if (*code == OP_TYPEMINUPTO) printf("?");
3259 code += 3;
3260 break;
3261
3262 case OP_NOT:
3263 if (isprint(c = *(++code))) printf(" [^%c]", c);
3264 else printf(" [^\\x%02x]", c);
3265 break;
3266
3267 case OP_NOTSTAR:
3268 case OP_NOTMINSTAR:
3269 case OP_NOTPLUS:
3270 case OP_NOTMINPLUS:
3271 case OP_NOTQUERY:
3272 case OP_NOTMINQUERY:
3273 if (isprint(c = code[1])) printf(" [^%c]", c);
3274 else printf(" [^\\x%02x]", c);
3275 printf("%s", OP_names[*code++]);
3276 break;
3277
3278 case OP_NOTEXACT:
3279 case OP_NOTUPTO:
3280 case OP_NOTMINUPTO:
3281 if (isprint(c = code[3])) printf(" [^%c]{", c);
3282 else printf(" [^\\x%02x]{", c);
3283 if (*code != OP_NOTEXACT) printf(",");
3284 printf("%d}", (code[1] << 8) + code[2]);
3285 if (*code == OP_NOTMINUPTO) printf("?");
3286 code += 3;
3287 break;
3288
3289 case OP_REF:
3290 printf(" \\%d", (code[1] << 8) | code[2]);
3291 code += 3;
3292 goto CLASS_REF_REPEAT;
3293
3294 case OP_CLASS:
3295 {
3296 int i, min, max;
3297 code++;
3298 printf(" [");
3299
3300 for (i = 0; i < 256; i++)
3301 {
3302 if ((code[i/8] & (1 << (i&7))) != 0)
3303 {
3304 int j;
3305 for (j = i+1; j < 256; j++)
3306 if ((code[j/8] & (1 << (j&7))) == 0) break;
3307 if (i == '-' || i == ']') printf("\\");
3308 if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3309 if (--j > i)
3310 {
3311 printf("-");
3312 if (j == '-' || j == ']') printf("\\");
3313 if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3314 }
3315 i = j;
3316 }
3317 }
3318 printf("]");
3319 code += 32;
3320
3321 CLASS_REF_REPEAT:
3322
3323 switch(*code)
3324 {
3325 case OP_CRSTAR:
3326 case OP_CRMINSTAR:
3327 case OP_CRPLUS:
3328 case OP_CRMINPLUS:
3329 case OP_CRQUERY:
3330 case OP_CRMINQUERY:
3331 printf("%s", OP_names[*code]);
3332 break;
3333
3334 case OP_CRRANGE:
3335 case OP_CRMINRANGE:
3336 min = (code[1] << 8) + code[2];
3337 max = (code[3] << 8) + code[4];
3338 if (max == 0) printf("{%d,}", min);
3339 else printf("{%d,%d}", min, max);
3340 if (*code == OP_CRMINRANGE) printf("?");
3341 code += 4;
3342 break;
3343
3344 default:
3345 code--;
3346 }
3347 }
3348 break;
3349
3350 /* Anything else is just a one-node item */
3351
3352 default:
3353 printf(" %s", OP_names[*code]);
3354 break;
3355 }
3356
3357 code++;
3358 printf("\n");
3359 }
3360 printf("------------------------------------------------------------------\n");
3361
3362 /* This check is done here in the debugging case so that the code that
3363 was compiled can be seen. */
3364
3365 if (code - re->code > length)
3366 {
3367 *errorptr = ERR23;
3368 (pcre_free)(re);
3369 *erroroffset = ptr - (uschar *)pattern;
3370 return NULL;
3371 }
3372 #endif
3373
3374 return (pcre *)re;
3375 }
3376
3377
3378
3379 /*************************************************
3380 * Match a back-reference *
3381 *************************************************/
3382
3383 /* If a back reference hasn't been set, the length that is passed is greater
3384 than the number of characters left in the string, so the match fails.
3385
3386 Arguments:
3387 offset index into the offset vector
3388 eptr points into the subject
3389 length length to be matched
3390 md points to match data block
3391 ims the ims flags
3392
3393 Returns: TRUE if matched
3394 */
3395
3396 static BOOL
3397 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3398 unsigned long int ims)
3399 {
3400 const uschar *p = md->start_subject + md->offset_vector[offset];
3401
3402 #ifdef DEBUG
3403 if (eptr >= md->end_subject)
3404 printf("matching subject <null>");
3405 else
3406 {
3407 printf("matching subject ");
3408 pchars(eptr, length, TRUE, md);
3409 }
3410 printf(" against backref ");
3411 pchars(p, length, FALSE, md);
3412 printf("\n");
3413 #endif
3414
3415 /* Always fail if not enough characters left */
3416
3417 if (length > md->end_subject - eptr) return FALSE;
3418
3419 /* Separate the caselesss case for speed */
3420
3421 if ((ims & PCRE_CASELESS) != 0)
3422 {
3423 while (length-- > 0)
3424 if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3425 }
3426 else
3427 { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3428
3429 return TRUE;
3430 }
3431
3432
3433
3434 /*************************************************
3435 * Match from current position *
3436 *************************************************/
3437
3438 /* On entry ecode points to the first opcode, and eptr to the first character
3439 in the subject string, while eptrb holds the value of eptr at the start of the
3440 last bracketed group - used for breaking infinite loops matching zero-length
3441 strings.
3442
3443 Arguments:
3444 eptr pointer in subject
3445 ecode position in code
3446 offset_top current top pointer
3447 md pointer to "static" info for the match
3448 ims current /i, /m, and /s options
3449 eptrb pointer to chain of blocks containing eptr at start of
3450 brackets - for testing for empty matches
3451 flags can contain
3452 match_condassert - this is an assertion condition
3453 match_isgroup - this is the start of a bracketed group
3454
3455 Returns: TRUE if matched
3456 */
3457
3458 static BOOL
3459 match(register const uschar *eptr, register const uschar *ecode,
3460 int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3461 int flags)
3462 {
3463 unsigned long int original_ims = ims; /* Save for resetting on ')' */
3464 eptrblock newptrb;
3465
3466 /* At the start of a bracketed group, add the current subject pointer to the
3467 stack of such pointers, to be re-instated at the end of the group when we hit
3468 the closing ket. When match() is called in other circumstances, we don't add to
3469 the stack. */
3470
3471 if ((flags & match_isgroup) != 0)
3472 {
3473 newptrb.prev = eptrb;
3474 newptrb.saved_eptr = eptr;
3475 eptrb = &newptrb;
3476 }
3477
3478 /* Now start processing the operations. */
3479
3480 for (;;)
3481 {
3482 int op = (int)*ecode;
3483 int min, max, ctype;
3484 register int i;
3485 register int c;
3486 BOOL minimize = FALSE;
3487
3488 /* Opening capturing bracket. If there is space in the offset vector, save
3489 the current subject position in the working slot at the top of the vector. We
3490 mustn't change the current values of the data slot, because they may be set
3491 from a previous iteration of this group, and be referred to by a reference
3492 inside the group.
3493
3494 If the bracket fails to match, we need to restore this value and also the
3495 values of the final offsets, in case they were set by a previous iteration of
3496 the same bracket.
3497
3498 If there isn't enough space in the offset vector, treat this as if it were a
3499 non-capturing bracket. Don't worry about setting the flag for the error case
3500 here; that is handled in the code for KET. */
3501
3502 if (op > OP_BRA)
3503 {
3504 int offset;
3505 int number = op - OP_BRA;
3506
3507 /* For extended extraction brackets (large number), we have to fish out the
3508 number from a dummy opcode at the start. */
3509
3510 if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3511 offset = number << 1;
3512
3513 #ifdef DEBUG
3514 printf("start bracket %d subject=", number);
3515 pchars(eptr, 16, TRUE, md);
3516 printf("\n");
3517 #endif
3518
3519 if (offset < md->offset_max)
3520 {
3521 int save_offset1 = md->offset_vector[offset];
3522 int save_offset2 = md->offset_vector[offset+1];
3523 int save_offset3 = md->offset_vector[md->offset_end - number];
3524
3525 DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3526 md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3527
3528 do
3529 {
3530 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3531 return TRUE;
3532 ecode += (ecode[1] << 8) + ecode[2];
3533 }
3534 while (*ecode == OP_ALT);
3535
3536 DPRINTF(("bracket %d failed\n", number));
3537
3538 md->offset_vector[offset] = save_offset1;
3539 md->offset_vector[offset+1] = save_offset2;
3540 md->offset_vector[md->offset_end - number] = save_offset3;
3541
3542 return FALSE;
3543 }
3544
3545 /* Insufficient room for saving captured contents */
3546
3547 else op = OP_BRA;
3548 }
3549
3550 /* Other types of node can be handled by a switch */
3551
3552 switch(op)
3553 {
3554 case OP_BRA: /* Non-capturing bracket: optimized */
3555 DPRINTF(("start bracket 0\n"));
3556 do
3557 {
3558 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3559 return TRUE;
3560 ecode += (ecode[1] << 8) + ecode[2];
3561 }
3562 while (*ecode == OP_ALT);
3563 DPRINTF(("bracket 0 failed\n"));
3564 return FALSE;
3565
3566 /* Conditional group: compilation checked that there are no more than
3567 two branches. If the condition is false, skipping the first branch takes us
3568 past the end if there is only one branch, but that's OK because that is
3569 exactly what going to the ket would do. */
3570
3571 case OP_COND:
3572 if (ecode[3] == OP_CREF) /* Condition is extraction test */
3573 {
3574 int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3575 return match(eptr,
3576 ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3577 6 : 3 + (ecode[1] << 8) + ecode[2]),
3578 offset_top, md, ims, eptrb, match_isgroup);
3579 }
3580
3581 /* The condition is an assertion. Call match() to evaluate it - setting
3582 match_condassert in the flags argument causes it to stop at the end of an assertion. */
3583
3584 else
3585 {
3586 if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3587 match_condassert | match_isgroup))
3588 {
3589 ecode += 3 + (ecode[4] << 8) + ecode[5];
3590 while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3591 }
3592 else ecode += (ecode[1] << 8) + ecode[2];
3593 return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3594 }
3595 /* Control never reaches here */
3596
3597 /* Skip over conditional reference or large extraction number data if
3598 encountered. */
3599
3600 case OP_CREF:
3601 case OP_BRANUMBER:
3602 ecode += 3;
3603 break;
3604
3605 /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3606 an empty string - recursion will then try other alternatives, if any. */
3607
3608 case OP_END:
3609 if (md->notempty && eptr == md->start_match) return FALSE;
3610 md->end_match_ptr = eptr; /* Record where we ended */
3611 md->end_offset_top = offset_top; /* and how many extracts were taken */
3612 return TRUE;
3613
3614 /* Change option settings */
3615
3616 case OP_OPT:
3617 ims = ecode[1];
3618 ecode += 2;
3619 DPRINTF(("ims set to %02lx\n", ims));
3620 break;
3621
3622 /* Assertion brackets. Check the alternative branches in turn - the
3623 matching won't pass the KET for an assertion. If any one branch matches,
3624 the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3625 start of each branch to move the current point backwards, so the code at
3626 this level is identical to the lookahead case. */
3627
3628 case OP_ASSERT:
3629 case OP_ASSERTBACK:
3630 do
3631 {
3632 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3633 ecode += (ecode[1] << 8) + ecode[2];
3634 }
3635 while (*ecode == OP_ALT);
3636 if (*ecode == OP_KET) return FALSE;
3637
3638 /* If checking an assertion for a condition, return TRUE. */
3639
3640 if ((flags & match_condassert) != 0) return TRUE;
3641
3642 /* Continue from after the assertion, updating the offsets high water
3643 mark, since extracts may have been taken during the assertion. */
3644
3645 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3646 ecode += 3;
3647 offset_top = md->end_offset_top;
3648 continue;
3649
3650 /* Negative assertion: all branches must fail to match */
3651
3652 case OP_ASSERT_NOT:
3653 case OP_ASSERTBACK_NOT:
3654 do
3655 {
3656 if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3657 return FALSE;
3658 ecode += (ecode[1] << 8) + ecode[2];
3659 }
3660 while (*ecode == OP_ALT);
3661
3662 if ((flags & match_condassert) != 0) return TRUE;
3663
3664 ecode += 3;
3665 continue;
3666
3667 /* Move the subject pointer back. This occurs only at the start of
3668 each branch of a lookbehind assertion. If we are too close to the start to
3669 move back, this match function fails. When working with UTF-8 we move
3670 back a number of characters, not bytes. */
3671
3672 case OP_REVERSE:
3673 #ifdef SUPPORT_UTF8
3674 c = (ecode[1] << 8) + ecode[2];
3675 for (i = 0; i < c; i++)
3676 {
3677 eptr--;
3678 BACKCHAR(eptr)
3679 }
3680 #else
3681 eptr -= (ecode[1] << 8) + ecode[2];
3682 #endif
3683
3684 if (eptr < md->start_subject) return FALSE;
3685 ecode += 3;
3686 break;
3687
3688 /* Recursion matches the current regex, nested. If there are any capturing
3689 brackets started but not finished, we have to save their starting points
3690 and reinstate them after the recursion. However, we don't know how many
3691 such there are (offset_top records the completed total) so we just have
3692 to save all the potential data. There may be up to 99 such values, which
3693 is a bit large to put on the stack, but using malloc for small numbers
3694 seems expensive. As a compromise, the stack is used when there are fewer
3695 than 16 values to store; otherwise malloc is used. A problem is what to do
3696 if the malloc fails ... there is no way of returning to the top level with
3697 an error. Save the top 15 values on the stack, and accept that the rest
3698 may be wrong. */
3699
3700 case OP_RECURSE:
3701 {
3702 BOOL rc;
3703 int *save;
3704 int stacksave[15];
3705
3706 c = md->offset_max;
3707
3708 if (c < 16) save = stacksave; else
3709 {
3710 save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3711 if (save == NULL)
3712 {
3713 save = stacksave;
3714 c = 15;
3715 }
3716 }
3717
3718 for (i = 1; i <= c; i++)
3719 save[i] = md->offset_vector[md->offset_end - i];
3720 rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3721 match_isgroup);
3722 for (i = 1; i <= c; i++)
3723 md->offset_vector[md->offset_end - i] = save[i];
3724 if (save != stacksave) (pcre_free)(save);
3725 if (!rc) return FALSE;
3726
3727 /* In case the recursion has set more capturing values, save the final
3728 number, then move along the subject till after the recursive match,
3729 and advance one byte in the pattern code. */
3730
3731 offset_top = md->end_offset_top;
3732 eptr = md->end_match_ptr;
3733 ecode++;
3734 }
3735 break;
3736
3737 /* "Once" brackets are like assertion brackets except that after a match,
3738 the point in the subject string is not moved back. Thus there can never be
3739 a move back into the brackets. Check the alternative branches in turn - the
3740 matching won't pass the KET for this kind of subpattern. If any one branch
3741 matches, we carry on as at the end of a normal bracket, leaving the subject
3742 pointer. */
3743
3744 case OP_ONCE:
3745 {
3746 const uschar *prev = ecode;
3747 const uschar *saved_eptr = eptr;
3748
3749 do
3750 {
3751 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3752 break;
3753 ecode += (ecode[1] << 8) + ecode[2];
3754 }
3755 while (*ecode == OP_ALT);
3756
3757 /* If hit the end of the group (which could be repeated), fail */
3758
3759 if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3760
3761 /* Continue as from after the assertion, updating the offsets high water
3762 mark, since extracts may have been taken. */
3763
3764 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3765
3766 offset_top = md->end_offset_top;
3767 eptr = md->end_match_ptr;
3768
3769 /* For a non-repeating ket, just continue at this level. This also
3770 happens for a repeating ket if no characters were matched in the group.
3771 This is the forcible breaking of infinite loops as implemented in Perl
3772 5.005. If there is an options reset, it will get obeyed in the normal
3773 course of events. */
3774
3775 if (*ecode == OP_KET || eptr == saved_eptr)
3776 {
3777 ecode += 3;
3778 break;
3779 }
3780
3781 /* The repeating kets try the rest of the pattern or restart from the
3782 preceding bracket, in the appropriate order. We need to reset any options
3783 that changed within the bracket before re-running it, so check the next
3784 opcode. */
3785
3786 if (ecode[3] == OP_OPT)
3787 {
3788 ims = (ims & ~PCRE_IMS) | ecode[4];
3789 DPRINTF(("ims set to %02lx at group repeat\n", ims));
3790 }
3791
3792 if (*ecode == OP_KETRMIN)
3793 {
3794 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3795 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3796 return TRUE;
3797 }
3798 else /* OP_KETRMAX */
3799 {
3800 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3801 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3802 }
3803 }
3804 return FALSE;
3805
3806 /* An alternation is the end of a branch; scan along to find the end of the
3807 bracketed group and go to there. */
3808
3809 case OP_ALT:
3810 do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3811 break;
3812
3813 /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3814 that it may occur zero times. It may repeat infinitely, or not at all -
3815 i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3816 repeat limits are compiled as a number of copies, with the optional ones
3817 preceded by BRAZERO or BRAMINZERO. */
3818
3819 case OP_BRAZERO:
3820 {
3821 const uschar *next = ecode+1;
3822 if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3823 return TRUE;
3824 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3825 ecode = next + 3;
3826 }
3827 break;
3828
3829 case OP_BRAMINZERO:
3830 {
3831 const uschar *next = ecode+1;
3832 do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833 if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3834 return TRUE;
3835 ecode++;
3836 }
3837 break;
3838
3839 /* End of a group, repeated or non-repeating. If we are at the end of
3840 an assertion "group", stop matching and return TRUE, but record the
3841 current high water mark for use by positive assertions. Do this also
3842 for the "once" (not-backing-up) groups. */
3843
3844 case OP_KET:
3845 case OP_KETRMIN:
3846 case OP_KETRMAX:
3847 {
3848 const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3849 const uschar *saved_eptr = eptrb->saved_eptr;
3850
3851 eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
3852
3853 if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3854 *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3855 *prev == OP_ONCE)
3856 {
3857 md->end_match_ptr = eptr; /* For ONCE */
3858 md->end_offset_top = offset_top;
3859 return TRUE;
3860 }
3861
3862 /* In all other cases except a conditional group we have to check the
3863 group number back at the start and if necessary complete handling an
3864 extraction by setting the offsets and bumping the high water mark. */
3865
3866 if (*prev != OP_COND)
3867 {
3868 int offset;
3869 int number = *prev - OP_BRA;
3870
3871 /* For extended extraction brackets (large number), we have to fish out
3872 the number from a dummy opcode at the start. */
3873
3874 if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3875 offset = number << 1;
3876
3877 #ifdef DEBUG
3878 printf("end bracket %d", number);
3879 printf("\n");
3880 #endif
3881
3882 if (number > 0)
3883 {
3884 if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3885 {
3886 md->offset_vector[offset] =
3887 md->offset_vector[md->offset_end - number];
3888 md->offset_vector[offset+1] = eptr - md->start_subject;
3889 if (offset_top <= offset) offset_top = offset + 2;
3890 }
3891 }
3892 }
3893
3894 /* Reset the value of the ims flags, in case they got changed during
3895 the group. */
3896
3897 ims = original_ims;
3898 DPRINTF(("ims reset to %02lx\n", ims));
3899
3900 /* For a non-repeating ket, just continue at this level. This also
3901 happens for a repeating ket if no characters were matched in the group.
3902 This is the forcible breaking of infinite loops as implemented in Perl
3903 5.005. If there is an options reset, it will get obeyed in the normal
3904 course of events. */
3905
3906 if (*ecode == OP_KET || eptr == saved_eptr)
3907 {
3908 ecode += 3;
3909 break;
3910 }
3911
3912 /* The repeating kets try the rest of the pattern or restart from the
3913 preceding bracket, in the appropriate order. */
3914
3915 if (*ecode == OP_KETRMIN)
3916 {
3917 if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3918 match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3919 return TRUE;
3920 }
3921 else /* OP_KETRMAX */
3922 {
3923 if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3924 match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3925 }
3926 }
3927 return FALSE;
3928
3929 /* Start of subject unless notbol, or after internal newline if multiline */
3930
3931 case OP_CIRC:
3932 if (md->notbol && eptr == md->start_subject) return FALSE;
3933 if ((ims & PCRE_MULTILINE) != 0)
3934 {
3935 if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3936 ecode++;
3937 break;
3938 }
3939 /* ... else fall through */
3940
3941 /* Start of subject assertion */
3942
3943 case OP_SOD:
3944 if (eptr != md->start_subject) return FALSE;
3945 ecode++;
3946 break;
3947
3948 /* Assert before internal newline if multiline, or before a terminating
3949 newline unless endonly is set, else end of subject unless noteol is set. */
3950
3951 case OP_DOLL:
3952 if ((ims & PCRE_MULTILINE) != 0)
3953 {
3954 if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3955 else { if (md->noteol) return FALSE; }
3956 ecode++;
3957 break;
3958 }
3959 else
3960 {
3961 if (md->noteol) return FALSE;
3962 if (!md->endonly)
3963 {
3964 if (eptr < md->end_subject - 1 ||
3965 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3966
3967 ecode++;
3968 break;
3969 }
3970 }
3971 /* ... else fall through */
3972
3973 /* End of subject assertion (\z) */
3974
3975 case OP_EOD:
3976 if (eptr < md->end_subject) return FALSE;
3977 ecode++;
3978 break;
3979
3980 /* End of subject or ending \n assertion (\Z) */
3981
3982 case OP_EODN:
3983 if (eptr < md->end_subject - 1 ||
3984 (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3985 ecode++;
3986 break;
3987
3988 /* Word boundary assertions */
3989
3990 case OP_NOT_WORD_BOUNDARY:
3991 case OP_WORD_BOUNDARY:
3992 {
3993 BOOL prev_is_word = (eptr != md->start_subject) &&
3994 ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3995 BOOL cur_is_word = (eptr < md->end_subject) &&
3996 ((md->ctypes[*eptr] & ctype_word) != 0);
3997 if ((*ecode++ == OP_WORD_BOUNDARY)?
3998 cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3999 return FALSE;
4000 }
4001 break;
4002
4003 /* Match a single character type; inline for speed */
4004
4005 case OP_ANY:
4006 if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4007 return FALSE;
4008 if (eptr++ >= md->end_subject) return FALSE;
4009 #ifdef SUPPORT_UTF8
4010 if (md->utf8)
4011 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4012 #endif
4013 ecode++;
4014 break;
4015
4016 case OP_NOT_DIGIT:
4017 if (eptr >= md->end_subject ||
4018 (md->ctypes[*eptr++] & ctype_digit) != 0)
4019 return FALSE;
4020 ecode++;
4021 break;
4022
4023 case OP_DIGIT:
4024 if (eptr >= md->end_subject ||
4025 (md->ctypes[*eptr++] & ctype_digit) == 0)
4026 return FALSE;
4027 ecode++;
4028 break;
4029
4030 case OP_NOT_WHITESPACE:
4031 if (eptr >= md->end_subject ||
4032 (md->ctypes[*eptr++] & ctype_space) != 0)
4033 return FALSE;
4034 ecode++;
4035 break;
4036
4037 case OP_WHITESPACE:
4038 if (eptr >= md->end_subject ||
4039 (md->ctypes[*eptr++] & ctype_space) == 0)
4040 return FALSE;
4041 ecode++;
4042 break;
4043
4044 case OP_NOT_WORDCHAR:
4045 if (eptr >= md->end_subject ||
4046 (md->ctypes[*eptr++] & ctype_word) != 0)
4047 return FALSE;
4048 ecode++;
4049 break;
4050
4051 case OP_WORDCHAR:
4052 if (eptr >= md->end_subject ||
4053 (md->ctypes[*eptr++] & ctype_word) == 0)
4054 return FALSE;
4055 ecode++;
4056 break;
4057
4058 /* Match a back reference, possibly repeatedly. Look past the end of the
4059 item to see if there is repeat information following. The code is similar
4060 to that for character classes, but repeated for efficiency. Then obey
4061 similar code to character type repeats - written out again for speed.
4062 However, if the referenced string is the empty string, always treat
4063 it as matched, any number of times (otherwise there could be infinite
4064 loops). */
4065
4066 case OP_REF:
4067 {
4068 int length;
4069 int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4070 ecode += 3; /* Advance past item */
4071
4072 /* If the reference is unset, set the length to be longer than the amount
4073 of subject left; this ensures that every attempt at a match fails. We
4074 can't just fail here, because of the possibility of quantifiers with zero
4075 minima. */
4076
4077 length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4078 md->end_subject - eptr + 1 :
4079 md->offset_vector[offset+1] - md->offset_vector[offset];
4080
4081 /* Set up for repetition, or handle the non-repeated case */
4082
4083 switch (*ecode)
4084 {
4085 case OP_CRSTAR:
4086 case OP_CRMINSTAR:
4087 case OP_CRPLUS:
4088 case OP_CRMINPLUS:
4089 case OP_CRQUERY:
4090 case OP_CRMINQUERY:
4091 c = *ecode++ - OP_CRSTAR;
4092 minimize = (c & 1) != 0;
4093 min = rep_min[c]; /* Pick up values from tables; */
4094 max = rep_max[c]; /* zero for max => infinity */
4095 if (max == 0) max = INT_MAX;
4096 break;
4097
4098 case OP_CRRANGE:
4099 case OP_CRMINRANGE:
4100 minimize = (*ecode == OP_CRMINRANGE);
4101 min = (ecode[1] << 8) + ecode[2];
4102 max = (ecode[3] << 8) + ecode[4];
4103 if (max == 0) max = INT_MAX;
4104 ecode += 5;
4105 break;
4106
4107 default: /* No repeat follows */
4108 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4109 eptr += length;
4110 continue; /* With the main loop */
4111 }
4112
4113 /* If the length of the reference is zero, just continue with the
4114 main loop. */
4115
4116 if (length == 0) continue;
4117
4118 /* First, ensure the minimum number of matches are present. We get back
4119 the length of the reference string explicitly rather than passing the
4120 address of eptr, so that eptr can be a register variable. */
4121
4122 for (i = 1; i <= min; i++)
4123 {
4124 if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4125 eptr += length;
4126 }
4127
4128 /* If min = max, continue at the same level without recursion.
4129 They are not both allowed to be zero. */
4130
4131 if (min == max) continue;
4132
4133 /* If minimizing, keep trying and advancing the pointer */
4134
4135 if (minimize)
4136 {
4137 for (i = min;; i++)
4138 {
4139 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4140 return TRUE;
4141 if (i >= max || !match_ref(offset, eptr, length, md, ims))
4142 return FALSE;
4143 eptr += length;
4144 }
4145 /* Control never gets here */
4146 }
4147
4148 /* If maximizing, find the longest string and work backwards */
4149
4150 else
4151 {
4152 const uschar *pp = eptr;
4153 for (i = min; i < max; i++)
4154 {
4155 if (!match_ref(offset, eptr, length, md, ims)) break;
4156 eptr += length;
4157 }
4158 while (eptr >= pp)
4159 {
4160 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4161 return TRUE;
4162 eptr -= length;
4163 }
4164 return FALSE;
4165 }
4166 }
4167 /* Control never gets here */
4168
4169
4170
4171 /* Match a character class, possibly repeatedly. Look past the end of the
4172 item to see if there is repeat information following. Then obey similar
4173 code to character type repeats - written out again for speed. */
4174
4175 case OP_CLASS:
4176 {
4177 const uschar *data = ecode + 1; /* Save for matching */
4178 ecode += 33; /* Advance past the item */
4179
4180 switch (*ecode)
4181 {
4182 case OP_CRSTAR:
4183 case OP_CRMINSTAR:
4184 case OP_CRPLUS:
4185 case OP_CRMINPLUS:
4186 case OP_CRQUERY:
4187 case OP_CRMINQUERY:
4188 c = *ecode++ - OP_CRSTAR;
4189 minimize = (c & 1) != 0;
4190 min = rep_min[c]; /* Pick up values from tables; */
4191 max = rep_max[c]; /* zero for max => infinity */
4192 if (max == 0) max = INT_MAX;
4193 break;
4194
4195 case OP_CRRANGE:
4196 case OP_CRMINRANGE:
4197 minimize = (*ecode == OP_CRMINRANGE);
4198 min = (ecode[1] << 8) + ecode[2];
4199 max = (ecode[3] << 8) + ecode[4];
4200 if (max == 0) max = INT_MAX;
4201 ecode += 5;
4202 break;
4203
4204 default: /* No repeat follows */
4205 min = max = 1;
4206 break;
4207 }
4208
4209 /* First, ensure the minimum number of matches are present. */
4210
4211 for (i = 1; i <= min; i++)
4212 {
4213 if (eptr >= md->end_subject) return FALSE;
4214 GETCHARINC(c, eptr) /* Get character; increment eptr */
4215
4216 #ifdef SUPPORT_UTF8
4217 /* We do not yet support class members > 255 */
4218 if (c > 255) return FALSE;
4219 #endif
4220
4221 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4222 return FALSE;
4223 }
4224
4225 /* If max == min we can continue with the main loop without the
4226 need to recurse. */
4227
4228 if (min == max) continue;
4229
4230 /* If minimizing, keep testing the rest of the expression and advancing
4231 the pointer while it matches the class. */
4232
4233 if (minimize)
4234 {
4235 for (i = min;; i++)
4236 {
4237 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4238 return TRUE;
4239 if (i >= max || eptr >= md->end_subject) return FALSE;
4240 GETCHARINC(c, eptr) /* Get character; increment eptr */
4241
4242 #ifdef SUPPORT_UTF8
4243 /* We do not yet support class members > 255 */
4244 if (c > 255) return FALSE;
4245 #endif
4246 if ((data[c/8] & (1 << (c&7))) != 0) continue;
4247 return FALSE;
4248 }
4249 /* Control never gets here */
4250 }
4251
4252 /* If maximizing, find the longest possible run, then work backwards. */
4253
4254 else
4255 {
4256 const uschar *pp = eptr;
4257 int len = 1;
4258 for (i = min; i < max; i++)
4259 {
4260 if (eptr >= md->end_subject) break;
4261 GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
4262
4263 #ifdef SUPPORT_UTF8
4264 /* We do not yet support class members > 255 */
4265 if (c > 255) break;
4266 #endif
4267 if ((data[c/8] & (1 << (c&7))) == 0) break;
4268 eptr += len;
4269 }
4270
4271 while (eptr >= pp)
4272 {
4273 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4274 return TRUE;
4275
4276 #ifdef SUPPORT_UTF8
4277 BACKCHAR(eptr)
4278 #endif
4279 }
4280 return FALSE;
4281 }
4282 }
4283 /* Control never gets here */
4284
4285 /* Match a run of characters */
4286
4287 case OP_CHARS:
4288 {
4289 register int length = ecode[1];
4290 ecode += 2;
4291
4292 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4293 if (eptr >= md->end_subject)
4294 printf("matching subject <null> against pattern ");
4295 else
4296 {
4297 printf("matching subject ");
4298 pchars(eptr, length, TRUE, md);
4299 printf(" against pattern ");
4300 }
4301 pchars(ecode, length, FALSE, md);
4302 printf("\n");
4303 #endif
4304
4305 if (length > md->end_subject - eptr) return FALSE;
4306 if ((ims & PCRE_CASELESS) != 0)
4307 {
4308 while (length-- > 0)
4309 if (md->lcc[*ecode++] != md->lcc[*eptr++])
4310 return FALSE;
4311 }
4312 else
4313 {
4314 while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4315 }
4316 }
4317 break;
4318
4319 /* Match a single character repeatedly; different opcodes share code. */
4320
4321 case OP_EXACT:
4322 min = max = (ecode[1] << 8) + ecode[2];
4323 ecode += 3;
4324 goto REPEATCHAR;
4325
4326 case OP_UPTO:
4327 case OP_MINUPTO:
4328 min = 0;
4329 max = (ecode[1] << 8) + ecode[2];
4330 minimize = *ecode == OP_MINUPTO;
4331 ecode += 3;
4332 goto REPEATCHAR;
4333
4334 case OP_STAR:
4335 case OP_MINSTAR:
4336 case OP_PLUS:
4337 case OP_MINPLUS:
4338 case OP_QUERY:
4339 case OP_MINQUERY:
4340 c = *ecode++ - OP_STAR;
4341 minimize = (c & 1) != 0;
4342 min = rep_min[c]; /* Pick up values from tables; */
4343 max = rep_max[c]; /* zero for max => infinity */
4344 if (max == 0) max = INT_MAX;
4345
4346 /* Common code for all repeated single-character matches. We can give
4347 up quickly if there are fewer than the minimum number of characters left in
4348 the subject. */
4349
4350 REPEATCHAR:
4351 if (min > md->end_subject - eptr) return FALSE;
4352 c = *ecode++;
4353
4354 /* The code is duplicated for the caseless and caseful cases, for speed,
4355 since matching characters is likely to be quite common. First, ensure the
4356 minimum number of matches are present. If min = max, continue at the same
4357 level without recursing. Otherwise, if minimizing, keep trying the rest of
4358 the expression and advancing one matching character if failing, up to the
4359 maximum. Alternatively, if maximizing, find the maximum number of
4360 characters and work backwards. */
4361
4362 DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4363 max, eptr));
4364
4365 if ((ims & PCRE_CASELESS) != 0)
4366 {
4367 c = md->lcc[c];
4368 for (i = 1; i <= min; i++)
4369 if (c != md->lcc[*eptr++]) return FALSE;
4370 if (min == max) continue;
4371 if (minimize)
4372 {
4373 for (i = min;; i++)
4374 {
4375 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4376 return TRUE;
4377 if (i >= max || eptr >= md->end_subject ||
4378 c != md->lcc[*eptr++])
4379 return FALSE;
4380 }
4381 /* Control never gets here */
4382 }
4383 else
4384 {
4385 const uschar *pp = eptr;
4386 for (i = min; i < max; i++)
4387 {
4388 if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4389 eptr++;
4390 }
4391 while (eptr >= pp)
4392 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4393 return TRUE;
4394 return FALSE;
4395 }
4396 /* Control never gets here */
4397 }
4398
4399 /* Caseful comparisons */
4400
4401 else
4402 {
4403 for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4404 if (min == max) continue;
4405 if (minimize)
4406 {
4407 for (i = min;; i++)
4408 {
4409 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4410 return TRUE;
4411 if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4412 }
4413 /* Control never gets here */
4414 }
4415 else
4416 {
4417 const uschar *pp = eptr;
4418 for (i = min; i < max; i++)
4419 {
4420 if (eptr >= md->end_subject || c != *eptr) break;
4421 eptr++;
4422 }
4423 while (eptr >= pp)
4424 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4425 return TRUE;
4426 return FALSE;
4427 }
4428 }
4429 /* Control never gets here */
4430
4431 /* Match a negated single character */
4432
4433 case OP_NOT:
4434 if (eptr >= md->end_subject) return FALSE;
4435 ecode++;
4436 if ((ims & PCRE_CASELESS) != 0)
4437 {
4438 if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4439 }
4440 else
4441 {
4442 if (*ecode++ == *eptr++) return FALSE;
4443 }
4444 break;
4445
4446 /* Match a negated single character repeatedly. This is almost a repeat of
4447 the code for a repeated single character, but I haven't found a nice way of
4448 commoning these up that doesn't require a test of the positive/negative
4449 option for each character match. Maybe that wouldn't add very much to the
4450 time taken, but character matching *is* what this is all about... */
4451
4452 case OP_NOTEXACT:
4453 min = max = (ecode[1] << 8) + ecode[2];
4454 ecode += 3;
4455 goto REPEATNOTCHAR;
4456
4457 case OP_NOTUPTO:
4458 case OP_NOTMINUPTO:
4459 min = 0;
4460 max = (ecode[1] << 8) + ecode[2];
4461 minimize = *ecode == OP_NOTMINUPTO;
4462 ecode += 3;
4463 goto REPEATNOTCHAR;
4464
4465 case OP_NOTSTAR:
4466 case OP_NOTMINSTAR:
4467 case OP_NOTPLUS:
4468 case OP_NOTMINPLUS:
4469 case OP_NOTQUERY:
4470 case OP_NOTMINQUERY:
4471 c = *ecode++ - OP_NOTSTAR;
4472 minimize = (c & 1) != 0;
4473 min = rep_min[c]; /* Pick up values from tables; */
4474 max = rep_max[c]; /* zero for max => infinity */
4475 if (max == 0) max = INT_MAX;
4476
4477 /* Common code for all repeated single-character matches. We can give
4478 up quickly if there are fewer than the minimum number of characters left in
4479 the subject. */
4480
4481 REPEATNOTCHAR:
4482 if (min > md->end_subject - eptr) return FALSE;
4483 c = *ecode++;
4484
4485 /* The code is duplicated for the caseless and caseful cases, for speed,
4486 since matching characters is likely to be quite common. First, ensure the
4487 minimum number of matches are present. If min = max, continue at the same
4488 level without recursing. Otherwise, if minimizing, keep trying the rest of
4489 the expression and advancing one matching character if failing, up to the
4490 maximum. Alternatively, if maximizing, find the maximum number of
4491 characters and work backwards. */
4492
4493 DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4494 max, eptr));
4495
4496 if ((ims & PCRE_CASELESS) != 0)
4497 {
4498 c = md->lcc[c];
4499 for (i = 1; i <= min; i++)
4500 if (c == md->lcc[*eptr++]) return FALSE;
4501 if (min == max) continue;
4502 if (minimize)
4503 {
4504 for (i = min;; i++)
4505 {
4506 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4507 return TRUE;
4508 if (i >= max || eptr >= md->end_subject ||
4509 c == md->lcc[*eptr++])
4510 return FALSE;
4511 }
4512 /* Control never gets here */
4513 }
4514 else
4515 {
4516 const uschar *pp = eptr;
4517 for (i = min; i < max; i++)
4518 {
4519 if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4520 eptr++;
4521 }
4522 while (eptr >= pp)
4523 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4524 return TRUE;
4525 return FALSE;
4526 }
4527 /* Control never gets here */
4528 }
4529
4530 /* Caseful comparisons */
4531
4532 else
4533 {
4534 for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4535 if (min == max) continue;
4536 if (minimize)
4537 {
4538 for (i = min;; i++)
4539 {
4540 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4541 return TRUE;
4542 if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4543 }
4544 /* Control never gets here */
4545 }
4546 else
4547 {
4548 const uschar *pp = eptr;
4549 for (i = min; i < max; i++)
4550 {
4551 if (eptr >= md->end_subject || c == *eptr) break;
4552 eptr++;
4553 }
4554 while (eptr >= pp)
4555 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4556 return TRUE;
4557 return FALSE;
4558 }
4559 }
4560 /* Control never gets here */
4561
4562 /* Match a single character type repeatedly; several different opcodes
4563 share code. This is very similar to the code for single characters, but we
4564 repeat it in the interests of efficiency. */
4565
4566 case OP_TYPEEXACT:
4567 min = max = (ecode[1] << 8) + ecode[2];
4568 minimize = TRUE;
4569 ecode += 3;
4570 goto REPEATTYPE;
4571
4572 case OP_TYPEUPTO:
4573 case OP_TYPEMINUPTO:
4574 min = 0;
4575 max = (ecode[1] << 8) + ecode[2];
4576 minimize = *ecode == OP_TYPEMINUPTO;
4577 ecode += 3;
4578 goto REPEATTYPE;
4579
4580 case OP_TYPESTAR:
4581 case OP_TYPEMINSTAR:
4582 case OP_TYPEPLUS:
4583 case OP_TYPEMINPLUS:
4584 case OP_TYPEQUERY:
4585 case OP_TYPEMINQUERY:
4586 c = *ecode++ - OP_TYPESTAR;
4587 minimize = (c & 1) != 0;
4588 min = rep_min[c]; /* Pick up values from tables; */
4589 max = rep_max[c]; /* zero for max => infinity */
4590 if (max == 0) max = INT_MAX;
4591
4592 /* Common code for all repeated single character type matches */
4593
4594 REPEATTYPE:
4595 ctype = *ecode++; /* Code for the character type */
4596
4597 /* First, ensure the minimum number of matches are present. Use inline
4598 code for maximizing the speed, and do the type test once at the start
4599 (i.e. keep it out of the loop). Also we can test that there are at least
4600 the minimum number of bytes before we start, except when doing '.' in
4601 UTF8 mode. Leave the test in in all cases; in the special case we have
4602 to test after each character. */
4603
4604 if (min > md->end_subject - eptr) return FALSE;
4605 if (min > 0) switch(ctype)
4606 {
4607 case OP_ANY:
4608 #ifdef SUPPORT_UTF8
4609 if (md->utf8)
4610 {
4611 for (i = 1; i <= min; i++)
4612 {
4613 if (eptr >= md->end_subject ||
4614 (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4615 return FALSE;
4616 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4617 }
4618 break;
4619 }
4620 #endif
4621 /* Non-UTF8 can be faster */
4622 if ((ims & PCRE_DOTALL) == 0)
4623 { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4624 else eptr += min;
4625 break;
4626
4627 case OP_NOT_DIGIT:
4628 for (i = 1; i <= min; i++)
4629 if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4630 break;
4631
4632 case OP_DIGIT:
4633 for (i = 1; i <= min; i++)
4634 if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4635 break;
4636
4637 case OP_NOT_WHITESPACE:
4638 for (i = 1; i <= min; i++)
4639 if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4640 break;
4641
4642 case OP_WHITESPACE:
4643 for (i = 1; i <= min; i++)
4644 if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4645 break;
4646
4647 case OP_NOT_WORDCHAR:
4648 for (i = 1; i <= min; i++)
4649 if ((md->ctypes[*eptr++] & ctype_word) != 0)
4650 return FALSE;
4651 break;
4652
4653 case OP_WORDCHAR:
4654 for (i = 1; i <= min; i++)
4655 if ((md->ctypes[*eptr++] & ctype_word) == 0)
4656 return FALSE;
4657 break;
4658 }
4659
4660 /* If min = max, continue at the same level without recursing */
4661
4662 if (min == max) continue;
4663
4664 /* If minimizing, we have to test the rest of the pattern before each
4665 subsequent match. */
4666
4667 if (minimize)
4668 {
4669 for (i = min;; i++)
4670 {
4671 if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4672 if (i >= max || eptr >= md->end_subject) return FALSE;
4673
4674 c = *eptr++;
4675 switch(ctype)
4676 {
4677 case OP_ANY:
4678 if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4679 #ifdef SUPPORT_UTF8
4680 if (md->utf8)
4681 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4682 #endif
4683 break;
4684
4685 case OP_NOT_DIGIT:
4686 if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4687 break;
4688
4689 case OP_DIGIT:
4690 if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4691 break;
4692
4693 case OP_NOT_WHITESPACE:
4694 if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4695 break;
4696
4697 case OP_WHITESPACE:
4698 if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4699 break;
4700
4701 case OP_NOT_WORDCHAR:
4702 if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4703 break;
4704
4705 case OP_WORDCHAR:
4706 if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4707 break;
4708 }
4709 }
4710 /* Control never gets here */
4711 }
4712
4713 /* If maximizing it is worth using inline code for speed, doing the type
4714 test once at the start (i.e. keep it out of the loop). */
4715
4716 else
4717 {
4718 const uschar *pp = eptr;
4719 switch(ctype)
4720 {
4721 case OP_ANY:
4722
4723 /* Special code is required for UTF8, but when the maximum is unlimited
4724 we don't need it. */
4725
4726 #ifdef SUPPORT_UTF8
4727 if (md->utf8 && max < INT_MAX)
4728 {
4729 if ((ims & PCRE_DOTALL) == 0)
4730 {
4731 for (i = min; i < max; i++)
4732 {
4733 if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4734 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4735 }
4736 }
4737 else
4738 {
4739 for (i = min; i < max; i++)
4740 {
4741 eptr++;
4742 while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743 }
4744 }
4745 break;
4746 }
4747 #endif
4748 /* Non-UTF8 can be faster */
4749 if ((ims & PCRE_DOTALL) == 0)
4750 {
4751 for (i = min; i < max; i++)
4752 {
4753 if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4754 eptr++;
4755 }
4756 }
4757 else
4758 {
4759 c = max - min;
4760 if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4761 eptr += c;
4762 }
4763 break;
4764
4765 case OP_NOT_DIGIT:
4766 for (i = min; i < max; i++)
4767 {
4768 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4769 break;
4770 eptr++;
4771 }
4772 break;
4773
4774 case OP_DIGIT:
4775 for (i = min; i < max; i++)
4776 {
4777 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4778 break;
4779 eptr++;
4780 }
4781 break;
4782
4783 case OP_NOT_WHITESPACE:
4784 for (i = min; i < max; i++)
4785 {
4786 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4787 break;
4788 eptr++;
4789 }
4790 break;
4791
4792 case OP_WHITESPACE:
4793 for (i = min; i < max; i++)
4794 {
4795 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4796 break;
4797 eptr++;
4798 }
4799 break;
4800
4801 case OP_NOT_WORDCHAR:
4802 for (i = min; i < max; i++)
4803 {
4804 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4805 break;
4806 eptr++;
4807 }
4808 break;
4809
4810 case OP_WORDCHAR:
4811 for (i = min; i < max; i++)
4812 {
4813 if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4814 break;
4815 eptr++;
4816 }
4817 break;
4818 }
4819
4820 while (eptr >= pp)
4821 {
4822 if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4823 return TRUE;
4824 #ifdef SUPPORT_UTF8
4825 if (md->utf8)
4826 while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4827 #endif
4828 }
4829 return FALSE;
4830 }
4831 /* Control never gets here */
4832
4833 /* There's been some horrible disaster. */
4834
4835 default:
4836 DPRINTF(("Unknown opcode %d\n", *ecode));
4837 md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4838 return FALSE;
4839 }
4840
4841 /* Do not stick any code in here without much thought; it is assumed
4842 that "continue" in the code above comes out to here to repeat the main
4843 loop. */
4844
4845 } /* End of main loop */
4846 /* Control never reaches here */
4847 }
4848
4849
4850
4851
4852 /*************************************************
4853 * Execute a Regular Expression *
4854 *************************************************/
4855
4856 /* This function applies a compiled re to a subject string and picks out
4857 portions of the string if it matches. Two elements in the vector are set for
4858 each substring: the offsets to the start and end of the substring.
4859
4860 Arguments:
4861 external_re points to the compiled expression
4862 external_extra points to "hints" from pcre_study() or is NULL
4863 subject points to the subject string
4864 length length of subject string (may contain binary zeros)
4865 start_offset where to start in the subject string
4866 options option bits
4867 offsets points to a vector of ints to be filled in with offsets
4868 offsetcount the number of elements in the vector
4869
4870 Returns: > 0 => success; value is the number of elements filled in
4871 = 0 => success, but offsets is not big enough
4872 -1 => failed to match
4873 < -1 => some kind of unexpected problem
4874 */
4875
4876 int
4877 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4878 const char *subject, int length, int start_offset, int options, int *offsets,
4879 int offsetcount)
4880 {
4881 int resetcount, ocount;
4882 int first_char = -1;
4883 int req_char = -1;
4884 int req_char2 = -1;
4885 unsigned long int ims = 0;
4886 match_data match_block;
4887 const uschar *start_bits = NULL;
4888 const uschar *start_match = (const uschar *)subject + start_offset;
4889 const uschar *end_subject;
4890 const uschar *req_char_ptr = start_match - 1;
4891 const real_pcre *re = (const real_pcre *)external_re;
4892 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4893 BOOL using_temporary_offsets = FALSE;
4894 BOOL anchored;
4895 BOOL startline;
4896
4897 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4898
4899 if (re == NULL || subject == NULL ||
4900 (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4901 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4902
4903 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4904 startline = (re->options & PCRE_STARTLINE) != 0;
4905
4906 match_block.start_pattern = re->code;
4907 match_block.start_subject = (const uschar *)subject;
4908 match_block.end_subject = match_block.start_subject + length;
4909 end_subject = match_block.end_subject;
4910
4911 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4912 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4913
4914 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4915 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4916 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4917
4918 match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
4919
4920 match_block.lcc = re->tables + lcc_offset;
4921 match_block.ctypes = re->tables + ctypes_offset;
4922
4923 /* The ims options can vary during the matching as a result of the presence
4924 of (?ims) items in the pattern. They are kept in a local variable so that
4925 restoring at the exit of a group is easy. */
4926
4927 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4928
4929 /* If the expression has got more back references than the offsets supplied can
4930 hold, we get a temporary bit of working store to use during the matching.
4931 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4932 of 3. */
4933
4934 ocount = offsetcount - (offsetcount % 3);
4935
4936 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4937 {
4938 ocount = re->top_backref * 3 + 3;
4939 match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4940 if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4941 using_temporary_offsets = TRUE;
4942 DPRINTF(("Got memory to hold back references\n"));
4943 }
4944 else match_block.offset_vector = offsets;
4945
4946 match_block.offset_end = ocount;
4947 match_block.offset_max = (2*ocount)/3;
4948 match_block.offset_overflow = FALSE;
4949
4950 /* Compute the minimum number of offsets that we need to reset each time. Doing
4951 this makes a huge difference to execution time when there aren't many brackets
4952 in the pattern. */
4953
4954 resetcount = 2 + re->top_bracket * 2;
4955 if (resetcount > offsetcount) resetcount = ocount;
4956
4957 /* Reset the working variable associated with each extraction. These should
4958 never be used unless previously set, but they get saved and restored, and so we
4959 initialize them to avoid reading uninitialized locations. */
4960
4961 if (match_block.offset_vector != NULL)
4962 {
4963 register int *iptr = match_block.offset_vector + ocount;
4964 register int *iend = iptr - resetcount/2 + 1;
4965 while (--iptr >= iend) *iptr = -1;
4966 }
4967
4968 /* Set up the first character to match, if available. The first_char value is
4969 never set for an anchored regular expression, but the anchoring may be forced
4970 at run time, so we have to test for anchoring. The first char may be unset for
4971 an unanchored pattern, of course. If there's no first char and the pattern was
4972 studied, there may be a bitmap of possible first characters. */
4973
4974 if (!anchored)
4975 {
4976 if ((re->options & PCRE_FIRSTSET) != 0)
4977 {
4978 first_char = re->first_char;
4979 if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4980 }
4981 else
4982 if (!startline && extra != NULL &&
4983 (extra->options & PCRE_STUDY_MAPPED) != 0)
4984 start_bits = extra->start_bits;
4985 }
4986
4987 /* For anchored or unanchored matches, there may be a "last known required
4988 character" set. If the PCRE_CASELESS is set, implying that the match starts
4989 caselessly, or if there are any changes of this flag within the regex, set up
4990 both cases of the character. Otherwise set the two values the same, which will
4991 avoid duplicate testing (which takes significant time). This covers the vast
4992 majority of cases. It will be suboptimal when the case flag changes in a regex
4993 and the required character in fact is caseful. */
4994
4995 if ((re->options & PCRE_REQCHSET) != 0)
4996 {
4997 req_char = re->req_char;
4998 req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4999 (re->tables + fcc_offset)[req_char] : req_char;
5000 }
5001
5002 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5003 the loop runs just once. */
5004
5005 do
5006 {
5007 int rc;
5008 register int *iptr = match_block.offset_vector;
5009 register int *iend = iptr + resetcount;
5010
5011 /* Reset the maximum number of extractions we might see. */
5012
5013 while (iptr < iend) *iptr++ = -1;
5014
5015 /* Advance to a unique first char if possible */
5016
5017 if (first_char >= 0)
5018 {
5019 if ((ims & PCRE_CASELESS) != 0)
5020 while (start_match < end_subject &&
5021 match_block.lcc[*start_match] != first_char)
5022 start_match++;
5023 else
5024 while (start_match < end_subject && *start_match != first_char)
5025 start_match++;
5026 }
5027
5028 /* Or to just after \n for a multiline match if possible */
5029
5030 else if (startline)
5031 {
5032 if (start_match > match_block.start_subject + start_offset)
5033 {
5034 while (start_match < end_subject && start_match[-1] != NEWLINE)
5035 start_match++;
5036 }
5037 }
5038
5039 /* Or to a non-unique first char after study */
5040
5041 else if (start_bits != NULL)
5042 {
5043 while (start_match < end_subject)
5044 {
5045 register int c = *start_match;
5046 if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5047 }
5048 }
5049
5050 #ifdef DEBUG /* Sigh. Some compilers never learn. */
5051 printf(">>>> Match against: ");
5052 pchars(start_match, end_subject - start_match, TRUE, &match_block);
5053 printf("\n");
5054 #endif
5055
5056 /* If req_char is set, we know that that character must appear in the subject
5057 for the match to succeed. If the first character is set, req_char must be
5058 later in the subject; otherwise the test starts at the match point. This
5059 optimization can save a huge amount of backtracking in patterns with nested
5060 unlimited repeats that aren't going to match. We don't know what the state of
5061 case matching may be when this character is hit, so test for it in both its
5062 cases if necessary. However, the different cased versions will not be set up
5063 unless PCRE_CASELESS was given or the casing state changes within the regex.
5064 Writing separate code makes it go faster, as does using an autoincrement and
5065 backing off on a match. */
5066
5067 if (req_char >= 0)
5068 {
5069 register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5070
5071 /* We don't need to repeat the search if we haven't yet reached the
5072 place we found it at last time. */
5073
5074 if (p > req_char_ptr)
5075 {
5076 /* Do a single test if no case difference is set up */
5077
5078 if (req_char == req_char2)
5079 {
5080 while (p < end_subject)
5081 {
5082 if (*p++ == req_char) { p--; break; }
5083 }
5084 }
5085
5086 /* Otherwise test for either case */
5087
5088 else
5089 {
5090 while (p < end_subject)
5091 {
5092 register int pp = *p++;
5093 if (pp == req_char || pp == req_char2) { p--; break; }
5094 }
5095 }
5096
5097 /* If we can't find the required character, break the matching loop */
5098
5099 if (p >= end_subject) break;
5100
5101 /* If we have found the required character, save the point where we
5102 found it, so that we don't search again next time round the loop if
5103 the start hasn't passed this character yet. */
5104
5105 req_char_ptr = p;
5106 }
5107 }
5108
5109 /* When a match occurs, substrings will be set for all internal extractions;
5110 we just need to set up the whole thing as substring 0 before returning. If
5111 there were too many extractions, set the return code to zero. In the case
5112 where we had to get some local store to hold offsets for backreferences, copy
5113 those back references that we can. In this case there need not be overflow
5114 if certain parts of the pattern were not used. */
5115
5116 match_block.start_match = start_match;
5117 if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5118 continue;
5119
5120 /* Copy the offset information from temporary store if necessary */
5121
5122 if (using_temporary_offsets)
5123 {
5124 if (offsetcount >= 4)
5125 {
5126 memcpy(offsets + 2, match_block.offset_vector + 2,
5127 (offsetcount - 2) * sizeof(int));
5128 DPRINTF(("Copied offsets from temporary memory\n"));
5129 }
5130 if (match_block.end_offset_top > offsetcount)
5131 match_block.offset_overflow = TRUE;
5132
5133 DPRINTF(("Freeing temporary memory\n"));
5134 (pcre_free)(match_block.offset_vector);
5135 }
5136
5137 rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5138
5139 if (offsetcount < 2) rc = 0; else
5140 {
5141 offsets[0] = start_match - match_block.start_subject;
5142 offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5143 }
5144
5145 DPRINTF((">>>> returning %d\n", rc));
5146 return rc;
5147 }
5148
5149 /* This "while" is the end of the "do" above */
5150
5151 while (!anchored &&
5152 match_block.errorcode == PCRE_ERROR_NOMATCH &&
5153 start_match++ < end_subject);
5154
5155 if (using_temporary_offsets)
5156 {
5157 DPRINTF(("Freeing temporary memory\n"));
5158 (pcre_free)(match_block.offset_vector);
5159 }
5160
5161 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5162
5163 return match_block.errorcode;
5164 }
5165
5166 /* End of pcre.c */
5167 |
This page was automatically generated by the
LXR engine.
Visit the LXR main site for more information. |