NCBI C Toolkit Cross Reference

C/regexp/pcre.c


  1 /*************************************************
  2 *      Perl-Compatible Regular Expressions       *
  3 *************************************************/
  4 
  5 /*
  6 This is a library of functions to support regular expressions whose syntax
  7 and semantics are as close as possible to those of the Perl 5 language. See
  8 the file Tech.Notes for some information on the internals.
  9 
 10 Written by: Philip Hazel <ph10@cam.ac.uk>
 11 
 12            Copyright (c) 1997-2001 University of Cambridge
 13 
 14 -----------------------------------------------------------------------------
 15 Permission is granted to anyone to use this software for any purpose on any
 16 computer system, and to redistribute it freely, subject to the following
 17 restrictions:
 18 
 19 1. This software is distributed in the hope that it will be useful,
 20    but WITHOUT ANY WARRANTY; without even the implied warranty of
 21    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 22 
 23 2. The origin of this software must not be misrepresented, either by
 24    explicit claim or by omission.
 25 
 26 3. Altered versions must be plainly marked as such, and must not be
 27    misrepresented as being the original software.
 28 
 29 4. If PCRE is embedded in any software that is released under the GNU
 30    General Purpose Licence (GPL), then the terms of that licence shall
 31    supersede any condition above with which it is incompatible.
 32 -----------------------------------------------------------------------------
 33 */
 34 
 35 /* Use a macro for debugging printing */
 36 
 /* DPRINTF takes one fully parenthesized printf argument list, for example
 DPRINTF(("n=%d\n", n)). It expands to a printf call when _DEBUG is defined and
 to nothing otherwise. NOTE(review): the other debug-only code in this file is
 guarded by DEBUG rather than _DEBUG - confirm that the difference is intended. */
 37 #if defined(_DEBUG)
 38 #  define DPRINTF(p) printf p
 39 #else
 40 #  define DPRINTF(p) /*nothing*/
 41 #endif
 42 
 43 /* Include the internals header, which itself includes Standard C headers plus
 44 the external pcre header. */
 45 
 46 #include "pcre_internal.h"
 47 
 48 /* Allow compilation as C++ source code, should anybody want to do that. */
 49 
 50 #ifdef __cplusplus
 51 #define class pcre_class
 52 #endif
 53 
 54 
 55 /* Maximum number of items on the nested bracket stacks at compile time. This
 56 applies to the nesting of all kinds of parentheses. It does not limit
 57 un-nested, non-capturing parentheses. This number can be made bigger if
 58 necessary - it is used to dimension one int and one unsigned char vector at
 59 compile time. */
 60 
 61 #define BRASTACK_SIZE 200   /* nesting limit for all kinds of parentheses */
 62 
 63 
 64 /* The number of bytes in a literal character string above which we can't add
 65 any more is different when UTF-8 characters may be encountered. */
 66 
 67 #ifdef SUPPORT_UTF8
 68 #define MAXLIT 250   /* lower cap used when UTF-8 support is compiled in */
 69 #else
 70 #define MAXLIT 255   /* cap for byte-only builds */
 71 #endif
 72 
 73 
 74 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
 75 
 /* Parallel six-entry tables: rep_min[n] and rep_max[n] describe the same
 repeat. Presumably indexed in the order *, *?, +, +?, ?, ?? - TODO confirm
 against the code that reads them. */
 76 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };   /* minimum counts */
 77 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };   /* maximum counts; 0 => no upper limit */
 78 
 79 /* Text forms of OP_ values and things, for debugging (not all used) */
 80 
 /* Printable opcode names, compiled only when DEBUG is defined. Presumably the
 entries follow the numeric order of the OP_ values declared in the internal
 header - TODO confirm before adding or reordering opcodes. */
 81 #ifdef DEBUG
 82 static const char *OP_names[] = {
 83   "End", "\\A", "\\B", "\\b", "\\D", "\\d",
 84   "\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
 85   "Opt", "^", "$", "Any", "chars", "not",
 86   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 87   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 88   "*", "*?", "+", "+?", "?", "??", "{", "{", "{",
 89   "*", "*?", "+", "+?", "?", "??", "{", "{",
 90   "class", "Ref", "Recurse",
 91   "Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
 92   "AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
 93   "Brazero", "Braminzero", "Branumber", "Bra"
 94 };
 95 #endif
 96 
 97 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 98 are simple data values; negative values are for special things like \d and so
 99 on. Zero means further processing is needed (for things like \x), or the escape
100 is invalid. */
101 
/* Indexed by (character - '0'), as done in check_escape(); the trailing
comment on each row names the eight characters that row covers. */
102 static const short int escapes[] = {
103     0,      0,      0,      0,      0,      0,      0,      0,   /* 0 - 7 */
104     0,      0,    ':',    ';',    '<',    '=',    '>',    '?',   /* 8 - ? */
105   '@', -ESC_A, -ESC_B,      0, -ESC_D,      0,      0,      0,   /* @ - G */
106     0,      0,      0,      0,      0,      0,      0,      0,   /* H - O */
107     0,      0,      0, -ESC_S,      0,      0,      0, -ESC_W,   /* P - W */
108     0,      0, -ESC_Z,    '[',   '\\',    ']',    '^',    '_',   /* X - _ */
109   '`',      7, -ESC_b,      0, -ESC_d,  ESC_E,  ESC_F,      0,   /* ` - g */
110     0,      0,      0,      0,      0,      0,  ESC_N,      0,   /* h - o */
111     0,      0,  ESC_R, -ESC_s,  ESC_T,      0,      0, -ESC_w,   /* p - w */
112     0,      0, -ESC_z                                            /* x - z */
113 };
114 
115 /* Tables of names of POSIX character classes and their lengths. The list is
116 terminated by a zero length entry. The first three must be alpha, lower, upper,
117 as this is assumed for handling case independence. */
118 
/* posix_name_lengths[n] is the length of posix_names[n]. The lengths table has
one extra trailing 0 entry, which is what stops the scan in
check_posix_name(); posix_names itself has no terminator. */
119 static const char *posix_names[] = {
120   "alpha", "lower", "upper",
121   "alnum", "ascii", "cntrl", "digit", "graph",
122   "print", "punct", "space", "word",  "xdigit" };
123 
124 static const uschar posix_name_lengths[] = {
125   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
126 
127 /* Table of class bit maps for each POSIX class; up to three may be combined
128 to form the class. */
129 
/* Three entries per class, rows in the same order as posix_names (see the
trailing comment on each row). Presumably -1 marks an unused slot - TODO
confirm against the code that reads this table. */
130 static const int posix_class_maps[] = {
131   cbit_lower, cbit_upper, -1,             /* alpha */
132   cbit_lower, -1,         -1,             /* lower */
133   cbit_upper, -1,         -1,             /* upper */
134   cbit_digit, cbit_lower, cbit_upper,     /* alnum */
135   cbit_print, cbit_cntrl, -1,             /* ascii */
136   cbit_cntrl, -1,         -1,             /* cntrl */
137   cbit_digit, -1,         -1,             /* digit */
138   cbit_graph, -1,         -1,             /* graph */
139   cbit_print, -1,         -1,             /* print */
140   cbit_punct, -1,         -1,             /* punct */
141   cbit_space, -1,         -1,             /* space */
142   cbit_word,  -1,         -1,             /* word */
143   cbit_xdigit,-1,         -1              /* xdigit */
144 };
145 
146 
147 /* Definition to allow mutual recursion */
148 
/* Forward declaration only, so that functions defined earlier in the file can
call compile_regex(). */
149 static BOOL
150   compile_regex(int, int, int *, uschar **, const uschar **, const char **,
151     BOOL, int, int *, int *, compile_data *);
152 
153 /* Structure for building a chain of data that actually lives on the
154 stack, for holding the values of the subject pointer at the start of each
155 subpattern, so as to detect when an empty string has been matched by a
156 subpattern - to break infinite loops. */
157 
158 typedef struct eptrblock {
159   struct eptrblock *prev;        /* link to the previous block in the chain */
160   const uschar *saved_eptr;      /* saved subject pointer value */
161 } eptrblock;
162 
163 /* Flag bits for the match() function */
164 
/* The two flags occupy distinct bits, so they can be OR-ed together. */
165 #define match_condassert   0x01    /* Called to check a condition assertion */
166 #define match_isgroup      0x02    /* Set if start of bracketed group */
167 
168 
169 
170 /*************************************************
171 *               Global variables                 *
172 *************************************************/
173 
174 /* PCRE is thread-clean and doesn't use any global variables in the normal
175 sense. However, it calls memory allocation and free functions via the two
176 indirections below, which can be changed by the caller, but are shared
177 between all threads. */
178 
/* Both pointers are initialized to the C library's malloc and free. */
179 void *(*pcre_malloc)(size_t) = malloc;
180 void  (*pcre_free)(void *) = free;
181 
182 
183 
184 /*************************************************
185 *    Macros and tables for character handling    *
186 *************************************************/
187 
188 /* When UTF-8 encoding is being used, a character is no longer just a single
189 byte. The macros for character handling generate simple sequences when used in
190 byte-mode, and more complicated ones for UTF-8 characters. */
191 
/* Usage notes for all three macros: they expand to bare statements (there is
no do { } while (0) wrapper) and mention their arguments more than once, so
use them only as stand-alone statements with side-effect-free arguments. The
UTF-8 versions of GETCHARINC and GETCHARLEN also read a variable called md
from the calling scope. In byte mode GETCHARLEN does not assign len and
BACKCHAR expands to nothing. */
192 #ifndef SUPPORT_UTF8
193 #define GETCHARINC(c, eptr) c = *eptr++;
194 #define GETCHARLEN(c, eptr, len) c = *eptr;
195 #define BACKCHAR(eptr)
196 
197 #else   /* SUPPORT_UTF8 */
198 
199 /* Get the next UTF-8 character, advancing the pointer */
200 
201 #define GETCHARINC(c, eptr) \
202   c = *eptr++; \
203   if (md->utf8 && (c & 0xc0) == 0xc0) \
204     { \
205     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
206     int s = 6*a; \
207     c = (c & utf8_table3[a]) << s; \
208     while (a-- > 0) \
209       { \
210       s -= 6; \
211       c |= (*eptr++ & 0x3f) << s; \
212       } \
213     }
214 
215 /* Get the next UTF-8 character, not advancing the pointer, setting length */
216 
217 #define GETCHARLEN(c, eptr, len) \
218   c = *eptr; \
219   len = 1; \
220   if (md->utf8 && (c & 0xc0) == 0xc0) \
221     { \
222     int i; \
223     int a = utf8_table4[c & 0x3f];  /* Number of additional bytes */ \
224     int s = 6*a; \
225     c = (c & utf8_table3[a]) << s; \
226     for (i = 1; i <= a; i++) \
227       { \
228       s -= 6; \
229       c |= (eptr[i] & 0x3f) << s; \
230       } \
231     len += a; \
232     }
233 
234 /* If the pointer is not at the start of a character, move it back until
235 it is. */
236 
237 #define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
238 
239 #endif
240 
241 
242 
243 /*************************************************
244 *             Default character tables           *
245 *************************************************/
246 
247 /* A default set of character tables is included in the PCRE binary. Its source
248 is built by the maketables auxiliary program, which uses the default C ctypes
249 functions, and put in the file chartables.c. These tables are used by PCRE
250 whenever the caller of pcre_compile() does not provide an alternate set of
251 tables. */
252 
253 #include "chartables.c"
254 
255 
256 
257 #ifdef SUPPORT_UTF8
258 /*************************************************
259 *           Tables for UTF-8 support             *
260 *************************************************/
261 
262 /* These are the breakpoints for different numbers of bytes in a UTF-8
263 character. */
264 
/* Read-only lookup table: utf8_table1[n] is the largest character value that
fits in a UTF-8 sequence of n+1 bytes. Declared const like the other tables in
this file, since nothing writes to it. */

static const int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};

/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes. Both
tables are read-only. */

static const int utf8_table2[] = { 0,    0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static const int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
272 
273 /* Table of the number of extra characters, indexed by the first character
274 masked with 0x3f. The highest number for a valid UTF-8 character is in fact
275 0x3d. */
276 
277 static uschar utf8_table4[] = {
278   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
279   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
280   2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
281   3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
282 
283 
284 /*************************************************
285 *       Convert character value to UTF-8         *
286 *************************************************/
287 
288 /* This function takes an integer value in the range 0 - 0x7fffffff
289 and encodes it as a UTF-8 character in 1 to 6 bytes.
290 
291 Arguments:
292   cvalue     the character value
293   buffer     pointer to buffer for result - at least 6 bytes long
294 
295 Returns:     number of bytes placed in the buffer
296 */
297 
298 static int
299 ord2utf8(int cvalue, uschar *buffer)
300 {
301 register int i, j;
302 for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
303   if (cvalue <= utf8_table1[i]) break;
304 buffer += i;
305 for (j = i; j > 0; j--)
306  {
307  *buffer-- = 0x80 | (cvalue & 0x3f);
308  cvalue >>= 6;
309  }
310 *buffer = utf8_table2[i] | cvalue;
311 return i + 1;
312 }
313 #endif
314 
315 
316 
317 /*************************************************
318 *          Return version string                 *
319 *************************************************/
320 
/* Two-level stringification, so that the version macros are expanded before
being turned into string literals. */
#define STRING(a)  #a
#define XSTRING(s) STRING(s)

/* Returns a pointer to a static string of the form "<major>.<minor> <date>". */
const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
329 
330 
331 
332 
333 /*************************************************
334 * (Obsolete) Return info about compiled pattern  *
335 *************************************************/
336 
337 /* This is the original "info" function. It picks potentially useful data out
338 of the private structure, but its interface was too rigid. It remains for
339 backwards compatibility. The public options are passed back in an int - though
340 the re->options field has been expanded to a long int, all the public options
341 are at the low end of it, and so even on 16-bit systems this will still be OK.
342 Therefore, I haven't changed the API for pcre_info().
343 
344 Arguments:
345   external_re   points to compiled code
346   optptr        where to pass back the options
347   first_char    where to pass back the first character,
348                 or -1 if multiline and all branches start ^,
349                 or -2 otherwise
350 
351 Returns:        number of capturing subpatterns
352                 or negative values on error
353 */
354 
355 int
356 pcre_info(const pcre *external_re, int *optptr, int *first_char)
357 {
358 const real_pcre *re = (const real_pcre *)external_re;
359 if (re == NULL) return PCRE_ERROR_NULL;
360 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
361 if (optptr != NULL) *optptr = (int)(re->options & PUBLIC_OPTIONS);
362 if (first_char != NULL)
363   *first_char = ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
364      ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
365 return re->top_bracket;
366 }
367 
368 
369 
370 /*************************************************
371 *        Return info about compiled pattern      *
372 *************************************************/
373 
374 /* This is a newer "info" function which has an extensible interface so
375 that additional items can be added compatibly.
376 
377 Arguments:
378   external_re      points to compiled code
379   external_study   points to study data, or NULL
380   what             what information is required
381   where            where to put the information
382 
383 Returns:           0 if data returned, negative on error
384 */
385 
386 int
387 pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
388   void *where)
389 {
390 const real_pcre *re = (const real_pcre *)external_re;
391 const real_pcre_extra *study = (const real_pcre_extra *)study_data;
392 
393 if (re == NULL || where == NULL) return PCRE_ERROR_NULL;
394 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
395 
396 switch (what)
397   {
398   case PCRE_INFO_OPTIONS:
399   *((unsigned long int *)where) = re->options & PUBLIC_OPTIONS;
400   break;
401 
402   case PCRE_INFO_SIZE:
403   *((size_t *)where) = re->size;
404   break;
405 
406   case PCRE_INFO_CAPTURECOUNT:
407   *((int *)where) = re->top_bracket;
408   break;
409 
410   case PCRE_INFO_BACKREFMAX:
411   *((int *)where) = re->top_backref;
412   break;
413 
414   case PCRE_INFO_FIRSTCHAR:
415   *((int *)where) =
416     ((re->options & PCRE_FIRSTSET) != 0)? re->first_char :
417     ((re->options & PCRE_STARTLINE) != 0)? -1 : -2;
418   break;
419 
420   case PCRE_INFO_FIRSTTABLE:
421   *((const uschar **)where) =
422     (study != NULL && (study->options & PCRE_STUDY_MAPPED) != 0)?
423       study->start_bits : NULL;
424   break;
425 
426   case PCRE_INFO_LASTLITERAL:
427   *((int *)where) =
428     ((re->options & PCRE_REQCHSET) != 0)? re->req_char : -1;
429   break;
430 
431   default: return PCRE_ERROR_BADOPTION;
432   }
433 
434 return 0;
435 }
436 
437 
438 
439 #ifdef DEBUG
440 /*************************************************
441 *        Debugging function to print chars       *
442 *************************************************/
443 
444 /* Print a sequence of chars in printable format, stopping at the end of the
445 subject if requested.
446 
447 Arguments:
448   p           points to characters
449   length      number to print
450   is_subject  TRUE if printing from within md->start_subject
451   md          pointer to matching data block, if is_subject is TRUE
452 
453 Returns:     nothing
454 */
455 
456 static void
457 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
458 {
459 int c;
460 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
461 while (length-- > 0)
462   if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
463 }
464 #endif
465 
466 
467 
468 
469 /*************************************************
470 *            Handle escapes                      *
471 *************************************************/
472 
473 /* This function is called when a \ has been encountered. It either returns a
474 positive value for a simple escape such as \n, or a negative value which
475 encodes one of the more complicated things such as \d. When UTF-8 is enabled,
476 a positive value greater than 255 may be returned. On entry, ptr is pointing at
477 the \. On exit, it is on the final character of the escape sequence.
478 
479 Arguments:
480   ptrptr     points to the pattern position pointer
481   errorptr   points to the pointer to the error message
482   bracount   number of previous extracting brackets
483   options    the options bits
484   isclass    TRUE if inside a character class
485   cd         pointer to char tables block
486 
487 Returns:     zero or positive => a data character
488              negative => a special escape sequence
489              on error, errorptr is set
490 */
491 
/* Decode the escape sequence that starts at the backslash *ptrptr points to.
On return *ptrptr has been moved to the last character of the sequence, except
on the early error return in the \c case, which leaves it unchanged.

NOTE(review): the decimal, octal and hex accumulations below use plain int
arithmetic with no range check while the digits are being read, so a very long
run of digits can overflow before any later test is applied. */
492 static int
493 check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
494   int options, BOOL isclass, compile_data *cd)
495 {
496 const uschar *ptr = *ptrptr;
497 int c, i;
498 
499 /* If backslash is at the end of the pattern, it's an error. */
500 
501 c = *(++ptr);
502 if (c == 0) *errorptr = ERR1;
503 
504 /* Digits or letters may have special meaning; all others are literals. */
505 
506 else if (c < '0' || c > 'z') {}
507 
508 /* Do an initial lookup in a table. A non-zero result is something that can be
509 returned immediately. Otherwise further processing may be required. Note that
this test leaves i equal to zero whenever control reaches the final else; the
digit-counting loops below (i++ < 2) depend on that. */
510 
511 else if ((i = escapes[c - '0']) != 0) c = i;
512 
513 /* Escapes that need further processing, or are illegal. */
514 
515 else
516   {
517   const uschar *oldptr;
518   switch (c)
519     {
520     /* The handling of escape sequences consisting of a string of digits
521     starting with one that is not zero is not straightforward. By experiment,
522     the way Perl works seems to be as follows:
523 
524     Outside a character class, the digits are read as a decimal number. If the
525     number is less than 10, or if there are that many previous extracting
526     left brackets, then it is a back reference. Otherwise, up to three octal
527     digits are read to form an escaped byte. Thus \123 is likely to be octal
528     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
529     value is greater than 377, the least significant 8 bits are taken. Inside a
530     character class, \ followed by a digit is always an octal number. */
531 
532     case '1': case '2': case '3': case '4': case '5':
533     case '6': case '7': case '8': case '9':
534 
535     if (!isclass)
536       {
537       oldptr = ptr;
538       c -= '0';
539       while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
540         c = c * 10 + *(++ptr) - '0';
541       if (c < 10 || c <= bracount)
542         {
543         c = -(ESC_REF + c);
544         break;
545         }
546       ptr = oldptr;      /* Put the pointer back and fall through */
547       }
548 
549     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
550     generates a binary zero byte and treats the digit as a following literal.
551     Thus we have to pull back the pointer by one. */
552 
553     if ((c = *ptr) >= '8')
554       {
555       ptr--;
556       c = 0;
557       break;
558       }
559 
560     /* \0 always starts an octal number, but we may drop through to here with a
561     larger first octal digit. At most two further octal digits are read. */
562 
563     case '0':
564     c -= '0';
565     while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
566       ptr[1] != '8' && ptr[1] != '9')
567         c = c * 8 + *(++ptr) - '0';
568     c &= 255;     /* Take least significant 8 bits */
569     break;
570 
571     /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
572     which can be greater than 0xff, but only if the ddd are hex digits. In the
    hex arithmetic, 'W' is 'a' - 10 in ASCII, so subtracting it from a
    lower-case hex letter gives 10 to 15; cd->lcc presumably maps the letter to
    lower case first - TODO confirm. */
573 
574     case 'x':
575 #ifdef SUPPORT_UTF8
576     if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
577       {
578       const uschar *pt = ptr + 2;
579       register int count = 0;
580       c = 0;
581       while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
582         {
583         count++;
584         c = c * 16 + cd->lcc[*pt] -
585           (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W');
586         pt++;
587         }
588       if (*pt == '}')
589         {
590         if (c < 0 || count > 8) *errorptr = ERR34;
591         ptr = pt;
592         break;
593         }
594       /* If the sequence of hex digits does not end with '}', then we don't
595       recognize this construct; fall through to the normal \x handling. */
596       }
597 #endif
598 
599     /* Read up to two hex digits (possibly none, which yields zero) */
600 
601     c = 0;
602     while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
603       {
604       ptr++;
605       c = c * 16 + cd->lcc[*ptr] -
606         (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
607       }
608     break;
609 
610     /* Other special escapes not starting with a digit are straightforward */
611 
612     case 'c':
613     c = *(++ptr);
614     if (c == 0)
615       {
616       *errorptr = ERR2;
617       return 0;
618       }
619 
620     /* A letter is upper-cased; then the 0x40 bit is flipped */
621 
622     if (c >= 'a' && c <= 'z') c = cd->fcc[c];
623     c ^= 0x40;
624     break;
625 
626     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
627     other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
628     for Perl compatibility, it is a literal. This code looks a bit odd, but
629     there used to be some cases other than the default, and there may be again
630     in future, so I haven't "optimized" it. */
631 
632     default:
633     if ((options & PCRE_EXTRA) != 0) switch(c)
634       {
635       default:
636       *errorptr = ERR3;
637       break;
638       }
639     break;
640     }
641   }
642 
643 *ptrptr = ptr;
644 return c;
645 }
646 
647 
648 
649 /*************************************************
650 *            Check for counted repeat            *
651 *************************************************/
652 
653 /* This function is called when a '{' is encountered in a place where it might
654 start a quantifier. It looks ahead to see if it really is a quantifier or not.
655 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
656 where the ddds are digits.
657 
658 Arguments:
659   p         pointer to the first char after '{'
660   cd        pointer to char tables block
661 
662 Returns:    TRUE or FALSE
663 */
664 
665 static BOOL
666 is_counted_repeat(const uschar *p, compile_data *cd)
667 {
668 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
669 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
670 if (*p == '}') return TRUE;
671 
672 if (*p++ != ',') return FALSE;
673 if (*p == '}') return TRUE;
674 
675 if ((cd->ctypes[*p++] & ctype_digit) == 0) return FALSE;
676 while ((cd->ctypes[*p] & ctype_digit) != 0) p++;
677 return (*p == '}');
678 }
679 
680 
681 
682 /*************************************************
683 *         Read repeat counts                     *
684 *************************************************/
685 
686 /* Read an item of the form {n,m} and return the values. This is called only
687 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
688 so the syntax is guaranteed to be correct, but we need to check the values.
689 
690 Arguments:
691   p          pointer to first char after '{'
692   minp       pointer to int for min
693   maxp       pointer to int for max
694              returned as -1 if no max
695   errorptr   points to pointer to error message
696   cd         pointer to character tables block
697 
698 Returns:     pointer to '}' on success;
699              current ptr on error, with errorptr set
700 */
701 
702 static const uschar *
703 read_repeat_counts(const uschar *p, int *minp, int *maxp,
704   const char **errorptr, compile_data *cd)
705 {
706 int min = 0;
707 int max = -1;
708 
709 while ((cd->ctypes[*p] & ctype_digit) != 0) min = min * 10 + *p++ - '0';
710 
711 if (*p == '}') max = min; else
712   {
713   if (*(++p) != '}')
714     {
715     max = 0;
716     while((cd->ctypes[*p] & ctype_digit) != 0) max = max * 10 + *p++ - '0';
717     if (max < min)
718       {
719       *errorptr = ERR4;
720       return p;
721       }
722     }
723   }
724 
725 /* Do paranoid checks, then fill in the required variables, and pass back the
726 pointer to the terminating '}'. */
727 
728 if (min > 65535 || max > 65535)
729   *errorptr = ERR5;
730 else
731   {
732   *minp = min;
733   *maxp = max;
734   }
735 return p;
736 }
737 
738 
739 
740 /*************************************************
741 *        Find the fixed length of a pattern      *
742 *************************************************/
743 
744 /* Scan a pattern and compute the fixed length of subject that will match it,
745 if the length is fixed. This is needed for dealing with backward assertions.
746 
747 Arguments:
748   code     points to the start of the pattern (the bracket)
749   options  the compiling options
750 
751 Returns:   the fixed length, or -1 if there is no fixed length
752 */
753 
/* Walk the compiled code of one bracketed group, adding up the number of
characters each item matches. Every alternative of the group must come out at
the same length; nested groups are handled by recursion. Returns -1 as soon as
anything of variable length is seen.

NOTE(review): the pointer steps below (code + 3, cc += 3, cc += 33, and the
two-byte big-endian offsets read from cc[1] and cc[2]) encode the layout of the
compiled opcodes, which is defined outside this function - confirm against the
compiler before changing any of them. */
754 static int
755 find_fixedlength(uschar *code, int options)
756 {
757 int length = -1;
758 
759 register int branchlength = 0;
760 register uschar *cc = code + 3;
761 
762 /* Scan along the opcodes for this branch. If we get to the end of the
763 branch, check the length against that of the other branches. */
764 
765 for (;;)
766   {
767   int d;
768   register int op = *cc;
769   if (op >= OP_BRA) op = OP_BRA;   /* every opcode from OP_BRA upwards is handled as OP_BRA */
770 
771   switch (op)
772     {
773     case OP_BRA:
774     case OP_ONCE:
775     case OP_COND:
776     d = find_fixedlength(cc, options);
777     if (d < 0) return -1;
778     branchlength += d;
779     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);   /* hop over each alternative */
780     cc += 3;
781     break;
782 
783     /* Reached end of a branch; if it's a ket it is the end of a nested
784     call. If it's ALT it is an alternation in a nested call. If it is
785     END it's the end of the outer call. All can be handled by the same code. */
786 
787     case OP_ALT:
788     case OP_KET:
789     case OP_KETRMAX:
790     case OP_KETRMIN:
791     case OP_END:
792     if (length < 0) length = branchlength;
793       else if (length != branchlength) return -1;
794     if (*cc != OP_ALT) return length;
795     cc += 3;
796     branchlength = 0;
797     break;
798 
799     /* Skip over assertive subpatterns */
800 
801     case OP_ASSERT:
802     case OP_ASSERT_NOT:
803     case OP_ASSERTBACK:
804     case OP_ASSERTBACK_NOT:
805     do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
806     cc += 3;
807     break;
808 
809     /* Skip over things that don't match chars. The three groups of cases fall
    through into one another, so the first group advances cc by three, OP_OPT
    by two, and the rest by one. */
810 
811     case OP_REVERSE:
812     case OP_BRANUMBER:
813     case OP_CREF:
814     cc++;
815     /* Fall through */
816 
817     case OP_OPT:
818     cc++;
819     /* Fall through */
820 
821     case OP_SOD:
822     case OP_EOD:
823     case OP_EODN:
824     case OP_CIRC:
825     case OP_DOLL:
826     case OP_NOT_WORD_BOUNDARY:
827     case OP_WORD_BOUNDARY:
828     cc++;
829     break;
830 
831     /* Handle char strings. In UTF-8 mode we must count characters, not bytes.
832     This requires a scan of the string, unfortunately. We assume valid UTF-8
833     strings, so all we do is reduce the length by one for each byte whose bits
834     are 10xxxxxx. */
835 
836     case OP_CHARS:
837     branchlength += *(++cc);
838 #ifdef SUPPORT_UTF8
839     for (d = 1; d <= *cc; d++)
840       if ((cc[d] & 0xc0) == 0x80) branchlength--;
841 #endif
842     cc += *cc + 1;
843     break;
844 
845     /* Handle exact repetitions */
846 
847     case OP_EXACT:
848     case OP_TYPEEXACT:
849     branchlength += (cc[1] << 8) + cc[2];
850     cc += 4;
851     break;
852 
853     /* Handle single-char matchers */
854 
855     case OP_NOT_DIGIT:
856     case OP_DIGIT:
857     case OP_NOT_WHITESPACE:
858     case OP_WHITESPACE:
859     case OP_NOT_WORDCHAR:
860     case OP_WORDCHAR:
861     case OP_ANY:
862     branchlength++;
863     cc++;
864     break;
865 
866 
867     /* Check a class for variable quantification. The 33-byte step is presumably
    the opcode plus a 32-byte bit map - TODO confirm. Any repeat opcode that is
    not listed in the inner switch is left in place, so it reaches the outer
    default on the next pass and gives -1. */
868 
869     case OP_CLASS:
870     cc += 33;
871 
872     switch (*cc)
873       {
874       case OP_CRSTAR:
875       case OP_CRMINSTAR:
876       case OP_CRQUERY:
877       case OP_CRMINQUERY:
878       return -1;
879 
880       case OP_CRRANGE:
881       case OP_CRMINRANGE:
882       if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
883       branchlength += (cc[1] << 8) + cc[2];
884       cc += 5;
885       break;
886 
887       default:
888       branchlength++;
889       }
890     break;
891 
892     /* Anything else is variable length */
893 
894     default:
895     return -1;
896     }
897   }
898 /* Control never gets here */
899 }
900 
901 
902 
903 
904 /*************************************************
905 *           Check for POSIX class syntax         *
906 *************************************************/
907 
908 /* This function is called when the sequence "[:" or "[." or "[=" is
909 encountered in a character class. It checks whether this is followed by an
910 optional ^ and then a sequence of letters, terminated by a matching ":]" or
911 ".]" or "=]".
912 
913 Argument:
914   ptr      pointer to the initial [
915   endptr   where to return the end pointer
916   cd       pointer to compile data
917 
918 Returns:   TRUE or FALSE
919 */
920 
921 static BOOL
922 check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
923 {
924 int terminator;          /* Don't combine these lines; the Solaris cc */
925 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
926 if (*(++ptr) == '^') ptr++;
927 while ((cd->ctypes[*ptr] & ctype_letter) != 0) ptr++;
928 if (*ptr == terminator && ptr[1] == ']')
929   {
930   *endptr = ptr;
931   return TRUE;
932   }
933 return FALSE;
934 }
935 
936 
937 
938 
939 /*************************************************
940 *          Check POSIX class name                *
941 *************************************************/
942 
943 /* This function is called to check the name given in a POSIX-style class entry
944 such as [:alnum:].
945 
946 Arguments:
947   ptr        points to the first letter
948   len        the length of the name
949 
950 Returns:     a value representing the name, or -1 if unknown
951 */
952 
953 static int
954 check_posix_name(const uschar *ptr, int len)
955 {
956 register int yield = 0;
957 while (posix_name_lengths[yield] != 0)
958   {
959   if (len == posix_name_lengths[yield] &&
960     strncmp((const char *)ptr, posix_names[yield], len) == 0) return yield;
961   yield++;
962   }
963 return -1;
964 }
965 
966 
967 
968 
969 /*************************************************
970 *           Compile one branch                   *
971 *************************************************/
972 
973 /* Scan the pattern, compiling it into the code vector.
974 
975 Arguments:
976   options      the option bits
977   brackets     points to number of extracting brackets used
978   codeptr      points to the pointer to the current code point
979   ptrptr       points to the current pattern pointer
980   errorptr     points to pointer to error message
981   optchanged   set to the value of the last OP_OPT item compiled
982   reqchar      set to the last literal character required, else -1
983   countlits    set to count of mandatory literal characters
984   cd           contains pointers to tables
985 
986 Returns:       TRUE on success
987                FALSE, with *errorptr set on error
988 */
989 
990 static BOOL
991 compile_branch(int options, int *brackets, uschar **codeptr,
992   const uschar **ptrptr, const char **errorptr, int *optchanged,
993   int *reqchar, int *countlits, compile_data *cd)
994 {
995 int repeat_type, op_type;
996 int repeat_min, repeat_max;
997 int bravalue, length;
998 int greedy_default, greedy_non_default;
999 int prevreqchar;
1000 int condcount = 0;
1001 int subcountlits = 0;
1002 register int c;
1003 register uschar *code = *codeptr;
1004 uschar *tempcode;
1005 const uschar *ptr = *ptrptr;
1006 const uschar *tempptr;
1007 uschar *previous = NULL;
1008 uschar class[32];
1009 
1010 /* Set up the default and non-default settings for greediness */
1011 
1012 greedy_default = ((options & PCRE_UNGREEDY) != 0);
1013 greedy_non_default = greedy_default ^ 1;
1014 
1015 /* Initialize no required char, and count of literals */
1016 
1017 *reqchar = prevreqchar = -1;
1018 *countlits = 0;
1019 
1020 /* Switch on next character until the end of the branch */
1021 
1022 for (;; ptr++)
1023   {
1024   BOOL negate_class;
1025   int class_charcount;
1026   int class_lastchar;
1027   int newoptions;
1028   int skipbytes;
1029   int subreqchar;
1030 
1031   c = *ptr;
1032   if ((options & PCRE_EXTENDED) != 0)
1033     {
1034     if ((cd->ctypes[c] & ctype_space) != 0) continue;
1035     if (c == '#')
1036       {
1037       /* The space before the ; is to avoid a warning on a silly compiler
1038       on the Macintosh. */
1039       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
1040       continue;
1041       }
1042     }
1043 
1044   switch(c)
1045     {
1046     /* The branch terminates at end of string, |, or ). */
1047 
1048     case 0:
1049     case '|':
1050     case ')':
1051     *codeptr = code;
1052     *ptrptr = ptr;
1053     return TRUE;
1054 
1055     /* Handle single-character metacharacters */
1056 
1057     case '^':
1058     previous = NULL;
1059     *code++ = OP_CIRC;
1060     break;
1061 
1062     case '$':
1063     previous = NULL;
1064     *code++ = OP_DOLL;
1065     break;
1066 
1067     case '.':
1068     previous = code;
1069     *code++ = OP_ANY;
1070     break;
1071 
1072     /* Character classes. These always build a 32-byte bitmap of the permitted
1073     characters, except in the special case where there is only one character.
1074     For negated classes, we build the map as usual, then invert it at the end.
1075     */
1076 
1077     case '[':
1078     previous = code;
1079     *code++ = OP_CLASS;
1080 
1081     /* If the first character is '^', set the negation flag and skip it. */
1082 
1083     if ((c = *(++ptr)) == '^')
1084       {
1085       negate_class = TRUE;
1086       c = *(++ptr);
1087       }
1088     else negate_class = FALSE;
1089 
1090     /* Keep a count of chars so that we can optimize the case of just a single
1091     character. */
1092 
1093     class_charcount = 0;
1094     class_lastchar = -1;
1095 
1096     /* Initialize the 32-char bit map to all zeros. We have to build the
1097     map in a temporary bit of store, in case the class contains only 1
1098     character, because in that case the compiled code doesn't use the
1099     bit map. */
1100 
1101     memset(class, 0, 32 * sizeof(uschar));
1102 
1103     /* Process characters until ] is reached. By writing this as a "do" it
1104     means that an initial ] is taken as a data character. */
1105 
1106     do
1107       {
1108       if (c == 0)
1109         {
1110         *errorptr = ERR6;
1111         goto FAILED;
1112         }
1113 
1114       /* Handle POSIX class names. Perl allows a negation extension of the
1115       form [:^name:]. A square bracket that doesn't match the syntax is
1116       treated as a literal. We also recognize the POSIX constructions
1117       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
1118       5.6 does. */
1119 
1120       if (c == '[' &&
1121           (ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
1122           check_posix_syntax(ptr, &tempptr, cd))
1123         {
1124         BOOL local_negate = FALSE;
1125         int posix_class, i;
1126         register const uschar *cbits = cd->cbits;
1127 
1128         if (ptr[1] != ':')
1129           {
1130           *errorptr = ERR31;
1131           goto FAILED;
1132           }
1133 
1134         ptr += 2;
1135         if (*ptr == '^')
1136           {
1137           local_negate = TRUE;
1138           ptr++;
1139           }
1140 
1141         posix_class = check_posix_name(ptr, tempptr - ptr);
1142         if (posix_class < 0)
1143           {
1144           *errorptr = ERR30;
1145           goto FAILED;
1146           }
1147 
1148         /* If matching is caseless, upper and lower are converted to
1149         alpha. This relies on the fact that the class table starts with
1150         alpha, lower, upper as the first 3 entries. */
1151 
1152         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
1153           posix_class = 0;
1154 
1155         /* Or into the map we are building up to 3 of the static class
1156         tables, or their negations. */
1157 
1158         posix_class *= 3;
1159         for (i = 0; i < 3; i++)
1160           {
1161           int taboffset = posix_class_maps[posix_class + i];
1162           if (taboffset < 0) break;
1163           if (local_negate)
1164             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
1165           else
1166             for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
1167           }
1168 
1169         ptr = tempptr + 1;
1170         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
1171         continue;
1172         }
1173 
1174       /* Backslash may introduce a single character, or it may introduce one
1175       of the specials, which just set a flag. Escaped items are checked for
1176       validity in the pre-compiling pass. The sequence \b is a special case.
1177       Inside a class (and only there) it is treated as backspace. Elsewhere
1178       it marks a word boundary. Other escapes have preset maps ready to
1179       or into the one we are building. We assume they have more than one
1180       character in them, so set class_charcount bigger than one. */
1181 
1182       if (c == '\\')
1183         {
1184         c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1185         if (-c == ESC_b) c = '\b';
1186         else if (c < 0)
1187           {
1188           register const uschar *cbits = cd->cbits;
1189           class_charcount = 10;
1190           switch (-c)
1191             {
1192             case ESC_d:
1193             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
1194             continue;
1195 
1196             case ESC_D:
1197             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
1198             continue;
1199 
1200             case ESC_w:
1201             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
1202             continue;
1203 
1204             case ESC_W:
1205             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
1206             continue;
1207 
1208             case ESC_s:
1209             for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
1210             continue;
1211 
1212             case ESC_S:
1213             for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
1214             continue;
1215 
1216             default:
1217             *errorptr = ERR7;
1218             goto FAILED;
1219             }
1220           }
1221 
1222         /* Fall through if single character, but don't at present allow
1223         chars > 255 in UTF-8 mode. */
1224 
1225 #ifdef SUPPORT_UTF8
1226         if (c > 255)
1227           {
1228           *errorptr = ERR33;
1229           goto FAILED;
1230           }
1231 #endif
1232         }
1233 
1234       /* A single character may be followed by '-' to form a range. However,
1235       Perl does not permit ']' to be the end of the range. A '-' character
1236       here is treated as a literal. */
1237 
1238       if (ptr[1] == '-' && ptr[2] != ']')
1239         {
1240         int d;
1241         ptr += 2;
1242         d = *ptr;
1243 
1244         if (d == 0)
1245           {
1246           *errorptr = ERR6;
1247           goto FAILED;
1248           }
1249 
1250         /* The second part of a range can be a single-character escape, but
1251         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
1252         in such circumstances. */
1253 
1254         if (d == '\\')
1255           {
1256           const uschar *oldptr = ptr;
1257           d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
1258 
1259 #ifdef SUPPORT_UTF8
1260           if (d > 255)
1261             {
1262             *errorptr = ERR33;
1263             goto FAILED;
1264             }
1265 #endif
1266           /* \b is backspace; any other special means the '-' was literal */
1267 
1268           if (d < 0)
1269             {
1270             if (d == -ESC_b) d = '\b'; else
1271               {
1272               ptr = oldptr - 2;
1273               goto SINGLE_CHARACTER;  /* A few lines below */
1274               }
1275             }
1276           }
1277 
1278         if (d < c)
1279           {
1280           *errorptr = ERR8;
1281           goto FAILED;
1282           }
1283 
1284         for (; c <= d; c++)
1285           {
1286           class[c/8] |= (1 << (c&7));
1287           if ((options & PCRE_CASELESS) != 0)
1288             {
1289             int uc = cd->fcc[c];           /* flip case */
1290             class[uc/8] |= (1 << (uc&7));
1291             }
1292           class_charcount++;                /* in case a one-char range */
1293           class_lastchar = c;
1294           }
1295         continue;   /* Go get the next char in the class */
1296         }
1297 
1298       /* Handle a lone single character - we can get here for a normal
1299       non-escape char, or after \ that introduces a single character. */
1300 
1301       SINGLE_CHARACTER:
1302 
1303       class [c/8] |= (1 << (c&7));
1304       if ((options & PCRE_CASELESS) != 0)
1305         {
1306         c = cd->fcc[c];   /* flip case */
1307         class[c/8] |= (1 << (c&7));
1308         }
1309       class_charcount++;
1310       class_lastchar = c;
1311       }
1312 
1313     /* Loop until ']' reached; the check for end of string happens inside the
1314     loop. This "while" is the end of the "do" above. */
1315 
1316     while ((c = *(++ptr)) != ']');
1317 
1318     /* If class_charcount is 1 and class_lastchar is not negative, we saw
1319     precisely one character. This doesn't need the whole 32-byte bit map.
1320     We turn it into a 1-character OP_CHARS if the class is not negated, or
1321     OP_NOT if it is negated. */
1322 
1323     if (class_charcount == 1 && class_lastchar >= 0)
1324       {
1325       if (negate_class)
1326         {
1327         code[-1] = OP_NOT;
1328         }
1329       else
1330         {
1331         code[-1] = OP_CHARS;
1332         *code++ = 1;
1333         }
1334       *code++ = class_lastchar;
1335       }
1336 
1337     /* Otherwise, negate the 32-byte map if necessary, and copy it into
1338     the code vector. */
1339 
1340     else
1341       {
1342       if (negate_class)
1343         for (c = 0; c < 32; c++) code[c] = ~class[c];
1344       else
1345         memcpy(code, class, 32);
1346       code += 32;
1347       }
1348     break;
1349 
1350     /* Various kinds of repeat */
1351 
1352     case '{':
1353     if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
1354     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
1355     if (*errorptr != NULL) goto FAILED;
1356     goto REPEAT;
1357 
1358     case '*':
1359     repeat_min = 0;
1360     repeat_max = -1;
1361     goto REPEAT;
1362 
1363     case '+':
1364     repeat_min = 1;
1365     repeat_max = -1;
1366     goto REPEAT;
1367 
1368     case '?':
1369     repeat_min = 0;
1370     repeat_max = 1;
1371 
1372     REPEAT:
1373     if (previous == NULL)
1374       {
1375       *errorptr = ERR9;
1376       goto FAILED;
1377       }
1378 
1379     /* If the next character is '?' this is a minimizing repeat, by default,
1380     but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
1381     next character. */
1382 
1383     if (ptr[1] == '?')
1384       { repeat_type = greedy_non_default; ptr++; }
1385     else repeat_type = greedy_default;
1386 
1387     /* If previous was a string of characters, chop off the last one and use it
1388     as the subject of the repeat. If there was only one character, we can
1389     abolish the previous item altogether. A repeat with a zero minimum wipes
1390     out any reqchar setting, backing up to the previous value. We must also
1391     adjust the countlits value. */
1392 
1393     if (*previous == OP_CHARS)
1394       {
1395       int len = previous[1];
1396 
1397       if (repeat_min == 0) *reqchar = prevreqchar;
1398       *countlits += repeat_min - 1;
1399 
1400       if (len == 1)
1401         {
1402         c = previous[2];
1403         code = previous;
1404         }
1405       else
1406         {
1407         c = previous[len+1];
1408         previous[1]--;
1409         code--;
1410         }
1411       op_type = 0;                 /* Use single-char op codes */
1412       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
1413       }
1414 
1415     /* If previous was a single negated character ([^a] or similar), we use
1416     one of the special opcodes, replacing it. The code is shared with single-
1417     character repeats by adding a suitable offset into repeat_type. */
1418 
1419     else if ((int)*previous == OP_NOT)
1420       {
1421       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
1422       c = previous[1];
1423       code = previous;
1424       goto OUTPUT_SINGLE_REPEAT;
1425       }
1426 
1427     /* If previous was a character type match (\d or similar), abolish it and
1428     create a suitable repeat item. The code is shared with single-character
1429     repeats by adding a suitable offset into repeat_type. */
1430 
1431     else if ((int)*previous < OP_EODN || *previous == OP_ANY)
1432       {
1433       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
1434       c = *previous;
1435       code = previous;
1436 
1437       OUTPUT_SINGLE_REPEAT:
1438 
1439       /* If the maximum is zero then the minimum must also be zero; Perl allows
1440       this case, so we do too - by simply omitting the item altogether. */
1441 
1442       if (repeat_max == 0) goto END_REPEAT;
1443 
1444       /* Combine the op_type with the repeat_type */
1445 
1446       repeat_type += op_type;
1447 
1448       /* A minimum of zero is handled either as the special case * or ?, or as
1449       an UPTO, with the maximum given. */
1450 
1451       if (repeat_min == 0)
1452         {
1453         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
1454           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
1455         else
1456           {
1457           *code++ = OP_UPTO + repeat_type;
1458           *code++ = repeat_max >> 8;
1459           *code++ = (repeat_max & 255);
1460           }
1461         }
1462 
1463       /* The case {1,} is handled as the special case + */
1464 
1465       else if (repeat_min == 1 && repeat_max == -1)
1466         *code++ = OP_PLUS + repeat_type;
1467 
1468       /* The case {n,n} is just an EXACT, while the general case {n,m} is
1469       handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
1470 
1471       else
1472         {
1473         if (repeat_min != 1)
1474           {
1475           *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
1476           *code++ = repeat_min >> 8;
1477           *code++ = (repeat_min & 255);
1478           }
1479 
1480         /* If the minimum is 1 and the previous item was a character string,
1481         we either have to put back the item that got cancelled if the string
1482         length was 1, or add the character back onto the end of a longer
1483         string. For a character type nothing need be done; it will just get
1484         put back naturally. Note that the final character is always going to
1485         get added below. */
1486 
1487         else if (*previous == OP_CHARS)
1488           {
1489           if (code == previous) code += 2; else previous[1]++;
1490           }
1491 
1492         /*  For a single negated character we also have to put back the
1493         item that got cancelled. */
1494 
1495         else if (*previous == OP_NOT) code++;
1496 
1497         /* If the maximum is unlimited, insert an OP_STAR. */
1498 
1499         if (repeat_max < 0)
1500           {
1501           *code++ = c;
1502           *code++ = OP_STAR + repeat_type;
1503           }
1504 
1505         /* Else insert an UPTO if the max is greater than the min. */
1506 
1507         else if (repeat_max != repeat_min)
1508           {
1509           *code++ = c;
1510           repeat_max -= repeat_min;
1511           *code++ = OP_UPTO + repeat_type;
1512           *code++ = repeat_max >> 8;
1513           *code++ = (repeat_max & 255);
1514           }
1515         }
1516 
1517       /* The character or character type itself comes last in all cases. */
1518 
1519       *code++ = c;
1520       }
1521 
1522     /* If previous was a character class or a back reference, we put the repeat
1523     stuff after it, but just skip the item if the repeat was {0,0}. */
1524 
1525     else if (*previous == OP_CLASS || *previous == OP_REF)
1526       {
1527       if (repeat_max == 0)
1528         {
1529         code = previous;
1530         goto END_REPEAT;
1531         }
1532       if (repeat_min == 0 && repeat_max == -1)
1533         *code++ = OP_CRSTAR + repeat_type;
1534       else if (repeat_min == 1 && repeat_max == -1)
1535         *code++ = OP_CRPLUS + repeat_type;
1536       else if (repeat_min == 0 && repeat_max == 1)
1537         *code++ = OP_CRQUERY + repeat_type;
1538       else
1539         {
1540         *code++ = OP_CRRANGE + repeat_type;
1541         *code++ = repeat_min >> 8;
1542         *code++ = repeat_min & 255;
1543         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
1544         *code++ = repeat_max >> 8;
1545         *code++ = repeat_max & 255;
1546         }
1547       }
1548 
1549     /* If previous was a bracket group, we may have to replicate it in certain
1550     cases. */
1551 
1552     else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
1553              (int)*previous == OP_COND)
1554       {
1555       register int i;
1556       int ketoffset = 0;
1557       int len = code - previous;
1558       uschar *bralink = NULL;
1559 
1560       /* If the maximum repeat count is unlimited, find the end of the bracket
1561       by scanning through from the start, and compute the offset back to it
1562       from the current code pointer. There may be an OP_OPT setting following
1563       the final KET, so we can't find the end just by going back from the code
1564       pointer. */
1565 
1566       if (repeat_max == -1)
1567         {
1568         register uschar *ket = previous;
1569         do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
1570         ketoffset = code - ket;
1571         }
1572 
1573       /* The case of a zero minimum is special because of the need to stick
1574       OP_BRAZERO in front of it, and because the group appears once in the
1575       data, whereas in other cases it appears the minimum number of times. For
1576       this reason, it is simplest to treat this case separately, as otherwise
1577       the code gets far too messy. There are several special subcases when the
1578       minimum is zero. */
1579 
1580       if (repeat_min == 0)
1581         {
1582         /* If we set up a required char from the bracket, we must back off
1583         to the previous value and reset the countlits value too. */
1584 
1585         if (subcountlits > 0)
1586           {
1587           *reqchar = prevreqchar;
1588           *countlits -= subcountlits;
1589           }
1590 
1591         /* If the maximum is also zero, we just omit the group from the output
1592         altogether. */
1593 
1594         if (repeat_max == 0)
1595           {
1596           code = previous;
1597           goto END_REPEAT;
1598           }
1599 
1600         /* If the maximum is 1 or unlimited, we just have to stick in the
1601         BRAZERO and do no more at this point. */
1602 
1603         if (repeat_max <= 1)
1604           {
1605           memmove(previous+1, previous, len);
1606           code++;
1607           *previous++ = OP_BRAZERO + repeat_type;
1608           }
1609 
1610         /* If the maximum is greater than 1 and limited, we have to replicate
1611         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
1612         The first one has to be handled carefully because it's the original
1613         copy, which has to be moved up. The remainder can be handled by code
1614         that is common with the non-zero minimum case below. We just have to
1615         adjust the value of repeat_max, since one less copy is required. */
1616 
1617         else
1618           {
1619           int offset;
1620           memmove(previous+4, previous, len);
1621           code += 4;
1622           *previous++ = OP_BRAZERO + repeat_type;
1623           *previous++ = OP_BRA;
1624 
1625           /* We chain together the bracket offset fields that have to be
1626           filled in later when the ends of the brackets are reached. */
1627 
1628           offset = (bralink == NULL)? 0 : previous - bralink;
1629           bralink = previous;
1630           *previous++ = offset >> 8;
1631           *previous++ = offset & 255;
1632           }
1633 
1634         repeat_max--;
1635         }
1636 
1637       /* If the minimum is greater than zero, replicate the group as many
1638       times as necessary, and adjust the maximum to the number of subsequent
1639       copies that we need. */
1640 
1641       else
1642         {
1643         for (i = 1; i < repeat_min; i++)
1644           {
1645           memcpy(code, previous, len);
1646           code += len;
1647           }
1648         if (repeat_max > 0) repeat_max -= repeat_min;
1649         }
1650 
1651       /* This code is common to both the zero and non-zero minimum cases. If
1652       the maximum is limited, it replicates the group in a nested fashion,
1653       remembering the bracket starts on a stack. In the case of a zero minimum,
1654       the first one was set up above. In all cases the repeat_max now specifies
1655       the number of additional copies needed. */
1656 
1657       if (repeat_max >= 0)
1658         {
1659         for (i = repeat_max - 1; i >= 0; i--)
1660           {
1661           *code++ = OP_BRAZERO + repeat_type;
1662 
1663           /* All but the final copy start a new nesting, maintaining the
1664           chain of brackets outstanding. */
1665 
1666           if (i != 0)
1667             {
1668             int offset;
1669             *code++ = OP_BRA;
1670             offset = (bralink == NULL)? 0 : code - bralink;
1671             bralink = code;
1672             *code++ = offset >> 8;
1673             *code++ = offset & 255;
1674             }
1675 
1676           memcpy(code, previous, len);
1677           code += len;
1678           }
1679 
1680         /* Now chain through the pending brackets, and fill in their length
1681         fields (which are holding the chain links pro tem). */
1682 
1683         while (bralink != NULL)
1684           {
1685           int oldlinkoffset;
1686           int offset = code - bralink + 1;
1687           uschar *bra = code - offset;
1688           oldlinkoffset = (bra[1] << 8) + bra[2];
1689           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
1690           *code++ = OP_KET;
1691           *code++ = bra[1] = offset >> 8;
1692           *code++ = bra[2] = (offset & 255);
1693           }
1694         }
1695 
1696       /* If the maximum is unlimited, set a repeater in the final copy. We
1697       can't just offset backwards from the current code point, because we
1698       don't know if there's been an options resetting after the ket. The
1699       correct offset was computed above. */
1700 
1701       else code[-ketoffset] = OP_KETRMAX + repeat_type;
1702       }
1703 
1704     /* Else there's some kind of shambles */
1705 
1706     else
1707       {
1708       *errorptr = ERR11;
1709       goto FAILED;
1710       }
1711 
1712     /* In all cases we no longer have a previous item. */
1713 
1714     END_REPEAT:
1715     previous = NULL;
1716     break;
1717 
1718 
1719     /* Start of nested bracket sub-expression, or comment or lookahead or
1720     lookbehind or option setting or condition. First deal with special things
1721     that can come after a bracket; all are introduced by ?, and the appearance
1722     of any of them means that this is not a referencing group. They were
1723     checked for validity in the first pass over the string, so we don't have to
1724     check for syntax errors here.  */
1725 
1726     case '(':
1727     newoptions = options;
1728     skipbytes = 0;
1729 
1730     if (*(++ptr) == '?')
1731       {
1732       int set, unset;
1733       int *optset;
1734 
1735       switch (*(++ptr))
1736         {
1737         case '#':                 /* Comment; skip to ket */
1738         ptr++;
1739         while (*ptr != ')') ptr++;
1740         continue;
1741 
1742         case ':':                 /* Non-extracting bracket */
1743         bravalue = OP_BRA;
1744         ptr++;
1745         break;
1746 
1747         case '(':
1748         bravalue = OP_COND;       /* Conditional group */
1749         if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
1750           {
1751           int condref = *ptr - '0';
1752           while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
1753           if (condref == 0)
1754             {
1755             *errorptr = ERR35;
1756             goto FAILED;
1757             }
1758           ptr++;
1759           code[3] = OP_CREF;
1760           code[4] = condref >> 8;
1761           code[5] = condref & 255;
1762           skipbytes = 3;
1763           }
1764         else ptr--;
1765         break;
1766 
1767         case '=':                 /* Positive lookahead */
1768         bravalue = OP_ASSERT;
1769         ptr++;
1770         break;
1771 
1772         case '!':                 /* Negative lookahead */
1773         bravalue = OP_ASSERT_NOT;
1774         ptr++;
1775         break;
1776 
1777         case '<':                 /* Lookbehinds */
1778         switch (*(++ptr))
1779           {
1780           case '=':               /* Positive lookbehind */
1781           bravalue = OP_ASSERTBACK;
1782           ptr++;
1783           break;
1784 
1785           case '!':               /* Negative lookbehind */
1786           bravalue = OP_ASSERTBACK_NOT;
1787           ptr++;
1788           break;
1789 
1790           default:                /* Syntax error */
1791           *errorptr = ERR24;
1792           goto FAILED;
1793           }
1794         break;
1795 
1796         case '>':                 /* One-time brackets */
1797         bravalue = OP_ONCE;
1798         ptr++;
1799         break;
1800 
1801         case 'R':                 /* Pattern recursion */
1802         *code++ = OP_RECURSE;
1803         ptr++;
1804         continue;
1805 
1806         default:                  /* Option setting */
1807         set = unset = 0;
1808         optset = &set;
1809 
1810         while (*ptr != ')' && *ptr != ':')
1811           {
1812           switch (*ptr++)
1813             {
1814             case '-': optset = &unset; break;
1815 
1816             case 'i': *optset |= PCRE_CASELESS; break;
1817             case 'm': *optset |= PCRE_MULTILINE; break;
1818             case 's': *optset |= PCRE_DOTALL; break;
1819             case 'x': *optset |= PCRE_EXTENDED; break;
1820             case 'U': *optset |= PCRE_UNGREEDY; break;
1821             case 'X': *optset |= PCRE_EXTRA; break;
1822 
1823             default:
1824             *errorptr = ERR12;
1825             goto FAILED;
1826             }
1827           }
1828 
1829         /* Set up the changed option bits, but don't change anything yet. */
1830 
1831         newoptions = (options | set) & (~unset);
1832 
1833         /* If the options ended with ')' this is not the start of a nested
1834         group with option changes, so the options change at this level. At top
1835         level there is nothing else to be done (the options will in fact have
1836         been set from the start of compiling as a result of the first pass) but
1837         at an inner level we must compile code to change the ims options if
1838         necessary, and pass the new setting back so that it can be put at the
1839         start of any following branches, and when this group ends, a resetting
1840         item can be compiled. */
1841 
1842         if (*ptr == ')')
1843           {
1844           if ((options & PCRE_INGROUP) != 0 &&
1845               (options & PCRE_IMS) != (newoptions & PCRE_IMS))
1846             {
1847             *code++ = OP_OPT;
1848             *code++ = *optchanged = newoptions & PCRE_IMS;
1849             }
1850           options = newoptions;  /* Change options at this level */
1851           previous = NULL;       /* This item can't be repeated */
1852           continue;              /* It is complete */
1853           }
1854 
1855         /* If the options ended with ':' we are heading into a nested group
1856         with possible change of options. Such groups are non-capturing and are
1857         not assertions of any kind. All we need to do is skip over the ':';
1858         the newoptions value is handled below. */
1859 
1860         bravalue = OP_BRA;
1861         ptr++;
1862         }
1863       }
1864 
1865     /* Else we have a referencing group; adjust the opcode. If the bracket
1866     number is greater than EXTRACT_BASIC_MAX, we set the opcode one higher, and
1867     arrange for the true number to follow later, in an OP_BRANUMBER item. */
1868 
1869     else
1870       {
1871       if (++(*brackets) > EXTRACT_BASIC_MAX)
1872         {
1873         bravalue = OP_BRA + EXTRACT_BASIC_MAX + 1;
1874         code[3] = OP_BRANUMBER;
1875         code[4] = *brackets >> 8;
1876         code[5] = *brackets & 255;
1877         skipbytes = 3;
1878         }
1879       else bravalue = OP_BRA + *brackets;
1880       }
1881 
1882     /* Process nested bracketed re. Assertions may not be repeated, but other
1883     kinds can be. We copy code into a non-register variable in order to be able
1884     to pass its address because some compilers complain otherwise. Pass in a
1885     new setting for the ims options if they have changed. */
1886 
1887     previous = (bravalue >= OP_ONCE)? code : NULL;
1888     *code = bravalue;
1889     tempcode = code;
1890 
1891     if (!compile_regex(
1892          options | PCRE_INGROUP,       /* Set for all nested groups */
1893          ((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
1894            newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
1895          brackets,                     /* Extracting bracket count */
1896          &tempcode,                    /* Where to put code (updated) */
1897          &ptr,                         /* Input pointer (updated) */
1898          errorptr,                     /* Where to put an error message */
1899          (bravalue == OP_ASSERTBACK ||
1900           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
1901          skipbytes,                    /* Skip over OP_COND/OP_BRANUMBER */
1902          &subreqchar,                  /* For possible last char */
1903          &subcountlits,                /* For literal count */
1904          cd))                          /* Tables block */
1905       goto FAILED;
1906 
1907     /* At the end of compiling, code is still pointing to the start of the
1908     group, while tempcode has been updated to point past the end of the group
1909     and any option resetting that may follow it. The pattern pointer (ptr)
1910     is on the bracket. */
1911 
1912     /* If this is a conditional bracket, check that there are no more than
1913     two branches in the group. */
1914 
1915     else if (bravalue == OP_COND)
1916       {
1917       uschar *tc = code;
1918       condcount = 0;
1919 
1920       do {
1921          condcount++;
1922          tc += (tc[1] << 8) | tc[2];
1923          }
1924       while (*tc != OP_KET);
1925 
1926       if (condcount > 2)
1927         {
1928         *errorptr = ERR27;
1929         goto FAILED;
1930         }
1931       }
1932 
1933     /* Handle updating of the required character. If the subpattern didn't
1934     set one, leave it as it was. Otherwise, update it for normal brackets of
1935     all kinds, forward assertions, and conditions with two branches. Don't
1936     update the literal count for forward assertions, however. If the bracket
1937     is followed by a quantifier with zero repeat, we have to back off. Hence
1938     the definition of prevreqchar and subcountlits outside the main loop so
1939     that they can be accessed for the back off. */
1940 
1941     if (subreqchar > 0 &&
1942          (bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
1943          (bravalue == OP_COND && condcount == 2)))
1944       {
1945       prevreqchar = *reqchar;
1946       *reqchar = subreqchar;
1947       if (bravalue != OP_ASSERT) *countlits += subcountlits;
1948       }
1949 
1950     /* Now update the main code pointer to the end of the group. */
1951 
1952     code = tempcode;
1953 
1954     /* Error if hit end of pattern */
1955 
1956     if (*ptr != ')')
1957       {
1958       *errorptr = ERR14;
1959       goto FAILED;
1960       }
1961     break;
1962 
1963     /* Check \ for being a real metacharacter; if not, fall through and handle
1964     it as a data character at the start of a string. Escape items are checked
1965     for validity in the pre-compiling pass. */
1966 
1967     case '\\':
1968     tempptr = ptr;
1969     c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
1970 
1971     /* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
1972     are arranged to be the negation of the corresponding OP_values. For the
1973     back references, the values are ESC_REF plus the reference number. Only
1974     back references and those types that consume a character may be repeated.
1975     We can test for values between ESC_b and ESC_Z for the latter; this may
1976     have to change if any new ones are ever created. */
1977 
1978     if (c < 0)
1979       {
1980       if (-c >= ESC_REF)
1981         {
1982         int number = -c - ESC_REF;
1983         previous = code;
1984         *code++ = OP_REF;
1985         *code++ = number >> 8;
1986         *code++ = number & 255;
1987         }
1988       else
1989         {
1990         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
1991         *code++ = -c;
1992         }
1993       continue;
1994       }
1995 
1996     /* Data character: reset and fall through */
1997 
1998     ptr = tempptr;
1999     c = '\\';
2000 
2001     /* Handle a run of data characters until a metacharacter is encountered.
2002     The first character is guaranteed not to be whitespace or # when the
2003     extended flag is set. */
2004 
2005     NORMAL_CHAR:
2006     default:
2007     previous = code;
2008     *code = OP_CHARS;
2009     code += 2;
2010     length = 0;
2011 
2012     do
2013       {
2014       if ((options & PCRE_EXTENDED) != 0)
2015         {
2016         if ((cd->ctypes[c] & ctype_space) != 0) continue;
2017         if (c == '#')
2018           {
2019           /* The space before the ; is to avoid a warning on a silly compiler
2020           on the Macintosh. */
2021           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2022           if (c == 0) break;
2023           continue;
2024           }
2025         }
2026 
2027       /* Backslash may introduce a data char or a metacharacter. Escaped items
2028       are checked for validity in the pre-compiling pass. Stop the string
2029       before a metaitem. */
2030 
2031       if (c == '\\')
2032         {
2033         tempptr = ptr;
2034         c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
2035         if (c < 0) { ptr = tempptr; break; }
2036 
2037         /* If a character is > 127 in UTF-8 mode, we have to turn it into
2038         two or more characters in the UTF-8 encoding. */
2039 
2040 #ifdef SUPPORT_UTF8
2041         if (c > 127 && (options & PCRE_UTF8) != 0)
2042           {
2043           uschar buffer[8];
2044           int len = ord2utf8(c, buffer);
2045           for (c = 0; c < len; c++) *code++ = buffer[c];
2046           length += len;
2047           continue;
2048           }
2049 #endif
2050         }
2051 
2052       /* Ordinary character or single-char escape */
2053 
2054       *code++ = c;
2055       length++;
2056       }
2057 
2058     /* This "while" is the end of the "do" above. */
2059 
2060     while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
2061 
2062     /* Update the last character and the count of literals */
2063 
2064     prevreqchar = (length > 1)? code[-2] : *reqchar;
2065     *reqchar = code[-1];
2066     *countlits += length;
2067 
2068     /* Compute the length and set it in the data vector, and advance to
2069     the next state. */
2070 
2071     previous[1] = length;
2072     if (length < MAXLIT) ptr--;
2073     break;
2074     }
2075   }                   /* end of big loop */
2076 
2077 /* Control never reaches here by falling through, only by a goto for all the
2078 error states. Pass back the position in the pattern so that it can be displayed
2079 to the user for diagnosing the error. */
2080 
2081 FAILED:
2082 *ptrptr = ptr;
2083 return FALSE;
2084 }
2085 
2086 
2087 
2088 
2089 /*************************************************
2090 *     Compile sequence of alternatives           *
2091 *************************************************/
2092 
2093 /* Compiles one bracketed group: a sequence of alternative branches separated by '|', each of
2094 which is handed to compile_branch(). On entry, ptr is pointing past the bracket character; on a
2095 TRUE return it is left on the character that ended the last branch, which is never a vertical
2096 bar. The code variable is pointing at the byte into which the BRA operator has been stored. If
2097 the ims options are changed at the start (for a (?ims:...) group) or during any branch, an OP_OPT
2098 item is inserted at the start of every following branch so that the options are set correctly at
2099 run time, and the new options are passed into every subsequent branch compile.
2100 
2101 Arguments:
2102   options     the option bits
2103   optchanged  new ims options to set as if (?ims) were at the start, or -1
2104                for no change
2105   brackets    -> int containing the number of extracting brackets used
2106   codeptr     -> the address of the current code pointer; advanced past the group on success only
2107   ptrptr      -> the address of the current pattern pointer; written back on every return
2108   errorptr    -> pointer to error message; set to ERR25 here when find_fixedlength() rejects a lookbehind branch
2109   lookbehind  TRUE if this is a lookbehind assertion
2110   skipbytes   skip this many bytes at start (for OP_COND, OP_BRANUMBER)
2111   reqchar     -> place to put the last required character; on success, -2 unless all branches report the same one
2112   countlits   -> place to put the shortest literal count of any branch
2113   cd          points to the data block with tables pointers
2114 
2115 Returns:      TRUE on success; FALSE if compile_branch() fails or a lookbehind branch is not fixed-length
2116 */
2117 
2118 static BOOL
2119 compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
2120   const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int skipbytes,
2121   int *reqchar, int *countlits, compile_data *cd)
2122 {
2123 const uschar *ptr = *ptrptr;
2124 uschar *code = *codeptr;
2125 uschar *last_branch = code;      /* Start of the branch being compiled: the bracket, then each OP_ALT */
2126 uschar *start_bracket = code;    /* Start of the whole group; the OP_KET offset is measured from here */
2127 uschar *reverse_count = NULL;    /* -> 2-byte length field of the current branch's OP_REVERSE */
2128 int oldoptions = options & PCRE_IMS;   /* ims bits on entry, emitted again after the group if they changed */
2129 int branchreqchar, branchcountlits;
2130 
2131 *reqchar = -1;                   /* -1 = unset: no branch has been compiled yet */
2132 *countlits = INT_MAX;            /* Start at the maximum; lowered by each branch below */
2133 code += 3 + skipbytes;           /* Step over the opcode, its 2-byte length, and the caller's extra bytes */
2134 
2135 /* Loop for each alternative branch */
2136 
2137 for (;;)
2138   {
2139   int length;
2140 
2141   /* Handle change of options: emit the ims bits at the start of this branch and compile it with them */
2142 
2143   if (optchanged >= 0)
2144     {
2145     *code++ = OP_OPT;
2146     *code++ = optchanged;
2147     options = (options & ~PCRE_IMS) | optchanged;
2148     }
2149 
2150   /* Set up dummy OP_REVERSE if lookbehind assertion; its two zero bytes are filled in below */
2151 
2152   if (lookbehind)
2153     {
2154     *code++ = OP_REVERSE;
2155     reverse_count = code;
2156     *code++ = 0;
2157     *code++ = 0;
2158     }
2159 
2160   /* Now compile the branch. It is given the address of optchanged, so it can record an option change for later branches. */
2161 
2162   if (!compile_branch(options, brackets, &code, &ptr, errorptr, &optchanged,
2163       &branchreqchar, &branchcountlits, cd))
2164     {
2165     *ptrptr = ptr;               /* Report where in the pattern the failure was detected */
2166     return FALSE;
2167     }
2168 
2169   /* Fill in the length of the last branch, high byte first, in the two bytes after its opcode */
2170 
2171   length = code - last_branch;
2172   last_branch[1] = length >> 8;
2173   last_branch[2] = length & 255;
2174 
2175   /* Save the last required character if all branches have the same; a current
2176   value of -1 means unset, while -2 means "a previous branch had no last required
2177   char, or two branches disagreed". Once -2 is set it is never changed again. */
2178 
2179   if (*reqchar != -2)
2180     {
2181     if (branchreqchar >= 0)
2182       {
2183       if (*reqchar == -1) *reqchar = branchreqchar;
2184       else if (*reqchar != branchreqchar) *reqchar = -2;
2185       }
2186     else *reqchar = -2;
2187     }
2188 
2189   /* Keep the shortest literal count */
2190 
2191   if (branchcountlits < *countlits) *countlits = branchcountlits;
2192   DPRINTF(("literal count = %d min=%d\n", branchcountlits, *countlits));
2193 
2194   /* If lookbehind, check that this branch matches a fixed-length string,
2195   and put the length into the OP_REVERSE item. Temporarily mark the end of
2196   the branch with OP_END. */
2197 
2198   if (lookbehind)
2199     {
2200     *code = OP_END;              /* Overwritten below by OP_KET or OP_ALT */
2201     length = find_fixedlength(last_branch, options);
2202     DPRINTF(("fixed length = %d\n", length));
2203     if (length < 0)
2204       {
2205       *errorptr = ERR25;
2206       *ptrptr = ptr;
2207       return FALSE;
2208       }
2209     reverse_count[0] = (length >> 8);
2210     reverse_count[1] = length & 255;
2211     }
2212 
2213   /* Reached end of expression, either ')' or end of pattern. Insert a
2214   terminating ket and the length of the whole bracketed item, and return,
2215   leaving the pointer at the terminating char. If any of the ims options
2216   were changed inside the group, compile a resetting op-code following. */
2217 
2218   if (*ptr != '|')
2219     {
2220     length = code - start_bracket;   /* Distance from the opening bracket to the OP_KET */
2221     *code++ = OP_KET;
2222     *code++ = length >> 8;
2223     *code++ = length & 255;
2224     if (optchanged >= 0)
2225       {
2226       *code++ = OP_OPT;
2227       *code++ = oldoptions;
2228       }
2229     *codeptr = code;
2230     *ptrptr = ptr;
2231     return TRUE;
2232     }
2233 
2234   /* Another branch follows; insert an "or" node and advance the pointer. */
2235 
2236   *code = OP_ALT;
2237   last_branch = code;            /* The length of the new branch is filled in at this node */
2238   code += 3;                     /* Leave room for the opcode and its 2-byte length */
2239   ptr++;                         /* Step over the '|' */
2240   }
2241 /* Control never reaches here */
2242 }
2243 
2244 
2245 
2246 
2247 /*************************************************
2248 *      Find first significant op code            *
2249 *************************************************/
2250 
2251 /* This is called by several functions that scan a compiled expression looking for a fixed first
2252 character, or an anchoring op code etc. It skips over items that do not influence this: OP_OPT
2253 settings, OP_CREF and OP_BRANUMBER items, word-boundary tests, and complete OP_ASSERT_NOT,
2254 OP_ASSERTBACK and OP_ASSERTBACK_NOT groups. For one application, a change of caseless option is important.
2255 
2256 Arguments:
2257   code       pointer to the first item to examine (is_anchored, is_startline and find_firstchar pass the group start + 3)
2258   options    pointer to external options; only read or written when optbit is positive
2259   optbit     the option bit whose changing is significant, or
2260              zero if none are
2261   optstop    TRUE to return on option change, otherwise change the options
2262                value and continue
2263 
2264 Returns:     pointer to the first significant opcode, or to the OP_OPT item itself when optstop applies
2265 */
2266 
2267 static const uschar*
2268 first_significant_code(const uschar *code, int *options, int optbit,
2269   BOOL optstop)
2270 {
2271 for (;;)
2272   {
2273   switch ((int)*code)
2274     {
2275     case OP_OPT:
2276     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))   /* Only a change in the watched bit matters */
2277       {
2278       if (optstop) return code;      /* The caller wants to see the option change itself */
2279       *options = (int)code[1];       /* The whole value is replaced by the OP_OPT operand */
2280       }
2281     code += 2;                       /* Opcode plus one operand byte */
2282     break;
2283 
2284     case OP_CREF:
2285     case OP_BRANUMBER:
2286     code += 3;                       /* Opcode plus two operand bytes */
2287     break;
2288 
2289     case OP_WORD_BOUNDARY:
2290     case OP_NOT_WORD_BOUNDARY:
2291     code++;                          /* Single-byte items */
2292     break;
2293 
2294     case OP_ASSERT_NOT:
2295     case OP_ASSERTBACK:
2296     case OP_ASSERTBACK_NOT:
2297     do code += (code[1] << 8) + code[2]; while (*code == OP_ALT);   /* Hop from branch to branch via the 2-byte offsets */
2298     code += 3;                       /* Step over the closing item and its 2-byte offset */
2299     break;
2300 
2301     default:
2302     return code;                     /* Anything else is significant */
2303     }
2304   }
2305 /* Control never reaches here */
2306 }
2307 
2308 
2309 
2310 
2311 /*************************************************
2312 *          Check for anchored expression         *
2313 *************************************************/
2314 
2315 /* Try to find out if this is an anchored regular expression. Consider each
2316 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
2317 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
2318 it's anchored. However, if this is a multiline pattern, then only OP_SOD
2319 counts, since OP_CIRC can match in the middle.
2320 
2321 A branch is also implicitly anchored if it starts with .* and DOTALL is set (tested here as
2322 OP_TYPESTAR or OP_TYPEMINSTAR applied to OP_ANY), because that will try the rest of the pattern at all possible matching points,
2323 so there is no point trying them again.
2324 
2325 Arguments:
2326   code       points to start of expression (the bracket)
2327   options    points to the options setting; overwritten if an OP_OPT that changes PCRE_MULTILINE is skipped
2328 
2329 Returns:     TRUE if every branch is anchored, otherwise FALSE
2330 */
2331 
2332 static BOOL
2333 is_anchored(register const uschar *code, int *options)
2334 {
2335 do {
2336    const uschar *scode = first_significant_code(code + 3, options,
2337      PCRE_MULTILINE, FALSE);   /* + 3 steps over the bracket or OP_ALT opcode and its 2-byte offset */
2338    register int op = *scode;
2339    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2340      { if (!is_anchored(scode, options)) return FALSE; }   /* Nested group: all of its branches must be anchored */
2341    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
2342             (*options & PCRE_DOTALL) != 0)
2343      { if (scode[1] != OP_ANY) return FALSE; }   /* The starred type must be OP_ANY */
2344    else if (op != OP_SOD &&
2345            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
2346      return FALSE;             /* Not OP_SOD, and not OP_CIRC outside multiline mode */
2347    code += (code[1] << 8) + code[2];   /* Follow the 2-byte offset to the next OP_ALT or the end of the group */
2348    }
2349 while (*code == OP_ALT);
2350 return TRUE;
2351 }
2352 
2353 
2354 
2355 /*************************************************
2356 *         Check for starting with ^ or .*        *
2357 *************************************************/
2358 
2359 /* This is called to find out if every branch starts with ^ or .* so that
2360 "first char" processing can be done to speed things up in multiline
2361 matching and for non-DOTALL patterns that start with .* (which must start at
2362 the beginning or after \n). Nested groups are checked recursively, branch by branch.
2363 
2364 Argument:  code points to start of expression (the bracket)
2365 Returns:   TRUE if every branch starts with OP_CIRC or a starred OP_ANY, otherwise FALSE
2366 */
2367 
2368 static BOOL
2369 is_startline(const uschar *code)
2370 {
2371 do {
2372    const uschar *scode = first_significant_code(code + 3, NULL, 0, FALSE);   /* optbit is 0, so the NULL options pointer is never dereferenced */
2373    register int op = *scode;
2374    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
2375      { if (!is_startline(scode)) return FALSE; }   /* Nested group: all of its branches must qualify */
2376    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
2377      { if (scode[1] != OP_ANY) return FALSE; }     /* The starred type must be OP_ANY */
2378    else if (op != OP_CIRC) return FALSE;
2379    code += (code[1] << 8) + code[2];               /* Follow the 2-byte offset to the next OP_ALT or the end of the group */
2380    }
2381 while (*code == OP_ALT);
2382 return TRUE;
2383 }
2384 
2385 
2386 
2387 /*************************************************
2388 *          Check for fixed first char            *
2389 *************************************************/
2390 
2391 /* Try to find out if there is a fixed first character. This is called for
2392 unanchored expressions, as it speeds up their processing quite considerably.
2393 Consider each alternative branch. If they all start with the same char, or with
2394 a bracket all of whose alternatives start with the same char (recurse ad lib),
2395 then we return that char, otherwise -1.
2396 
2397 Arguments:
2398   code       points to start of expression (the bracket)
2399   options    pointer to the options (used to check casing changes); passed on to first_significant_code() with PCRE_CASELESS
2400 
2401 Returns:     -1 or the fixed first char
2402 */
2403 
2404 static int
2405 find_firstchar(const uschar *code, int *options)
2406 {
2407 register int c = -1;         /* -1 = no first character established yet */
2408 do {
2409    int d;
2410    const uschar *scode = first_significant_code(code + 3, options,
2411      PCRE_CASELESS, TRUE);   /* TRUE: stop at an OP_OPT that changes the caseless bit (presumably then taking the default case below - confirm) */
2412    register int op = *scode;
2413 
2414    if (op >= OP_BRA) op = OP_BRA;   /* Numbered brackets are OP_BRA + n; treat them all alike */
2415 
2416    switch(op)
2417      {
2418      default:
2419      return -1;              /* Any other opcode gives no fixed first character */
2420 
2421      case OP_BRA:
2422      case OP_ASSERT:
2423      case OP_ONCE:
2424      case OP_COND:
2425      if ((d = find_firstchar(scode, options)) < 0) return -1;   /* The nested group must have one itself */
2426      if (c < 0) c = d; else if (c != d) return -1;              /* and it must agree with earlier branches */
2427      break;
2428 
2429      case OP_EXACT:       /* Skip one extra byte, then fall through */
2430      scode++;             /* NOTE(review): with the increment below, scode[1] lands two bytes later, presumably past a 2-byte repeat count - confirm */
2431 
2432      case OP_CHARS:       /* Fall through */
2433      scode++;             /* For OP_CHARS this moves scode[1] past the length byte onto the first literal */
2434 
2435      case OP_PLUS:
2436      case OP_MINPLUS:
2437      if (c < 0) c = scode[1]; else if (c != scode[1]) return -1;   /* scode[1] is taken as the character; all branches must agree */
2438      break;
2439      }
2440 
2441    code += (code[1] << 8) + code[2];   /* Follow the 2-byte offset to the next OP_ALT or the end of the group */
2442    }
2443 while (*code == OP_ALT);
2444 return c;
2445 }
2446 
2447 
2448 
2449 
2450 
2451 /*************************************************
2452 *        Compile a Regular Expression            *
2453 *************************************************/
2454 
2455 /* This function takes a string and returns a pointer to a block of store
2456 holding a compiled version of the expression.
2457 
2458 Arguments:
2459   pattern      the regular expression
2460   options      various option bits
2461   errorptr     pointer to pointer to error text
2462   erroroffset  ptr offset in pattern where error was detected
2463   tables       pointer to character tables or NULL
2464 
2465 Returns:       pointer to compiled data block, or NULL on error,
2466                with errorptr and erroroffset set
2467 */
2468 
2469 pcre *
2470 pcre_compile(const char *pattern, int options, const char **errorptr,
2471   int *erroroffset, const unsigned char *tables)
2472 {
2473 real_pcre *re;
2474 int length = 3;      /* For initial BRA plus length */
2475 int runlength;
2476 int c, reqchar, countlits;
2477 int bracount = 0;
2478 int top_backref = 0;
2479 int branch_extra = 0;
2480 int branch_newextra;
2481 unsigned int brastackptr = 0;
2482 size_t size;
2483 uschar *code;
2484 const uschar *ptr;
2485 compile_data compile_block;
2486 int brastack[BRASTACK_SIZE];
2487 uschar bralenstack[BRASTACK_SIZE];
2488 
2489 #ifdef DEBUG
2490 uschar *code_base, *code_end;
2491 #endif
2492 
2493 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
2494 
2495 #ifndef SUPPORT_UTF8
2496 if ((options & PCRE_UTF8) != 0)
2497   {
2498   *errorptr = ERR32;
2499   return NULL;
2500   }
2501 #endif
2502 
2503 /* We can't pass back an error message if errorptr is NULL; I guess the best we
2504 can do is just return NULL. */
2505 
2506 if (errorptr == NULL) return NULL;
2507 *errorptr = NULL;
2508 
2509 /* However, we can give a message for this error */
2510 
2511 if (erroroffset == NULL)
2512   {
2513   *errorptr = ERR16;
2514   return NULL;
2515   }
2516 *erroroffset = 0;
2517 
2518 if ((options & ~PUBLIC_OPTIONS) != 0)
2519   {
2520   *errorptr = ERR17;
2521   return NULL;
2522   }
2523 
2524 /* Set up pointers to the individual character tables */
2525 
2526 if (tables == NULL) tables = pcre_default_tables;
2527 compile_block.lcc = tables + lcc_offset;
2528 compile_block.fcc = tables + fcc_offset;
2529 compile_block.cbits = tables + cbits_offset;
2530 compile_block.ctypes = tables + ctypes_offset;
2531 
2532 /* Reflect pattern for debugging output */
2533 
2534 DPRINTF(("------------------------------------------------------------------\n"));
2535 DPRINTF(("%s\n", pattern));
2536 
2537 /* The first thing to do is to make a pass over the pattern to compute the
2538 amount of store required to hold the compiled code. This does not have to be
2539 perfect as long as errors are overestimates. At the same time we can detect any
2540 internal flag settings. Make an attempt to correct for any counted white space
2541 if an "extended" flag setting appears late in the pattern. We can't be so
2542 clever for #-comments. */
2543 
2544 ptr = (const uschar *)(pattern - 1);
2545 while ((c = *(++ptr)) != 0)
2546   {
2547   int min, max;
2548   int class_charcount;
2549   int bracket_length;
2550 
2551   if ((options & PCRE_EXTENDED) != 0)
2552     {
2553     if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2554     if (c == '#')
2555       {
2556       /* The space before the ; is to avoid a warning on a silly compiler
2557       on the Macintosh. */
2558       while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2559       continue;
2560       }
2561     }
2562 
2563   switch(c)
2564     {
2565     /* A backslashed item may be an escaped "normal" character or a
2566     character type. For a "normal" character, put the pointers and
2567     character back so that tests for whitespace etc. in the input
2568     are done correctly. */
2569 
2570     case '\\':
2571       {
2572       const uschar *save_ptr = ptr;
2573       c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
2574       if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2575       if (c >= 0)
2576         {
2577         ptr = save_ptr;
2578         c = '\\';
2579         goto NORMAL_CHAR;
2580         }
2581       }
2582     length++;
2583 
2584     /* A back reference needs an additional 2 bytes, plus either one or 5
2585     bytes for a repeat. We also need to keep the value of the highest
2586     back reference. */
2587 
2588     if (c <= -ESC_REF)
2589       {
2590       int refnum = -c - ESC_REF;
2591       if (refnum > top_backref) top_backref = refnum;
2592       length += 2;   /* For single back reference */
2593       if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2594         {
2595         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2596         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2597         if ((min == 0 && (max == 1 || max == -1)) ||
2598           (min == 1 && max == -1))
2599             length++;
2600         else length += 5;
2601         if (ptr[1] == '?') ptr++;
2602         }
2603       }
2604     continue;
2605 
2606     case '^':
2607     case '.':
2608     case '$':
2609     case '*':     /* These repeats won't be after brackets; */
2610     case '+':     /* those are handled separately */
2611     case '?':
2612     length++;
2613     continue;
2614 
2615     /* This covers the cases of repeats after a single char, metachar, class,
2616     or back reference. */
2617 
2618     case '{':
2619     if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
2620     ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
2621     if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2622     if ((min == 0 && (max == 1 || max == -1)) ||
2623       (min == 1 && max == -1))
2624         length++;
2625     else
2626       {
2627       length--;   /* Uncount the original char or metachar */
2628       if (min == 1) length++; else if (min > 0) length += 4;
2629       if (max > 0) length += 4; else length += 2;
2630       }
2631     if (ptr[1] == '?') ptr++;
2632     continue;
2633 
2634     /* An alternation contains an offset to the next branch or ket. If any ims
2635     options changed in the previous branch(es), and/or if we are in a
2636     lookbehind assertion, extra space will be needed at the start of the
2637     branch. This is handled by branch_extra. */
2638 
2639     case '|':
2640     length += 3 + branch_extra;
2641     continue;
2642 
2643     /* A character class uses 33 characters. Don't worry about character types
2644     that aren't allowed in classes - they'll get picked up during the compile.
2645     A character class that contains only one character uses 2 or 3 bytes,
2646     depending on whether it is negated or not. Notice this where we can. */
2647 
2648     case '[':
2649     class_charcount = 0;
2650     if (*(++ptr) == '^') ptr++;
2651     do
2652       {
2653       if (*ptr == '\\')
2654         {
2655         int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
2656           &compile_block);
2657         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2658         if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
2659         }
2660       else class_charcount++;
2661       ptr++;
2662       }
2663     while (*ptr != 0 && *ptr != ']');
2664 
2665     /* Repeats for negated single chars are handled by the general code */
2666 
2667     if (class_charcount == 1) length += 3; else
2668       {
2669       length += 33;
2670 
2671       /* A repeat needs either 1 or 5 bytes. */
2672 
2673       if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
2674         {
2675         ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
2676         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2677         if ((min == 0 && (max == 1 || max == -1)) ||
2678           (min == 1 && max == -1))
2679             length++;
2680         else length += 5;
2681         if (ptr[1] == '?') ptr++;
2682         }
2683       }
2684     continue;
2685 
2686     /* Brackets may be genuine groups or special things */
2687 
2688     case '(':
2689     branch_newextra = 0;
2690     bracket_length = 3;
2691 
2692     /* Handle special forms of bracket, which all start (? */
2693 
2694     if (ptr[1] == '?')
2695       {
2696       int set, unset;
2697       int *optset;
2698 
2699       switch (c = ptr[2])
2700         {
2701         /* Skip over comments entirely */
2702         case '#':
2703         ptr += 3;
2704         while (*ptr != 0 && *ptr != ')') ptr++;
2705         if (*ptr == 0)
2706           {
2707           *errorptr = ERR18;
2708           goto PCRE_ERROR_RETURN;
2709           }
2710         continue;
2711 
2712         /* Non-referencing groups and lookaheads just move the pointer on, and
2713         then behave like a non-special bracket, except that they don't increment
2714         the count of extracting brackets. Ditto for the "once only" bracket,
2715         which is in Perl from version 5.005. */
2716 
2717         case ':':
2718         case '=':
2719         case '!':
2720         case '>':
2721         ptr += 2;
2722         break;
2723 
2724         /* A recursive call to the regex is an extension, to provide the
2725         facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
2726 
2727         case 'R':
2728         if (ptr[3] != ')')
2729           {
2730           *errorptr = ERR29;
2731           goto PCRE_ERROR_RETURN;
2732           }
2733         ptr += 3;
2734         length += 1;
2735         break;
2736 
2737         /* Lookbehinds are in Perl from version 5.005 */
2738 
2739         case '<':
2740         if (ptr[3] == '=' || ptr[3] == '!')
2741           {
2742           ptr += 3;
2743           branch_newextra = 3;
2744           length += 3;         /* For the first branch */
2745           break;
2746           }
2747         *errorptr = ERR24;
2748         goto PCRE_ERROR_RETURN;
2749 
2750         /* Conditionals are in Perl from version 5.005. The bracket must either
2751         be followed by a number (for bracket reference) or by an assertion
2752         group. */
2753 
2754         case '(':
2755         if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
2756           {
2757           ptr += 4;
2758           length += 3;
2759           while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
2760           if (*ptr != ')')
2761             {
2762             *errorptr = ERR26;
2763             goto PCRE_ERROR_RETURN;
2764             }
2765           }
2766         else   /* An assertion must follow */
2767           {
2768           ptr++;   /* Can treat like ':' as far as spacing is concerned */
2769           if (ptr[2] != '?' ||
2770              (ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
2771             {
2772             ptr += 2;    /* To get right offset in message */
2773             *errorptr = ERR28;
2774             goto PCRE_ERROR_RETURN;
2775             }
2776           }
2777         break;
2778 
2779         /* Else loop checking valid options until ) is met. Anything else is an
2780         error. If we are without any brackets, i.e. at top level, the settings
2781         act as if specified in the options, so massage the options immediately.
2782         This is for backward compatibility with Perl 5.004. */
2783 
2784         default:
2785         set = unset = 0;
2786         optset = &set;
2787         ptr += 2;
2788 
2789         for (;; ptr++)
2790           {
2791           c = *ptr;
2792           switch (c)
2793             {
2794             case 'i':
2795             *optset |= PCRE_CASELESS;
2796             continue;
2797 
2798             case 'm':
2799             *optset |= PCRE_MULTILINE;
2800             continue;
2801 
2802             case 's':
2803             *optset |= PCRE_DOTALL;
2804             continue;
2805 
2806             case 'x':
2807             *optset |= PCRE_EXTENDED;
2808             continue;
2809 
2810             case 'X':
2811             *optset |= PCRE_EXTRA;
2812             continue;
2813 
2814             case 'U':
2815             *optset |= PCRE_UNGREEDY;
2816             continue;
2817 
2818             case '-':
2819             optset = &unset;
2820             continue;
2821 
2822             /* A termination by ')' indicates an options-setting-only item;
2823             this is global at top level; otherwise nothing is done here and
2824             it is handled during the compiling process on a per-bracket-group
2825             basis. */
2826 
2827             case ')':
2828             if (brastackptr == 0)
2829               {
2830               options = (options | set) & (~unset);
2831               set = unset = 0;     /* To save length */
2832               }
2833             /* Fall through */
2834 
2835             /* A termination by ':' indicates the start of a nested group with
2836             the given options set. This is again handled at compile time, but
2837             we must allow for compiled space if any of the ims options are
2838             set. We also have to allow for resetting space at the end of
2839             the group, which is why 4 is added to the length and not just 2.
2840             If there are several changes of options within the same group, this
2841             will lead to an over-estimate on the length, but this shouldn't
2842             matter very much. We also have to allow for resetting options at
2843             the start of any alternations, which we do by setting
2844             branch_newextra to 2. Finally, we record whether the case-dependent
2845             flag ever changes within the regex. This is used by the "required
2846             character" code. */
2847 
2848             case ':':
2849             if (((set|unset) & PCRE_IMS) != 0)
2850               {
2851               length += 4;
2852               branch_newextra = 2;
2853               if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
2854               }
2855             goto END_OPTIONS;
2856 
2857             /* Unrecognized option character */
2858 
2859             default:
2860             *errorptr = ERR12;
2861             goto PCRE_ERROR_RETURN;
2862             }
2863           }
2864 
2865         /* If we hit a closing bracket, that's it - this is a freestanding
2866         option-setting. We need to ensure that branch_extra is updated if
2867         necessary. The only values branch_newextra can have here are 0 or 2.
2868         If the value is 2, then branch_extra must either be 2 or 5, depending
2869         on whether this is a lookbehind group or not. */
2870 
2871         END_OPTIONS:
2872         if (c == ')')
2873           {
2874           if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
2875             branch_extra += branch_newextra;
2876           continue;
2877           }
2878 
2879         /* If options were terminated by ':' control comes here. Fall through
2880         to handle the group below. */
2881         }
2882       }
2883 
2884     /* Extracting brackets must be counted so we can process escapes in a
2885     Perlish way. If the number exceeds EXTRACT_BASIC_MAX we are going to
2886     need an additional 3 bytes of store per extracting bracket. */
2887 
2888     else
2889       {
2890       bracount++;
2891       if (bracount > EXTRACT_BASIC_MAX) bracket_length += 3;
2892       }
2893 
2894     /* Save length for computing whole length at end if there's a repeat that
2895     requires duplication of the group. Also save the current value of
2896     branch_extra, and start the new group with the new value. If non-zero, this
2897     will either be 2 for a (?imsx: group, or 3 for a lookbehind assertion. */
2898 
2899     if (brastackptr >= sizeof(brastack)/sizeof(int))
2900       {
2901       *errorptr = ERR19;
2902       goto PCRE_ERROR_RETURN;
2903       }
2904 
2905     bralenstack[brastackptr] = branch_extra;
2906     branch_extra = branch_newextra;
2907 
2908     brastack[brastackptr++] = length;
2909     length += bracket_length;
2910     continue;
2911 
2912     /* Handle ket. Look for subsequent max/min; for certain sets of values we
2913     have to replicate this bracket up to that many times. If brastackptr is
2914     0 this is an unmatched bracket which will generate an error, but take care
2915     not to try to access brastack[-1] when computing the length and restoring
2916     the branch_extra value. */
2917 
2918     case ')':
2919     length += 3;
2920       {
2921       int minval = 1;
2922       int maxval = 1;
2923       int duplength;
2924 
2925       if (brastackptr > 0)
2926         {
2927         duplength = length - brastack[--brastackptr];
2928         branch_extra = bralenstack[brastackptr];
2929         }
2930       else duplength = 0;
2931 
2932       /* Leave ptr at the final char; for read_repeat_counts this happens
2933       automatically; for the others we need an increment. */
2934 
2935       if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
2936         {
2937         ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
2938           &compile_block);
2939         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
2940         }
2941       else if (c == '*') { minval = 0; maxval = -1; ptr++; }
2942       else if (c == '+') { maxval = -1; ptr++; }
2943       else if (c == '?') { minval = 0; ptr++; }
2944 
2945       /* If the minimum is zero, we have to allow for an OP_BRAZERO before the
2946       group, and if the maximum is greater than zero, we have to replicate
2947       maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
2948       bracket set - hence the 7. */
2949 
2950       if (minval == 0)
2951         {
2952         length++;
2953         if (maxval > 0) length += (maxval - 1) * (duplength + 7);
2954         }
2955 
2956       /* When the minimum is greater than zero, we have to replicate up to
2957       minval-1 times, with no additions required in the copies. Then, if
2958       there is a limited maximum we have to replicate up to maxval-1 times
2959       allowing for a BRAZERO item before each optional copy and nesting
2960       brackets for all but one of the optional copies. */
2961 
2962       else
2963         {
2964         length += (minval - 1) * duplength;
2965         if (maxval > minval)   /* Need this test as maxval=-1 means no limit */
2966           length += (maxval - minval) * (duplength + 7) - 6;
2967         }
2968       }
2969     continue;
2970 
2971     /* Non-special character. For a run of such characters the length required
2972     is the number of characters + 2, except that the maximum run length is 255.
2973     We won't get a skipped space or a non-data escape or the start of a #
2974     comment as the first character, so the length can't be zero. */
2975 
2976     NORMAL_CHAR:
2977     default:
2978     length += 2;
2979     runlength = 0;
2980     do
2981       {
2982       if ((options & PCRE_EXTENDED) != 0)
2983         {
2984         if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
2985         if (c == '#')
2986           {
2987           /* The space before the ; is to avoid a warning on a silly compiler
2988           on the Macintosh. */
2989           while ((c = *(++ptr)) != 0 && c != NEWLINE) ;
2990           continue;
2991           }
2992         }
2993 
2994       /* Backslash may introduce a data char or a metacharacter; stop the
2995       string before the latter. */
2996 
2997       if (c == '\\')
2998         {
2999         const uschar *saveptr = ptr;
3000         c = check_escape(&ptr, errorptr, bracount, options, FALSE,
3001           &compile_block);
3002         if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
3003         if (c < 0) { ptr = saveptr; break; }
3004 
3005 #ifdef SUPPORT_UTF8
3006         if (c > 127 && (options & PCRE_UTF8) != 0)
3007           {
3008           int i;
3009           for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
3010             if (c <= utf8_table1[i]) break;
3011           runlength += i;
3012           }
3013 #endif
3014         }
3015 
3016       /* Ordinary character or single-char escape */
3017 
3018       runlength++;
3019       }
3020 
3021     /* This "while" is the end of the "do" above. */
3022 
3023     while (runlength < MAXLIT &&
3024       (compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
3025 
3026     ptr--;
3027     length += runlength;
3028     continue;
3029     }
3030   }
3031 
3032 length += 4;    /* For final KET and END */
3033 
3034 if (length > 65539)
3035   {
3036   *errorptr = ERR20;
3037   return NULL;
3038   }
3039 
3040 /* Compute the size of data block needed and get it, either from malloc or
3041 externally provided function. We specify "code[0]" in the offsetof() expression
3042 rather than just "code", because it has been reported that one broken compiler
3043 fails on "code" because it is also an independent variable. It should make no
3044 difference to the value of the offsetof(). */
3045 
3046 size = length + offsetof(real_pcre, code[0]);
3047 re = (real_pcre *)(pcre_malloc)(size);
3048 
3049 if (re == NULL)
3050   {
3051   *errorptr = ERR21;
3052   return NULL;
3053   }
3054 
3055 /* Put in the magic number, and save the size, options, and table pointer */
3056 
3057 re->magic_number = MAGIC_NUMBER;
3058 re->size = size;
3059 re->options = options;
3060 re->tables = tables;
3061 
3062 /* Set up a starting, non-extracting bracket, then compile the expression. On
3063 error, *errorptr will be set non-NULL, so we don't need to look at the result
3064 of the function here. */
3065 
3066 ptr = (const uschar *)pattern;
3067 code = re->code;
3068 *code = OP_BRA;
3069 bracount = 0;
3070 (void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, 0,
3071   &reqchar, &countlits, &compile_block);
3072 re->top_bracket = bracount;
3073 re->top_backref = top_backref;
3074 
3075 /* If not reached end of pattern on success, there's an excess bracket. */
3076 
3077 if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
3078 
3079 /* Fill in the terminating state and check for disastrous overflow, but
3080 if debugging, leave the test till after things are printed out. */
3081 
3082 *code++ = OP_END;
3083 
3084 #ifndef DEBUG
3085 if (code - re->code > length) *errorptr = ERR23;
3086 #endif
3087 
3088 /* Give an error if there's back reference to a non-existent capturing
3089 subpattern. */
3090 
3091 if (top_backref > re->top_bracket) *errorptr = ERR15;
3092 
3093 /* Failed to compile */
3094 
3095 if (*errorptr != NULL)
3096   {
3097   (pcre_free)(re);
3098   PCRE_ERROR_RETURN:
3099   *erroroffset = ptr - (const uschar *)pattern;
3100   return NULL;
3101   }
3102 
3103 /* If the anchored option was not passed, set flag if we can determine that the
3104 pattern is anchored by virtue of ^ characters or \A or anything else (such as
3105 starting with .* when DOTALL is set).
3106 
3107 Otherwise, see if we can determine what the first character has to be, because
3108 that speeds up unanchored matches no end. If not, see if we can set the
3109 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
3110 start with ^, and also when all branches start with .* for non-DOTALL matches.
3111 */
3112 
3113 if ((options & PCRE_ANCHORED) == 0)
3114   {
3115   int temp_options = options;
3116   if (is_anchored(re->code, &temp_options))
3117     re->options |= PCRE_ANCHORED;
3118   else
3119     {
3120     int ch = find_firstchar(re->code, &temp_options);
3121     if (ch >= 0)
3122       {
3123       re->first_char = ch;
3124       re->options |= PCRE_FIRSTSET;
3125       }
3126     else if (is_startline(re->code))
3127       re->options |= PCRE_STARTLINE;
3128     }
3129   }
3130 
3131 /* Save the last required character if there are at least two literal
3132 characters on all paths, or if there is no first character setting. */
3133 
3134 if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
3135   {
3136   re->req_char = reqchar;
3137   re->options |= PCRE_REQCHSET;
3138   }
3139 
3140 /* Print out the compiled data for debugging */
3141 
3142 #ifdef DEBUG
3143 
3144 printf("Length = %d top_bracket = %d top_backref = %d\n",
3145   length, re->top_bracket, re->top_backref);
3146 
3147 if (re->options != 0)
3148   {
3149   printf("%s%s%s%s%s%s%s%s%s\n",
3150     ((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
3151     ((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
3152     ((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
3153     ((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
3154     ((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
3155     ((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
3156     ((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
3157     ((re->options & PCRE_EXTRA) != 0)? "extra " : "",
3158     ((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
3159   }
3160 
3161 if ((re->options & PCRE_FIRSTSET) != 0)
3162   {
3163   if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
3164     else printf("First char = \\x%02x\n", re->first_char);
3165   }
3166 
3167 if ((re->options & PCRE_REQCHSET) != 0)
3168   {
3169   if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
3170     else printf("Req char = \\x%02x\n", re->req_char);
3171   }
3172 
3173 code_end = code;
3174 code_base = code = re->code;
3175 
3176 while (code < code_end)
3177   {
3178   int charlength;
3179 
3180   printf("%3d ", code - code_base);
3181 
3182   if (*code >= OP_BRA)
3183     {
3184     if (*code - OP_BRA > EXTRACT_BASIC_MAX)
3185       printf("%3d Bra extra", (code[1] << 8) + code[2]);
3186     else
3187       printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
3188     code += 2;
3189     }
3190 
3191   else switch(*code)
3192     {
3193     case OP_OPT:
3194     printf(" %.2x %s", code[1], OP_names[*code]);
3195     code++;
3196     break;
3197 
3198     case OP_CHARS:
3199     charlength = *(++code);
3200     printf("%3d ", charlength);
3201     while (charlength-- > 0)
3202       if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
3203     break;
3204 
3205     case OP_KETRMAX:
3206     case OP_KETRMIN:
3207     case OP_ALT:
3208     case OP_KET:
3209     case OP_ASSERT:
3210     case OP_ASSERT_NOT:
3211     case OP_ASSERTBACK:
3212     case OP_ASSERTBACK_NOT:
3213     case OP_ONCE:
3214     case OP_REVERSE:
3215     case OP_BRANUMBER:
3216     case OP_COND:
3217     case OP_CREF:
3218     printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
3219     code += 2;
3220     break;
3221 
3222     case OP_STAR:
3223     case OP_MINSTAR:
3224     case OP_PLUS:
3225     case OP_MINPLUS:
3226     case OP_QUERY:
3227     case OP_MINQUERY:
3228     case OP_TYPESTAR:
3229     case OP_TYPEMINSTAR:
3230     case OP_TYPEPLUS:
3231     case OP_TYPEMINPLUS:
3232     case OP_TYPEQUERY:
3233     case OP_TYPEMINQUERY:
3234     if (*code >= OP_TYPESTAR)
3235       printf("    %s", OP_names[code[1]]);
3236     else if (isprint(c = code[1])) printf("    %c", c);
3237       else printf("    \\x%02x", c);
3238     printf("%s", OP_names[*code++]);
3239     break;
3240 
3241     case OP_EXACT:
3242     case OP_UPTO:
3243     case OP_MINUPTO:
3244     if (isprint(c = code[3])) printf("    %c{", c);
3245       else printf("    \\x%02x{", c);
3246     if (*code != OP_EXACT) printf("0,");
3247     printf("%d}", (code[1] << 8) + code[2]);
3248     if (*code == OP_MINUPTO) printf("?");
3249     code += 3;
3250     break;
3251 
3252     case OP_TYPEEXACT:
3253     case OP_TYPEUPTO:
3254     case OP_TYPEMINUPTO:
3255     printf("    %s{", OP_names[code[3]]);
3256     if (*code != OP_TYPEEXACT) printf(",");
3257     printf("%d}", (code[1] << 8) + code[2]);
3258     if (*code == OP_TYPEMINUPTO) printf("?");
3259     code += 3;
3260     break;
3261 
3262     case OP_NOT:
3263     if (isprint(c = *(++code))) printf("    [^%c]", c);
3264       else printf("    [^\\x%02x]", c);
3265     break;
3266 
3267     case OP_NOTSTAR:
3268     case OP_NOTMINSTAR:
3269     case OP_NOTPLUS:
3270     case OP_NOTMINPLUS:
3271     case OP_NOTQUERY:
3272     case OP_NOTMINQUERY:
3273     if (isprint(c = code[1])) printf("    [^%c]", c);
3274       else printf("    [^\\x%02x]", c);
3275     printf("%s", OP_names[*code++]);
3276     break;
3277 
3278     case OP_NOTEXACT:
3279     case OP_NOTUPTO:
3280     case OP_NOTMINUPTO:
3281     if (isprint(c = code[3])) printf("    [^%c]{", c);
3282       else printf("    [^\\x%02x]{", c);
3283     if (*code != OP_NOTEXACT) printf(",");
3284     printf("%d}", (code[1] << 8) + code[2]);
3285     if (*code == OP_NOTMINUPTO) printf("?");
3286     code += 3;
3287     break;
3288 
3289     case OP_REF:
3290     printf("    \\%d", (code[1] << 8) | code[2]);
3291     code += 3;
3292     goto CLASS_REF_REPEAT;
3293 
3294     case OP_CLASS:
3295       {
3296       int i, min, max;
3297       code++;
3298       printf("    [");
3299 
3300       for (i = 0; i < 256; i++)
3301         {
3302         if ((code[i/8] & (1 << (i&7))) != 0)
3303           {
3304           int j;
3305           for (j = i+1; j < 256; j++)
3306             if ((code[j/8] & (1 << (j&7))) == 0) break;
3307           if (i == '-' || i == ']') printf("\\");
3308           if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
3309           if (--j > i)
3310             {
3311             printf("-");
3312             if (j == '-' || j == ']') printf("\\");
3313             if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
3314             }
3315           i = j;
3316           }
3317         }
3318       printf("]");
3319       code += 32;
3320 
3321       CLASS_REF_REPEAT:
3322 
3323       switch(*code)
3324         {
3325         case OP_CRSTAR:
3326         case OP_CRMINSTAR:
3327         case OP_CRPLUS:
3328         case OP_CRMINPLUS:
3329         case OP_CRQUERY:
3330         case OP_CRMINQUERY:
3331         printf("%s", OP_names[*code]);
3332         break;
3333 
3334         case OP_CRRANGE:
3335         case OP_CRMINRANGE:
3336         min = (code[1] << 8) + code[2];
3337         max = (code[3] << 8) + code[4];
3338         if (max == 0) printf("{%d,}", min);
3339         else printf("{%d,%d}", min, max);
3340         if (*code == OP_CRMINRANGE) printf("?");
3341         code += 4;
3342         break;
3343 
3344         default:
3345         code--;
3346         }
3347       }
3348     break;
3349 
3350     /* Anything else is just a one-node item */
3351 
3352     default:
3353     printf("    %s", OP_names[*code]);
3354     break;
3355     }
3356 
3357   code++;
3358   printf("\n");
3359   }
3360 printf("------------------------------------------------------------------\n");
3361 
3362 /* This check is done here in the debugging case so that the code that
3363 was compiled can be seen. */
3364 
3365 if (code - re->code > length)
3366   {
3367   *errorptr = ERR23;
3368   (pcre_free)(re);
3369   *erroroffset = ptr - (uschar *)pattern;
3370   return NULL;
3371   }
3372 #endif
3373 
3374 return (pcre *)re;
3375 }
3376 
3377 
3378 
3379 /*************************************************
3380 *          Match a back-reference                *
3381 *************************************************/
3382 
3383 /* If a back reference hasn't been set, the length that is passed is greater
3384 than the number of characters left in the string, so the match fails.
3385 
3386 Arguments:
3387   offset      index into the offset vector
3388   eptr        points into the subject
3389   length      length to be matched
3390   md          points to match data block
3391   ims         the ims flags
3392 
3393 Returns:      TRUE if matched
3394 */
3395 
3396 static BOOL
3397 match_ref(int offset, register const uschar *eptr, int length, match_data *md,
3398   unsigned long int ims)
3399 {
3400 const uschar *p = md->start_subject + md->offset_vector[offset];
3401 
3402 #ifdef DEBUG
3403 if (eptr >= md->end_subject)
3404   printf("matching subject <null>");
3405 else
3406   {
3407   printf("matching subject ");
3408   pchars(eptr, length, TRUE, md);
3409   }
3410 printf(" against backref ");
3411 pchars(p, length, FALSE, md);
3412 printf("\n");
3413 #endif
3414 
3415 /* Always fail if not enough characters left */
3416 
3417 if (length > md->end_subject - eptr) return FALSE;
3418 
3419 /* Separate the caselesss case for speed */
3420 
3421 if ((ims & PCRE_CASELESS) != 0)
3422   {
3423   while (length-- > 0)
3424     if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE;
3425   }
3426 else
3427   { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
3428 
3429 return TRUE;
3430 }
3431 
3432 
3433 
3434 /*************************************************
3435 *         Match from current position            *
3436 *************************************************/
3437 
3438 /* On entry ecode points to the first opcode, and eptr to the first character
3439 in the subject string, while eptrb holds the value of eptr at the start of the
3440 last bracketed group - used for breaking infinite loops matching zero-length
3441 strings.
3442 
3443 Arguments:
3444    eptr        pointer in subject
3445    ecode       position in code
3446    offset_top  current top pointer
3447    md          pointer to "static" info for the match
3448    ims         current /i, /m, and /s options
3449    eptrb       pointer to chain of blocks containing eptr at start of
3450                  brackets - for testing for empty matches
3451    flags       can contain
3452                  match_condassert - this is an assertion condition
3453                  match_isgroup - this is the start of a bracketed group
3454 
3455 Returns:       TRUE if matched
3456 */
3457 
3458 static BOOL
3459 match(register const uschar *eptr, register const uschar *ecode,
3460   int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
3461   int flags)
3462 {
3463 unsigned long int original_ims = ims;   /* Save for resetting on ')' */
3464 eptrblock newptrb;
3465 
3466 /* At the start of a bracketed group, add the current subject pointer to the
3467 stack of such pointers, to be re-instated at the end of the group when we hit
3468 the closing ket. When match() is called in other circumstances, we don't add to
3469 the stack. */
3470 
3471 if ((flags & match_isgroup) != 0)
3472   {
3473   newptrb.prev = eptrb;
3474   newptrb.saved_eptr = eptr;
3475   eptrb = &newptrb;
3476   }
3477 
3478 /* Now start processing the operations. */
3479 
3480 for (;;)
3481   {
3482   int op = (int)*ecode;
3483   int min, max, ctype;
3484   register int i;
3485   register int c;
3486   BOOL minimize = FALSE;
3487 
3488   /* Opening capturing bracket. If there is space in the offset vector, save
3489   the current subject position in the working slot at the top of the vector. We
3490   mustn't change the current values of the data slot, because they may be set
3491   from a previous iteration of this group, and be referred to by a reference
3492   inside the group.
3493 
3494   If the bracket fails to match, we need to restore this value and also the
3495   values of the final offsets, in case they were set by a previous iteration of
3496   the same bracket.
3497 
3498   If there isn't enough space in the offset vector, treat this as if it were a
3499   non-capturing bracket. Don't worry about setting the flag for the error case
3500   here; that is handled in the code for KET. */
3501 
3502   if (op > OP_BRA)
3503     {
3504     int offset;
3505     int number = op - OP_BRA;
3506 
3507     /* For extended extraction brackets (large number), we have to fish out the
3508     number from a dummy opcode at the start. */
3509 
3510     if (number > EXTRACT_BASIC_MAX) number = (ecode[4] << 8) | ecode[5];
3511     offset = number << 1;
3512 
3513 #ifdef DEBUG
3514     printf("start bracket %d subject=", number);
3515     pchars(eptr, 16, TRUE, md);
3516     printf("\n");
3517 #endif
3518 
3519     if (offset < md->offset_max)
3520       {
3521       int save_offset1 = md->offset_vector[offset];
3522       int save_offset2 = md->offset_vector[offset+1];
3523       int save_offset3 = md->offset_vector[md->offset_end - number];
3524 
3525       DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
3526       md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
3527 
3528       do
3529         {
3530         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3531           return TRUE;
3532         ecode += (ecode[1] << 8) + ecode[2];
3533         }
3534       while (*ecode == OP_ALT);
3535 
3536       DPRINTF(("bracket %d failed\n", number));
3537 
3538       md->offset_vector[offset] = save_offset1;
3539       md->offset_vector[offset+1] = save_offset2;
3540       md->offset_vector[md->offset_end - number] = save_offset3;
3541 
3542       return FALSE;
3543       }
3544 
3545     /* Insufficient room for saving captured contents */
3546 
3547     else op = OP_BRA;
3548     }
3549 
3550   /* Other types of node can be handled by a switch */
3551 
3552   switch(op)
3553     {
3554     case OP_BRA:     /* Non-capturing bracket: optimized */
3555     DPRINTF(("start bracket 0\n"));
3556     do
3557       {
3558       if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3559         return TRUE;
3560       ecode += (ecode[1] << 8) + ecode[2];
3561       }
3562     while (*ecode == OP_ALT);
3563     DPRINTF(("bracket 0 failed\n"));
3564     return FALSE;
3565 
3566     /* Conditional group: compilation checked that there are no more than
3567     two branches. If the condition is false, skipping the first branch takes us
3568     past the end if there is only one branch, but that's OK because that is
3569     exactly what going to the ket would do. */
3570 
3571     case OP_COND:
3572     if (ecode[3] == OP_CREF)         /* Condition is extraction test */
3573       {
3574       int offset = (ecode[4] << 9) | (ecode[5] << 1); /* Doubled ref number */
3575       return match(eptr,
3576         ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
3577           6 : 3 + (ecode[1] << 8) + ecode[2]),
3578         offset_top, md, ims, eptrb, match_isgroup);
3579       }
3580 
3581     /* The condition is an assertion. Call match() to evaluate it - setting
3582     match_condassert in the flags causes it to stop at the end of an assertion. */
3583 
3584     else
3585       {
3586       if (match(eptr, ecode+3, offset_top, md, ims, NULL,
3587           match_condassert | match_isgroup))
3588         {
3589         ecode += 3 + (ecode[4] << 8) + ecode[5];
3590         while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
3591         }
3592       else ecode += (ecode[1] << 8) + ecode[2];
3593       return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
3594       }
3595     /* Control never reaches here */
3596 
3597     /* Skip over conditional reference or large extraction number data if
3598     encountered. */
3599 
3600     case OP_CREF:
3601     case OP_BRANUMBER:
3602     ecode += 3;
3603     break;
3604 
3605     /* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
3606     an empty string - recursion will then try other alternatives, if any. */
3607 
3608     case OP_END:
3609     if (md->notempty && eptr == md->start_match) return FALSE;
3610     md->end_match_ptr = eptr;          /* Record where we ended */
3611     md->end_offset_top = offset_top;   /* and how many extracts were taken */
3612     return TRUE;
3613 
3614     /* Change option settings */
3615 
3616     case OP_OPT:
3617     ims = ecode[1];
3618     ecode += 2;
3619     DPRINTF(("ims set to %02lx\n", ims));
3620     break;
3621 
3622     /* Assertion brackets. Check the alternative branches in turn - the
3623     matching won't pass the KET for an assertion. If any one branch matches,
3624     the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
3625     start of each branch to move the current point backwards, so the code at
3626     this level is identical to the lookahead case. */
3627 
3628     case OP_ASSERT:
3629     case OP_ASSERTBACK:
3630     do
3631       {
3632       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
3633       ecode += (ecode[1] << 8) + ecode[2];
3634       }
3635     while (*ecode == OP_ALT);
3636     if (*ecode == OP_KET) return FALSE;
3637 
3638     /* If checking an assertion for a condition, return TRUE. */
3639 
3640     if ((flags & match_condassert) != 0) return TRUE;
3641 
3642     /* Continue from after the assertion, updating the offsets high water
3643     mark, since extracts may have been taken during the assertion. */
3644 
3645     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3646     ecode += 3;
3647     offset_top = md->end_offset_top;
3648     continue;
3649 
3650     /* Negative assertion: all branches must fail to match */
3651 
3652     case OP_ASSERT_NOT:
3653     case OP_ASSERTBACK_NOT:
3654     do
3655       {
3656       if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
3657         return FALSE;
3658       ecode += (ecode[1] << 8) + ecode[2];
3659       }
3660     while (*ecode == OP_ALT);
3661 
3662     if ((flags & match_condassert) != 0) return TRUE;
3663 
3664     ecode += 3;
3665     continue;
3666 
3667     /* Move the subject pointer back. This occurs only at the start of
3668     each branch of a lookbehind assertion. If we are too close to the start to
3669     move back, this match function fails. When working with UTF-8 we move
3670     back a number of characters, not bytes. */
3671 
3672     case OP_REVERSE:
3673 #ifdef SUPPORT_UTF8
3674     c = (ecode[1] << 8) + ecode[2];
3675     for (i = 0; i < c; i++)
3676       {
3677       eptr--;
3678       BACKCHAR(eptr)
3679       }
3680 #else
3681     eptr -= (ecode[1] << 8) + ecode[2];
3682 #endif
3683 
3684     if (eptr < md->start_subject) return FALSE;
3685     ecode += 3;
3686     break;
3687 
3688     /* Recursion matches the current regex, nested. If there are any capturing
3689     brackets started but not finished, we have to save their starting points
3690     and reinstate them after the recursion. However, we don't know how many
3691     such there are (offset_top records the completed total) so we just have
3692     to save all the potential data. There may be up to 99 such values, which
3693     is a bit large to put on the stack, but using malloc for small numbers
3694     seems expensive. As a compromise, the stack is used when there are fewer
3695     than 16 values to store; otherwise malloc is used. A problem is what to do
3696     if the malloc fails ... there is no way of returning to the top level with
3697     an error. Save the top 15 values on the stack, and accept that the rest
3698     may be wrong. */
3699 
3700     case OP_RECURSE:
3701       {
3702       BOOL rc;
3703       int *save;
3704       int stacksave[15];
3705 
3706       c = md->offset_max;
3707 
3708       if (c < 16) save = stacksave; else
3709         {
3710         save = (int *)(pcre_malloc)((c+1) * sizeof(int));
3711         if (save == NULL)
3712           {
3713           save = stacksave;
3714           c = 15;
3715           }
3716         }
3717 
3718       for (i = 1; i <= c; i++)
3719         save[i] = md->offset_vector[md->offset_end - i];
3720       rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
3721         match_isgroup);
3722       for (i = 1; i <= c; i++)
3723         md->offset_vector[md->offset_end - i] = save[i];
3724       if (save != stacksave) (pcre_free)(save);
3725       if (!rc) return FALSE;
3726 
3727       /* In case the recursion has set more capturing values, save the final
3728       number, then move along the subject till after the recursive match,
3729       and advance one byte in the pattern code. */
3730 
3731       offset_top = md->end_offset_top;
3732       eptr = md->end_match_ptr;
3733       ecode++;
3734       }
3735     break;
3736 
3737     /* "Once" brackets are like assertion brackets except that after a match,
3738     the point in the subject string is not moved back. Thus there can never be
3739     a move back into the brackets. Check the alternative branches in turn - the
3740     matching won't pass the KET for this kind of subpattern. If any one branch
3741     matches, we carry on as at the end of a normal bracket, leaving the subject
3742     pointer. */
3743 
3744     case OP_ONCE:
3745       {
3746       const uschar *prev = ecode;
3747       const uschar *saved_eptr = eptr;
3748 
3749       do
3750         {
3751         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
3752           break;
3753         ecode += (ecode[1] << 8) + ecode[2];
3754         }
3755       while (*ecode == OP_ALT);
3756 
3757       /* If hit the end of the group (which could be repeated), fail */
3758 
3759       if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
3760 
3761       /* Continue as from after the assertion, updating the offsets high water
3762       mark, since extracts may have been taken. */
3763 
3764       do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3765 
3766       offset_top = md->end_offset_top;
3767       eptr = md->end_match_ptr;
3768 
3769       /* For a non-repeating ket, just continue at this level. This also
3770       happens for a repeating ket if no characters were matched in the group.
3771       This is the forcible breaking of infinite loops as implemented in Perl
3772       5.005. If there is an options reset, it will get obeyed in the normal
3773       course of events. */
3774 
3775       if (*ecode == OP_KET || eptr == saved_eptr)
3776         {
3777         ecode += 3;
3778         break;
3779         }
3780 
3781       /* The repeating kets try the rest of the pattern or restart from the
3782       preceding bracket, in the appropriate order. We need to reset any options
3783       that changed within the bracket before re-running it, so check the next
3784       opcode. */
3785 
3786       if (ecode[3] == OP_OPT)
3787         {
3788         ims = (ims & ~PCRE_IMS) | ecode[4];
3789         DPRINTF(("ims set to %02lx at group repeat\n", ims));
3790         }
3791 
3792       if (*ecode == OP_KETRMIN)
3793         {
3794         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3795             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3796               return TRUE;
3797         }
3798       else  /* OP_KETRMAX */
3799         {
3800         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3801             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3802         }
3803       }
3804     return FALSE;
3805 
3806     /* An alternation is the end of a branch; scan along to find the end of the
3807     bracketed group and go to there. */
3808 
3809     case OP_ALT:
3810     do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
3811     break;
3812 
3813     /* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
3814     that it may occur zero times. It may repeat infinitely, or not at all -
3815     i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
3816     repeat limits are compiled as a number of copies, with the optional ones
3817     preceded by BRAZERO or BRAMINZERO. */
3818 
3819     case OP_BRAZERO:
3820       {
3821       const uschar *next = ecode+1;
3822       if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
3823         return TRUE;
3824       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3825       ecode = next + 3;
3826       }
3827     break;
3828 
3829     case OP_BRAMINZERO:
3830       {
3831       const uschar *next = ecode+1;
3832       do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
3833       if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
3834         return TRUE;
3835       ecode++;
3836       }
3837     break;
3838 
3839     /* End of a group, repeated or non-repeating. If we are at the end of
3840     an assertion "group", stop matching and return TRUE, but record the
3841     current high water mark for use by positive assertions. Do this also
3842     for the "once" (non-backing-up) groups. */
3843 
3844     case OP_KET:
3845     case OP_KETRMIN:
3846     case OP_KETRMAX:
3847       {
3848       const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
3849       const uschar *saved_eptr = eptrb->saved_eptr;
3850 
3851       eptrb = eptrb->prev;    /* Back up the stack of bracket start pointers */
3852 
3853       if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
3854           *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
3855           *prev == OP_ONCE)
3856         {
3857         md->end_match_ptr = eptr;      /* For ONCE */
3858         md->end_offset_top = offset_top;
3859         return TRUE;
3860         }
3861 
3862       /* In all other cases except a conditional group we have to check the
3863       group number back at the start and if necessary complete handling an
3864       extraction by setting the offsets and bumping the high water mark. */
3865 
3866       if (*prev != OP_COND)
3867         {
3868         int offset;
3869         int number = *prev - OP_BRA;
3870 
3871         /* For extended extraction brackets (large number), we have to fish out
3872         the number from a dummy opcode at the start. */
3873 
3874         if (number > EXTRACT_BASIC_MAX) number = (prev[4] << 8) | prev[5];
3875         offset = number << 1;
3876 
3877 #ifdef DEBUG
3878         printf("end bracket %d", number);
3879         printf("\n");
3880 #endif
3881 
3882         if (number > 0)
3883           {
3884           if (offset >= md->offset_max) md->offset_overflow = TRUE; else
3885             {
3886             md->offset_vector[offset] =
3887               md->offset_vector[md->offset_end - number];
3888             md->offset_vector[offset+1] = eptr - md->start_subject;
3889             if (offset_top <= offset) offset_top = offset + 2;
3890             }
3891           }
3892         }
3893 
3894       /* Reset the value of the ims flags, in case they got changed during
3895       the group. */
3896 
3897       ims = original_ims;
3898       DPRINTF(("ims reset to %02lx\n", ims));
3899 
3900       /* For a non-repeating ket, just continue at this level. This also
3901       happens for a repeating ket if no characters were matched in the group.
3902       This is the forcible breaking of infinite loops as implemented in Perl
3903       5.005. If there is an options reset, it will get obeyed in the normal
3904       course of events. */
3905 
3906       if (*ecode == OP_KET || eptr == saved_eptr)
3907         {
3908         ecode += 3;
3909         break;
3910         }
3911 
3912       /* The repeating kets try the rest of the pattern or restart from the
3913       preceding bracket, in the appropriate order. */
3914 
3915       if (*ecode == OP_KETRMIN)
3916         {
3917         if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
3918             match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
3919               return TRUE;
3920         }
3921       else  /* OP_KETRMAX */
3922         {
3923         if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
3924             match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
3925         }
3926       }
3927     return FALSE;
3928 
3929     /* Start of subject unless notbol, or after internal newline if multiline */
3930 
3931     case OP_CIRC:
3932     if (md->notbol && eptr == md->start_subject) return FALSE;
3933     if ((ims & PCRE_MULTILINE) != 0)
3934       {
3935       if (eptr != md->start_subject && eptr[-1] != NEWLINE) return FALSE;
3936       ecode++;
3937       break;
3938       }
3939     /* ... else fall through */
3940 
3941     /* Start of subject assertion */
3942 
3943     case OP_SOD:
3944     if (eptr != md->start_subject) return FALSE;
3945     ecode++;
3946     break;
3947 
3948     /* Assert before internal newline if multiline, or before a terminating
3949     newline unless endonly is set, else end of subject unless noteol is set. */
3950 
3951     case OP_DOLL:
3952     if ((ims & PCRE_MULTILINE) != 0)
3953       {
3954       if (eptr < md->end_subject) { if (*eptr != NEWLINE) return FALSE; }
3955         else { if (md->noteol) return FALSE; }
3956       ecode++;
3957       break;
3958       }
3959     else
3960       {
3961       if (md->noteol) return FALSE;
3962       if (!md->endonly)
3963         {
3964         if (eptr < md->end_subject - 1 ||
3965            (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3966 
3967         ecode++;
3968         break;
3969         }
3970       }
3971     /* ... else fall through */
3972 
3973     /* End of subject assertion (\z) */
3974 
3975     case OP_EOD:
3976     if (eptr < md->end_subject) return FALSE;
3977     ecode++;
3978     break;
3979 
3980     /* End of subject or ending \n assertion (\Z) */
3981 
3982     case OP_EODN:
3983     if (eptr < md->end_subject - 1 ||
3984        (eptr == md->end_subject - 1 && *eptr != NEWLINE)) return FALSE;
3985     ecode++;
3986     break;
3987 
3988     /* Word boundary assertions */
3989 
3990     case OP_NOT_WORD_BOUNDARY:
3991     case OP_WORD_BOUNDARY:
3992       {
3993       BOOL prev_is_word = (eptr != md->start_subject) &&
3994         ((md->ctypes[eptr[-1]] & ctype_word) != 0);
3995       BOOL cur_is_word = (eptr < md->end_subject) &&
3996         ((md->ctypes[*eptr] & ctype_word) != 0);
3997       if ((*ecode++ == OP_WORD_BOUNDARY)?
3998            cur_is_word == prev_is_word : cur_is_word != prev_is_word)
3999         return FALSE;
4000       }
4001     break;
4002 
4003     /* Match a single character type; inline for speed */
4004 
4005     case OP_ANY:
4006     if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == NEWLINE)
4007       return FALSE;
4008     if (eptr++ >= md->end_subject) return FALSE;
4009 #ifdef SUPPORT_UTF8
4010     if (md->utf8)
4011       while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4012 #endif
4013     ecode++;
4014     break;
4015 
4016     case OP_NOT_DIGIT:
4017     if (eptr >= md->end_subject ||
4018        (md->ctypes[*eptr++] & ctype_digit) != 0)
4019       return FALSE;
4020     ecode++;
4021     break;
4022 
4023     case OP_DIGIT:
4024     if (eptr >= md->end_subject ||
4025        (md->ctypes[*eptr++] & ctype_digit) == 0)
4026       return FALSE;
4027     ecode++;
4028     break;
4029 
4030     case OP_NOT_WHITESPACE:
4031     if (eptr >= md->end_subject ||
4032        (md->ctypes[*eptr++] & ctype_space) != 0)
4033       return FALSE;
4034     ecode++;
4035     break;
4036 
4037     case OP_WHITESPACE:
4038     if (eptr >= md->end_subject ||
4039        (md->ctypes[*eptr++] & ctype_space) == 0)
4040       return FALSE;
4041     ecode++;
4042     break;
4043 
4044     case OP_NOT_WORDCHAR:
4045     if (eptr >= md->end_subject ||
4046        (md->ctypes[*eptr++] & ctype_word) != 0)
4047       return FALSE;
4048     ecode++;
4049     break;
4050 
4051     case OP_WORDCHAR:
4052     if (eptr >= md->end_subject ||
4053        (md->ctypes[*eptr++] & ctype_word) == 0)
4054       return FALSE;
4055     ecode++;
4056     break;
4057 
4058     /* Match a back reference, possibly repeatedly. Look past the end of the
4059     item to see if there is repeat information following. The code is similar
4060     to that for character classes, but repeated for efficiency. Then obey
4061     similar code to character type repeats - written out again for speed.
4062     However, if the referenced string is the empty string, always treat
4063     it as matched, any number of times (otherwise there could be infinite
4064     loops). */
4065 
4066     case OP_REF:
4067       {
4068       int length;
4069       int offset = (ecode[1] << 9) | (ecode[2] << 1); /* Doubled ref number */
4070       ecode += 3;                                     /* Advance past item */
4071 
4072       /* If the reference is unset, set the length to be longer than the amount
4073       of subject left; this ensures that every attempt at a match fails. We
4074       can't just fail here, because of the possibility of quantifiers with zero
4075       minima. */
4076 
4077       length = (offset >= offset_top || md->offset_vector[offset] < 0)?
4078         md->end_subject - eptr + 1 :
4079         md->offset_vector[offset+1] - md->offset_vector[offset];
4080 
4081       /* Set up for repetition, or handle the non-repeated case */
4082 
4083       switch (*ecode)
4084         {
4085         case OP_CRSTAR:
4086         case OP_CRMINSTAR:
4087         case OP_CRPLUS:
4088         case OP_CRMINPLUS:
4089         case OP_CRQUERY:
4090         case OP_CRMINQUERY:
4091         c = *ecode++ - OP_CRSTAR;
4092         minimize = (c & 1) != 0;
4093         min = rep_min[c];                 /* Pick up values from tables; */
4094         max = rep_max[c];                 /* zero for max => infinity */
4095         if (max == 0) max = INT_MAX;
4096         break;
4097 
4098         case OP_CRRANGE:
4099         case OP_CRMINRANGE:
4100         minimize = (*ecode == OP_CRMINRANGE);
4101         min = (ecode[1] << 8) + ecode[2];
4102         max = (ecode[3] << 8) + ecode[4];
4103         if (max == 0) max = INT_MAX;
4104         ecode += 5;
4105         break;
4106 
4107         default:               /* No repeat follows */
4108         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4109         eptr += length;
4110         continue;              /* With the main loop */
4111         }
4112 
4113       /* If the length of the reference is zero, just continue with the
4114       main loop. */
4115 
4116       if (length == 0) continue;
4117 
4118       /* First, ensure the minimum number of matches are present. We get back
4119       the length of the reference string explicitly rather than passing the
4120       address of eptr, so that eptr can be a register variable. */
4121 
4122       for (i = 1; i <= min; i++)
4123         {
4124         if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
4125         eptr += length;
4126         }
4127 
4128       /* If min = max, continue at the same level without recursion.
4129       They are not both allowed to be zero. */
4130 
4131       if (min == max) continue;
4132 
4133       /* If minimizing, keep trying and advancing the pointer */
4134 
4135       if (minimize)
4136         {
4137         for (i = min;; i++)
4138           {
4139           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4140             return TRUE;
4141           if (i >= max || !match_ref(offset, eptr, length, md, ims))
4142             return FALSE;
4143           eptr += length;
4144           }
4145         /* Control never gets here */
4146         }
4147 
4148       /* If maximizing, find the longest string and work backwards */
4149 
4150       else
4151         {
4152         const uschar *pp = eptr;
4153         for (i = min; i < max; i++)
4154           {
4155           if (!match_ref(offset, eptr, length, md, ims)) break;
4156           eptr += length;
4157           }
4158         while (eptr >= pp)
4159           {
4160           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4161             return TRUE;
4162           eptr -= length;
4163           }
4164         return FALSE;
4165         }
4166       }
4167     /* Control never gets here */
4168 
4169 
4170 
4171     /* Match a character class, possibly repeatedly. Look past the end of the
4172     item to see if there is repeat information following. Then obey similar
4173     code to character type repeats - written out again for speed. */
4174 
4175     case OP_CLASS:
4176       {
4177       const uschar *data = ecode + 1;  /* Save for matching */
4178       ecode += 33;                     /* Advance past the item */
4179 
4180       switch (*ecode)
4181         {
4182         case OP_CRSTAR:
4183         case OP_CRMINSTAR:
4184         case OP_CRPLUS:
4185         case OP_CRMINPLUS:
4186         case OP_CRQUERY:
4187         case OP_CRMINQUERY:
4188         c = *ecode++ - OP_CRSTAR;
4189         minimize = (c & 1) != 0;
4190         min = rep_min[c];                 /* Pick up values from tables; */
4191         max = rep_max[c];                 /* zero for max => infinity */
4192         if (max == 0) max = INT_MAX;
4193         break;
4194 
4195         case OP_CRRANGE:
4196         case OP_CRMINRANGE:
4197         minimize = (*ecode == OP_CRMINRANGE);
4198         min = (ecode[1] << 8) + ecode[2];
4199         max = (ecode[3] << 8) + ecode[4];
4200         if (max == 0) max = INT_MAX;
4201         ecode += 5;
4202         break;
4203 
4204         default:               /* No repeat follows */
4205         min = max = 1;
4206         break;
4207         }
4208 
4209       /* First, ensure the minimum number of matches are present. */
4210 
4211       for (i = 1; i <= min; i++)
4212         {
4213         if (eptr >= md->end_subject) return FALSE;
4214         GETCHARINC(c, eptr)         /* Get character; increment eptr */
4215 
4216 #ifdef SUPPORT_UTF8
4217         /* We do not yet support class members > 255 */
4218         if (c > 255) return FALSE;
4219 #endif
4220 
4221         if ((data[c/8] & (1 << (c&7))) != 0) continue;
4222         return FALSE;
4223         }
4224 
4225       /* If max == min we can continue with the main loop without the
4226       need to recurse. */
4227 
4228       if (min == max) continue;
4229 
4230       /* If minimizing, keep testing the rest of the expression and advancing
4231       the pointer while it matches the class. */
4232 
4233       if (minimize)
4234         {
4235         for (i = min;; i++)
4236           {
4237           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4238             return TRUE;
4239           if (i >= max || eptr >= md->end_subject) return FALSE;
4240           GETCHARINC(c, eptr)       /* Get character; increment eptr */
4241 
4242 #ifdef SUPPORT_UTF8
4243           /* We do not yet support class members > 255 */
4244           if (c > 255) return FALSE;
4245 #endif
4246           if ((data[c/8] & (1 << (c&7))) != 0) continue;
4247           return FALSE;
4248           }
4249         /* Control never gets here */
4250         }
4251 
4252       /* If maximizing, find the longest possible run, then work backwards. */
4253 
4254       else
4255         {
4256         const uschar *pp = eptr;
4257         int len = 1;
4258         for (i = min; i < max; i++)
4259           {
4260           if (eptr >= md->end_subject) break;
4261           GETCHARLEN(c, eptr, len)  /* Get character, set length if UTF-8 */
4262 
4263 #ifdef SUPPORT_UTF8
4264           /* We do not yet support class members > 255 */
4265           if (c > 255) break;
4266 #endif
4267           if ((data[c/8] & (1 << (c&7))) == 0) break;
4268           eptr += len;
4269           }
4270 
4271         while (eptr >= pp)
4272           {
4273           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4274             return TRUE;
4275 
4276 #ifdef SUPPORT_UTF8
4277           BACKCHAR(eptr)
4278 #endif
4279           }
4280         return FALSE;
4281         }
4282       }
4283     /* Control never gets here */
4284 
4285     /* Match a run of characters */
4286 
4287     case OP_CHARS:
4288       {
4289       register int length = ecode[1];
4290       ecode += 2;
4291 
4292 #ifdef DEBUG    /* Sigh. Some compilers never learn. */
4293       if (eptr >= md->end_subject)
4294         printf("matching subject <null> against pattern ");
4295       else
4296         {
4297         printf("matching subject ");
4298         pchars(eptr, length, TRUE, md);
4299         printf(" against pattern ");
4300         }
4301       pchars(ecode, length, FALSE, md);
4302       printf("\n");
4303 #endif
4304 
4305       if (length > md->end_subject - eptr) return FALSE;
4306       if ((ims & PCRE_CASELESS) != 0)
4307         {
4308         while (length-- > 0)
4309           if (md->lcc[*ecode++] != md->lcc[*eptr++])
4310             return FALSE;
4311         }
4312       else
4313         {
4314         while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
4315         }
4316       }
4317     break;
4318 
4319     /* Match a single character repeatedly; different opcodes share code. */
4320 
4321     case OP_EXACT:
4322     min = max = (ecode[1] << 8) + ecode[2];
4323     ecode += 3;
4324     goto REPEATCHAR;
4325 
4326     case OP_UPTO:
4327     case OP_MINUPTO:
4328     min = 0;
4329     max = (ecode[1] << 8) + ecode[2];
4330     minimize = *ecode == OP_MINUPTO;
4331     ecode += 3;
4332     goto REPEATCHAR;
4333 
4334     case OP_STAR:
4335     case OP_MINSTAR:
4336     case OP_PLUS:
4337     case OP_MINPLUS:
4338     case OP_QUERY:
4339     case OP_MINQUERY:
4340     c = *ecode++ - OP_STAR;
4341     minimize = (c & 1) != 0;
4342     min = rep_min[c];                 /* Pick up values from tables; */
4343     max = rep_max[c];                 /* zero for max => infinity */
4344     if (max == 0) max = INT_MAX;
4345 
4346     /* Common code for all repeated single-character matches. We can give
4347     up quickly if there are fewer than the minimum number of characters left in
4348     the subject. */
4349 
4350     REPEATCHAR:
4351     if (min > md->end_subject - eptr) return FALSE;
4352     c = *ecode++;
4353 
4354     /* The code is duplicated for the caseless and caseful cases, for speed,
4355     since matching characters is likely to be quite common. First, ensure the
4356     minimum number of matches are present. If min = max, continue at the same
4357     level without recursing. Otherwise, if minimizing, keep trying the rest of
4358     the expression and advancing one matching character if failing, up to the
4359     maximum. Alternatively, if maximizing, find the maximum number of
4360     characters and work backwards. */
4361 
4362     DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
4363       max, eptr));
4364 
4365     if ((ims & PCRE_CASELESS) != 0)
4366       {
4367       c = md->lcc[c];
4368       for (i = 1; i <= min; i++)
4369         if (c != md->lcc[*eptr++]) return FALSE;
4370       if (min == max) continue;
4371       if (minimize)
4372         {
4373         for (i = min;; i++)
4374           {
4375           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4376             return TRUE;
4377           if (i >= max || eptr >= md->end_subject ||
4378               c != md->lcc[*eptr++])
4379             return FALSE;
4380           }
4381         /* Control never gets here */
4382         }
4383       else
4384         {
4385         const uschar *pp = eptr;
4386         for (i = min; i < max; i++)
4387           {
4388           if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
4389           eptr++;
4390           }
4391         while (eptr >= pp)
4392           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4393             return TRUE;
4394         return FALSE;
4395         }
4396       /* Control never gets here */
4397       }
4398 
4399     /* Caseful comparisons */
4400 
4401     else
4402       {
4403       for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
4404       if (min == max) continue;
4405       if (minimize)
4406         {
4407         for (i = min;; i++)
4408           {
4409           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4410             return TRUE;
4411           if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
4412           }
4413         /* Control never gets here */
4414         }
4415       else
4416         {
4417         const uschar *pp = eptr;
4418         for (i = min; i < max; i++)
4419           {
4420           if (eptr >= md->end_subject || c != *eptr) break;
4421           eptr++;
4422           }
4423         while (eptr >= pp)
4424          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4425            return TRUE;
4426         return FALSE;
4427         }
4428       }
4429     /* Control never gets here */
4430 
4431     /* Match a negated single character */
4432 
4433     case OP_NOT:
4434     if (eptr >= md->end_subject) return FALSE;
4435     ecode++;
4436     if ((ims & PCRE_CASELESS) != 0)
4437       {
4438       if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
4439       }
4440     else
4441       {
4442       if (*ecode++ == *eptr++) return FALSE;
4443       }
4444     break;
4445 
4446     /* Match a negated single character repeatedly. This is almost a repeat of
4447     the code for a repeated single character, but I haven't found a nice way of
4448     commoning these up that doesn't require a test of the positive/negative
4449     option for each character match. Maybe that wouldn't add very much to the
4450     time taken, but character matching *is* what this is all about... */
4451 
4452     case OP_NOTEXACT:
4453     min = max = (ecode[1] << 8) + ecode[2];
4454     ecode += 3;
4455     goto REPEATNOTCHAR;
4456 
4457     case OP_NOTUPTO:
4458     case OP_NOTMINUPTO:
4459     min = 0;
4460     max = (ecode[1] << 8) + ecode[2];
4461     minimize = *ecode == OP_NOTMINUPTO;
4462     ecode += 3;
4463     goto REPEATNOTCHAR;
4464 
4465     case OP_NOTSTAR:
4466     case OP_NOTMINSTAR:
4467     case OP_NOTPLUS:
4468     case OP_NOTMINPLUS:
4469     case OP_NOTQUERY:
4470     case OP_NOTMINQUERY:
4471     c = *ecode++ - OP_NOTSTAR;
4472     minimize = (c & 1) != 0;
4473     min = rep_min[c];                 /* Pick up values from tables; */
4474     max = rep_max[c];                 /* zero for max => infinity */
4475     if (max == 0) max = INT_MAX;
4476 
4477     /* Common code for all repeated negated single-character matches. We can
4478     give up quickly if there are fewer than the minimum number of characters
4479     left in the subject. */
4480 
4481     REPEATNOTCHAR:
4482     if (min > md->end_subject - eptr) return FALSE;
4483     c = *ecode++;
4484 
4485     /* The code is duplicated for the caseless and caseful cases, for speed,
4486     since matching characters is likely to be quite common. First, ensure the
4487     minimum number of matches are present. If min = max, continue at the same
4488     level without recursing. Otherwise, if minimizing, keep trying the rest of
4489     the expression and advancing one matching character if failing, up to the
4490     maximum. Alternatively, if maximizing, find the maximum number of
4491     characters and work backwards. */
4492 
4493     DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
4494       max, eptr));
4495 
4496     if ((ims & PCRE_CASELESS) != 0)
4497       {
4498       c = md->lcc[c];
4499       for (i = 1; i <= min; i++)
4500         if (c == md->lcc[*eptr++]) return FALSE;
4501       if (min == max) continue;
4502       if (minimize)
4503         {
4504         for (i = min;; i++)
4505           {
4506           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4507             return TRUE;
4508           if (i >= max || eptr >= md->end_subject ||
4509               c == md->lcc[*eptr++])
4510             return FALSE;
4511           }
4512         /* Control never gets here */
4513         }
4514       else
4515         {
4516         const uschar *pp = eptr;
4517         for (i = min; i < max; i++)
4518           {
4519           if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
4520           eptr++;
4521           }
4522         while (eptr >= pp)
4523           if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4524             return TRUE;
4525         return FALSE;
4526         }
4527       /* Control never gets here */
4528       }
4529 
4530     /* Caseful comparisons */
4531 
4532     else
4533       {
4534       for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
4535       if (min == max) continue;
4536       if (minimize)
4537         {
4538         for (i = min;; i++)
4539           {
4540           if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
4541             return TRUE;
4542           if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
4543           }
4544         /* Control never gets here */
4545         }
4546       else
4547         {
4548         const uschar *pp = eptr;
4549         for (i = min; i < max; i++)
4550           {
4551           if (eptr >= md->end_subject || c == *eptr) break;
4552           eptr++;
4553           }
4554         while (eptr >= pp)
4555          if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4556            return TRUE;
4557         return FALSE;
4558         }
4559       }
4560     /* Control never gets here */
4561 
4562     /* Match a single character type repeatedly; several different opcodes
4563     share code. This is very similar to the code for single characters, but we
4564     repeat it in the interests of efficiency. */
4565 
4566     case OP_TYPEEXACT:
4567     min = max = (ecode[1] << 8) + ecode[2];
4568     minimize = TRUE;
4569     ecode += 3;
4570     goto REPEATTYPE;
4571 
4572     case OP_TYPEUPTO:
4573     case OP_TYPEMINUPTO:
4574     min = 0;
4575     max = (ecode[1] << 8) + ecode[2];
4576     minimize = *ecode == OP_TYPEMINUPTO;
4577     ecode += 3;
4578     goto REPEATTYPE;
4579 
4580     case OP_TYPESTAR:
4581     case OP_TYPEMINSTAR:
4582     case OP_TYPEPLUS:
4583     case OP_TYPEMINPLUS:
4584     case OP_TYPEQUERY:
4585     case OP_TYPEMINQUERY:
4586     c = *ecode++ - OP_TYPESTAR;
4587     minimize = (c & 1) != 0;
4588     min = rep_min[c];                 /* Pick up values from tables; */
4589     max = rep_max[c];                 /* zero for max => infinity */
4590     if (max == 0) max = INT_MAX;
4591 
4592     /* Common code for all repeated single character type matches */
4593 
4594     REPEATTYPE:
4595     ctype = *ecode++;      /* Code for the character type */
4596 
4597     /* First, ensure the minimum number of matches are present. Use inline
4598     code for maximizing the speed, and do the type test once at the start
4599     (i.e. keep it out of the loop). Also we can test that there are at least
4600     the minimum number of bytes before we start, except when doing '.' in
4601     UTF8 mode. Leave the test in in all cases; in the special case we have
4602     to test after each character. */
4603 
4604     if (min > md->end_subject - eptr) return FALSE;
4605     if (min > 0) switch(ctype)
4606       {
4607       case OP_ANY:
4608 #ifdef SUPPORT_UTF8
4609       if (md->utf8)
4610         {
4611         for (i = 1; i <= min; i++)
4612           {
4613           if (eptr >= md->end_subject ||
4614              (*eptr++ == NEWLINE && (ims & PCRE_DOTALL) == 0))
4615             return FALSE;
4616           while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4617           }
4618         break;
4619         }
4620 #endif
4621       /* Non-UTF8 can be faster */
4622       if ((ims & PCRE_DOTALL) == 0)
4623         { for (i = 1; i <= min; i++) if (*eptr++ == NEWLINE) return FALSE; }
4624       else eptr += min;
4625       break;
4626 
4627       case OP_NOT_DIGIT:
4628       for (i = 1; i <= min; i++)
4629         if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
4630       break;
4631 
4632       case OP_DIGIT:
4633       for (i = 1; i <= min; i++)
4634         if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
4635       break;
4636 
4637       case OP_NOT_WHITESPACE:
4638       for (i = 1; i <= min; i++)
4639         if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
4640       break;
4641 
4642       case OP_WHITESPACE:
4643       for (i = 1; i <= min; i++)
4644         if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
4645       break;
4646 
4647       case OP_NOT_WORDCHAR:
4648       for (i = 1; i <= min; i++)
4649         if ((md->ctypes[*eptr++] & ctype_word) != 0)
4650           return FALSE;
4651       break;
4652 
4653       case OP_WORDCHAR:
4654       for (i = 1; i <= min; i++)
4655         if ((md->ctypes[*eptr++] & ctype_word) == 0)
4656           return FALSE;
4657       break;
4658       }
4659 
4660     /* If min = max, continue at the same level without recursing */
4661 
4662     if (min == max) continue;
4663 
4664     /* If minimizing, we have to test the rest of the pattern before each
4665     subsequent match. */
4666 
4667     if (minimize)
4668       {
4669       for (i = min;; i++)
4670         {
4671         if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
4672         if (i >= max || eptr >= md->end_subject) return FALSE;
4673 
4674         c = *eptr++;
4675         switch(ctype)
4676           {
4677           case OP_ANY:
4678           if ((ims & PCRE_DOTALL) == 0 && c == NEWLINE) return FALSE;
4679 #ifdef SUPPORT_UTF8
4680           if (md->utf8)
4681             while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4682 #endif
4683           break;
4684 
4685           case OP_NOT_DIGIT:
4686           if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
4687           break;
4688 
4689           case OP_DIGIT:
4690           if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
4691           break;
4692 
4693           case OP_NOT_WHITESPACE:
4694           if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
4695           break;
4696 
4697           case OP_WHITESPACE:
4698           if  ((md->ctypes[c] & ctype_space) == 0) return FALSE;
4699           break;
4700 
4701           case OP_NOT_WORDCHAR:
4702           if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
4703           break;
4704 
4705           case OP_WORDCHAR:
4706           if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
4707           break;
4708           }
4709         }
4710       /* Control never gets here */
4711       }
4712 
4713     /* If maximizing it is worth using inline code for speed, doing the type
4714     test once at the start (i.e. keep it out of the loop). */
4715 
4716     else
4717       {
4718       const uschar *pp = eptr;
4719       switch(ctype)
4720         {
4721         case OP_ANY:
4722 
4723         /* Special code is required for UTF8, but when the maximum is unlimited
4724         we don't need it. */
4725 
4726 #ifdef SUPPORT_UTF8
4727         if (md->utf8 && max < INT_MAX)
4728           {
4729           if ((ims & PCRE_DOTALL) == 0)
4730             {
4731             for (i = min; i < max; i++)
4732               {
4733               if (eptr >= md->end_subject || *eptr++ == NEWLINE) break;
4734               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4735               }
4736             }
4737           else
4738             {
4739             for (i = min; i < max; i++)
4740               {
4741               eptr++;
4742               while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
4743               }
4744             }
4745           break;
4746           }
4747 #endif
4748         /* Non-UTF8 can be faster */
4749         if ((ims & PCRE_DOTALL) == 0)
4750           {
4751           for (i = min; i < max; i++)
4752             {
4753             if (eptr >= md->end_subject || *eptr == NEWLINE) break;
4754             eptr++;
4755             }
4756           }
4757         else
4758           {
4759           c = max - min;
4760           if (c > md->end_subject - eptr) c = md->end_subject - eptr;
4761           eptr += c;
4762           }
4763         break;
4764 
4765         case OP_NOT_DIGIT:
4766         for (i = min; i < max; i++)
4767           {
4768           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4769             break;
4770           eptr++;
4771           }
4772         break;
4773 
4774         case OP_DIGIT:
4775         for (i = min; i < max; i++)
4776           {
4777           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4778             break;
4779           eptr++;
4780           }
4781         break;
4782 
4783         case OP_NOT_WHITESPACE:
4784         for (i = min; i < max; i++)
4785           {
4786           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4787             break;
4788           eptr++;
4789           }
4790         break;
4791 
4792         case OP_WHITESPACE:
4793         for (i = min; i < max; i++)
4794           {
4795           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4796             break;
4797           eptr++;
4798           }
4799         break;
4800 
4801         case OP_NOT_WORDCHAR:
4802         for (i = min; i < max; i++)
4803           {
4804           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4805             break;
4806           eptr++;
4807           }
4808         break;
4809 
4810         case OP_WORDCHAR:
4811         for (i = min; i < max; i++)
4812           {
4813           if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4814             break;
4815           eptr++;
4816           }
4817         break;
4818         }
4819 
4820       while (eptr >= pp)
4821         {
4822         if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
4823           return TRUE;
4824 #ifdef SUPPORT_UTF8
4825         if (md->utf8)
4826           while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
4827 #endif
4828         }
4829       return FALSE;
4830       }
4831     /* Control never gets here */
4832 
4833     /* There's been some horrible disaster. */
4834 
4835     default:
4836     DPRINTF(("Unknown opcode %d\n", *ecode));
4837     md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
4838     return FALSE;
4839     }
4840 
4841   /* Do not stick any code in here without much thought; it is assumed
4842   that "continue" in the code above comes out to here to repeat the main
4843   loop. */
4844 
4845   }             /* End of main loop */
4846 /* Control never reaches here */
4847 }
4848 
4849 
4850 
4851 
4852 /*************************************************
4853 *         Execute a Regular Expression           *
4854 *************************************************/
4855 
4856 /* This function applies a compiled re to a subject string and picks out
4857 portions of the string if it matches. Two elements in the vector are set for
4858 each substring: the offsets to the start and end of the substring.
4859 
4860 Arguments:
4861   external_re     points to the compiled expression
4862   external_extra  points to "hints" from pcre_study() or is NULL
4863   subject         points to the subject string
4864   length          length of subject string (may contain binary zeros)
4865   start_offset    where to start in the subject string
4866   options         option bits
4867   offsets         points to a vector of ints to be filled in with offsets
4868   offsetcount     the number of elements in the vector
4869 
4870 Returns:          > 0 => success; value is the number of elements filled in
4871                   = 0 => success, but offsets is not big enough
4872                    -1 => failed to match
4873                  < -1 => some kind of unexpected problem
4874 */
4875 
4876 int
4877 pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
4878   const char *subject, int length, int start_offset, int options, int *offsets,
4879   int offsetcount)
4880 {
4881 int resetcount, ocount;
4882 int first_char = -1;
4883 int req_char = -1;
4884 int req_char2 = -1;
4885 unsigned long int ims = 0;
4886 match_data match_block;
4887 const uschar *start_bits = NULL;
4888 const uschar *start_match = (const uschar *)subject + start_offset;
4889 const uschar *end_subject;
4890 const uschar *req_char_ptr = start_match - 1;
4891 const real_pcre *re = (const real_pcre *)external_re;
4892 const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
4893 BOOL using_temporary_offsets = FALSE;
4894 BOOL anchored;
4895 BOOL startline;
4896 
4897 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4898 
4899 if (re == NULL || subject == NULL ||
4900    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4901 if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
4902 
4903 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4904 startline = (re->options & PCRE_STARTLINE) != 0;
4905 
4906 match_block.start_pattern = re->code;
4907 match_block.start_subject = (const uschar *)subject;
4908 match_block.end_subject = match_block.start_subject + length;
4909 end_subject = match_block.end_subject;
4910 
4911 match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4912 match_block.utf8 = (re->options & PCRE_UTF8) != 0;
4913 
4914 match_block.notbol = (options & PCRE_NOTBOL) != 0;
4915 match_block.noteol = (options & PCRE_NOTEOL) != 0;
4916 match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
4917 
4918 match_block.errorcode = PCRE_ERROR_NOMATCH;     /* Default error */
4919 
4920 match_block.lcc = re->tables + lcc_offset;
4921 match_block.ctypes = re->tables + ctypes_offset;
4922 
4923 /* The ims options can vary during the matching as a result of the presence
4924 of (?ims) items in the pattern. They are kept in a local variable so that
4925 restoring at the exit of a group is easy. */
4926 
4927 ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
4928 
4929 /* If the expression has got more back references than the offsets supplied can
4930 hold, we get a temporary bit of working store to use during the matching.
4931 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4932 of 3. */
4933 
4934 ocount = offsetcount - (offsetcount % 3);
4935 
4936 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4937   {
4938   ocount = re->top_backref * 3 + 3;
4939   match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4940   if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4941   using_temporary_offsets = TRUE;
4942   DPRINTF(("Got memory to hold back references\n"));
4943   }
4944 else match_block.offset_vector = offsets;
4945 
4946 match_block.offset_end = ocount;
4947 match_block.offset_max = (2*ocount)/3;
4948 match_block.offset_overflow = FALSE;
4949 
4950 /* Compute the minimum number of offsets that we need to reset each time. Doing
4951 this makes a huge difference to execution time when there aren't many brackets
4952 in the pattern. */
4953 
4954 resetcount = 2 + re->top_bracket * 2;
4955 if (resetcount > offsetcount) resetcount = ocount;
4956 
4957 /* Reset the working variable associated with each extraction. These should
4958 never be used unless previously set, but they get saved and restored, and so we
4959 initialize them to avoid reading uninitialized locations. */
4960 
4961 if (match_block.offset_vector != NULL)
4962   {
4963   register int *iptr = match_block.offset_vector + ocount;
4964   register int *iend = iptr - resetcount/2 + 1;
4965   while (--iptr >= iend) *iptr = -1;
4966   }
4967 
4968 /* Set up the first character to match, if available. The first_char value is
4969 never set for an anchored regular expression, but the anchoring may be forced
4970 at run time, so we have to test for anchoring. The first char may be unset for
4971 an unanchored pattern, of course. If there's no first char and the pattern was
4972 studied, there may be a bitmap of possible first characters. */
4973 
4974 if (!anchored)
4975   {
4976   if ((re->options & PCRE_FIRSTSET) != 0)
4977     {
4978     first_char = re->first_char;
4979     if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
4980     }
4981   else
4982     if (!startline && extra != NULL &&
4983       (extra->options & PCRE_STUDY_MAPPED) != 0)
4984         start_bits = extra->start_bits;
4985   }
4986 
4987 /* For anchored or unanchored matches, there may be a "last known required
4988 character" set. If the PCRE_CASELESS is set, implying that the match starts
4989 caselessly, or if there are any changes of this flag within the regex, set up
4990 both cases of the character. Otherwise set the two values the same, which will
4991 avoid duplicate testing (which takes significant time). This covers the vast
4992 majority of cases. It will be suboptimal when the case flag changes in a regex
4993 and the required character in fact is caseful. */
4994 
4995 if ((re->options & PCRE_REQCHSET) != 0)
4996   {
4997   req_char = re->req_char;
4998   req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
4999     (re->tables + fcc_offset)[req_char] : req_char;
5000   }
5001 
5002 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
5003 the loop runs just once. */
5004 
5005 do
5006   {
5007   int rc;
5008   register int *iptr = match_block.offset_vector;
5009   register int *iend = iptr + resetcount;
5010 
5011   /* Reset the maximum number of extractions we might see. */
5012 
5013   while (iptr < iend) *iptr++ = -1;
5014 
5015   /* Advance to a unique first char if possible */
5016 
5017   if (first_char >= 0)
5018     {
5019     if ((ims & PCRE_CASELESS) != 0)
5020       while (start_match < end_subject &&
5021              match_block.lcc[*start_match] != first_char)
5022         start_match++;
5023     else
5024       while (start_match < end_subject && *start_match != first_char)
5025         start_match++;
5026     }
5027 
5028   /* Or to just after \n for a multiline match if possible */
5029 
5030   else if (startline)
5031     {
5032     if (start_match > match_block.start_subject + start_offset)
5033       {
5034       while (start_match < end_subject && start_match[-1] != NEWLINE)
5035         start_match++;
5036       }
5037     }
5038 
5039   /* Or to a non-unique first char after study */
5040 
5041   else if (start_bits != NULL)
5042     {
5043     while (start_match < end_subject)
5044       {
5045       register int c = *start_match;
5046       if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
5047       }
5048     }
5049 
5050 #ifdef DEBUG  /* Sigh. Some compilers never learn. */
5051   printf(">>>> Match against: ");
5052   pchars(start_match, end_subject - start_match, TRUE, &match_block);
5053   printf("\n");
5054 #endif
5055 
5056   /* If req_char is set, we know that that character must appear in the subject
5057   for the match to succeed. If the first character is set, req_char must be
5058   later in the subject; otherwise the test starts at the match point. This
5059   optimization can save a huge amount of backtracking in patterns with nested
5060   unlimited repeats that aren't going to match. We don't know what the state of
5061   case matching may be when this character is hit, so test for it in both its
5062   cases if necessary. However, the different cased versions will not be set up
5063   unless PCRE_CASELESS was given or the casing state changes within the regex.
5064   Writing separate code makes it go faster, as does using an autoincrement and
5065   backing off on a match. */
5066 
5067   if (req_char >= 0)
5068     {
5069     register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
5070 
5071     /* We don't need to repeat the search if we haven't yet reached the
5072     place we found it at last time. */
5073 
5074     if (p > req_char_ptr)
5075       {
5076       /* Do a single test if no case difference is set up */
5077 
5078       if (req_char == req_char2)
5079         {
5080         while (p < end_subject)
5081           {
5082           if (*p++ == req_char) { p--; break; }
5083           }
5084         }
5085 
5086       /* Otherwise test for either case */
5087 
5088       else
5089         {
5090         while (p < end_subject)
5091           {
5092           register int pp = *p++;
5093           if (pp == req_char || pp == req_char2) { p--; break; }
5094           }
5095         }
5096 
5097       /* If we can't find the required character, break the matching loop */
5098 
5099       if (p >= end_subject) break;
5100 
5101       /* If we have found the required character, save the point where we
5102       found it, so that we don't search again next time round the loop if
5103       the start hasn't passed this character yet. */
5104 
5105       req_char_ptr = p;
5106       }
5107     }
5108 
5109   /* When a match occurs, substrings will be set for all internal extractions;
5110   we just need to set up the whole thing as substring 0 before returning. If
5111   there were too many extractions, set the return code to zero. In the case
5112   where we had to get some local store to hold offsets for backreferences, copy
5113   those back references that we can. In this case there need not be overflow
5114   if certain parts of the pattern were not used. */
5115 
5116   match_block.start_match = start_match;
5117   if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
5118     continue;
5119 
5120   /* Copy the offset information from temporary store if necessary */
5121 
5122   if (using_temporary_offsets)
5123     {
5124     if (offsetcount >= 4)
5125       {
5126       memcpy(offsets + 2, match_block.offset_vector + 2,
5127         (offsetcount - 2) * sizeof(int));
5128       DPRINTF(("Copied offsets from temporary memory\n"));
5129       }
5130     if (match_block.end_offset_top > offsetcount)
5131       match_block.offset_overflow = TRUE;
5132 
5133     DPRINTF(("Freeing temporary memory\n"));
5134     (pcre_free)(match_block.offset_vector);
5135     }
5136 
5137   rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
5138 
5139   if (offsetcount < 2) rc = 0; else
5140     {
5141     offsets[0] = start_match - match_block.start_subject;
5142     offsets[1] = match_block.end_match_ptr - match_block.start_subject;
5143     }
5144 
5145   DPRINTF((">>>> returning %d\n", rc));
5146   return rc;
5147   }
5148 
5149 /* This "while" is the end of the "do" above */
5150 
5151 while (!anchored &&
5152        match_block.errorcode == PCRE_ERROR_NOMATCH &&
5153        start_match++ < end_subject);
5154 
5155 if (using_temporary_offsets)
5156   {
5157   DPRINTF(("Freeing temporary memory\n"));
5158   (pcre_free)(match_block.offset_vector);
5159   }
5160 
5161 DPRINTF((">>>> returning %d\n", match_block.errorcode));
5162 
5163 return match_block.errorcode;
5164 }
5165 
5166 /* End of pcre.c */
5167 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.