NCBI C++ Toolkit Cross Reference

C++/src/util/regexp/pcre_compile.c


  1 /*************************************************
  2 *      Perl-Compatible Regular Expressions       *
  3 *************************************************/
  4 
  5 /* PCRE is a library of functions to support regular expressions whose syntax
  6 and semantics are as close as possible to those of the Perl 5 language.
  7 
  8                        Written by Philip Hazel
  9            Copyright (c) 1997-2009 University of Cambridge
 10 
 11 -----------------------------------------------------------------------------
 12 Redistribution and use in source and binary forms, with or without
 13 modification, are permitted provided that the following conditions are met:
 14 
 15     * Redistributions of source code must retain the above copyright notice,
 16       this list of conditions and the following disclaimer.
 17 
 18     * Redistributions in binary form must reproduce the above copyright
 19       notice, this list of conditions and the following disclaimer in the
 20       documentation and/or other materials provided with the distribution.
 21 
 22     * Neither the name of the University of Cambridge nor the names of its
 23       contributors may be used to endorse or promote products derived from
 24       this software without specific prior written permission.
 25 
 26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 36 POSSIBILITY OF SUCH DAMAGE.
 37 -----------------------------------------------------------------------------
 38 */
 39 
 40 
 41 /* This module contains the external function pcre_compile(), along with
 42 supporting internal functions that are not used by other modules. */
 43 
 44 
 45 #ifdef HAVE_CONFIG_H
 46 #include "config.h"
 47 #endif
 48 
 49 #define NLBLOCK cd             /* Block containing newline information */
 50 #define PSSTART start_pattern  /* Field containing processed string start */
 51 #define PSEND   end_pattern    /* Field containing processed string end */
 52 
 53 #include "pcre_internal.h"
 54 
 55 
 56 /* When DEBUG is defined, we need the pcre_printint() function, which is also
 57 used by pcretest. DEBUG is not defined when building a production library. */
 58 
 59 #ifdef DEBUG
 60 #include "pcre_printint.src"
 61 #endif
 62 
 63 
 64 /* Macro for setting individual bits in class bitmaps. */
 65 
 66 #define SETBIT(a,b) a[b/8] |= (1 << (b%8))
 67 
 68 /* Maximum length value to check against when making sure that the integer that
 69 holds the compiled pattern length does not overflow. We make it a bit less than
 70 INT_MAX to allow for adding in group terminating bytes, so that we don't have
 71 to check them every time. */
 72 
 73 #define OFLOW_MAX (INT_MAX - 20)
 74 
 75 
 76 /*************************************************
 77 *      Code parameters and static tables         *
 78 *************************************************/
 79 
 80 /* This value specifies the size of stack workspace that is used during the
 81 first pre-compile phase that determines how much memory is required. The regex
 82 is partly compiled into this space, but the compiled parts are discarded as
 83 soon as they can be, so that hopefully there will never be an overrun. The code
 84 does, however, check for an overrun. The largest amount I've seen used is 218,
 85 so this number is very generous.
 86 
 87 The same workspace is used during the second, actual compile phase for
 88 remembering forward references to groups so that they can be filled in at the
 89 end. Each entry in this list occupies LINK_SIZE bytes, so even when LINK_SIZE
 90 is 4 there is plenty of room. */
 91 
 92 #define COMPILE_WORK_SIZE (4096)
 93 
 94 
 95 /* Table for handling escaped characters in the range '0'-'z'. Positive returns
 96 are simple data values; negative values are for special things like \d and so
 97 on. Zero means further processing is needed (for things like \x), or the escape
 98 is invalid. */
 99 
100 #ifndef EBCDIC
101 
102 /* This is the "normal" table for ASCII systems or for EBCDIC systems running
103 in UTF-8 mode. */
104 
105 static const short int escapes[] = {
106      0,                       0,
107      0,                       0,
108      0,                       0,
109      0,                       0,
110      0,                       0,
111      CHAR_COLON,              CHAR_SEMICOLON,
112      CHAR_LESS_THAN_SIGN,     CHAR_EQUALS_SIGN,
113      CHAR_GREATER_THAN_SIGN,  CHAR_QUESTION_MARK,
114      CHAR_COMMERCIAL_AT,      -ESC_A,
115      -ESC_B,                  -ESC_C,
116      -ESC_D,                  -ESC_E,
117      0,                       -ESC_G,
118      -ESC_H,                  0,
119      0,                       -ESC_K,
120      0,                       0,
121      0,                       0,
122      -ESC_P,                  -ESC_Q,
123      -ESC_R,                  -ESC_S,
124      0,                       0,
125      -ESC_V,                  -ESC_W,
126      -ESC_X,                  0,
127      -ESC_Z,                  CHAR_LEFT_SQUARE_BRACKET,
128      CHAR_BACKSLASH,          CHAR_RIGHT_SQUARE_BRACKET,
129      CHAR_CIRCUMFLEX_ACCENT,  CHAR_UNDERSCORE,
130      CHAR_GRAVE_ACCENT,       7,
131      -ESC_b,                  0,
132      -ESC_d,                  ESC_e,
133      ESC_f,                   0,
134      -ESC_h,                  0,
135      0,                       -ESC_k,
136      0,                       0,
137      ESC_n,                   0,
138      -ESC_p,                  0,
139      ESC_r,                   -ESC_s,
140      ESC_tee,                 0,
141      -ESC_v,                  -ESC_w,
142      0,                       0,
143      -ESC_z
144 };
145 
146 #else
147 
148 /* This is the "abnormal" table for EBCDIC systems without UTF-8 support. */
149 
150 static const short int escapes[] = {
151 /*  48 */     0,     0,      0,     '.',    '<',   '(',    '+',    '|',
152 /*  50 */   '&',     0,      0,       0,      0,     0,      0,      0,
153 /*  58 */     0,     0,    '!',     '$',    '*',   ')',    ';',    '~',
154 /*  60 */   '-',   '/',      0,       0,      0,     0,      0,      0,
155 /*  68 */     0,     0,    '|',     ',',    '%',   '_',    '>',    '?',
156 /*  70 */     0,     0,      0,       0,      0,     0,      0,      0,
157 /*  78 */     0,   '`',    ':',     '#',    '@',  '\'',    '=',    '"',
158 /*  80 */     0,     7, -ESC_b,       0, -ESC_d, ESC_e,  ESC_f,      0,
159 /*  88 */-ESC_h,     0,      0,     '{',      0,     0,      0,      0,
160 /*  90 */     0,     0, -ESC_k,     'l',      0, ESC_n,      0, -ESC_p,
161 /*  98 */     0, ESC_r,      0,     '}',      0,     0,      0,      0,
162 /*  A0 */     0,   '~', -ESC_s, ESC_tee,      0,-ESC_v, -ESC_w,      0,
163 /*  A8 */     0,-ESC_z,      0,       0,      0,   '[',      0,      0,
164 /*  B0 */     0,     0,      0,       0,      0,     0,      0,      0,
165 /*  B8 */     0,     0,      0,       0,      0,   ']',    '=',    '-',
166 /*  C0 */   '{',-ESC_A, -ESC_B,  -ESC_C, -ESC_D,-ESC_E,      0, -ESC_G,
167 /*  C8 */-ESC_H,     0,      0,       0,      0,     0,      0,      0,
168 /*  D0 */   '}',     0, -ESC_K,       0,      0,     0,      0, -ESC_P,
169 /*  D8 */-ESC_Q,-ESC_R,      0,       0,      0,     0,      0,      0,
170 /*  E0 */  '\\',     0, -ESC_S,       0,      0,-ESC_V, -ESC_W, -ESC_X,
171 /*  E8 */     0,-ESC_Z,      0,       0,      0,     0,      0,      0,
172 /*  F0 */     0,     0,      0,       0,      0,     0,      0,      0,
173 /*  F8 */     0,     0,      0,       0,      0,     0,      0,      0
174 };
175 #endif
176 
177 
178 /* Table of special "verbs" like (*PRUNE). This is a short table, so it is
179 searched linearly. Put all the names into a single string, in order to reduce
180 the number of relocations when a shared library is dynamically linked. The
181 string is built from string macros so that it works in UTF-8 mode on EBCDIC
182 platforms. */
183 
184 typedef struct verbitem {
185   int   len;
186   int   op;
187 } verbitem;
188 
189 static const char verbnames[] =
190   STRING_ACCEPT0
191   STRING_COMMIT0
192   STRING_F0
193   STRING_FAIL0
194   STRING_PRUNE0
195   STRING_SKIP0
196   STRING_THEN;
197 
198 static const verbitem verbs[] = {
199   { 6, OP_ACCEPT },
200   { 6, OP_COMMIT },
201   { 1, OP_FAIL },
202   { 4, OP_FAIL },
203   { 5, OP_PRUNE },
204   { 4, OP_SKIP  },
205   { 4, OP_THEN  }
206 };
207 
208 static const int verbcount = sizeof(verbs)/sizeof(verbitem);
209 
210 
211 /* Tables of names of POSIX character classes and their lengths. The names are
212 now all in a single string, to reduce the number of relocations when a shared
213 library is dynamically loaded. The list of lengths is terminated by a zero
214 length entry. The first three must be alpha, lower, upper, as this is assumed
215 for handling case independence. */
216 
217 static const char posix_names[] =
218   STRING_alpha0 STRING_lower0 STRING_upper0 STRING_alnum0
219   STRING_ascii0 STRING_blank0 STRING_cntrl0 STRING_digit0
220   STRING_graph0 STRING_print0 STRING_punct0 STRING_space0
221   STRING_word0  STRING_xdigit;
222 
223 static const uschar posix_name_lengths[] = {
224   5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
225 
226 /* Table of class bit maps for each POSIX class. Each class is formed from a
227 base map, with an optional addition or removal of another map. Then, for some
228 classes, there is some additional tweaking: for [:blank:] the vertical space
229 characters are removed, and for [:alpha:] and [:alnum:] the underscore
230 character is removed. The triples in the table consist of the base map offset,
231 second map offset or -1 if no second map, and a non-negative value for map
232 addition or a negative value for map subtraction (if there are two maps). The
233 absolute value of the third field has these meanings: 0 => no tweaking, 1 =>
234 remove vertical space characters, 2 => remove underscore. */
235 
236 static const int posix_class_maps[] = {
237   cbit_word,  cbit_digit, -2,             /* alpha */
238   cbit_lower, -1,          0,             /* lower */
239   cbit_upper, -1,          0,             /* upper */
240   cbit_word,  -1,          2,             /* alnum - word without underscore */
241   cbit_print, cbit_cntrl,  0,             /* ascii */
242   cbit_space, -1,          1,             /* blank - a GNU extension */
243   cbit_cntrl, -1,          0,             /* cntrl */
244   cbit_digit, -1,          0,             /* digit */
245   cbit_graph, -1,          0,             /* graph */
246   cbit_print, -1,          0,             /* print */
247   cbit_punct, -1,          0,             /* punct */
248   cbit_space, -1,          0,             /* space */
249   cbit_word,  -1,          0,             /* word - a Perl extension */
250   cbit_xdigit,-1,          0              /* xdigit */
251 };
252 
253 
254 #define STRING(a)  # a
255 #define XSTRING(s) STRING(s)
256 
257 /* The texts of compile-time error messages. These are "char *" because they
258 are passed to the outside world. Do not ever re-use any error number, because
259 they are documented. Always add a new error instead. Messages marked DEAD below
260 are no longer used. This used to be a table of strings, but in order to reduce
261 the number of relocations needed when a shared library is loaded dynamically,
262 it is now one long string. We cannot use a table of offsets, because the
263 lengths of inserts such as XSTRING(MAX_NAME_SIZE) are not known. Instead, we
264 simply count through to the one we want - this isn't a performance issue
265 because these strings are used only when there is a compilation error. */
266 
267 static const char error_texts[] =
268   "no error\0"
269   "\\ at end of pattern\0"
270   "\\c at end of pattern\0"
271   "unrecognized character follows \\\0"
272   "numbers out of order in {} quantifier\0"
273   /* 5 */
274   "number too big in {} quantifier\0"
275   "missing terminating ] for character class\0"
276   "invalid escape sequence in character class\0"
277   "range out of order in character class\0"
278   "nothing to repeat\0"
279   /* 10 */
280   "operand of unlimited repeat could match the empty string\0"  /** DEAD **/
281   "internal error: unexpected repeat\0"
282   "unrecognized character after (? or (?-\0"
283   "POSIX named classes are supported only within a class\0"
284   "missing )\0"
285   /* 15 */
286   "reference to non-existent subpattern\0"
287   "erroffset passed as NULL\0"
288   "unknown option bit(s) set\0"
289   "missing ) after comment\0"
290   "parentheses nested too deeply\0"  /** DEAD **/
291   /* 20 */
292   "regular expression is too large\0"
293   "failed to get memory\0"
294   "unmatched parentheses\0"
295   "internal error: code overflow\0"
296   "unrecognized character after (?<\0"
297   /* 25 */
298   "lookbehind assertion is not fixed length\0"
299   "malformed number or name after (?(\0"
300   "conditional group contains more than two branches\0"
301   "assertion expected after (?(\0"
302   "(?R or (?[+-]digits must be followed by )\0"
303   /* 30 */
304   "unknown POSIX class name\0"
305   "POSIX collating elements are not supported\0"
306   "this version of PCRE is not compiled with PCRE_UTF8 support\0"
307   "spare error\0"  /** DEAD **/
308   "character value in \\x{...} sequence is too large\0"
309   /* 35 */
310   "invalid condition (?(0)\0"
311   "\\C not allowed in lookbehind assertion\0"
312   "PCRE does not support \\L, \\l, \\N, \\U, or \\u\0"
313   "number after (?C is > 255\0"
314   "closing ) for (?C expected\0"
315   /* 40 */
316   "recursive call could loop indefinitely\0"
317   "unrecognized character after (?P\0"
318   "syntax error in subpattern name (missing terminator)\0"
319   "two named subpatterns have the same name\0"
320   "invalid UTF-8 string\0"
321   /* 45 */
322   "support for \\P, \\p, and \\X has not been compiled\0"
323   "malformed \\P or \\p sequence\0"
324   "unknown property name after \\P or \\p\0"
325   "subpattern name is too long (maximum " XSTRING(MAX_NAME_SIZE) " characters)\0"
326   "too many named subpatterns (maximum " XSTRING(MAX_NAME_COUNT) ")\0"
327   /* 50 */
328   "repeated subpattern is too long\0"    /** DEAD **/
329   "octal value is greater than \\377 (not in UTF-8 mode)\0"
330   "internal error: overran compiling workspace\0"
331   "internal error: previously-checked referenced subpattern not found\0"
332   "DEFINE group contains more than one branch\0"
333   /* 55 */
334   "repeating a DEFINE group is not allowed\0"
335   "inconsistent NEWLINE options\0"
336   "\\g is not followed by a braced, angle-bracketed, or quoted name/number or by a plain number\0"
337   "a numbered reference must not be zero\0"
338   "(*VERB) with an argument is not supported\0"
339   /* 60 */
340   "(*VERB) not recognized\0"
341   "number is too big\0"
342   "subpattern name expected\0"
343   "digit expected after (?+\0"
344   "] is an invalid data character in JavaScript compatibility mode";
345 
346 
347 /* Table to identify digits and hex digits. This is used when compiling
348 patterns. Note that the tables in chartables are dependent on the locale, and
349 may mark arbitrary characters as digits - but the PCRE compiling code expects
350 to handle only 0-9, a-z, and A-Z as digits when compiling. That is why we have
351 a private table here. It costs 256 bytes, but it is a lot faster than doing
352 character value tests (at least in some simple cases I timed), and in some
353 applications one wants PCRE to compile efficiently as well as match
354 efficiently.
355 
356 For convenience, we use the same bit definitions as in chartables:
357 
358   0x04   decimal digit
359   0x08   hexadecimal digit
360 
361 Then we can use ctype_digit and ctype_xdigit in the code. */
362 
363 #ifndef EBCDIC
364 
365 /* This is the "normal" case, for ASCII systems, and EBCDIC systems running in
366 UTF-8 mode. */
367 
368 static const unsigned char digitab[] =
369   {
370   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7 */
371   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15 */
372   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 */
373   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
374   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - '  */
375   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ( - /  */
376   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  */
377   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00, /*  8 - ?  */
378   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  @ - G  */
379   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H - O  */
380   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  P - W  */
381   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  X - _  */
382   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  ` - g  */
383   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h - o  */
384   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  p - w  */
385   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  x -127 */
386   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 128-135 */
387   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 136-143 */
388   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144-151 */
389   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 152-159 */
390   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160-167 */
391   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 168-175 */
392   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 176-183 */
393   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
394   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 192-199 */
395   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 200-207 */
396   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 208-215 */
397   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 216-223 */
398   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 224-231 */
399   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 232-239 */
400   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 240-247 */
401   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00};/* 248-255 */
402 
403 #else
404 
405 /* This is the "abnormal" case, for EBCDIC systems not running in UTF-8 mode. */
406 
407 static const unsigned char digitab[] =
408   {
409   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   0-  7  0 */
410   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*   8- 15    */
411   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  16- 23 10 */
412   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31    */
413   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  32- 39 20 */
414   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47    */
415   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 30 */
416   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63    */
417   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 40 */
418   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  72- |     */
419   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 50 */
420   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  88- 95    */
421   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 60 */
422   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 104- ?     */
423   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 70 */
424   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "     */
425   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /* 128- g  80 */
426   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143    */
427   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 144- p  90 */
428   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159    */
429   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 160- x  A0 */
430   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175    */
431   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 B0 */
432   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 184-191    */
433   0x00,0x08,0x08,0x08,0x08,0x08,0x08,0x00, /*  { - G  C0 */
434   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207    */
435   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  } - P  D0 */
436   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223    */
437   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  \ - X  E0 */
438   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239    */
439   0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c,0x0c, /*  0 - 7  F0 */
440   0x0c,0x0c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255    */
441 
442 static const unsigned char ebcdic_chartab[] = { /* chartable partial dup */
443   0x80,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*   0-  7 */
444   0x00,0x00,0x00,0x00,0x01,0x01,0x00,0x00, /*   8- 15 */
445   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  16- 23 */
446   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  24- 31 */
447   0x00,0x00,0x00,0x00,0x00,0x01,0x00,0x00, /*  32- 39 */
448   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  40- 47 */
449   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  48- 55 */
450   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  56- 63 */
451   0x01,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*    - 71 */
452   0x00,0x00,0x00,0x80,0x00,0x80,0x80,0x80, /*  72- |  */
453   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  & - 87 */
454   0x00,0x00,0x00,0x80,0x80,0x80,0x00,0x00, /*  88- 95 */
455   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  - -103 */
456   0x00,0x00,0x00,0x00,0x00,0x10,0x00,0x80, /* 104- ?  */
457   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 112-119 */
458   0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /* 120- "  */
459   0x00,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /* 128- g  */
460   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  h -143 */
461   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /* 144- p  */
462   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  q -159 */
463   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /* 160- x  */
464   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  y -175 */
465   0x80,0x00,0x00,0x00,0x00,0x00,0x00,0x00, /*  ^ -183 */
466   0x00,0x00,0x80,0x00,0x00,0x00,0x00,0x00, /* 184-191 */
467   0x80,0x1a,0x1a,0x1a,0x1a,0x1a,0x1a,0x12, /*  { - G  */
468   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  H -207 */
469   0x00,0x12,0x12,0x12,0x12,0x12,0x12,0x12, /*  } - P  */
470   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Q -223 */
471   0x00,0x00,0x12,0x12,0x12,0x12,0x12,0x12, /*  \ - X  */
472   0x12,0x12,0x00,0x00,0x00,0x00,0x00,0x00, /*  Y -239 */
473   0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c,0x1c, /*  0 - 7  */
474   0x1c,0x1c,0x00,0x00,0x00,0x00,0x00,0x00};/*  8 -255 */
475 #endif
476 
477 
478 /* Definition to allow mutual recursion */
479 
480 static BOOL
481   compile_regex(int, int, uschar **, const uschar **, int *, BOOL, BOOL, int,
482     int *, int *, branch_chain *, compile_data *, int *);
483 
484 
485 
486 /*************************************************
487 *            Find an error text                  *
488 *************************************************/
489 
490 /* The error texts are now all in one long string, to save on relocations. As
491 some of the text is of unknown length, we can't use a table of offsets.
492 Instead, just count through the strings. This is not a performance issue
493 because it happens only when there has been a compilation error.
494 
495 Argument:   the error number
496 Returns:    pointer to the error string
497 */
498 
499 static const char *
500 find_error_text(int n)
501 {
502 const char *s = error_texts;
503 for (; n > 0; n--) while (*s++ != 0) {};
504 return s;
505 }
506 
507 
508 /*************************************************
509 *            Handle escapes                      *
510 *************************************************/
511 
512 /* This function is called when a \ has been encountered. It either returns a
513 positive value for a simple escape such as \n, or a negative value which
514 encodes one of the more complicated things such as \d. A backreference to group
515 n is returned as -(ESC_REF + n); ESC_REF is the highest ESC_xxx macro. When
516 UTF-8 is enabled, a positive value greater than 255 may be returned. On entry,
517 ptr is pointing at the \. On exit, it is on the final character of the escape
518 sequence.
519 
520 Arguments:
521   ptrptr         points to the pattern position pointer
522   errorcodeptr   points to the errorcode variable
523   bracount       number of previous extracting brackets
524   options        the options bits
525   isclass        TRUE if inside a character class
526 
527 Returns:         zero or positive => a data character
528                  negative => a special escape sequence
529                  on error, errorcodeptr is set
530 */
531 
532 static int
533 check_escape(const uschar **ptrptr, int *errorcodeptr, int bracount,
534   int options, BOOL isclass)
535 {
536 BOOL utf8 = (options & PCRE_UTF8) != 0;
537 const uschar *ptr = *ptrptr + 1;
538 int c, i;
539 
540 GETCHARINCTEST(c, ptr);           /* Get character value, increment pointer */
541 ptr--;                            /* Set pointer back to the last byte */
542 
543 /* If backslash is at the end of the pattern, it's an error. */
544 
545 if (c == 0) *errorcodeptr = ERR1;
546 
547 /* Non-alphanumerics are literals. For digits or letters, do an initial lookup
548 in a table. A non-zero result is something that can be returned immediately.
549 Otherwise further processing may be required. */
550 
551 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
552 else if (c < CHAR_0 || c > CHAR_z) {}                     /* Not alphanumeric */
553 else if ((i = escapes[c - CHAR_0]) != 0) c = i;
554 
555 #else           /* EBCDIC coding */
556 else if (c < 'a' || (ebcdic_chartab[c] & 0x0E) == 0) {}   /* Not alphanumeric */
557 else if ((i = escapes[c - 0x48]) != 0)  c = i;
558 #endif
559 
560 /* Escapes that need further processing, or are illegal. */
561 
562 else
563   {
564   const uschar *oldptr;
565   BOOL braced, negated;
566 
567   switch (c)
568     {
569     /* A number of Perl escapes are not handled by PCRE. We give an explicit
570     error. */
571 
572     case CHAR_l:
573     case CHAR_L:
574     case CHAR_N:
575     case CHAR_u:
576     case CHAR_U:
577     *errorcodeptr = ERR37;
578     break;
579 
580     /* \g must be followed by one of a number of specific things:
581 
582     (1) A number, either plain or braced. If positive, it is an absolute
583     backreference. If negative, it is a relative backreference. This is a Perl
584     5.10 feature.
585 
586     (2) Perl 5.10 also supports \g{name} as a reference to a named group. This
587     is part of Perl's movement towards a unified syntax for back references. As
588     this is synonymous with \k{name}, we fudge it up by pretending it really
589     was \k.
590 
591     (3) For Oniguruma compatibility we also support \g followed by a name or a
592     number either in angle brackets or in single quotes. However, these are
593     (possibly recursive) subroutine calls, _not_ backreferences. Just return
594     the -ESC_g code (cf \k). */
595 
596     case CHAR_g:
597     if (ptr[1] == CHAR_LESS_THAN_SIGN || ptr[1] == CHAR_APOSTROPHE)
598       {
599       c = -ESC_g;
600       break;
601       }
602 
603     /* Handle the Perl-compatible cases */
604 
605     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
606       {
607       const uschar *p;
608       for (p = ptr+2; *p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET; p++)
609         if (*p != CHAR_MINUS && (digitab[*p] & ctype_digit) == 0) break;
610       if (*p != 0 && *p != CHAR_RIGHT_CURLY_BRACKET)
611         {
612         c = -ESC_k;
613         break;
614         }
615       braced = TRUE;
616       ptr++;
617       }
618     else braced = FALSE;
619 
620     if (ptr[1] == CHAR_MINUS)
621       {
622       negated = TRUE;
623       ptr++;
624       }
625     else negated = FALSE;
626 
627     c = 0;
628     while ((digitab[ptr[1]] & ctype_digit) != 0)
629       c = c * 10 + *(++ptr) - CHAR_0;
630 
631     if (c < 0)   /* Integer overflow */
632       {
633       *errorcodeptr = ERR61;
634       break;
635       }
636 
637     if (braced && *(++ptr) != CHAR_RIGHT_CURLY_BRACKET)
638       {
639       *errorcodeptr = ERR57;
640       break;
641       }
642 
643     if (c == 0)
644       {
645       *errorcodeptr = ERR58;
646       break;
647       }
648 
649     if (negated)
650       {
651       if (c > bracount)
652         {
653         *errorcodeptr = ERR15;
654         break;
655         }
656       c = bracount - (c - 1);
657       }
658 
659     c = -(ESC_REF + c);
660     break;
661 
662     /* The handling of escape sequences consisting of a string of digits
663     starting with one that is not zero is not straightforward. By experiment,
664     the way Perl works seems to be as follows:
665 
666     Outside a character class, the digits are read as a decimal number. If the
667     number is less than 10, or if there are that many previous extracting
668     left brackets, then it is a back reference. Otherwise, up to three octal
669     digits are read to form an escaped byte. Thus \123 is likely to be octal
670     123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
671     value is greater than 377, the least significant 8 bits are taken. Inside a
672     character class, \ followed by a digit is always an octal number. */
673 
674     case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4: case CHAR_5:
675     case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
676 
677     if (!isclass)
678       {
679       oldptr = ptr;
680       c -= CHAR_0;
681       while ((digitab[ptr[1]] & ctype_digit) != 0)
682         c = c * 10 + *(++ptr) - CHAR_0;
683       if (c < 0)    /* Integer overflow */
684         {
685         *errorcodeptr = ERR61;
686         break;
687         }
688       if (c < 10 || c <= bracount)
689         {
690         c = -(ESC_REF + c);
691         break;
692         }
693       ptr = oldptr;      /* Put the pointer back and fall through */
694       }
695 
696     /* Handle an octal number following \. If the first digit is 8 or 9, Perl
697     generates a binary zero byte and treats the digit as a following literal.
698     Thus we have to pull back the pointer by one. */
699 
700     if ((c = *ptr) >= CHAR_8)
701       {
702       ptr--;
703       c = 0;
704       break;
705       }
706 
707     /* \0 always starts an octal number, but we may drop through to here with a
708     larger first octal digit. The original code used just to take the least
709     significant 8 bits of octal numbers (I think this is what early Perls used
710     to do). Nowadays we allow for larger numbers in UTF-8 mode, but no more
711     than 3 octal digits. */
712 
713     case CHAR_0:
714     c -= CHAR_0;
715     while(i++ < 2 && ptr[1] >= CHAR_0 && ptr[1] <= CHAR_7)
716         c = c * 8 + *(++ptr) - CHAR_0;
717     if (!utf8 && c > 255) *errorcodeptr = ERR51;
718     break;
719 
720     /* \x is complicated. \x{ddd} is a character number which can be greater
721     than 0xff in utf8 mode, but only if the ddd are hex digits. If not, { is
722     treated as a data character. */
723 
724     case CHAR_x:
725     if (ptr[1] == CHAR_LEFT_CURLY_BRACKET)
726       {
727       const uschar *pt = ptr + 2;
728       int count = 0;
729 
730       c = 0;
731       while ((digitab[*pt] & ctype_xdigit) != 0)
732         {
733         register int cc = *pt++;
734         if (c == 0 && cc == CHAR_0) continue;     /* Leading zeroes */
735         count++;
736 
737 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
738         if (cc >= CHAR_a) cc -= 32;               /* Convert to upper case */
739         c = (c << 4) + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
740 #else           /* EBCDIC coding */
741         if (cc >= CHAR_a && cc <= CHAR_z) cc += 64;  /* Convert to upper case */
742         c = (c << 4) + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
743 #endif
744         }
745 
746       if (*pt == CHAR_RIGHT_CURLY_BRACKET)
747         {
748         if (c < 0 || count > (utf8? 8 : 2)) *errorcodeptr = ERR34;
749         ptr = pt;
750         break;
751         }
752 
753       /* If the sequence of hex digits does not end with '}', then we don't
754       recognize this construct; fall through to the normal \x handling. */
755       }
756 
757     /* Read just a single-byte hex-defined char */
758 
759     c = 0;
760     while (i++ < 2 && (digitab[ptr[1]] & ctype_xdigit) != 0)
761       {
762       int cc;                                  /* Some compilers don't like */
763       cc = *(++ptr);                           /* ++ in initializers */
764 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
765       if (cc >= CHAR_a) cc -= 32;              /* Convert to upper case */
766       c = c * 16 + cc - ((cc < CHAR_A)? CHAR_0 : (CHAR_A - 10));
767 #else           /* EBCDIC coding */
768       if (cc <= CHAR_z) cc += 64;              /* Convert to upper case */
769       c = c * 16 + cc - ((cc >= CHAR_0)? CHAR_0 : (CHAR_A - 10));
770 #endif
771       }
772     break;
773 
774     /* For \c, a following letter is upper-cased; then the 0x40 bit is flipped.
775     This coding is ASCII-specific, but then the whole concept of \cx is
776     ASCII-specific. (However, an EBCDIC equivalent has now been added.) */
777 
778     case CHAR_c:
779     c = *(++ptr);
780     if (c == 0)
781       {
782       *errorcodeptr = ERR2;
783       break;
784       }
785 
786 #ifndef EBCDIC  /* ASCII/UTF-8 coding */
787     if (c >= CHAR_a && c <= CHAR_z) c -= 32;
788     c ^= 0x40;
789 #else           /* EBCDIC coding */
790     if (c >= CHAR_a && c <= CHAR_z) c += 64;
791     c ^= 0xC0;
792 #endif
793     break;
794 
795     /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
796     other alphanumeric following \ is an error if PCRE_EXTRA was set;
797     otherwise, for Perl compatibility, it is a literal. This code looks a bit
798     odd, but there used to be some cases other than the default, and there may
799     be again in future, so I haven't "optimized" it. */
800 
801     default:
802     if ((options & PCRE_EXTRA) != 0) switch(c)
803       {
804       default:
805       *errorcodeptr = ERR3;
806       break;
807       }
808     break;
809     }
810   }
811 
812 *ptrptr = ptr;
813 return c;
814 }
815 
816 
817 
818 #ifdef SUPPORT_UCP
819 /*************************************************
820 *               Handle \P and \p                 *
821 *************************************************/
822 
823 /* This function is called after \P or \p has been encountered, provided that
824 PCRE is compiled with support for Unicode properties. On entry, ptrptr is
825 pointing at the P or p. On exit, it is pointing at the final character of the
826 escape sequence.
827 
828 Argument:
829   ptrptr         points to the pattern position pointer
830   negptr         points to a boolean that is set TRUE for negation else FALSE
831   dptr           points to an int that is set to the detailed property value
832   errorcodeptr   points to the error code variable
833 
834 Returns:         type value from ucp_type_table, or -1 for an invalid type
835 */
836 
837 static int
838 get_ucp(const uschar **ptrptr, BOOL *negptr, int *dptr, int *errorcodeptr)
839 {
840 int c, i, bot, top;
841 const uschar *ptr = *ptrptr;
842 char name[32];
843 
844 c = *(++ptr);
845 if (c == 0) goto ERROR_RETURN;
846 
847 *negptr = FALSE;
848 
849 /* \P or \p can be followed by a name in {}, optionally preceded by ^ for
850 negation. */
851 
852 if (c == CHAR_LEFT_CURLY_BRACKET)
853   {
854   if (ptr[1] == CHAR_CIRCUMFLEX_ACCENT)
855     {
856     *negptr = TRUE;
857     ptr++;
858     }
859   for (i = 0; i < (int)sizeof(name) - 1; i++)
860     {
861     c = *(++ptr);
862     if (c == 0) goto ERROR_RETURN;
863     if (c == CHAR_RIGHT_CURLY_BRACKET) break;
864     name[i] = c;
865     }
866   if (c != CHAR_RIGHT_CURLY_BRACKET) goto ERROR_RETURN;
867   name[i] = 0;
868   }
869 
870 /* Otherwise there is just one following character */
871 
872 else
873   {
874   name[0] = c;
875   name[1] = 0;
876   }
877 
878 *ptrptr = ptr;
879 
880 /* Search for a recognized property name using binary chop */
881 
882 bot = 0;
883 top = _pcre_utt_size;
884 
885 while (bot < top)
886   {
887   i = (bot + top) >> 1;
888   c = strcmp(name, _pcre_utt_names + _pcre_utt[i].name_offset);
889   if (c == 0)
890     {
891     *dptr = _pcre_utt[i].value;
892     return _pcre_utt[i].type;
893     }
894   if (c > 0) bot = i + 1; else top = i;
895   }
896 
897 *errorcodeptr = ERR47;
898 *ptrptr = ptr;
899 return -1;
900 
901 ERROR_RETURN:
902 *errorcodeptr = ERR46;
903 *ptrptr = ptr;
904 return -1;
905 }
906 #endif
907 
908 
909 
910 
911 /*************************************************
912 *            Check for counted repeat            *
913 *************************************************/
914 
915 /* This function is called when a '{' is encountered in a place where it might
916 start a quantifier. It looks ahead to see if it really is a quantifier or not.
917 It is only a quantifier if it is one of the forms {ddd} {ddd,} or {ddd,ddd}
918 where the ddds are digits.
919 
920 Arguments:
921   p         pointer to the first char after '{'
922 
923 Returns:    TRUE or FALSE
924 */
925 
926 static BOOL
927 is_counted_repeat(const uschar *p)
928 {
929 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
930 while ((digitab[*p] & ctype_digit) != 0) p++;
931 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
932 
933 if (*p++ != CHAR_COMMA) return FALSE;
934 if (*p == CHAR_RIGHT_CURLY_BRACKET) return TRUE;
935 
936 if ((digitab[*p++] & ctype_digit) == 0) return FALSE;
937 while ((digitab[*p] & ctype_digit) != 0) p++;
938 
939 return (*p == CHAR_RIGHT_CURLY_BRACKET);
940 }
941 
942 
943 
944 /*************************************************
945 *         Read repeat counts                     *
946 *************************************************/
947 
948 /* Read an item of the form {n,m} and return the values. This is called only
949 after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
950 so the syntax is guaranteed to be correct, but we need to check the values.
951 
952 Arguments:
953   p              pointer to first char after '{'
954   minp           pointer to int for min
955   maxp           pointer to int for max
956                  returned as -1 if no max
957   errorcodeptr   points to error code variable
958 
959 Returns:         pointer to '}' on success;
960                  current ptr on error, with errorcodeptr set non-zero
961 */
962 
963 static const uschar *
964 read_repeat_counts(const uschar *p, int *minp, int *maxp, int *errorcodeptr)
965 {
966 int min = 0;
967 int max = -1;
968 
969 /* Read the minimum value and do a paranoid check: a negative value indicates
970 an integer overflow. */
971 
972 while ((digitab[*p] & ctype_digit) != 0) min = min * 10 + *p++ - CHAR_0;
973 if (min < 0 || min > 65535)
974   {
975   *errorcodeptr = ERR5;
976   return p;
977   }
978 
979 /* Read the maximum value if there is one, and again do a paranoid on its size.
980 Also, max must not be less than min. */
981 
982 if (*p == CHAR_RIGHT_CURLY_BRACKET) max = min; else
983   {
984   if (*(++p) != CHAR_RIGHT_CURLY_BRACKET)
985     {
986     max = 0;
987     while((digitab[*p] & ctype_digit) != 0) max = max * 10 + *p++ - CHAR_0;
988     if (max < 0 || max > 65535)
989       {
990       *errorcodeptr = ERR5;
991       return p;
992       }
993     if (max < min)
994       {
995       *errorcodeptr = ERR4;
996       return p;
997       }
998     }
999   }
1000 
1001 /* Fill in the required variables, and pass back the pointer to the terminating
1002 '}'. */
1003 
1004 *minp = min;
1005 *maxp = max;
1006 return p;
1007 }
1008 
1009 
1010 
1011 /*************************************************
1012 *  Subroutine for finding forward reference      *
1013 *************************************************/
1014 
1015 /* This recursive function is called only from find_parens() below. The
1016 top-level call starts at the beginning of the pattern. All other calls must
1017 start at a parenthesis. It scans along a pattern's text looking for capturing
1018 subpatterns, and counting them. If it finds a named pattern that matches the
1019 name it is given, it returns its number. Alternatively, if the name is NULL, it
1020 returns when it reaches a given numbered subpattern. We know that if (?P< is
1021 encountered, the name will be terminated by '>' because that is checked in the
1022 first pass. Recursion is used to keep track of subpatterns that reset the
1023 capturing group numbers - the (?| feature.
1024 
1025 Arguments:
1026   ptrptr       address of the current character pointer (updated)
1027   cd           compile background data
1028   name         name to seek, or NULL if seeking a numbered subpattern
1029   lorn         name length, or subpattern number if name is NULL
1030   xmode        TRUE if we are in /x mode
1031   count        pointer to the current capturing subpattern number (updated)
1032 
1033 Returns:       the number of the named subpattern, or -1 if not found
1034 */
1035 
1036 static int
1037 find_parens_sub(uschar **ptrptr, compile_data *cd, const uschar *name, int lorn,
1038   BOOL xmode, int *count)
1039 {
1040 uschar *ptr = *ptrptr;
1041 int start_count = *count;
1042 int hwm_count = start_count;
1043 BOOL dup_parens = FALSE;
1044 
1045 /* If the first character is a parenthesis, check on the type of group we are
1046 dealing with. The very first call may not start with a parenthesis. */
1047 
1048 if (ptr[0] == CHAR_LEFT_PARENTHESIS)
1049   {
1050   if (ptr[1] == CHAR_QUESTION_MARK &&
1051       ptr[2] == CHAR_VERTICAL_LINE)
1052     {
1053     ptr += 3;
1054     dup_parens = TRUE;
1055     }
1056 
1057   /* Handle a normal, unnamed capturing parenthesis */
1058 
1059   else if (ptr[1] != CHAR_QUESTION_MARK && ptr[1] != CHAR_ASTERISK)
1060     {
1061     *count += 1;
1062     if (name == NULL && *count == lorn) return *count;
1063     ptr++;
1064     }
1065 
1066   /* Handle a condition. If it is an assertion, just carry on so that it
1067   is processed as normal. If not, skip to the closing parenthesis of the
1068   condition (there can't be any nested parens. */
1069 
1070   else if (ptr[2] == CHAR_LEFT_PARENTHESIS)
1071     {
1072     ptr += 2;
1073     if (ptr[1] != CHAR_QUESTION_MARK)
1074       {
1075       while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
1076       if (*ptr != 0) ptr++;
1077       }
1078     }
1079 
1080   /* We have either (? or (* and not a condition */
1081 
1082   else
1083     {
1084     ptr += 2;
1085     if (*ptr == CHAR_P) ptr++;                      /* Allow optional P */
1086 
1087     /* We have to disambiguate (?<! and (?<= from (?<name> for named groups */
1088 
1089     if ((*ptr == CHAR_LESS_THAN_SIGN && ptr[1] != CHAR_EXCLAMATION_MARK &&
1090         ptr[1] != CHAR_EQUALS_SIGN) || *ptr == CHAR_APOSTROPHE)
1091       {
1092       int term;
1093       const uschar *thisname;
1094       *count += 1;
1095       if (name == NULL && *count == lorn) return *count;
1096       term = *ptr++;
1097       if (term == CHAR_LESS_THAN_SIGN) term = CHAR_GREATER_THAN_SIGN;
1098       thisname = ptr;
1099       while (*ptr != term) ptr++;
1100       if (name != NULL && lorn == ptr - thisname &&
1101           strncmp((const char *)name, (const char *)thisname, lorn) == 0)
1102         return *count;
1103       }
1104     }
1105   }
1106 
1107 /* Past any initial parenthesis handling, scan for parentheses or vertical
1108 bars. */
1109 
1110 for (; *ptr != 0; ptr++)
1111   {
1112   /* Skip over backslashed characters and also entire \Q...\E */
1113 
1114   if (*ptr == CHAR_BACKSLASH)
1115     {
1116     if (*(++ptr) == 0) goto FAIL_EXIT;
1117     if (*ptr == CHAR_Q) for (;;)
1118       {
1119       while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1120       if (*ptr == 0) goto FAIL_EXIT;
1121       if (*(++ptr) == CHAR_E) break;
1122       }
1123     continue;
1124     }
1125 
1126   /* Skip over character classes; this logic must be similar to the way they
1127   are handled for real. If the first character is '^', skip it. Also, if the
1128   first few characters (either before or after ^) are \Q\E or \E we skip them
1129   too. This makes for compatibility with Perl. Note the use of STR macros to
1130   encode "Q\\E" so that it works in UTF-8 on EBCDIC platforms. */
1131 
1132   if (*ptr == CHAR_LEFT_SQUARE_BRACKET)
1133     {
1134     BOOL negate_class = FALSE;
1135     for (;;)
1136       {
1137       int c = *(++ptr);
1138       if (c == CHAR_BACKSLASH)
1139         {
1140         if (ptr[1] == CHAR_E)
1141           ptr++;
1142         else if (strncmp((const char *)ptr+1,
1143                  STR_Q STR_BACKSLASH STR_E, 3) == 0)
1144           ptr += 3;
1145         else
1146           break;
1147         }
1148       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
1149         negate_class = TRUE;
1150       else break;
1151       }
1152 
1153     /* If the next character is ']', it is a data character that must be
1154     skipped, except in JavaScript compatibility mode. */
1155 
1156     if (ptr[1] == CHAR_RIGHT_SQUARE_BRACKET &&
1157         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) == 0)
1158       ptr++;
1159 
1160     while (*(++ptr) != CHAR_RIGHT_SQUARE_BRACKET)
1161       {
1162       if (*ptr == 0) return -1;
1163       if (*ptr == CHAR_BACKSLASH)
1164         {
1165         if (*(++ptr) == 0) goto FAIL_EXIT;
1166         if (*ptr == CHAR_Q) for (;;)
1167           {
1168           while (*(++ptr) != 0 && *ptr != CHAR_BACKSLASH) {};
1169           if (*ptr == 0) goto FAIL_EXIT;
1170           if (*(++ptr) == CHAR_E) break;
1171           }
1172         continue;
1173         }
1174       }
1175     continue;
1176     }
1177 
1178   /* Skip comments in /x mode */
1179 
1180   if (xmode && *ptr == CHAR_NUMBER_SIGN)
1181     {
1182     while (*(++ptr) != 0 && *ptr != CHAR_NL) {};
1183     if (*ptr == 0) goto FAIL_EXIT;
1184     continue;
1185     }
1186 
1187   /* Check for the special metacharacters */
1188 
1189   if (*ptr == CHAR_LEFT_PARENTHESIS)
1190     {
1191     int rc = find_parens_sub(&ptr, cd, name, lorn, xmode, count);
1192     if (rc > 0) return rc;
1193     if (*ptr == 0) goto FAIL_EXIT;
1194     }
1195 
1196   else if (*ptr == CHAR_RIGHT_PARENTHESIS)
1197     {
1198     if (dup_parens && *count < hwm_count) *count = hwm_count;
1199     *ptrptr = ptr;
1200     return -1;
1201     }
1202 
1203   else if (*ptr == CHAR_VERTICAL_LINE && dup_parens)
1204     {
1205     if (*count > hwm_count) hwm_count = *count;
1206     *count = start_count;
1207     }
1208   }
1209 
1210 FAIL_EXIT:
1211 *ptrptr = ptr;
1212 return -1;
1213 }
1214 
1215 
1216 
1217 
1218 /*************************************************
1219 *       Find forward referenced subpattern       *
1220 *************************************************/
1221 
1222 /* This function scans along a pattern's text looking for capturing
1223 subpatterns, and counting them. If it finds a named pattern that matches the
1224 name it is given, it returns its number. Alternatively, if the name is NULL, it
1225 returns when it reaches a given numbered subpattern. This is used for forward
1226 references to subpatterns. We used to be able to start this scan from the
1227 current compiling point, using the current count value from cd->bracount, and
1228 do it all in a single loop, but the addition of the possibility of duplicate
1229 subpattern numbers means that we have to scan from the very start, in order to
1230 take account of such duplicates, and to use a recursive function to keep track
1231 of the different types of group.
1232 
1233 Arguments:
1234   cd           compile background data
1235   name         name to seek, or NULL if seeking a numbered subpattern
1236   lorn         name length, or subpattern number if name is NULL
1237   xmode        TRUE if we are in /x mode
1238 
1239 Returns:       the number of the found subpattern, or -1 if not found
1240 */
1241 
1242 static int
1243 find_parens(compile_data *cd, const uschar *name, int lorn, BOOL xmode)
1244 {
1245 uschar *ptr = (uschar *)cd->start_pattern;
1246 int count = 0;
1247 int rc;
1248 
1249 /* If the pattern does not start with an opening parenthesis, the first call
1250 to find_parens_sub() will scan right to the end (if necessary). However, if it
1251 does start with a parenthesis, find_parens_sub() will return when it hits the
1252 matching closing parens. That is why we have to have a loop. */
1253 
1254 for (;;)
1255   {
1256   rc = find_parens_sub(&ptr, cd, name, lorn, xmode, &count);
1257   if (rc > 0 || *ptr++ == 0) break;
1258   }
1259 
1260 return rc;
1261 }
1262 
1263 
1264 
1265 
1266 /*************************************************
1267 *      Find first significant op code            *
1268 *************************************************/
1269 
1270 /* This is called by several functions that scan a compiled expression looking
1271 for a fixed first character, or an anchoring op code etc. It skips over things
1272 that do not influence this. For some calls, a change of option is important.
1273 For some calls, it makes sense to skip negative forward and all backward
1274 assertions, and also the \b assertion; for others it does not.
1275 
1276 Arguments:
1277   code         pointer to the start of the group
1278   options      pointer to external options
1279   optbit       the option bit whose changing is significant, or
1280                  zero if none are
1281   skipassert   TRUE if certain assertions are to be skipped
1282 
1283 Returns:       pointer to the first significant opcode
1284 */
1285 
1286 static const uschar*
1287 first_significant_code(const uschar *code, int *options, int optbit,
1288   BOOL skipassert)
1289 {
1290 for (;;)
1291   {
1292   switch ((int)*code)
1293     {
1294     case OP_OPT:
1295     if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit))
1296       *options = (int)code[1];
1297     code += 2;
1298     break;
1299 
1300     case OP_ASSERT_NOT:
1301     case OP_ASSERTBACK:
1302     case OP_ASSERTBACK_NOT:
1303     if (!skipassert) return code;
1304     do code += GET(code, 1); while (*code == OP_ALT);
1305     code += _pcre_OP_lengths[*code];
1306     break;
1307 
1308     case OP_WORD_BOUNDARY:
1309     case OP_NOT_WORD_BOUNDARY:
1310     if (!skipassert) return code;
1311     /* Fall through */
1312 
1313     case OP_CALLOUT:
1314     case OP_CREF:
1315     case OP_RREF:
1316     case OP_DEF:
1317     code += _pcre_OP_lengths[*code];
1318     break;
1319 
1320     default:
1321     return code;
1322     }
1323   }
1324 /* Control never reaches here */
1325 }
1326 
1327 
1328 
1329 
1330 /*************************************************
1331 *        Find the fixed length of a pattern      *
1332 *************************************************/
1333 
1334 /* Scan a pattern and compute the fixed length of subject that will match it,
1335 if the length is fixed. This is needed for dealing with backward assertions.
1336 In UTF8 mode, the result is in characters rather than bytes.
1337 
1338 Arguments:
1339   code     points to the start of the pattern (the bracket)
1340   options  the compiling options
1341 
1342 Returns:   the fixed length, or -1 if there is no fixed length,
1343              or -2 if \C was encountered
1344 */
1345 
1346 static int
1347 find_fixedlength(uschar *code, int options)
1348 {
1349 int length = -1;
1350 
1351 register int branchlength = 0;
1352 register uschar *cc = code + 1 + LINK_SIZE;
1353 
1354 /* Scan along the opcodes for this branch. If we get to the end of the
1355 branch, check the length against that of the other branches. */
1356 
1357 for (;;)
1358   {
1359   int d;
1360   register int op = *cc;
1361   switch (op)
1362     {
1363     case OP_CBRA:
1364     case OP_BRA:
1365     case OP_ONCE:
1366     case OP_COND:
1367     d = find_fixedlength(cc + ((op == OP_CBRA)? 2:0), options);
1368     if (d < 0) return d;
1369     branchlength += d;
1370     do cc += GET(cc, 1); while (*cc == OP_ALT);
1371     cc += 1 + LINK_SIZE;
1372     break;
1373 
1374     /* Reached end of a branch; if it's a ket it is the end of a nested
1375     call. If it's ALT it is an alternation in a nested call. If it is
1376     END it's the end of the outer call. All can be handled by the same code. */
1377 
1378     case OP_ALT:
1379     case OP_KET:
1380     case OP_KETRMAX:
1381     case OP_KETRMIN:
1382     case OP_END:
1383     if (length < 0) length = branchlength;
1384       else if (length != branchlength) return -1;
1385     if (*cc != OP_ALT) return length;
1386     cc += 1 + LINK_SIZE;
1387     branchlength = 0;
1388     break;
1389 
1390     /* Skip over assertive subpatterns */
1391 
1392     case OP_ASSERT:
1393     case OP_ASSERT_NOT:
1394     case OP_ASSERTBACK:
1395     case OP_ASSERTBACK_NOT:
1396     do cc += GET(cc, 1); while (*cc == OP_ALT);
1397     /* Fall through */
1398 
1399     /* Skip over things that don't match chars */
1400 
1401     case OP_REVERSE:
1402     case OP_CREF:
1403     case OP_RREF:
1404     case OP_DEF:
1405     case OP_OPT:
1406     case OP_CALLOUT:
1407     case OP_SOD:
1408     case OP_SOM:
1409     case OP_EOD:
1410     case OP_EODN:
1411     case OP_CIRC:
1412     case OP_DOLL:
1413     case OP_NOT_WORD_BOUNDARY:
1414     case OP_WORD_BOUNDARY:
1415     cc += _pcre_OP_lengths[*cc];
1416     break;
1417 
1418     /* Handle literal characters */
1419 
1420     case OP_CHAR:
1421     case OP_CHARNC:
1422     case OP_NOT:
1423     branchlength++;
1424     cc += 2;
1425 #ifdef SUPPORT_UTF8
1426     if ((options & PCRE_UTF8) != 0)
1427       {
1428       while ((*cc & 0xc0) == 0x80) cc++;
1429       }
1430 #endif
1431     break;
1432 
1433     /* Handle exact repetitions. The count is already in characters, but we
1434     need to skip over a multibyte character in UTF8 mode.  */
1435 
1436     case OP_EXACT:
1437     branchlength += GET2(cc,1);
1438     cc += 4;
1439 #ifdef SUPPORT_UTF8
1440     if ((options & PCRE_UTF8) != 0)
1441       {
1442       while((*cc & 0x80) == 0x80) cc++;
1443       }
1444 #endif
1445     break;
1446 
1447     case OP_TYPEEXACT:
1448     branchlength += GET2(cc,1);
1449     if (cc[3] == OP_PROP || cc[3] == OP_NOTPROP) cc += 2;
1450     cc += 4;
1451     break;
1452 
1453     /* Handle single-char matchers */
1454 
1455     case OP_PROP:
1456     case OP_NOTPROP:
1457     cc += 2;
1458     /* Fall through */
1459 
1460     case OP_NOT_DIGIT:
1461     case OP_DIGIT:
1462     case OP_NOT_WHITESPACE:
1463     case OP_WHITESPACE:
1464     case OP_NOT_WORDCHAR:
1465     case OP_WORDCHAR:
1466     case OP_ANY:
1467     case OP_ALLANY:
1468     branchlength++;
1469     cc++;
1470     break;
1471 
1472     /* The single-byte matcher isn't allowed */
1473 
1474     case OP_ANYBYTE:
1475     return -2;
1476 
1477     /* Check a class for variable quantification */
1478 
1479 #ifdef SUPPORT_UTF8
1480     case OP_XCLASS:
1481     cc += GET(cc, 1) - 33;
1482     /* Fall through */
1483 #endif
1484 
1485     case OP_CLASS:
1486     case OP_NCLASS:
1487     cc += 33;
1488 
1489     switch (*cc)
1490       {
1491       case OP_CRSTAR:
1492       case OP_CRMINSTAR:
1493       case OP_CRQUERY:
1494       case OP_CRMINQUERY:
1495       return -1;
1496 
1497       case OP_CRRANGE:
1498       case OP_CRMINRANGE:
1499       if (GET2(cc,1) != GET2(cc,3)) return -1;
1500       branchlength += GET2(cc,1);
1501       cc += 5;
1502       break;
1503 
1504       default:
1505       branchlength++;
1506       }
1507     break;
1508 
1509     /* Anything else is variable length */
1510 
1511     default:
1512     return -1;
1513     }
1514   }
1515 /* Control never gets here */
1516 }
1517 
1518 
1519 
1520 
1521 /*************************************************
1522 *    Scan compiled regex for numbered bracket    *
1523 *************************************************/
1524 
1525 /* This little function scans through a compiled pattern until it finds a
1526 capturing bracket with the given number.
1527 
1528 Arguments:
1529   code        points to start of expression
1530   utf8        TRUE in UTF-8 mode
1531   number      the required bracket number
1532 
1533 Returns:      pointer to the opcode for the bracket, or NULL if not found
1534 */
1535 
1536 static const uschar *
1537 find_bracket(const uschar *code, BOOL utf8, int number)
1538 {
1539 for (;;)
1540   {
1541   register int c = *code;
1542   if (c == OP_END) return NULL;
1543 
1544   /* XCLASS is used for classes that cannot be represented just by a bit
1545   map. This includes negated single high-valued characters. The length in
1546   the table is zero; the actual length is stored in the compiled code. */
1547 
1548   if (c == OP_XCLASS) code += GET(code, 1);
1549 
1550   /* Handle capturing bracket */
1551 
1552   else if (c == OP_CBRA)
1553     {
1554     int n = GET2(code, 1+LINK_SIZE);
1555     if (n == number) return (uschar *)code;
1556     code += _pcre_OP_lengths[c];
1557     }
1558 
1559   /* Otherwise, we can get the item's length from the table, except that for
1560   repeated character types, we have to test for \p and \P, which have an extra
1561   two bytes of parameters. */
1562 
1563   else
1564     {
1565     switch(c)
1566       {
1567       case OP_TYPESTAR:
1568       case OP_TYPEMINSTAR:
1569       case OP_TYPEPLUS:
1570       case OP_TYPEMINPLUS:
1571       case OP_TYPEQUERY:
1572       case OP_TYPEMINQUERY:
1573       case OP_TYPEPOSSTAR:
1574       case OP_TYPEPOSPLUS:
1575       case OP_TYPEPOSQUERY:
1576       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1577       break;
1578 
1579       case OP_TYPEUPTO:
1580       case OP_TYPEMINUPTO:
1581       case OP_TYPEEXACT:
1582       case OP_TYPEPOSUPTO:
1583       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1584       break;
1585       }
1586 
1587     /* Add in the fixed length from the table */
1588 
1589     code += _pcre_OP_lengths[c];
1590 
1591   /* In UTF-8 mode, opcodes that are followed by a character may be followed by
1592   a multi-byte character. The length in the table is a minimum, so we have to
1593   arrange to skip the extra bytes. */
1594 
1595 #ifdef SUPPORT_UTF8
1596     if (utf8) switch(c)
1597       {
1598       case OP_CHAR:
1599       case OP_CHARNC:
1600       case OP_EXACT:
1601       case OP_UPTO:
1602       case OP_MINUPTO:
1603       case OP_POSUPTO:
1604       case OP_STAR:
1605       case OP_MINSTAR:
1606       case OP_POSSTAR:
1607       case OP_PLUS:
1608       case OP_MINPLUS:
1609       case OP_POSPLUS:
1610       case OP_QUERY:
1611       case OP_MINQUERY:
1612       case OP_POSQUERY:
1613       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1614       break;
1615       }
1616 #else
1617     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1618 #endif
1619     }
1620   }
1621 }
1622 
1623 
1624 
1625 /*************************************************
1626 *   Scan compiled regex for recursion reference  *
1627 *************************************************/
1628 
1629 /* This little function scans through a compiled pattern until it finds an
1630 instance of OP_RECURSE.
1631 
1632 Arguments:
1633   code        points to start of expression
1634   utf8        TRUE in UTF-8 mode
1635 
1636 Returns:      pointer to the opcode for OP_RECURSE, or NULL if not found
1637 */
1638 
1639 static const uschar *
1640 find_recurse(const uschar *code, BOOL utf8)
1641 {
1642 for (;;)
1643   {
1644   register int c = *code;
1645   if (c == OP_END) return NULL;
1646   if (c == OP_RECURSE) return code;
1647 
1648   /* XCLASS is used for classes that cannot be represented just by a bit
1649   map. This includes negated single high-valued characters. The length in
1650   the table is zero; the actual length is stored in the compiled code. */
1651 
1652   if (c == OP_XCLASS) code += GET(code, 1);
1653 
1654   /* Otherwise, we can get the item's length from the table, except that for
1655   repeated character types, we have to test for \p and \P, which have an extra
1656   two bytes of parameters. */
1657 
1658   else
1659     {
1660     switch(c)
1661       {
1662       case OP_TYPESTAR:
1663       case OP_TYPEMINSTAR:
1664       case OP_TYPEPLUS:
1665       case OP_TYPEMINPLUS:
1666       case OP_TYPEQUERY:
1667       case OP_TYPEMINQUERY:
1668       case OP_TYPEPOSSTAR:
1669       case OP_TYPEPOSPLUS:
1670       case OP_TYPEPOSQUERY:
1671       if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1672       break;
1673 
1674       case OP_TYPEPOSUPTO:
1675       case OP_TYPEUPTO:
1676       case OP_TYPEMINUPTO:
1677       case OP_TYPEEXACT:
1678       if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1679       break;
1680       }
1681 
1682     /* Add in the fixed length from the table */
1683 
1684     code += _pcre_OP_lengths[c];
1685 
1686     /* In UTF-8 mode, opcodes that are followed by a character may be followed
1687     by a multi-byte character. The length in the table is a minimum, so we have
1688     to arrange to skip the extra bytes. */
1689 
1690 #ifdef SUPPORT_UTF8
1691     if (utf8) switch(c)
1692       {
1693       case OP_CHAR:
1694       case OP_CHARNC:
1695       case OP_EXACT:
1696       case OP_UPTO:
1697       case OP_MINUPTO:
1698       case OP_POSUPTO:
1699       case OP_STAR:
1700       case OP_MINSTAR:
1701       case OP_POSSTAR:
1702       case OP_PLUS:
1703       case OP_MINPLUS:
1704       case OP_POSPLUS:
1705       case OP_QUERY:
1706       case OP_MINQUERY:
1707       case OP_POSQUERY:
1708       if (code[-1] >= 0xc0) code += _pcre_utf8_table4[code[-1] & 0x3f];
1709       break;
1710       }
1711 #else
1712     (void)(utf8);  /* Keep compiler happy by referencing function argument */
1713 #endif
1714     }
1715   }
1716 }
1717 
1718 
1719 
1720 /*************************************************
1721 *    Scan compiled branch for non-emptiness      *
1722 *************************************************/
1723 
1724 /* This function scans through a branch of a compiled pattern to see whether it
1725 can match the empty string or not. It is called from could_be_empty()
1726 below and from compile_branch() when checking for an unlimited repeat of a
1727 group that can match nothing. Note that first_significant_code() skips over
1728 backward and negative forward assertions when its final argument is TRUE. If we
1729 hit an unclosed bracket, we return "empty" - this means we've struck an inner
1730 bracket whose current branch will already have been scanned.
1731 
1732 Arguments:
1733   code        points to start of search
1734   endcode     points to where to stop
1735   utf8        TRUE if in UTF8 mode
1736 
1737 Returns:      TRUE if what is matched could be empty
1738 */
1739 
1740 static BOOL
1741 could_be_empty_branch(const uschar *code, const uschar *endcode, BOOL utf8)
1742 {
1743 register int c;
1744 for (code = first_significant_code(code + _pcre_OP_lengths[*code], NULL, 0, TRUE);
1745      code < endcode;
1746      code = first_significant_code(code + _pcre_OP_lengths[c], NULL, 0, TRUE))
1747   {
1748   const uschar *ccode;
1749 
1750   c = *code;
1751 
1752   /* Skip over forward assertions; the other assertions are skipped by
1753   first_significant_code() with a TRUE final argument. */
1754 
1755   if (c == OP_ASSERT)
1756     {
1757     do code += GET(code, 1); while (*code == OP_ALT);
1758     c = *code;
1759     continue;
1760     }
1761 
1762   /* Groups with zero repeats can of course be empty; skip them. */
1763 
1764   if (c == OP_BRAZERO || c == OP_BRAMINZERO || c == OP_SKIPZERO)
1765     {
1766     code += _pcre_OP_lengths[c];
1767     do code += GET(code, 1); while (*code == OP_ALT);
1768     c = *code;
1769     continue;
1770     }
1771 
1772   /* For other groups, scan the branches. */
1773 
1774   if (c == OP_BRA || c == OP_CBRA || c == OP_ONCE || c == OP_COND)
1775     {
1776     BOOL empty_branch;
1777     if (GET(code, 1) == 0) return TRUE;    /* Hit unclosed bracket */
1778 
1779     /* If a conditional group has only one branch, there is a second, implied,
1780     empty branch, so just skip over the conditional, because it could be empty.
1781     Otherwise, scan the individual branches of the group. */
1782 
1783     if (c == OP_COND && code[GET(code, 1)] != OP_ALT)
1784       code += GET(code, 1);
1785     else
1786       {
1787       empty_branch = FALSE;
1788       do
1789         {
1790         if (!empty_branch && could_be_empty_branch(code, endcode, utf8))
1791           empty_branch = TRUE;
1792         code += GET(code, 1);
1793         }
1794       while (*code == OP_ALT);
1795       if (!empty_branch) return FALSE;   /* All branches are non-empty */
1796       }
1797 
1798     c = *code;
1799     continue;
1800     }
1801 
1802   /* Handle the other opcodes */
1803 
1804   switch (c)
1805     {
1806     /* Check for quantifiers after a class. XCLASS is used for classes that
1807     cannot be represented just by a bit map. This includes negated single
1808     high-valued characters. The length in _pcre_OP_lengths[] is zero; the
1809     actual length is stored in the compiled code, so we must update "code"
1810     here. */
1811 
1812 #ifdef SUPPORT_UTF8
1813     case OP_XCLASS:
1814     ccode = code += GET(code, 1);
1815     goto CHECK_CLASS_REPEAT;
1816 #endif
1817 
1818     case OP_CLASS:
1819     case OP_NCLASS:
1820     ccode = code + 33;
1821 
1822 #ifdef SUPPORT_UTF8
1823     CHECK_CLASS_REPEAT:
1824 #endif
1825 
1826     switch (*ccode)
1827       {
1828       case OP_CRSTAR:            /* These could be empty; continue */
1829       case OP_CRMINSTAR:
1830       case OP_CRQUERY:
1831       case OP_CRMINQUERY:
1832       break;
1833 
1834       default:                   /* Non-repeat => class must match */
1835       case OP_CRPLUS:            /* These repeats aren't empty */
1836       case OP_CRMINPLUS:
1837       return FALSE;
1838 
1839       case OP_CRRANGE:
1840       case OP_CRMINRANGE:
1841       if (GET2(ccode, 1) > 0) return FALSE;  /* Minimum > 0 */
1842       break;
1843       }
1844     break;
1845 
1846     /* Opcodes that must match a character */
1847 
1848     case OP_PROP:
1849     case OP_NOTPROP:
1850     case OP_EXTUNI:
1851     case OP_NOT_DIGIT:
1852     case OP_DIGIT:
1853     case OP_NOT_WHITESPACE:
1854     case OP_WHITESPACE:
1855     case OP_NOT_WORDCHAR:
1856     case OP_WORDCHAR:
1857     case OP_ANY:
1858     case OP_ALLANY:
1859     case OP_ANYBYTE:
1860     case OP_CHAR:
1861     case OP_CHARNC:
1862     case OP_NOT:
1863     case OP_PLUS:
1864     case OP_MINPLUS:
1865     case OP_POSPLUS:
1866     case OP_EXACT:
1867     case OP_NOTPLUS:
1868     case OP_NOTMINPLUS:
1869     case OP_NOTPOSPLUS:
1870     case OP_NOTEXACT:
1871     case OP_TYPEPLUS:
1872     case OP_TYPEMINPLUS:
1873     case OP_TYPEPOSPLUS:
1874     case OP_TYPEEXACT:
1875     return FALSE;
1876 
1877     /* These are going to continue, as they may be empty, but we have to
1878     fudge the length for the \p and \P cases. */
1879 
1880     case OP_TYPESTAR:
1881     case OP_TYPEMINSTAR:
1882     case OP_TYPEPOSSTAR:
1883     case OP_TYPEQUERY:
1884     case OP_TYPEMINQUERY:
1885     case OP_TYPEPOSQUERY:
1886     if (code[1] == OP_PROP || code[1] == OP_NOTPROP) code += 2;
1887     break;
1888 
1889     /* Same for these */
1890 
1891     case OP_TYPEUPTO:
1892     case OP_TYPEMINUPTO:
1893     case OP_TYPEPOSUPTO:
1894     if (code[3] == OP_PROP || code[3] == OP_NOTPROP) code += 2;
1895     break;
1896 
1897     /* End of branch */
1898 
1899     case OP_KET:
1900     case OP_KETRMAX:
1901     case OP_KETRMIN:
1902     case OP_ALT:
1903     return TRUE;
1904 
1905     /* In UTF-8 mode, STAR, MINSTAR, POSSTAR, QUERY, MINQUERY, POSQUERY, UPTO,
1906     MINUPTO, and POSUPTO may be followed by a multibyte character */
1907 
1908 #ifdef SUPPORT_UTF8
1909     case OP_STAR:
1910     case OP_MINSTAR:
1911     case OP_POSSTAR:
1912     case OP_QUERY:
1913     case OP_MINQUERY:
1914     case OP_POSQUERY:
1915     case OP_UPTO:
1916     case OP_MINUPTO:
1917     case OP_POSUPTO:
1918     if (utf8) while ((code[2] & 0xc0) == 0x80) code++;
1919     break;
1920 #endif
1921     }
1922   }
1923 
1924 return TRUE;
1925 }
1926 
1927 
1928 
1929 /*************************************************
1930 *    Scan compiled regex for non-emptiness       *
1931 *************************************************/
1932 
1933 /* This function is called to check for left recursive calls. We want to check
1934 the current branch of the current pattern to see if it could match the empty
1935 string. If it could, we must look outwards for branches at other levels,
1936 stopping when we pass beyond the bracket which is the subject of the recursion.
1937 
1938 Arguments:
1939   code        points to start of the recursion
1940   endcode     points to where to stop (current RECURSE item)
1941   bcptr       points to the chain of current (unclosed) branch starts
1942   utf8        TRUE if in UTF-8 mode
1943 
1944 Returns:      TRUE if what is matched could be empty
1945 */
1946 
1947 static BOOL
1948 could_be_empty(const uschar *code, const uschar *endcode, branch_chain *bcptr,
1949   BOOL utf8)
1950 {
1951 while (bcptr != NULL && bcptr->current >= code)
1952   {
1953   if (!could_be_empty_branch(bcptr->current, endcode, utf8)) return FALSE;
1954   bcptr = bcptr->outer;
1955   }
1956 return TRUE;
1957 }
1958 
1959 
1960 
1961 /*************************************************
1962 *           Check for POSIX class syntax         *
1963 *************************************************/
1964 
1965 /* This function is called when the sequence "[:" or "[." or "[=" is
1966 encountered in a character class. It checks whether this is followed by a
1967 sequence of characters terminated by a matching ":]" or ".]" or "=]". If we
1968 reach an unescaped ']' without the special preceding character, return FALSE.
1969 
1970 Originally, this function only recognized a sequence of letters between the
1971 terminators, but it seems that Perl recognizes any sequence of characters,
1972 though of course unknown POSIX names are subsequently rejected. Perl gives an
1973 "Unknown POSIX class" error for [:f\oo:] for example, where previously PCRE
1974 didn't consider this to be a POSIX class. Likewise for [:1234:].
1975 
1976 The problem in trying to be exactly like Perl is in the handling of escapes. We
1977 have to be sure that [abc[:x\]pqr] is *not* treated as containing a POSIX
1978 class, but [abc[:x\]pqr:]] is (so that an error can be generated). The code
1979 below handles the special case of \], but does not try to do any other escape
1980 processing. This makes it different from Perl for cases such as [:l\ower:]
1981 where Perl recognizes it as the POSIX class "lower" but PCRE does not recognize
1982 "l\ower". This is a lesser evil than not diagnosing bad classes when Perl does,
1983 I think.
1984 
1985 Arguments:
1986   ptr      pointer to the initial [
1987   endptr   where to return the end pointer
1988 
1989 Returns:   TRUE or FALSE
1990 */
1991 
1992 static BOOL
1993 check_posix_syntax(const uschar *ptr, const uschar **endptr)
1994 {
1995 int terminator;          /* Don't combine these lines; the Solaris cc */
1996 terminator = *(++ptr);   /* compiler warns about "non-constant" initializer. */
1997 for (++ptr; *ptr != 0; ptr++)
1998   {
1999   if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET) ptr++; else
2000     {
2001     if (*ptr == CHAR_RIGHT_SQUARE_BRACKET) return FALSE;
2002     if (*ptr == terminator && ptr[1] == CHAR_RIGHT_SQUARE_BRACKET)
2003       {
2004       *endptr = ptr;
2005       return TRUE;
2006       }
2007     }
2008   }
2009 return FALSE;
2010 }
2011 
2012 
2013 
2014 
2015 /*************************************************
2016 *          Check POSIX class name                *
2017 *************************************************/
2018 
2019 /* This function is called to check the name given in a POSIX-style class entry
2020 such as [:alnum:].
2021 
2022 Arguments:
2023   ptr        points to the first letter
2024   len        the length of the name
2025 
2026 Returns:     a value representing the name, or -1 if unknown
2027 */
2028 
2029 static int
2030 check_posix_name(const uschar *ptr, int len)
2031 {
2032 const char *pn = posix_names;
2033 register int yield = 0;
2034 while (posix_name_lengths[yield] != 0)
2035   {
2036   if (len == posix_name_lengths[yield] &&
2037     strncmp((const char *)ptr, pn, len) == 0) return yield;
2038   pn += posix_name_lengths[yield] + 1;
2039   yield++;
2040   }
2041 return -1;
2042 }
2043 
2044 
2045 /*************************************************
2046 *    Adjust OP_RECURSE items in repeated group   *
2047 *************************************************/
2048 
2049 /* OP_RECURSE items contain an offset from the start of the regex to the group
2050 that is referenced. This means that groups can be replicated for fixed
2051 repetition simply by copying (because the recursion is allowed to refer to
2052 earlier groups that are outside the current group). However, when a group is
2053 optional (i.e. the minimum quantifier is zero), OP_BRAZERO or OP_SKIPZERO is
2054 inserted before it, after it has been compiled. This means that any OP_RECURSE
2055 items within it that refer to the group itself or any contained groups have to
2056 have their offsets adjusted. That is one of the jobs of this function. Before it
2057 is called, the partially compiled regex must be temporarily terminated with
2058 OP_END.
2059 
2060 This function has been extended with the possibility of forward references for
2061 recursions and subroutine calls. It must also check the list of such references
2062 for the group we are dealing with. If it finds that one of the recursions in
2063 the current group is on this list, it adjusts the offset in the list, not the
2064 value in the reference (which is a group number).
2065 
2066 Arguments:
2067   group      points to the start of the group
2068   adjust     the amount by which the group is to be moved
2069   utf8       TRUE in UTF-8 mode
2070   cd         contains pointers to tables etc.
2071   save_hwm   the hwm forward reference pointer at the start of the group
2072 
2073 Returns:     nothing
2074 */
2075 
2076 static void
2077 adjust_recurse(uschar *group, int adjust, BOOL utf8, compile_data *cd,
2078   uschar *save_hwm)
2079 {
2080 uschar *ptr = group;
2081 
2082 while ((ptr = (uschar *)find_recurse(ptr, utf8)) != NULL)
2083   {
2084   int offset;
2085   uschar *hc;
2086 
2087   /* See if this recursion is on the forward reference list. If so, adjust the
2088   reference. */
2089 
2090   for (hc = save_hwm; hc < cd->hwm; hc += LINK_SIZE)
2091     {
2092     offset = GET(hc, 0);
2093     if (cd->start_code + offset == ptr + 1)
2094       {
2095       PUT(hc, 0, offset + adjust);
2096       break;
2097       }
2098     }
2099 
2100   /* Otherwise, adjust the recursion offset if it's after the start of this
2101   group. */
2102 
2103   if (hc >= cd->hwm)
2104     {
2105     offset = GET(ptr, 1);
2106     if (cd->start_code + offset >= group) PUT(ptr, 1, offset + adjust);
2107     }
2108 
2109   ptr += 1 + LINK_SIZE;
2110   }
2111 }
2112 
2113 
2114 
2115 /*************************************************
2116 *        Insert an automatic callout point       *
2117 *************************************************/
2118 
2119 /* This function is called when the PCRE_AUTO_CALLOUT option is set, to insert
2120 callout points before each pattern item.
2121 
2122 Arguments:
2123   code           current code pointer
2124   ptr            current pattern pointer
2125   cd             pointers to tables etc
2126 
2127 Returns:         new code pointer
2128 */
2129 
2130 static uschar *
2131 auto_callout(uschar *code, const uschar *ptr, compile_data *cd)
2132 {
2133 *code++ = OP_CALLOUT;
2134 *code++ = 255;
2135 PUT(code, 0, ptr - cd->start_pattern);  /* Pattern offset */
2136 PUT(code, LINK_SIZE, 0);                /* Default length */
2137 return code + 2*LINK_SIZE;
2138 }
2139 
2140 
2141 
2142 /*************************************************
2143 *         Complete a callout item                *
2144 *************************************************/
2145 
2146 /* A callout item contains the length of the next item in the pattern, which
2147 we can't fill in till after we have reached the relevant point. This is used
2148 for both automatic and manual callouts.
2149 
2150 Arguments:
2151   previous_callout   points to previous callout item
2152   ptr                current pattern pointer
2153   cd                 pointers to tables etc
2154 
2155 Returns:             nothing
2156 */
2157 
2158 static void
2159 complete_callout(uschar *previous_callout, const uschar *ptr, compile_data *cd)
2160 {
2161 int length = ptr - cd->start_pattern - GET(previous_callout, 2);
2162 PUT(previous_callout, 2 + LINK_SIZE, length);
2163 }
2164 
2165 
2166 
2167 #ifdef SUPPORT_UCP
2168 /*************************************************
2169 *           Get othercase range                  *
2170 *************************************************/
2171 
2172 /* This function is passed the start and end of a class range, in UTF-8 mode
2173 with UCP support. It searches up the characters, looking for internal ranges of
2174 characters in the "other" case. Each call returns the next one, updating the
2175 start address.
2176 
2177 Arguments:
2178   cptr        points to starting character value; updated
2179   d           end value
2180   ocptr       where to put start of othercase range
2181   odptr       where to put end of othercase range
2182 
2183 Yield:        TRUE when range returned; FALSE when no more
2184 */
2185 
2186 static BOOL
2187 get_othercase_range(unsigned int *cptr, unsigned int d, unsigned int *ocptr,
2188   unsigned int *odptr)
2189 {
2190 unsigned int c, othercase, next;
2191 
2192 for (c = *cptr; c <= d; c++)
2193   { if ((othercase = UCD_OTHERCASE(c)) != c) break; }
2194 
2195 if (c > d) return FALSE;
2196 
2197 *ocptr = othercase;
2198 next = othercase + 1;
2199 
2200 for (++c; c <= d; c++)
2201   {
2202   if (UCD_OTHERCASE(c) != next) break;
2203   next++;
2204   }
2205 
2206 *odptr = next - 1;
2207 *cptr = c;
2208 
2209 return TRUE;
2210 }
2211 #endif  /* SUPPORT_UCP */
2212 
2213 
2214 
2215 /*************************************************
2216 *     Check if auto-possessifying is possible    *
2217 *************************************************/
2218 
2219 /* This function is called for unlimited repeats of certain items, to see
2220 whether the next thing could possibly match the repeated item. If not, it makes
2221 sense to automatically possessify the repeated item.
2222 
2223 Arguments:
2224   op_code       the repeated op code
2225   item          data for this item, depends on the opcode
2226   utf8          TRUE in UTF-8 mode
2227   utf8_char     used for utf8 character bytes, NULL if not relevant
2228   ptr           next character in pattern
2229   options       options bits
2230   cd            contains pointers to tables etc.
2231 
2232 Returns:        TRUE if possessifying is wanted
2233 */
2234 
2235 static BOOL
2236 check_auto_possessive(int op_code, int item, BOOL utf8, uschar *utf8_char,
2237   const uschar *ptr, int options, compile_data *cd)
2238 {
2239 int next;
2240 
2241 /* Skip whitespace and comments in extended mode */
2242 
2243 if ((options & PCRE_EXTENDED) != 0)
2244   {
2245   for (;;)
2246     {
2247     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2248     if (*ptr == CHAR_NUMBER_SIGN)
2249       {
2250       while (*(++ptr) != 0)
2251         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2252       }
2253     else break;
2254     }
2255   }
2256 
2257 /* If the next item is one that we can handle, get its value. A non-negative
2258 value is a character, a negative value is an escape value. */
2259 
2260 if (*ptr == CHAR_BACKSLASH)
2261   {
2262   int temperrorcode = 0;
2263   next = check_escape(&ptr, &temperrorcode, cd->bracount, options, FALSE);
2264   if (temperrorcode != 0) return FALSE;
2265   ptr++;    /* Point after the escape sequence */
2266   }
2267 
2268 else if ((cd->ctypes[*ptr] & ctype_meta) == 0)
2269   {
2270 #ifdef SUPPORT_UTF8
2271   if (utf8) { GETCHARINC(next, ptr); } else
2272 #endif
2273   next = *ptr++;
2274   }
2275 
2276 else return FALSE;
2277 
2278 /* Skip whitespace and comments in extended mode */
2279 
2280 if ((options & PCRE_EXTENDED) != 0)
2281   {
2282   for (;;)
2283     {
2284     while ((cd->ctypes[*ptr] & ctype_space) != 0) ptr++;
2285     if (*ptr == CHAR_NUMBER_SIGN)
2286       {
2287       while (*(++ptr) != 0)
2288         if (IS_NEWLINE(ptr)) { ptr += cd->nllen; break; }
2289       }
2290     else break;
2291     }
2292   }
2293 
2294 /* If the next thing is itself optional, we have to give up. */
2295 
2296 if (*ptr == CHAR_ASTERISK || *ptr == CHAR_QUESTION_MARK ||
2297   strncmp((char *)ptr, STR_LEFT_CURLY_BRACKET STR_0 STR_COMMA, 3) == 0)
2298     return FALSE;
2299 
2300 /* Now compare the next item with the previous opcode. If the previous is a
2301 positive single character match, "item" either contains the character or, if
2302 "item" is greater than 127 in utf8 mode, the character's bytes are in
2303 utf8_char. */
2304 
2305 
2306 /* Handle cases when the next item is a character. */
2307 
2308 if (next >= 0) switch(op_code)
2309   {
2310   case OP_CHAR:
2311 #ifdef SUPPORT_UTF8
2312   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2313 #else
2314   (void)(utf8_char);  /* Keep compiler happy by referencing function argument */
2315 #endif
2316   return item != next;
2317 
2318   /* For CHARNC (caseless character) we must check the other case. If we have
2319   Unicode property support, we can use it to test the other case of
2320   high-valued characters. */
2321 
2322   case OP_CHARNC:
2323 #ifdef SUPPORT_UTF8
2324   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2325 #endif
2326   if (item == next) return FALSE;
2327 #ifdef SUPPORT_UTF8
2328   if (utf8)
2329     {
2330     unsigned int othercase;
2331     if (next < 128) othercase = cd->fcc[next]; else
2332 #ifdef SUPPORT_UCP
2333     othercase = UCD_OTHERCASE((unsigned int)next);
2334 #else
2335     othercase = NOTACHAR;
2336 #endif
2337     return (unsigned int)item != othercase;
2338     }
2339   else
2340 #endif  /* SUPPORT_UTF8 */
2341   return (item != cd->fcc[next]);  /* Non-UTF-8 mode */
2342 
2343   /* For OP_NOT, "item" must be a single-byte character. */
2344 
2345   case OP_NOT:
2346   if (item == next) return TRUE;
2347   if ((options & PCRE_CASELESS) == 0) return FALSE;
2348 #ifdef SUPPORT_UTF8
2349   if (utf8)
2350     {
2351     unsigned int othercase;
2352     if (next < 128) othercase = cd->fcc[next]; else
2353 #ifdef SUPPORT_UCP
2354     othercase = UCD_OTHERCASE(next);
2355 #else
2356     othercase = NOTACHAR;
2357 #endif
2358     return (unsigned int)item == othercase;
2359     }
2360   else
2361 #endif  /* SUPPORT_UTF8 */
2362   return (item == cd->fcc[next]);  /* Non-UTF-8 mode */
2363 
2364   case OP_DIGIT:
2365   return next > 127 || (cd->ctypes[next] & ctype_digit) == 0;
2366 
2367   case OP_NOT_DIGIT:
2368   return next <= 127 && (cd->ctypes[next] & ctype_digit) != 0;
2369 
2370   case OP_WHITESPACE:
2371   return next > 127 || (cd->ctypes[next] & ctype_space) == 0;
2372 
2373   case OP_NOT_WHITESPACE:
2374   return next <= 127 && (cd->ctypes[next] & ctype_space) != 0;
2375 
2376   case OP_WORDCHAR:
2377   return next > 127 || (cd->ctypes[next] & ctype_word) == 0;
2378 
2379   case OP_NOT_WORDCHAR:
2380   return next <= 127 && (cd->ctypes[next] & ctype_word) != 0;
2381 
2382   case OP_HSPACE:
2383   case OP_NOT_HSPACE:
2384   switch(next)
2385     {
2386     case 0x09:
2387     case 0x20:
2388     case 0xa0:
2389     case 0x1680:
2390     case 0x180e:
2391     case 0x2000:
2392     case 0x2001:
2393     case 0x2002:
2394     case 0x2003:
2395     case 0x2004:
2396     case 0x2005:
2397     case 0x2006:
2398     case 0x2007:
2399     case 0x2008:
2400     case 0x2009:
2401     case 0x200A:
2402     case 0x202f:
2403     case 0x205f:
2404     case 0x3000:
2405     return op_code != OP_HSPACE;
2406     default:
2407     return op_code == OP_HSPACE;
2408     }
2409 
2410   case OP_VSPACE:
2411   case OP_NOT_VSPACE:
2412   switch(next)
2413     {
2414     case 0x0a:
2415     case 0x0b:
2416     case 0x0c:
2417     case 0x0d:
2418     case 0x85:
2419     case 0x2028:
2420     case 0x2029:
2421     return op_code != OP_VSPACE;
2422     default:
2423     return op_code == OP_VSPACE;
2424     }
2425 
2426   default:
2427   return FALSE;
2428   }
2429 
2430 
2431 /* Handle the case when the next item is \d, \s, etc. */
2432 
2433 switch(op_code)
2434   {
2435   case OP_CHAR:
2436   case OP_CHARNC:
2437 #ifdef SUPPORT_UTF8
2438   if (utf8 && item > 127) { GETCHAR(item, utf8_char); }
2439 #endif
2440   switch(-next)
2441     {
2442     case ESC_d:
2443     return item > 127 || (cd->ctypes[item] & ctype_digit) == 0;
2444 
2445     case ESC_D:
2446     return item <= 127 && (cd->ctypes[item] & ctype_digit) != 0;
2447 
2448     case ESC_s:
2449     return item > 127 || (cd->ctypes[item] & ctype_space) == 0;
2450 
2451     case ESC_S:
2452     return item <= 127 && (cd->ctypes[item] & ctype_space) != 0;
2453 
2454     case ESC_w:
2455     return item > 127 || (cd->ctypes[item] & ctype_word) == 0;
2456 
2457     case ESC_W:
2458     return item <= 127 && (cd->ctypes[item] & ctype_word) != 0;
2459 
2460     case ESC_h:
2461     case ESC_H:
2462     switch(item)
2463       {
2464       case 0x09:
2465       case 0x20:
2466       case 0xa0:
2467       case 0x1680:
2468       case 0x180e:
2469       case 0x2000:
2470       case 0x2001:
2471       case 0x2002:
2472       case 0x2003:
2473       case 0x2004:
2474       case 0x2005:
2475       case 0x2006:
2476       case 0x2007:
2477       case 0x2008:
2478       case 0x2009:
2479       case 0x200A:
2480       case 0x202f:
2481       case 0x205f:
2482       case 0x3000:
2483       return -next != ESC_h;
2484       default:
2485       return -next == ESC_h;
2486       }
2487 
2488     case ESC_v:
2489     case ESC_V:
2490     switch(item)
2491       {
2492       case 0x0a:
2493       case 0x0b:
2494       case 0x0c:
2495       case 0x0d:
2496       case 0x85:
2497       case 0x2028:
2498       case 0x2029:
2499       return -next != ESC_v;
2500       default:
2501       return -next == ESC_v;
2502       }
2503 
2504     default:
2505     return FALSE;
2506     }
2507 
2508   case OP_DIGIT:
2509   return next == -ESC_D || next == -ESC_s || next == -ESC_W ||
2510          next == -ESC_h || next == -ESC_v;
2511 
2512   case OP_NOT_DIGIT:
2513   return next == -ESC_d;
2514 
2515   case OP_WHITESPACE:
2516   return next == -ESC_S || next == -ESC_d || next == -ESC_w;
2517 
2518   case OP_NOT_WHITESPACE:
2519   return next == -ESC_s || next == -ESC_h || next == -ESC_v;
2520 
2521   case OP_HSPACE:
2522   return next == -ESC_S || next == -ESC_H || next == -ESC_d || next == -ESC_w;
2523 
2524   case OP_NOT_HSPACE:
2525   return next == -ESC_h;
2526 
2527   /* Can't have \S in here because VT matches \S (Perl anomaly) */
2528   case OP_VSPACE:
2529   return next == -ESC_V || next == -ESC_d || next == -ESC_w;
2530 
2531   case OP_NOT_VSPACE:
2532   return next == -ESC_v;
2533 
2534   case OP_WORDCHAR:
2535   return next == -ESC_W || next == -ESC_s || next == -ESC_h || next == -ESC_v;
2536 
2537   case OP_NOT_WORDCHAR:
2538   return next == -ESC_w || next == -ESC_d;
2539 
2540   default:
2541   return FALSE;
2542   }
2543 
2544 /* Control does not reach here */
2545 }
2546 
2547 
2548 
2549 /*************************************************
2550 *           Compile one branch                   *
2551 *************************************************/
2552 
2553 /* Scan the pattern, compiling it into a vector. If the options are
2554 changed during the branch, the pointer is used to change the external options
2555 bits. This function is used during the pre-compile phase when we are trying
2556 to find out the amount of memory needed, as well as during the real compile
2557 phase. The value of lengthptr distinguishes the two phases.
2558 
2559 Arguments:
2560   optionsptr     pointer to the option bits
2561   codeptr        points to the pointer to the current code point
2562   ptrptr         points to the current pattern pointer
2563   errorcodeptr   points to error code variable
2564   firstbyteptr   set to initial literal character, or < 0 (REQ_UNSET, REQ_NONE)
2565   reqbyteptr     set to the last literal character required, else < 0
2566   bcptr          points to current branch chain
2567   cd             contains pointers to tables etc.
2568   lengthptr      NULL during the real compile phase
2569                  points to length accumulator during pre-compile phase
2570 
2571 Returns:         TRUE on success
2572                  FALSE, with *errorcodeptr set non-zero on error
2573 */
2574 
2575 static BOOL
2576 compile_branch(int *optionsptr, uschar **codeptr, const uschar **ptrptr,
2577   int *errorcodeptr, int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr,
2578   compile_data *cd, int *lengthptr)
2579 {
2580 int repeat_type, op_type;
2581 int repeat_min = 0, repeat_max = 0;      /* To please picky compilers */
2582 int bravalue = 0;
2583 int greedy_default, greedy_non_default;
2584 int firstbyte, reqbyte;
2585 int zeroreqbyte, zerofirstbyte;
2586 int req_caseopt, reqvary, tempreqvary;
2587 int options = *optionsptr;
2588 int after_manual_callout = 0;
2589 int length_prevgroup = 0;
2590 register int c;
2591 register uschar *code = *codeptr;
2592 uschar *last_code = code;
2593 uschar *orig_code = code;
2594 uschar *tempcode;
2595 BOOL inescq = FALSE;
2596 BOOL groupsetfirstbyte = FALSE;
2597 const uschar *ptr = *ptrptr;
2598 const uschar *tempptr;
2599 uschar *previous = NULL;
2600 uschar *previous_callout = NULL;
2601 uschar *save_hwm = NULL;
2602 uschar classbits[32];
2603 
2604 #ifdef SUPPORT_UTF8
2605 BOOL class_utf8;
2606 BOOL utf8 = (options & PCRE_UTF8) != 0;
2607 uschar *class_utf8data;
2608 uschar *class_utf8data_base;
2609 uschar utf8_char[6];
2610 #else
2611 BOOL utf8 = FALSE;
2612 uschar *utf8_char = NULL;
2613 #endif
2614 
2615 #ifdef DEBUG
2616 if (lengthptr != NULL) DPRINTF((">> start branch\n"));
2617 #endif
2618 
2619 /* Set up the default and non-default settings for greediness */
2620 
2621 greedy_default = ((options & PCRE_UNGREEDY) != 0);
2622 greedy_non_default = greedy_default ^ 1;
2623 
2624 /* Initialize no first byte, no required byte. REQ_UNSET means "no char
2625 matching encountered yet". It gets changed to REQ_NONE if we hit something that
2626 matches a non-fixed char first char; reqbyte just remains unset if we never
2627 find one.
2628 
2629 When we hit a repeat whose minimum is zero, we may have to adjust these values
2630 to take the zero repeat into account. This is implemented by setting them to
2631 zerofirstbyte and zeroreqbyte when such a repeat is encountered. The individual
2632 item types that can be repeated set these backoff variables appropriately. */
2633 
2634 firstbyte = reqbyte = zerofirstbyte = zeroreqbyte = REQ_UNSET;
2635 
2636 /* The variable req_caseopt contains either the REQ_CASELESS value or zero,
2637 according to the current setting of the caseless flag. REQ_CASELESS is a bit
2638 value > 255. It is added into the firstbyte or reqbyte variables to record the
2639 case status of the value. This is used only for ASCII characters. */
2640 
2641 req_caseopt = ((options & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
2642 
2643 /* Switch on next character until the end of the branch */
2644 
2645 for (;; ptr++)
2646   {
2647   BOOL negate_class;
2648   BOOL should_flip_negation;
2649   BOOL possessive_quantifier;
2650   BOOL is_quantifier;
2651   BOOL is_recurse;
2652   BOOL reset_bracount;
2653   int class_charcount;
2654   int class_lastchar;
2655   int newoptions;
2656   int recno;
2657   int refsign;
2658   int skipbytes;
2659   int subreqbyte;
2660   int subfirstbyte;
2661   int terminator;
2662   int mclength;
2663   uschar mcbuffer[8];
2664 
2665   /* Get next byte in the pattern */
2666 
2667   c = *ptr;
2668 
2669   /* If we are in the pre-compile phase, accumulate the length used for the
2670   previous cycle of this loop. */
2671 
2672   if (lengthptr != NULL)
2673     {
2674 #ifdef DEBUG
2675     if (code > cd->hwm) cd->hwm = code;                 /* High water info */
2676 #endif
2677     if (code > cd->start_workspace + COMPILE_WORK_SIZE) /* Check for overrun */
2678       {
2679       *errorcodeptr = ERR52;
2680       goto FAILED;
2681       }
2682 
2683     /* There is at least one situation where code goes backwards: this is the
2684     case of a zero quantifier after a class (e.g. [ab]{0}). At compile time,
2685     the class is simply eliminated. However, it is created first, so we have to
2686     allow memory for it. Therefore, don't ever reduce the length at this point.
2687     */
2688 
2689     if (code < last_code) code = last_code;
2690 
2691     /* Paranoid check for integer overflow */
2692 
2693     if (OFLOW_MAX - *lengthptr < code - last_code)
2694       {
2695       *errorcodeptr = ERR20;
2696       goto FAILED;
2697       }
2698 
2699     *lengthptr += code - last_code;
2700     DPRINTF(("length=%d added %d c=%c\n", *lengthptr, code - last_code, c));
2701 
2702     /* If "previous" is set and it is not at the start of the work space, move
2703     it back to there, in order to avoid filling up the work space. Otherwise,
2704     if "previous" is NULL, reset the current code pointer to the start. */
2705 
2706     if (previous != NULL)
2707       {
2708       if (previous > orig_code)
2709         {
2710         memmove(orig_code, previous, code - previous);
2711         code -= previous - orig_code;
2712         previous = orig_code;
2713         }
2714       }
2715     else code = orig_code;
2716 
2717     /* Remember where this code item starts so we can pick up the length
2718     next time round. */
2719 
2720     last_code = code;
2721     }
2722 
2723   /* In the real compile phase, just check the workspace used by the forward
2724   reference list. */
2725 
2726   else if (cd->hwm > cd->start_workspace + COMPILE_WORK_SIZE)
2727     {
2728     *errorcodeptr = ERR52;
2729     goto FAILED;
2730     }
2731 
2732   /* If in \Q...\E, check for the end; if not, we have a literal */
2733 
2734   if (inescq && c != 0)
2735     {
2736     if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)
2737       {
2738       inescq = FALSE;
2739       ptr++;
2740       continue;
2741       }
2742     else
2743       {
2744       if (previous_callout != NULL)
2745         {
2746         if (lengthptr == NULL)  /* Don't attempt in pre-compile phase */
2747           complete_callout(previous_callout, ptr, cd);
2748         previous_callout = NULL;
2749         }
2750       if ((options & PCRE_AUTO_CALLOUT) != 0)
2751         {
2752         previous_callout = code;
2753         code = auto_callout(code, ptr, cd);
2754         }
2755       goto NORMAL_CHAR;
2756       }
2757     }
2758 
2759   /* Fill in length of a previous callout, except when the next thing is
2760   a quantifier. */
2761 
2762   is_quantifier =
2763     c == CHAR_ASTERISK || c == CHAR_PLUS || c == CHAR_QUESTION_MARK ||
2764     (c == CHAR_LEFT_CURLY_BRACKET && is_counted_repeat(ptr+1));
2765 
2766   if (!is_quantifier && previous_callout != NULL &&
2767        after_manual_callout-- <= 0)
2768     {
2769     if (lengthptr == NULL)      /* Don't attempt in pre-compile phase */
2770       complete_callout(previous_callout, ptr, cd);
2771     previous_callout = NULL;
2772     }
2773 
2774   /* In extended mode, skip white space and comments */
2775 
2776   if ((options & PCRE_EXTENDED) != 0)
2777     {
2778     if ((cd->ctypes[c] & ctype_space) != 0) continue;
2779     if (c == CHAR_NUMBER_SIGN)
2780       {
2781       while (*(++ptr) != 0)
2782         {
2783         if (IS_NEWLINE(ptr)) { ptr += cd->nllen - 1; break; }
2784         }
2785       if (*ptr != 0) continue;
2786 
2787       /* Else fall through to handle end of string */
2788       c = 0;
2789       }
2790     }
2791 
2792   /* No auto callout for quantifiers. */
2793 
2794   if ((options & PCRE_AUTO_CALLOUT) != 0 && !is_quantifier)
2795     {
2796     previous_callout = code;
2797     code = auto_callout(code, ptr, cd);
2798     }
2799 
2800   switch(c)
2801     {
2802     /* ===================================================================*/
2803     case 0:                        /* The branch terminates at string end */
2804     case CHAR_VERTICAL_LINE:       /* or | or ) */
2805     case CHAR_RIGHT_PARENTHESIS:
2806     *firstbyteptr = firstbyte;
2807     *reqbyteptr = reqbyte;
2808     *codeptr = code;
2809     *ptrptr = ptr;
2810     if (lengthptr != NULL)
2811       {
2812       if (OFLOW_MAX - *lengthptr < code - last_code)
2813         {
2814         *errorcodeptr = ERR20;
2815         goto FAILED;
2816         }
2817       *lengthptr += code - last_code;   /* To include callout length */
2818       DPRINTF((">> end branch\n"));
2819       }
2820     return TRUE;
2821 
2822 
2823     /* ===================================================================*/
2824     /* Handle single-character metacharacters. In multiline mode, ^ disables
2825     the setting of any following char as a first character. */
2826 
2827     case CHAR_CIRCUMFLEX_ACCENT:
2828     if ((options & PCRE_MULTILINE) != 0)
2829       {
2830       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2831       }
2832     previous = NULL;
2833     *code++ = OP_CIRC;
2834     break;
2835 
2836     case CHAR_DOLLAR_SIGN:
2837     previous = NULL;
2838     *code++ = OP_DOLL;
2839     break;
2840 
2841     /* There can never be a first char if '.' is first, whatever happens about
2842     repeats. The value of reqbyte doesn't change either. */
2843 
2844     case CHAR_DOT:
2845     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2846     zerofirstbyte = firstbyte;
2847     zeroreqbyte = reqbyte;
2848     previous = code;
2849     *code++ = ((options & PCRE_DOTALL) != 0)? OP_ALLANY: OP_ANY;
2850     break;
2851 
2852 
2853     /* ===================================================================*/
2854     /* Character classes. If the included characters are all < 256, we build a
2855     32-byte bitmap of the permitted characters, except in the special case
2856     where there is only one such character. For negated classes, we build the
2857     map as usual, then invert it at the end. However, we use a different opcode
2858     so that data characters > 255 can be handled correctly.
2859 
2860     If the class contains characters outside the 0-255 range, a different
2861     opcode is compiled. It may optionally have a bit map for characters < 256,
2862     but those above are explicitly listed afterwards. A flag byte tells
2863     whether the bitmap is present, and whether this is a negated class or not.
2864 
2865     In JavaScript compatibility mode, an isolated ']' causes an error. In
2866     default (Perl) mode, it is treated as a data character. */
2867 
2868     case CHAR_RIGHT_SQUARE_BRACKET:
2869     if ((cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2870       {
2871       *errorcodeptr = ERR64;
2872       goto FAILED;
2873       }
2874     goto NORMAL_CHAR;
2875 
2876     case CHAR_LEFT_SQUARE_BRACKET:
2877     previous = code;
2878 
2879     /* PCRE supports POSIX class stuff inside a class. Perl gives an error if
2880     they are encountered at the top level, so we'll do that too. */
2881 
2882     if ((ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
2883          ptr[1] == CHAR_EQUALS_SIGN) &&
2884         check_posix_syntax(ptr, &tempptr))
2885       {
2886       *errorcodeptr = (ptr[1] == CHAR_COLON)? ERR13 : ERR31;
2887       goto FAILED;
2888       }
2889 
2890     /* If the first character is '^', set the negation flag and skip it. Also,
2891     if the first few characters (either before or after ^) are \Q\E or \E we
2892     skip them too. This makes for compatibility with Perl. */
2893 
2894     negate_class = FALSE;
2895     for (;;)
2896       {
2897       c = *(++ptr);
2898       if (c == CHAR_BACKSLASH)
2899         {
2900         if (ptr[1] == CHAR_E)
2901           ptr++;
2902         else if (strncmp((const char *)ptr+1,
2903                           STR_Q STR_BACKSLASH STR_E, 3) == 0)
2904           ptr += 3;
2905         else
2906           break;
2907         }
2908       else if (!negate_class && c == CHAR_CIRCUMFLEX_ACCENT)
2909         negate_class = TRUE;
2910       else break;
2911       }
2912 
2913     /* Empty classes are allowed in JavaScript compatibility mode. Otherwise,
2914     an initial ']' is taken as a data character -- the code below handles
2915     that. In JS mode, [] must always fail, so generate OP_FAIL, whereas
2916     [^] must match any character, so generate OP_ALLANY. */
2917 
2918     if (c == CHAR_RIGHT_SQUARE_BRACKET &&
2919         (cd->external_options & PCRE_JAVASCRIPT_COMPAT) != 0)
2920       {
2921       *code++ = negate_class? OP_ALLANY : OP_FAIL;
2922       if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
2923       zerofirstbyte = firstbyte;
2924       break;
2925       }
2926 
2927     /* If a class contains a negative special such as \S, we need to flip the
2928     negation flag at the end, so that support for characters > 255 works
2929     correctly (they are all included in the class). */
2930 
2931     should_flip_negation = FALSE;
2932 
2933     /* Keep a count of chars with values < 256 so that we can optimize the case
2934     of just a single character (as long as it's < 256). However, for higher
2935     valued UTF-8 characters, we don't yet do any optimization. */
2936 
2937     class_charcount = 0;
2938     class_lastchar = -1;
2939 
2940     /* Initialize the 32-char bit map to all zeros. We build the map in a
2941     temporary bit of memory, in case the class contains only 1 character (less
2942     than 256), because in that case the compiled code doesn't use the bit map.
2943     */
2944 
2945     memset(classbits, 0, 32 * sizeof(uschar));
2946 
2947 #ifdef SUPPORT_UTF8
2948     class_utf8 = FALSE;                       /* No chars >= 256 */
2949     class_utf8data = code + LINK_SIZE + 2;    /* For UTF-8 items */
2950     class_utf8data_base = class_utf8data;     /* For resetting in pass 1 */
2951 #endif
2952 
2953     /* Process characters until ] is reached. By writing this as a "do" it
2954     means that an initial ] is taken as a data character. At the start of the
2955     loop, c contains the first byte of the character. */
2956 
2957     if (c != 0) do
2958       {
2959       const uschar *oldptr;
2960 
2961 #ifdef SUPPORT_UTF8
2962       if (utf8 && c > 127)
2963         {                           /* Braces are required because the */
2964         GETCHARLEN(c, ptr, ptr);    /* macro generates multiple statements */
2965         }
2966 
2967       /* In the pre-compile phase, accumulate the length of any UTF-8 extra
2968       data and reset the pointer. This is so that very large classes that
2969       contain a zillion UTF-8 characters no longer overwrite the work space
2970       (which is on the stack). */
2971 
2972       if (lengthptr != NULL)
2973         {
2974         *lengthptr += class_utf8data - class_utf8data_base;
2975         class_utf8data = class_utf8data_base;
2976         }
2977 
2978 #endif
2979 
2980       /* Inside \Q...\E everything is literal except \E */
2981 
2982       if (inescq)
2983         {
2984         if (c == CHAR_BACKSLASH && ptr[1] == CHAR_E)  /* If we are at \E */
2985           {
2986           inescq = FALSE;                   /* Reset literal state */
2987           ptr++;                            /* Skip the 'E' */
2988           continue;                         /* Carry on with next */
2989           }
2990         goto CHECK_RANGE;                   /* Could be range if \E follows */
2991         }
2992 
2993       /* Handle POSIX class names. Perl allows a negation extension of the
2994       form [:^name:]. A square bracket that doesn't match the syntax is
2995       treated as a literal. We also recognize the POSIX constructions
2996       [.ch.] and [=ch=] ("collating elements") and fault them, as Perl
2997       5.6 and 5.8 do. */
2998 
2999       if (c == CHAR_LEFT_SQUARE_BRACKET &&
3000           (ptr[1] == CHAR_COLON || ptr[1] == CHAR_DOT ||
3001            ptr[1] == CHAR_EQUALS_SIGN) && check_posix_syntax(ptr, &tempptr))
3002         {
3003         BOOL local_negate = FALSE;
3004         int posix_class, taboffset, tabopt;
3005         register const uschar *cbits = cd->cbits;
3006         uschar pbits[32];
3007 
3008         if (ptr[1] != CHAR_COLON)
3009           {
3010           *errorcodeptr = ERR31;
3011           goto FAILED;
3012           }
3013 
3014         ptr += 2;
3015         if (*ptr == CHAR_CIRCUMFLEX_ACCENT)
3016           {
3017           local_negate = TRUE;
3018           should_flip_negation = TRUE;  /* Note negative special */
3019           ptr++;
3020           }
3021 
3022         posix_class = check_posix_name(ptr, tempptr - ptr);
3023         if (posix_class < 0)
3024           {
3025           *errorcodeptr = ERR30;
3026           goto FAILED;
3027           }
3028 
3029         /* If matching is caseless, upper and lower are converted to
3030         alpha. This relies on the fact that the class table starts with
3031         alpha, lower, upper as the first 3 entries. */
3032 
3033         if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
3034           posix_class = 0;
3035 
3036         /* We build the bit map for the POSIX class in a chunk of local store
3037         because we may be adding and subtracting from it, and we don't want to
3038         subtract bits that may be in the main map already. At the end we or the
3039         result into the bit map that is being built. */
3040 
3041         posix_class *= 3;
3042 
3043         /* Copy in the first table (always present) */
3044 
3045         memcpy(pbits, cbits + posix_class_maps[posix_class],
3046           32 * sizeof(uschar));
3047 
3048         /* If there is a second table, add or remove it as required. */
3049 
3050         taboffset = posix_class_maps[posix_class + 1];
3051         tabopt = posix_class_maps[posix_class + 2];
3052 
3053         if (taboffset >= 0)
3054           {
3055           if (tabopt >= 0)
3056             for (c = 0; c < 32; c++) pbits[c] |= cbits[c + taboffset];
3057           else
3058             for (c = 0; c < 32; c++) pbits[c] &= ~cbits[c + taboffset];
3059           }
3060 
3061         /* Now see if we need to remove any special characters. An option
3062         value of 1 removes vertical space and 2 removes underscore. */
3063 
3064         if (tabopt < 0) tabopt = -tabopt;
3065         if (tabopt == 1) pbits[1] &= ~0x3c;
3066           else if (tabopt == 2) pbits[11] &= 0x7f;
3067 
3068         /* Add the POSIX table or its complement into the main table that is
3069         being built and we are done. */
3070 
3071         if (local_negate)
3072           for (c = 0; c < 32; c++) classbits[c] |= ~pbits[c];
3073         else
3074           for (c = 0; c < 32; c++) classbits[c] |= pbits[c];
3075 
3076         ptr = tempptr + 1;
3077         class_charcount = 10;  /* Set > 1; assumes more than 1 per class */
3078         continue;    /* End of POSIX syntax handling */
3079         }
3080 
3081       /* Backslash may introduce a single character, or it may introduce one
3082       of the specials, which just set a flag. The sequence \b is a special
3083       case. Inside a class (and only there) it is treated as backspace.
3084       Elsewhere it marks a word boundary. Other escapes have preset maps ready
3085       to 'or' into the one we are building. We assume they have more than one
3086       character in them, so set class_charcount bigger than one. */
3087 
3088       if (c == CHAR_BACKSLASH)
3089         {
3090         c = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3091         if (*errorcodeptr != 0) goto FAILED;
3092 
3093         if (-c == ESC_b) c = CHAR_BS;       /* \b is backspace in a class */
3094         else if (-c == ESC_X) c = CHAR_X;   /* \X is literal X in a class */
3095         else if (-c == ESC_R) c = CHAR_R;   /* \R is literal R in a class */
3096         else if (-c == ESC_Q)            /* Handle start of quoted string */
3097           {
3098           if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3099             {
3100             ptr += 2; /* avoid empty string */
3101             }
3102           else inescq = TRUE;
3103           continue;
3104           }
3105         else if (-c == ESC_E) continue;  /* Ignore orphan \E */
3106 
3107         if (c < 0)
3108           {
3109           register const uschar *cbits = cd->cbits;
3110           class_charcount += 2;     /* Greater than 1 is what matters */
3111 
3112           /* Save time by not doing this in the pre-compile phase. */
3113 
3114           if (lengthptr == NULL) switch (-c)
3115             {
3116             case ESC_d:
3117             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_digit];
3118             continue;
3119 
3120             case ESC_D:
3121             should_flip_negation = TRUE;
3122             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_digit];
3123             continue;
3124 
3125             case ESC_w:
3126             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_word];
3127             continue;
3128 
3129             case ESC_W:
3130             should_flip_negation = TRUE;
3131             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_word];
3132             continue;
3133 
3134             case ESC_s:
3135             for (c = 0; c < 32; c++) classbits[c] |= cbits[c+cbit_space];
3136             classbits[1] &= ~0x08;   /* Perl 5.004 onwards omits VT from \s */
3137             continue;
3138 
3139             case ESC_S:
3140             should_flip_negation = TRUE;
3141             for (c = 0; c < 32; c++) classbits[c] |= ~cbits[c+cbit_space];
3142             classbits[1] |= 0x08;    /* Perl 5.004 onwards omits VT from \s */
3143             continue;
3144 
3145             default:    /* Not recognized; fall through */
3146             break;      /* Need "default" setting to stop compiler warning. */
3147             }
3148 
3149           /* In the pre-compile phase, just do the recognition. */
3150 
3151           else if (c == -ESC_d || c == -ESC_D || c == -ESC_w ||
3152                    c == -ESC_W || c == -ESC_s || c == -ESC_S) continue;
3153 
3154           /* We need to deal with \H, \h, \V, and \v in both phases because
3155           they use extra memory. */
3156 
3157           if (-c == ESC_h)
3158             {
3159             SETBIT(classbits, 0x09); /* HT */
3160             SETBIT(classbits, 0x20); /* SPACE */
3161             SETBIT(classbits, 0xa0); /* NBSP */
3162 #ifdef SUPPORT_UTF8
3163             if (utf8)
3164               {
3165               class_utf8 = TRUE;
3166               *class_utf8data++ = XCL_SINGLE;
3167               class_utf8data += _pcre_ord2utf8(0x1680, class_utf8data);
3168               *class_utf8data++ = XCL_SINGLE;
3169               class_utf8data += _pcre_ord2utf8(0x180e, class_utf8data);
3170               *class_utf8data++ = XCL_RANGE;
3171               class_utf8data += _pcre_ord2utf8(0x2000, class_utf8data);
3172               class_utf8data += _pcre_ord2utf8(0x200A, class_utf8data);
3173               *class_utf8data++ = XCL_SINGLE;
3174               class_utf8data += _pcre_ord2utf8(0x202f, class_utf8data);
3175               *class_utf8data++ = XCL_SINGLE;
3176               class_utf8data += _pcre_ord2utf8(0x205f, class_utf8data);
3177               *class_utf8data++ = XCL_SINGLE;
3178               class_utf8data += _pcre_ord2utf8(0x3000, class_utf8data);
3179               }
3180 #endif
3181             continue;
3182             }
3183 
3184           if (-c == ESC_H)
3185             {
3186             for (c = 0; c < 32; c++)
3187               {
3188               int x = 0xff;
3189               switch (c)
3190                 {
3191                 case 0x09/8: x ^= 1 << (0x09%8); break;
3192                 case 0x20/8: x ^= 1 << (0x20%8); break;
3193                 case 0xa0/8: x ^= 1 << (0xa0%8); break;
3194                 default: break;
3195                 }
3196               classbits[c] |= x;
3197               }
3198 
3199 #ifdef SUPPORT_UTF8
3200             if (utf8)
3201               {
3202               class_utf8 = TRUE;
3203               *class_utf8data++ = XCL_RANGE;
3204               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3205               class_utf8data += _pcre_ord2utf8(0x167f, class_utf8data);
3206               *class_utf8data++ = XCL_RANGE;
3207               class_utf8data += _pcre_ord2utf8(0x1681, class_utf8data);
3208               class_utf8data += _pcre_ord2utf8(0x180d, class_utf8data);
3209               *class_utf8data++ = XCL_RANGE;
3210               class_utf8data += _pcre_ord2utf8(0x180f, class_utf8data);
3211               class_utf8data += _pcre_ord2utf8(0x1fff, class_utf8data);
3212               *class_utf8data++ = XCL_RANGE;
3213               class_utf8data += _pcre_ord2utf8(0x200B, class_utf8data);
3214               class_utf8data += _pcre_ord2utf8(0x202e, class_utf8data);
3215               *class_utf8data++ = XCL_RANGE;
3216               class_utf8data += _pcre_ord2utf8(0x2030, class_utf8data);
3217               class_utf8data += _pcre_ord2utf8(0x205e, class_utf8data);
3218               *class_utf8data++ = XCL_RANGE;
3219               class_utf8data += _pcre_ord2utf8(0x2060, class_utf8data);
3220               class_utf8data += _pcre_ord2utf8(0x2fff, class_utf8data);
3221               *class_utf8data++ = XCL_RANGE;
3222               class_utf8data += _pcre_ord2utf8(0x3001, class_utf8data);
3223               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3224               }
3225 #endif
3226             continue;
3227             }
3228 
3229           if (-c == ESC_v)
3230             {
3231             SETBIT(classbits, 0x0a); /* LF */
3232             SETBIT(classbits, 0x0b); /* VT */
3233             SETBIT(classbits, 0x0c); /* FF */
3234             SETBIT(classbits, 0x0d); /* CR */
3235             SETBIT(classbits, 0x85); /* NEL */
3236 #ifdef SUPPORT_UTF8
3237             if (utf8)
3238               {
3239               class_utf8 = TRUE;
3240               *class_utf8data++ = XCL_RANGE;
3241               class_utf8data += _pcre_ord2utf8(0x2028, class_utf8data);
3242               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3243               }
3244 #endif
3245             continue;
3246             }
3247 
3248           if (-c == ESC_V)
3249             {
3250             for (c = 0; c < 32; c++)
3251               {
3252               int x = 0xff;
3253               switch (c)
3254                 {
3255                 case 0x0a/8: x ^= 1 << (0x0a%8);
3256                              x ^= 1 << (0x0b%8);
3257                              x ^= 1 << (0x0c%8);
3258                              x ^= 1 << (0x0d%8);
3259                              break;
3260                 case 0x85/8: x ^= 1 << (0x85%8); break;
3261                 default: break;
3262                 }
3263               classbits[c] |= x;
3264               }
3265 
3266 #ifdef SUPPORT_UTF8
3267             if (utf8)
3268               {
3269               class_utf8 = TRUE;
3270               *class_utf8data++ = XCL_RANGE;
3271               class_utf8data += _pcre_ord2utf8(0x0100, class_utf8data);
3272               class_utf8data += _pcre_ord2utf8(0x2027, class_utf8data);
3273               *class_utf8data++ = XCL_RANGE;
3274               class_utf8data += _pcre_ord2utf8(0x2029, class_utf8data);
3275               class_utf8data += _pcre_ord2utf8(0x7fffffff, class_utf8data);
3276               }
3277 #endif
3278             continue;
3279             }
3280 
3281           /* We need to deal with \P and \p in both phases. */
3282 
3283 #ifdef SUPPORT_UCP
3284           if (-c == ESC_p || -c == ESC_P)
3285             {
3286             BOOL negated;
3287             int pdata;
3288             int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
3289             if (ptype < 0) goto FAILED;
3290             class_utf8 = TRUE;
3291             *class_utf8data++ = ((-c == ESC_p) != negated)?
3292               XCL_PROP : XCL_NOTPROP;
3293             *class_utf8data++ = ptype;
3294             *class_utf8data++ = pdata;
3295             class_charcount -= 2;   /* Not a < 256 character */
3296             continue;
3297             }
3298 #endif
3299           /* Unrecognized escapes are faulted if PCRE is running in its
3300           strict mode. By default, for compatibility with Perl, they are
3301           treated as literals. */
3302 
3303           if ((options & PCRE_EXTRA) != 0)
3304             {
3305             *errorcodeptr = ERR7;
3306             goto FAILED;
3307             }
3308 
3309           class_charcount -= 2;  /* Undo the default count from above */
3310           c = *ptr;              /* Get the final character and fall through */
3311           }
3312 
3313         /* Fall through if we have a single character (c >= 0). This may be
3314         greater than 256 in UTF-8 mode. */
3315 
3316         }   /* End of backslash handling */
3317 
3318       /* A single character may be followed by '-' to form a range. However,
3319       Perl does not permit ']' to be the end of the range. A '-' character
3320       at the end is treated as a literal. Perl ignores orphaned \E sequences
3321       entirely. The code for handling \Q and \E is messy. */
3322 
3323       CHECK_RANGE:
3324       while (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
3325         {
3326         inescq = FALSE;
3327         ptr += 2;
3328         }
3329 
3330       oldptr = ptr;
3331 
3332       /* Remember \r or \n */
3333 
3334       if (c == CHAR_CR || c == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3335 
3336       /* Check for range */
3337 
3338       if (!inescq && ptr[1] == CHAR_MINUS)
3339         {
3340         int d;
3341         ptr += 2;
3342         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E) ptr += 2;
3343 
3344         /* If we hit \Q (not followed by \E) at this point, go into escaped
3345         mode. */
3346 
3347         while (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_Q)
3348           {
3349           ptr += 2;
3350           if (*ptr == CHAR_BACKSLASH && ptr[1] == CHAR_E)
3351             { ptr += 2; continue; }
3352           inescq = TRUE;
3353           break;
3354           }
3355 
3356         if (*ptr == 0 || (!inescq && *ptr == CHAR_RIGHT_SQUARE_BRACKET))
3357           {
3358           ptr = oldptr;
3359           goto LONE_SINGLE_CHARACTER;
3360           }
3361 
3362 #ifdef SUPPORT_UTF8
3363         if (utf8)
3364           {                           /* Braces are required because the */
3365           GETCHARLEN(d, ptr, ptr);    /* macro generates multiple statements */
3366           }
3367         else
3368 #endif
3369         d = *ptr;  /* Not UTF-8 mode */
3370 
3371         /* The second part of a range can be a single-character escape, but
3372         not any of the other escapes. Perl 5.6 treats a hyphen as a literal
3373         in such circumstances. */
3374 
3375         if (!inescq && d == CHAR_BACKSLASH)
3376           {
3377           d = check_escape(&ptr, errorcodeptr, cd->bracount, options, TRUE);
3378           if (*errorcodeptr != 0) goto FAILED;
3379 
3380           /* \b is backspace; \X is literal X; \R is literal R; any other
3381           special means the '-' was literal */
3382 
3383           if (d < 0)
3384             {
3385             if (d == -ESC_b) d = CHAR_BS;
3386             else if (d == -ESC_X) d = CHAR_X;
3387             else if (d == -ESC_R) d = CHAR_R; else
3388               {
3389               ptr = oldptr;
3390               goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3391               }
3392             }
3393           }
3394 
3395         /* Check that the two values are in the correct order. Optimize
3396         one-character ranges */
3397 
3398         if (d < c)
3399           {
3400           *errorcodeptr = ERR8;
3401           goto FAILED;
3402           }
3403 
3404         if (d == c) goto LONE_SINGLE_CHARACTER;  /* A few lines below */
3405 
3406         /* Remember \r or \n */
3407 
3408         if (d == CHAR_CR || d == CHAR_NL) cd->external_flags |= PCRE_HASCRORLF;
3409 
3410         /* In UTF-8 mode, if the upper limit is > 255, or > 127 for caseless
3411         matching, we have to use an XCLASS with extra data items. Caseless
3412         matching for characters > 127 is available only if UCP support is
3413         available. */
3414 
3415 #ifdef SUPPORT_UTF8
3416         if (utf8 && (d > 255 || ((options & PCRE_CASELESS) != 0 && d > 127)))
3417           {
3418           class_utf8 = TRUE;
3419 
3420           /* With UCP support, we can find the other case equivalents of
3421           the relevant characters. There may be several ranges. Optimize how
3422           they fit with the basic range. */
3423 
3424 #ifdef SUPPORT_UCP
3425           if ((options & PCRE_CASELESS) != 0)
3426             {
3427             unsigned int occ, ocd;
3428             unsigned int cc = c;
3429             unsigned int origd = d;
3430             while (get_othercase_range(&cc, origd, &occ, &ocd))
3431               {
3432               if (occ >= (unsigned int)c &&
3433                   ocd <= (unsigned int)d)
3434                 continue;                          /* Skip embedded ranges */
3435 
3436               if (occ < (unsigned int)c  &&
3437                   ocd >= (unsigned int)c - 1)      /* Extend the basic range */
3438                 {                                  /* if there is overlap,   */
3439                 c = occ;                           /* noting that if occ < c */
3440                 continue;                          /* we can't have ocd > d  */
3441                 }                                  /* because a subrange is  */
3442               if (ocd > (unsigned int)d &&
3443                   occ <= (unsigned int)d + 1)      /* always shorter than    */
3444                 {                                  /* the basic range.       */
3445                 d = ocd;
3446                 continue;
3447                 }
3448 
3449               if (occ == ocd)
3450                 {
3451                 *class_utf8data++ = XCL_SINGLE;
3452                 }
3453               else
3454                 {
3455                 *class_utf8data++ = XCL_RANGE;
3456                 class_utf8data += _pcre_ord2utf8(occ, class_utf8data);
3457                 }
3458               class_utf8data += _pcre_ord2utf8(ocd, class_utf8data);
3459               }
3460             }
3461 #endif  /* SUPPORT_UCP */
3462 
3463           /* Now record the original range, possibly modified for UCP caseless
3464           overlapping ranges. */
3465 
3466           *class_utf8data++ = XCL_RANGE;
3467           class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3468           class_utf8data += _pcre_ord2utf8(d, class_utf8data);
3469 
3470           /* With UCP support, we are done. Without UCP support, there is no
3471           caseless matching for UTF-8 characters > 127; we can use the bit map
3472           for the smaller ones. */
3473 
3474 #ifdef SUPPORT_UCP
3475           continue;    /* With next character in the class */
3476 #else
3477           if ((options & PCRE_CASELESS) == 0 || c > 127) continue;
3478 
3479           /* Adjust upper limit and fall through to set up the map */
3480 
3481           d = 127;
3482 
3483 #endif  /* SUPPORT_UCP */
3484           }
3485 #endif  /* SUPPORT_UTF8 */
3486 
3487         /* We use the bit map for all cases when not in UTF-8 mode; else
3488         ranges that lie entirely within 0-127 when there is UCP support; else
3489         for partial ranges without UCP support. */
3490 
3491         class_charcount += d - c + 1;
3492         class_lastchar = d;
3493 
3494         /* We can save a bit of time by skipping this in the pre-compile. */
3495 
3496         if (lengthptr == NULL) for (; c <= d; c++)
3497           {
3498           classbits[c/8] |= (1 << (c&7));
3499           if ((options & PCRE_CASELESS) != 0)
3500             {
3501             int uc = cd->fcc[c];           /* flip case */
3502             classbits[uc/8] |= (1 << (uc&7));
3503             }
3504           }
3505 
3506         continue;   /* Go get the next char in the class */
3507         }
3508 
3509       /* Handle a lone single character - we can get here for a normal
3510       non-escape char, or after \ that introduces a single character or for an
3511       apparent range that isn't. */
3512 
3513       LONE_SINGLE_CHARACTER:
3514 
3515       /* Handle a character that cannot go in the bit map */
3516 
3517 #ifdef SUPPORT_UTF8
3518       if (utf8 && (c > 255 || ((options & PCRE_CASELESS) != 0 && c > 127)))
3519         {
3520         class_utf8 = TRUE;
3521         *class_utf8data++ = XCL_SINGLE;
3522         class_utf8data += _pcre_ord2utf8(c, class_utf8data);
3523 
3524 #ifdef SUPPORT_UCP
3525         if ((options & PCRE_CASELESS) != 0)
3526           {
3527           unsigned int othercase;
3528           if ((othercase = UCD_OTHERCASE(c)) != c)
3529             {
3530             *class_utf8data++ = XCL_SINGLE;
3531             class_utf8data += _pcre_ord2utf8(othercase, class_utf8data);
3532             }
3533           }
3534 #endif  /* SUPPORT_UCP */
3535 
3536         }
3537       else
3538 #endif  /* SUPPORT_UTF8 */
3539 
3540       /* Handle a single-byte character */
3541         {
3542         classbits[c/8] |= (1 << (c&7));
3543         if ((options & PCRE_CASELESS) != 0)
3544           {
3545           c = cd->fcc[c];   /* flip case */
3546           classbits[c/8] |= (1 << (c&7));
3547           }
3548         class_charcount++;
3549         class_lastchar = c;
3550         }
3551       }
3552 
3553     /* Loop until ']' reached. This "while" is the end of the "do" above. */
3554 
3555     while ((c = *(++ptr)) != 0 && (c != CHAR_RIGHT_SQUARE_BRACKET || inescq));
3556 
3557     if (c == 0)                          /* Missing terminating ']' */
3558       {
3559       *errorcodeptr = ERR6;
3560       goto FAILED;
3561       }
3562 
3563 
3564 /* This code has been disabled because it would mean that \s counts as
3565 an explicit \r or \n reference, and that's not really what is wanted. Now
3566 we set the flag only if there is a literal "\r" or "\n" in the class. */
3567 
3568 #if 0
3569     /* Remember whether \r or \n are in this class */
3570 
3571     if (negate_class)
3572       {
3573       if ((classbits[1] & 0x24) != 0x24) cd->external_flags |= PCRE_HASCRORLF;
3574       }
3575     else
3576       {
3577       if ((classbits[1] & 0x24) != 0) cd->external_flags |= PCRE_HASCRORLF;
3578       }
3579 #endif
3580 
3581 
3582     /* If class_charcount is 1, we saw precisely one character whose value is
3583     less than 256. As long as there were no characters >= 128 and there was no
3584     use of \p or \P, in other words, no use of any XCLASS features, we can
3585     optimize.
3586 
3587     In UTF-8 mode, we can optimize the negative case only if there were no
3588     characters >= 128 because OP_NOT and the related opcodes like OP_NOTSTAR
3589     operate on single-bytes only. This is an historical hangover. Maybe one day
3590     we can tidy these opcodes to handle multi-byte characters.
3591 
3592     The optimization throws away the bit map. We turn the item into a
3593     1-character OP_CHAR[NC] if it's positive, or OP_NOT if it's negative. Note
3594     that OP_NOT does not support multibyte characters. In the positive case, it
3595     can cause firstbyte to be set. Otherwise, there can be no first char if
3596     this item is first, whatever repeat count may follow. In the case of
3597     reqbyte, save the previous value for reinstating. */
3598 
3599 #ifdef SUPPORT_UTF8
3600     if (class_charcount == 1 && !class_utf8 &&
3601       (!utf8 || !negate_class || class_lastchar < 128))
3602 #else
3603     if (class_charcount == 1)
3604 #endif
3605       {
3606       zeroreqbyte = reqbyte;
3607 
3608       /* The OP_NOT opcode works on one-byte characters only. */
3609 
3610       if (negate_class)
3611         {
3612         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3613         zerofirstbyte = firstbyte;
3614         *code++ = OP_NOT;
3615         *code++ = class_lastchar;
3616         break;
3617         }
3618 
3619       /* For a single, positive character, get the value into mcbuffer, and
3620       then we can handle this with the normal one-character code. */
3621 
3622 #ifdef SUPPORT_UTF8
3623       if (utf8 && class_lastchar > 127)
3624         mclength = _pcre_ord2utf8(class_lastchar, mcbuffer);
3625       else
3626 #endif
3627         {
3628         mcbuffer[0] = class_lastchar;
3629         mclength = 1;
3630         }
3631       goto ONE_CHAR;
3632       }       /* End of 1-char optimization */
3633 
3634     /* The general case - not the one-char optimization. If this is the first
3635     thing in the branch, there can be no first char setting, whatever the
3636     repeat count. Any reqbyte setting must remain unchanged after any kind of
3637     repeat. */
3638 
3639     if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
3640     zerofirstbyte = firstbyte;
3641     zeroreqbyte = reqbyte;
3642 
3643     /* If there are characters with values > 255, we have to compile an
3644     extended class, with its own opcode, unless there was a negated special
3645     such as \S in the class, because in that case all characters > 255 are in
3646     the class, so any that were explicitly given as well can be ignored. If
3647     (when there are explicit characters > 255 that must be listed) there are no
3648     characters < 256, we can omit the bitmap in the actual compiled code. */
3649 
3650 #ifdef SUPPORT_UTF8
3651     if (class_utf8 && !should_flip_negation)
3652       {
3653       *class_utf8data++ = XCL_END;    /* Marks the end of extra data */
3654       *code++ = OP_XCLASS;
3655       code += LINK_SIZE;
3656       *code = negate_class? XCL_NOT : 0;
3657 
3658       /* If the map is required, move up the extra data to make room for it;
3659       otherwise just move the code pointer to the end of the extra data. */
3660 
3661       if (class_charcount > 0)
3662         {
3663         *code++ |= XCL_MAP;
3664         memmove(code + 32, code, class_utf8data - code);
3665         memcpy(code, classbits, 32);
3666         code = class_utf8data + 32;
3667         }
3668       else code = class_utf8data;
3669 
3670       /* Now fill in the complete length of the item */
3671 
3672       PUT(previous, 1, code - previous);
3673       break;   /* End of class handling */
3674       }
3675 #endif
3676 
3677     /* If there are no characters > 255, set the opcode to OP_CLASS or
3678     OP_NCLASS, depending on whether the whole class was negated and whether
3679     there were negative specials such as \S in the class. Then copy the 32-byte
3680     map into the code vector, negating it if necessary. */
3681 
3682     *code++ = (negate_class == should_flip_negation) ? OP_CLASS : OP_NCLASS;
3683     if (negate_class)
3684       {
3685       if (lengthptr == NULL)    /* Save time in the pre-compile phase */
3686         for (c = 0; c < 32; c++) code[c] = ~classbits[c];
3687       }
3688     else
3689       {
3690       memcpy(code, classbits, 32);
3691       }
3692     code += 32;
3693     break;
3694 
3695 
3696     /* ===================================================================*/
3697     /* Various kinds of repeat; '{' is not necessarily a quantifier, but this
3698     has been tested above. */
3699 
3700     case CHAR_LEFT_CURLY_BRACKET:
3701     if (!is_quantifier) goto NORMAL_CHAR;
3702     ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorcodeptr);
3703     if (*errorcodeptr != 0) goto FAILED;
3704     goto REPEAT;
3705 
3706     case CHAR_ASTERISK:
3707     repeat_min = 0;
3708     repeat_max = -1;
3709     goto REPEAT;
3710 
3711     case CHAR_PLUS:
3712     repeat_min = 1;
3713     repeat_max = -1;
3714     goto REPEAT;
3715 
3716     case CHAR_QUESTION_MARK:
3717     repeat_min = 0;
3718     repeat_max = 1;
3719 
3720     REPEAT:
3721     if (previous == NULL)
3722       {
3723       *errorcodeptr = ERR9;
3724       goto FAILED;
3725       }
3726 
3727     if (repeat_min == 0)
3728       {
3729       firstbyte = zerofirstbyte;    /* Adjust for zero repeat */
3730       reqbyte = zeroreqbyte;        /* Ditto */
3731       }
3732 
3733     /* Remember whether this is a variable length repeat */
3734 
3735     reqvary = (repeat_min == repeat_max)? 0 : REQ_VARY;
3736 
3737     op_type = 0;                    /* Default single-char op codes */
3738     possessive_quantifier = FALSE;  /* Default not possessive quantifier */
3739 
3740     /* Save start of previous item, in case we have to move it up to make space
3741     for an inserted OP_ONCE for the additional '+' extension. */
3742 
3743     tempcode = previous;
3744 
3745     /* If the next character is '+', we have a possessive quantifier. This
3746     implies greediness, whatever the setting of the PCRE_UNGREEDY option.
3747     If the next character is '?' this is a minimizing repeat, by default,
3748     but if PCRE_UNGREEDY is set, it works the other way round. We change the
3749     repeat type to the non-default. */
3750 
3751     if (ptr[1] == CHAR_PLUS)
3752       {
3753       repeat_type = 0;                  /* Force greedy */
3754       possessive_quantifier = TRUE;
3755       ptr++;
3756       }
3757     else if (ptr[1] == CHAR_QUESTION_MARK)
3758       {
3759       repeat_type = greedy_non_default;
3760       ptr++;
3761       }
3762     else repeat_type = greedy_default;
3763 
3764     /* If previous was a character match, abolish the item and generate a
3765     repeat item instead. If a char item has a minimum of more than one, ensure
3766     that it is set in reqbyte - it might not be if a sequence such as x{3} is
3767     the first thing in a branch because the x will have gone into firstbyte
3768     instead.  */
3769 
3770     if (*previous == OP_CHAR || *previous == OP_CHARNC)
3771       {
3772       /* Deal with UTF-8 characters that take up more than one byte. It's
3773       easier to write this out separately than try to macrify it. Use c to
3774       hold the length of the character in bytes, plus 0x80 to flag that it's a
3775       length rather than a small character. */
3776 
3777 #ifdef SUPPORT_UTF8
3778       if (utf8 && (code[-1] & 0x80) != 0)
3779         {
3780         uschar *lastchar = code - 1;
3781         while((*lastchar & 0xc0) == 0x80) lastchar--;
3782         c = code - lastchar;            /* Length of UTF-8 character */
3783         memcpy(utf8_char, lastchar, c); /* Save the char */
3784         c |= 0x80;                      /* Flag c as a length */
3785         }
3786       else
3787 #endif
3788 
3789       /* Handle the case of a single byte - either with no UTF8 support, or
3790       with UTF-8 disabled, or for a UTF-8 character < 128. */
3791 
3792         {
3793         c = code[-1];
3794         if (repeat_min > 1) reqbyte = c | req_caseopt | cd->req_varyopt;
3795         }
3796 
3797       /* If the repetition is unlimited, it pays to see if the next thing on
3798       the line is something that cannot possibly match this character. If so,
3799       automatically possessifying this item gains some performance in the case
3800       where the match fails. */
3801 
3802       if (!possessive_quantifier &&
3803           repeat_max < 0 &&
3804           check_auto_possessive(*previous, c, utf8, utf8_char, ptr + 1,
3805             options, cd))
3806         {
3807         repeat_type = 0;    /* Force greedy */
3808         possessive_quantifier = TRUE;
3809         }
3810 
3811       goto OUTPUT_SINGLE_REPEAT;   /* Code shared with single character types */
3812       }
3813 
3814     /* If previous was a single negated character ([^a] or similar), we use
3815     one of the special opcodes, replacing it. The code is shared with single-
3816     character repeats by setting op_type to add a suitable offset into
3817     repeat_type. We can also test for auto-possessification. OP_NOT is
3818     currently used only for single-byte chars. */
3819 
3820     else if (*previous == OP_NOT)
3821       {
3822       op_type = OP_NOTSTAR - OP_STAR;  /* Use "not" opcodes */
3823       c = previous[1];
3824       if (!possessive_quantifier &&
3825           repeat_max < 0 &&
3826           check_auto_possessive(OP_NOT, c, utf8, NULL, ptr + 1, options, cd))
3827         {
3828         repeat_type = 0;    /* Force greedy */
3829         possessive_quantifier = TRUE;
3830         }
3831       goto OUTPUT_SINGLE_REPEAT;
3832       }
3833 
3834     /* If previous was a character type match (\d or similar), abolish it and
3835     create a suitable repeat item. The code is shared with single-character
3836     repeats by setting op_type to add a suitable offset into repeat_type. Note
3837     that the Unicode property types will be present only when SUPPORT_UCP is
3838     defined, but we don't wrap the little bits of code here because it just
3839     makes it horribly messy. */
3840 
3841     else if (*previous < OP_EODN)
3842       {
3843       uschar *oldcode;
3844       int prop_type, prop_value;
3845       op_type = OP_TYPESTAR - OP_STAR;  /* Use type opcodes */
3846       c = *previous;
3847 
3848       if (!possessive_quantifier &&
3849           repeat_max < 0 &&
3850           check_auto_possessive(c, 0, utf8, NULL, ptr + 1, options, cd))
3851         {
3852         repeat_type = 0;    /* Force greedy */
3853         possessive_quantifier = TRUE;
3854         }
3855 
3856       OUTPUT_SINGLE_REPEAT:
3857       if (*previous == OP_PROP || *previous == OP_NOTPROP)
3858         {
3859         prop_type = previous[1];
3860         prop_value = previous[2];
3861         }
3862       else prop_type = prop_value = -1;
3863 
3864       oldcode = code;
3865       code = previous;                  /* Usually overwrite previous item */
3866 
3867       /* If the maximum is zero then the minimum must also be zero; Perl allows
3868       this case, so we do too - by simply omitting the item altogether. */
3869 
3870       if (repeat_max == 0) goto END_REPEAT;
3871 
3872       /* All real repeats make it impossible to handle partial matching (maybe
3873       one day we will be able to remove this restriction). */
3874 
3875       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
3876 
3877       /* Combine the op_type with the repeat_type */
3878 
3879       repeat_type += op_type;
3880 
3881       /* A minimum of zero is handled either as the special case * or ?, or as
3882       an UPTO, with the maximum given. */
3883 
3884       if (repeat_min == 0)
3885         {
3886         if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
3887           else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
3888         else
3889           {
3890           *code++ = OP_UPTO + repeat_type;
3891           PUT2INC(code, 0, repeat_max);
3892           }
3893         }
3894 
3895       /* A repeat minimum of 1 is optimized into some special cases. If the
3896       maximum is unlimited, we use OP_PLUS. Otherwise, the original item is
3897       left in place and, if the maximum is greater than 1, we use OP_UPTO with
3898       one less than the maximum. */
3899 
3900       else if (repeat_min == 1)
3901         {
3902         if (repeat_max == -1)
3903           *code++ = OP_PLUS + repeat_type;
3904         else
3905           {
3906           code = oldcode;                 /* leave previous item in place */
3907           if (repeat_max == 1) goto END_REPEAT;
3908           *code++ = OP_UPTO + repeat_type;
3909           PUT2INC(code, 0, repeat_max - 1);
3910           }
3911         }
3912 
3913       /* The case {n,n} is just an EXACT, while the general case {n,m} is
3914       handled as an EXACT followed by an UPTO. */
3915 
3916       else
3917         {
3918         *code++ = OP_EXACT + op_type;  /* NB EXACT doesn't have repeat_type */
3919         PUT2INC(code, 0, repeat_min);
3920 
3921         /* If the maximum is unlimited, insert an OP_STAR. Before doing so,
3922         we have to insert the character for the previous code. For a repeated
3923         Unicode property match, there are two extra bytes that define the
3924         required property. In UTF-8 mode, long characters have their length in
3925         c, with the 0x80 bit as a flag. */
3926 
3927         if (repeat_max < 0)
3928           {
3929 #ifdef SUPPORT_UTF8
3930           if (utf8 && c >= 128)
3931             {
3932             memcpy(code, utf8_char, c & 7);
3933             code += c & 7;
3934             }
3935           else
3936 #endif
3937             {
3938             *code++ = c;
3939             if (prop_type >= 0)
3940               {
3941               *code++ = prop_type;
3942               *code++ = prop_value;
3943               }
3944             }
3945           *code++ = OP_STAR + repeat_type;
3946           }
3947 
3948         /* Else insert an UPTO if the max is greater than the min, again
3949         preceded by the character, for the previously inserted code. If the
3950         UPTO is just for 1 instance, we can use QUERY instead. */
3951 
3952         else if (repeat_max != repeat_min)
3953           {
3954 #ifdef SUPPORT_UTF8
3955           if (utf8 && c >= 128)
3956             {
3957             memcpy(code, utf8_char, c & 7);
3958             code += c & 7;
3959             }
3960           else
3961 #endif
3962           *code++ = c;
3963           if (prop_type >= 0)
3964             {
3965             *code++ = prop_type;
3966             *code++ = prop_value;
3967             }
3968           repeat_max -= repeat_min;
3969 
3970           if (repeat_max == 1)
3971             {
3972             *code++ = OP_QUERY + repeat_type;
3973             }
3974           else
3975             {
3976             *code++ = OP_UPTO + repeat_type;
3977             PUT2INC(code, 0, repeat_max);
3978             }
3979           }
3980         }
3981 
3982       /* The character or character type itself comes last in all cases. */
3983 
3984 #ifdef SUPPORT_UTF8
3985       if (utf8 && c >= 128)
3986         {
3987         memcpy(code, utf8_char, c & 7);
3988         code += c & 7;
3989         }
3990       else
3991 #endif
3992       *code++ = c;
3993 
3994       /* For a repeated Unicode property match, there are two extra bytes that
3995       define the required property. */
3996 
3997 #ifdef SUPPORT_UCP
3998       if (prop_type >= 0)
3999         {
4000         *code++ = prop_type;
4001         *code++ = prop_value;
4002         }
4003 #endif
4004       }
4005 
4006     /* If previous was a character class or a back reference, we put the repeat
4007     stuff after it, but just skip the item if the repeat was {0,0}. */
4008 
4009     else if (*previous == OP_CLASS ||
4010              *previous == OP_NCLASS ||
4011 #ifdef SUPPORT_UTF8
4012              *previous == OP_XCLASS ||
4013 #endif
4014              *previous == OP_REF)
4015       {
4016       if (repeat_max == 0)
4017         {
4018         code = previous;
4019         goto END_REPEAT;
4020         }
4021 
4022       /* All real repeats make it impossible to handle partial matching (maybe
4023       one day we will be able to remove this restriction). */
4024 
4025       if (repeat_max != 1) cd->external_flags |= PCRE_NOPARTIAL;
4026 
4027       if (repeat_min == 0 && repeat_max == -1)
4028         *code++ = OP_CRSTAR + repeat_type;
4029       else if (repeat_min == 1 && repeat_max == -1)
4030         *code++ = OP_CRPLUS + repeat_type;
4031       else if (repeat_min == 0 && repeat_max == 1)
4032         *code++ = OP_CRQUERY + repeat_type;
4033       else
4034         {
4035         *code++ = OP_CRRANGE + repeat_type;
4036         PUT2INC(code, 0, repeat_min);
4037         if (repeat_max == -1) repeat_max = 0;  /* 2-byte encoding for max */
4038         PUT2INC(code, 0, repeat_max);
4039         }
4040       }
4041 
4042     /* If previous was a bracket group, we may have to replicate it in certain
4043     cases. */
4044 
4045     else if (*previous == OP_BRA  || *previous == OP_CBRA ||
4046              *previous == OP_ONCE || *previous == OP_COND)
4047       {
4048       register int i;
4049       int ketoffset = 0;
4050       int len = code - previous;
4051       uschar *bralink = NULL;
4052 
4053       /* Repeating a DEFINE group is pointless */
4054 
4055       if (*previous == OP_COND && previous[LINK_SIZE+1] == OP_DEF)
4056         {
4057         *errorcodeptr = ERR55;
4058         goto FAILED;
4059         }
4060 
4061       /* If the maximum repeat count is unlimited, find the end of the bracket
4062       by scanning through from the start, and compute the offset back to it
4063       from the current code pointer. There may be an OP_OPT setting following
4064       the final KET, so we can't find the end just by going back from the code
4065       pointer. */
4066 
4067       if (repeat_max == -1)
4068         {
4069         register uschar *ket = previous;
4070         do ket += GET(ket, 1); while (*ket != OP_KET);
4071         ketoffset = code - ket;
4072         }
4073 
4074       /* The case of a zero minimum is special because of the need to stick
4075       OP_BRAZERO in front of it, and because the group appears once in the
4076       data, whereas in other cases it appears the minimum number of times. For
4077       this reason, it is simplest to treat this case separately, as otherwise
4078       the code gets far too messy. There are several special subcases when the
4079       minimum is zero. */
4080 
4081       if (repeat_min == 0)
4082         {
4083         /* If the maximum is also zero, we used to just omit the group from the
4084         output altogether, like this:
4085 
4086         ** if (repeat_max == 0)
4087         **   {
4088         **   code = previous;
4089         **   goto END_REPEAT;
4090         **   }
4091 
4092         However, that fails when a group is referenced as a subroutine from
4093         elsewhere in the pattern, so now we stick in OP_SKIPZERO in front of it
4094         so that it is skipped on execution. As we don't have a list of which
4095         groups are referenced, we cannot do this selectively.
4096 
4097         If the maximum is 1 or unlimited, we just have to stick in the BRAZERO
4098         and do no more at this point. However, we do need to adjust any
4099         OP_RECURSE calls inside the group that refer to the group itself or any
4100         internal or forward referenced group, because the offset is from the
4101         start of the whole regex. Temporarily terminate the pattern while doing
4102         this. */
4103 
4104         if (repeat_max <= 1)    /* Covers 0, 1, and unlimited */
4105           {
4106           *code = OP_END;
4107           adjust_recurse(previous, 1, utf8, cd, save_hwm);
4108           memmove(previous+1, previous, len);
4109           code++;
4110           if (repeat_max == 0)
4111             {
4112             *previous++ = OP_SKIPZERO;
4113             goto END_REPEAT;
4114             }
4115           *previous++ = OP_BRAZERO + repeat_type;
4116           }
4117 
4118         /* If the maximum is greater than 1 and limited, we have to replicate
4119         in a nested fashion, sticking OP_BRAZERO before each set of brackets.
4120         The first one has to be handled carefully because it's the original
4121         copy, which has to be moved up. The remainder can be handled by code
4122         that is common with the non-zero minimum case below. We have to
4123         adjust the value of repeat_max, since one less copy is required. Once
4124         again, we may have to adjust any OP_RECURSE calls inside the group. */
4125 
4126         else
4127           {
4128           int offset;
4129           *code = OP_END;
4130           adjust_recurse(previous, 2 + LINK_SIZE, utf8, cd, save_hwm);
4131           memmove(previous + 2 + LINK_SIZE, previous, len);
4132           code += 2 + LINK_SIZE;
4133           *previous++ = OP_BRAZERO + repeat_type;
4134           *previous++ = OP_BRA;
4135 
4136           /* We chain together the bracket offset fields that have to be
4137           filled in later when the ends of the brackets are reached. */
4138 
4139           offset = (bralink == NULL)? 0 : previous - bralink;
4140           bralink = previous;
4141           PUTINC(previous, 0, offset);
4142           }
4143 
4144         repeat_max--;
4145         }
4146 
4147       /* If the minimum is greater than zero, replicate the group as many
4148       times as necessary, and adjust the maximum to the number of subsequent
4149       copies that we need. If we set a first char from the group, and didn't
4150       set a required char, copy the latter from the former. If there are any
4151       forward reference subroutine calls in the group, there will be entries on
4152       the workspace list; replicate these with an appropriate increment. */
4153 
4154       else
4155         {
4156         if (repeat_min > 1)
4157           {
4158           /* In the pre-compile phase, we don't actually do the replication. We
4159           just adjust the length as if we had. Do some paranoid checks for
4160           potential integer overflow. */
4161 
4162           if (lengthptr != NULL)
4163             {
4164             int delta = (repeat_min - 1)*length_prevgroup;
4165             if ((double)(repeat_min - 1)*(double)length_prevgroup >
4166                                                             (double)INT_MAX ||
4167                 OFLOW_MAX - *lengthptr < delta)
4168               {
4169               *errorcodeptr = ERR20;
4170               goto FAILED;
4171               }
4172             *lengthptr += delta;
4173             }
4174 
4175           /* This is compiling for real */
4176 
4177           else
4178             {
4179             if (groupsetfirstbyte && reqbyte < 0) reqbyte = firstbyte;
4180             for (i = 1; i < repeat_min; i++)
4181               {
4182               uschar *hc;
4183               uschar *this_hwm = cd->hwm;
4184               memcpy(code, previous, len);
4185               for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4186                 {
4187                 PUT(cd->hwm, 0, GET(hc, 0) + len);
4188                 cd->hwm += LINK_SIZE;
4189                 }
4190               save_hwm = this_hwm;
4191               code += len;
4192               }
4193             }
4194           }
4195 
4196         if (repeat_max > 0) repeat_max -= repeat_min;
4197         }
4198 
4199       /* This code is common to both the zero and non-zero minimum cases. If
4200       the maximum is limited, it replicates the group in a nested fashion,
4201       remembering the bracket starts on a stack. In the case of a zero minimum,
4202       the first one was set up above. In all cases the repeat_max now specifies
4203       the number of additional copies needed. Again, we must remember to
4204       replicate entries on the forward reference list. */
4205 
4206       if (repeat_max >= 0)
4207         {
4208         /* In the pre-compile phase, we don't actually do the replication. We
4209         just adjust the length as if we had. For each repetition we must add 1
4210         to the length for BRAZERO and for all but the last repetition we must
4211         add 2 + 2*LINK_SIZE to allow for the nesting that occurs. Do some
4212         paranoid checks to avoid integer overflow. */
4213 
4214         if (lengthptr != NULL && repeat_max > 0)
4215           {
4216           int delta = repeat_max * (length_prevgroup + 1 + 2 + 2*LINK_SIZE) -
4217                       2 - 2*LINK_SIZE;   /* Last one doesn't nest */
4218           if ((double)repeat_max *
4219                 (double)(length_prevgroup + 1 + 2 + 2*LINK_SIZE)
4220                   > (double)INT_MAX ||
4221               OFLOW_MAX - *lengthptr < delta)
4222             {
4223             *errorcodeptr = ERR20;
4224             goto FAILED;
4225             }
4226           *lengthptr += delta;
4227           }
4228 
4229         /* This is compiling for real */
4230 
4231         else for (i = repeat_max - 1; i >= 0; i--)
4232           {
4233           uschar *hc;
4234           uschar *this_hwm = cd->hwm;
4235 
4236           *code++ = OP_BRAZERO + repeat_type;
4237 
4238           /* All but the final copy start a new nesting, maintaining the
4239           chain of brackets outstanding. */
4240 
4241           if (i != 0)
4242             {
4243             int offset;
4244             *code++ = OP_BRA;
4245             offset = (bralink == NULL)? 0 : code - bralink;
4246             bralink = code;
4247             PUTINC(code, 0, offset);
4248             }
4249 
4250           memcpy(code, previous, len);
4251           for (hc = save_hwm; hc < this_hwm; hc += LINK_SIZE)
4252             {
4253             PUT(cd->hwm, 0, GET(hc, 0) + len + ((i != 0)? 2+LINK_SIZE : 1));
4254             cd->hwm += LINK_SIZE;
4255             }
4256           save_hwm = this_hwm;
4257           code += len;
4258           }
4259 
4260         /* Now chain through the pending brackets, and fill in their length
4261         fields (which are holding the chain links pro tem). */
4262 
4263         while (bralink != NULL)
4264           {
4265           int oldlinkoffset;
4266           int offset = code - bralink + 1;
4267           uschar *bra = code - offset;
4268           oldlinkoffset = GET(bra, 1);
4269           bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
4270           *code++ = OP_KET;
4271           PUTINC(code, 0, offset);
4272           PUT(bra, 1, offset);
4273           }
4274         }
4275 
4276       /* If the maximum is unlimited, set a repeater in the final copy. We
4277       can't just offset backwards from the current code point, because we
4278       don't know if there's been an options resetting after the ket. The
4279       correct offset was computed above.
4280 
4281       Then, when we are doing the actual compile phase, check to see whether
4282       this group is a non-atomic one that could match an empty string. If so,
4283       convert the initial operator to the S form (e.g. OP_BRA -> OP_SBRA) so
4284       that runtime checking can be done. [This check is also applied to
4285       atomic groups at runtime, but in a different way.] */
4286 
4287       else
4288         {
4289         uschar *ketcode = code - ketoffset;
4290         uschar *bracode = ketcode - GET(ketcode, 1);
4291         *ketcode = OP_KETRMAX + repeat_type;
4292         if (lengthptr == NULL && *bracode != OP_ONCE)
4293           {
4294           uschar *scode = bracode;
4295           do
4296             {
4297             if (could_be_empty_branch(scode, ketcode, utf8))
4298               {
4299               *bracode += OP_SBRA - OP_BRA;
4300               break;
4301               }
4302             scode += GET(scode, 1);
4303             }
4304           while (*scode == OP_ALT);
4305           }
4306         }
4307       }
4308 
4309     /* If previous is OP_FAIL, it was generated by an empty class [] in
4310     JavaScript mode. The other ways in which OP_FAIL can be generated, that is
4311     by (*FAIL) or (?!) set previous to NULL, which gives a "nothing to repeat"
4312     error above. We can just ignore the repeat in JS case. */
4313 
4314     else if (*previous == OP_FAIL) goto END_REPEAT;
4315 
4316     /* Else there's some kind of shambles */
4317 
4318     else
4319       {
4320       *errorcodeptr = ERR11;
4321       goto FAILED;
4322       }
4323 
4324     /* If the character following a repeat is '+', or if certain optimization
4325     tests above succeeded, possessive_quantifier is TRUE. For some of the
4326     simpler opcodes, there is a special alternative opcode for this. For
4327     anything else, we wrap the entire repeated item inside OP_ONCE brackets.
4328     The '+' notation is just syntactic sugar, taken from Sun's Java package,
4329     but the special opcodes can optimize it a bit. The repeated item starts at
4330     tempcode, not at previous, which might be the first part of a string whose
4331     (former) last char we repeated.
4332 
4333     Possessifying an 'exact' quantifier has no effect, so we can ignore it. But
4334     an 'upto' may follow. We skip over an 'exact' item, and then test the
4335     length of what remains before proceeding. */
4336 
4337     if (possessive_quantifier)
4338       {
4339       int len;
4340       if (*tempcode == OP_EXACT || *tempcode == OP_TYPEEXACT ||
4341           *tempcode == OP_NOTEXACT)
4342         tempcode += _pcre_OP_lengths[*tempcode] +
4343           ((*tempcode == OP_TYPEEXACT &&
4344              (tempcode[3] == OP_PROP || tempcode[3] == OP_NOTPROP))? 2:0);
4345       len = code - tempcode;
4346       if (len > 0) switch (*tempcode)
4347         {
4348         case OP_STAR:  *tempcode = OP_POSSTAR; break;
4349         case OP_PLUS:  *tempcode = OP_POSPLUS; break;
4350         case OP_QUERY: *tempcode = OP_POSQUERY; break;
4351         case OP_UPTO:  *tempcode = OP_POSUPTO; break;
4352 
4353         case OP_TYPESTAR:  *tempcode = OP_TYPEPOSSTAR; break;
4354         case OP_TYPEPLUS:  *tempcode = OP_TYPEPOSPLUS; break;
4355         case OP_TYPEQUERY: *tempcode = OP_TYPEPOSQUERY; break;
4356         case OP_TYPEUPTO:  *tempcode = OP_TYPEPOSUPTO; break;
4357 
4358         case OP_NOTSTAR:  *tempcode = OP_NOTPOSSTAR; break;
4359         case OP_NOTPLUS:  *tempcode = OP_NOTPOSPLUS; break;
4360         case OP_NOTQUERY: *tempcode = OP_NOTPOSQUERY; break;
4361         case OP_NOTUPTO:  *tempcode = OP_NOTPOSUPTO; break;
4362 
4363         default:
4364         memmove(tempcode + 1+LINK_SIZE, tempcode, len);
4365         code += 1 + LINK_SIZE;
4366         len += 1 + LINK_SIZE;
4367         tempcode[0] = OP_ONCE;
4368         *code++ = OP_KET;
4369         PUTINC(code, 0, len);
4370         PUT(tempcode, 1, len);
4371         break;
4372         }
4373       }
4374 
4375     /* In all cases we no longer have a previous item. We also set the
4376     "follows varying string" flag for subsequently encountered reqbytes if
4377     it isn't already set and we have just passed a varying length item. */
4378 
4379     END_REPEAT:
4380     previous = NULL;
4381     cd->req_varyopt |= reqvary;
4382     break;
4383 
4384 
4385     /* ===================================================================*/
4386     /* Start of nested parenthesized sub-expression, or comment or lookahead or
4387     lookbehind or option setting or condition or all the other extended
4388     parenthesis forms.  */
4389 
4390     case CHAR_LEFT_PARENTHESIS:
4391     newoptions = options;
4392     skipbytes = 0;
4393     bravalue = OP_CBRA;
4394     save_hwm = cd->hwm;
4395     reset_bracount = FALSE;
4396 
4397     /* First deal with various "verbs" that can be introduced by '*'. */
4398 
4399     if (*(++ptr) == CHAR_ASTERISK && (cd->ctypes[ptr[1]] & ctype_letter) != 0)
4400       {
4401       int i, namelen;
4402       const char *vn = verbnames;
4403       const uschar *name = ++ptr;
4404       previous = NULL;
4405       while ((cd->ctypes[*++ptr] & ctype_letter) != 0) {};
4406       if (*ptr == CHAR_COLON)
4407         {
4408         *errorcodeptr = ERR59;   /* Not supported */
4409         goto FAILED;
4410         }
4411       if (*ptr != CHAR_RIGHT_PARENTHESIS)
4412         {
4413         *errorcodeptr = ERR60;
4414         goto FAILED;
4415         }
4416       namelen = ptr - name;
4417       for (i = 0; i < verbcount; i++)
4418         {
4419         if (namelen == verbs[i].len &&
4420             strncmp((char *)name, vn, namelen) == 0)
4421           {
4422           *code = verbs[i].op;
4423           if (*code++ == OP_ACCEPT) cd->had_accept = TRUE;
4424           break;
4425           }
4426         vn += verbs[i].len + 1;
4427         }
4428       if (i < verbcount) continue;
4429       *errorcodeptr = ERR60;
4430       goto FAILED;
4431       }
4432 
4433     /* Deal with the extended parentheses; all are introduced by '?', and the
4434     appearance of any of them means that this is not a capturing group. */
4435 
4436     else if (*ptr == CHAR_QUESTION_MARK)
4437       {
4438       int i, set, unset, namelen;
4439       int *optset;
4440       const uschar *name;
4441       uschar *slot;
4442 
4443       switch (*(++ptr))
4444         {
4445         case CHAR_NUMBER_SIGN:                 /* Comment; skip to ket */
4446         ptr++;
4447         while (*ptr != 0 && *ptr != CHAR_RIGHT_PARENTHESIS) ptr++;
4448         if (*ptr == 0)
4449           {
4450           *errorcodeptr = ERR18;
4451           goto FAILED;
4452           }
4453         continue;
4454 
4455 
4456         /* ------------------------------------------------------------ */
4457         case CHAR_VERTICAL_LINE:  /* Reset capture count for each branch */
4458         reset_bracount = TRUE;
4459         /* Fall through */
4460 
4461         /* ------------------------------------------------------------ */
4462         case CHAR_COLON:          /* Non-capturing bracket */
4463         bravalue = OP_BRA;
4464         ptr++;
4465         break;
4466 
4467 
4468         /* ------------------------------------------------------------ */
4469         case CHAR_LEFT_PARENTHESIS:
4470         bravalue = OP_COND;       /* Conditional group */
4471 
4472         /* A condition can be an assertion, a number (referring to a numbered
4473         group), a name (referring to a named group), or 'R', referring to
4474         recursion. R<digits> and R&name are also permitted for recursion tests.
4475 
4476         There are several syntaxes for testing a named group: (?(name)) is used
4477         by Python; Perl 5.10 onwards uses (?(<name>) or (?('name')).
4478 
4479         There are two unfortunate ambiguities, caused by history. (a) 'R' can
4480         be the recursive thing or the name 'R' (and similarly for 'R' followed
4481         by digits), and (b) a number could be a name that consists of digits.
4482         In both cases, we look for a name first; if not found, we try the other
4483         cases. */
4484 
4485         /* For conditions that are assertions, check the syntax, and then exit
4486         the switch. This will take control down to where bracketed groups,
4487         including assertions, are processed. */
4488 
4489         if (ptr[1] == CHAR_QUESTION_MARK && (ptr[2] == CHAR_EQUALS_SIGN ||
4490             ptr[2] == CHAR_EXCLAMATION_MARK || ptr[2] == CHAR_LESS_THAN_SIGN))
4491           break;
4492 
4493         /* Most other conditions use OP_CREF (a couple change to OP_RREF
4494         below), and all need to skip 3 bytes at the start of the group. */
4495 
4496         code[1+LINK_SIZE] = OP_CREF;
4497         skipbytes = 3;
4498         refsign = -1;
4499 
4500         /* Check for a test for recursion in a named group. */
4501 
4502         if (ptr[1] == CHAR_R && ptr[2] == CHAR_AMPERSAND)
4503           {
4504           terminator = -1;
4505           ptr += 2;
4506           code[1+LINK_SIZE] = OP_RREF;    /* Change the type of test */
4507           }
4508 
4509         /* Check for a test for a named group's having been set, using the Perl
4510         syntax (?(<name>) or (?('name') */
4511 
4512         else if (ptr[1] == CHAR_LESS_THAN_SIGN)
4513           {
4514           terminator = CHAR_GREATER_THAN_SIGN;
4515           ptr++;
4516           }
4517         else if (ptr[1] == CHAR_APOSTROPHE)
4518           {
4519           terminator = CHAR_APOSTROPHE;
4520           ptr++;
4521           }
4522         else
4523           {
4524           terminator = 0;
4525           if (ptr[1] == CHAR_MINUS || ptr[1] == CHAR_PLUS) refsign = *(++ptr);
4526           }
4527 
4528         /* We now expect to read a name; anything else is an error */
4529 
4530         if ((cd->ctypes[ptr[1]] & ctype_word) == 0)
4531           {
4532           ptr += 1;  /* To get the right offset */
4533           *errorcodeptr = ERR28;
4534           goto FAILED;
4535           }
4536 
4537         /* Read the name, but also get it as a number if it's all digits */
4538 
4539         recno = 0;
4540         name = ++ptr;
4541         while ((cd->ctypes[*ptr] & ctype_word) != 0)
4542           {
4543           if (recno >= 0)
4544             recno = ((digitab[*ptr] & ctype_digit) != 0)?
4545               recno * 10 + *ptr - CHAR_0 : -1;
4546           ptr++;
4547           }
4548         namelen = ptr - name;
4549 
4550         if ((terminator > 0 && *ptr++ != terminator) ||
4551             *ptr++ != CHAR_RIGHT_PARENTHESIS)
4552           {
4553           ptr--;      /* Error offset */
4554           *errorcodeptr = ERR26;
4555           goto FAILED;
4556           }
4557 
4558         /* Do no further checking in the pre-compile phase. */
4559 
4560         if (lengthptr != NULL) break;
4561 
4562         /* In the real compile we do the work of looking for the actual
4563         reference. If the string started with "+" or "-" we require the rest to
4564         be digits, in which case recno will be set. */
4565 
4566         if (refsign > 0)
4567           {
4568           if (recno <= 0)
4569             {
4570             *errorcodeptr = ERR58;
4571             goto FAILED;
4572             }
4573           recno = (refsign == CHAR_MINUS)?
4574             cd->bracount - recno + 1 : recno +cd->bracount;
4575           if (recno <= 0 || recno > cd->final_bracount)
4576             {
4577             *errorcodeptr = ERR15;
4578             goto FAILED;
4579             }
4580           PUT2(code, 2+LINK_SIZE, recno);
4581           break;
4582           }
4583 
4584         /* Otherwise (did not start with "+" or "-"), start by looking for the
4585         name. */
4586 
4587         slot = cd->name_table;
4588         for (i = 0; i < cd->names_found; i++)
4589           {
4590           if (strncmp((char *)name, (char *)slot+2, namelen) == 0) break;
4591           slot += cd->name_entry_size;
4592           }
4593 
4594         /* Found a previous named subpattern */
4595 
4596         if (i < cd->names_found)
4597           {
4598           recno = GET2(slot, 0);
4599           PUT2(code, 2+LINK_SIZE, recno);
4600           }
4601 
4602         /* Search the pattern for a forward reference */
4603 
4604         else if ((i = find_parens(cd, name, namelen,
4605                         (options & PCRE_EXTENDED) != 0)) > 0)
4606           {
4607           PUT2(code, 2+LINK_SIZE, i);
4608           }
4609 
4610         /* If terminator == 0 it means that the name followed directly after
4611         the opening parenthesis [e.g. (?(abc)...] and in this case there are
4612         some further alternatives to try. For the cases where terminator != 0
4613         [things like (?(<name>... or (?('name')... or (?(R&name)... ] we have
4614         now checked all the possibilities, so give an error. */
4615 
4616         else if (terminator != 0)
4617           {
4618           *errorcodeptr = ERR15;
4619           goto FAILED;
4620           }
4621 
4622         /* Check for (?(R) for recursion. Allow digits after R to specify a
4623         specific group number. */
4624 
4625         else if (*name == CHAR_R)
4626           {
4627           recno = 0;
4628           for (i = 1; i < namelen; i++)
4629             {
4630             if ((digitab[name[i]] & ctype_digit) == 0)
4631               {
4632               *errorcodeptr = ERR15;
4633               goto FAILED;
4634               }
4635             recno = recno * 10 + name[i] - CHAR_0;
4636             }
4637           if (recno == 0) recno = RREF_ANY;
4638           code[1+LINK_SIZE] = OP_RREF;      /* Change test type */
4639           PUT2(code, 2+LINK_SIZE, recno);
4640           }
4641 
4642         /* Similarly, check for the (?(DEFINE) "condition", which is always
4643         false. */
4644 
4645         else if (namelen == 6 && strncmp((char *)name, STRING_DEFINE, 6) == 0)
4646           {
4647           code[1+LINK_SIZE] = OP_DEF;
4648           skipbytes = 1;
4649           }
4650 
4651         /* Check for the "name" actually being a subpattern number. We are
4652         in the second pass here, so final_bracount is set. */
4653 
4654         else if (recno > 0 && recno <= cd->final_bracount)
4655           {
4656           PUT2(code, 2+LINK_SIZE, recno);
4657           }
4658 
4659         /* Either an unidentified subpattern, or a reference to (?(0) */
4660 
4661         else
4662           {
4663           *errorcodeptr = (recno == 0)? ERR35: ERR15;
4664           goto FAILED;
4665           }
4666         break;
4667 
4668 
4669         /* ------------------------------------------------------------ */
4670         case CHAR_EQUALS_SIGN:                 /* Positive lookahead */
4671         bravalue = OP_ASSERT;
4672         ptr++;
4673         break;
4674 
4675 
4676         /* ------------------------------------------------------------ */
4677         case CHAR_EXCLAMATION_MARK:            /* Negative lookahead */
4678         ptr++;
4679         if (*ptr == CHAR_RIGHT_PARENTHESIS)    /* Optimize (?!) */
4680           {
4681           *code++ = OP_FAIL;
4682           previous = NULL;
4683           continue;
4684           }
4685         bravalue = OP_ASSERT_NOT;
4686         break;
4687 
4688 
4689         /* ------------------------------------------------------------ */
4690         case CHAR_LESS_THAN_SIGN:              /* Lookbehind or named define */
4691         switch (ptr[1])
4692           {
4693           case CHAR_EQUALS_SIGN:               /* Positive lookbehind */
4694           bravalue = OP_ASSERTBACK;
4695           ptr += 2;
4696           break;
4697 
4698           case CHAR_EXCLAMATION_MARK:          /* Negative lookbehind */
4699           bravalue = OP_ASSERTBACK_NOT;
4700           ptr += 2;
4701           break;
4702 
4703           default:                /* Could be name define, else bad */
4704           if ((cd->ctypes[ptr[1]] & ctype_word) != 0) goto DEFINE_NAME;
4705           ptr++;                  /* Correct offset for error */
4706           *errorcodeptr = ERR24;
4707           goto FAILED;
4708           }
4709         break;
4710 
4711 
4712         /* ------------------------------------------------------------ */
4713         case CHAR_GREATER_THAN_SIGN:           /* One-time brackets */
4714         bravalue = OP_ONCE;
4715         ptr++;
4716         break;
4717 
4718 
4719         /* ------------------------------------------------------------ */
4720         case CHAR_C:                 /* Callout - may be followed by digits; */
4721         previous_callout = code;  /* Save for later completion */
4722         after_manual_callout = 1; /* Skip one item before completing */
4723         *code++ = OP_CALLOUT;
4724           {
4725           int n = 0;
4726           while ((digitab[*(++ptr)] & ctype_digit) != 0)
4727             n = n * 10 + *ptr - CHAR_0;
4728           if (*ptr != CHAR_RIGHT_PARENTHESIS)
4729             {
4730             *errorcodeptr = ERR39;
4731             goto FAILED;
4732             }
4733           if (n > 255)
4734             {
4735             *errorcodeptr = ERR38;
4736             goto FAILED;
4737             }
4738           *code++ = n;
4739           PUT(code, 0, ptr - cd->start_pattern + 1);  /* Pattern offset */
4740           PUT(code, LINK_SIZE, 0);                    /* Default length */
4741           code += 2 * LINK_SIZE;
4742           }
4743         previous = NULL;
4744         continue;
4745 
4746 
4747         /* ------------------------------------------------------------ */
4748         case CHAR_P:              /* Python-style named subpattern handling */
4749         if (*(++ptr) == CHAR_EQUALS_SIGN ||
4750             *ptr == CHAR_GREATER_THAN_SIGN)  /* Reference or recursion */
4751           {
4752           is_recurse = *ptr == CHAR_GREATER_THAN_SIGN;
4753           terminator = CHAR_RIGHT_PARENTHESIS;
4754           goto NAMED_REF_OR_RECURSE;
4755           }
4756         else if (*ptr != CHAR_LESS_THAN_SIGN)  /* Test for Python-style defn */
4757           {
4758           *errorcodeptr = ERR41;
4759           goto FAILED;
4760           }
4761         /* Fall through to handle (?P< as (?< is handled */
4762 
4763 
4764         /* ------------------------------------------------------------ */
4765         DEFINE_NAME:    /* Come here from (?< handling */
4766         case CHAR_APOSTROPHE:
4767           {
4768           terminator = (*ptr == CHAR_LESS_THAN_SIGN)?
4769             CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
4770           name = ++ptr;
4771 
4772           while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4773           namelen = ptr - name;
4774 
4775           /* In the pre-compile phase, just do a syntax check. */
4776 
4777           if (lengthptr != NULL)
4778             {
4779             if (*ptr != terminator)
4780               {
4781               *errorcodeptr = ERR42;
4782               goto FAILED;
4783               }
4784             if (cd->names_found >= MAX_NAME_COUNT)
4785               {
4786               *errorcodeptr = ERR49;
4787               goto FAILED;
4788               }
4789             if (namelen + 3 > cd->name_entry_size)
4790               {
4791               cd->name_entry_size = namelen + 3;
4792               if (namelen > MAX_NAME_SIZE)
4793                 {
4794                 *errorcodeptr = ERR48;
4795                 goto FAILED;
4796                 }
4797               }
4798             }
4799 
4800           /* In the real compile, create the entry in the table */
4801 
4802           else
4803             {
4804             slot = cd->name_table;
4805             for (i = 0; i < cd->names_found; i++)
4806               {
4807               int crc = memcmp(name, slot+2, namelen);
4808               if (crc == 0)
4809                 {
4810                 if (slot[2+namelen] == 0)
4811                   {
4812                   if ((options & PCRE_DUPNAMES) == 0)
4813                     {
4814                     *errorcodeptr = ERR43;
4815                     goto FAILED;
4816                     }
4817                   }
4818                 else crc = -1;      /* Current name is substring */
4819                 }
4820               if (crc < 0)
4821                 {
4822                 memmove(slot + cd->name_entry_size, slot,
4823                   (cd->names_found - i) * cd->name_entry_size);
4824                 break;
4825                 }
4826               slot += cd->name_entry_size;
4827               }
4828 
4829             PUT2(slot, 0, cd->bracount + 1);
4830             memcpy(slot + 2, name, namelen);
4831             slot[2+namelen] = 0;
4832             }
4833           }
4834 
4835         /* In both cases, count the number of names we've encountered. */
4836 
4837         ptr++;                    /* Move past > or ' */
4838         cd->names_found++;
4839         goto NUMBERED_GROUP;
4840 
4841 
4842         /* ------------------------------------------------------------ */
4843         case CHAR_AMPERSAND:            /* Perl recursion/subroutine syntax */
4844         terminator = CHAR_RIGHT_PARENTHESIS;
4845         is_recurse = TRUE;
4846         /* Fall through */
4847 
4848         /* We come here from the Python syntax above that handles both
4849         references (?P=name) and recursion (?P>name), as well as falling
4850         through from the Perl recursion syntax (?&name). We also come here from
4851         the Perl \k<name> or \k'name' back reference syntax and the \k{name}
4852         .NET syntax, and the Oniguruma \g<...> and \g'...' subroutine syntax. */
4853 
4854         NAMED_REF_OR_RECURSE:
4855         name = ++ptr;
4856         while ((cd->ctypes[*ptr] & ctype_word) != 0) ptr++;
4857         namelen = ptr - name;
4858 
4859         /* In the pre-compile phase, do a syntax check and set a dummy
4860         reference number. */
4861 
4862         if (lengthptr != NULL)
4863           {
4864           if (namelen == 0)
4865             {
4866             *errorcodeptr = ERR62;
4867             goto FAILED;
4868             }
4869           if (*ptr != terminator)
4870             {
4871             *errorcodeptr = ERR42;
4872             goto FAILED;
4873             }
4874           if (namelen > MAX_NAME_SIZE)
4875             {
4876             *errorcodeptr = ERR48;
4877             goto FAILED;
4878             }
4879           recno = 0;
4880           }
4881 
4882         /* In the real compile, seek the name in the table. We check the name
4883         first, and then check that we have reached the end of the name in the
4884         table. That way, if the name is longer than any in the table,
4885         the comparison will fail without reading beyond the table entry. */
4886 
4887         else
4888           {
4889           slot = cd->name_table;
4890           for (i = 0; i < cd->names_found; i++)
4891             {
4892             if (strncmp((char *)name, (char *)slot+2, namelen) == 0 &&
4893                 slot[2+namelen] == 0)
4894               break;
4895             slot += cd->name_entry_size;
4896             }
4897 
4898           if (i < cd->names_found)         /* Back reference */
4899             {
4900             recno = GET2(slot, 0);
4901             }
4902           else if ((recno =                /* Forward back reference */
4903                     find_parens(cd, name, namelen,
4904                       (options & PCRE_EXTENDED) != 0)) <= 0)
4905             {
4906             *errorcodeptr = ERR15;
4907             goto FAILED;
4908             }
4909           }
4910 
4911         /* In both phases, we can now go to the code that handles numerical
4912         recursion or backreferences. */
4913 
4914         if (is_recurse) goto HANDLE_RECURSION;
4915           else goto HANDLE_REFERENCE;
4916 
4917 
4918         /* ------------------------------------------------------------ */
4919         case CHAR_R:              /* Recursion */
4920         ptr++;                    /* Same as (?0)      */
4921         /* Fall through */
4922 
4923 
4924         /* ------------------------------------------------------------ */
4925         case CHAR_MINUS: case CHAR_PLUS:  /* Recursion or subroutine */
4926         case CHAR_0: case CHAR_1: case CHAR_2: case CHAR_3: case CHAR_4:
4927         case CHAR_5: case CHAR_6: case CHAR_7: case CHAR_8: case CHAR_9:
4928           {
4929           const uschar *called;
4930           terminator = CHAR_RIGHT_PARENTHESIS;
4931 
4932           /* Come here from the \g<...> and \g'...' code (Oniguruma
4933           compatibility). However, the syntax has been checked to ensure that
4934           the ... are a (signed) number, so that neither ERR63 nor ERR29 will
4935           be called on this path, nor will the jump to OTHER_CHAR_AFTER_QUERY
4936           ever be taken. */
4937 
4938           HANDLE_NUMERICAL_RECURSION:
4939 
4940           if ((refsign = *ptr) == CHAR_PLUS)
4941             {
4942             ptr++;
4943             if ((digitab[*ptr] & ctype_digit) == 0)
4944               {
4945               *errorcodeptr = ERR63;
4946               goto FAILED;
4947               }
4948             }
4949           else if (refsign == CHAR_MINUS)
4950             {
4951             if ((digitab[ptr[1]] & ctype_digit) == 0)
4952               goto OTHER_CHAR_AFTER_QUERY;
4953             ptr++;
4954             }
4955 
4956           recno = 0;
4957           while((digitab[*ptr] & ctype_digit) != 0)
4958             recno = recno * 10 + *ptr++ - CHAR_0;
4959 
4960           if (*ptr != terminator)
4961             {
4962             *errorcodeptr = ERR29;
4963             goto FAILED;
4964             }
4965 
4966           if (refsign == CHAR_MINUS)
4967             {
4968             if (recno == 0)
4969               {
4970               *errorcodeptr = ERR58;
4971               goto FAILED;
4972               }
4973             recno = cd->bracount - recno + 1;
4974             if (recno <= 0)
4975               {
4976               *errorcodeptr = ERR15;
4977               goto FAILED;
4978               }
4979             }
4980           else if (refsign == CHAR_PLUS)
4981             {
4982             if (recno == 0)
4983               {
4984               *errorcodeptr = ERR58;
4985               goto FAILED;
4986               }
4987             recno += cd->bracount;
4988             }
4989 
4990           /* Come here from code above that handles a named recursion */
4991 
4992           HANDLE_RECURSION:
4993 
4994           previous = code;
4995           called = cd->start_code;
4996 
4997           /* When we are actually compiling, find the bracket that is being
4998           referenced. Temporarily end the regex in case it doesn't exist before
4999           this point. If we end up with a forward reference, first check that
5000           the bracket does occur later so we can give the error (and position)
5001           now. Then remember this forward reference in the workspace so it can
5002           be filled in at the end. */
5003 
5004           if (lengthptr == NULL)
5005             {
5006             *code = OP_END;
5007             if (recno != 0) called = find_bracket(cd->start_code, utf8, recno);
5008 
5009             /* Forward reference */
5010 
5011             if (called == NULL)
5012               {
5013               if (find_parens(cd, NULL, recno,
5014                     (options & PCRE_EXTENDED) != 0) < 0)
5015                 {
5016                 *errorcodeptr = ERR15;
5017                 goto FAILED;
5018                 }
5019               called = cd->start_code + recno;
5020               PUTINC(cd->hwm, 0, code + 2 + LINK_SIZE - cd->start_code);
5021               }
5022 
5023             /* If not a forward reference, and the subpattern is still open,
5024             this is a recursive call. We check to see if this is a left
5025             recursion that could loop for ever, and diagnose that case. */
5026 
5027             else if (GET(called, 1) == 0 &&
5028                      could_be_empty(called, code, bcptr, utf8))
5029               {
5030               *errorcodeptr = ERR40;
5031               goto FAILED;
5032               }
5033             }
5034 
5035           /* Insert the recursion/subroutine item, automatically wrapped inside
5036           "once" brackets. Set up a "previous group" length so that a
5037           subsequent quantifier will work. */
5038 
5039           *code = OP_ONCE;
5040           PUT(code, 1, 2 + 2*LINK_SIZE);
5041           code += 1 + LINK_SIZE;
5042 
5043           *code = OP_RECURSE;
5044           PUT(code, 1, called - cd->start_code);
5045           code += 1 + LINK_SIZE;
5046 
5047           *code = OP_KET;
5048           PUT(code, 1, 2 + 2*LINK_SIZE);
5049           code += 1 + LINK_SIZE;
5050 
5051           length_prevgroup = 3 + 3*LINK_SIZE;
5052           }
5053 
5054         /* Can't determine a first byte now */
5055 
5056         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5057         continue;
5058 
5059 
5060         /* ------------------------------------------------------------ */
5061         default:              /* Other characters: check option setting */
5062         OTHER_CHAR_AFTER_QUERY:
5063         set = unset = 0;
5064         optset = &set;
5065 
5066         while (*ptr != CHAR_RIGHT_PARENTHESIS && *ptr != CHAR_COLON)
5067           {
5068           switch (*ptr++)
5069             {
5070             case CHAR_MINUS: optset = &unset; break;
5071 
5072             case CHAR_J:    /* Record that it changed in the external options */
5073             *optset |= PCRE_DUPNAMES;
5074             cd->external_flags |= PCRE_JCHANGED;
5075             break;
5076 
5077             case CHAR_i: *optset |= PCRE_CASELESS; break;
5078             case CHAR_m: *optset |= PCRE_MULTILINE; break;
5079             case CHAR_s: *optset |= PCRE_DOTALL; break;
5080             case CHAR_x: *optset |= PCRE_EXTENDED; break;
5081             case CHAR_U: *optset |= PCRE_UNGREEDY; break;
5082             case CHAR_X: *optset |= PCRE_EXTRA; break;
5083 
5084             default:  *errorcodeptr = ERR12;
5085                       ptr--;    /* Correct the offset */
5086                       goto FAILED;
5087             }
5088           }
5089 
5090         /* Set up the changed option bits, but don't change anything yet. */
5091 
5092         newoptions = (options | set) & (~unset);
5093 
5094         /* If the options ended with ')' this is not the start of a nested
5095         group with option changes, so the options change at this level. If this
5096         item is right at the start of the pattern, the options can be
5097         abstracted and made external in the pre-compile phase, and ignored in
5098         the compile phase. This can be helpful when matching -- for instance in
5099         caseless checking of required bytes.
5100 
5101         If the code pointer is not (cd->start_code + 1 + LINK_SIZE), we are
5102         definitely *not* at the start of the pattern because something has been
5103         compiled. In the pre-compile phase, however, the code pointer can have
5104         that value after the start, because it gets reset as code is discarded
5105         during the pre-compile. However, this can happen only at top level - if
5106         we are within parentheses, the starting BRA will still be present. At
5107         any parenthesis level, the length value can be used to test if anything
5108         has been compiled at that level. Thus, a test for both these conditions
5109         is necessary to ensure we correctly detect the start of the pattern in
5110         both phases.
5111 
5112         If we are not at the pattern start, compile code to change the ims
5113         options if this setting actually changes any of them, and reset the
5114         greedy defaults and the case value for firstbyte and reqbyte. */
5115 
5116         if (*ptr == CHAR_RIGHT_PARENTHESIS)
5117           {
5118           if (code == cd->start_code + 1 + LINK_SIZE &&
5119                (lengthptr == NULL || *lengthptr == 2 + 2*LINK_SIZE))
5120             {
5121             cd->external_options = newoptions;
5122             }
5123          else
5124             {
5125             if ((options & PCRE_IMS) != (newoptions & PCRE_IMS))
5126               {
5127               *code++ = OP_OPT;
5128               *code++ = newoptions & PCRE_IMS;
5129               }
5130             greedy_default = ((newoptions & PCRE_UNGREEDY) != 0);
5131             greedy_non_default = greedy_default ^ 1;
5132             req_caseopt = ((newoptions & PCRE_CASELESS) != 0)? REQ_CASELESS : 0;
5133             }
5134 
5135           /* Change options at this level, and pass them back for use
5136           in subsequent branches. When not at the start of the pattern, this
5137           information is also necessary so that a resetting item can be
5138           compiled at the end of a group (if we are in a group). */
5139 
5140           *optionsptr = options = newoptions;
5141           previous = NULL;       /* This item can't be repeated */
5142           continue;              /* It is complete */
5143           }
5144 
5145         /* If the options ended with ':' we are heading into a nested group
5146         with possible change of options. Such groups are non-capturing and are
5147         not assertions of any kind. All we need to do is skip over the ':';
5148         the newoptions value is handled below. */
5149 
5150         bravalue = OP_BRA;
5151         ptr++;
5152         }     /* End of switch for character following (? */
5153       }       /* End of (? handling */
5154 
5155     /* Opening parenthesis not followed by '?'. If PCRE_NO_AUTO_CAPTURE is set,
5156     all unadorned brackets become non-capturing and behave like (?:...)
5157     brackets. */
5158 
5159     else if ((options & PCRE_NO_AUTO_CAPTURE) != 0)
5160       {
5161       bravalue = OP_BRA;
5162       }
5163 
5164     /* Else we have a capturing group. */
5165 
5166     else
5167       {
5168       NUMBERED_GROUP:
5169       cd->bracount += 1;
5170       PUT2(code, 1+LINK_SIZE, cd->bracount);
5171       skipbytes = 2;
5172       }
5173 
5174     /* Process nested bracketed regex. Assertions may not be repeated, but
5175     other kinds can be. All their opcodes are >= OP_ONCE. We copy code into a
5176     non-register variable in order to be able to pass its address because some
5177     compilers complain otherwise. Pass in a new setting for the ims options if
5178     they have changed. */
5179 
5180     previous = (bravalue >= OP_ONCE)? code : NULL;
5181     *code = bravalue;
5182     tempcode = code;
5183     tempreqvary = cd->req_varyopt;     /* Save value before bracket */
5184     length_prevgroup = 0;              /* Initialize for pre-compile phase */
5185 
5186     if (!compile_regex(
5187          newoptions,                   /* The complete new option state */
5188          options & PCRE_IMS,           /* The previous ims option state */
5189          &tempcode,                    /* Where to put code (updated) */
5190          &ptr,                         /* Input pointer (updated) */
5191          errorcodeptr,                 /* Where to put an error message */
5192          (bravalue == OP_ASSERTBACK ||
5193           bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
5194          reset_bracount,               /* True if (?| group */
5195          skipbytes,                    /* Skip over bracket number */
5196          &subfirstbyte,                /* For possible first char */
5197          &subreqbyte,                  /* For possible last char */
5198          bcptr,                        /* Current branch chain */
5199          cd,                           /* Tables block */
5200          (lengthptr == NULL)? NULL :   /* Actual compile phase */
5201            &length_prevgroup           /* Pre-compile phase */
5202          ))
5203       goto FAILED;
5204 
5205     /* At the end of compiling, code is still pointing to the start of the
5206     group, while tempcode has been updated to point past the end of the group
5207     and any option resetting that may follow it. The pattern pointer (ptr)
5208     is on the bracket. */
5209 
5210     /* If this is a conditional bracket, check that there are no more than
5211     two branches in the group, or just one if it's a DEFINE group. We do this
5212     in the real compile phase, not in the pre-pass, where the whole group may
5213     not be available. */
5214 
5215     if (bravalue == OP_COND && lengthptr == NULL)
5216       {
5217       uschar *tc = code;
5218       int condcount = 0;
5219 
5220       do {
5221          condcount++;
5222          tc += GET(tc,1);
5223          }
5224       while (*tc != OP_KET);
5225 
5226       /* A DEFINE group is never obeyed inline (the "condition" is always
5227       false). It must have only one branch. */
5228 
5229       if (code[LINK_SIZE+1] == OP_DEF)
5230         {
5231         if (condcount > 1)
5232           {
5233           *errorcodeptr = ERR54;
5234           goto FAILED;
5235           }
5236         bravalue = OP_DEF;   /* Just a flag to suppress char handling below */
5237         }
5238 
5239       /* A "normal" conditional group. If there is just one branch, we must not
5240       make use of its firstbyte or reqbyte, because this is equivalent to an
5241       empty second branch. */
5242 
5243       else
5244         {
5245         if (condcount > 2)
5246           {
5247           *errorcodeptr = ERR27;
5248           goto FAILED;
5249           }
5250         if (condcount == 1) subfirstbyte = subreqbyte = REQ_NONE;
5251         }
5252       }
5253 
5254     /* Error if hit end of pattern */
5255 
5256     if (*ptr != CHAR_RIGHT_PARENTHESIS)
5257       {
5258       *errorcodeptr = ERR14;
5259       goto FAILED;
5260       }
5261 
5262     /* In the pre-compile phase, update the length by the length of the group,
5263     less the brackets at either end. Then reduce the compiled code to just a
5264     set of non-capturing brackets so that it doesn't use much memory if it is
5265     duplicated by a quantifier.*/
5266 
5267     if (lengthptr != NULL)
5268       {
5269       if (OFLOW_MAX - *lengthptr < length_prevgroup - 2 - 2*LINK_SIZE)
5270         {
5271         *errorcodeptr = ERR20;
5272         goto FAILED;
5273         }
5274       *lengthptr += length_prevgroup - 2 - 2*LINK_SIZE;
5275       *code++ = OP_BRA;
5276       PUTINC(code, 0, 1 + LINK_SIZE);
5277       *code++ = OP_KET;
5278       PUTINC(code, 0, 1 + LINK_SIZE);
5279       break;    /* No need to waste time with special character handling */
5280       }
5281 
5282     /* Otherwise update the main code pointer to the end of the group. */
5283 
5284     code = tempcode;
5285 
5286     /* For a DEFINE group, required and first character settings are not
5287     relevant. */
5288 
5289     if (bravalue == OP_DEF) break;
5290 
5291     /* Handle updating of the required and first characters for other types of
5292     group. Update for normal brackets of all kinds, and conditions with two
5293     branches (see code above). If the bracket is followed by a quantifier with
5294     zero repeat, we have to back off. Hence the definition of zeroreqbyte and
5295     zerofirstbyte outside the main loop so that they can be accessed for the
5296     back off. */
5297 
5298     zeroreqbyte = reqbyte;
5299     zerofirstbyte = firstbyte;
5300     groupsetfirstbyte = FALSE;
5301 
5302     if (bravalue >= OP_ONCE)
5303       {
5304       /* If we have not yet set a firstbyte in this branch, take it from the
5305       subpattern, remembering that it was set here so that a repeat of more
5306       than one can replicate it as reqbyte if necessary. If the subpattern has
5307       no firstbyte, set "none" for the whole branch. In both cases, a zero
5308       repeat forces firstbyte to "none". */
5309 
5310       if (firstbyte == REQ_UNSET)
5311         {
5312         if (subfirstbyte >= 0)
5313           {
5314           firstbyte = subfirstbyte;
5315           groupsetfirstbyte = TRUE;
5316           }
5317         else firstbyte = REQ_NONE;
5318         zerofirstbyte = REQ_NONE;
5319         }
5320 
5321       /* If firstbyte was previously set, convert the subpattern's firstbyte
5322       into reqbyte if there wasn't one, using the vary flag that was in
5323       existence beforehand. */
5324 
5325       else if (subfirstbyte >= 0 && subreqbyte < 0)
5326         subreqbyte = subfirstbyte | tempreqvary;
5327 
5328       /* If the subpattern set a required byte (or set a first byte that isn't
5329       really the first byte - see above), set it. */
5330 
5331       if (subreqbyte >= 0) reqbyte = subreqbyte;
5332       }
5333 
5334     /* For a forward assertion, we take the reqbyte, if set. This can be
5335     helpful if the pattern that follows the assertion doesn't set a different
5336     char. For example, it's useful for /(?=abcde).+/. We can't set firstbyte
5337     for an assertion, however because it leads to incorrect effect for patterns
5338     such as /(?=a)a.+/ when the "real" "a" would then become a reqbyte instead
5339     of a firstbyte. This is overcome by a scan at the end if there's no
5340     firstbyte, looking for an asserted first char. */
5341 
5342     else if (bravalue == OP_ASSERT && subreqbyte >= 0) reqbyte = subreqbyte;
5343     break;     /* End of processing '(' */
5344 
5345 
5346     /* ===================================================================*/
5347     /* Handle metasequences introduced by \. For ones like \d, the ESC_ values
5348     are arranged to be the negation of the corresponding OP_values. For the
5349     back references, the values are ESC_REF plus the reference number. Only
5350     back references and those types that consume a character may be repeated.
5351     We can test for values between ESC_b and ESC_Z for the latter; this may
5352     have to change if any new ones are ever created. */
5353 
5354     case CHAR_BACKSLASH:
5355     tempptr = ptr;
5356     c = check_escape(&ptr, errorcodeptr, cd->bracount, options, FALSE);
5357     if (*errorcodeptr != 0) goto FAILED;
5358 
5359     if (c < 0)
5360       {
5361       if (-c == ESC_Q)            /* Handle start of quoted string */
5362         {
5363         if (ptr[1] == CHAR_BACKSLASH && ptr[2] == CHAR_E)
5364           ptr += 2;               /* avoid empty string */
5365             else inescq = TRUE;
5366         continue;
5367         }
5368 
5369       if (-c == ESC_E) continue;  /* Perl ignores an orphan \E */
5370 
5371       /* For metasequences that actually match a character, we disable the
5372       setting of a first character if it hasn't already been set. */
5373 
5374       if (firstbyte == REQ_UNSET && -c > ESC_b && -c < ESC_Z)
5375         firstbyte = REQ_NONE;
5376 
5377       /* Set values to reset to if this is followed by a zero repeat. */
5378 
5379       zerofirstbyte = firstbyte;
5380       zeroreqbyte = reqbyte;
5381 
5382       /* \g<name> or \g'name' is a subroutine call by name and \g<n> or \g'n'
5383       is a subroutine call by number (Oniguruma syntax). In fact, the value
5384       -ESC_g is returned only for these cases. So we don't need to check for <
5385       or ' if the value is -ESC_g. For the Perl syntax \g{n} the value is
5386       -ESC_REF+n, and for the Perl syntax \g{name} the result is -ESC_k (as
5387       that is a synonym for a named back reference). */
5388 
5389       if (-c == ESC_g)
5390         {
5391         const uschar *p;
5392         save_hwm = cd->hwm;   /* Normally this is set when '(' is read */
5393         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5394           CHAR_GREATER_THAN_SIGN : CHAR_APOSTROPHE;
5395 
5396         /* These two statements stop the compiler from warning about possibly
5397         unset variables caused by the jump to HANDLE_NUMERICAL_RECURSION. In
5398         fact, because we actually check for a number below, the paths that
5399         would actually be in error are never taken. */
5400 
5401         skipbytes = 0;
5402         reset_bracount = FALSE;
5403 
5404         /* Test for a name */
5405 
5406         if (ptr[1] != CHAR_PLUS && ptr[1] != CHAR_MINUS)
5407           {
5408           BOOL isnumber = TRUE;
5409           for (p = ptr + 1; *p != 0 && *p != terminator; p++)
5410             {
5411             if ((cd->ctypes[*p] & ctype_digit) == 0) isnumber = FALSE;
5412             if ((cd->ctypes[*p] & ctype_word) == 0) break;
5413             }
5414           if (*p != terminator)
5415             {
5416             *errorcodeptr = ERR57;
5417             break;
5418             }
5419           if (isnumber)
5420             {
5421             ptr++;
5422             goto HANDLE_NUMERICAL_RECURSION;
5423             }
5424           is_recurse = TRUE;
5425           goto NAMED_REF_OR_RECURSE;
5426           }
5427 
5428         /* Test a signed number in angle brackets or quotes. */
5429 
5430         p = ptr + 2;
5431         while ((digitab[*p] & ctype_digit) != 0) p++;
5432         if (*p != terminator)
5433           {
5434           *errorcodeptr = ERR57;
5435           break;
5436           }
5437         ptr++;
5438         goto HANDLE_NUMERICAL_RECURSION;
5439         }
5440 
5441       /* \k<name> or \k'name' is a back reference by name (Perl syntax).
5442       We also support \k{name} (.NET syntax) */
5443 
5444       if (-c == ESC_k && (ptr[1] == CHAR_LESS_THAN_SIGN ||
5445           ptr[1] == CHAR_APOSTROPHE || ptr[1] == CHAR_LEFT_CURLY_BRACKET))
5446         {
5447         is_recurse = FALSE;
5448         terminator = (*(++ptr) == CHAR_LESS_THAN_SIGN)?
5449           CHAR_GREATER_THAN_SIGN : (*ptr == CHAR_APOSTROPHE)?
5450           CHAR_APOSTROPHE : CHAR_RIGHT_CURLY_BRACKET;
5451         goto NAMED_REF_OR_RECURSE;
5452         }
5453 
5454       /* Back references are handled specially; must disable firstbyte if
5455       not set to cope with cases like (?=(\w+))\1: which would otherwise set
5456       ':' later. */
5457 
5458       if (-c >= ESC_REF)
5459         {
5460         recno = -c - ESC_REF;
5461 
5462         HANDLE_REFERENCE:    /* Come here from named backref handling */
5463         if (firstbyte == REQ_UNSET) firstbyte = REQ_NONE;
5464         previous = code;
5465         *code++ = OP_REF;
5466         PUT2INC(code, 0, recno);
5467         cd->backref_map |= (recno < 32)? (1 << recno) : 1;
5468         if (recno > cd->top_backref) cd->top_backref = recno;
5469         }
5470 
5471       /* So are Unicode property matches, if supported. */
5472 
5473 #ifdef SUPPORT_UCP
5474       else if (-c == ESC_P || -c == ESC_p)
5475         {
5476         BOOL negated;
5477         int pdata;
5478         int ptype = get_ucp(&ptr, &negated, &pdata, errorcodeptr);
5479         if (ptype < 0) goto FAILED;
5480         previous = code;
5481         *code++ = ((-c == ESC_p) != negated)? OP_PROP : OP_NOTPROP;
5482         *code++ = ptype;
5483         *code++ = pdata;
5484         }
5485 #else
5486 
5487       /* If Unicode properties are not supported, \X, \P, and \p are not
5488       allowed. */
5489 
5490       else if (-c == ESC_X || -c == ESC_P || -c == ESC_p)
5491         {
5492         *errorcodeptr = ERR45;
5493         goto FAILED;
5494         }
5495 #endif
5496 
5497       /* For the rest (including \X when Unicode properties are supported), we
5498       can obtain the OP value by negating the escape value. */
5499 
5500       else
5501         {
5502         previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
5503         *code++ = -c;
5504         }
5505       continue;
5506       }
5507 
5508     /* We have a data character whose value is in c. In UTF-8 mode it may have
5509     a value > 127. We set its representation in the length/buffer, and then
5510     handle it as a data character. */
5511 
5512 #ifdef SUPPORT_UTF8
5513     if (utf8 && c > 127)
5514       mclength = _pcre_ord2utf8(c, mcbuffer);
5515     else
5516 #endif
5517 
5518      {
5519      mcbuffer[0] = c;
5520      mclength = 1;
5521      }
5522     goto ONE_CHAR;
5523 
5524 
5525     /* ===================================================================*/
5526     /* Handle a literal character. It is guaranteed not to be whitespace or #
5527     when the extended flag is set. If we are in UTF-8 mode, it may be a
5528     multi-byte literal character. */
5529 
5530     default:
5531     NORMAL_CHAR:
5532     mclength = 1;
5533     mcbuffer[0] = c;
5534 
5535 #ifdef SUPPORT_UTF8
5536     if (utf8 && c >= 0xc0)
5537       {
5538       while ((ptr[1] & 0xc0) == 0x80)
5539         mcbuffer[mclength++] = *(++ptr);
5540       }
5541 #endif
5542 
5543     /* At this point we have the character's bytes in mcbuffer, and the length
5544     in mclength. When not in UTF-8 mode, the length is always 1. */
5545 
5546     ONE_CHAR:
5547     previous = code;
5548     *code++ = ((options & PCRE_CASELESS) != 0)? OP_CHARNC : OP_CHAR;
5549     for (c = 0; c < mclength; c++) *code++ = mcbuffer[c];
5550 
5551     /* Remember if \r or \n were seen */
5552 
5553     if (mcbuffer[0] == CHAR_CR || mcbuffer[0] == CHAR_NL)
5554       cd->external_flags |= PCRE_HASCRORLF;
5555 
5556     /* Set the first and required bytes appropriately. If no previous first
5557     byte, set it from this character, but revert to none on a zero repeat.
5558     Otherwise, leave the firstbyte value alone, and don't change it on a zero
5559     repeat. */
5560 
5561     if (firstbyte == REQ_UNSET)
5562       {
5563       zerofirstbyte = REQ_NONE;
5564       zeroreqbyte = reqbyte;
5565 
5566       /* If the character is more than one byte long, we can set firstbyte
5567       only if it is not to be matched caselessly. */
5568 
5569       if (mclength == 1 || req_caseopt == 0)
5570         {
5571         firstbyte = mcbuffer[0] | req_caseopt;
5572         if (mclength != 1) reqbyte = code[-1] | cd->req_varyopt;
5573         }
5574       else firstbyte = reqbyte = REQ_NONE;
5575       }
5576 
5577     /* firstbyte was previously set; we can set reqbyte only if the length is
5578     1 or the matching is caseful. */
5579 
5580     else
5581       {
5582       zerofirstbyte = firstbyte;
5583       zeroreqbyte = reqbyte;
5584       if (mclength == 1 || req_caseopt == 0)
5585         reqbyte = code[-1] | req_caseopt | cd->req_varyopt;
5586       }
5587 
5588     break;            /* End of literal character handling */
5589     }
5590   }                   /* end of big loop */
5591 
5592 
5593 /* Control never reaches here by falling through, only by a goto for all the
5594 error states. Pass back the position in the pattern so that it can be displayed
5595 to the user for diagnosing the error. */
5596 
5597 FAILED:
5598 *ptrptr = ptr;
5599 return FALSE;
5600 }
5601 
5602 
5603 
5604 
5605 /*************************************************
5606 *     Compile sequence of alternatives           *
5607 *************************************************/
5608 
5609 /* On entry, ptr is pointing past the bracket character, but on return it
5610 points to the closing bracket, or vertical bar, or end of string. The code
5611 variable is pointing at the byte into which the BRA operator has been stored.
5612 If the ims options are changed at the start (for a (?ims: group) or during any
5613 branch, we need to insert an OP_OPT item at the start of every following branch
5614 to ensure they get set correctly at run time, and also pass the new options
5615 into every subsequent branch compile.
5616 
5617 This function is used during the pre-compile phase when we are trying to find
5618 out the amount of memory needed, as well as during the real compile phase. The
5619 value of lengthptr distinguishes the two phases.
5620 
5621 Arguments:
5622   options        option bits, including any changes for this subpattern
5623   oldims         previous settings of ims option bits
5624   codeptr        -> the address of the current code pointer
5625   ptrptr         -> the address of the current pattern pointer
5626   errorcodeptr   -> pointer to error code variable
5627   lookbehind     TRUE if this is a lookbehind assertion
5628   reset_bracount TRUE to reset the count for each branch
5629   skipbytes      skip this many bytes at start (for brackets and OP_COND)
5630   firstbyteptr   place to put the first required character, or a negative number
5631   reqbyteptr     place to put the last required character, or a negative number
5632   bcptr          pointer to the chain of currently open branches
5633   cd             points to the data block with tables pointers etc.
5634   lengthptr      NULL during the real compile phase
5635                  points to length accumulator during pre-compile phase
5636 
5637 Returns:         TRUE on success
5638 */
5639 
5640 static BOOL
5641 compile_regex(int options, int oldims, uschar **codeptr, const uschar **ptrptr,
5642   int *errorcodeptr, BOOL lookbehind, BOOL reset_bracount, int skipbytes,
5643   int *firstbyteptr, int *reqbyteptr, branch_chain *bcptr, compile_data *cd,
5644   int *lengthptr)
5645 {
5646 const uschar *ptr = *ptrptr;
5647 uschar *code = *codeptr;
5648 uschar *last_branch = code;
5649 uschar *start_bracket = code;
5650 uschar *reverse_count = NULL;
5651 int firstbyte, reqbyte;
5652 int branchfirstbyte, branchreqbyte;
5653 int length;
5654 int orig_bracount;
5655 int max_bracount;
5656 branch_chain bc;
5657 
5658 bc.outer = bcptr;
5659 bc.current = code;
5660 
5661 firstbyte = reqbyte = REQ_UNSET;
5662 
5663 /* Accumulate the length for use in the pre-compile phase. Start with the
5664 length of the BRA and KET and any extra bytes that are required at the
5665 beginning. We accumulate in a local variable to save frequent testing of
5666 lenthptr for NULL. We cannot do this by looking at the value of code at the
5667 start and end of each alternative, because compiled items are discarded during
5668 the pre-compile phase so that the work space is not exceeded. */
5669 
5670 length = 2 + 2*LINK_SIZE + skipbytes;
5671 
5672 /* WARNING: If the above line is changed for any reason, you must also change
5673 the code that abstracts option settings at the start of the pattern and makes
5674 them global. It tests the value of length for (2 + 2*LINK_SIZE) in the
5675 pre-compile phase to find out whether anything has yet been compiled or not. */
5676 
5677 /* Offset is set zero to mark that this bracket is still open */
5678 
5679 PUT(code, 1, 0);
5680 code += 1 + LINK_SIZE + skipbytes;
5681 
5682 /* Loop for each alternative branch */
5683 
5684 orig_bracount = max_bracount = cd->bracount;
5685 for (;;)
5686   {
5687   /* For a (?| group, reset the capturing bracket count so that each branch
5688   uses the same numbers. */
5689 
5690   if (reset_bracount) cd->bracount = orig_bracount;
5691 
5692   /* Handle a change of ims options at the start of the branch */
5693 
5694   if ((options & PCRE_IMS) != oldims)
5695     {
5696     *code++ = OP_OPT;
5697     *code++ = options & PCRE_IMS;
5698     length += 2;
5699     }
5700 
5701   /* Set up dummy OP_REVERSE if lookbehind assertion */
5702 
5703   if (lookbehind)
5704     {
5705     *code++ = OP_REVERSE;
5706     reverse_count = code;
5707     PUTINC(code, 0, 0);
5708     length += 1 + LINK_SIZE;
5709     }
5710 
5711   /* Now compile the branch; in the pre-compile phase its length gets added
5712   into the length. */
5713 
5714   if (!compile_branch(&options, &code, &ptr, errorcodeptr, &branchfirstbyte,
5715         &branchreqbyte, &bc, cd, (lengthptr == NULL)? NULL : &length))
5716     {
5717     *ptrptr = ptr;
5718     return FALSE;
5719     }
5720 
5721   /* Keep the highest bracket count in case (?| was used and some branch
5722   has fewer than the rest. */
5723 
5724   if (cd->bracount > max_bracount) max_bracount = cd->bracount;
5725 
5726   /* In the real compile phase, there is some post-processing to be done. */
5727 
5728   if (lengthptr == NULL)
5729     {
5730     /* If this is the first branch, the firstbyte and reqbyte values for the
5731     branch become the values for the regex. */
5732 
5733     if (*last_branch != OP_ALT)
5734       {
5735       firstbyte = branchfirstbyte;
5736       reqbyte = branchreqbyte;
5737       }
5738 
5739     /* If this is not the first branch, the first char and reqbyte have to
5740     match the values from all the previous branches, except that if the
5741     previous value for reqbyte didn't have REQ_VARY set, it can still match,
5742     and we set REQ_VARY for the regex. */
5743 
5744     else
5745       {
5746       /* If we previously had a firstbyte, but it doesn't match the new branch,
5747       we have to abandon the firstbyte for the regex, but if there was
5748       previously no reqbyte, it takes on the value of the old firstbyte. */
5749 
5750       if (firstbyte >= 0 && firstbyte != branchfirstbyte)
5751         {
5752         if (reqbyte < 0) reqbyte = firstbyte;
5753         firstbyte = REQ_NONE;
5754         }
5755 
5756       /* If we (now or from before) have no firstbyte, a firstbyte from the
5757       branch becomes a reqbyte if there isn't a branch reqbyte. */
5758 
5759       if (firstbyte < 0 && branchfirstbyte >= 0 && branchreqbyte < 0)
5760           branchreqbyte = branchfirstbyte;
5761 
5762       /* Now ensure that the reqbytes match */
5763 
5764       if ((reqbyte & ~REQ_VARY) != (branchreqbyte & ~REQ_VARY))
5765         reqbyte = REQ_NONE;
5766       else reqbyte |= branchreqbyte;   /* To "or" REQ_VARY */
5767       }
5768 
5769     /* If lookbehind, check that this branch matches a fixed-length string, and
5770     put the length into the OP_REVERSE item. Temporarily mark the end of the
5771     branch with OP_END. */
5772 
5773     if (lookbehind)
5774       {
5775       int fixed_length;
5776       *code = OP_END;
5777       fixed_length = find_fixedlength(last_branch, options);
5778       DPRINTF(("fixed length = %d\n", fixed_length));
5779       if (fixed_length < 0)
5780         {
5781         *errorcodeptr = (fixed_length == -2)? ERR36 : ERR25;
5782         *ptrptr = ptr;
5783         return FALSE;
5784         }
5785       PUT(reverse_count, 0, fixed_length);
5786       }
5787     }
5788 
5789   /* Reached end of expression, either ')' or end of pattern. In the real
5790   compile phase, go back through the alternative branches and reverse the chain
5791   of offsets, with the field in the BRA item now becoming an offset to the
5792   first alternative. If there are no alternatives, it points to the end of the
5793   group. The length in the terminating ket is always the length of the whole
5794   bracketed item. If any of the ims options were changed inside the group,
5795   compile a resetting op-code following, except at the very end of the pattern.
5796   Return leaving the pointer at the terminating char. */
5797 
5798   if (*ptr != CHAR_VERTICAL_LINE)
5799     {
5800     if (lengthptr == NULL)
5801       {
5802       int branch_length = code - last_branch;
5803       do
5804         {
5805         int prev_length = GET(last_branch, 1);
5806         PUT(last_branch, 1, branch_length);
5807         branch_length = prev_length;
5808         last_branch -= branch_length;
5809         }
5810       while (branch_length > 0);
5811       }
5812 
5813     /* Fill in the ket */
5814 
5815     *code = OP_KET;
5816     PUT(code, 1, code - start_bracket);
5817     code += 1 + LINK_SIZE;
5818 
5819     /* Resetting option if needed */
5820 
5821     if ((options & PCRE_IMS) != oldims && *ptr == CHAR_RIGHT_PARENTHESIS)
5822       {
5823       *code++ = OP_OPT;
5824       *code++ = oldims;
5825       length += 2;
5826       }
5827 
5828     /* Retain the highest bracket number, in case resetting was used. */
5829 
5830     cd->bracount = max_bracount;
5831 
5832     /* Set values to pass back */
5833 
5834     *codeptr = code;
5835     *ptrptr = ptr;
5836     *firstbyteptr = firstbyte;
5837     *reqbyteptr = reqbyte;
5838     if (lengthptr != NULL)
5839       {
5840       if (OFLOW_MAX - *lengthptr < length)
5841         {
5842         *errorcodeptr = ERR20;
5843         return FALSE;
5844         }
5845       *lengthptr += length;
5846       }
5847     return TRUE;
5848     }
5849 
5850   /* Another branch follows. In the pre-compile phase, we can move the code
5851   pointer back to where it was for the start of the first branch. (That is,
5852   pretend that each branch is the only one.)
5853 
5854   In the real compile phase, insert an ALT node. Its length field points back
5855   to the previous branch while the bracket remains open. At the end the chain
5856   is reversed. It's done like this so that the start of the bracket has a
5857   zero offset until it is closed, making it possible to detect recursion. */
5858 
5859   if (lengthptr != NULL)
5860     {
5861     code = *codeptr + 1 + LINK_SIZE + skipbytes;
5862     length += 1 + LINK_SIZE;
5863     }
5864   else
5865     {
5866     *code = OP_ALT;
5867     PUT(code, 1, code - last_branch);
5868     bc.current = last_branch = code;
5869     code += 1 + LINK_SIZE;
5870     }
5871 
5872   ptr++;
5873   }
5874 /* Control never reaches here */
5875 }
5876 
5877 
5878 
5879 
5880 /*************************************************
5881 *          Check for anchored expression         *
5882 *************************************************/
5883 
5884 /* Try to find out if this is an anchored regular expression. Consider each
5885 alternative branch. If they all start with OP_SOD or OP_CIRC, or with a bracket
5886 all of whose alternatives start with OP_SOD or OP_CIRC (recurse ad lib), then
5887 it's anchored. However, if this is a multiline pattern, then only OP_SOD
5888 counts, since OP_CIRC can match in the middle.
5889 
5890 We can also consider a regex to be anchored if OP_SOM starts all its branches.
5891 This is the code for \G, which means "match at start of match position, taking
5892 into account the match offset".
5893 
5894 A branch is also implicitly anchored if it starts with .* and DOTALL is set,
5895 because that will try the rest of the pattern at all possible matching points,
5896 so there is no point trying again.... er ....
5897 
5898 .... except when the .* appears inside capturing parentheses, and there is a
5899 subsequent back reference to those parentheses. We haven't enough information
5900 to catch that case precisely.
5901 
5902 At first, the best we could do was to detect when .* was in capturing brackets
5903 and the highest back reference was greater than or equal to that level.
5904 However, by keeping a bitmap of the first 31 back references, we can catch some
5905 of the more common cases more precisely.
5906 
5907 Arguments:
5908   code           points to start of expression (the bracket)
5909   options        points to the options setting
5910   bracket_map    a bitmap of which brackets we are inside while testing; this
5911                   handles up to substring 31; after that we just have to take
5912                   the less precise approach
5913   backref_map    the back reference bitmap
5914 
5915 Returns:     TRUE or FALSE
5916 */
5917 
5918 static BOOL
5919 is_anchored(register const uschar *code, int *options, unsigned int bracket_map,
5920   unsigned int backref_map)
5921 {
5922 do {
5923    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
5924      options, PCRE_MULTILINE, FALSE);
5925    register int op = *scode;
5926 
5927    /* Non-capturing brackets */
5928 
5929    if (op == OP_BRA)
5930      {
5931      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5932      }
5933 
5934    /* Capturing brackets */
5935 
5936    else if (op == OP_CBRA)
5937      {
5938      int n = GET2(scode, 1+LINK_SIZE);
5939      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
5940      if (!is_anchored(scode, options, new_map, backref_map)) return FALSE;
5941      }
5942 
5943    /* Other brackets */
5944 
5945    else if (op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
5946      {
5947      if (!is_anchored(scode, options, bracket_map, backref_map)) return FALSE;
5948      }
5949 
5950    /* .* is not anchored unless DOTALL is set (which generates OP_ALLANY) and
5951    it isn't in brackets that are or may be referenced. */
5952 
5953    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR ||
5954              op == OP_TYPEPOSSTAR))
5955      {
5956      if (scode[1] != OP_ALLANY || (bracket_map & backref_map) != 0)
5957        return FALSE;
5958      }
5959 
5960    /* Check for explicit anchoring */
5961 
5962    else if (op != OP_SOD && op != OP_SOM &&
5963            ((*options & PCRE_MULTILINE) != 0 || op != OP_CIRC))
5964      return FALSE;
5965    code += GET(code, 1);
5966    }
5967 while (*code == OP_ALT);   /* Loop for each alternative */
5968 return TRUE;
5969 }
5970 
5971 
5972 
5973 /*************************************************
5974 *         Check for starting with ^ or .*        *
5975 *************************************************/
5976 
5977 /* This is called to find out if every branch starts with ^ or .* so that
5978 "first char" processing can be done to speed things up in multiline
5979 matching and for non-DOTALL patterns that start with .* (which must start at
5980 the beginning or after \n). As in the case of is_anchored() (see above), we
5981 have to take account of back references to capturing brackets that contain .*
5982 because in that case we can't make the assumption.
5983 
5984 Arguments:
5985   code           points to start of expression (the bracket)
5986   bracket_map    a bitmap of which brackets we are inside while testing; this
5987                   handles up to substring 31; after that we just have to take
5988                   the less precise approach
5989   backref_map    the back reference bitmap
5990 
5991 Returns:         TRUE or FALSE
5992 */
5993 
5994 static BOOL
5995 is_startline(const uschar *code, unsigned int bracket_map,
5996   unsigned int backref_map)
5997 {
5998 do {
5999    const uschar *scode = first_significant_code(code + _pcre_OP_lengths[*code],
6000      NULL, 0, FALSE);
6001    register int op = *scode;
6002 
6003    /* If we are at the start of a conditional assertion group, *both* the
6004    conditional assertion *and* what follows the condition must satisfy the test
6005    for start of line. Other kinds of condition fail. Note that there may be an
6006    auto-callout at the start of a condition. */
6007 
6008    if (op == OP_COND)
6009      {
6010      scode += 1 + LINK_SIZE;
6011      if (*scode == OP_CALLOUT) scode += _pcre_OP_lengths[OP_CALLOUT];
6012      switch (*scode)
6013        {
6014        case OP_CREF:
6015        case OP_RREF:
6016        case OP_DEF:
6017        return FALSE;
6018 
6019        default:     /* Assertion */
6020        if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6021        do scode += GET(scode, 1); while (*scode == OP_ALT);
6022        scode += 1 + LINK_SIZE;
6023        break;
6024        }
6025      scode = first_significant_code(scode, NULL, 0, FALSE);
6026      op = *scode;
6027      }
6028 
6029    /* Non-capturing brackets */
6030 
6031    if (op == OP_BRA)
6032      {
6033      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6034      }
6035 
6036    /* Capturing brackets */
6037 
6038    else if (op == OP_CBRA)
6039      {
6040      int n = GET2(scode, 1+LINK_SIZE);
6041      int new_map = bracket_map | ((n < 32)? (1 << n) : 1);
6042      if (!is_startline(scode, new_map, backref_map)) return FALSE;
6043      }
6044 
6045    /* Other brackets */
6046 
6047    else if (op == OP_ASSERT || op == OP_ONCE)
6048      {
6049      if (!is_startline(scode, bracket_map, backref_map)) return FALSE;
6050      }
6051 
6052    /* .* means "start at start or after \n" if it isn't in brackets that
6053    may be referenced. */
6054 
6055    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR || op == OP_TYPEPOSSTAR)
6056      {
6057      if (scode[1] != OP_ANY || (bracket_map & backref_map) != 0) return FALSE;
6058      }
6059 
6060    /* Check for explicit circumflex */
6061 
6062    else if (op != OP_CIRC) return FALSE;
6063 
6064    /* Move on to the next alternative */
6065 
6066    code += GET(code, 1);
6067    }
6068 while (*code == OP_ALT);  /* Loop for each alternative */
6069 return TRUE;
6070 }
6071 
6072 
6073 
6074 /*************************************************
6075 *       Check for asserted fixed first char      *
6076 *************************************************/
6077 
6078 /* During compilation, the "first char" settings from forward assertions are
6079 discarded, because they can cause conflicts with actual literals that follow.
6080 However, if we end up without a first char setting for an unanchored pattern,
6081 it is worth scanning the regex to see if there is an initial asserted first
6082 char. If all branches start with the same asserted char, or with a bracket all
6083 of whose alternatives start with the same asserted char (recurse ad lib), then
6084 we return that char, otherwise -1.
6085 
6086 Arguments:
6087   code       points to start of expression (the bracket)
6088   options    pointer to the options (used to check casing changes)
6089   inassert   TRUE if in an assertion
6090 
6091 Returns:     -1 or the fixed first char
6092 */
6093 
6094 static int
6095 find_firstassertedchar(const uschar *code, int *options, BOOL inassert)
6096 {
6097 register int c = -1;
6098 do {
6099    int d;
6100    const uschar *scode =
6101      first_significant_code(code + 1+LINK_SIZE, options, PCRE_CASELESS, TRUE);
6102    register int op = *scode;
6103 
6104    switch(op)
6105      {
6106      default:
6107      return -1;
6108 
6109      case OP_BRA:
6110      case OP_CBRA:
6111      case OP_ASSERT:
6112      case OP_ONCE:
6113      case OP_COND:
6114      if ((d = find_firstassertedchar(scode, options, op == OP_ASSERT)) < 0)
6115        return -1;
6116      if (c < 0) c = d; else if (c != d) return -1;
6117      break;
6118 
6119      case OP_EXACT:       /* Fall through */
6120      scode += 2;
6121 
6122      case OP_CHAR:
6123      case OP_CHARNC:
6124      case OP_PLUS:
6125      case OP_MINPLUS:
6126      case OP_POSPLUS:
6127      if (!inassert) return -1;
6128      if (c < 0)
6129        {
6130        c = scode[1];
6131        if ((*options & PCRE_CASELESS) != 0) c |= REQ_CASELESS;
6132        }
6133      else if (c != scode[1]) return -1;
6134      break;
6135      }
6136 
6137    code += GET(code, 1);
6138    }
6139 while (*code == OP_ALT);
6140 return c;
6141 }
6142 
6143 
6144 
6145 /*************************************************
6146 *        Compile a Regular Expression            *
6147 *************************************************/
6148 
6149 /* This function takes a string and returns a pointer to a block of store
6150 holding a compiled version of the expression. The original API for this
6151 function had no error code return variable; it is retained for backwards
6152 compatibility. The new function is given a new name.
6153 
6154 Arguments:
6155   pattern       the regular expression
6156   options       various option bits
6157   errorcodeptr  pointer to error code variable (pcre_compile2() only)
6158                   can be NULL if you don't want a code value
6159   errorptr      pointer to pointer to error text
6160   erroroffset   ptr offset in pattern where error was detected
6161   tables        pointer to character tables or NULL
6162 
6163 Returns:        pointer to compiled data block, or NULL on error,
6164                 with errorptr and erroroffset set
6165 */
6166 
6167 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6168 pcre_compile(const char *pattern, int options, const char **errorptr,
6169   int *erroroffset, const unsigned char *tables)
6170 {
6171 return pcre_compile2(pattern, options, NULL, errorptr, erroroffset, tables);
6172 }
6173 
6174 
/* Compile a pattern, optionally handing back a numeric error code through
errorcodeptr as well as the error text. The work is done in stages: check the
arguments and option bits; step over any leading "(*...)" settings; call
compile_regex() once into the on-stack workspace, with a non-NULL length
pointer, purely to find out how much memory is needed; get a block from
pcre_malloc(); call compile_regex() again to compile into that block; fill in
forward references; and finally record the anchoring, first-byte and
required-byte information in the block's header.

Returns the compiled block cast to (pcre *), or NULL on error. If errorptr is
NULL, the only thing reported is the value 99 in *errorcodeptr (when that
pointer is given). Otherwise *errorptr and, when given, *errorcodeptr are set,
and so is *erroroffset unless erroroffset itself is NULL. */

6175 PCRE_EXP_DEFN pcre * PCRE_CALL_CONVENTION
6176 pcre_compile2(const char *pattern, int options, int *errorcodeptr,
6177   const char **errorptr, int *erroroffset, const unsigned char *tables)
6178 {
6179 real_pcre *re;
6180 int length = 1;  /* For final END opcode */
6181 int firstbyte, reqbyte, newline;
6182 int errorcode = 0;
6183 int skipatstart = 0;
6184 #ifdef SUPPORT_UTF8
6185 BOOL utf8;
6186 #endif
6187 size_t size;
6188 uschar *code;
6189 const uschar *codestart;
6190 const uschar *ptr;
6191 compile_data compile_block;
6192 compile_data *cd = &compile_block;
6193 
6194 /* This space is used for "compiling" into during the first phase, when we are
6195 computing the amount of memory that is needed. Compiled items are thrown away
6196 as soon as possible, so that a fairly large buffer should be sufficient for
6197 this purpose. The same space is used in the second phase for remembering where
6198 to fill in forward references to subpatterns. */
6199 
6200 uschar cworkspace[COMPILE_WORK_SIZE];
6201 
6202 /* Set this early so that early errors get offset 0. */
6203 
6204 ptr = (const uschar *)pattern;
6205 
6206 /* We can't pass back an error message if errorptr is NULL; I guess the best we
6207 can do is just return NULL, but we can set a code value if there is a code
6208 pointer. */
6209 
6210 if (errorptr == NULL)
6211   {
6212   if (errorcodeptr != NULL) *errorcodeptr = 99;
6213   return NULL;
6214   }
6215 
6216 *errorptr = NULL;
6217 if (errorcodeptr != NULL) *errorcodeptr = ERR0;
6218 
6219 /* However, we can give a message for this error */
6220 
6221 if (erroroffset == NULL)
6222   {
6223   errorcode = ERR16;
  /* There is nowhere to store an offset, so use the exit that does not write
  through erroroffset. */
6224   goto PCRE_EARLY_ERROR_RETURN2;
6225   }
6226 
6227 *erroroffset = 0;
6228 
6229 /* Set up pointers to the individual character tables */
6230 
6231 if (tables == NULL) tables = _pcre_default_tables;
6232 cd->lcc = tables + lcc_offset;
6233 cd->fcc = tables + fcc_offset;
6234 cd->cbits = tables + cbits_offset;
6235 cd->ctypes = tables + ctypes_offset;
6236 
6237 /* Check that all undefined public option bits are zero */
6238 
6239 if ((options & ~PUBLIC_COMPILE_OPTIONS) != 0)
6240   {
6241   errorcode = ERR17;
6242   goto PCRE_EARLY_ERROR_RETURN;
6243   }
6244 
6245 /* Check for global one-time settings at the start of the pattern, and remember
6246 the offset for later. */
6247 
6248 while (ptr[skipatstart] == CHAR_LEFT_PARENTHESIS &&
6249        ptr[skipatstart+1] == CHAR_ASTERISK)
6250   {
6251   int newnl = 0;
6252   int newbsr = 0;
  /* Each comparison length below covers a keyword together with its closing
  parenthesis; on a match, skipatstart advances by that length plus 2 for the
  leading "(*". */
6253 
6254   if (strncmp((char *)(ptr+skipatstart+2), STRING_UTF8_RIGHTPAR, 5) == 0)
6255     { skipatstart += 7; options |= PCRE_UTF8; continue; }
6256 
6257   if (strncmp((char *)(ptr+skipatstart+2), STRING_CR_RIGHTPAR, 3) == 0)
6258     { skipatstart += 5; newnl = PCRE_NEWLINE_CR; }
6259   else if (strncmp((char *)(ptr+skipatstart+2), STRING_LF_RIGHTPAR, 3)  == 0)
6260     { skipatstart += 5; newnl = PCRE_NEWLINE_LF; }
6261   else if (strncmp((char *)(ptr+skipatstart+2), STRING_CRLF_RIGHTPAR, 5)  == 0)
6262     { skipatstart += 7; newnl = PCRE_NEWLINE_CR + PCRE_NEWLINE_LF; }
6263   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANY_RIGHTPAR, 4) == 0)
6264     { skipatstart += 6; newnl = PCRE_NEWLINE_ANY; }
6265   else if (strncmp((char *)(ptr+skipatstart+2), STRING_ANYCRLF_RIGHTPAR, 8) == 0)
6266     { skipatstart += 10; newnl = PCRE_NEWLINE_ANYCRLF; }
6267 
6268   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_ANYCRLF_RIGHTPAR, 12) == 0)
6269     { skipatstart += 14; newbsr = PCRE_BSR_ANYCRLF; }
6270   else if (strncmp((char *)(ptr+skipatstart+2), STRING_BSR_UNICODE_RIGHTPAR, 12) == 0)
6271     { skipatstart += 14; newbsr = PCRE_BSR_UNICODE; }
6272 
  /* A recognized setting replaces the corresponding option bits. If nothing
  was recognized, stop scanning; skipatstart has not moved past this item. */
6273   if (newnl != 0)
6274     options = (options & ~PCRE_NEWLINE_BITS) | newnl;
6275   else if (newbsr != 0)
6276     options = (options & ~(PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) | newbsr;
6277   else break;
6278   }
6279 
6280 /* Can't support UTF8 unless PCRE has been compiled to include the code. */
6281 
6282 #ifdef SUPPORT_UTF8
6283 utf8 = (options & PCRE_UTF8) != 0;
6284 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0 &&
6285      (*erroroffset = _pcre_valid_utf8((uschar *)pattern, -1)) >= 0)
6286   {
6287   errorcode = ERR44;
  /* *erroroffset already holds the value returned by _pcre_valid_utf8(), so
  use the exit that leaves it alone. */
6288   goto PCRE_EARLY_ERROR_RETURN2;
6289   }
6290 #else
6291 if ((options & PCRE_UTF8) != 0)
6292   {
6293   errorcode = ERR32;
6294   goto PCRE_EARLY_ERROR_RETURN;
6295   }
6296 #endif
6297 
6298 /* Check validity of \R options. */
6299 
6300 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
6301   {
6302   case 0:
6303   case PCRE_BSR_ANYCRLF:
6304   case PCRE_BSR_UNICODE:
6305   break;
6306   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6307   }
6308 
6309 /* Handle different types of newline. The three bits give seven cases. The
6310 current code allows for fixed one- or two-byte sequences, plus "any" and
6311 "anycrlf". */
6312 
6313 switch (options & PCRE_NEWLINE_BITS)
6314   {
6315   case 0: newline = NEWLINE; break;   /* Build-time default */
6316   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
6317   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
6318   case PCRE_NEWLINE_CR+
6319        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
6320   case PCRE_NEWLINE_ANY: newline = -1; break;
6321   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
6322   default: errorcode = ERR56; goto PCRE_EARLY_ERROR_RETURN;
6323   }
6324 
6325 if (newline == -2)
6326   {
6327   cd->nltype = NLTYPE_ANYCRLF;
6328   }
6329 else if (newline < 0)
6330   {
6331   cd->nltype = NLTYPE_ANY;
6332   }
6333 else
6334   {
6335   cd->nltype = NLTYPE_FIXED;
  /* A value above 255 is a two-byte sequence, with the first byte of the
  sequence held in the higher-order byte. */
6336   if (newline > 255)
6337     {
6338     cd->nllen = 2;
6339     cd->nl[0] = (newline >> 8) & 255;
6340     cd->nl[1] = newline & 255;
6341     }
6342   else
6343     {
6344     cd->nllen = 1;
6345     cd->nl[0] = newline;
6346     }
6347   }
6348 
6349 /* Maximum back reference and backref bitmap. The bitmap records up to 31 back
6350 references to help in deciding whether (.*) can be treated as anchored or not.
6351 */
6352 
6353 cd->top_backref = 0;
6354 cd->backref_map = 0;
6355 
6356 /* Reflect pattern for debugging output */
6357 
6358 DPRINTF(("------------------------------------------------------------------\n"));
6359 DPRINTF(("%s\n", pattern));
6360 
6361 /* Pretend to compile the pattern while actually just accumulating the length
6362 of memory required. This behaviour is triggered by passing a non-NULL final
6363 argument to compile_regex(). We pass a block of workspace (cworkspace) for it
6364 to compile parts of the pattern into; the compiled code is discarded when it is
6365 no longer needed, so hopefully this workspace will never overflow, though there
6366 is a test for its doing so. */
6367 
6368 cd->bracount = cd->final_bracount = 0;
6369 cd->names_found = 0;
6370 cd->name_entry_size = 0;
6371 cd->name_table = NULL;
6372 cd->start_workspace = cworkspace;
6373 cd->start_code = cworkspace;
6374 cd->hwm = cworkspace;
6375 cd->start_pattern = (const uschar *)pattern;
6376 cd->end_pattern = (const uschar *)(pattern + strlen(pattern));
6377 cd->req_varyopt = 0;
6378 cd->external_options = options;
6379 cd->external_flags = 0;
6380 
6381 /* Now do the pre-compile. On error, errorcode will be set non-zero, so we
6382 don't need to look at the result of the function here. The initial options have
6383 been put into the cd block so that they can be changed if an option setting is
6384 found within the regex right at the beginning. Bringing initial option settings
6385 outside can help speed up starting point checks. */
6386 
6387 ptr += skipatstart;
6388 code = cworkspace;
6389 *code = OP_BRA;
6390 (void)compile_regex(cd->external_options, cd->external_options & PCRE_IMS,
6391   &code, &ptr, &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd,
6392   &length);
6393 if (errorcode != 0) goto PCRE_EARLY_ERROR_RETURN;
6394 
6395 DPRINTF(("end pre-compile: length=%d workspace=%d\n", length,
6396   cd->hwm - cworkspace));
6397 
6398 if (length > MAX_PATTERN_SIZE)
6399   {
6400   errorcode = ERR20;
6401   goto PCRE_EARLY_ERROR_RETURN;
6402   }
6403 
6404 /* Compute the size of data block needed and get it, either from malloc or
6405 externally provided function. Integer overflow should no longer be possible
6406 because nowadays we limit the maximum value of cd->names_found and
6407 cd->name_entry_size. */
6408 
6409 size = length + sizeof(real_pcre) + cd->names_found * (cd->name_entry_size + 3);
6410 re = (real_pcre *)(pcre_malloc)(size);
6411 
6412 if (re == NULL)
6413   {
6414   errorcode = ERR21;
  /* re is NULL here; the label jumped to lies after the pcre_free() call. */
6415   goto PCRE_EARLY_ERROR_RETURN;
6416   }
6417 
6418 /* Put in the magic number, and save the sizes, initial options, internal
6419 flags, and character table pointer. NULL is used for the default character
6420 tables. The nullpad field is at the end; it's there to help in the case when a
6421 regex compiled on a system with 4-byte pointers is run on another with 8-byte
6422 pointers. */
6423 
6424 re->magic_number = MAGIC_NUMBER;
6425 re->size = size;
6426 re->options = cd->external_options;
6427 re->flags = cd->external_flags;
6428 re->dummy1 = 0;
6429 re->first_byte = 0;
6430 re->req_byte = 0;
6431 re->name_table_offset = sizeof(real_pcre);
6432 re->name_entry_size = cd->name_entry_size;
6433 re->name_count = cd->names_found;
6434 re->ref_count = 0;
6435 re->tables = (tables == _pcre_default_tables)? NULL : tables;
6436 re->nullpad = NULL;
6437 
6438 /* The starting points of the name/number translation table and of the code are
6439 passed around in the compile data block. The start/end pattern and initial
6440 options are already set from the pre-compile phase, as is the name_entry_size
6441 field. Reset the bracket count and the names_found field. Also reset the hwm
6442 field; this time it's used for remembering forward references to subpatterns.
6443 */
6444 
6445 cd->final_bracount = cd->bracount;  /* Save for checking forward references */
6446 cd->bracount = 0;
6447 cd->names_found = 0;
6448 cd->name_table = (uschar *)re + re->name_table_offset;
6449 codestart = cd->name_table + re->name_entry_size * re->name_count;
6450 cd->start_code = codestart;
6451 cd->hwm = cworkspace;
6452 cd->req_varyopt = 0;
6453 cd->had_accept = FALSE;
6454 
6455 /* Set up a starting, non-extracting bracket, then compile the expression. On
6456 error, errorcode will be set non-zero, so we don't need to look at the result
6457 of the function here. */
6458 
6459 ptr = (const uschar *)pattern + skipatstart;
6460 code = (uschar *)codestart;
6461 *code = OP_BRA;
6462 (void)compile_regex(re->options, re->options & PCRE_IMS, &code, &ptr,
6463   &errorcode, FALSE, FALSE, 0, &firstbyte, &reqbyte, NULL, cd, NULL);
6464 re->top_bracket = cd->bracount;
6465 re->top_backref = cd->top_backref;
6466 re->flags = cd->external_flags;
6467 
6468 if (cd->had_accept) reqbyte = -1;   /* Must disable after (*ACCEPT) */
6469 
6470 /* If not reached end of pattern on success, there's an excess bracket. */
6471 
6472 if (errorcode == 0 && *ptr != 0) errorcode = ERR22;
6473 
6474 /* Fill in the terminating state and check for disastrous overflow, but
6475 if debugging, leave the test till after things are printed out. */
6476 
6477 *code++ = OP_END;
6478 
6479 #ifndef DEBUG
6480 if (code - codestart > length) errorcode = ERR23;
6481 #endif
6482 
6483 /* Fill in any forward references that are required. */
6484 
/* Each workspace entry is the offset, within the compiled code, of a slot that
currently holds a group number. The number is replaced by the offset of the
group that find_bracket() locates; if there is no such group, it is an error. */
6485 while (errorcode == 0 && cd->hwm > cworkspace)
6486   {
6487   int offset, recno;
6488   const uschar *groupptr;
6489   cd->hwm -= LINK_SIZE;
6490   offset = GET(cd->hwm, 0);
6491   recno = GET(codestart, offset);
6492   groupptr = find_bracket(codestart, (re->options & PCRE_UTF8) != 0, recno);
6493   if (groupptr == NULL) errorcode = ERR53;
6494     else PUT(((uschar *)codestart), offset, groupptr - codestart);
6495   }
6496 
6497 /* Give an error if there's a back reference to a non-existent capturing
6498 subpattern. */
6499 
6500 if (errorcode == 0 && re->top_backref > re->top_bracket) errorcode = ERR15;
6501 
6502 /* Failed to compile, or error while post-processing */
6503 
/* Errors found after the block was obtained enter this statement from the top
and free the block. The two labels are entry points for failures that happen
before there is a block to free; the second label is for those that must not
write through erroroffset, or have already set *erroroffset themselves. */
6504 if (errorcode != 0)
6505   {
6506   (pcre_free)(re);
6507   PCRE_EARLY_ERROR_RETURN:
6508   *erroroffset = ptr - (const uschar *)pattern;
6509   PCRE_EARLY_ERROR_RETURN2:
6510   *errorptr = find_error_text(errorcode);
6511   if (errorcodeptr != NULL) *errorcodeptr = errorcode;
6512   return NULL;
6513   }
6514 
6515 /* If the anchored option was not passed, set the flag if we can determine that
6516 the pattern is anchored by virtue of ^ characters or \A or anything else (such
6517 as starting with .* when DOTALL is set).
6518 
6519 Otherwise, if we know what the first byte has to be, save it, because that
6520 speeds up unanchored matches no end. If not, see if we can set the
6521 PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
6522 start with ^, and also when all branches start with .* for non-DOTALL matches.
6523 */
6524 
6525 if ((re->options & PCRE_ANCHORED) == 0)
6526   {
6527   int temp_options = re->options;   /* May get changed during these scans */
6528   if (is_anchored(codestart, &temp_options, 0, cd->backref_map))
6529     re->options |= PCRE_ANCHORED;
6530   else
6531     {
6532     if (firstbyte < 0)
6533       firstbyte = find_firstassertedchar(codestart, &temp_options, FALSE);
6534     if (firstbyte >= 0)   /* Remove caseless flag for non-caseable chars */
6535       {
6536       int ch = firstbyte & 255;
6537       re->first_byte = ((firstbyte & REQ_CASELESS) != 0 &&
6538          cd->fcc[ch] == ch)? ch : firstbyte;
6539       re->flags |= PCRE_FIRSTSET;
6540       }
6541     else if (is_startline(codestart, 0, cd->backref_map))
6542       re->flags |= PCRE_STARTLINE;
6543     }
6544   }
6545 
6546 /* For an anchored pattern, we use the "required byte" only if it follows a
6547 variable length item in the regex. Remove the caseless flag for non-caseable
6548 bytes. */
6549 
6550 if (reqbyte >= 0 &&
6551      ((re->options & PCRE_ANCHORED) == 0 || (reqbyte & REQ_VARY) != 0))
6552   {
6553   int ch = reqbyte & 255;
6554   re->req_byte = ((reqbyte & REQ_CASELESS) != 0 &&
6555     cd->fcc[ch] == ch)? (reqbyte & ~REQ_CASELESS) : reqbyte;
6556   re->flags |= PCRE_REQCHSET;
6557   }
6558 
6559 /* Print out the compiled data if debugging is enabled. This is never the
6560 case when building a production library. */
6561 
6562 #ifdef DEBUG
6563 
6564 printf("Length = %d top_bracket = %d top_backref = %d\n",
6565   length, re->top_bracket, re->top_backref);
6566 
6567 printf("Options=%08x\n", re->options);
6568 
6569 if ((re->flags & PCRE_FIRSTSET) != 0)
6570   {
6571   int ch = re->first_byte & 255;
6572   const char *caseless = ((re->first_byte & REQ_CASELESS) == 0)?
6573     "" : " (caseless)";
6574   if (isprint(ch)) printf("First char = %c%s\n", ch, caseless);
6575     else printf("First char = \\x%02x%s\n", ch, caseless);
6576   }
6577 
6578 if ((re->flags & PCRE_REQCHSET) != 0)
6579   {
6580   int ch = re->req_byte & 255;
6581   const char *caseless = ((re->req_byte & REQ_CASELESS) == 0)?
6582     "" : " (caseless)";
6583   if (isprint(ch)) printf("Req char = %c%s\n", ch, caseless);
6584     else printf("Req char = \\x%02x%s\n", ch, caseless);
6585   }
6586 
6587 pcre_printint(re, stdout, TRUE);
6588 
6589 /* This check is done here in the debugging case so that the code that
6590 was compiled can be seen. */
6591 
6592 if (code - codestart > length)
6593   {
6594   (pcre_free)(re);
6595   *errorptr = find_error_text(ERR23);
6596   *erroroffset = ptr - (uschar *)pattern;
6597   if (errorcodeptr != NULL) *errorcodeptr = ERR23;
6598   return NULL;
6599   }
6600 #endif   /* DEBUG */
6601 
6602 return (pcre *)re;
6603 }
6604 
6605 /* End of pcre_compile.c */
6606 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.