NCBI C++ Toolkit Cross Reference

C++/src/util/regexp/pcre_dfa_exec.c


  1 /*************************************************
  2 *      Perl-Compatible Regular Expressions       *
  3 *************************************************/
  4 
  5 /* PCRE is a library of functions to support regular expressions whose syntax
  6 and semantics are as close as possible to those of the Perl 5 language (but see
  7 below for why this module is different).
  8 
  9                        Written by Philip Hazel
 10            Copyright (c) 1997-2009 University of Cambridge
 11 
 12 -----------------------------------------------------------------------------
 13 Redistribution and use in source and binary forms, with or without
 14 modification, are permitted provided that the following conditions are met:
 15 
 16     * Redistributions of source code must retain the above copyright notice,
 17       this list of conditions and the following disclaimer.
 18 
 19     * Redistributions in binary form must reproduce the above copyright
 20       notice, this list of conditions and the following disclaimer in the
 21       documentation and/or other materials provided with the distribution.
 22 
 23     * Neither the name of the University of Cambridge nor the names of its
 24       contributors may be used to endorse or promote products derived from
 25       this software without specific prior written permission.
 26 
 27 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 28 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 29 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 30 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 31 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 32 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 33 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 34 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 35 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 36 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 37 POSSIBILITY OF SUCH DAMAGE.
 38 -----------------------------------------------------------------------------
 39 */
 40 
 41 
 42 /* This module contains the external function pcre_dfa_exec(), which is an
 43 alternative matching function that uses a sort of DFA algorithm (not a true
 44 FSM). This is NOT Perl-compatible, but it has advantages in certain
 45 applications. */
 46 
 47 
 48 #ifdef HAVE_CONFIG_H
 49 #include "config.h"
 50 #endif
 51 
 52 #define NLBLOCK md             /* Block containing newline information */
 53 #define PSSTART start_subject  /* Field containing processed string start */
 54 #define PSEND   end_subject    /* Field containing processed string end */
 55 
 56 #include "pcre_internal.h"
 57 
 58 
 59 /* For use to indent debugging output */
 60 
 61 #define SP "                   "
 62 
 63 
 64 /*************************************************
 65 *      Code parameters and static tables         *
 66 *************************************************/
 67 
 68 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
 69 into others, under special conditions. A gap of 20 between the blocks should be
 70 enough. The resulting opcodes don't have to be less than 256 because they are
 71 never stored, so we push them well clear of the normal opcodes. */
 72 
 73 #define OP_PROP_EXTRA       300
 74 #define OP_EXTUNI_EXTRA     320
 75 #define OP_ANYNL_EXTRA      340
 76 #define OP_HSPACE_EXTRA     360
 77 #define OP_VSPACE_EXTRA     380
 78 
 79 
 80 /* This table identifies those opcodes that are followed immediately by a
 81 character that is to be tested in some way. This makes it possible to
 82 centralize the loading of these characters. In the case of Type * etc, the
 83 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
 84 small value. ***NOTE*** If the start of this table is modified, the two tables
 85 that follow must also be modified. */
 86 
 87 static const uschar coptable[] = {  /* [opcode] -> offset of inline char/type code; 0 = none */
 88   0,                             /* End                                    */
 89   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
 90   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
 91   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
 92   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
 93   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
 94   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
 95   1,                             /* Char                                   */
 96   1,                             /* Charnc                                 */
 97   1,                             /* not                                    */
 98   /* Positive single-char repeats                                          */
 99   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
100   3, 3, 3,                       /* upto, minupto, exact (char at +3)      */
101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
102   /* Negative single-char repeats - only for chars < 256                   */
103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
104   3, 3, 3,                       /* NOT upto, minupto, exact               */
105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, upto+                  */
106   /* Positive type repeats                                                 */
107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
108   3, 3, 3,                       /* Type upto, minupto, exact              */
109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
110   /* Character class & ref repeats                                         */
111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
113   0,                             /* CLASS                                  */
114   0,                             /* NCLASS                                 */
115   0,                             /* XCLASS - variable length               */
116   0,                             /* REF                                    */
117   0,                             /* RECURSE                                */
118   0,                             /* CALLOUT                                */
119   0,                             /* Alt                                    */
120   0,                             /* Ket                                    */
121   0,                             /* KetRmax                                */
122   0,                             /* KetRmin                                */
123   0,                             /* Assert                                 */
124   0,                             /* Assert not                             */
125   0,                             /* Assert behind                          */
126   0,                             /* Assert behind not                      */
127   0,                             /* Reverse                                */
128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
130   0,                             /* CREF                                   */
131   0,                             /* RREF                                   */
132   0,                             /* DEF                                    */
133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
136 };
137 
138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
139 and \w */
140 
141 static const uschar toptable1[] = {   /* [opcode] -> ctype bit ANDed with ctypes[c] */
142   0, 0, 0, 0, 0, 0,               /* slots for End, \A, \G, \K, \B, \b */
143   ctype_digit, ctype_digit,       /* \D, \d */
144   ctype_space, ctype_space,       /* \S, \s */
145   ctype_word,  ctype_word,        /* \W, \w */
146   0, 0                            /* OP_ANY, OP_ALLANY */
147 };
148 
149 static const uschar toptable2[] = {   /* [opcode] -> value XORed with the masked bit; nonzero result = match */
150   0, 0, 0, 0, 0, 0,               /* slots for End, \A, \G, \K, \B, \b */
151   ctype_digit, 0,                 /* \D: bit must be clear; \d: bit must be set */
152   ctype_space, 0,                 /* \S, \s: likewise */
153   ctype_word,  0,                 /* \W, \w: likewise */
154   1, 1                            /* OP_ANY, OP_ALLANY: 0 ^ 1, always nonzero */
155 };
156 
157 
158 /* Structure for holding data about a particular state, which is in effect the
159 current data for an active path through the match tree. It must consist
160 entirely of ints because the working vector we are passed, and which we put
161 these structures in, is a vector of ints. */
162 
typedef struct stateblock {
  int offset;                     /* Offset to opcode; a negative value means
                                     "hold this state off until the number of
                                     characters in data has been skipped" */
  int count;                      /* Count for repeats */
  int ims;                        /* ims flag bits */
  int data;                       /* Some use extra data (e.g. the number of
                                     characters still to skip) */
} stateblock;

/* Number of ints that one stateblock occupies in the workspace vector. The
quotient of two sizeofs has type size_t; casting it to int keeps the workspace
arithmetic that uses it (remainder and division on the signed wscount) in
signed int instead of silently converting wscount to unsigned. */

#define INTS_PER_STATEBLOCK  ((int)(sizeof(stateblock)/sizeof(int)))
171 
172 
173 #ifdef DEBUG
174 /*************************************************
175 *             Print character string             *
176 *************************************************/
177 
178 /* Character string printing function for debugging.
179 
180 Arguments:
181   p            points to string
182   length       number of bytes
183   f            where to print
184 
185 Returns:       nothing
186 */
187 
/* Write length bytes starting at p to f, one at a time: a byte for which
isprint() is true is written as itself, any other byte as \x followed by two
lower-case hex digits. The buffer is only read, so p is const-qualified and a
caller holding a const pointer need not cast the qualifier away. A zero or
negative length writes nothing. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;      /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fprintf(f, "%c", c);
  else
    fprintf(f, "\\x%02x", c);
  }
}
200 #endif
201 
202 
203 
204 /*************************************************
205 *    Execute a Regular Expression - DFA engine   *
206 *************************************************/
207 
208 /* This internal function applies a compiled pattern to a subject string,
209 starting at a given point, using a DFA engine. This function is called from the
210 external one, possibly multiple times if the pattern is not anchored. The
211 function calls itself recursively for some kinds of subpattern.
212 
213 Arguments:
214   md                the match_data block with fixed information
215   this_start_code   the opening bracket of this subexpression's code
216   current_subject   where we currently are in the subject string
217   start_offset      start offset in the subject string
218   offsets           vector to contain the matching string offsets
219   offsetcount       size of same
220   workspace         vector of workspace
221   wscount           size of same
222   ims               the current ims flags
223   rlevel            function call recursion level
224   recursing         regex recursive call level
225 
226 Returns:            > 0 => number of match offset pairs placed in offsets
227                     = 0 => offsets overflowed; longest matches are present
228                      -1 => failed to match
229                    < -1 => some kind of unexpected problem
230 
231 The following macros are used for adding states to the two state vectors (one
232 for the current character, one for the following character). */
233 
234 #define ADD_ACTIVE(x,y) /* append a state (offset x, count y) to the active list */ \
235   if (active_count++ < wscount) /* the counter is bumped even when the list is full */ \
236     { \
237     next_active_state->offset = (x); \
238     next_active_state->count  = (y); \
239     next_active_state->ims    = ims; /* ims of the enclosing function */ \
240     next_active_state++; /* the data field is not written by this macro */ \
241     DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
242     } \
243   else return PCRE_ERROR_DFA_WSSIZE /* no ';' here: the call site supplies it */
244 
245 #define ADD_ACTIVE_DATA(x,y,z) /* as ADD_ACTIVE, but also stores z in the data field */ \
246   if (active_count++ < wscount) /* the counter is bumped even when the list is full */ \
247     { \
248     next_active_state->offset = (x); \
249     next_active_state->count  = (y); \
250     next_active_state->ims    = ims; /* ims of the enclosing function */ \
251     next_active_state->data   = (z); \
252     next_active_state++; \
253     DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
254     } \
255   else return PCRE_ERROR_DFA_WSSIZE /* no ';' here: the call site supplies it */
256 
257 #define ADD_NEW(x,y) /* append a state (offset x, count y) to the new-state list */ \
258   if (new_count++ < wscount) /* the counter is bumped even when the list is full */ \
259     { \
260     next_new_state->offset = (x); \
261     next_new_state->count  = (y); \
262     next_new_state->ims    = ims; /* ims of the enclosing function */ \
263     next_new_state++; /* the data field is not written by this macro */ \
264     DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
265     } \
266   else return PCRE_ERROR_DFA_WSSIZE /* no ';' here: the call site supplies it */
267 
268 #define ADD_NEW_DATA(x,y,z) /* as ADD_NEW, but also stores z in the data field */ \
269   if (new_count++ < wscount) /* the counter is bumped even when the list is full */ \
270     { \
271     next_new_state->offset = (x); \
272     next_new_state->count  = (y); \
273     next_new_state->ims    = ims; /* ims of the enclosing function */ \
274     next_new_state->data   = (z); \
275     next_new_state++; \
276     DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
277     } \
278   else return PCRE_ERROR_DFA_WSSIZE /* no ';' here: the call site supplies it */
279 
280 /* And now, here is the code */
281 
282 static int
283 internal_dfa_exec(
284   dfa_match_data *md,
285   const uschar *this_start_code,
286   const uschar *current_subject,
287   int start_offset,
288   int *offsets,
289   int offsetcount,
290   int *workspace,
291   int wscount,
292   int ims,
293   int  rlevel,
294   int  recursing)
295 {
296 stateblock *active_states, *new_states, *temp_states;
297 stateblock *next_active_state, *next_new_state;
298 
299 const uschar *ctypes, *lcc, *fcc;
300 const uschar *ptr;
301 const uschar *end_code, *first_op;
302 
303 int active_count, new_count, match_count;
304 
305 /* Some fields in the md block are frequently referenced, so we load them into
306 independent variables in the hope that this will perform better. */
307 
308 const uschar *start_subject = md->start_subject;
309 const uschar *end_subject = md->end_subject;
310 const uschar *start_code = md->start_code;
311 
312 #ifdef SUPPORT_UTF8
313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
314 #else
315 BOOL utf8 = FALSE;
316 #endif
317 
318 rlevel++;
319 offsetcount &= (-2);
320 
321 wscount -= 2;
322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
323           (2 * INTS_PER_STATEBLOCK);
324 
325 DPRINTF(("\n%.*s---------------------\n"
326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
328 
329 ctypes = md->tables + ctypes_offset;
330 lcc = md->tables + lcc_offset;
331 fcc = md->tables + fcc_offset;
332 
333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
334 
335 active_states = (stateblock *)(workspace + 2);
336 next_new_state = new_states = active_states + wscount;
337 new_count = 0;
338 
339 first_op = this_start_code + 1 + LINK_SIZE +
340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
341 
342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
343 the alternative states onto the list, and find out where the end is. This
344 makes is possible to use this function recursively, when we want to stop at a
345 matching internal ket rather than at the end.
346 
347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
348 a backward assertion. In that case, we have to find out the maximum amount to
349 move back, and set up each alternative appropriately. */
350 
351 if (*first_op == OP_REVERSE)
352   {
353   int max_back = 0;
354   int gone_back;
355 
356   end_code = this_start_code;
357   do
358     {
359     int back = GET(end_code, 2+LINK_SIZE);
360     if (back > max_back) max_back = back;
361     end_code += GET(end_code, 1);
362     }
363   while (*end_code == OP_ALT);
364 
365   /* If we can't go back the amount required for the longest lookbehind
366   pattern, go back as far as we can; some alternatives may still be viable. */
367 
368 #ifdef SUPPORT_UTF8
369   /* In character mode we have to step back character by character */
370 
371   if (utf8)
372     {
373     for (gone_back = 0; gone_back < max_back; gone_back++)
374       {
375       if (current_subject <= start_subject) break;
376       current_subject--;
377       while (current_subject > start_subject &&
378              (*current_subject & 0xc0) == 0x80)
379         current_subject--;
380       }
381     }
382   else
383 #endif
384 
385   /* In byte-mode we can do this quickly. */
386 
387     {
388     gone_back = (current_subject - max_back < start_subject)?
389       current_subject - start_subject : max_back;
390     current_subject -= gone_back;
391     }
392 
393   /* Now we can process the individual branches. */
394 
395   end_code = this_start_code;
396   do
397     {
398     int back = GET(end_code, 2+LINK_SIZE);
399     if (back <= gone_back)
400       {
401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
403       }
404     end_code += GET(end_code, 1);
405     }
406   while (*end_code == OP_ALT);
407  }
408 
409 /* This is the code for a "normal" subpattern (not a backward assertion). The
410 start of a whole pattern is always one of these. If we are at the top level,
411 we may be asked to restart matching from the same point that we reached for a
412 previous partial match. We still have to scan through the top-level branches to
413 find the end state. */
414 
415 else
416   {
417   end_code = this_start_code;
418 
419   /* Restarting */
420 
421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
422     {
423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
424     new_count = workspace[1];
425     if (!workspace[0])
426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
427     }
428 
429   /* Not restarting */
430 
431   else
432     {
433     int length = 1 + LINK_SIZE +
434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
435     do
436       {
437       ADD_NEW(end_code - start_code + length, 0);
438       end_code += GET(end_code, 1);
439       length = 1 + LINK_SIZE;
440       }
441     while (*end_code == OP_ALT);
442     }
443   }
444 
445 workspace[0] = 0;    /* Bit indicating which vector is current */
446 
447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
448 
449 /* Loop for scanning the subject */
450 
451 ptr = current_subject;
452 for (;;)
453   {
454   int i, j;
455   int clen, dlen;
456   unsigned int c, d;
457 
458   /* Make the new state list into the active state list and empty the
459   new state list. */
460 
461   temp_states = active_states;
462   active_states = new_states;
463   new_states = temp_states;
464   active_count = new_count;
465   new_count = 0;
466 
467   workspace[0] ^= 1;              /* Remember for the restarting feature */
468   workspace[1] = active_count;
469 
470 #ifdef DEBUG
471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
473   printf("\"\n");
474 
475   printf("%.*sActive states: ", rlevel*2-2, SP);
476   for (i = 0; i < active_count; i++)
477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
478   printf("\n");
479 #endif
480 
481   /* Set the pointers for adding new states */
482 
483   next_active_state = active_states + active_count;
484   next_new_state = new_states;
485 
486   /* Load the current character from the subject outside the loop, as many
487   different states may want to look at it, and we assume that at least one
488   will. */
489 
490   if (ptr < end_subject)
491     {
492     clen = 1;        /* Number of bytes in the character */
493 #ifdef SUPPORT_UTF8
494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
495 #endif  /* SUPPORT_UTF8 */
496     c = *ptr;
497     }
498   else
499     {
500     clen = 0;        /* This indicates the end of the subject */
501     c = NOTACHAR;    /* This value should never actually be used */
502     }
503 
504   /* Scan up the active states and act on each one. The result of an action
505   may be to add more states to the currently active list (e.g. on hitting a
506   parenthesis) or it may be to put states on the new list, for considering
507   when we move the character pointer on. */
508 
509   for (i = 0; i < active_count; i++)
510     {
511     stateblock *current_state = active_states + i;
512     const uschar *code;
513     int state_offset = current_state->offset;
514     int count, codevalue, rrc;
515 
516 #ifdef DEBUG
517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
518     if (clen == 0) printf("EOL\n");
519       else if (c > 32 && c < 127) printf("'%c'\n", c);
520         else printf("0x%02x\n", c);
521 #endif
522 
523     /* This variable is referred to implicity in the ADD_xxx macros. */
524 
525     ims = current_state->ims;
526 
527     /* A negative offset is a special case meaning "hold off going to this
528     (negated) state until the number of characters in the data field have
529     been skipped". */
530 
531     if (state_offset < 0)
532       {
533       if (current_state->data > 0)
534         {
535         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
536         ADD_NEW_DATA(state_offset, current_state->count,
537           current_state->data - 1);
538         continue;
539         }
540       else
541         {
542         current_state->offset = state_offset = -state_offset;
543         }
544       }
545 
546     /* Check for a duplicate state with the same count, and skip if found. */
547 
548     for (j = 0; j < i; j++)
549       {
550       if (active_states[j].offset == state_offset &&
551           active_states[j].count == current_state->count)
552         {
553         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
554         goto NEXT_ACTIVE_STATE;
555         }
556       }
557 
558     /* The state offset is the offset to the opcode */
559 
560     code = start_code + state_offset;
561     codevalue = *code;
562 
563     /* If this opcode is followed by an inline character, load it. It is
564     tempting to test for the presence of a subject character here, but that
565     is wrong, because sometimes zero repetitions of the subject are
566     permitted.
567 
568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
569     argument that is not a data character - but is always one byte long. We
570     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
571     this case. To keep the other cases fast, convert these ones to new opcodes.
572     */
573 
574     if (coptable[codevalue] > 0)
575       {
576       dlen = 1;
577 #ifdef SUPPORT_UTF8
578       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
579 #endif  /* SUPPORT_UTF8 */
580       d = code[coptable[codevalue]];
581       if (codevalue >= OP_TYPESTAR)
582         {
583         switch(d)
584           {
585           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
586           case OP_NOTPROP:
587           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
588           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
589           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
590           case OP_NOT_HSPACE:
591           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
592           case OP_NOT_VSPACE:
593           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
594           default: break;
595           }
596         }
597       }
598     else
599       {
600       dlen = 0;         /* Not strictly necessary, but compilers moan */
601       d = NOTACHAR;     /* if these variables are not set. */
602       }
603 
604 
605     /* Now process the individual opcodes */
606 
607     switch (codevalue)
608       {
609 
610 /* ========================================================================== */
611       /* Reached a closing bracket. If not at the end of the pattern, carry
612       on with the next opcode. Otherwise, unless we have an empty string and
613       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
614       matches so we always have the longest first. */
615 
616       case OP_KET:
617       case OP_KETRMIN:
618       case OP_KETRMAX:
619       if (code != end_code)
620         {
621         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
622         if (codevalue != OP_KET)
623           {
624           ADD_ACTIVE(state_offset - GET(code, 1), 0);
625           }
626         }
627       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
628         {
629         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
630           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
631             match_count = 0;
632         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
633         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
634         if (offsetcount >= 2)
635           {
636           offsets[0] = current_subject - start_subject;
637           offsets[1] = ptr - start_subject;
638           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
639             offsets[1] - offsets[0], current_subject));
640           }
641         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
642           {
643           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
644             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
645             match_count, rlevel*2-2, SP));
646           return match_count;
647           }
648         }
649       break;
650 
651 /* ========================================================================== */
652       /* These opcodes add to the current list of states without looking
653       at the current character. */
654 
655       /*-----------------------------------------------------------------*/
656       case OP_ALT:
657       do { code += GET(code, 1); } while (*code == OP_ALT);
658       ADD_ACTIVE(code - start_code, 0);
659       break;
660 
661       /*-----------------------------------------------------------------*/
662       case OP_BRA:
663       case OP_SBRA:
664       do
665         {
666         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
667         code += GET(code, 1);
668         }
669       while (*code == OP_ALT);
670       break;
671 
672       /*-----------------------------------------------------------------*/
673       case OP_CBRA:
674       case OP_SCBRA:
675       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
676       code += GET(code, 1);
677       while (*code == OP_ALT)
678         {
679         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
680         code += GET(code, 1);
681         }
682       break;
683 
684       /*-----------------------------------------------------------------*/
685       case OP_BRAZERO:
686       case OP_BRAMINZERO:
687       ADD_ACTIVE(state_offset + 1, 0);
688       code += 1 + GET(code, 2);
689       while (*code == OP_ALT) code += GET(code, 1);
690       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
691       break;
692 
693       /*-----------------------------------------------------------------*/
694       case OP_SKIPZERO:
695       code += 1 + GET(code, 2);
696       while (*code == OP_ALT) code += GET(code, 1);
697       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
698       break;
699 
700       /*-----------------------------------------------------------------*/
701       case OP_CIRC:
702       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
703           ((ims & PCRE_MULTILINE) != 0 &&
704             ptr != end_subject &&
705             WAS_NEWLINE(ptr)))
706         { ADD_ACTIVE(state_offset + 1, 0); }
707       break;
708 
709       /*-----------------------------------------------------------------*/
710       case OP_EOD:
711       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
712       break;
713 
714       /*-----------------------------------------------------------------*/
715       case OP_OPT:
716       ims = code[1];
717       ADD_ACTIVE(state_offset + 2, 0);
718       break;
719 
720       /*-----------------------------------------------------------------*/
721       case OP_SOD:
722       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
723       break;
724 
725       /*-----------------------------------------------------------------*/
726       case OP_SOM:
727       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
728       break;
729 
730 
731 /* ========================================================================== */
732       /* These opcodes inspect the next subject character, and sometimes
733       the previous one as well, but do not have an argument. The variable
734       clen contains the length of the current character and is zero if we are
735       at the end of the subject. */
736 
737       /*-----------------------------------------------------------------*/
738       case OP_ANY:
739       if (clen > 0 && !IS_NEWLINE(ptr))
740         { ADD_NEW(state_offset + 1, 0); }
741       break;
742 
743       /*-----------------------------------------------------------------*/
744       case OP_ALLANY:
745       if (clen > 0)
746         { ADD_NEW(state_offset + 1, 0); }
747       break;
748 
749       /*-----------------------------------------------------------------*/
750       case OP_EODN:
751       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
752         { ADD_ACTIVE(state_offset + 1, 0); }
753       break;
754 
755       /*-----------------------------------------------------------------*/
756       case OP_DOLL:
757       if ((md->moptions & PCRE_NOTEOL) == 0)
758         {
759         if (clen == 0 ||
760             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
761                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
762             ))
763           { ADD_ACTIVE(state_offset + 1, 0); }
764         }
765       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
766         { ADD_ACTIVE(state_offset + 1, 0); }
767       break;
768 
769       /*-----------------------------------------------------------------*/
770 
771       case OP_DIGIT:
772       case OP_WHITESPACE:
773       case OP_WORDCHAR:
774       if (clen > 0 && c < 256 &&
775             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
776         { ADD_NEW(state_offset + 1, 0); }
777       break;
778 
779       /*-----------------------------------------------------------------*/
780       case OP_NOT_DIGIT:
781       case OP_NOT_WHITESPACE:
782       case OP_NOT_WORDCHAR:
783       if (clen > 0 && (c >= 256 ||
784             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
785         { ADD_NEW(state_offset + 1, 0); }
786       break;
787 
788       /*-----------------------------------------------------------------*/
789       case OP_WORD_BOUNDARY:
790       case OP_NOT_WORD_BOUNDARY:
791         {
792         int left_word, right_word;
793 
794         if (ptr > start_subject)
795           {
796           const uschar *temp = ptr - 1;
797 #ifdef SUPPORT_UTF8
798           if (utf8) BACKCHAR(temp);
799 #endif
800           GETCHARTEST(d, temp);
801           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
802           }
803         else left_word = 0;
804 
805         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
806           else right_word = 0;
807 
808         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
809           { ADD_ACTIVE(state_offset + 1, 0); }
810         }
811       break;
812 
813 
814       /*-----------------------------------------------------------------*/
815       /* Check the next character by Unicode property. We will get here only
816       if the support is in the binary; otherwise a compile-time error occurs.
817       */
818 
819 #ifdef SUPPORT_UCP
820       case OP_PROP:
821       case OP_NOTPROP:
822       if (clen > 0)
823         {
824         BOOL OK;
825         const ucd_record * prop = GET_UCD(c);
826         switch(code[1])
827           {
828           case PT_ANY:
829           OK = TRUE;
830           break;
831 
832           case PT_LAMP:
833           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
834           break;
835 
836           case PT_GC:
837           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
838           break;
839 
840           case PT_PC:
841           OK = prop->chartype == code[2];
842           break;
843 
844           case PT_SC:
845           OK = prop->script == code[2];
846           break;
847 
848           /* Should never occur, but keep compilers from grumbling. */
849 
850           default:
851           OK = codevalue != OP_PROP;
852           break;
853           }
854 
855         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
856         }
857       break;
858 #endif
859 
860 
861 
862 /* ========================================================================== */
863       /* These opcodes likewise inspect the subject character, but have an
864       argument that is not a data character. It is one of these opcodes:
865       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
866       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
867 
868       case OP_TYPEPLUS:
869       case OP_TYPEMINPLUS:
870       case OP_TYPEPOSPLUS:
871       count = current_state->count;  /* Already matched */
872       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
873       if (clen > 0)
874         {
875         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
876             (c < 256 &&
877               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
878               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
879           {
880           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
881             {
882             active_count--;            /* Remove non-match possibility */
883             next_active_state--;
884             }
885           count++;
886           ADD_NEW(state_offset, count);
887           }
888         }
889       break;
890 
891       /*-----------------------------------------------------------------*/
892       case OP_TYPEQUERY:
893       case OP_TYPEMINQUERY:
894       case OP_TYPEPOSQUERY:
895       ADD_ACTIVE(state_offset + 2, 0);
896       if (clen > 0)
897         {
898         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
899             (c < 256 &&
900               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
901               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
902           {
903           if (codevalue == OP_TYPEPOSQUERY)
904             {
905             active_count--;            /* Remove non-match possibility */
906             next_active_state--;
907             }
908           ADD_NEW(state_offset + 2, 0);
909           }
910         }
911       break;
912 
913       /*-----------------------------------------------------------------*/
914       case OP_TYPESTAR:
915       case OP_TYPEMINSTAR:
916       case OP_TYPEPOSSTAR:
917       ADD_ACTIVE(state_offset + 2, 0);
918       if (clen > 0)
919         {
920         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
921             (c < 256 &&
922               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
923               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
924           {
925           if (codevalue == OP_TYPEPOSSTAR)
926             {
927             active_count--;            /* Remove non-match possibility */
928             next_active_state--;
929             }
930           ADD_NEW(state_offset, 0);
931           }
932         }
933       break;
934 
935       /*-----------------------------------------------------------------*/
936       case OP_TYPEEXACT:
937       count = current_state->count;  /* Number already matched */
938       if (clen > 0)
939         {
940         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
941             (c < 256 &&
942               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
943               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
944           {
945           if (++count >= GET2(code, 1))
946             { ADD_NEW(state_offset + 4, 0); }
947           else
948             { ADD_NEW(state_offset, count); }
949           }
950         }
951       break;
952 
953       /*-----------------------------------------------------------------*/
954       case OP_TYPEUPTO:
955       case OP_TYPEMINUPTO:
956       case OP_TYPEPOSUPTO:
957       ADD_ACTIVE(state_offset + 4, 0);
958       count = current_state->count;  /* Number already matched */
959       if (clen > 0)
960         {
961         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
962             (c < 256 &&
963               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
964               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
965           {
966           if (codevalue == OP_TYPEPOSUPTO)
967             {
968             active_count--;           /* Remove non-match possibility */
969             next_active_state--;
970             }
971           if (++count >= GET2(code, 1))
972             { ADD_NEW(state_offset + 4, 0); }
973           else
974             { ADD_NEW(state_offset, count); }
975           }
976         }
977       break;
978 
979 /* ========================================================================== */
980       /* These are virtual opcodes that are used when something like
981       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a horizontal
982       or vertical whitespace opcode such as OP_HSPACE or OP_VSPACE as its
983       argument. It keeps the code above fast for the other cases. The argument is in the d variable. */
984 
985 #ifdef SUPPORT_UCP
986       case OP_PROP_EXTRA + OP_TYPEPLUS:
987       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
988       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
989       count = current_state->count;           /* Already matched */
990       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
991       if (clen > 0)
992         {
993         BOOL OK;
994         const ucd_record * prop = GET_UCD(c);
995         switch(code[2])
996           {
997           case PT_ANY:
998           OK = TRUE;
999           break;
1000 
1001           case PT_LAMP:
1002           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1003           break;
1004 
1005           case PT_GC:
1006           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1007           break;
1008 
1009           case PT_PC:
1010           OK = prop->chartype == code[3];
1011           break;
1012 
1013           case PT_SC:
1014           OK = prop->script == code[3];
1015           break;
1016 
1017           /* Should never occur, but keep compilers from grumbling. */
1018 
1019           default:
1020           OK = codevalue != OP_PROP;
1021           break;
1022           }
1023 
1024         if (OK == (d == OP_PROP))
1025           {
1026           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1027             {
1028             active_count--;           /* Remove non-match possibility */
1029             next_active_state--;
1030             }
1031           count++;
1032           ADD_NEW(state_offset, count);
1033           }
1034         }
1035       break;
1036 
1037       /*-----------------------------------------------------------------*/
1038       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
1039       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
1040       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
1041       count = current_state->count;  /* Already matched */
1042       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1043       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1044         {
1045         const uschar *nptr = ptr + clen;
1046         int ncount = 0;
1047         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1048           {
1049           active_count--;           /* Remove non-match possibility */
1050           next_active_state--;
1051           }
1052         while (nptr < end_subject)
1053           {
1054           int nd;
1055           int ndlen = 1;
1056           GETCHARLEN(nd, nptr, ndlen);
1057           if (UCD_CATEGORY(nd) != ucp_M) break;
1058           ncount++;
1059           nptr += ndlen;
1060           }
1061         count++;
1062         ADD_NEW_DATA(-state_offset, count, ncount);
1063         }
1064       break;
1065 #endif
1066 
1067       /*-----------------------------------------------------------------*/
1068       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
1069       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
1070       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
1071       count = current_state->count;  /* Already matched */
1072       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1073       if (clen > 0)
1074         {
1075         int ncount = 0;
1076         switch (c)
1077           {
1078           case 0x000b:
1079           case 0x000c:
1080           case 0x0085:
1081           case 0x2028:
1082           case 0x2029:
1083           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1084           goto ANYNL01;
1085 
1086           case 0x000d:
1087           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1088           /* Fall through */
1089 
1090           ANYNL01:
1091           case 0x000a:
1092           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1093             {
1094             active_count--;           /* Remove non-match possibility */
1095             next_active_state--;
1096             }
1097           count++;
1098           ADD_NEW_DATA(-state_offset, count, ncount);
1099           break;
1100 
1101           default:
1102           break;
1103           }
1104         }
1105       break;
1106 
1107       /*-----------------------------------------------------------------*/
1108       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
1109       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
1110       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
1111       count = current_state->count;  /* Already matched */
1112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1113       if (clen > 0)
1114         {
1115         BOOL OK;
1116         switch (c)
1117           {
1118           case 0x000a:
1119           case 0x000b:
1120           case 0x000c:
1121           case 0x000d:
1122           case 0x0085:
1123           case 0x2028:
1124           case 0x2029:
1125           OK = TRUE;
1126           break;
1127 
1128           default:
1129           OK = FALSE;
1130           break;
1131           }
1132 
1133         if (OK == (d == OP_VSPACE))
1134           {
1135           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1136             {
1137             active_count--;           /* Remove non-match possibility */
1138             next_active_state--;
1139             }
1140           count++;
1141           ADD_NEW_DATA(-state_offset, count, 0);
1142           }
1143         }
1144       break;
1145 
1146       /*-----------------------------------------------------------------*/
1147       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
1148       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
1149       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
1150       count = current_state->count;  /* Already matched */
1151       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1152       if (clen > 0)
1153         {
1154         BOOL OK;
1155         switch (c)
1156           {
1157           case 0x09:      /* HT */
1158           case 0x20:      /* SPACE */
1159           case 0xa0:      /* NBSP */
1160           case 0x1680:    /* OGHAM SPACE MARK */
1161           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1162           case 0x2000:    /* EN QUAD */
1163           case 0x2001:    /* EM QUAD */
1164           case 0x2002:    /* EN SPACE */
1165           case 0x2003:    /* EM SPACE */
1166           case 0x2004:    /* THREE-PER-EM SPACE */
1167           case 0x2005:    /* FOUR-PER-EM SPACE */
1168           case 0x2006:    /* SIX-PER-EM SPACE */
1169           case 0x2007:    /* FIGURE SPACE */
1170           case 0x2008:    /* PUNCTUATION SPACE */
1171           case 0x2009:    /* THIN SPACE */
1172           case 0x200A:    /* HAIR SPACE */
1173           case 0x202f:    /* NARROW NO-BREAK SPACE */
1174           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1175           case 0x3000:    /* IDEOGRAPHIC SPACE */
1176           OK = TRUE;
1177           break;
1178 
1179           default:
1180           OK = FALSE;
1181           break;
1182           }
1183 
1184         if (OK == (d == OP_HSPACE))
1185           {
1186           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1187             {
1188             active_count--;           /* Remove non-match possibility */
1189             next_active_state--;
1190             }
1191           count++;
1192           ADD_NEW_DATA(-state_offset, count, 0);
1193           }
1194         }
1195       break;
1196 
1197       /*-----------------------------------------------------------------*/
1198 #ifdef SUPPORT_UCP
1199       case OP_PROP_EXTRA + OP_TYPEQUERY:
1200       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
1201       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
1202       count = 4;
1203       goto QS1;
1204 
1205       case OP_PROP_EXTRA + OP_TYPESTAR:
1206       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
1207       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
1208       count = 0;
1209 
1210       QS1:
1211 
1212       ADD_ACTIVE(state_offset + 4, 0);
1213       if (clen > 0)
1214         {
1215         BOOL OK;
1216         const ucd_record * prop = GET_UCD(c);
1217         switch(code[2])
1218           {
1219           case PT_ANY:
1220           OK = TRUE;
1221           break;
1222 
1223           case PT_LAMP:
1224           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1225           break;
1226 
1227           case PT_GC:
1228           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
1229           break;
1230 
1231           case PT_PC:
1232           OK = prop->chartype == code[3];
1233           break;
1234 
1235           case PT_SC:
1236           OK = prop->script == code[3];
1237           break;
1238 
1239           /* Should never occur, but keep compilers from grumbling. */
1240 
1241           default:
1242           OK = codevalue != OP_PROP;
1243           break;
1244           }
1245 
1246         if (OK == (d == OP_PROP))
1247           {
1248           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1249               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1250             {
1251             active_count--;           /* Remove non-match possibility */
1252             next_active_state--;
1253             }
1254           ADD_NEW(state_offset + count, 0);
1255           }
1256         }
1257       break;
1258 
1259       /*-----------------------------------------------------------------*/
1260       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
1261       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
1262       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
1263       count = 2;
1264       goto QS2;
1265 
1266       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
1267       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
1268       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
1269       count = 0;
1270 
1271       QS2:
1272 
1273       ADD_ACTIVE(state_offset + 2, 0);
1274       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1275         {
1276         const uschar *nptr = ptr + clen;
1277         int ncount = 0;
1278         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1279             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1280           {
1281           active_count--;           /* Remove non-match possibility */
1282           next_active_state--;
1283           }
1284         while (nptr < end_subject)
1285           {
1286           int nd;
1287           int ndlen = 1;
1288           GETCHARLEN(nd, nptr, ndlen);
1289           if (UCD_CATEGORY(nd) != ucp_M) break;
1290           ncount++;
1291           nptr += ndlen;
1292           }
1293         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1294         }
1295       break;
1296 #endif
1297 
1298       /*-----------------------------------------------------------------*/
1299       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
1300       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
1301       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
1302       count = 2;
1303       goto QS3;
1304 
1305       case OP_ANYNL_EXTRA + OP_TYPESTAR:
1306       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
1307       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
1308       count = 0;
1309 
1310       QS3:
1311       ADD_ACTIVE(state_offset + 2, 0);
1312       if (clen > 0)
1313         {
1314         int ncount = 0;
1315         switch (c)
1316           {
1317           case 0x000b:
1318           case 0x000c:
1319           case 0x0085:
1320           case 0x2028:
1321           case 0x2029:
1322           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1323           goto ANYNL02;
1324 
1325           case 0x000d:
1326           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1327           /* Fall through */
1328 
1329           ANYNL02:
1330           case 0x000a:
1331           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1332               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1333             {
1334             active_count--;           /* Remove non-match possibility */
1335             next_active_state--;
1336             }
1337           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1338           break;
1339 
1340           default:
1341           break;
1342           }
1343         }
1344       break;
1345 
1346       /*-----------------------------------------------------------------*/
1347       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
1348       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
1349       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
1350       count = 2;
1351       goto QS4;
1352 
1353       case OP_VSPACE_EXTRA + OP_TYPESTAR:
1354       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
1355       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
1356       count = 0;
1357 
1358       QS4:
1359       ADD_ACTIVE(state_offset + 2, 0);
1360       if (clen > 0)
1361         {
1362         BOOL OK;
1363         switch (c)
1364           {
1365           case 0x000a:
1366           case 0x000b:
1367           case 0x000c:
1368           case 0x000d:
1369           case 0x0085:
1370           case 0x2028:
1371           case 0x2029:
1372           OK = TRUE;
1373           break;
1374 
1375           default:
1376           OK = FALSE;
1377           break;
1378           }
1379         if (OK == (d == OP_VSPACE))
1380           {
1381           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1382               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1383             {
1384             active_count--;           /* Remove non-match possibility */
1385             next_active_state--;
1386             }
1387           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1388           }
1389         }
1390       break;
1391 
1392       /*-----------------------------------------------------------------*/
1393       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
1394       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
1395       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
1396       count = 2;
1397       goto QS5;
1398 
1399       case OP_HSPACE_EXTRA + OP_TYPESTAR:
1400       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
1401       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
1402       count = 0;
1403 
1404       QS5:
1405       ADD_ACTIVE(state_offset + 2, 0);
1406       if (clen > 0)
1407         {
1408         BOOL OK;
1409         switch (c)
1410           {
1411           case 0x09:      /* HT */
1412           case 0x20:      /* SPACE */
1413           case 0xa0:      /* NBSP */
1414           case 0x1680:    /* OGHAM SPACE MARK */
1415           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1416           case 0x2000:    /* EN QUAD */
1417           case 0x2001:    /* EM QUAD */
1418           case 0x2002:    /* EN SPACE */
1419           case 0x2003:    /* EM SPACE */
1420           case 0x2004:    /* THREE-PER-EM SPACE */
1421           case 0x2005:    /* FOUR-PER-EM SPACE */
1422           case 0x2006:    /* SIX-PER-EM SPACE */
1423           case 0x2007:    /* FIGURE SPACE */
1424           case 0x2008:    /* PUNCTUATION SPACE */
1425           case 0x2009:    /* THIN SPACE */
1426           case 0x200A:    /* HAIR SPACE */
1427           case 0x202f:    /* NARROW NO-BREAK SPACE */
1428           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1429           case 0x3000:    /* IDEOGRAPHIC SPACE */
1430           OK = TRUE;
1431           break;
1432 
1433           default:
1434           OK = FALSE;
1435           break;
1436           }
1437 
1438         if (OK == (d == OP_HSPACE))
1439           {
1440           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1441               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1442             {
1443             active_count--;           /* Remove non-match possibility */
1444             next_active_state--;
1445             }
1446           ADD_NEW_DATA(-(state_offset + count), 0, 0);
1447           }
1448         }
1449       break;
1450 
1451       /*-----------------------------------------------------------------*/
1452 #ifdef SUPPORT_UCP
1453       case OP_PROP_EXTRA + OP_TYPEEXACT:
1454       case OP_PROP_EXTRA + OP_TYPEUPTO:
1455       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
1456       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
1457       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
1458         { ADD_ACTIVE(state_offset + 6, 0); }
1459       count = current_state->count;  /* Number already matched */
1460       if (clen > 0)
1461         {
1462         BOOL OK;
1463         const ucd_record * prop = GET_UCD(c);
1464         switch(code[4])
1465           {
1466           case PT_ANY:
1467           OK = TRUE;
1468           break;
1469 
1470           case PT_LAMP:
1471           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
1472           break;
1473 
1474           case PT_GC:
1475           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
1476           break;
1477 
1478           case PT_PC:
1479           OK = prop->chartype == code[5];
1480           break;
1481 
1482           case PT_SC:
1483           OK = prop->script == code[5];
1484           break;
1485 
1486           /* Should never occur, but keep compilers from grumbling. */
1487 
1488           default:
1489           OK = codevalue != OP_PROP;
1490           break;
1491           }
1492 
1493         if (OK == (d == OP_PROP))
1494           {
1495           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
1496             {
1497             active_count--;           /* Remove non-match possibility */
1498             next_active_state--;
1499             }
1500           if (++count >= GET2(code, 1))
1501             { ADD_NEW(state_offset + 6, 0); }
1502           else
1503             { ADD_NEW(state_offset, count); }
1504           }
1505         }
1506       break;
1507 
1508       /*-----------------------------------------------------------------*/
1509       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
1510       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
1511       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
1512       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
1513       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
1514         { ADD_ACTIVE(state_offset + 4, 0); }
1515       count = current_state->count;  /* Number already matched */
1516       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1517         {
1518         const uschar *nptr = ptr + clen;
1519         int ncount = 0;
1520         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
1521           {
1522           active_count--;           /* Remove non-match possibility */
1523           next_active_state--;
1524           }
1525         while (nptr < end_subject)
1526           {
1527           int nd;
1528           int ndlen = 1;
1529           GETCHARLEN(nd, nptr, ndlen);
1530           if (UCD_CATEGORY(nd) != ucp_M) break;
1531           ncount++;
1532           nptr += ndlen;
1533           }
1534         if (++count >= GET2(code, 1))
1535           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1536         else
1537           { ADD_NEW_DATA(-state_offset, count, ncount); }
1538         }
1539       break;
1540 #endif
1541 
1542       /*-----------------------------------------------------------------*/
1543       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
1544       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
1545       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
1546       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
1547       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
1548         { ADD_ACTIVE(state_offset + 4, 0); }
1549       count = current_state->count;  /* Number already matched */
1550       if (clen > 0)
1551         {
1552         int ncount = 0;
1553         switch (c)
1554           {
1555           case 0x000b:
1556           case 0x000c:
1557           case 0x0085:
1558           case 0x2028:
1559           case 0x2029:
1560           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1561           goto ANYNL03;
1562 
1563           case 0x000d:
1564           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
1565           /* Fall through */
1566 
1567           ANYNL03:
1568           case 0x000a:
1569           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
1570             {
1571             active_count--;           /* Remove non-match possibility */
1572             next_active_state--;
1573             }
1574           if (++count >= GET2(code, 1))
1575             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
1576           else
1577             { ADD_NEW_DATA(-state_offset, count, ncount); }
1578           break;
1579 
1580           default:
1581           break;
1582           }
1583         }
1584       break;
1585 
1586       /*-----------------------------------------------------------------*/
1587       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
1588       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
1589       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
1590       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
1591       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
1592         { ADD_ACTIVE(state_offset + 4, 0); }
1593       count = current_state->count;  /* Number already matched */
1594       if (clen > 0)
1595         {
1596         BOOL OK;
1597         switch (c)
1598           {
1599           case 0x000a:
1600           case 0x000b:
1601           case 0x000c:
1602           case 0x000d:
1603           case 0x0085:
1604           case 0x2028:
1605           case 0x2029:
1606           OK = TRUE;
1607           break;
1608 
1609           default:
1610           OK = FALSE;
1611           }
1612 
1613         if (OK == (d == OP_VSPACE))
1614           {
1615           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
1616             {
1617             active_count--;           /* Remove non-match possibility */
1618             next_active_state--;
1619             }
1620           if (++count >= GET2(code, 1))
1621             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1622           else
1623             { ADD_NEW_DATA(-state_offset, count, 0); }
1624           }
1625         }
1626       break;
1627 
1628       /*-----------------------------------------------------------------*/
1629       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
1630       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
1631       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
1632       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
1633       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
1634         { ADD_ACTIVE(state_offset + 4, 0); }
1635       count = current_state->count;  /* Number already matched */
1636       if (clen > 0)
1637         {
1638         BOOL OK;
1639         switch (c)
1640           {
1641           case 0x09:      /* HT */
1642           case 0x20:      /* SPACE */
1643           case 0xa0:      /* NBSP */
1644           case 0x1680:    /* OGHAM SPACE MARK */
1645           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1646           case 0x2000:    /* EN QUAD */
1647           case 0x2001:    /* EM QUAD */
1648           case 0x2002:    /* EN SPACE */
1649           case 0x2003:    /* EM SPACE */
1650           case 0x2004:    /* THREE-PER-EM SPACE */
1651           case 0x2005:    /* FOUR-PER-EM SPACE */
1652           case 0x2006:    /* SIX-PER-EM SPACE */
1653           case 0x2007:    /* FIGURE SPACE */
1654           case 0x2008:    /* PUNCTUATION SPACE */
1655           case 0x2009:    /* THIN SPACE */
1656           case 0x200A:    /* HAIR SPACE */
1657           case 0x202f:    /* NARROW NO-BREAK SPACE */
1658           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1659           case 0x3000:    /* IDEOGRAPHIC SPACE */
1660           OK = TRUE;
1661           break;
1662 
1663           default:
1664           OK = FALSE;
1665           break;
1666           }
1667 
1668         if (OK == (d == OP_HSPACE))
1669           {
1670           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
1671             {
1672             active_count--;           /* Remove non-match possibility */
1673             next_active_state--;
1674             }
1675           if (++count >= GET2(code, 1))
1676             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
1677           else
1678             { ADD_NEW_DATA(-state_offset, count, 0); }
1679           }
1680         }
1681       break;
1682 
1683 /* ========================================================================== */
1684       /* These opcodes are followed by a character that is usually compared
1685       to the current subject character; it is loaded into d. We still get
1686       here even if there is no subject character, because in some cases zero
1687       repetitions are permitted. */
1688 
1689       /*-----------------------------------------------------------------*/
1690       case OP_CHAR:
1691       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
1692       break;
1693 
1694       /*-----------------------------------------------------------------*/
1695       case OP_CHARNC:
1696       if (clen == 0) break;
1697 
1698 #ifdef SUPPORT_UTF8
1699       if (utf8)
1700         {
1701         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
1702           {
1703           unsigned int othercase;
1704           if (c < 128) othercase = fcc[c]; else
1705 
1706           /* If we have Unicode property support, we can use it to test the
1707           other case of the character. */
1708 
1709 #ifdef SUPPORT_UCP
1710           othercase = UCD_OTHERCASE(c);
1711 #else
1712           othercase = NOTACHAR;
1713 #endif
1714 
1715           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
1716           }
1717         }
1718       else
1719 #endif  /* SUPPORT_UTF8 */
1720 
1721       /* Non-UTF-8 mode */
1722         {
1723         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
1724         }
1725       break;
1726 
1727 
1728 #ifdef SUPPORT_UCP
1729       /*-----------------------------------------------------------------*/
1730       /* This is a tricky one because it can match more than one character.
1731       Find out how many characters to skip, and then set up a negative state
1732       to wait for them to pass before continuing. */
1733 
1734       case OP_EXTUNI:
1735       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
1736         {
1737         const uschar *nptr = ptr + clen;
1738         int ncount = 0;
1739         while (nptr < end_subject)
1740           {
1741           int nclen = 1;
1742           GETCHARLEN(c, nptr, nclen);
1743           if (UCD_CATEGORY(c) != ucp_M) break;
1744           ncount++;
1745           nptr += nclen;
1746           }
1747         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
1748         }
1749       break;
1750 #endif
1751 
1752       /*-----------------------------------------------------------------*/
1753       /* This is tricky like EXTUNI because it too can match more than one
1754       character (when CR is followed by LF). In this case, set up a negative
1755       state to wait for one character to pass before continuing. */
1756 
1757       case OP_ANYNL:
1758       if (clen > 0) switch(c)
1759         {
1760         case 0x000b:
1761         case 0x000c:
1762         case 0x0085:
1763         case 0x2028:
1764         case 0x2029:
1765         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
1766 
1767         case 0x000a:
1768         ADD_NEW(state_offset + 1, 0);
1769         break;
1770 
1771         case 0x000d:
1772         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
1773           {
1774           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1775           }
1776         else
1777           {
1778           ADD_NEW(state_offset + 1, 0);
1779           }
1780         break;
1781         }
1782       break;
1783 
1784       /*-----------------------------------------------------------------*/
1785       case OP_NOT_VSPACE:
1786       if (clen > 0) switch(c)
1787         {
1788         case 0x000a:
1789         case 0x000b:
1790         case 0x000c:
1791         case 0x000d:
1792         case 0x0085:
1793         case 0x2028:
1794         case 0x2029:
1795         break;
1796 
1797         default:
1798         ADD_NEW(state_offset + 1, 0);
1799         break;
1800         }
1801       break;
1802 
1803       /*-----------------------------------------------------------------*/
1804       case OP_VSPACE:
1805       if (clen > 0) switch(c)
1806         {
1807         case 0x000a:
1808         case 0x000b:
1809         case 0x000c:
1810         case 0x000d:
1811         case 0x0085:
1812         case 0x2028:
1813         case 0x2029:
1814         ADD_NEW(state_offset + 1, 0);
1815         break;
1816 
1817         default: break;
1818         }
1819       break;
1820 
1821       /*-----------------------------------------------------------------*/
1822       case OP_NOT_HSPACE:
1823       if (clen > 0) switch(c)
1824         {
1825         case 0x09:      /* HT */
1826         case 0x20:      /* SPACE */
1827         case 0xa0:      /* NBSP */
1828         case 0x1680:    /* OGHAM SPACE MARK */
1829         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1830         case 0x2000:    /* EN QUAD */
1831         case 0x2001:    /* EM QUAD */
1832         case 0x2002:    /* EN SPACE */
1833         case 0x2003:    /* EM SPACE */
1834         case 0x2004:    /* THREE-PER-EM SPACE */
1835         case 0x2005:    /* FOUR-PER-EM SPACE */
1836         case 0x2006:    /* SIX-PER-EM SPACE */
1837         case 0x2007:    /* FIGURE SPACE */
1838         case 0x2008:    /* PUNCTUATION SPACE */
1839         case 0x2009:    /* THIN SPACE */
1840         case 0x200A:    /* HAIR SPACE */
1841         case 0x202f:    /* NARROW NO-BREAK SPACE */
1842         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1843         case 0x3000:    /* IDEOGRAPHIC SPACE */
1844         break;
1845 
1846         default:
1847         ADD_NEW(state_offset + 1, 0);
1848         break;
1849         }
1850       break;
1851 
1852       /*-----------------------------------------------------------------*/
1853       case OP_HSPACE:
1854       if (clen > 0) switch(c)
1855         {
1856         case 0x09:      /* HT */
1857         case 0x20:      /* SPACE */
1858         case 0xa0:      /* NBSP */
1859         case 0x1680:    /* OGHAM SPACE MARK */
1860         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
1861         case 0x2000:    /* EN QUAD */
1862         case 0x2001:    /* EM QUAD */
1863         case 0x2002:    /* EN SPACE */
1864         case 0x2003:    /* EM SPACE */
1865         case 0x2004:    /* THREE-PER-EM SPACE */
1866         case 0x2005:    /* FOUR-PER-EM SPACE */
1867         case 0x2006:    /* SIX-PER-EM SPACE */
1868         case 0x2007:    /* FIGURE SPACE */
1869         case 0x2008:    /* PUNCTUATION SPACE */
1870         case 0x2009:    /* THIN SPACE */
1871         case 0x200A:    /* HAIR SPACE */
1872         case 0x202f:    /* NARROW NO-BREAK SPACE */
1873         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
1874         case 0x3000:    /* IDEOGRAPHIC SPACE */
1875         ADD_NEW(state_offset + 1, 0);
1876         break;
1877         }
1878       break;
1879 
1880       /*-----------------------------------------------------------------*/
1881       /* Match a negated single character. This is only used for one-byte
1882       characters, that is, we know that d < 256. The character we are
1883       checking (c) can be multibyte. */
1884 
1885       case OP_NOT:
1886       if (clen > 0)
1887         {
1888         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
1889         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
1890         }
1891       break;
1892 
1893       /*-----------------------------------------------------------------*/
1894       case OP_PLUS:
1895       case OP_MINPLUS:
1896       case OP_POSPLUS:
1897       case OP_NOTPLUS:
1898       case OP_NOTMINPLUS:
1899       case OP_NOTPOSPLUS:
1900       count = current_state->count;  /* Already matched */
1901       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
1902       if (clen > 0)
1903         {
1904         unsigned int otherd = NOTACHAR;
1905         if ((ims & PCRE_CASELESS) != 0)
1906           {
1907 #ifdef SUPPORT_UTF8
1908           if (utf8 && d >= 128)
1909             {
1910 #ifdef SUPPORT_UCP
1911             otherd = UCD_OTHERCASE(d);
1912 #endif  /* SUPPORT_UCP */
1913             }
1914           else
1915 #endif  /* SUPPORT_UTF8 */
1916           otherd = fcc[d];
1917           }
1918         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1919           {
1920           if (count > 0 &&
1921               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
1922             {
1923             active_count--;             /* Remove non-match possibility */
1924             next_active_state--;
1925             }
1926           count++;
1927           ADD_NEW(state_offset, count);
1928           }
1929         }
1930       break;
1931 
1932       /*-----------------------------------------------------------------*/
1933       case OP_QUERY:
1934       case OP_MINQUERY:
1935       case OP_POSQUERY:
1936       case OP_NOTQUERY:
1937       case OP_NOTMINQUERY:
1938       case OP_NOTPOSQUERY:
1939       ADD_ACTIVE(state_offset + dlen + 1, 0);
1940       if (clen > 0)
1941         {
1942         unsigned int otherd = NOTACHAR;
1943         if ((ims & PCRE_CASELESS) != 0)
1944           {
1945 #ifdef SUPPORT_UTF8
1946           if (utf8 && d >= 128)
1947             {
1948 #ifdef SUPPORT_UCP
1949             otherd = UCD_OTHERCASE(d);
1950 #endif  /* SUPPORT_UCP */
1951             }
1952           else
1953 #endif  /* SUPPORT_UTF8 */
1954           otherd = fcc[d];
1955           }
1956         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1957           {
1958           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
1959             {
1960             active_count--;            /* Remove non-match possibility */
1961             next_active_state--;
1962             }
1963           ADD_NEW(state_offset + dlen + 1, 0);
1964           }
1965         }
1966       break;
1967 
1968       /*-----------------------------------------------------------------*/
1969       case OP_STAR:
1970       case OP_MINSTAR:
1971       case OP_POSSTAR:
1972       case OP_NOTSTAR:
1973       case OP_NOTMINSTAR:
1974       case OP_NOTPOSSTAR:
1975       ADD_ACTIVE(state_offset + dlen + 1, 0);
1976       if (clen > 0)
1977         {
1978         unsigned int otherd = NOTACHAR;
1979         if ((ims & PCRE_CASELESS) != 0)
1980           {
1981 #ifdef SUPPORT_UTF8
1982           if (utf8 && d >= 128)
1983             {
1984 #ifdef SUPPORT_UCP
1985             otherd = UCD_OTHERCASE(d);
1986 #endif  /* SUPPORT_UCP */
1987             }
1988           else
1989 #endif  /* SUPPORT_UTF8 */
1990           otherd = fcc[d];
1991           }
1992         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
1993           {
1994           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
1995             {
1996             active_count--;            /* Remove non-match possibility */
1997             next_active_state--;
1998             }
1999           ADD_NEW(state_offset, 0);
2000           }
2001         }
2002       break;
2003 
2004       /*-----------------------------------------------------------------*/
2005       case OP_EXACT:
2006       case OP_NOTEXACT:
2007       count = current_state->count;  /* Number already matched */
2008       if (clen > 0)
2009         {
2010         unsigned int otherd = NOTACHAR;
2011         if ((ims & PCRE_CASELESS) != 0)
2012           {
2013 #ifdef SUPPORT_UTF8
2014           if (utf8 && d >= 128)
2015             {
2016 #ifdef SUPPORT_UCP
2017             otherd = UCD_OTHERCASE(d);
2018 #endif  /* SUPPORT_UCP */
2019             }
2020           else
2021 #endif  /* SUPPORT_UTF8 */
2022           otherd = fcc[d];
2023           }
2024         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2025           {
2026           if (++count >= GET2(code, 1))
2027             { ADD_NEW(state_offset + dlen + 3, 0); }
2028           else
2029             { ADD_NEW(state_offset, count); }
2030           }
2031         }
2032       break;
2033 
2034       /*-----------------------------------------------------------------*/
2035       case OP_UPTO:
2036       case OP_MINUPTO:
2037       case OP_POSUPTO:
2038       case OP_NOTUPTO:
2039       case OP_NOTMINUPTO:
2040       case OP_NOTPOSUPTO:
2041       ADD_ACTIVE(state_offset + dlen + 3, 0);
2042       count = current_state->count;  /* Number already matched */
2043       if (clen > 0)
2044         {
2045         unsigned int otherd = NOTACHAR;
2046         if ((ims & PCRE_CASELESS) != 0)
2047           {
2048 #ifdef SUPPORT_UTF8
2049           if (utf8 && d >= 128)
2050             {
2051 #ifdef SUPPORT_UCP
2052             otherd = UCD_OTHERCASE(d);
2053 #endif  /* SUPPORT_UCP */
2054             }
2055           else
2056 #endif  /* SUPPORT_UTF8 */
2057           otherd = fcc[d];
2058           }
2059         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2060           {
2061           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2062             {
2063             active_count--;             /* Remove non-match possibility */
2064             next_active_state--;
2065             }
2066           if (++count >= GET2(code, 1))
2067             { ADD_NEW(state_offset + dlen + 3, 0); }
2068           else
2069             { ADD_NEW(state_offset, count); }
2070           }
2071         }
2072       break;
2073 
2074 
2075 /* ========================================================================== */
2076       /* These are the class-handling opcodes */
2077 
2078       case OP_CLASS:
2079       case OP_NCLASS:
2080       case OP_XCLASS:
2081         {
2082         BOOL isinclass = FALSE;
2083         int next_state_offset;
2084         const uschar *ecode;
2085 
2086         /* For a simple class, there is always just a 32-byte table, and we
2087         can set isinclass from it. */
2088 
2089         if (codevalue != OP_XCLASS)
2090           {
2091           ecode = code + 33;
2092           if (clen > 0)
2093             {
2094             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2095               ((code[1 + c/8] & (1 << (c&7))) != 0);
2096             }
2097           }
2098 
2099         /* An extended class may have a table or a list of single characters,
2100         ranges, or both, and it may be positive or negative. There's a
2101         function that sorts all this out. */
2102 
2103         else
2104          {
2105          ecode = code + GET(code, 1);
2106          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
2107          }
2108 
2109         /* At this point, isinclass is set for all kinds of class, and ecode
2110         points to the byte after the end of the class. If there is a
2111         quantifier, this is where it will be. */
2112 
2113         next_state_offset = ecode - start_code;
2114 
2115         switch (*ecode)
2116           {
2117           case OP_CRSTAR:
2118           case OP_CRMINSTAR:
2119           ADD_ACTIVE(next_state_offset + 1, 0);
2120           if (isinclass) { ADD_NEW(state_offset, 0); }
2121           break;
2122 
2123           case OP_CRPLUS:
2124           case OP_CRMINPLUS:
2125           count = current_state->count;  /* Already matched */
2126           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2127           if (isinclass) { count++; ADD_NEW(state_offset, count); }
2128           break;
2129 
2130           case OP_CRQUERY:
2131           case OP_CRMINQUERY:
2132           ADD_ACTIVE(next_state_offset + 1, 0);
2133           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
2134           break;
2135 
2136           case OP_CRRANGE:
2137           case OP_CRMINRANGE:
2138           count = current_state->count;  /* Already matched */
2139           if (count >= GET2(ecode, 1))
2140             { ADD_ACTIVE(next_state_offset + 5, 0); }
2141           if (isinclass)
2142             {
2143             int max = GET2(ecode, 3);
2144             if (++count >= max && max != 0)   /* Max 0 => no limit */
2145               { ADD_NEW(next_state_offset + 5, 0); }
2146             else
2147               { ADD_NEW(state_offset, count); }
2148             }
2149           break;
2150 
2151           default:
2152           if (isinclass) { ADD_NEW(next_state_offset, 0); }
2153           break;
2154           }
2155         }
2156       break;
2157 
2158 /* ========================================================================== */
2159       /* These are the opcodes for fancy brackets of various kinds. We have
2160       to use recursion in order to handle them. The "always failing" assertion
2161       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
2162       though the other "backtracking verbs" are not supported. */
2163 
2164       case OP_FAIL:
2165       break;
2166 
2167       case OP_ASSERT:
2168       case OP_ASSERT_NOT:
2169       case OP_ASSERTBACK:
2170       case OP_ASSERTBACK_NOT:
2171         {
2172         int rc;
2173         int local_offsets[2];
2174         int local_workspace[1000];
2175         const uschar *endasscode = code + GET(code, 1);
2176 
2177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2178 
2179         rc = internal_dfa_exec(
2180           md,                                   /* static match data */
2181           code,                                 /* this subexpression's code */
2182           ptr,                                  /* where we currently are */
2183           ptr - start_subject,                  /* start offset */
2184           local_offsets,                        /* offset vector */
2185           sizeof(local_offsets)/sizeof(int),    /* size of same */
2186           local_workspace,                      /* workspace vector */
2187           sizeof(local_workspace)/sizeof(int),  /* size of same */
2188           ims,                                  /* the current ims flags */
2189           rlevel,                               /* function recursion level */
2190           recursing);                           /* pass on regex recursion */
2191 
2192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2194         }
2195       break;
2196 
2197       /*-----------------------------------------------------------------*/
2198       case OP_COND:
2199       case OP_SCOND:
2200         {
2201         int local_offsets[1000];
2202         int local_workspace[1000];
2203         int codelink = GET(code, 1);
2204         int condcode;
2205 
2206         /* Because of the way auto-callout works during compile, a callout item
2207         is inserted between OP_COND and an assertion condition. This does not
2208         happen for the other conditions. */
2209 
2210         if (code[LINK_SIZE+1] == OP_CALLOUT)
2211           {
2212           rrc = 0;
2213           if (pcre_callout != NULL)
2214             {
2215             pcre_callout_block cb;
2216             cb.version          = 1;   /* Version 1 of the callout block */
2217             cb.callout_number   = code[LINK_SIZE+2];
2218             cb.offset_vector    = offsets;
2219             cb.subject          = (PCRE_SPTR)start_subject;
2220             cb.subject_length   = end_subject - start_subject;
2221             cb.start_match      = current_subject - start_subject;
2222             cb.current_position = ptr - start_subject;
2223             cb.pattern_position = GET(code, LINK_SIZE + 3);
2224             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
2225             cb.capture_top      = 1;
2226             cb.capture_last     = -1;
2227             cb.callout_data     = md->callout_data;
2228             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2229             }
2230           if (rrc > 0) break;                      /* Fail this thread */
2231           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
2232           }
2233 
2234         condcode = code[LINK_SIZE+1];
2235 
2236         /* Back reference conditions are not supported */
2237 
2238         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
2239 
2240         /* The DEFINE condition is always false */
2241 
2242         if (condcode == OP_DEF)
2243           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2244 
2245         /* The only supported version of OP_RREF is for the value RREF_ANY,
2246         which means "test if in any recursion". We can't test for specifically
2247         recursed groups. */
2248 
2249         else if (condcode == OP_RREF)
2250           {
2251           int value = GET2(code, LINK_SIZE+2);
2252           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
2253           if (recursing > 0)
2254             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
2255           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2256           }
2257 
2258         /* Otherwise, the condition is an assertion */
2259 
2260         else
2261           {
2262           int rc;
2263           const uschar *asscode = code + LINK_SIZE + 1;
2264           const uschar *endasscode = asscode + GET(asscode, 1);
2265 
2266           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2267 
2268           rc = internal_dfa_exec(
2269             md,                                   /* fixed match data */
2270             asscode,                              /* this subexpression's code */
2271             ptr,                                  /* where we currently are */
2272             ptr - start_subject,                  /* start offset */
2273             local_offsets,                        /* offset vector */
2274             sizeof(local_offsets)/sizeof(int),    /* size of same */
2275             local_workspace,                      /* workspace vector */
2276             sizeof(local_workspace)/sizeof(int),  /* size of same */
2277             ims,                                  /* the current ims flags */
2278             rlevel,                               /* function recursion level */
2279             recursing);                           /* pass on regex recursion */
2280 
2281           if ((rc >= 0) ==
2282                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2283             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
2284           else
2285             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2286           }
2287         }
2288       break;
2289 
2290       /*-----------------------------------------------------------------*/
2291       case OP_RECURSE:
2292         {
2293         int local_offsets[1000];
2294         int local_workspace[1000];
2295         int rc;
2296 
2297         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
2298           recursing + 1));
2299 
2300         rc = internal_dfa_exec(
2301           md,                                   /* fixed match data */
2302           start_code + GET(code, 1),            /* this subexpression's code */
2303           ptr,                                  /* where we currently are */
2304           ptr - start_subject,                  /* start offset */
2305           local_offsets,                        /* offset vector */
2306           sizeof(local_offsets)/sizeof(int),    /* size of same */
2307           local_workspace,                      /* workspace vector */
2308           sizeof(local_workspace)/sizeof(int),  /* size of same */
2309           ims,                                  /* the current ims flags */
2310           rlevel,                               /* function recursion level */
2311           recursing + 1);                       /* regex recurse level */
2312 
2313         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
2314           recursing + 1, rc));
2315 
2316         /* Ran out of internal offsets */
2317 
2318         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
2319 
2320         /* For each successful matched substring, set up the next state with a
2321         count of characters to skip before trying it. Note that the count is in
2322         characters, not bytes. */
2323 
2324         if (rc > 0)
2325           {
2326           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
2327             {
2328             const uschar *p = start_subject + local_offsets[rc];
2329             const uschar *pp = start_subject + local_offsets[rc+1];
2330             int charcount = local_offsets[rc+1] - local_offsets[rc];
2331             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2332             if (charcount > 0)
2333               {
2334               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
2335               }
2336             else
2337               {
2338               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
2339               }
2340             }
2341           }
2342         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2343         }
2344       break;
2345 
2346       /*-----------------------------------------------------------------*/
2347       case OP_ONCE:
2348         {
2349         int local_offsets[2];
2350         int local_workspace[1000];
2351 
2352         int rc = internal_dfa_exec(
2353           md,                                   /* fixed match data */
2354           code,                                 /* this subexpression's code */
2355           ptr,                                  /* where we currently are */
2356           ptr - start_subject,                  /* start offset */
2357           local_offsets,                        /* offset vector */
2358           sizeof(local_offsets)/sizeof(int),    /* size of same */
2359           local_workspace,                      /* workspace vector */
2360           sizeof(local_workspace)/sizeof(int),  /* size of same */
2361           ims,                                  /* the current ims flags */
2362           rlevel,                               /* function recursion level */
2363           recursing);                           /* pass on regex recursion */
2364 
2365         if (rc >= 0)
2366           {
2367           const uschar *end_subpattern = code;
2368           int charcount = local_offsets[1] - local_offsets[0];
2369           int next_state_offset, repeat_state_offset;
2370 
2371           do { end_subpattern += GET(end_subpattern, 1); }
2372             while (*end_subpattern == OP_ALT);
2373           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
2374 
2375           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
2376           arrange for the repeat state also to be added to the relevant list.
2377           Calculate the offset, or set -1 for no repeat. */
2378 
2379           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
2380                                  *end_subpattern == OP_KETRMIN)?
2381             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
2382 
2383           /* If we have matched an empty string, add the next state at the
2384           current character pointer. This is important so that the duplicate
2385           checking kicks in, which is what breaks infinite loops that match an
2386           empty string. */
2387 
2388           if (charcount == 0)
2389             {
2390             ADD_ACTIVE(next_state_offset, 0);
2391             }
2392 
2393           /* Optimization: if there are no more active states, and there
2394           are no new states yet set up, then skip over the subject string
2395           right here, to save looping. Otherwise, set up the new state to swing
2396           into action when the end of the substring is reached. */
2397 
2398           else if (i + 1 >= active_count && new_count == 0)
2399             {
2400             ptr += charcount;
2401             clen = 0;
2402             ADD_NEW(next_state_offset, 0);
2403 
2404             /* If we are adding a repeat state at the new character position,
2405             we must fudge things so that it is the only current state.
2406             Otherwise, it might be a duplicate of one we processed before, and
2407             that would cause it to be skipped. */
2408 
2409             if (repeat_state_offset >= 0)
2410               {
2411               next_active_state = active_states;
2412               active_count = 0;
2413               i = -1;
2414               ADD_ACTIVE(repeat_state_offset, 0);
2415               }
2416             }
2417           else
2418             {
2419             const uschar *p = start_subject + local_offsets[0];
2420             const uschar *pp = start_subject + local_offsets[1];
2421             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
2422             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
2423             if (repeat_state_offset >= 0)
2424               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
2425             }
2426 
2427           }
2428         else if (rc != PCRE_ERROR_NOMATCH) return rc;
2429         }
2430       break;
2431 
2432 
2433 /* ========================================================================== */
2434       /* Handle callouts */
2435 
2436       case OP_CALLOUT:
2437       rrc = 0;
2438       if (pcre_callout != NULL)
2439         {
2440         pcre_callout_block cb;
2441         cb.version          = 1;   /* Version 1 of the callout block */
2442         cb.callout_number   = code[1];
2443         cb.offset_vector    = offsets;
2444         cb.subject          = (PCRE_SPTR)start_subject;
2445         cb.subject_length   = end_subject - start_subject;
2446         cb.start_match      = current_subject - start_subject;
2447         cb.current_position = ptr - start_subject;
2448         cb.pattern_position = GET(code, 2);
2449         cb.next_item_length = GET(code, 2 + LINK_SIZE);
2450         cb.capture_top      = 1;
2451         cb.capture_last     = -1;
2452         cb.callout_data     = md->callout_data;
2453         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
2454         }
2455       if (rrc == 0)
2456         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
2457       break;
2458 
2459 
2460 /* ========================================================================== */
2461       default:        /* Unsupported opcode */
2462       return PCRE_ERROR_DFA_UITEM;
2463       }
2464 
2465     NEXT_ACTIVE_STATE: continue;
2466 
2467     }      /* End of loop scanning active states */
2468 
2469   /* We have finished the processing at the current subject character. If no
2470   new states have been set for the next character, we have found all the
2471   matches that we are going to find. If we are at the top level and partial
2472   matching has been requested, check for appropriate conditions. */
2473 
2474   if (new_count <= 0)
2475     {
2476     if (match_count < 0 &&                     /* No matches found */
2477         rlevel == 1 &&                         /* Top level match function */
2478         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
2479         ptr >= end_subject &&                  /* Reached end of subject */
2480         ptr > current_subject)                 /* Matched non-empty string */
2481       {
2482       if (offsetcount >= 2)
2483         {
2484         offsets[0] = current_subject - start_subject;
2485         offsets[1] = end_subject - start_subject;
2486         }
2487       match_count = PCRE_ERROR_PARTIAL;
2488       }
2489 
2490     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
2491       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
2492       rlevel*2-2, SP));
2493     break;        /* In effect, "return", but see the comment below */
2494     }
2495 
2496   /* One or more states are active for the next character. */
2497 
2498   ptr += clen;    /* Advance to next subject character */
2499   }               /* Loop to move along the subject string */
2500 
2501 /* Control gets here from "break" a few lines above. We do it this way because
2502 if we use "return" above, we have compiler trouble. Some compilers warn if
2503 there's nothing here because they think the function doesn't return a value. On
2504 the other hand, if we put a dummy statement here, some more clever compilers
2505 complain that it can't be reached. Sigh. */
2506 
2507 return match_count;
2508 }
2509 
2510 
2511 
2512 
2513 /*************************************************
2514 *    Execute a Regular Expression - DFA engine   *
2515 *************************************************/
2516 
2517 /* This external function applies a compiled re to a subject string using a DFA
2518 engine. This function calls the internal function multiple times if the pattern
2519 is not anchored.
2520 
2521 Arguments:
2522   argument_re     points to the compiled expression
2523   extra_data      points to extra data or is NULL
2524   subject         points to the subject string
2525   length          length of subject string (may contain binary zeros)
2526   start_offset    where to start in the subject string
2527   options         option bits
2528   offsets         vector of match offsets
2529   offsetcount     size of same
2530   workspace       workspace vector
2531   wscount         size of same
2532 
2533 Returns:          > 0 => number of match offset pairs placed in offsets
2534                   = 0 => offsets overflowed; longest matches are present
2535                    -1 => failed to match
2536                  < -1 => some kind of unexpected problem
2537 */
2538 
2539 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
2540 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
2541   const char *subject, int length, int start_offset, int options, int *offsets,
2542   int offsetcount, int *workspace, int wscount)
2543 {
2544 real_pcre *re = (real_pcre *)argument_re;
2545 dfa_match_data match_block;
2546 dfa_match_data *md = &match_block;
2547 BOOL utf8, anchored, startline, firstline;
2548 const uschar *current_subject, *end_subject, *lcc;
2549 
2550 pcre_study_data internal_study;
2551 const pcre_study_data *study = NULL;
2552 real_pcre internal_re;
2553 
2554 const uschar *req_byte_ptr;
2555 const uschar *start_bits = NULL;
2556 BOOL first_byte_caseless = FALSE;
2557 BOOL req_byte_caseless = FALSE;
2558 int first_byte = -1;
2559 int req_byte = -1;
2560 int req_byte2 = -1;
2561 int newline;
2562 
2563 /* Plausibility checks */
2564 
2565 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
2566 if (re == NULL || subject == NULL || workspace == NULL ||
2567    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
2568 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
2569 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
2570 
2571 /* We need to find the pointer to any study data before we test for byte
2572 flipping, so we scan the extra_data block first. This may set two fields in the
2573 match block, so we must initialize them beforehand. However, the other fields
2574 in the match block must not be set until after the byte flipping. */
2575 
2576 md->tables = re->tables;
2577 md->callout_data = NULL;
2578 
2579 if (extra_data != NULL)
2580   {
2581   unsigned int flags = extra_data->flags;
2582   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
2583     study = (const pcre_study_data *)extra_data->study_data;
2584   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
2585   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
2586     return PCRE_ERROR_DFA_UMLIMIT;
2587   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
2588     md->callout_data = extra_data->callout_data;
2589   if ((flags & PCRE_EXTRA_TABLES) != 0)
2590     md->tables = extra_data->tables;
2591   }
2592 
2593 /* Check that the first field in the block is the magic number. If it is not,
2594 test for a regex that was compiled on a host of opposite endianness. If this is
2595 the case, flipped values are put in internal_re and internal_study if there was
2596 study data too. */
2597 
2598 if (re->magic_number != MAGIC_NUMBER)
2599   {
2600   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
2601   if (re == NULL) return PCRE_ERROR_BADMAGIC;
2602   if (study != NULL) study = &internal_study;
2603   }
2604 
2605 /* Set some local values */
2606 
2607 current_subject = (const unsigned char *)subject + start_offset;
2608 end_subject = (const unsigned char *)subject + length;
2609 req_byte_ptr = current_subject - 1;
2610 
2611 #ifdef SUPPORT_UTF8
2612 utf8 = (re->options & PCRE_UTF8) != 0;
2613 #else
2614 utf8 = FALSE;
2615 #endif
2616 
2617 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
2618   (re->options & PCRE_ANCHORED) != 0;
2619 
2620 /* The remaining fixed data for passing around. */
2621 
2622 md->start_code = (const uschar *)argument_re +
2623     re->name_table_offset + re->name_count * re->name_entry_size;
2624 md->start_subject = (const unsigned char *)subject;
2625 md->end_subject = end_subject;
2626 md->moptions = options;
2627 md->poptions = re->options;
2628 
2629 /* If the BSR option is not set at match time, copy what was set
2630 at compile time. */
2631 
2632 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
2633   {
2634   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
2635     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
2636 #ifdef BSR_ANYCRLF
2637   else md->moptions |= PCRE_BSR_ANYCRLF;
2638 #endif
2639   }
2640 
2641 /* Handle different types of newline. The three bits give eight cases. If
2642 nothing is set at run time, whatever was used at compile time applies. */
2643 
2644 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
2645          PCRE_NEWLINE_BITS)
2646   {
2647   case 0: newline = NEWLINE; break;   /* Compile-time default */
2648   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
2649   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
2650   case PCRE_NEWLINE_CR+
2651        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
2652   case PCRE_NEWLINE_ANY: newline = -1; break;
2653   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
2654   default: return PCRE_ERROR_BADNEWLINE;
2655   }
2656 
2657 if (newline == -2)
2658   {
2659   md->nltype = NLTYPE_ANYCRLF;
2660   }
2661 else if (newline < 0)
2662   {
2663   md->nltype = NLTYPE_ANY;
2664   }
2665 else
2666   {
2667   md->nltype = NLTYPE_FIXED;
2668   if (newline > 255)
2669     {
2670     md->nllen = 2;
2671     md->nl[0] = (newline >> 8) & 255;
2672     md->nl[1] = newline & 255;
2673     }
2674   else
2675     {
2676     md->nllen = 1;
2677     md->nl[0] = newline;
2678     }
2679   }
2680 
2681 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
2682 back the character offset. */
2683 
2684 #ifdef SUPPORT_UTF8
2685 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
2686   {
2687   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
2688     return PCRE_ERROR_BADUTF8;
2689   if (start_offset > 0 && start_offset < length)
2690     {
2691     int tb = ((uschar *)subject)[start_offset];
2692     if (tb > 127)
2693       {
2694       tb &= 0xc0;
2695       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
2696       }
2697     }
2698   }
2699 #endif
2700 
2701 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
2702 is a feature that makes it possible to save compiled regex and re-use them
2703 in other programs later. */
2704 
2705 if (md->tables == NULL) md->tables = _pcre_default_tables;
2706 
2707 /* The lower casing table and the "must be at the start of a line" flag are
2708 used in a loop when finding where to start. */
2709 
2710 lcc = md->tables + lcc_offset;
2711 startline = (re->flags & PCRE_STARTLINE) != 0;
2712 firstline = (re->options & PCRE_FIRSTLINE) != 0;
2713 
2714 /* Set up the first character to match, if available. The first_byte value is
2715 never set for an anchored regular expression, but the anchoring may be forced
2716 at run time, so we have to test for anchoring. The first char may be unset for
2717 an unanchored pattern, of course. If there's no first char and the pattern was
2718 studied, there may be a bitmap of possible first characters. */
2719 
2720 if (!anchored)
2721   {
2722   if ((re->flags & PCRE_FIRSTSET) != 0)
2723     {
2724     first_byte = re->first_byte & 255;
2725     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
2726       first_byte = lcc[first_byte];
2727     }
2728   else
2729     {
2730     if (startline && study != NULL &&
2731          (study->options & PCRE_STUDY_MAPPED) != 0)
2732       start_bits = study->start_bits;
2733     }
2734   }
2735 
2736 /* For anchored or unanchored matches, there may be a "last known required
2737 character" set. */
2738 
2739 if ((re->flags & PCRE_REQCHSET) != 0)
2740   {
2741   req_byte = re->req_byte & 255;
2742   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
2743   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
2744   }
2745 
2746 /* Call the main matching function, looping for a non-anchored regex after a
2747 failed match. If not restarting, perform certain optimizations at the start of
2748 a match. */
2749 
2750 for (;;)
2751   {
2752   int rc;
2753 
2754   if ((options & PCRE_DFA_RESTART) == 0)
2755     {
2756     const uschar *save_end_subject = end_subject;
2757 
2758     /* If firstline is TRUE, the start of the match is constrained to the first
2759     line of a multiline string. Implement this by temporarily adjusting
2760     end_subject so that we stop scanning at a newline. If the match fails at
2761     the newline, later code breaks this loop. */
2762 
2763     if (firstline)
2764       {
2765       USPTR t = current_subject;
2766 #ifdef SUPPORT_UTF8
2767       if (utf8)
2768         {
2769         while (t < md->end_subject && !IS_NEWLINE(t))
2770           {
2771           t++;
2772           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
2773           }
2774         }
2775       else
2776 #endif
2777       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
2778       end_subject = t;
2779       }
2780 
2781     /* There are some optimizations that avoid running the match if a known
2782     starting point is not found, or if a known later character is not present.
2783     However, there is an option that disables these, for testing and for
2784     ensuring that all callouts do actually occur. */
2785 
2786     if ((options & PCRE_NO_START_OPTIMIZE) == 0)
2787       {
2788 
2789       /* Advance to a known first byte. */
2790 
2791       if (first_byte >= 0)
2792         {
2793         if (first_byte_caseless)
2794           while (current_subject < end_subject &&
2795                  lcc[*current_subject] != first_byte)
2796             current_subject++;
2797         else
2798           while (current_subject < end_subject &&
2799                  *current_subject != first_byte)
2800             current_subject++;
2801         }
2802 
2803       /* Or to just after a linebreak for a multiline match if possible */
2804 
2805       else if (startline)
2806         {
2807         if (current_subject > md->start_subject + start_offset)
2808           {
2809 #ifdef SUPPORT_UTF8
2810           if (utf8)
2811             {
2812             while (current_subject < end_subject &&
2813                    !WAS_NEWLINE(current_subject))
2814               {
2815               current_subject++;
2816               while(current_subject < end_subject &&
2817                     (*current_subject & 0xc0) == 0x80)
2818                 current_subject++;
2819               }
2820             }
2821           else
2822 #endif
2823           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
2824             current_subject++;
2825 
2826           /* If we have just passed a CR and the newline option is ANY or
2827           ANYCRLF, and we are now at a LF, advance the match position by one
2828           more character. */
2829 
2830           if (current_subject[-1] == CHAR_CR &&
2831                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
2832                current_subject < end_subject &&
2833                *current_subject == CHAR_NL)
2834             current_subject++;
2835           }
2836         }
2837 
2838       /* Or to a non-unique first char after study */
2839 
2840       else if (start_bits != NULL)
2841         {
2842         while (current_subject < end_subject)
2843           {
2844           register unsigned int c = *current_subject;
2845           if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
2846             else break;
2847           }
2848         }
2849       }
2850 
2851     /* Restore fudged end_subject */
2852 
2853     end_subject = save_end_subject;
2854     }
2855 
2856   /* If req_byte is set, we know that that character must appear in the subject
2857   for the match to succeed. If the first character is set, req_byte must be
2858   later in the subject; otherwise the test starts at the match point. This
2859   optimization can save a huge amount of work in patterns with nested unlimited
2860   repeats that aren't going to match. Writing separate code for cased/caseless
2861   versions makes it go faster, as does using an autoincrement and backing off
2862   on a match.
2863 
2864   HOWEVER: when the subject string is very, very long, searching to its end can
2865   take a long time, and give bad performance on quite ordinary patterns. This
2866   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
2867   don't do this when the string is sufficiently long.
2868 
2869   ALSO: this processing is disabled when partial matching is requested, and can
2870   also be explicitly deactivated. */
2871 
2872   if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
2873       req_byte >= 0 &&
2874       end_subject - current_subject < REQ_BYTE_MAX &&
2875       (options & PCRE_PARTIAL) == 0)
2876     {
2877     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
2878 
2879     /* We don't need to repeat the search if we haven't yet reached the
2880     place we found it at last time. */
2881 
2882     if (p > req_byte_ptr)
2883       {
2884       if (req_byte_caseless)
2885         {
2886         while (p < end_subject)
2887           {
2888           register int pp = *p++;
2889           if (pp == req_byte || pp == req_byte2) { p--; break; }
2890           }
2891         }
2892       else
2893         {
2894         while (p < end_subject)
2895           {
2896           if (*p++ == req_byte) { p--; break; }
2897           }
2898         }
2899 
2900       /* If we can't find the required character, break the matching loop,
2901       which will cause a return or PCRE_ERROR_NOMATCH. */
2902 
2903       if (p >= end_subject) break;
2904 
2905       /* If we have found the required character, save the point where we
2906       found it, so that we don't search again next time round the loop if
2907       the start hasn't passed this character yet. */
2908 
2909       req_byte_ptr = p;
2910       }
2911     }
2912 
2913   /* OK, now we can do the business */
2914 
2915   rc = internal_dfa_exec(
2916     md,                                /* fixed match data */
2917     md->start_code,                    /* this subexpression's code */
2918     current_subject,                   /* where we currently are */
2919     start_offset,                      /* start offset in subject */
2920     offsets,                           /* offset vector */
2921     offsetcount,                       /* size of same */
2922     workspace,                         /* workspace vector */
2923     wscount,                           /* size of same */
2924     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
2925     0,                                 /* function recurse level */
2926     0);                                /* regex recurse level */
2927 
2928   /* Anything other than "no match" means we are done, always; otherwise, carry
2929   on only if not anchored. */
2930 
2931   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
2932 
2933   /* Advance to the next subject character unless we are at the end of a line
2934   and firstline is set. */
2935 
2936   if (firstline && IS_NEWLINE(current_subject)) break;
2937   current_subject++;
2938   if (utf8)
2939     {
2940     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
2941       current_subject++;
2942     }
2943   if (current_subject > end_subject) break;
2944 
2945   /* If we have just passed a CR and we are now at a LF, and the pattern does
2946   not contain any explicit matches for \r or \n, and the newline option is CRLF
2947   or ANY or ANYCRLF, advance the match position by one more character. */
2948 
2949   if (current_subject[-1] == CHAR_CR &&
2950       current_subject < end_subject &&
2951       *current_subject == CHAR_NL &&
2952       (re->flags & PCRE_HASCRORLF) == 0 &&
2953         (md->nltype == NLTYPE_ANY ||
2954          md->nltype == NLTYPE_ANYCRLF ||
2955          md->nllen == 2))
2956     current_subject++;
2957 
2958   }   /* "Bumpalong" loop */
2959 
2960 return PCRE_ERROR_NOMATCH;
2961 }
2962 
2963 /* End of pcre_dfa_exec.c */
2964 

[ source navigation ]   [ diff markup ]   [ identifier search ]   [ freetext search ]   [ file search ]

This page was automatically generated by the LXR engine.
Visit the LXR main site for more information.