NCBI C++ ToolKit
pcre_dfa_exec.c
Go to the documentation of this file.

Go to the SVN repository for this file.

00001 /*************************************************
00002 *      Perl-Compatible Regular Expressions       *
00003 *************************************************/
00004 
00005 /* PCRE is a library of functions to support regular expressions whose syntax
00006 and semantics are as close as possible to those of the Perl 5 language (but see
00007 below for why this module is different).
00008 
00009                        Written by Philip Hazel
00010            Copyright (c) 1997-2009 University of Cambridge
00011 
00012 -----------------------------------------------------------------------------
00013 Redistribution and use in source and binary forms, with or without
00014 modification, are permitted provided that the following conditions are met:
00015 
00016     * Redistributions of source code must retain the above copyright notice,
00017       this list of conditions and the following disclaimer.
00018 
00019     * Redistributions in binary form must reproduce the above copyright
00020       notice, this list of conditions and the following disclaimer in the
00021       documentation and/or other materials provided with the distribution.
00022 
00023     * Neither the name of the University of Cambridge nor the names of its
00024       contributors may be used to endorse or promote products derived from
00025       this software without specific prior written permission.
00026 
00027 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
00028 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
00029 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
00030 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
00031 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
00032 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
00033 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
00034 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
00035 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
00036 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
00037 POSSIBILITY OF SUCH DAMAGE.
00038 -----------------------------------------------------------------------------
00039 */
00040 
00041 
00042 /* This module contains the external function pcre_dfa_exec(), which is an
00043 alternative matching function that uses a sort of DFA algorithm (not a true
00044 FSM). This is NOT Perl-compatible, but it has advantages in certain
00045 applications. */
00046 
00047 
00048 #ifdef HAVE_CONFIG_H
00049 #include "config.h"
00050 #endif
00051 
00052 #define NLBLOCK md             /* Block containing newline information */
00053 #define PSSTART start_subject  /* Field containing processed string start */
00054 #define PSEND   end_subject    /* Field containing processed string end */
00055 
00056 #include "pcre_internal.h"
00057 
00058 
00059 /* For use to indent debugging output */
00060 
00061 #define SP "                   "
00062 
00063 
00064 /*************************************************
00065 *      Code parameters and static tables         *
00066 *************************************************/
00067 
00068 /* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
00069 into others, under special conditions. A gap of 20 between the blocks should be
00070 enough. The resulting opcodes don't have to be less than 256 because they are
00071 never stored, so we push them well clear of the normal opcodes. */
00072 
/* Each value is added to an OP_TYPExxx opcode when its argument is one of the
item types named on the right, producing a distinct switch value. */

#define OP_PROP_EXTRA       300   /* argument is OP_PROP or OP_NOTPROP         */
#define OP_EXTUNI_EXTRA     320   /* argument is OP_EXTUNI                     */
#define OP_ANYNL_EXTRA      340   /* argument is OP_ANYNL                      */
#define OP_HSPACE_EXTRA     360   /* argument is OP_HSPACE or OP_NOT_HSPACE    */
#define OP_VSPACE_EXTRA     380   /* argument is OP_VSPACE or OP_NOT_VSPACE    */
00078 
00079 
00080 /* This table identifies those opcodes that are followed immediately by a
00081 character that is to be tested in some way. This makes it possible to
00082 centralize the loading of these characters. In the case of Type * etc, the
00083 "character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
00084 small value. ***NOTE*** If the start of this table is modified, the two tables
00085 that follow must also be modified. */
00086 
00087 static const uschar coptable[] = {
00088   0,                             /* End                                    */
00089   0, 0, 0, 0, 0,                 /* \A, \G, \K, \B, \b                     */
00090   0, 0, 0, 0, 0, 0,              /* \D, \d, \S, \s, \W, \w                 */
00091   0, 0, 0,                       /* Any, AllAny, Anybyte                   */
00092   0, 0, 0,                       /* NOTPROP, PROP, EXTUNI                  */
00093   0, 0, 0, 0, 0,                 /* \R, \H, \h, \V, \v                     */
00094   0, 0, 0, 0, 0,                 /* \Z, \z, Opt, ^, $                      */
00095   1,                             /* Char                                   */
00096   1,                             /* Charnc                                 */
00097   1,                             /* not                                    */
00098   /* Positive single-char repeats                                          */
00099   1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
00100   3, 3, 3,                       /* upto, minupto, exact                   */
00101   1, 1, 1, 3,                    /* *+, ++, ?+, upto+                      */
00102   /* Negative single-char repeats - only for chars < 256                   */
00103   1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
00104   3, 3, 3,                       /* NOT upto, minupto, exact               */
00105   1, 1, 1, 3,                    /* NOT *+, ++, ?+, updo+                  */
00106   /* Positive type repeats                                                 */
00107   1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
00108   3, 3, 3,                       /* Type upto, minupto, exact              */
00109   1, 1, 1, 3,                    /* Type *+, ++, ?+, upto+                 */
00110   /* Character class & ref repeats                                         */
00111   0, 0, 0, 0, 0, 0,              /* *, *?, +, +?, ?, ??                    */
00112   0, 0,                          /* CRRANGE, CRMINRANGE                    */
00113   0,                             /* CLASS                                  */
00114   0,                             /* NCLASS                                 */
00115   0,                             /* XCLASS - variable length               */
00116   0,                             /* REF                                    */
00117   0,                             /* RECURSE                                */
00118   0,                             /* CALLOUT                                */
00119   0,                             /* Alt                                    */
00120   0,                             /* Ket                                    */
00121   0,                             /* KetRmax                                */
00122   0,                             /* KetRmin                                */
00123   0,                             /* Assert                                 */
00124   0,                             /* Assert not                             */
00125   0,                             /* Assert behind                          */
00126   0,                             /* Assert behind not                      */
00127   0,                             /* Reverse                                */
00128   0, 0, 0, 0,                    /* ONCE, BRA, CBRA, COND                  */
00129   0, 0, 0,                       /* SBRA, SCBRA, SCOND                     */
00130   0,                             /* CREF                                   */
00131   0,                             /* RREF                                   */
00132   0,                             /* DEF                                    */
00133   0, 0,                          /* BRAZERO, BRAMINZERO                    */
00134   0, 0, 0, 0,                    /* PRUNE, SKIP, THEN, COMMIT              */
00135   0, 0, 0                        /* FAIL, ACCEPT, SKIPZERO                 */
00136 };
00137 
00138 /* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
00139 and \w */
00140 
00141 static const uschar toptable1[] = {
00142   0, 0, 0, 0, 0, 0,
00143   ctype_digit, ctype_digit,
00144   ctype_space, ctype_space,
00145   ctype_word,  ctype_word,
00146   0, 0                            /* OP_ANY, OP_ALLANY */
00147 };
00148 
00149 static const uschar toptable2[] = {
00150   0, 0, 0, 0, 0, 0,
00151   ctype_digit, 0,
00152   ctype_space, 0,
00153   ctype_word,  0,
00154   1, 1                            /* OP_ANY, OP_ALLANY */
00155 };
00156 
00157 
00158 /* Structure for holding data about a particular state, which is in effect the
00159 current data for an active path through the match tree. It must consist
00160 entirely of ints because the working vector we are passed, and which we put
00161 these structures in, is a vector of ints. */
00162 
typedef struct stateblock {
  /* Offset of the state's opcode from the start of the compiled code. A
  negative value marks a state that is being held back (see data). */
  int offset;

  /* Repeat counter carried along with the state. */
  int count;

  /* The ims option bits that were current when the state was added. */
  int ims;

  /* Extra value used by some states only; for a held-back state it is the
  number of characters still to be skipped. */
  int data;
} stateblock;
00169 
00170 #define INTS_PER_STATEBLOCK  (sizeof(stateblock)/sizeof(int))
00171 
00172 
00173 #ifdef DEBUG
00174 /*************************************************
00175 *             Print character string             *
00176 *************************************************/
00177 
00178 /* Character string printing function for debugging.
00179 
00180 Arguments:
00181   p            points to string
00182   length       number of bytes
00183   f            where to print
00184 
00185 Returns:       nothing
00186 */
00187 
/* The string is only read, so the pointer is const-qualified; callers holding
a const subject pointer no longer need to cast the qualifier away. Bytes for
which isprint() is true are written unchanged; every other byte is written as
a backslash, 'x', and two lower-case hex digits. A zero or negative length
writes nothing. */

static void
pchars(const unsigned char *p, int length, FILE *f)
{
while (length-- > 0)
  {
  int c = *p++;     /* unsigned char value, so always valid for isprint() */
  if (isprint(c))
    fputc(c, f);
  else
    fprintf(f, "\\x%02x", c);
  }
}
00200 #endif
00201 
00202 
00203 
00204 /*************************************************
00205 *    Execute a Regular Expression - DFA engine   *
00206 *************************************************/
00207 
00208 /* This internal function applies a compiled pattern to a subject string,
00209 starting at a given point, using a DFA engine. This function is called from the
00210 external one, possibly multiple times if the pattern is not anchored. The
00211 function calls itself recursively for some kinds of subpattern.
00212 
00213 Arguments:
00214   md                the match_data block with fixed information
00215   this_start_code   the opening bracket of this subexpression's code
00216   current_subject   where we currently are in the subject string
00217   start_offset      start offset in the subject string
00218   offsets           vector to contain the matching string offsets
00219   offsetcount       size of same
00220   workspace         vector of workspace
00221   wscount           size of same
00222   ims               the current ims flags
00223   rlevel            function call recursion level
00224   recursing         regex recursive call level
00225 
00226 Returns:            > 0 => number of match offset pairs placed in offsets
00227                     = 0 => offsets overflowed; longest matches are present
00228                      -1 => failed to match
00229                    < -1 => some kind of unexpected problem
00230 
00231 The following macros are used for adding states to the two state vectors (one
00232 for the current character, one for the following character). */
00233 
/* Append a state with opcode offset (x) and repeat count (y) to the active
list, recording the current value of the local variable ims; the data field is
left untouched. If the list already holds wscount entries, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE (active_count is still incremented on
that path). The arguments are expanded again inside DPRINTF, so they must not
have side effects. The do/while(0) wrapper makes the expansion one statement,
so the macro can be used unbraced under an if/else. */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
00244 
/* As for adding a plain active state, but the data field is also set, to (z).
Appends a state with opcode offset (x) and repeat count (y) to the active list,
recording the current value of the local variable ims. If the list already
holds wscount entries, the enclosing function returns PCRE_ERROR_DFA_WSSIZE
(active_count is still incremented on that path). The arguments are expanded
again inside DPRINTF, so they must not have side effects. The do/while(0)
wrapper makes the expansion one statement. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->ims    = ims; \
      next_active_state->data   = (z); \
      next_active_state++; \
      DPRINTF(("%.*sADD_ACTIVE_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
00256 
/* Append a state with opcode offset (x) and repeat count (y) to the new-state
list, recording the current value of the local variable ims; the data field is
left untouched. If the list already holds wscount entries, the enclosing
function returns PCRE_ERROR_DFA_WSSIZE (new_count is still incremented on that
path). The arguments are expanded again inside DPRINTF, so they must not have
side effects. The do/while(0) wrapper makes the expansion one statement, so the
macro can be used unbraced under an if/else. */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW(%d,%d)\n", rlevel*2-2, SP, (x), (y))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
00267 
/* As for adding a plain new state, but the data field is also set, to (z).
Appends a state with opcode offset (x) and repeat count (y) to the new-state
list, recording the current value of the local variable ims. If the list
already holds wscount entries, the enclosing function returns
PCRE_ERROR_DFA_WSSIZE (new_count is still incremented on that path). The
arguments are expanded again inside DPRINTF, so they must not have side
effects. The do/while(0) wrapper makes the expansion one statement. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->ims    = ims; \
      next_new_state->data   = (z); \
      next_new_state++; \
      DPRINTF(("%.*sADD_NEW_DATA(%d,%d,%d)\n", rlevel*2-2, SP, (x), (y), (z))); \
      } \
    else return PCRE_ERROR_DFA_WSSIZE; \
    } \
  while (0)
00279 
00280 /* And now, here is the code */
00281 
00282 static int
00283 internal_dfa_exec(
00284   dfa_match_data *md,
00285   const uschar *this_start_code,
00286   const uschar *current_subject,
00287   int start_offset,
00288   int *offsets,
00289   int offsetcount,
00290   int *workspace,
00291   int wscount,
00292   int ims,
00293   int  rlevel,
00294   int  recursing)
00295 {
00296 stateblock *active_states, *new_states, *temp_states;
00297 stateblock *next_active_state, *next_new_state;
00298 
00299 const uschar *ctypes, *lcc, *fcc;
00300 const uschar *ptr;
00301 const uschar *end_code, *first_op;
00302 
00303 int active_count, new_count, match_count;
00304 
00305 /* Some fields in the md block are frequently referenced, so we load them into
00306 independent variables in the hope that this will perform better. */
00307 
00308 const uschar *start_subject = md->start_subject;
00309 const uschar *end_subject = md->end_subject;
00310 const uschar *start_code = md->start_code;
00311 
00312 #ifdef SUPPORT_UTF8
00313 BOOL utf8 = (md->poptions & PCRE_UTF8) != 0;
00314 #else
00315 BOOL utf8 = FALSE;
00316 #endif
00317 
00318 rlevel++;
00319 offsetcount &= (-2);
00320 
00321 wscount -= 2;
00322 wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
00323           (2 * INTS_PER_STATEBLOCK);
00324 
00325 DPRINTF(("\n%.*s---------------------\n"
00326   "%.*sCall to internal_dfa_exec f=%d r=%d\n",
00327   rlevel*2-2, SP, rlevel*2-2, SP, rlevel, recursing));
00328 
00329 ctypes = md->tables + ctypes_offset;
00330 lcc = md->tables + lcc_offset;
00331 fcc = md->tables + fcc_offset;
00332 
00333 match_count = PCRE_ERROR_NOMATCH;   /* A negative number */
00334 
00335 active_states = (stateblock *)(workspace + 2);
00336 next_new_state = new_states = active_states + wscount;
00337 new_count = 0;
00338 
00339 first_op = this_start_code + 1 + LINK_SIZE +
00340   ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00341 
00342 /* The first thing in any (sub) pattern is a bracket of some sort. Push all
00343 the alternative states onto the list, and find out where the end is. This
00344 makes it possible to use this function recursively, when we want to stop at a
00345 matching internal ket rather than at the end.
00346 
00347 If the first opcode in the first alternative is OP_REVERSE, we are dealing with
00348 a backward assertion. In that case, we have to find out the maximum amount to
00349 move back, and set up each alternative appropriately. */
00350 
00351 if (*first_op == OP_REVERSE)
00352   {
00353   int max_back = 0;
00354   int gone_back;
00355 
00356   end_code = this_start_code;
00357   do
00358     {
00359     int back = GET(end_code, 2+LINK_SIZE);
00360     if (back > max_back) max_back = back;
00361     end_code += GET(end_code, 1);
00362     }
00363   while (*end_code == OP_ALT);
00364 
00365   /* If we can't go back the amount required for the longest lookbehind
00366   pattern, go back as far as we can; some alternatives may still be viable. */
00367 
00368 #ifdef SUPPORT_UTF8
00369   /* In character mode we have to step back character by character */
00370 
00371   if (utf8)
00372     {
00373     for (gone_back = 0; gone_back < max_back; gone_back++)
00374       {
00375       if (current_subject <= start_subject) break;
00376       current_subject--;
00377       while (current_subject > start_subject &&
00378              (*current_subject & 0xc0) == 0x80)
00379         current_subject--;
00380       }
00381     }
00382   else
00383 #endif
00384 
00385   /* In byte-mode we can do this quickly. */
00386 
00387     {
00388     gone_back = (current_subject - max_back < start_subject)?
00389       current_subject - start_subject : max_back;
00390     current_subject -= gone_back;
00391     }
00392 
00393   /* Now we can process the individual branches. */
00394 
00395   end_code = this_start_code;
00396   do
00397     {
00398     int back = GET(end_code, 2+LINK_SIZE);
00399     if (back <= gone_back)
00400       {
00401       int bstate = end_code - start_code + 2 + 2*LINK_SIZE;
00402       ADD_NEW_DATA(-bstate, 0, gone_back - back);
00403       }
00404     end_code += GET(end_code, 1);
00405     }
00406   while (*end_code == OP_ALT);
00407  }
00408 
00409 /* This is the code for a "normal" subpattern (not a backward assertion). The
00410 start of a whole pattern is always one of these. If we are at the top level,
00411 we may be asked to restart matching from the same point that we reached for a
00412 previous partial match. We still have to scan through the top-level branches to
00413 find the end state. */
00414 
00415 else
00416   {
00417   end_code = this_start_code;
00418 
00419   /* Restarting */
00420 
00421   if (rlevel == 1 && (md->moptions & PCRE_DFA_RESTART) != 0)
00422     {
00423     do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
00424     new_count = workspace[1];
00425     if (!workspace[0])
00426       memcpy(new_states, active_states, new_count * sizeof(stateblock));
00427     }
00428 
00429   /* Not restarting */
00430 
00431   else
00432     {
00433     int length = 1 + LINK_SIZE +
00434       ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA)? 2:0);
00435     do
00436       {
00437       ADD_NEW(end_code - start_code + length, 0);
00438       end_code += GET(end_code, 1);
00439       length = 1 + LINK_SIZE;
00440       }
00441     while (*end_code == OP_ALT);
00442     }
00443   }
00444 
00445 workspace[0] = 0;    /* Bit indicating which vector is current */
00446 
00447 DPRINTF(("%.*sEnd state = %d\n", rlevel*2-2, SP, end_code - start_code));
00448 
00449 /* Loop for scanning the subject */
00450 
00451 ptr = current_subject;
00452 for (;;)
00453   {
00454   int i, j;
00455   int clen, dlen;
00456   unsigned int c, d;
00457 
00458   /* Make the new state list into the active state list and empty the
00459   new state list. */
00460 
00461   temp_states = active_states;
00462   active_states = new_states;
00463   new_states = temp_states;
00464   active_count = new_count;
00465   new_count = 0;
00466 
00467   workspace[0] ^= 1;              /* Remember for the restarting feature */
00468   workspace[1] = active_count;
00469 
00470 #ifdef DEBUG
00471   printf("%.*sNext character: rest of subject = \"", rlevel*2-2, SP);
00472   pchars((uschar *)ptr, strlen((char *)ptr), stdout);
00473   printf("\"\n");
00474 
00475   printf("%.*sActive states: ", rlevel*2-2, SP);
00476   for (i = 0; i < active_count; i++)
00477     printf("%d/%d ", active_states[i].offset, active_states[i].count);
00478   printf("\n");
00479 #endif
00480 
00481   /* Set the pointers for adding new states */
00482 
00483   next_active_state = active_states + active_count;
00484   next_new_state = new_states;
00485 
00486   /* Load the current character from the subject outside the loop, as many
00487   different states may want to look at it, and we assume that at least one
00488   will. */
00489 
00490   if (ptr < end_subject)
00491     {
00492     clen = 1;        /* Number of bytes in the character */
00493 #ifdef SUPPORT_UTF8
00494     if (utf8) { GETCHARLEN(c, ptr, clen); } else
00495 #endif  /* SUPPORT_UTF8 */
00496     c = *ptr;
00497     }
00498   else
00499     {
00500     clen = 0;        /* This indicates the end of the subject */
00501     c = NOTACHAR;    /* This value should never actually be used */
00502     }
00503 
00504   /* Scan up the active states and act on each one. The result of an action
00505   may be to add more states to the currently active list (e.g. on hitting a
00506   parenthesis) or it may be to put states on the new list, for considering
00507   when we move the character pointer on. */
00508 
00509   for (i = 0; i < active_count; i++)
00510     {
00511     stateblock *current_state = active_states + i;
00512     const uschar *code;
00513     int state_offset = current_state->offset;
00514     int count, codevalue, rrc;
00515 
00516 #ifdef DEBUG
00517     printf ("%.*sProcessing state %d c=", rlevel*2-2, SP, state_offset);
00518     if (clen == 0) printf("EOL\n");
00519       else if (c > 32 && c < 127) printf("'%c'\n", c);
00520         else printf("0x%02x\n", c);
00521 #endif
00522 
00523     /* This variable is referred to implicitly in the ADD_xxx macros. */
00524 
00525     ims = current_state->ims;
00526 
00527     /* A negative offset is a special case meaning "hold off going to this
00528     (negated) state until the number of characters in the data field have
00529     been skipped". */
00530 
00531     if (state_offset < 0)
00532       {
00533       if (current_state->data > 0)
00534         {
00535         DPRINTF(("%.*sSkipping this character\n", rlevel*2-2, SP));
00536         ADD_NEW_DATA(state_offset, current_state->count,
00537           current_state->data - 1);
00538         continue;
00539         }
00540       else
00541         {
00542         current_state->offset = state_offset = -state_offset;
00543         }
00544       }
00545 
00546     /* Check for a duplicate state with the same count, and skip if found. */
00547 
00548     for (j = 0; j < i; j++)
00549       {
00550       if (active_states[j].offset == state_offset &&
00551           active_states[j].count == current_state->count)
00552         {
00553         DPRINTF(("%.*sDuplicate state: skipped\n", rlevel*2-2, SP));
00554         goto NEXT_ACTIVE_STATE;
00555         }
00556       }
00557 
00558     /* The state offset is the offset to the opcode */
00559 
00560     code = start_code + state_offset;
00561     codevalue = *code;
00562 
00563     /* If this opcode is followed by an inline character, load it. It is
00564     tempting to test for the presence of a subject character here, but that
00565     is wrong, because sometimes zero repetitions of the subject are
00566     permitted.
00567 
00568     We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
00569     argument that is not a data character - but is always one byte long. We
00570     have to take special action to deal with  \P, \p, \H, \h, \V, \v and \X in
00571     this case. To keep the other cases fast, convert these ones to new opcodes.
00572     */
00573 
00574     if (coptable[codevalue] > 0)
00575       {
00576       dlen = 1;
00577 #ifdef SUPPORT_UTF8
00578       if (utf8) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
00579 #endif  /* SUPPORT_UTF8 */
00580       d = code[coptable[codevalue]];
00581       if (codevalue >= OP_TYPESTAR)
00582         {
00583         switch(d)
00584           {
00585           case OP_ANYBYTE: return PCRE_ERROR_DFA_UITEM;
00586           case OP_NOTPROP:
00587           case OP_PROP: codevalue += OP_PROP_EXTRA; break;
00588           case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
00589           case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
00590           case OP_NOT_HSPACE:
00591           case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
00592           case OP_NOT_VSPACE:
00593           case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
00594           default: break;
00595           }
00596         }
00597       }
00598     else
00599       {
00600       dlen = 0;         /* Not strictly necessary, but compilers moan */
00601       d = NOTACHAR;     /* if these variables are not set. */
00602       }
00603 
00604 
00605     /* Now process the individual opcodes */
00606 
00607     switch (codevalue)
00608       {
00609 
00610 /* ========================================================================== */
00611       /* Reached a closing bracket. If not at the end of the pattern, carry
00612       on with the next opcode. Otherwise, unless we have an empty string and
00613       PCRE_NOTEMPTY is set, save the match data, shifting up all previous
00614       matches so we always have the longest first. */
00615 
00616       case OP_KET:
00617       case OP_KETRMIN:
00618       case OP_KETRMAX:
00619       if (code != end_code)
00620         {
00621         ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
00622         if (codevalue != OP_KET)
00623           {
00624           ADD_ACTIVE(state_offset - GET(code, 1), 0);
00625           }
00626         }
00627       else if (ptr > current_subject || (md->moptions & PCRE_NOTEMPTY) == 0)
00628         {
00629         if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
00630           else if (match_count > 0 && ++match_count * 2 >= offsetcount)
00631             match_count = 0;
00632         count = ((match_count == 0)? offsetcount : match_count * 2) - 2;
00633         if (count > 0) memmove(offsets + 2, offsets, count * sizeof(int));
00634         if (offsetcount >= 2)
00635           {
00636           offsets[0] = current_subject - start_subject;
00637           offsets[1] = ptr - start_subject;
00638           DPRINTF(("%.*sSet matched string = \"%.*s\"\n", rlevel*2-2, SP,
00639             offsets[1] - offsets[0], current_subject));
00640           }
00641         if ((md->moptions & PCRE_DFA_SHORTEST) != 0)
00642           {
00643           DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
00644             "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel,
00645             match_count, rlevel*2-2, SP));
00646           return match_count;
00647           }
00648         }
00649       break;
00650 
00651 /* ========================================================================== */
00652       /* These opcodes add to the current list of states without looking
00653       at the current character. */
00654 
00655       /*-----------------------------------------------------------------*/
00656       case OP_ALT:
00657       do { code += GET(code, 1); } while (*code == OP_ALT);
00658       ADD_ACTIVE(code - start_code, 0);
00659       break;
00660 
00661       /*-----------------------------------------------------------------*/
00662       case OP_BRA:
00663       case OP_SBRA:
00664       do
00665         {
00666         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00667         code += GET(code, 1);
00668         }
00669       while (*code == OP_ALT);
00670       break;
00671 
00672       /*-----------------------------------------------------------------*/
00673       case OP_CBRA:
00674       case OP_SCBRA:
00675       ADD_ACTIVE(code - start_code + 3 + LINK_SIZE,  0);
00676       code += GET(code, 1);
00677       while (*code == OP_ALT)
00678         {
00679         ADD_ACTIVE(code - start_code + 1 + LINK_SIZE,  0);
00680         code += GET(code, 1);
00681         }
00682       break;
00683 
00684       /*-----------------------------------------------------------------*/
00685       case OP_BRAZERO:
00686       case OP_BRAMINZERO:
00687       ADD_ACTIVE(state_offset + 1, 0);
00688       code += 1 + GET(code, 2);
00689       while (*code == OP_ALT) code += GET(code, 1);
00690       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00691       break;
00692 
00693       /*-----------------------------------------------------------------*/
00694       case OP_SKIPZERO:
00695       code += 1 + GET(code, 2);
00696       while (*code == OP_ALT) code += GET(code, 1);
00697       ADD_ACTIVE(code - start_code + 1 + LINK_SIZE, 0);
00698       break;
00699 
00700       /*-----------------------------------------------------------------*/
00701       case OP_CIRC:
00702       if ((ptr == start_subject && (md->moptions & PCRE_NOTBOL) == 0) ||
00703           ((ims & PCRE_MULTILINE) != 0 &&
00704             ptr != end_subject &&
00705             WAS_NEWLINE(ptr)))
00706         { ADD_ACTIVE(state_offset + 1, 0); }
00707       break;
00708 
00709       /*-----------------------------------------------------------------*/
00710       case OP_EOD:
00711       if (ptr >= end_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00712       break;
00713 
00714       /*-----------------------------------------------------------------*/
00715       case OP_OPT:
00716       ims = code[1];
00717       ADD_ACTIVE(state_offset + 2, 0);
00718       break;
00719 
00720       /*-----------------------------------------------------------------*/
00721       case OP_SOD:
00722       if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
00723       break;
00724 
00725       /*-----------------------------------------------------------------*/
00726       case OP_SOM:
00727       if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
00728       break;
00729 
00730 
00731 /* ========================================================================== */
00732       /* These opcodes inspect the next subject character, and sometimes
00733       the previous one as well, but do not have an argument. The variable
00734       clen contains the length of the current character and is zero if we are
00735       at the end of the subject. */
00736 
00737       /*-----------------------------------------------------------------*/
00738       case OP_ANY:
00739       if (clen > 0 && !IS_NEWLINE(ptr))
00740         { ADD_NEW(state_offset + 1, 0); }
00741       break;
00742 
00743       /*-----------------------------------------------------------------*/
00744       case OP_ALLANY:
00745       if (clen > 0)
00746         { ADD_NEW(state_offset + 1, 0); }
00747       break;
00748 
00749       /*-----------------------------------------------------------------*/
00750       case OP_EODN:
00751       if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - md->nllen))
00752         { ADD_ACTIVE(state_offset + 1, 0); }
00753       break;
00754 
00755       /*-----------------------------------------------------------------*/
00756       case OP_DOLL:
00757       if ((md->moptions & PCRE_NOTEOL) == 0)
00758         {
00759         if (clen == 0 ||
00760             ((md->poptions & PCRE_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
00761                ((ims & PCRE_MULTILINE) != 0 || ptr == end_subject - md->nllen)
00762             ))
00763           { ADD_ACTIVE(state_offset + 1, 0); }
00764         }
00765       else if ((ims & PCRE_MULTILINE) != 0 && IS_NEWLINE(ptr))
00766         { ADD_ACTIVE(state_offset + 1, 0); }
00767       break;
00768 
00769       /*-----------------------------------------------------------------*/
00770 
00771       case OP_DIGIT:
00772       case OP_WHITESPACE:
00773       case OP_WORDCHAR:
00774       if (clen > 0 && c < 256 &&
00775             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
00776         { ADD_NEW(state_offset + 1, 0); }
00777       break;
00778 
00779       /*-----------------------------------------------------------------*/
00780       case OP_NOT_DIGIT:
00781       case OP_NOT_WHITESPACE:
00782       case OP_NOT_WORDCHAR:
00783       if (clen > 0 && (c >= 256 ||
00784             ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
00785         { ADD_NEW(state_offset + 1, 0); }
00786       break;
00787 
00788       /*-----------------------------------------------------------------*/
00789       case OP_WORD_BOUNDARY:
00790       case OP_NOT_WORD_BOUNDARY:
00791         {
00792         int left_word, right_word;
00793 
00794         if (ptr > start_subject)
00795           {
00796           const uschar *temp = ptr - 1;
00797 #ifdef SUPPORT_UTF8
00798           if (utf8) BACKCHAR(temp);
00799 #endif
00800           GETCHARTEST(d, temp);
00801           left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
00802           }
00803         else left_word = 0;
00804 
00805         if (clen > 0) right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
00806           else right_word = 0;
00807 
00808         if ((left_word == right_word) == (codevalue == OP_NOT_WORD_BOUNDARY))
00809           { ADD_ACTIVE(state_offset + 1, 0); }
00810         }
00811       break;
00812 
00813 
00814       /*-----------------------------------------------------------------*/
00815       /* Check the next character by Unicode property. We will get here only
00816       if the support is in the binary; otherwise a compile-time error occurs.
00817       */
00818 
00819 #ifdef SUPPORT_UCP
00820       case OP_PROP:
00821       case OP_NOTPROP:
00822       if (clen > 0)
00823         {
00824         BOOL OK;
00825         const ucd_record * prop = GET_UCD(c);
00826         switch(code[1])
00827           {
00828           case PT_ANY:
00829           OK = TRUE;
00830           break;
00831 
00832           case PT_LAMP:
00833           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
00834           break;
00835 
00836           case PT_GC:
00837           OK = _pcre_ucp_gentype[prop->chartype] == code[2];
00838           break;
00839 
00840           case PT_PC:
00841           OK = prop->chartype == code[2];
00842           break;
00843 
00844           case PT_SC:
00845           OK = prop->script == code[2];
00846           break;
00847 
00848           /* Should never occur, but keep compilers from grumbling. */
00849 
00850           default:
00851           OK = codevalue != OP_PROP;
00852           break;
00853           }
00854 
00855         if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
00856         }
00857       break;
00858 #endif
00859 
00860 
00861 
00862 /* ========================================================================== */
00863       /* These opcodes likewise inspect the subject character, but have an
00864       argument that is not a data character. It is one of these opcodes:
00865       OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
00866       OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
00867 
00868       case OP_TYPEPLUS:
00869       case OP_TYPEMINPLUS:
00870       case OP_TYPEPOSPLUS:
00871       count = current_state->count;  /* Already matched */
00872       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
00873       if (clen > 0)
00874         {
00875         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00876             (c < 256 &&
00877               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00878               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00879           {
00880           if (count > 0 && codevalue == OP_TYPEPOSPLUS)
00881             {
00882             active_count--;            /* Remove non-match possibility */
00883             next_active_state--;
00884             }
00885           count++;
00886           ADD_NEW(state_offset, count);
00887           }
00888         }
00889       break;
00890 
00891       /*-----------------------------------------------------------------*/
00892       case OP_TYPEQUERY:
00893       case OP_TYPEMINQUERY:
00894       case OP_TYPEPOSQUERY:
00895       ADD_ACTIVE(state_offset + 2, 0);
00896       if (clen > 0)
00897         {
00898         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00899             (c < 256 &&
00900               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00901               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00902           {
00903           if (codevalue == OP_TYPEPOSQUERY)
00904             {
00905             active_count--;            /* Remove non-match possibility */
00906             next_active_state--;
00907             }
00908           ADD_NEW(state_offset + 2, 0);
00909           }
00910         }
00911       break;
00912 
00913       /*-----------------------------------------------------------------*/
00914       case OP_TYPESTAR:
00915       case OP_TYPEMINSTAR:
00916       case OP_TYPEPOSSTAR:
00917       ADD_ACTIVE(state_offset + 2, 0);
00918       if (clen > 0)
00919         {
00920         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00921             (c < 256 &&
00922               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00923               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00924           {
00925           if (codevalue == OP_TYPEPOSSTAR)
00926             {
00927             active_count--;            /* Remove non-match possibility */
00928             next_active_state--;
00929             }
00930           ADD_NEW(state_offset, 0);
00931           }
00932         }
00933       break;
00934 
00935       /*-----------------------------------------------------------------*/
00936       case OP_TYPEEXACT:
00937       count = current_state->count;  /* Number already matched */
00938       if (clen > 0)
00939         {
00940         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00941             (c < 256 &&
00942               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00943               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00944           {
00945           if (++count >= GET2(code, 1))
00946             { ADD_NEW(state_offset + 4, 0); }
00947           else
00948             { ADD_NEW(state_offset, count); }
00949           }
00950         }
00951       break;
00952 
00953       /*-----------------------------------------------------------------*/
00954       case OP_TYPEUPTO:
00955       case OP_TYPEMINUPTO:
00956       case OP_TYPEPOSUPTO:
00957       ADD_ACTIVE(state_offset + 4, 0);
00958       count = current_state->count;  /* Number already matched */
00959       if (clen > 0)
00960         {
00961         if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
00962             (c < 256 &&
00963               (d != OP_ANY || !IS_NEWLINE(ptr)) &&
00964               ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
00965           {
00966           if (codevalue == OP_TYPEPOSUPTO)
00967             {
00968             active_count--;           /* Remove non-match possibility */
00969             next_active_state--;
00970             }
00971           if (++count >= GET2(code, 1))
00972             { ADD_NEW(state_offset + 4, 0); }
00973           else
00974             { ADD_NEW(state_offset, count); }
00975           }
00976         }
00977       break;
00978 
00979 /* ========================================================================== */
00980       /* These are virtual opcodes that are used when something like
00981       OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or a vertical
00982       or horizontal space type (tested against OP_VSPACE/OP_HSPACE) as its
00983       argument. It keeps the code above fast for the other cases. The argument is in the d variable. */
00984 
00985 #ifdef SUPPORT_UCP
00986       case OP_PROP_EXTRA + OP_TYPEPLUS:
00987       case OP_PROP_EXTRA + OP_TYPEMINPLUS:
00988       case OP_PROP_EXTRA + OP_TYPEPOSPLUS:
00989       count = current_state->count;           /* Already matched */
00990       if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
00991       if (clen > 0)
00992         {
00993         BOOL OK;
00994         const ucd_record * prop = GET_UCD(c);
00995         switch(code[2])
00996           {
00997           case PT_ANY:
00998           OK = TRUE;
00999           break;
01000 
01001           case PT_LAMP:
01002           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01003           break;
01004 
01005           case PT_GC:
01006           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01007           break;
01008 
01009           case PT_PC:
01010           OK = prop->chartype == code[3];
01011           break;
01012 
01013           case PT_SC:
01014           OK = prop->script == code[3];
01015           break;
01016 
01017           /* Should never occur, but keep compilers from grumbling. */
01018 
01019           default:
01020           OK = codevalue != OP_PROP;
01021           break;
01022           }
01023 
01024         if (OK == (d == OP_PROP))
01025           {
01026           if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
01027             {
01028             active_count--;           /* Remove non-match possibility */
01029             next_active_state--;
01030             }
01031           count++;
01032           ADD_NEW(state_offset, count);
01033           }
01034         }
01035       break;
01036 
01037       /*-----------------------------------------------------------------*/
01038       case OP_EXTUNI_EXTRA + OP_TYPEPLUS:
01039       case OP_EXTUNI_EXTRA + OP_TYPEMINPLUS:
01040       case OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS:
01041       count = current_state->count;  /* Already matched */
01042       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01043       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01044         {
01045         const uschar *nptr = ptr + clen;
01046         int ncount = 0;
01047         if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
01048           {
01049           active_count--;           /* Remove non-match possibility */
01050           next_active_state--;
01051           }
01052         while (nptr < end_subject)
01053           {
01054           int nd;
01055           int ndlen = 1;
01056           GETCHARLEN(nd, nptr, ndlen);
01057           if (UCD_CATEGORY(nd) != ucp_M) break;
01058           ncount++;
01059           nptr += ndlen;
01060           }
01061         count++;
01062         ADD_NEW_DATA(-state_offset, count, ncount);
01063         }
01064       break;
01065 #endif
01066 
01067       /*-----------------------------------------------------------------*/
01068       case OP_ANYNL_EXTRA + OP_TYPEPLUS:
01069       case OP_ANYNL_EXTRA + OP_TYPEMINPLUS:
01070       case OP_ANYNL_EXTRA + OP_TYPEPOSPLUS:
01071       count = current_state->count;  /* Already matched */
01072       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01073       if (clen > 0)
01074         {
01075         int ncount = 0;
01076         switch (c)
01077           {
01078           case 0x000b:
01079           case 0x000c:
01080           case 0x0085:
01081           case 0x2028:
01082           case 0x2029:
01083           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01084           goto ANYNL01;
01085 
01086           case 0x000d:
01087           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01088           /* Fall through */
01089 
01090           ANYNL01:
01091           case 0x000a:
01092           if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
01093             {
01094             active_count--;           /* Remove non-match possibility */
01095             next_active_state--;
01096             }
01097           count++;
01098           ADD_NEW_DATA(-state_offset, count, ncount);
01099           break;
01100 
01101           default:
01102           break;
01103           }
01104         }
01105       break;
01106 
01107       /*-----------------------------------------------------------------*/
01108       case OP_VSPACE_EXTRA + OP_TYPEPLUS:
01109       case OP_VSPACE_EXTRA + OP_TYPEMINPLUS:
01110       case OP_VSPACE_EXTRA + OP_TYPEPOSPLUS:
01111       count = current_state->count;  /* Already matched */
01112       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01113       if (clen > 0)
01114         {
01115         BOOL OK;
01116         switch (c)
01117           {
01118           case 0x000a:
01119           case 0x000b:
01120           case 0x000c:
01121           case 0x000d:
01122           case 0x0085:
01123           case 0x2028:
01124           case 0x2029:
01125           OK = TRUE;
01126           break;
01127 
01128           default:
01129           OK = FALSE;
01130           break;
01131           }
01132 
01133         if (OK == (d == OP_VSPACE))
01134           {
01135           if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
01136             {
01137             active_count--;           /* Remove non-match possibility */
01138             next_active_state--;
01139             }
01140           count++;
01141           ADD_NEW_DATA(-state_offset, count, 0);
01142           }
01143         }
01144       break;
01145 
01146       /*-----------------------------------------------------------------*/
01147       case OP_HSPACE_EXTRA + OP_TYPEPLUS:
01148       case OP_HSPACE_EXTRA + OP_TYPEMINPLUS:
01149       case OP_HSPACE_EXTRA + OP_TYPEPOSPLUS:
01150       count = current_state->count;  /* Already matched */
01151       if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
01152       if (clen > 0)
01153         {
01154         BOOL OK;
01155         switch (c)
01156           {
01157           case 0x09:      /* HT */
01158           case 0x20:      /* SPACE */
01159           case 0xa0:      /* NBSP */
01160           case 0x1680:    /* OGHAM SPACE MARK */
01161           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01162           case 0x2000:    /* EN QUAD */
01163           case 0x2001:    /* EM QUAD */
01164           case 0x2002:    /* EN SPACE */
01165           case 0x2003:    /* EM SPACE */
01166           case 0x2004:    /* THREE-PER-EM SPACE */
01167           case 0x2005:    /* FOUR-PER-EM SPACE */
01168           case 0x2006:    /* SIX-PER-EM SPACE */
01169           case 0x2007:    /* FIGURE SPACE */
01170           case 0x2008:    /* PUNCTUATION SPACE */
01171           case 0x2009:    /* THIN SPACE */
01172           case 0x200A:    /* HAIR SPACE */
01173           case 0x202f:    /* NARROW NO-BREAK SPACE */
01174           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01175           case 0x3000:    /* IDEOGRAPHIC SPACE */
01176           OK = TRUE;
01177           break;
01178 
01179           default:
01180           OK = FALSE;
01181           break;
01182           }
01183 
01184         if (OK == (d == OP_HSPACE))
01185           {
01186           if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
01187             {
01188             active_count--;           /* Remove non-match possibility */
01189             next_active_state--;
01190             }
01191           count++;
01192           ADD_NEW_DATA(-state_offset, count, 0);
01193           }
01194         }
01195       break;
01196 
01197       /*-----------------------------------------------------------------*/
01198 #ifdef SUPPORT_UCP
01199       case OP_PROP_EXTRA + OP_TYPEQUERY:
01200       case OP_PROP_EXTRA + OP_TYPEMINQUERY:
01201       case OP_PROP_EXTRA + OP_TYPEPOSQUERY:
01202       count = 4;
01203       goto QS1;
01204 
01205       case OP_PROP_EXTRA + OP_TYPESTAR:
01206       case OP_PROP_EXTRA + OP_TYPEMINSTAR:
01207       case OP_PROP_EXTRA + OP_TYPEPOSSTAR:
01208       count = 0;
01209 
01210       QS1:
01211 
01212       ADD_ACTIVE(state_offset + 4, 0);
01213       if (clen > 0)
01214         {
01215         BOOL OK;
01216         const ucd_record * prop = GET_UCD(c);
01217         switch(code[2])
01218           {
01219           case PT_ANY:
01220           OK = TRUE;
01221           break;
01222 
01223           case PT_LAMP:
01224           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01225           break;
01226 
01227           case PT_GC:
01228           OK = _pcre_ucp_gentype[prop->chartype] == code[3];
01229           break;
01230 
01231           case PT_PC:
01232           OK = prop->chartype == code[3];
01233           break;
01234 
01235           case PT_SC:
01236           OK = prop->script == code[3];
01237           break;
01238 
01239           /* Should never occur, but keep compilers from grumbling. */
01240 
01241           default:
01242           OK = codevalue != OP_PROP;
01243           break;
01244           }
01245 
01246         if (OK == (d == OP_PROP))
01247           {
01248           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
01249               codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
01250             {
01251             active_count--;           /* Remove non-match possibility */
01252             next_active_state--;
01253             }
01254           ADD_NEW(state_offset + count, 0);
01255           }
01256         }
01257       break;
01258 
01259       /*-----------------------------------------------------------------*/
01260       case OP_EXTUNI_EXTRA + OP_TYPEQUERY:
01261       case OP_EXTUNI_EXTRA + OP_TYPEMINQUERY:
01262       case OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY:
01263       count = 2;
01264       goto QS2;
01265 
01266       case OP_EXTUNI_EXTRA + OP_TYPESTAR:
01267       case OP_EXTUNI_EXTRA + OP_TYPEMINSTAR:
01268       case OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR:
01269       count = 0;
01270 
01271       QS2:
01272 
01273       ADD_ACTIVE(state_offset + 2, 0);
01274       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01275         {
01276         const uschar *nptr = ptr + clen;
01277         int ncount = 0;
01278         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
01279             codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
01280           {
01281           active_count--;           /* Remove non-match possibility */
01282           next_active_state--;
01283           }
01284         while (nptr < end_subject)
01285           {
01286           int nd;
01287           int ndlen = 1;
01288           GETCHARLEN(nd, nptr, ndlen);
01289           if (UCD_CATEGORY(nd) != ucp_M) break;
01290           ncount++;
01291           nptr += ndlen;
01292           }
01293         ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01294         }
01295       break;
01296 #endif
01297 
01298       /*-----------------------------------------------------------------*/
01299       case OP_ANYNL_EXTRA + OP_TYPEQUERY:
01300       case OP_ANYNL_EXTRA + OP_TYPEMINQUERY:
01301       case OP_ANYNL_EXTRA + OP_TYPEPOSQUERY:
01302       count = 2;
01303       goto QS3;
01304 
01305       case OP_ANYNL_EXTRA + OP_TYPESTAR:
01306       case OP_ANYNL_EXTRA + OP_TYPEMINSTAR:
01307       case OP_ANYNL_EXTRA + OP_TYPEPOSSTAR:
01308       count = 0;
01309 
01310       QS3:
01311       ADD_ACTIVE(state_offset + 2, 0);
01312       if (clen > 0)
01313         {
01314         int ncount = 0;
01315         switch (c)
01316           {
01317           case 0x000b:
01318           case 0x000c:
01319           case 0x0085:
01320           case 0x2028:
01321           case 0x2029:
01322           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01323           goto ANYNL02;
01324 
01325           case 0x000d:
01326           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01327           /* Fall through */
01328 
01329           ANYNL02:
01330           case 0x000a:
01331           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
01332               codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
01333             {
01334             active_count--;           /* Remove non-match possibility */
01335             next_active_state--;
01336             }
01337           ADD_NEW_DATA(-(state_offset + count), 0, ncount);
01338           break;
01339 
01340           default:
01341           break;
01342           }
01343         }
01344       break;
01345 
01346       /*-----------------------------------------------------------------*/
01347       case OP_VSPACE_EXTRA + OP_TYPEQUERY:
01348       case OP_VSPACE_EXTRA + OP_TYPEMINQUERY:
01349       case OP_VSPACE_EXTRA + OP_TYPEPOSQUERY:
01350       count = 2;
01351       goto QS4;
01352 
01353       case OP_VSPACE_EXTRA + OP_TYPESTAR:
01354       case OP_VSPACE_EXTRA + OP_TYPEMINSTAR:
01355       case OP_VSPACE_EXTRA + OP_TYPEPOSSTAR:
01356       count = 0;
01357 
01358       QS4:
01359       ADD_ACTIVE(state_offset + 2, 0);
01360       if (clen > 0)
01361         {
01362         BOOL OK;
01363         switch (c)
01364           {
01365           case 0x000a:
01366           case 0x000b:
01367           case 0x000c:
01368           case 0x000d:
01369           case 0x0085:
01370           case 0x2028:
01371           case 0x2029:
01372           OK = TRUE;
01373           break;
01374 
01375           default:
01376           OK = FALSE;
01377           break;
01378           }
01379         if (OK == (d == OP_VSPACE))
01380           {
01381           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
01382               codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
01383             {
01384             active_count--;           /* Remove non-match possibility */
01385             next_active_state--;
01386             }
01387           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01388           }
01389         }
01390       break;
01391 
01392       /*-----------------------------------------------------------------*/
01393       case OP_HSPACE_EXTRA + OP_TYPEQUERY:
01394       case OP_HSPACE_EXTRA + OP_TYPEMINQUERY:
01395       case OP_HSPACE_EXTRA + OP_TYPEPOSQUERY:
01396       count = 2;
01397       goto QS5;
01398 
01399       case OP_HSPACE_EXTRA + OP_TYPESTAR:
01400       case OP_HSPACE_EXTRA + OP_TYPEMINSTAR:
01401       case OP_HSPACE_EXTRA + OP_TYPEPOSSTAR:
01402       count = 0;
01403 
01404       QS5:
01405       ADD_ACTIVE(state_offset + 2, 0);
01406       if (clen > 0)
01407         {
01408         BOOL OK;
01409         switch (c)
01410           {
01411           case 0x09:      /* HT */
01412           case 0x20:      /* SPACE */
01413           case 0xa0:      /* NBSP */
01414           case 0x1680:    /* OGHAM SPACE MARK */
01415           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01416           case 0x2000:    /* EN QUAD */
01417           case 0x2001:    /* EM QUAD */
01418           case 0x2002:    /* EN SPACE */
01419           case 0x2003:    /* EM SPACE */
01420           case 0x2004:    /* THREE-PER-EM SPACE */
01421           case 0x2005:    /* FOUR-PER-EM SPACE */
01422           case 0x2006:    /* SIX-PER-EM SPACE */
01423           case 0x2007:    /* FIGURE SPACE */
01424           case 0x2008:    /* PUNCTUATION SPACE */
01425           case 0x2009:    /* THIN SPACE */
01426           case 0x200A:    /* HAIR SPACE */
01427           case 0x202f:    /* NARROW NO-BREAK SPACE */
01428           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01429           case 0x3000:    /* IDEOGRAPHIC SPACE */
01430           OK = TRUE;
01431           break;
01432 
01433           default:
01434           OK = FALSE;
01435           break;
01436           }
01437 
01438         if (OK == (d == OP_HSPACE))
01439           {
01440           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
01441               codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
01442             {
01443             active_count--;           /* Remove non-match possibility */
01444             next_active_state--;
01445             }
01446           ADD_NEW_DATA(-(state_offset + count), 0, 0);
01447           }
01448         }
01449       break;
01450 
01451       /*-----------------------------------------------------------------*/
01452 #ifdef SUPPORT_UCP
01453       case OP_PROP_EXTRA + OP_TYPEEXACT:
01454       case OP_PROP_EXTRA + OP_TYPEUPTO:
01455       case OP_PROP_EXTRA + OP_TYPEMINUPTO:
01456       case OP_PROP_EXTRA + OP_TYPEPOSUPTO:
01457       if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
01458         { ADD_ACTIVE(state_offset + 6, 0); }
01459       count = current_state->count;  /* Number already matched */
01460       if (clen > 0)
01461         {
01462         BOOL OK;
01463         const ucd_record * prop = GET_UCD(c);
01464         switch(code[4])
01465           {
01466           case PT_ANY:
01467           OK = TRUE;
01468           break;
01469 
01470           case PT_LAMP:
01471           OK = prop->chartype == ucp_Lu || prop->chartype == ucp_Ll || prop->chartype == ucp_Lt;
01472           break;
01473 
01474           case PT_GC:
01475           OK = _pcre_ucp_gentype[prop->chartype] == code[5];
01476           break;
01477 
01478           case PT_PC:
01479           OK = prop->chartype == code[5];
01480           break;
01481 
01482           case PT_SC:
01483           OK = prop->script == code[5];
01484           break;
01485 
01486           /* Should never occur, but keep compilers from grumbling. */
01487 
01488           default:
01489           OK = codevalue != OP_PROP;
01490           break;
01491           }
01492 
01493         if (OK == (d == OP_PROP))
01494           {
01495           if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
01496             {
01497             active_count--;           /* Remove non-match possibility */
01498             next_active_state--;
01499             }
01500           if (++count >= GET2(code, 1))
01501             { ADD_NEW(state_offset + 6, 0); }
01502           else
01503             { ADD_NEW(state_offset, count); }
01504           }
01505         }
01506       break;
01507 
01508       /*-----------------------------------------------------------------*/
01509       case OP_EXTUNI_EXTRA + OP_TYPEEXACT:
01510       case OP_EXTUNI_EXTRA + OP_TYPEUPTO:
01511       case OP_EXTUNI_EXTRA + OP_TYPEMINUPTO:
01512       case OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO:
01513       if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
01514         { ADD_ACTIVE(state_offset + 4, 0); }
01515       count = current_state->count;  /* Number already matched */
01516       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01517         {
01518         const uschar *nptr = ptr + clen;
01519         int ncount = 0;
01520         if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
01521           {
01522           active_count--;           /* Remove non-match possibility */
01523           next_active_state--;
01524           }
01525         while (nptr < end_subject)
01526           {
01527           int nd;
01528           int ndlen = 1;
01529           GETCHARLEN(nd, nptr, ndlen);
01530           if (UCD_CATEGORY(nd) != ucp_M) break;
01531           ncount++;
01532           nptr += ndlen;
01533           }
01534         if (++count >= GET2(code, 1))
01535           { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01536         else
01537           { ADD_NEW_DATA(-state_offset, count, ncount); }
01538         }
01539       break;
01540 #endif
01541 
01542       /*-----------------------------------------------------------------*/
01543       case OP_ANYNL_EXTRA + OP_TYPEEXACT:
01544       case OP_ANYNL_EXTRA + OP_TYPEUPTO:
01545       case OP_ANYNL_EXTRA + OP_TYPEMINUPTO:
01546       case OP_ANYNL_EXTRA + OP_TYPEPOSUPTO:
01547       if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
01548         { ADD_ACTIVE(state_offset + 4, 0); }
01549       count = current_state->count;  /* Number already matched */
01550       if (clen > 0)
01551         {
01552         int ncount = 0;
01553         switch (c)
01554           {
01555           case 0x000b:
01556           case 0x000c:
01557           case 0x0085:
01558           case 0x2028:
01559           case 0x2029:
01560           if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01561           goto ANYNL03;
01562 
01563           case 0x000d:
01564           if (ptr + 1 < end_subject && ptr[1] == 0x0a) ncount = 1;
01565           /* Fall through */
01566 
01567           ANYNL03:
01568           case 0x000a:
01569           if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
01570             {
01571             active_count--;           /* Remove non-match possibility */
01572             next_active_state--;
01573             }
01574           if (++count >= GET2(code, 1))
01575             { ADD_NEW_DATA(-(state_offset + 4), 0, ncount); }
01576           else
01577             { ADD_NEW_DATA(-state_offset, count, ncount); }
01578           break;
01579 
01580           default:
01581           break;
01582           }
01583         }
01584       break;
01585 
01586       /*-----------------------------------------------------------------*/
01587       case OP_VSPACE_EXTRA + OP_TYPEEXACT:
01588       case OP_VSPACE_EXTRA + OP_TYPEUPTO:
01589       case OP_VSPACE_EXTRA + OP_TYPEMINUPTO:
01590       case OP_VSPACE_EXTRA + OP_TYPEPOSUPTO:
01591       if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
01592         { ADD_ACTIVE(state_offset + 4, 0); }
01593       count = current_state->count;  /* Number already matched */
01594       if (clen > 0)
01595         {
01596         BOOL OK;
01597         switch (c)
01598           {
01599           case 0x000a:
01600           case 0x000b:
01601           case 0x000c:
01602           case 0x000d:
01603           case 0x0085:
01604           case 0x2028:
01605           case 0x2029:
01606           OK = TRUE;
01607           break;
01608 
01609           default:
01610           OK = FALSE;
01611           }
01612 
01613         if (OK == (d == OP_VSPACE))
01614           {
01615           if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
01616             {
01617             active_count--;           /* Remove non-match possibility */
01618             next_active_state--;
01619             }
01620           if (++count >= GET2(code, 1))
01621             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01622           else
01623             { ADD_NEW_DATA(-state_offset, count, 0); }
01624           }
01625         }
01626       break;
01627 
01628       /*-----------------------------------------------------------------*/
01629       case OP_HSPACE_EXTRA + OP_TYPEEXACT:
01630       case OP_HSPACE_EXTRA + OP_TYPEUPTO:
01631       case OP_HSPACE_EXTRA + OP_TYPEMINUPTO:
01632       case OP_HSPACE_EXTRA + OP_TYPEPOSUPTO:
01633       if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
01634         { ADD_ACTIVE(state_offset + 4, 0); }
01635       count = current_state->count;  /* Number already matched */
01636       if (clen > 0)
01637         {
01638         BOOL OK;
01639         switch (c)
01640           {
01641           case 0x09:      /* HT */
01642           case 0x20:      /* SPACE */
01643           case 0xa0:      /* NBSP */
01644           case 0x1680:    /* OGHAM SPACE MARK */
01645           case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01646           case 0x2000:    /* EN QUAD */
01647           case 0x2001:    /* EM QUAD */
01648           case 0x2002:    /* EN SPACE */
01649           case 0x2003:    /* EM SPACE */
01650           case 0x2004:    /* THREE-PER-EM SPACE */
01651           case 0x2005:    /* FOUR-PER-EM SPACE */
01652           case 0x2006:    /* SIX-PER-EM SPACE */
01653           case 0x2007:    /* FIGURE SPACE */
01654           case 0x2008:    /* PUNCTUATION SPACE */
01655           case 0x2009:    /* THIN SPACE */
01656           case 0x200A:    /* HAIR SPACE */
01657           case 0x202f:    /* NARROW NO-BREAK SPACE */
01658           case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01659           case 0x3000:    /* IDEOGRAPHIC SPACE */
01660           OK = TRUE;
01661           break;
01662 
01663           default:
01664           OK = FALSE;
01665           break;
01666           }
01667 
01668         if (OK == (d == OP_HSPACE))
01669           {
01670           if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
01671             {
01672             active_count--;           /* Remove non-match possibility */
01673             next_active_state--;
01674             }
01675           if (++count >= GET2(code, 1))
01676             { ADD_NEW_DATA(-(state_offset + 4), 0, 0); }
01677           else
01678             { ADD_NEW_DATA(-state_offset, count, 0); }
01679           }
01680         }
01681       break;
01682 
01683 /* ========================================================================== */
01684       /* These opcodes are followed by a character that is usually compared
01685       to the current subject character; it is loaded into d. We still get
01686       here even if there is no subject character, because in some cases zero
01687       repetitions are permitted. */
01688 
01689       /*-----------------------------------------------------------------*/
01690       case OP_CHAR:
01691       if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
01692       break;
01693 
01694       /*-----------------------------------------------------------------*/
01695       case OP_CHARNC:
01696       if (clen == 0) break;
01697 
01698 #ifdef SUPPORT_UTF8
01699       if (utf8)
01700         {
01701         if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
01702           {
01703           unsigned int othercase;
01704           if (c < 128) othercase = fcc[c]; else
01705 
01706           /* If we have Unicode property support, we can use it to test the
01707           other case of the character. */
01708 
01709 #ifdef SUPPORT_UCP
01710           othercase = UCD_OTHERCASE(c);
01711 #else
01712           othercase = NOTACHAR;
01713 #endif
01714 
01715           if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
01716           }
01717         }
01718       else
01719 #endif  /* SUPPORT_UTF8 */
01720 
01721       /* Non-UTF-8 mode */
01722         {
01723         if (lcc[c] == lcc[d]) { ADD_NEW(state_offset + 2, 0); }
01724         }
01725       break;
01726 
01727 
01728 #ifdef SUPPORT_UCP
01729       /*-----------------------------------------------------------------*/
01730       /* This is a tricky one because it can match more than one character.
01731       Find out how many characters to skip, and then set up a negative state
01732       to wait for them to pass before continuing. */
01733 
01734       case OP_EXTUNI:
01735       if (clen > 0 && UCD_CATEGORY(c) != ucp_M)
01736         {
01737         const uschar *nptr = ptr + clen;
01738         int ncount = 0;
01739         while (nptr < end_subject)
01740           {
01741           int nclen = 1;
01742           GETCHARLEN(c, nptr, nclen);
01743           if (UCD_CATEGORY(c) != ucp_M) break;
01744           ncount++;
01745           nptr += nclen;
01746           }
01747         ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
01748         }
01749       break;
01750 #endif
01751 
01752       /*-----------------------------------------------------------------*/
01753       /* This is tricky, like EXTUNI, because it too can match more than one
01754       character (when CR is followed by LF). In this case, set up a negative
01755       state to wait for one character to pass before continuing. */
01756 
01757       case OP_ANYNL:
01758       if (clen > 0) switch(c)
01759         {
01760         case 0x000b:
01761         case 0x000c:
01762         case 0x0085:
01763         case 0x2028:
01764         case 0x2029:
01765         if ((md->moptions & PCRE_BSR_ANYCRLF) != 0) break;
01766 
01767         case 0x000a:
01768         ADD_NEW(state_offset + 1, 0);
01769         break;
01770 
01771         case 0x000d:
01772         if (ptr + 1 < end_subject && ptr[1] == 0x0a)
01773           {
01774           ADD_NEW_DATA(-(state_offset + 1), 0, 1);
01775           }
01776         else
01777           {
01778           ADD_NEW(state_offset + 1, 0);
01779           }
01780         break;
01781         }
01782       break;
01783 
01784       /*-----------------------------------------------------------------*/
01785       case OP_NOT_VSPACE:
01786       if (clen > 0) switch(c)
01787         {
01788         case 0x000a:
01789         case 0x000b:
01790         case 0x000c:
01791         case 0x000d:
01792         case 0x0085:
01793         case 0x2028:
01794         case 0x2029:
01795         break;
01796 
01797         default:
01798         ADD_NEW(state_offset + 1, 0);
01799         break;
01800         }
01801       break;
01802 
01803       /*-----------------------------------------------------------------*/
01804       case OP_VSPACE:
01805       if (clen > 0) switch(c)
01806         {
01807         case 0x000a:
01808         case 0x000b:
01809         case 0x000c:
01810         case 0x000d:
01811         case 0x0085:
01812         case 0x2028:
01813         case 0x2029:
01814         ADD_NEW(state_offset + 1, 0);
01815         break;
01816 
01817         default: break;
01818         }
01819       break;
01820 
01821       /*-----------------------------------------------------------------*/
01822       case OP_NOT_HSPACE:
01823       if (clen > 0) switch(c)
01824         {
01825         case 0x09:      /* HT */
01826         case 0x20:      /* SPACE */
01827         case 0xa0:      /* NBSP */
01828         case 0x1680:    /* OGHAM SPACE MARK */
01829         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01830         case 0x2000:    /* EN QUAD */
01831         case 0x2001:    /* EM QUAD */
01832         case 0x2002:    /* EN SPACE */
01833         case 0x2003:    /* EM SPACE */
01834         case 0x2004:    /* THREE-PER-EM SPACE */
01835         case 0x2005:    /* FOUR-PER-EM SPACE */
01836         case 0x2006:    /* SIX-PER-EM SPACE */
01837         case 0x2007:    /* FIGURE SPACE */
01838         case 0x2008:    /* PUNCTUATION SPACE */
01839         case 0x2009:    /* THIN SPACE */
01840         case 0x200A:    /* HAIR SPACE */
01841         case 0x202f:    /* NARROW NO-BREAK SPACE */
01842         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01843         case 0x3000:    /* IDEOGRAPHIC SPACE */
01844         break;
01845 
01846         default:
01847         ADD_NEW(state_offset + 1, 0);
01848         break;
01849         }
01850       break;
01851 
01852       /*-----------------------------------------------------------------*/
01853       case OP_HSPACE:
01854       if (clen > 0) switch(c)
01855         {
01856         case 0x09:      /* HT */
01857         case 0x20:      /* SPACE */
01858         case 0xa0:      /* NBSP */
01859         case 0x1680:    /* OGHAM SPACE MARK */
01860         case 0x180e:    /* MONGOLIAN VOWEL SEPARATOR */
01861         case 0x2000:    /* EN QUAD */
01862         case 0x2001:    /* EM QUAD */
01863         case 0x2002:    /* EN SPACE */
01864         case 0x2003:    /* EM SPACE */
01865         case 0x2004:    /* THREE-PER-EM SPACE */
01866         case 0x2005:    /* FOUR-PER-EM SPACE */
01867         case 0x2006:    /* SIX-PER-EM SPACE */
01868         case 0x2007:    /* FIGURE SPACE */
01869         case 0x2008:    /* PUNCTUATION SPACE */
01870         case 0x2009:    /* THIN SPACE */
01871         case 0x200A:    /* HAIR SPACE */
01872         case 0x202f:    /* NARROW NO-BREAK SPACE */
01873         case 0x205f:    /* MEDIUM MATHEMATICAL SPACE */
01874         case 0x3000:    /* IDEOGRAPHIC SPACE */
01875         ADD_NEW(state_offset + 1, 0);
01876         break;
01877         }
01878       break;
01879 
01880       /*-----------------------------------------------------------------*/
01881       /* Match a negated single character. This is only used for one-byte
01882       characters, that is, we know that d < 256. The character we are
01883       checking (c) can be multibyte. */
01884 
01885       case OP_NOT:
01886       if (clen > 0)
01887         {
01888         unsigned int otherd = ((ims & PCRE_CASELESS) != 0)? fcc[d] : d;
01889         if (c != d && c != otherd) { ADD_NEW(state_offset + dlen + 1, 0); }
01890         }
01891       break;
01892 
01893       /*-----------------------------------------------------------------*/
01894       case OP_PLUS:
01895       case OP_MINPLUS:
01896       case OP_POSPLUS:
01897       case OP_NOTPLUS:
01898       case OP_NOTMINPLUS:
01899       case OP_NOTPOSPLUS:
01900       count = current_state->count;  /* Already matched */
01901       if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
01902       if (clen > 0)
01903         {
01904         unsigned int otherd = NOTACHAR;
01905         if ((ims & PCRE_CASELESS) != 0)
01906           {
01907 #ifdef SUPPORT_UTF8
01908           if (utf8 && d >= 128)
01909             {
01910 #ifdef SUPPORT_UCP
01911             otherd = UCD_OTHERCASE(d);
01912 #endif  /* SUPPORT_UCP */
01913             }
01914           else
01915 #endif  /* SUPPORT_UTF8 */
01916           otherd = fcc[d];
01917           }
01918         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01919           {
01920           if (count > 0 &&
01921               (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
01922             {
01923             active_count--;             /* Remove non-match possibility */
01924             next_active_state--;
01925             }
01926           count++;
01927           ADD_NEW(state_offset, count);
01928           }
01929         }
01930       break;
01931 
01932       /*-----------------------------------------------------------------*/
01933       case OP_QUERY:
01934       case OP_MINQUERY:
01935       case OP_POSQUERY:
01936       case OP_NOTQUERY:
01937       case OP_NOTMINQUERY:
01938       case OP_NOTPOSQUERY:
01939       ADD_ACTIVE(state_offset + dlen + 1, 0);
01940       if (clen > 0)
01941         {
01942         unsigned int otherd = NOTACHAR;
01943         if ((ims & PCRE_CASELESS) != 0)
01944           {
01945 #ifdef SUPPORT_UTF8
01946           if (utf8 && d >= 128)
01947             {
01948 #ifdef SUPPORT_UCP
01949             otherd = UCD_OTHERCASE(d);
01950 #endif  /* SUPPORT_UCP */
01951             }
01952           else
01953 #endif  /* SUPPORT_UTF8 */
01954           otherd = fcc[d];
01955           }
01956         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01957           {
01958           if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
01959             {
01960             active_count--;            /* Remove non-match possibility */
01961             next_active_state--;
01962             }
01963           ADD_NEW(state_offset + dlen + 1, 0);
01964           }
01965         }
01966       break;
01967 
01968       /*-----------------------------------------------------------------*/
01969       case OP_STAR:
01970       case OP_MINSTAR:
01971       case OP_POSSTAR:
01972       case OP_NOTSTAR:
01973       case OP_NOTMINSTAR:
01974       case OP_NOTPOSSTAR:
01975       ADD_ACTIVE(state_offset + dlen + 1, 0);
01976       if (clen > 0)
01977         {
01978         unsigned int otherd = NOTACHAR;
01979         if ((ims & PCRE_CASELESS) != 0)
01980           {
01981 #ifdef SUPPORT_UTF8
01982           if (utf8 && d >= 128)
01983             {
01984 #ifdef SUPPORT_UCP
01985             otherd = UCD_OTHERCASE(d);
01986 #endif  /* SUPPORT_UCP */
01987             }
01988           else
01989 #endif  /* SUPPORT_UTF8 */
01990           otherd = fcc[d];
01991           }
01992         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
01993           {
01994           if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
01995             {
01996             active_count--;            /* Remove non-match possibility */
01997             next_active_state--;
01998             }
01999           ADD_NEW(state_offset, 0);
02000           }
02001         }
02002       break;
02003 
02004       /*-----------------------------------------------------------------*/
02005       case OP_EXACT:
02006       case OP_NOTEXACT:
02007       count = current_state->count;  /* Number already matched */
02008       if (clen > 0)
02009         {
02010         unsigned int otherd = NOTACHAR;
02011         if ((ims & PCRE_CASELESS) != 0)
02012           {
02013 #ifdef SUPPORT_UTF8
02014           if (utf8 && d >= 128)
02015             {
02016 #ifdef SUPPORT_UCP
02017             otherd = UCD_OTHERCASE(d);
02018 #endif  /* SUPPORT_UCP */
02019             }
02020           else
02021 #endif  /* SUPPORT_UTF8 */
02022           otherd = fcc[d];
02023           }
02024         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02025           {
02026           if (++count >= GET2(code, 1))
02027             { ADD_NEW(state_offset + dlen + 3, 0); }
02028           else
02029             { ADD_NEW(state_offset, count); }
02030           }
02031         }
02032       break;
02033 
02034       /*-----------------------------------------------------------------*/
02035       case OP_UPTO:
02036       case OP_MINUPTO:
02037       case OP_POSUPTO:
02038       case OP_NOTUPTO:
02039       case OP_NOTMINUPTO:
02040       case OP_NOTPOSUPTO:
02041       ADD_ACTIVE(state_offset + dlen + 3, 0);
02042       count = current_state->count;  /* Number already matched */
02043       if (clen > 0)
02044         {
02045         unsigned int otherd = NOTACHAR;
02046         if ((ims & PCRE_CASELESS) != 0)
02047           {
02048 #ifdef SUPPORT_UTF8
02049           if (utf8 && d >= 128)
02050             {
02051 #ifdef SUPPORT_UCP
02052             otherd = UCD_OTHERCASE(d);
02053 #endif  /* SUPPORT_UCP */
02054             }
02055           else
02056 #endif  /* SUPPORT_UTF8 */
02057           otherd = fcc[d];
02058           }
02059         if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
02060           {
02061           if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
02062             {
02063             active_count--;             /* Remove non-match possibility */
02064             next_active_state--;
02065             }
02066           if (++count >= GET2(code, 1))
02067             { ADD_NEW(state_offset + dlen + 3, 0); }
02068           else
02069             { ADD_NEW(state_offset, count); }
02070           }
02071         }
02072       break;
02073 
02074 
02075 /* ========================================================================== */
02076       /* These are the class-handling opcodes */
02077 
02078       case OP_CLASS:
02079       case OP_NCLASS:
02080       case OP_XCLASS:
02081         {
02082         BOOL isinclass = FALSE;
02083         int next_state_offset;
02084         const uschar *ecode;
02085 
02086         /* For a simple class, there is always just a 32-byte table, and we
02087         can set isinclass from it. */
02088 
02089         if (codevalue != OP_XCLASS)
02090           {
02091           ecode = code + 33;
02092           if (clen > 0)
02093             {
02094             isinclass = (c > 255)? (codevalue == OP_NCLASS) :
02095               ((code[1 + c/8] & (1 << (c&7))) != 0);
02096             }
02097           }
02098 
02099         /* An extended class may have a table or a list of single characters,
02100         ranges, or both, and it may be positive or negative. There's a
02101         function that sorts all this out. */
02102 
02103         else
02104          {
02105          ecode = code + GET(code, 1);
02106          if (clen > 0) isinclass = _pcre_xclass(c, code + 1 + LINK_SIZE);
02107          }
02108 
02109         /* At this point, isinclass is set for all kinds of class, and ecode
02110         points to the byte after the end of the class. If there is a
02111         quantifier, this is where it will be. */
02112 
02113         next_state_offset = ecode - start_code;
02114 
02115         switch (*ecode)
02116           {
02117           case OP_CRSTAR:
02118           case OP_CRMINSTAR:
02119           ADD_ACTIVE(next_state_offset + 1, 0);
02120           if (isinclass) { ADD_NEW(state_offset, 0); }
02121           break;
02122 
02123           case OP_CRPLUS:
02124           case OP_CRMINPLUS:
02125           count = current_state->count;  /* Already matched */
02126           if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
02127           if (isinclass) { count++; ADD_NEW(state_offset, count); }
02128           break;
02129 
02130           case OP_CRQUERY:
02131           case OP_CRMINQUERY:
02132           ADD_ACTIVE(next_state_offset + 1, 0);
02133           if (isinclass) { ADD_NEW(next_state_offset + 1, 0); }
02134           break;
02135 
02136           case OP_CRRANGE:
02137           case OP_CRMINRANGE:
02138           count = current_state->count;  /* Already matched */
02139           if (count >= GET2(ecode, 1))
02140             { ADD_ACTIVE(next_state_offset + 5, 0); }
02141           if (isinclass)
02142             {
02143             int max = GET2(ecode, 3);
02144             if (++count >= max && max != 0)   /* Max 0 => no limit */
02145               { ADD_NEW(next_state_offset + 5, 0); }
02146             else
02147               { ADD_NEW(state_offset, count); }
02148             }
02149           break;
02150 
02151           default:
02152           if (isinclass) { ADD_NEW(next_state_offset, 0); }
02153           break;
02154           }
02155         }
02156       break;
02157 
02158 /* ========================================================================== */
02159       /* These are the opcodes for fancy brackets of various kinds. We have
02160       to use recursion in order to handle them. The "always failing" assertion
02161       (?!) is optimised when compiling to OP_FAIL, so we have to support that,
02162       though the other "backtracking verbs" are not supported. */
02163 
02164       case OP_FAIL:
02165       break;
02166 
02167       case OP_ASSERT:
02168       case OP_ASSERT_NOT:
02169       case OP_ASSERTBACK:
02170       case OP_ASSERTBACK_NOT:
02171         {
02172         int rc;
02173         int local_offsets[2];
02174         int local_workspace[1000];
02175         const uschar *endasscode = code + GET(code, 1);
02176 
02177         while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02178 
02179         rc = internal_dfa_exec(
02180           md,                                   /* static match data */
02181           code,                                 /* this subexpression's code */
02182           ptr,                                  /* where we currently are */
02183           ptr - start_subject,                  /* start offset */
02184           local_offsets,                        /* offset vector */
02185           sizeof(local_offsets)/sizeof(int),    /* size of same */
02186           local_workspace,                      /* workspace vector */
02187           sizeof(local_workspace)/sizeof(int),  /* size of same */
02188           ims,                                  /* the current ims flags */
02189           rlevel,                               /* function recursion level */
02190           recursing);                           /* pass on regex recursion */
02191 
02192         if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
02193             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02194         }
02195       break;
02196 
02197       /*-----------------------------------------------------------------*/
02198       case OP_COND:
02199       case OP_SCOND:
02200         {
02201         int local_offsets[1000];
02202         int local_workspace[1000];
02203         int codelink = GET(code, 1);
02204         int condcode;
02205 
02206         /* Because of the way auto-callout works during compile, a callout item
02207         is inserted between OP_COND and an assertion condition. This does not
02208         happen for the other conditions. */
02209 
02210         if (code[LINK_SIZE+1] == OP_CALLOUT)
02211           {
02212           rrc = 0;
02213           if (pcre_callout != NULL)
02214             {
02215             pcre_callout_block cb;
02216             cb.version          = 1;   /* Version 1 of the callout block */
02217             cb.callout_number   = code[LINK_SIZE+2];
02218             cb.offset_vector    = offsets;
02219             cb.subject          = (PCRE_SPTR)start_subject;
02220             cb.subject_length   = end_subject - start_subject;
02221             cb.start_match      = current_subject - start_subject;
02222             cb.current_position = ptr - start_subject;
02223             cb.pattern_position = GET(code, LINK_SIZE + 3);
02224             cb.next_item_length = GET(code, 3 + 2*LINK_SIZE);
02225             cb.capture_top      = 1;
02226             cb.capture_last     = -1;
02227             cb.callout_data     = md->callout_data;
02228             if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
02229             }
02230           if (rrc > 0) break;                      /* Fail this thread */
02231           code += _pcre_OP_lengths[OP_CALLOUT];    /* Skip callout data */
02232           }
02233 
02234         condcode = code[LINK_SIZE+1];
02235 
02236         /* Back reference conditions are not supported */
02237 
02238         if (condcode == OP_CREF) return PCRE_ERROR_DFA_UCOND;
02239 
02240         /* The DEFINE condition is always false */
02241 
02242         if (condcode == OP_DEF)
02243           { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02244 
02245         /* The only supported version of OP_RREF is for the value RREF_ANY,
02246         which means "test if in any recursion". We can't test for specifically
02247         recursed groups. */
02248 
02249         else if (condcode == OP_RREF)
02250           {
02251           int value = GET2(code, LINK_SIZE+2);
02252           if (value != RREF_ANY) return PCRE_ERROR_DFA_UCOND;
02253           if (recursing > 0)
02254             { ADD_ACTIVE(state_offset + LINK_SIZE + 4, 0); }
02255           else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02256           }
02257 
02258         /* Otherwise, the condition is an assertion */
02259 
02260         else
02261           {
02262           int rc;
02263           const uschar *asscode = code + LINK_SIZE + 1;
02264           const uschar *endasscode = asscode + GET(asscode, 1);
02265 
02266           while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
02267 
02268           rc = internal_dfa_exec(
02269             md,                                   /* fixed match data */
02270             asscode,                              /* this subexpression's code */
02271             ptr,                                  /* where we currently are */
02272             ptr - start_subject,                  /* start offset */
02273             local_offsets,                        /* offset vector */
02274             sizeof(local_offsets)/sizeof(int),    /* size of same */
02275             local_workspace,                      /* workspace vector */
02276             sizeof(local_workspace)/sizeof(int),  /* size of same */
02277             ims,                                  /* the current ims flags */
02278             rlevel,                               /* function recursion level */
02279             recursing);                           /* pass on regex recursion */
02280 
02281           if ((rc >= 0) ==
02282                 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
02283             { ADD_ACTIVE(endasscode + LINK_SIZE + 1 - start_code, 0); }
02284           else
02285             { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
02286           }
02287         }
02288       break;
02289 
02290       /*-----------------------------------------------------------------*/
02291       case OP_RECURSE:
02292         {
02293         int local_offsets[1000];
02294         int local_workspace[1000];
02295         int rc;
02296 
02297         DPRINTF(("%.*sStarting regex recursion %d\n", rlevel*2-2, SP,
02298           recursing + 1));
02299 
02300         rc = internal_dfa_exec(
02301           md,                                   /* fixed match data */
02302           start_code + GET(code, 1),            /* this subexpression's code */
02303           ptr,                                  /* where we currently are */
02304           ptr - start_subject,                  /* start offset */
02305           local_offsets,                        /* offset vector */
02306           sizeof(local_offsets)/sizeof(int),    /* size of same */
02307           local_workspace,                      /* workspace vector */
02308           sizeof(local_workspace)/sizeof(int),  /* size of same */
02309           ims,                                  /* the current ims flags */
02310           rlevel,                               /* function recursion level */
02311           recursing + 1);                       /* regex recurse level */
02312 
02313         DPRINTF(("%.*sReturn from regex recursion %d: rc=%d\n", rlevel*2-2, SP,
02314           recursing + 1, rc));
02315 
02316         /* Ran out of internal offsets */
02317 
02318         if (rc == 0) return PCRE_ERROR_DFA_RECURSE;
02319 
02320         /* For each successfully matched substring, set up the next state with a
02321         count of characters to skip before trying it. Note that the count is in
02322         characters, not bytes. */
02323 
02324         if (rc > 0)
02325           {
02326           for (rc = rc*2 - 2; rc >= 0; rc -= 2)
02327             {
02328             const uschar *p = start_subject + local_offsets[rc];
02329             const uschar *pp = start_subject + local_offsets[rc+1];
02330             int charcount = local_offsets[rc+1] - local_offsets[rc];
02331             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02332             if (charcount > 0)
02333               {
02334               ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0, (charcount - 1));
02335               }
02336             else
02337               {
02338               ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
02339               }
02340             }
02341           }
02342         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02343         }
02344       break;
02345 
02346       /*-----------------------------------------------------------------*/
02347       case OP_ONCE:
02348         {
02349         int local_offsets[2];
02350         int local_workspace[1000];
02351 
02352         int rc = internal_dfa_exec(
02353           md,                                   /* fixed match data */
02354           code,                                 /* this subexpression's code */
02355           ptr,                                  /* where we currently are */
02356           ptr - start_subject,                  /* start offset */
02357           local_offsets,                        /* offset vector */
02358           sizeof(local_offsets)/sizeof(int),    /* size of same */
02359           local_workspace,                      /* workspace vector */
02360           sizeof(local_workspace)/sizeof(int),  /* size of same */
02361           ims,                                  /* the current ims flags */
02362           rlevel,                               /* function recursion level */
02363           recursing);                           /* pass on regex recursion */
02364 
02365         if (rc >= 0)
02366           {
02367           const uschar *end_subpattern = code;
02368           int charcount = local_offsets[1] - local_offsets[0];
02369           int next_state_offset, repeat_state_offset;
02370 
02371           do { end_subpattern += GET(end_subpattern, 1); }
02372             while (*end_subpattern == OP_ALT);
02373           next_state_offset = end_subpattern - start_code + LINK_SIZE + 1;
02374 
02375           /* If the end of this subpattern is KETRMAX or KETRMIN, we must
02376           arrange for the repeat state also to be added to the relevant list.
02377           Calculate the offset, or set -1 for no repeat. */
02378 
02379           repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
02380                                  *end_subpattern == OP_KETRMIN)?
02381             end_subpattern - start_code - GET(end_subpattern, 1) : -1;
02382 
02383           /* If we have matched an empty string, add the next state at the
02384           current character pointer. This is important so that the duplicate
02385           checking kicks in, which is what breaks infinite loops that match an
02386           empty string. */
02387 
02388           if (charcount == 0)
02389             {
02390             ADD_ACTIVE(next_state_offset, 0);
02391             }
02392 
02393           /* Optimization: if there are no more active states, and there
02394           are no new states yet set up, then skip over the subject string
02395           right here, to save looping. Otherwise, set up the new state to swing
02396           into action when the end of the substring is reached. */
02397 
02398           else if (i + 1 >= active_count && new_count == 0)
02399             {
02400             ptr += charcount;
02401             clen = 0;
02402             ADD_NEW(next_state_offset, 0);
02403 
02404             /* If we are adding a repeat state at the new character position,
02405             we must fudge things so that it is the only current state.
02406             Otherwise, it might be a duplicate of one we processed before, and
02407             that would cause it to be skipped. */
02408 
02409             if (repeat_state_offset >= 0)
02410               {
02411               next_active_state = active_states;
02412               active_count = 0;
02413               i = -1;
02414               ADD_ACTIVE(repeat_state_offset, 0);
02415               }
02416             }
02417           else
02418             {
02419             const uschar *p = start_subject + local_offsets[0];
02420             const uschar *pp = start_subject + local_offsets[1];
02421             while (p < pp) if ((*p++ & 0xc0) == 0x80) charcount--;
02422             ADD_NEW_DATA(-next_state_offset, 0, (charcount - 1));
02423             if (repeat_state_offset >= 0)
02424               { ADD_NEW_DATA(-repeat_state_offset, 0, (charcount - 1)); }
02425             }
02426 
02427           }
02428         else if (rc != PCRE_ERROR_NOMATCH) return rc;
02429         }
02430       break;
02431 
02432 
02433 /* ========================================================================== */
02434       /* Handle callouts */
02435 
02436       case OP_CALLOUT:
02437       rrc = 0;
02438       if (pcre_callout != NULL)
02439         {
02440         pcre_callout_block cb;
02441         cb.version          = 1;   /* Version 1 of the callout block */
02442         cb.callout_number   = code[1];
02443         cb.offset_vector    = offsets;
02444         cb.subject          = (PCRE_SPTR)start_subject;
02445         cb.subject_length   = end_subject - start_subject;
02446         cb.start_match      = current_subject - start_subject;
02447         cb.current_position = ptr - start_subject;
02448         cb.pattern_position = GET(code, 2);
02449         cb.next_item_length = GET(code, 2 + LINK_SIZE);
02450         cb.capture_top      = 1;
02451         cb.capture_last     = -1;
02452         cb.callout_data     = md->callout_data;
02453         if ((rrc = (*pcre_callout)(&cb)) < 0) return rrc;   /* Abandon */
02454         }
02455       if (rrc == 0)
02456         { ADD_ACTIVE(state_offset + _pcre_OP_lengths[OP_CALLOUT], 0); }
02457       break;
02458 
02459 
02460 /* ========================================================================== */
02461       default:        /* Unsupported opcode */
02462       return PCRE_ERROR_DFA_UITEM;
02463       }
02464 
02465     NEXT_ACTIVE_STATE: continue;
02466 
02467     }      /* End of loop scanning active states */
02468 
02469   /* We have finished the processing at the current subject character. If no
02470   new states have been set for the next character, we have found all the
02471   matches that we are going to find. If we are at the top level and partial
02472   matching has been requested, check for appropriate conditions. */
02473 
02474   if (new_count <= 0)
02475     {
02476     if (match_count < 0 &&                     /* No matches found */
02477         rlevel == 1 &&                         /* Top level match function */
02478         (md->moptions & PCRE_PARTIAL) != 0 &&  /* Want partial matching */
02479         ptr >= end_subject &&                  /* Reached end of subject */
02480         ptr > current_subject)                 /* Matched non-empty string */
02481       {
02482       if (offsetcount >= 2)
02483         {
02484         offsets[0] = current_subject - start_subject;
02485         offsets[1] = end_subject - start_subject;
02486         }
02487       match_count = PCRE_ERROR_PARTIAL;
02488       }
02489 
02490     DPRINTF(("%.*sEnd of internal_dfa_exec %d: returning %d\n"
02491       "%.*s---------------------\n\n", rlevel*2-2, SP, rlevel, match_count,
02492       rlevel*2-2, SP));
02493     break;        /* In effect, "return", but see the comment below */
02494     }
02495 
02496   /* One or more states are active for the next character. */
02497 
02498   ptr += clen;    /* Advance to next subject character */
02499   }               /* Loop to move along the subject string */
02500 
02501 /* Control gets here from "break" a few lines above. We do it this way because
02502 if we use "return" above, we have compiler trouble. Some compilers warn if
02503 there's nothing here because they think the function doesn't return a value. On
02504 the other hand, if we put a dummy statement here, some more clever compilers
02505 complain that it can't be reached. Sigh. */
02506 
02507 return match_count;
02508 }
02509 
02510 
02511 
02512 
02513 /*************************************************
02514 *    Execute a Regular Expression - DFA engine   *
02515 *************************************************/
02516 
02517 /* This external function applies a compiled re to a subject string using a DFA
02518 engine. This function calls the internal function multiple times if the pattern
02519 is not anchored.
02520 
02521 Arguments:
02522   argument_re     points to the compiled expression
02523   extra_data      points to extra data or is NULL
02524   subject         points to the subject string
02525   length          length of subject string (may contain binary zeros)
02526   start_offset    where to start in the subject string
02527   options         option bits
02528   offsets         vector of match offsets
02529   offsetcount     size of same
02530   workspace       workspace vector
02531   wscount         size of same
02532 
02533 Returns:          > 0 => number of match offset pairs placed in offsets
02534                   = 0 => offsets overflowed; longest matches are present
02535                    -1 => failed to match
02536                  < -1 => some kind of unexpected problem
02537 */
02538 
02539 PCRE_EXP_DEFN int PCRE_CALL_CONVENTION
02540 pcre_dfa_exec(const pcre *argument_re, const pcre_extra *extra_data,
02541   const char *subject, int length, int start_offset, int options, int *offsets,
02542   int offsetcount, int *workspace, int wscount)
02543 {
02544 real_pcre *re = (real_pcre *)argument_re;
02545 dfa_match_data match_block;
02546 dfa_match_data *md = &match_block;
02547 BOOL utf8, anchored, startline, firstline;
02548 const uschar *current_subject, *end_subject, *lcc;
02549 
02550 pcre_study_data internal_study;
02551 const pcre_study_data *study = NULL;
02552 real_pcre internal_re;
02553 
02554 const uschar *req_byte_ptr;
02555 const uschar *start_bits = NULL;
02556 BOOL first_byte_caseless = FALSE;
02557 BOOL req_byte_caseless = FALSE;
02558 int first_byte = -1;
02559 int req_byte = -1;
02560 int req_byte2 = -1;
02561 int newline;
02562 
02563 /* Plausibility checks */
02564 
02565 if ((options & ~PUBLIC_DFA_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
02566 if (re == NULL || subject == NULL || workspace == NULL ||
02567    (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
02568 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
02569 if (wscount < 20) return PCRE_ERROR_DFA_WSSIZE;
02570 
02571 /* We need to find the pointer to any study data before we test for byte
02572 flipping, so we scan the extra_data block first. This may set two fields in the
02573 match block, so we must initialize them beforehand. However, the other fields
02574 in the match block must not be set until after the byte flipping. */
02575 
02576 md->tables = re->tables;
02577 md->callout_data = NULL;
02578 
02579 if (extra_data != NULL)
02580   {
02581   unsigned int flags = extra_data->flags;
02582   if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
02583     study = (const pcre_study_data *)extra_data->study_data;
02584   if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0) return PCRE_ERROR_DFA_UMLIMIT;
02585   if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
02586     return PCRE_ERROR_DFA_UMLIMIT;
02587   if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
02588     md->callout_data = extra_data->callout_data;
02589   if ((flags & PCRE_EXTRA_TABLES) != 0)
02590     md->tables = extra_data->tables;
02591   }
02592 
02593 /* Check that the first field in the block is the magic number. If it is not,
02594 test for a regex that was compiled on a host of opposite endianness. If this is
02595 the case, flipped values are put in internal_re and internal_study if there was
02596 study data too. */
02597 
02598 if (re->magic_number != MAGIC_NUMBER)
02599   {
02600   re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
02601   if (re == NULL) return PCRE_ERROR_BADMAGIC;
02602   if (study != NULL) study = &internal_study;
02603   }
02604 
02605 /* Set some local values */
02606 
02607 current_subject = (const unsigned char *)subject + start_offset;
02608 end_subject = (const unsigned char *)subject + length;
02609 req_byte_ptr = current_subject - 1;
02610 
02611 #ifdef SUPPORT_UTF8
02612 utf8 = (re->options & PCRE_UTF8) != 0;
02613 #else
02614 utf8 = FALSE;
02615 #endif
02616 
02617 anchored = (options & (PCRE_ANCHORED|PCRE_DFA_RESTART)) != 0 ||
02618   (re->options & PCRE_ANCHORED) != 0;
02619 
02620 /* The remaining fixed data for passing around. */
02621 
02622 md->start_code = (const uschar *)argument_re +
02623     re->name_table_offset + re->name_count * re->name_entry_size;
02624 md->start_subject = (const unsigned char *)subject;
02625 md->end_subject = end_subject;
02626 md->moptions = options;
02627 md->poptions = re->options;
02628 
02629 /* If the BSR option is not set at match time, copy what was set
02630 at compile time. */
02631 
02632 if ((md->moptions & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) == 0)
02633   {
02634   if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
02635     md->moptions |= re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE);
02636 #ifdef BSR_ANYCRLF
02637   else md->moptions |= PCRE_BSR_ANYCRLF;
02638 #endif
02639   }
02640 
02641 /* Handle different types of newline. The three bits give eight cases. If
02642 nothing is set at run time, whatever was used at compile time applies. */
02643 
02644 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options : (pcre_uint32)options) &
02645          PCRE_NEWLINE_BITS)
02646   {
02647   case 0: newline = NEWLINE; break;   /* Compile-time default */
02648   case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
02649   case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
02650   case PCRE_NEWLINE_CR+
02651        PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
02652   case PCRE_NEWLINE_ANY: newline = -1; break;
02653   case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
02654   default: return PCRE_ERROR_BADNEWLINE;
02655   }
02656 
02657 if (newline == -2)
02658   {
02659   md->nltype = NLTYPE_ANYCRLF;
02660   }
02661 else if (newline < 0)
02662   {
02663   md->nltype = NLTYPE_ANY;
02664   }
02665 else
02666   {
02667   md->nltype = NLTYPE_FIXED;
02668   if (newline > 255)
02669     {
02670     md->nllen = 2;
02671     md->nl[0] = (newline >> 8) & 255;
02672     md->nl[1] = newline & 255;
02673     }
02674   else
02675     {
02676     md->nllen = 1;
02677     md->nl[0] = newline;
02678     }
02679   }
02680 
02681 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
02682 back the character offset. */
02683 
02684 #ifdef SUPPORT_UTF8
02685 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
02686   {
02687   if (_pcre_valid_utf8((uschar *)subject, length) >= 0)
02688     return PCRE_ERROR_BADUTF8;
02689   if (start_offset > 0 && start_offset < length)
02690     {
02691     int tb = ((uschar *)subject)[start_offset];
02692     if (tb > 127)
02693       {
02694       tb &= 0xc0;
02695       if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
02696       }
02697     }
02698   }
02699 #endif
02700 
02701 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
02702 is a feature that makes it possible to save compiled regex and re-use them
02703 in other programs later. */
02704 
02705 if (md->tables == NULL) md->tables = _pcre_default_tables;
02706 
02707 /* The lower casing table and the "must be at the start of a line" flag are
02708 used in a loop when finding where to start. */
02709 
02710 lcc = md->tables + lcc_offset;
02711 startline = (re->flags & PCRE_STARTLINE) != 0;
02712 firstline = (re->options & PCRE_FIRSTLINE) != 0;
02713 
02714 /* Set up the first character to match, if available. The first_byte value is
02715 never set for an anchored regular expression, but the anchoring may be forced
02716 at run time, so we have to test for anchoring. The first char may be unset for
02717 an unanchored pattern, of course. If there's no first char and the pattern was
02718 studied, there may be a bitmap of possible first characters. */
02719 
02720 if (!anchored)
02721   {
02722   if ((re->flags & PCRE_FIRSTSET) != 0)
02723     {
02724     first_byte = re->first_byte & 255;
02725     if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
02726       first_byte = lcc[first_byte];
02727     }
02728   else
02729     {
02730     if (startline && study != NULL &&
02731          (study->options & PCRE_STUDY_MAPPED) != 0)
02732       start_bits = study->start_bits;
02733     }
02734   }
02735 
02736 /* For anchored or unanchored matches, there may be a "last known required
02737 character" set. */
02738 
02739 if ((re->flags & PCRE_REQCHSET) != 0)
02740   {
02741   req_byte = re->req_byte & 255;
02742   req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
02743   req_byte2 = (md->tables + fcc_offset)[req_byte];  /* case flipped */
02744   }
02745 
02746 /* Call the main matching function, looping for a non-anchored regex after a
02747 failed match. If not restarting, perform certain optimizations at the start of
02748 a match. */
02749 
02750 for (;;)
02751   {
02752   int rc;
02753 
02754   if ((options & PCRE_DFA_RESTART) == 0)
02755     {
02756     const uschar *save_end_subject = end_subject;
02757 
02758     /* If firstline is TRUE, the start of the match is constrained to the first
02759     line of a multiline string. Implement this by temporarily adjusting
02760     end_subject so that we stop scanning at a newline. If the match fails at
02761     the newline, later code breaks this loop. */
02762 
02763     if (firstline)
02764       {
02765       USPTR t = current_subject;
02766 #ifdef SUPPORT_UTF8
02767       if (utf8)
02768         {
02769         while (t < md->end_subject && !IS_NEWLINE(t))
02770           {
02771           t++;
02772           while (t < end_subject && (*t & 0xc0) == 0x80) t++;
02773           }
02774         }
02775       else
02776 #endif
02777       while (t < md->end_subject && !IS_NEWLINE(t)) t++;
02778       end_subject = t;
02779       }
02780 
02781     /* There are some optimizations that avoid running the match if a known
02782     starting point is not found, or if a known later character is not present.
02783     However, there is an option that disables these, for testing and for
02784     ensuring that all callouts do actually occur. */
02785 
02786     if ((options & PCRE_NO_START_OPTIMIZE) == 0)
02787       {
02788 
02789       /* Advance to a known first byte. */
02790 
02791       if (first_byte >= 0)
02792         {
02793         if (first_byte_caseless)
02794           while (current_subject < end_subject &&
02795                  lcc[*current_subject] != first_byte)
02796             current_subject++;
02797         else
02798           while (current_subject < end_subject &&
02799                  *current_subject != first_byte)
02800             current_subject++;
02801         }
02802 
02803       /* Or to just after a linebreak for a multiline match if possible */
02804 
02805       else if (startline)
02806         {
02807         if (current_subject > md->start_subject + start_offset)
02808           {
02809 #ifdef SUPPORT_UTF8
02810           if (utf8)
02811             {
02812             while (current_subject < end_subject &&
02813                    !WAS_NEWLINE(current_subject))
02814               {
02815               current_subject++;
02816               while(current_subject < end_subject &&
02817                     (*current_subject & 0xc0) == 0x80)
02818                 current_subject++;
02819               }
02820             }
02821           else
02822 #endif
02823           while (current_subject < end_subject && !WAS_NEWLINE(current_subject))
02824             current_subject++;
02825 
02826           /* If we have just passed a CR and the newline option is ANY or
02827           ANYCRLF, and we are now at a LF, advance the match position by one
02828           more character. */
02829 
02830           if (current_subject[-1] == CHAR_CR &&
02831                (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
02832                current_subject < end_subject &&
02833                *current_subject == CHAR_NL)
02834             current_subject++;
02835           }
02836         }
02837 
02838       /* Or to a non-unique first char after study */
02839 
02840       else if (start_bits != NULL)
02841         {
02842         while (current_subject < end_subject)
02843           {
02844           register unsigned int c = *current_subject;
02845           if ((start_bits[c/8] & (1 << (c&7))) == 0) current_subject++;
02846             else break;
02847           }
02848         }
02849       }
02850 
02851     /* Restore fudged end_subject */
02852 
02853     end_subject = save_end_subject;
02854     }
02855 
02856   /* If req_byte is set, we know that that character must appear in the subject
02857   for the match to succeed. If the first character is set, req_byte must be
02858   later in the subject; otherwise the test starts at the match point. This
02859   optimization can save a huge amount of work in patterns with nested unlimited
02860   repeats that aren't going to match. Writing separate code for cased/caseless
02861   versions makes it go faster, as does using an autoincrement and backing off
02862   on a match.
02863 
02864   HOWEVER: when the subject string is very, very long, searching to its end can
02865   take a long time, and give bad performance on quite ordinary patterns. This
02866   showed up when somebody was matching /^C/ on a 32-megabyte string... so we
02867   don't do this when the string is sufficiently long.
02868 
02869   ALSO: this processing is disabled when partial matching is requested, and can
02870   also be explicitly deactivated. */
02871 
02872   if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
02873       req_byte >= 0 &&
02874       end_subject - current_subject < REQ_BYTE_MAX &&
02875       (options & PCRE_PARTIAL) == 0)
02876     {
02877     register const uschar *p = current_subject + ((first_byte >= 0)? 1 : 0);
02878 
02879     /* We don't need to repeat the search if we haven't yet reached the
02880     place we found it at last time. */
02881 
02882     if (p > req_byte_ptr)
02883       {
02884       if (req_byte_caseless)
02885         {
02886         while (p < end_subject)
02887           {
02888           register int pp = *p++;
02889           if (pp == req_byte || pp == req_byte2) { p--; break; }
02890           }
02891         }
02892       else
02893         {
02894         while (p < end_subject)
02895           {
02896           if (*p++ == req_byte) { p--; break; }
02897           }
02898         }
02899 
02900       /* If we can't find the required character, break the matching loop,
02901       which will cause a return or PCRE_ERROR_NOMATCH. */
02902 
02903       if (p >= end_subject) break;
02904 
02905       /* If we have found the required character, save the point where we
02906       found it, so that we don't search again next time round the loop if
02907       the start hasn't passed this character yet. */
02908 
02909       req_byte_ptr = p;
02910       }
02911     }
02912 
02913   /* OK, now we can do the business */
02914 
02915   rc = internal_dfa_exec(
02916     md,                                /* fixed match data */
02917     md->start_code,                    /* this subexpression's code */
02918     current_subject,                   /* where we currently are */
02919     start_offset,                      /* start offset in subject */
02920     offsets,                           /* offset vector */
02921     offsetcount,                       /* size of same */
02922     workspace,                         /* workspace vector */
02923     wscount,                           /* size of same */
02924     re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL), /* ims flags */
02925     0,                                 /* function recurse level */
02926     0);                                /* regex recurse level */
02927 
02928   /* Anything other than "no match" means we are done, always; otherwise, carry
02929   on only if not anchored. */
02930 
02931   if (rc != PCRE_ERROR_NOMATCH || anchored) return rc;
02932 
02933   /* Advance to the next subject character unless we are at the end of a line
02934   and firstline is set. */
02935 
02936   if (firstline && IS_NEWLINE(current_subject)) break;
02937   current_subject++;
02938   if (utf8)
02939     {
02940     while (current_subject < end_subject && (*current_subject & 0xc0) == 0x80)
02941       current_subject++;
02942     }
02943   if (current_subject > end_subject) break;
02944 
02945   /* If we have just passed a CR and we are now at a LF, and the pattern does
02946   not contain any explicit matches for \r or \n, and the newline option is CRLF
02947   or ANY or ANYCRLF, advance the match position by one more character. */
02948 
02949   if (current_subject[-1] == CHAR_CR &&
02950       current_subject < end_subject &&
02951       *current_subject == CHAR_NL &&
02952       (re->flags & PCRE_HASCRORLF) == 0 &&
02953         (md->nltype == NLTYPE_ANY ||
02954          md->nltype == NLTYPE_ANYCRLF ||
02955          md->nllen == 2))
02956     current_subject++;
02957 
02958   }   /* "Bumpalong" loop */
02959 
02960 return PCRE_ERROR_NOMATCH;
02961 }
02962 
02963 /* End of pcre_dfa_exec.c */
Modified on Sun Sep 21 18:23:49 2014 by modify_doxy.py rev. 426318