NCBI C++ ToolKit
pcre_exec.c
Go to the documentation of this file.

Go to the SVN repository for this file.

1 /*************************************************
2 * Perl-Compatible Regular Expressions *
3 *************************************************/
4 
5 /* PCRE is a library of functions to support regular expressions whose syntax
6 and semantics are as close as possible to those of the Perl 5 language.
7 
8  Written by Philip Hazel
9  Copyright (c) 1997-2009 University of Cambridge
10 
11 -----------------------------------------------------------------------------
12 Redistribution and use in source and binary forms, with or without
13 modification, are permitted provided that the following conditions are met:
14 
15  * Redistributions of source code must retain the above copyright notice,
16  this list of conditions and the following disclaimer.
17 
18  * Redistributions in binary form must reproduce the above copyright
19  notice, this list of conditions and the following disclaimer in the
20  documentation and/or other materials provided with the distribution.
21 
22  * Neither the name of the University of Cambridge nor the names of its
23  contributors may be used to endorse or promote products derived from
24  this software without specific prior written permission.
25 
26 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
27 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
30 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
31 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
32 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
33 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
34 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
35 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
36 POSSIBILITY OF SUCH DAMAGE.
37 -----------------------------------------------------------------------------
38 */
39 
40 
41 /* This module contains pcre_exec(), the externally visible function that does
42 pattern matching using an NFA algorithm, trying to mimic Perl as closely as
43 possible. There are also some static supporting functions. */
44 
45 #ifdef HAVE_CONFIG_H
46 #include "config.h"
47 #endif
48 
49 #define NLBLOCK md /* Block containing newline information */
50 #define PSSTART start_subject /* Field containing processed string start */
51 #define PSEND end_subject /* Field containing processed string end */
52 
53 #include "pcre_internal.h"
54 
55 /* Undefine some potentially clashing cpp symbols */
56 
57 #undef min
58 #undef max
59 
60 /* Flag bits for the match() function */
61 
62 #define match_condassert 0x01 /* Called to check a condition assertion */
63 #define match_cbegroup 0x02 /* Could-be-empty unlimited repeat group */
64 
65 /* Non-error returns from the match() function. Error returns are externally
66 defined PCRE_ERROR_xxx codes, which are all negative. */
67 
68 #define MATCH_MATCH 1
69 #define MATCH_NOMATCH 0
70 
71 /* Special internal returns from the match() function. Make them sufficiently
72 negative to avoid the external error codes. */
73 
74 #define MATCH_COMMIT (-999)
75 #define MATCH_PRUNE (-998)
76 #define MATCH_SKIP (-997)
77 #define MATCH_THEN (-996)
78 
79 /* Maximum number of ints of offset to save on the stack for recursive calls.
80 If the offset vector is bigger, malloc is used. This should be a multiple of 3,
81 because the offset vector is always a multiple of 3 long. */
82 
83 #define REC_STACK_SAVE_MAX 30
84 
85 /* Min and max values for the common repeats; for the maxima, 0 => infinity */
86 
87 static const char rep_min[] = { 0, 0, 1, 1, 0, 0 }; /* NOTE(review): the six entries look like greedy/minimizing pairs for *, + and ? - confirm against the code that indexes these tables */
88 static const char rep_max[] = { 0, 0, 0, 0, 1, 1 }; /* A 0 entry here means "no upper limit" */
89 
90 
91 
92 #ifdef DEBUG
93 /*************************************************
94 * Debugging function to print chars *
95 *************************************************/
96 
97 /* Print a sequence of chars in printable format, stopping at the end of the
98 subject if requested.
99 
100 Arguments:
101  p points to characters
102  length number to print
103  is_subject TRUE if printing from within md->start_subject
104  md pointer to matching data block, if is_subject is TRUE
105 
106 Returns: nothing
107 */
108 
109 static void
110 pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
111 {
112 unsigned int c;
113 if (is_subject && length > md->end_subject - p) length = md->end_subject - p;
114 while (length-- > 0)
115  if (isprint(c = *(p++))) printf("%c", c); else printf("\\x%02x", c);
116 }
117 #endif
118 
119 
120 
121 /*************************************************
122 * Match a back-reference *
123 *************************************************/
124 
125 /* If a back reference hasn't been set, the length that is passed is greater
126 than the number of characters left in the string, so the match fails.
127 
128 Arguments:
129  offset index into the offset vector
130  eptr points into the subject
131  length length to be matched
132  md points to match data block
133  ims the ims flags
134 
135 Returns: TRUE if matched
136 */
137 
138 static BOOL
139 match_ref(int offset, register USPTR eptr, int length, match_data *md,
140  unsigned long int ims)
141 {
142 USPTR p = md->start_subject + md->offset_vector[offset];
143 
144 #ifdef DEBUG
145 if (eptr >= md->end_subject)
146  printf("matching subject <null>");
147 else
148  {
149  printf("matching subject ");
150  pchars(eptr, length, TRUE, md);
151  }
152 printf(" against backref ");
153 pchars(p, length, FALSE, md);
154 printf("\n");
155 #endif
156 
157 /* Always fail if not enough characters left */
158 
159 if (length > md->end_subject - eptr) return FALSE;
160 
161 /* Separate the caseless case for speed. In UTF-8 mode we can only do this
162 properly if Unicode properties are supported. Otherwise, we can check only
163 ASCII characters. */
164 
165 if ((ims & PCRE_CASELESS) != 0)
166  {
167 #ifdef SUPPORT_UTF8
168 #ifdef SUPPORT_UCP
169  if (md->utf8)
170  {
171  USPTR endptr = eptr + length;
172  while (eptr < endptr)
173  {
174  int c, d;
175  GETCHARINC(c, eptr);
176  GETCHARINC(d, p);
177  if (c != d && c != UCD_OTHERCASE(d)) return FALSE;
178  }
179  }
180  else
181 #endif
182 #endif
183 
184  /* The same code works when not in UTF-8 mode and in UTF-8 mode when there
185  is no UCP support. */
186 
187  while (length-- > 0)
188  { if (md->lcc[*p++] != md->lcc[*eptr++]) return FALSE; }
189  }
190 
191 /* In the caseful case, we can just compare the bytes, whether or not we
192 are in UTF-8 mode. */
193 
194 else
195  { while (length-- > 0) if (*p++ != *eptr++) return FALSE; }
196 
197 return TRUE;
198 }
199 
200 
201 
202 /***************************************************************************
203 ****************************************************************************
204  RECURSION IN THE match() FUNCTION
205 
206 The match() function is highly recursive, though not every recursive call
207 increases the recursive depth. Nevertheless, some regular expressions can cause
208 it to recurse to a great depth. I was writing for Unix, so I just let it call
209 itself recursively. This uses the stack for saving everything that has to be
210 saved for a recursive call. On Unix, the stack can be large, and this works
211 fine.
212 
213 It turns out that on some non-Unix-like systems there are problems with
214 programs that use a lot of stack. (This despite the fact that every last chip
215 has oodles of memory these days, and techniques for extending the stack have
216 been known for decades.) So....
217 
218 There is a fudge, triggered by defining NO_RECURSE, which avoids recursive
219 calls by keeping local variables that need to be preserved in blocks of memory
220 obtained from malloc() instead of on the stack. Macros are used to
221 achieve this so that the actual code doesn't look very different to what it
222 always used to.
223 
224 The original heap-recursive code used longjmp(). However, it seems that this
225 can be very slow on some operating systems. Following a suggestion from Stan
226 Switzer, the use of longjmp() has been abolished, at the cost of having to
227 provide a unique number for each call to RMATCH. There is no way of generating
228 a sequence of numbers at compile time in C. I have given them names, to make
229 them stand out more clearly.
230 
231 Crude tests on x86 Linux show a small speedup of around 5-8%. However, on
232 FreeBSD, avoiding longjmp() more than halves the time taken to run the standard
233 tests. Furthermore, not using longjmp() means that local dynamic variables
234 don't have indeterminate values; this has meant that the frame size can be
235 reduced because the result can be "passed back" by straight setting of the
236 variable instead of being passed in the frame.
237 ****************************************************************************
238 ***************************************************************************/
239 
240 /* Numbers for RMATCH calls. When this list is changed, the code at HEAP_RETURN
241 below must be updated in sync. */
242 
243 enum { RM1=1, RM2, RM3, RM4, RM5, RM6, RM7, RM8, RM9, RM10,
249 
250 /* These versions of the macros use the stack, as normal. There are debugging
251 versions and production versions. Note that the "rw" argument of RMATCH isn't
252 actually used in this definition. */
253 
254 #ifndef NO_RECURSE
255 #define REGISTER register
256 
257 #ifdef DEBUG /* Tracing versions: report the source line of every call and return */
258 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
259  { \
260  printf("match() called in line %d\n", __LINE__); \
261  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1); /* mstart is passed on unchanged; depth grows by one */ \
262  printf("to line %d\n", __LINE__); \
263  }
264 #define RRETURN(ra) \
265  { \
266  printf("match() returned %d from line %d ", ra, __LINE__); /* ra is expanded here and again in the return below */ \
267  return ra; \
268  }
269 #else /* Production versions: a direct recursive call, result left in rrc, and a plain return */
270 #define RMATCH(ra,rb,rc,rd,re,rf,rg,rw) \
271  rrc = match(ra,rb,mstart,rc,rd,re,rf,rg,rdepth+1)
272 #define RRETURN(ra) return ra
273 #endif /* DEBUG */
274 
275 #else
276 
277 
278 /* These versions of the macros manage a private stack on the heap. Note that
279 the "rd" argument of RMATCH isn't actually used in this definition. It's the md
280 argument of match(), which never changes. */
281 
282 #define REGISTER
283 
/* RMATCH simulates a recursive call of match(). It obtains a new frame from
pcre_stack_malloc(), records in the current frame the call-site number (rw) to
resume at, copies the "arguments" into the new frame, makes it the current
frame, and jumps to HEAP_RECURSE. Execution resumes at the L_<rw> label, with
the result of the simulated call in rrc.

If no memory can be obtained, the current frame is abandoned by means of
RRETURN(PCRE_ERROR_NOMEMORY) rather than writing through a null pointer. The
error then propagates like any other negative return from match(). RRETURN is
expanded only where RMATCH is used, so it does not matter that it is defined
after this macro. */

#define RMATCH(ra,rb,rc,rd,re,rf,rg,rw)\
  {\
  heapframe *newframe = (pcre_stack_malloc)(sizeof(heapframe));\
  if (newframe == NULL) RRETURN(PCRE_ERROR_NOMEMORY);\
  frame->Xwhere = rw; \
  newframe->Xeptr = ra;\
  newframe->Xecode = rb;\
  newframe->Xmstart = mstart;\
  newframe->Xoffset_top = rc;\
  newframe->Xims = re;\
  newframe->Xeptrb = rf;\
  newframe->Xflags = rg;\
  newframe->Xrdepth = frame->Xrdepth + 1;\
  newframe->Xprevframe = frame;\
  frame = newframe;\
  DPRINTF(("restarting from line %d\n", __LINE__));\
  goto HEAP_RECURSE;\
  L_##rw:\
  DPRINTF(("jumped back to line %d\n", __LINE__));\
  }

/* RRETURN simulates a return from match(). The current frame is released with
pcre_stack_free() and its predecessor becomes current. If there is a
predecessor, the value is handed back in rrc and control goes to HEAP_RETURN;
otherwise this was the top-level frame and match() really returns. Note that
"ra" is evaluated after the frame has been popped. */

#define RRETURN(ra)\
  {\
  heapframe *newframe = frame;\
  frame = newframe->Xprevframe;\
  (pcre_stack_free)(newframe);\
  if (frame != NULL)\
    {\
    rrc = ra;\
    goto HEAP_RETURN;\
    }\
  return ra;\
  }
316 
317 
318 /* Structure for remembering the local variables in a private frame */
319 
320 typedef struct heapframe {
321  struct heapframe *Xprevframe;
322 
323  /* Function arguments that may change */
324 
325  USPTR Xeptr;
326  const uschar *Xecode;
327  USPTR Xmstart;
328  int Xoffset_top;
329  long int Xims;
330  eptrblock *Xeptrb;
331  int Xflags;
332  unsigned int Xrdepth;
333 
334  /* Function local variables */
335 
336  USPTR Xcallpat;
337 #ifdef SUPPORT_UTF8
338  USPTR Xcharptr;
339 #endif
340  USPTR Xdata;
341  USPTR Xnext;
342  USPTR Xpp;
343  USPTR Xprev;
344  USPTR Xsaved_eptr;
345 
346  recursion_info Xnew_recursive;
347 
348  BOOL Xcur_is_word;
349  BOOL Xcondition;
350  BOOL Xprev_is_word;
351 
352  unsigned long int Xoriginal_ims;
353 
354 #ifdef SUPPORT_UCP
355  int Xprop_type;
356  int Xprop_value;
357  int Xprop_fail_result;
358  int Xprop_category;
359  int Xprop_chartype;
360  int Xprop_script;
361  int Xoclength;
362  uschar Xocchars[8];
363 #endif
364 
365  int Xcodelink;
366  int Xctype;
367  unsigned int Xfc;
368  int Xfi;
369  int Xlength;
370  int Xmax;
371  int Xmin;
372  int Xnumber;
373  int Xoffset;
374  int Xop;
375  int Xsave_capture_last;
376  int Xsave_offset1, Xsave_offset2, Xsave_offset3;
377  int Xstacksave[REC_STACK_SAVE_MAX];
378 
379  eptrblock Xnewptrb;
380 
381  /* Where to jump back to */
382 
383  int Xwhere;
384 
385 } heapframe;
386 
387 #endif
388 
389 
390 /***************************************************************************
391 ***************************************************************************/
392 
393 
394 
395 /*************************************************
396 * Match from current position *
397 *************************************************/
398 
399 /* This function is called recursively in many circumstances. Whenever it
400 returns a negative (error) response, the outer incarnation must also return the
401 same response.
402 
403 Performance note: It might be tempting to extract commonly used fields from the
404 md structure (e.g. utf8, end_subject) into individual variables to improve
405 performance. Tests using gcc on a SPARC disproved this; in the first case, it
406 made performance worse.
407 
408 Arguments:
409  eptr pointer to current character in subject
410  ecode pointer to current position in compiled code
411  mstart pointer to the current match start position (can be modified
412  by encountering \K)
413  offset_top current top pointer
414  md pointer to "static" info for the match
415  ims current /i, /m, and /s options
416  eptrb pointer to chain of blocks containing eptr at start of
417  brackets - for testing for empty matches
418  flags can contain
419  match_condassert - this is an assertion condition
420  match_cbegroup - this is the start of an unlimited repeat
421  group that can match an empty string
422  rdepth the recursion depth
423 
424 Returns: MATCH_MATCH if matched ) these values are >= 0
425  MATCH_NOMATCH if failed to match )
426  a negative PCRE_ERROR_xxx value if aborted by an error condition
427  (e.g. stopped by repeated call or recursion limit)
428 */
429 
430 static int
431 match(REGISTER USPTR eptr, REGISTER const uschar *ecode, USPTR mstart,
432  int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
433  int flags, unsigned int rdepth)
434 {
435 /* These variables do not need to be preserved over recursion in this function,
436 so they can be ordinary variables in all cases. Mark some of them with
437 "register" because they are used a lot in loops. */
438 
439 register int rrc; /* Returns from recursive calls */
440 register int i; /* Used for loops not involving calls to RMATCH() */
441 register unsigned int c; /* Character values not kept over RMATCH() calls */
442 register BOOL utf8; /* Local copy of UTF-8 flag for speed */
443 
444 BOOL minimize, possessive; /* Quantifier options */
445 int condcode;
446 
447 /* When recursion is not being used, all "local" variables that have to be
448 preserved over calls to RMATCH() are part of a "frame" which is obtained from
449 heap storage. Set up the top-level frame here; others are obtained from the
450 heap whenever RMATCH() does a "recursion". See the macro definitions above. */
451 
452 #ifdef NO_RECURSE
453 heapframe *frame = (pcre_stack_malloc)(sizeof(heapframe));
454 frame->Xprevframe = NULL; /* Marks the top level */
455 
456 /* Copy in the original argument variables */
457 
458 frame->Xeptr = eptr;
459 frame->Xecode = ecode;
460 frame->Xmstart = mstart;
461 frame->Xoffset_top = offset_top;
462 frame->Xims = ims;
463 frame->Xeptrb = eptrb;
464 frame->Xflags = flags;
465 frame->Xrdepth = rdepth;
466 
467 /* This is where control jumps back to to effect "recursion" */
468 
469 HEAP_RECURSE:
470 
471 /* Macros make the argument variables come from the current frame */
472 
473 #define eptr frame->Xeptr
474 #define ecode frame->Xecode
475 #define mstart frame->Xmstart
476 #define offset_top frame->Xoffset_top
477 #define ims frame->Xims
478 #define eptrb frame->Xeptrb
479 #define flags frame->Xflags
480 #define rdepth frame->Xrdepth
481 
482 /* Ditto for the local variables */
483 
484 #ifdef SUPPORT_UTF8
485 #define charptr frame->Xcharptr
486 #endif
487 #define callpat frame->Xcallpat
488 #define codelink frame->Xcodelink
489 #define data frame->Xdata
490 #define next frame->Xnext
491 #define pp frame->Xpp
492 #define prev frame->Xprev
493 #define saved_eptr frame->Xsaved_eptr
494 
495 #define new_recursive frame->Xnew_recursive
496 
497 #define cur_is_word frame->Xcur_is_word
498 #define condition frame->Xcondition
499 #define prev_is_word frame->Xprev_is_word
500 
501 #define original_ims frame->Xoriginal_ims
502 
503 #ifdef SUPPORT_UCP
504 #define prop_type frame->Xprop_type
505 #define prop_value frame->Xprop_value
506 #define prop_fail_result frame->Xprop_fail_result
507 #define prop_category frame->Xprop_category
508 #define prop_chartype frame->Xprop_chartype
509 #define prop_script frame->Xprop_script
510 #define oclength frame->Xoclength
511 #define occhars frame->Xocchars
512 #endif
513 
514 #define ctype frame->Xctype
515 #define fc frame->Xfc
516 #define fi frame->Xfi
517 #define length frame->Xlength
518 #define max frame->Xmax
519 #define min frame->Xmin
520 #define number frame->Xnumber
521 #define offset frame->Xoffset
522 #define op frame->Xop
523 #define save_capture_last frame->Xsave_capture_last
524 #define save_offset1 frame->Xsave_offset1
525 #define save_offset2 frame->Xsave_offset2
526 #define save_offset3 frame->Xsave_offset3
527 #define stacksave frame->Xstacksave
528 
529 #define newptrb frame->Xnewptrb
530 
531 /* When recursion is being used, local variables are allocated on the stack and
532 get preserved during recursion in the normal way. In this environment, fi and
533 i, and fc and c, can be the same variables. */
534 
535 #else /* NO_RECURSE not defined */
536 #define fi i
537 #define fc c
538 
539 
540 #ifdef SUPPORT_UTF8 /* Many of these variables are used only */
541 const uschar *charptr; /* in small blocks of the code. My normal */
542 #endif /* style of coding would have declared */
543 const uschar *callpat; /* them within each of those blocks. */
544 const uschar *data; /* However, in order to accommodate the */
545 const uschar *next; /* version of this code that uses an */
546 USPTR pp; /* external "stack" implemented on the */
547 const uschar *prev; /* heap, it is easier to declare them all */
548 USPTR saved_eptr; /* here, so the declarations can be cut */
549  /* out in a block. The only declarations */
550 recursion_info new_recursive; /* within blocks below are for variables */
551  /* that do not have to be preserved over */
552 BOOL cur_is_word; /* a recursive call to RMATCH(). */
553 BOOL condition;
554 BOOL prev_is_word;
555 
556 unsigned long int original_ims;
557 
558 #ifdef SUPPORT_UCP
559 int prop_type;
560 int prop_value;
561 int prop_fail_result;
562 int prop_category;
563 int prop_chartype;
564 int prop_script;
565 int oclength;
566 uschar occhars[8];
567 #endif
568 
569 int codelink;
570 int ctype;
571 int length;
572 int max;
573 int min;
574 int number;
575 int offset;
576 int op;
577 int save_capture_last;
578 int save_offset1, save_offset2, save_offset3;
579 int stacksave[REC_STACK_SAVE_MAX];
580 
581 eptrblock newptrb;
582 #endif /* NO_RECURSE */
583 
584 /* These statements are here to stop the compiler complaining about uninitialized
585 variables. */
586 
587 #ifdef SUPPORT_UCP
588 prop_value = 0;
589 prop_fail_result = 0;
590 #endif
591 
592 
593 /* This label is used for tail recursion, which is used in a few cases even
594 when NO_RECURSE is not defined, in order to reduce the amount of stack that is
595 used. Thanks to Ian Taylor for noticing this possibility and sending the
596 original patch. */
597 
598 TAIL_RECURSE:
599 
600 /* OK, now we can get on with the real code of the function. Recursive calls
601 are specified by the macro RMATCH and RRETURN is used to return. When
602 NO_RECURSE is *not* defined, these just turn into a recursive call to match()
603 and a "return", respectively (possibly with some debugging if DEBUG is
604 defined). However, RMATCH isn't like a function call because it's quite a
605 complicated macro. It has to be used in one particular way. This shouldn't,
606 however, impact performance when true recursion is being used. */
607 
608 #ifdef SUPPORT_UTF8
609 utf8 = md->utf8; /* Local copy of the flag */
610 #else
611 utf8 = FALSE;
612 #endif
613 
614 /* First check that we haven't called match() too many times, or that we
615 haven't exceeded the recursive call limit. */
616 
619 
620 original_ims = ims; /* Save for resetting on ')' */
621 
622 /* At the start of a group with an unlimited repeat that may match an empty
623 string, the match_cbegroup flag is set. When this is the case, add the current
624 subject pointer to the chain of such remembered pointers, to be checked when we
625 hit the closing ket, in order to break infinite loops that match no characters.
626 When match() is called in other circumstances, don't add to the chain. The
627 match_cbegroup flag must NOT be used with tail recursion, because the memory
628 block that is used is on the stack, so a new one may be required for each
629 match(). */
630 
631 if ((flags & match_cbegroup) != 0)
632  {
633  newptrb.epb_saved_eptr = eptr;
634  newptrb.epb_prev = eptrb;
635  eptrb = &newptrb;
636  }
637 
638 /* Now start processing the opcodes. */
639 
640 for (;;)
641  {
642  minimize = possessive = FALSE;
643  op = *ecode;
644 
645  /* For partial matching, remember if we ever hit the end of the subject after
646  matching at least one subject character. */
647 
648  if (md->partial &&
649  eptr >= md->end_subject &&
650  eptr > mstart)
651  md->hitend = TRUE;
652 
653  switch(op)
654  {
655  case OP_FAIL:
657 
658  case OP_PRUNE:
659  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
660  ims, eptrb, flags, RM51);
661  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
663 
664  case OP_COMMIT:
665  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
666  ims, eptrb, flags, RM52);
667  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
669 
670  case OP_SKIP:
671  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
672  ims, eptrb, flags, RM53);
673  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
674  md->start_match_ptr = eptr; /* Pass back current position */
676 
677  case OP_THEN:
678  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
679  ims, eptrb, flags, RM54);
680  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
682 
683  /* Handle a capturing bracket. If there is space in the offset vector, save
684  the current subject position in the working slot at the top of the vector.
685  We mustn't change the current values of the data slot, because they may be
686  set from a previous iteration of this group, and be referred to by a
687  reference inside the group.
688 
689  If the bracket fails to match, we need to restore this value and also the
690  values of the final offsets, in case they were set by a previous iteration
691  of the same bracket.
692 
693  If there isn't enough space in the offset vector, treat this as if it were
694  a non-capturing bracket. Don't worry about setting the flag for the error
695  case here; that is handled in the code for KET. */
696 
697  case OP_CBRA:
698  case OP_SCBRA:
699  number = GET2(ecode, 1+LINK_SIZE);
700  offset = number << 1;
701 
702 #ifdef DEBUG
703  printf("start bracket %d\n", number);
704  printf("subject=");
705  pchars(eptr, 16, TRUE, md);
706  printf("\n");
707 #endif
708 
709  if (offset < md->offset_max)
710  {
711  save_offset1 = md->offset_vector[offset];
712  save_offset2 = md->offset_vector[offset+1];
713  save_offset3 = md->offset_vector[md->offset_end - number];
714  save_capture_last = md->capture_last;
715 
716  DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
717  md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
718 
719  flags = (op == OP_SCBRA)? match_cbegroup : 0;
720  do
721  {
722  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md,
723  ims, eptrb, flags, RM1);
724  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
725  md->capture_last = save_capture_last;
726  ecode += GET(ecode, 1);
727  }
728  while (*ecode == OP_ALT);
729 
730  DPRINTF(("bracket %d failed\n", number));
731 
732  md->offset_vector[offset] = save_offset1;
733  md->offset_vector[offset+1] = save_offset2;
734  md->offset_vector[md->offset_end - number] = save_offset3;
735 
737  }
738 
739  /* FALL THROUGH ... Insufficient room for saving captured contents. Treat
740  as a non-capturing bracket. */
741 
742  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
743  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
744 
745  DPRINTF(("insufficient capture room: treat as non-capturing\n"));
746 
747  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
748  /* VVVVVVVVVVVVVVVVVVVVVVVVV */
749 
750  /* Non-capturing bracket. Loop for all the alternatives. When we get to the
751  final alternative within the brackets, we would return the result of a
752  recursive call to match() whatever happened. We can reduce stack usage by
753  turning this into a tail recursion, except in the case when match_cbegroup
754  is set.*/
755 
756  case OP_BRA:
757  case OP_SBRA:
758  DPRINTF(("start non-capturing bracket\n"));
759  flags = (op >= OP_SBRA)? match_cbegroup : 0;
760  for (;;)
761  {
762  if (ecode[GET(ecode, 1)] != OP_ALT) /* Final alternative */
763  {
764  if (flags == 0) /* Not a possibly empty group */
765  {
766  ecode += _pcre_OP_lengths[*ecode];
767  DPRINTF(("bracket 0 tail recursion\n"));
768  goto TAIL_RECURSE;
769  }
770 
771  /* Possibly empty group; can't use tail recursion. */
772 
773  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
774  eptrb, flags, RM48);
775  RRETURN(rrc);
776  }
777 
778  /* For non-final alternatives, continue the loop for a NOMATCH result;
779  otherwise return. */
780 
781  RMATCH(eptr, ecode + _pcre_OP_lengths[*ecode], offset_top, md, ims,
782  eptrb, flags, RM2);
783  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
784  ecode += GET(ecode, 1);
785  }
786  /* Control never reaches here. */
787 
788  /* Conditional group: compilation checked that there are no more than
789  two branches. If the condition is false, skipping the first branch takes us
790  past the end if there is only one branch, but that's OK because that is
791  exactly what going to the ket would do. As there is only one branch to be
792  obeyed, we can use tail recursion to avoid using another stack frame. */
793 
794  case OP_COND:
795  case OP_SCOND:
796  codelink= GET(ecode, 1);
797 
798  /* Because of the way auto-callout works during compile, a callout item is
799  inserted between OP_COND and an assertion condition. */
800 
801  if (ecode[LINK_SIZE+1] == OP_CALLOUT)
802  {
803  if (pcre_callout != NULL)
804  {
806  cb.version = 1; /* Version 1 of the callout block */
807  cb.callout_number = ecode[LINK_SIZE+2];
808  cb.offset_vector = md->offset_vector;
809  cb.subject = (PCRE_SPTR)md->start_subject;
811  cb.start_match = mstart - md->start_subject;
812  cb.current_position = eptr - md->start_subject;
813  cb.pattern_position = GET(ecode, LINK_SIZE + 3);
814  cb.next_item_length = GET(ecode, 3 + 2*LINK_SIZE);
815  cb.capture_top = offset_top/2;
816  cb.capture_last = md->capture_last;
817  cb.callout_data = md->callout_data;
818  if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
819  if (rrc < 0) RRETURN(rrc);
820  }
821  ecode += _pcre_OP_lengths[OP_CALLOUT];
822  }
823 
824  condcode = ecode[LINK_SIZE+1];
825 
826  /* Now see what the actual condition is */
827 
828  if (condcode == OP_RREF) /* Recursion test */
829  {
830  offset = GET2(ecode, LINK_SIZE + 2); /* Recursion group number*/
831  condition = md->recursive != NULL &&
832  (offset == RREF_ANY || offset == md->recursive->group_num);
833  ecode += condition? 3 : GET(ecode, 1);
834  }
835 
836  else if (condcode == OP_CREF) /* Group used test */
837  {
838  offset = GET2(ecode, LINK_SIZE+2) << 1; /* Doubled ref number */
839  condition = offset < offset_top && md->offset_vector[offset] >= 0;
840  ecode += condition? 3 : GET(ecode, 1);
841  }
842 
843  else if (condcode == OP_DEF) /* DEFINE - always false */
844  {
845  condition = FALSE;
846  ecode += GET(ecode, 1);
847  }
848 
849  /* The condition is an assertion. Call match() to evaluate it - setting
850  the final argument match_condassert causes it to stop at the end of an
851  assertion. */
852 
853  else
854  {
855  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL,
857  if (rrc == MATCH_MATCH)
858  {
859  condition = TRUE;
860  ecode += 1 + LINK_SIZE + GET(ecode, LINK_SIZE + 2);
861  while (*ecode == OP_ALT) ecode += GET(ecode, 1);
862  }
863  else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
864  {
865  RRETURN(rrc); /* Need braces because of following else */
866  }
867  else
868  {
869  condition = FALSE;
870  ecode += codelink;
871  }
872  }
873 
874  /* We are now at the branch that is to be obeyed. As there is only one,
875  we can use tail recursion to avoid using another stack frame, except when
876  match_cbegroup is required for an unlimited repeat of a possibly empty
877  group. If the second alternative doesn't exist, we can just plough on. */
878 
879  if (condition || *ecode == OP_ALT)
880  {
881  ecode += 1 + LINK_SIZE;
882  if (op == OP_SCOND) /* Possibly empty group */
883  {
884  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, match_cbegroup, RM49);
885  RRETURN(rrc);
886  }
887  else /* Group must match something */
888  {
889  flags = 0;
890  goto TAIL_RECURSE;
891  }
892  }
893  else /* Condition false & no alternative */
894  {
895  ecode += 1 + LINK_SIZE;
896  }
897  break;
898 
899 
900  /* End of the pattern, either real or forced. If we are in a top-level
901  recursion, we should restore the offsets appropriately and continue from
902  after the call. */
903 
904  case OP_ACCEPT:
905  case OP_END:
906  if (md->recursive != NULL && md->recursive->group_num == 0)
907  {
908  recursion_info *rec = md->recursive;
909  DPRINTF(("End of pattern in a (?0) recursion\n"));
910  md->recursive = rec->prevrec;
912  rec->saved_max * sizeof(int));
913  mstart = rec->save_start;
914  ims = original_ims;
915  ecode = rec->after_call;
916  break;
917  }
918 
919  /* Otherwise, if PCRE_NOTEMPTY is set, fail if we have matched an empty
920  string - backtracking will then try other alternatives, if any. */
921 
922  if (md->notempty && eptr == mstart) RRETURN(MATCH_NOMATCH);
923  md->end_match_ptr = eptr; /* Record where we ended */
924  md->end_offset_top = offset_top; /* and how many extracts were taken */
925  md->start_match_ptr = mstart; /* and the start (\K can modify) */
927 
928  /* Change option settings */
929 
930  case OP_OPT:
931  ims = ecode[1];
932  ecode += 2;
933  DPRINTF(("ims set to %02lx\n", ims));
934  break;
935 
936  /* Assertion brackets. Check the alternative branches in turn - the
937  matching won't pass the KET for an assertion. If any one branch matches,
938  the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
939  start of each branch to move the current point backwards, so the code at
940  this level is identical to the lookahead case. */
941 
942  case OP_ASSERT:
943  case OP_ASSERTBACK:
944  do
945  {
946  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
947  RM4);
948  if (rrc == MATCH_MATCH) break;
949  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
950  ecode += GET(ecode, 1);
951  }
952  while (*ecode == OP_ALT);
953  if (*ecode == OP_KET) RRETURN(MATCH_NOMATCH);
954 
955  /* If checking an assertion for a condition, return MATCH_MATCH. */
956 
957  if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
958 
959  /* Continue from after the assertion, updating the offsets high water
960  mark, since extracts may have been taken during the assertion. */
961 
962  do ecode += GET(ecode,1); while (*ecode == OP_ALT);
963  ecode += 1 + LINK_SIZE;
964  offset_top = md->end_offset_top;
965  continue;
966 
967  /* Negative assertion: all branches must fail to match */
968 
969  case OP_ASSERT_NOT:
970  case OP_ASSERTBACK_NOT:
971  do
972  {
973  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, NULL, 0,
974  RM5);
975  if (rrc == MATCH_MATCH) RRETURN(MATCH_NOMATCH);
976  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
977  ecode += GET(ecode,1);
978  }
979  while (*ecode == OP_ALT);
980 
981  if ((flags & match_condassert) != 0) RRETURN(MATCH_MATCH);
982 
983  ecode += 1 + LINK_SIZE;
984  continue;
985 
986  /* Move the subject pointer back. This occurs only at the start of
987  each branch of a lookbehind assertion. If we are too close to the start to
988  move back, this match function fails. When working with UTF-8 we move
989  back a number of characters, not bytes. */
990 
991  case OP_REVERSE:
992 #ifdef SUPPORT_UTF8
993  if (utf8)
994  {
995  i = GET(ecode, 1);
996  while (i-- > 0)
997  {
998  eptr--;
999  if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1000  BACKCHAR(eptr);
1001  }
1002  }
1003  else
1004 #endif
1005 
1006  /* No UTF-8 support, or not in UTF-8 mode: count is byte count */
1007 
1008  {
1009  eptr -= GET(ecode, 1);
1010  if (eptr < md->start_subject) RRETURN(MATCH_NOMATCH);
1011  }
1012 
1013  /* Skip to next op code */
1014 
1015  ecode += 1 + LINK_SIZE;
1016  break;
1017 
1018  /* The callout item calls an external function, if one is provided, passing
1019  details of the match so far. This is mainly for debugging, though the
1020  function is able to force a failure. */
1021 
1022  case OP_CALLOUT:
1023  if (pcre_callout != NULL)
1024  {
1025  pcre_callout_block cb;
1026  cb.version = 1; /* Version 1 of the callout block */
1027  cb.callout_number = ecode[1];
1028  cb.offset_vector = md->offset_vector;
1029  cb.subject = (PCRE_SPTR)md->start_subject;
1030  cb.subject_length = md->end_subject - md->start_subject;
1031  cb.start_match = mstart - md->start_subject;
1032  cb.current_position = eptr - md->start_subject;
1033  cb.pattern_position = GET(ecode, 2);
1034  cb.next_item_length = GET(ecode, 2 + LINK_SIZE);
1035  cb.capture_top = offset_top/2;
1036  cb.capture_last = md->capture_last;
1037  cb.callout_data = md->callout_data;
1038  if ((rrc = (*pcre_callout)(&cb)) > 0) RRETURN(MATCH_NOMATCH);
1039  if (rrc < 0) RRETURN(rrc);
1040  }
1041  ecode += 2 + 2*LINK_SIZE;
1042  break;
1043 
1044  /* Recursion either matches the current regex, or some subexpression. The
1045  offset data is the offset to the starting bracket from the start of the
1046  whole pattern. (This is so that it works from duplicated subpatterns.)
1047 
1048  If there are any capturing brackets started but not finished, we have to
1049  save their starting points and reinstate them after the recursion. However,
1050  we don't know how many such there are (offset_top records the completed
1051  total) so we just have to save all the potential data. There may be up to
1052  65535 such values, which is too large to put on the stack, but using malloc
1053  for small numbers seems expensive. As a compromise, the stack is used when
1054  there are no more than REC_STACK_SAVE_MAX values to store; otherwise malloc
1055  is used. A problem is what to do if the malloc fails ... there is no way of
1056  returning to the top level with an error. Save the top REC_STACK_SAVE_MAX
1057  values on the stack, and accept that the rest may be wrong.
1058 
1059  There are also other values that have to be saved. We use a chained
1060  sequence of blocks that actually live on the stack. Thanks to Robin Houston
1061  for the original version of this logic. */
1062 
1063  case OP_RECURSE:
1064  {
1065  callpat = md->start_code + GET(ecode, 1);
1066  new_recursive.group_num = (callpat == md->start_code)? 0 :
1067  GET2(callpat, 1 + LINK_SIZE);
1068 
1069  /* Add to "recursing stack" */
1070 
1071  new_recursive.prevrec = md->recursive;
1072  md->recursive = &new_recursive;
1073 
1074  /* Find where to continue from afterwards */
1075 
1076  ecode += 1 + LINK_SIZE;
1077  new_recursive.after_call = ecode;
1078 
1079  /* Now save the offset data. */
1080 
1081  new_recursive.saved_max = md->offset_end;
1082  if (new_recursive.saved_max <= REC_STACK_SAVE_MAX)
1083  new_recursive.offset_save = stacksave;
1084  else
1085  {
1086  new_recursive.offset_save =
1087  (int *)(pcre_malloc)(new_recursive.saved_max * sizeof(int));
1088  if (new_recursive.offset_save == NULL) RRETURN(PCRE_ERROR_NOMEMORY);
1089  }
1090 
1091  memcpy(new_recursive.offset_save, md->offset_vector,
1092  new_recursive.saved_max * sizeof(int));
1093  new_recursive.save_start = mstart;
1094  mstart = eptr;
1095 
1096  /* OK, now we can do the recursion. For each top-level alternative we
1097  restore the offset and recursion data. */
1098 
1099  DPRINTF(("Recursing into group %d\n", new_recursive.group_num));
1100  flags = (*callpat >= OP_SBRA)? match_cbegroup : 0;
1101  do
1102  {
1103  RMATCH(eptr, callpat + _pcre_OP_lengths[*callpat], offset_top,
1104  md, ims, eptrb, flags, RM6);
1105  if (rrc == MATCH_MATCH)
1106  {
1107  DPRINTF(("Recursion matched\n"));
1108  md->recursive = new_recursive.prevrec;
1109  if (new_recursive.offset_save != stacksave)
1110  (pcre_free)(new_recursive.offset_save);
1112  }
1113  else if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN)
1114  {
1115  DPRINTF(("Recursion gave error %d\n", rrc));
1116  if (new_recursive.offset_save != stacksave)
1117  (pcre_free)(new_recursive.offset_save);
1118  RRETURN(rrc);
1119  }
1120 
1121  md->recursive = &new_recursive;
1122  memcpy(md->offset_vector, new_recursive.offset_save,
1123  new_recursive.saved_max * sizeof(int));
1124  callpat += GET(callpat, 1);
1125  }
1126  while (*callpat == OP_ALT);
1127 
1128  DPRINTF(("Recursion didn't match\n"));
1129  md->recursive = new_recursive.prevrec;
1130  if (new_recursive.offset_save != stacksave)
1131  (pcre_free)(new_recursive.offset_save);
1133  }
1134  /* Control never reaches here */
1135 
1136  /* "Once" brackets are like assertion brackets except that after a match,
1137  the point in the subject string is not moved back. Thus there can never be
1138  a move back into the brackets. Friedl calls these "atomic" subpatterns.
1139  Check the alternative branches in turn - the matching won't pass the KET
1140  for this kind of subpattern. If any one branch matches, we carry on as at
1141  the end of a normal bracket, leaving the subject pointer. */
1142 
1143  case OP_ONCE:
1144  prev = ecode;
1145  saved_eptr = eptr;
1146 
1147  do
1148  {
1149  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM7);
1150  if (rrc == MATCH_MATCH) break;
1151  if (rrc != MATCH_NOMATCH && rrc != MATCH_THEN) RRETURN(rrc);
1152  ecode += GET(ecode,1);
1153  }
1154  while (*ecode == OP_ALT);
1155 
1156  /* If we hit the end of the group (which could be repeated), fail */
1157 
1158  if (*ecode != OP_ONCE && *ecode != OP_ALT) RRETURN(MATCH_NOMATCH);
1159 
1160  /* Continue as from after the assertion, updating the offsets high water
1161  mark, since extracts may have been taken. */
1162 
1163  do ecode += GET(ecode, 1); while (*ecode == OP_ALT);
1164 
1165  offset_top = md->end_offset_top;
1166  eptr = md->end_match_ptr;
1167 
1168  /* For a non-repeating ket, just continue at this level. This also
1169  happens for a repeating ket if no characters were matched in the group.
1170  This is the forcible breaking of infinite loops as implemented in Perl
1171  5.005. If there is an options reset, it will get obeyed in the normal
1172  course of events. */
1173 
1174  if (*ecode == OP_KET || eptr == saved_eptr)
1175  {
1176  ecode += 1+LINK_SIZE;
1177  break;
1178  }
1179 
1180  /* The repeating kets try the rest of the pattern or restart from the
1181  preceding bracket, in the appropriate order. The second "call" of match()
1182  uses tail recursion, to avoid using another stack frame. We need to reset
1183  any options that changed within the bracket before re-running it, so
1184  check the next opcode. */
1185 
1186  if (ecode[1+LINK_SIZE] == OP_OPT)
1187  {
1188  ims = (ims & ~PCRE_IMS) | ecode[4];
1189  DPRINTF(("ims set to %02lx at group repeat\n", ims));
1190  }
1191 
1192  if (*ecode == OP_KETRMIN)
1193  {
1194  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM8);
1195  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1196  ecode = prev;
1197  flags = 0;
1198  goto TAIL_RECURSE;
1199  }
1200  else /* OP_KETRMAX */
1201  {
1202  RMATCH(eptr, prev, offset_top, md, ims, eptrb, match_cbegroup, RM9);
1203  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1204  ecode += 1 + LINK_SIZE;
1205  flags = 0;
1206  goto TAIL_RECURSE;
1207  }
1208  /* Control never gets here */
1209 
1210  /* An alternation is the end of a branch; scan along to find the end of the
1211  bracketed group and go to there. */
1212 
1213  case OP_ALT:
1214  do ecode += GET(ecode,1); while (*ecode == OP_ALT);
1215  break;
1216 
1217  /* BRAZERO, BRAMINZERO and SKIPZERO occur just before a bracket group,
1218  indicating that it may occur zero times. It may repeat infinitely, or not
1219  at all - i.e. it could be ()* or ()? or even (){0} in the pattern. Brackets
1220  with fixed upper repeat limits are compiled as a number of copies, with the
1221  optional ones preceded by BRAZERO or BRAMINZERO. */
1222 
1223  case OP_BRAZERO:
1224  {
1225  next = ecode+1;
1226  RMATCH(eptr, next, offset_top, md, ims, eptrb, 0, RM10);
1227  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1228  do next += GET(next,1); while (*next == OP_ALT);
1229  ecode = next + 1 + LINK_SIZE;
1230  }
1231  break;
1232 
1233  case OP_BRAMINZERO:
1234  {
1235  next = ecode+1;
1236  do next += GET(next, 1); while (*next == OP_ALT);
1237  RMATCH(eptr, next + 1+LINK_SIZE, offset_top, md, ims, eptrb, 0, RM11);
1238  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1239  ecode++;
1240  }
1241  break;
1242 
1243  case OP_SKIPZERO:
1244  {
1245  next = ecode+1;
1246  do next += GET(next,1); while (*next == OP_ALT);
1247  ecode = next + 1 + LINK_SIZE;
1248  }
1249  break;
1250 
1251  /* End of a group, repeated or non-repeating. */
1252 
1253  case OP_KET:
1254  case OP_KETRMIN:
1255  case OP_KETRMAX:
1256  prev = ecode - GET(ecode, 1);
1257 
1258  /* If this was a group that remembered the subject start, in order to break
1259  infinite repeats of empty string matches, retrieve the subject start from
1260  the chain. Otherwise, set it NULL. */
1261 
1262  if (*prev >= OP_SBRA)
1263  {
1264  saved_eptr = eptrb->epb_saved_eptr; /* Value at start of group */
1265  eptrb = eptrb->epb_prev; /* Backup to previous group */
1266  }
1267  else saved_eptr = NULL;
1268 
1269  /* If we are at the end of an assertion group, stop matching and return
1270  MATCH_MATCH, but record the current high water mark for use by positive
1271  assertions. Do this also for the "once" (atomic) groups. */
1272 
1273  if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
1274  *prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
1275  *prev == OP_ONCE)
1276  {
1277  md->end_match_ptr = eptr; /* For ONCE */
1278  md->end_offset_top = offset_top;
1280  }
1281 
1282  /* For capturing groups we have to check the group number back at the start
1283  and if necessary complete handling an extraction by setting the offsets and
1284  bumping the high water mark. Note that whole-pattern recursion is coded as
1285  a recurse into group 0, so it won't be picked up here. Instead, we catch it
1286  when the OP_END is reached. Other recursion is handled here. */
1287 
1288  if (*prev == OP_CBRA || *prev == OP_SCBRA)
1289  {
1290  number = GET2(prev, 1+LINK_SIZE);
1291  offset = number << 1;
1292 
1293 #ifdef DEBUG
1294  printf("end bracket %d", number);
1295  printf("\n");
1296 #endif
1297 
1298  md->capture_last = number;
1299  if (offset >= md->offset_max) md->offset_overflow = TRUE; else
1300  {
1301  md->offset_vector[offset] =
1302  md->offset_vector[md->offset_end - number];
1303  md->offset_vector[offset+1] = eptr - md->start_subject;
1304  if (offset_top <= offset) offset_top = offset + 2;
1305  }
1306 
1307  /* Handle a recursively called group. Restore the offsets
1308  appropriately and continue from after the call. */
1309 
1310  if (md->recursive != NULL && md->recursive->group_num == number)
1311  {
1312  recursion_info *rec = md->recursive;
1313  DPRINTF(("Recursion (%d) succeeded - continuing\n", number));
1314  md->recursive = rec->prevrec;
1315  mstart = rec->save_start;
1316  memcpy(md->offset_vector, rec->offset_save,
1317  rec->saved_max * sizeof(int));
1318  ecode = rec->after_call;
1319  ims = original_ims;
1320  break;
1321  }
1322  }
1323 
1324  /* For both capturing and non-capturing groups, reset the value of the ims
1325  flags, in case they got changed during the group. */
1326 
1327  ims = original_ims;
1328  DPRINTF(("ims reset to %02lx\n", ims));
1329 
1330  /* For a non-repeating ket, just continue at this level. This also
1331  happens for a repeating ket if no characters were matched in the group.
1332  This is the forcible breaking of infinite loops as implemented in Perl
1333  5.005. If there is an options reset, it will get obeyed in the normal
1334  course of events. */
1335 
1336  if (*ecode == OP_KET || eptr == saved_eptr)
1337  {
1338  ecode += 1 + LINK_SIZE;
1339  break;
1340  }
1341 
1342  /* The repeating kets try the rest of the pattern or restart from the
1343  preceding bracket, in the appropriate order. In the second case, we can use
1344  tail recursion to avoid using another stack frame, unless we have an
1345  unlimited repeat of a group that can match an empty string. */
1346 
1347  flags = (*prev >= OP_SBRA)? match_cbegroup : 0;
1348 
1349  if (*ecode == OP_KETRMIN)
1350  {
1351  RMATCH(eptr, ecode + 1 + LINK_SIZE, offset_top, md, ims, eptrb, 0, RM12);
1352  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1353  if (flags != 0) /* Could match an empty string */
1354  {
1355  RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM50);
1356  RRETURN(rrc);
1357  }
1358  ecode = prev;
1359  goto TAIL_RECURSE;
1360  }
1361  else /* OP_KETRMAX */
1362  {
1363  RMATCH(eptr, prev, offset_top, md, ims, eptrb, flags, RM13);
1364  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1365  ecode += 1 + LINK_SIZE;
1366  flags = 0;
1367  goto TAIL_RECURSE;
1368  }
1369  /* Control never gets here */
1370 
1371  /* Start of subject unless notbol, or after internal newline if multiline */
1372 
1373  case OP_CIRC:
1374  if (md->notbol && eptr == md->start_subject) RRETURN(MATCH_NOMATCH);
1375  if ((ims & PCRE_MULTILINE) != 0)
1376  {
1377  if (eptr != md->start_subject &&
1378  (eptr == md->end_subject || !WAS_NEWLINE(eptr)))
1380  ecode++;
1381  break;
1382  }
1383  /* ... else fall through */
1384 
1385  /* Start of subject assertion */
1386 
1387  case OP_SOD:
1388  if (eptr != md->start_subject) RRETURN(MATCH_NOMATCH);
1389  ecode++;
1390  break;
1391 
1392  /* Start of match assertion */
1393 
1394  case OP_SOM:
1395  if (eptr != md->start_subject + md->start_offset) RRETURN(MATCH_NOMATCH);
1396  ecode++;
1397  break;
1398 
1399  /* Reset the start of match point */
1400 
1401  case OP_SET_SOM:
1402  mstart = eptr;
1403  ecode++;
1404  break;
1405 
1406  /* Assert before internal newline if multiline, or before a terminating
1407  newline unless endonly is set, else end of subject unless noteol is set. */
1408 
1409  case OP_DOLL:
1410  if ((ims & PCRE_MULTILINE) != 0)
1411  {
1412  if (eptr < md->end_subject)
1413  { if (!IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH); }
1414  else
1415  { if (md->noteol) RRETURN(MATCH_NOMATCH); }
1416  ecode++;
1417  break;
1418  }
1419  else
1420  {
1421  if (md->noteol) RRETURN(MATCH_NOMATCH);
1422  if (!md->endonly)
1423  {
1424  if (eptr != md->end_subject &&
1425  (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1427  ecode++;
1428  break;
1429  }
1430  }
1431  /* ... else fall through for endonly */
1432 
1433  /* End of subject assertion (\z) */
1434 
1435  case OP_EOD:
1436  if (eptr < md->end_subject) RRETURN(MATCH_NOMATCH);
1437  ecode++;
1438  break;
1439 
1440  /* End of subject or ending \n assertion (\Z) */
1441 
1442  case OP_EODN:
1443  if (eptr != md->end_subject &&
1444  (!IS_NEWLINE(eptr) || eptr != md->end_subject - md->nllen))
1446  ecode++;
1447  break;
1448 
1449  /* Word boundary assertions */
1450 
1451  case OP_NOT_WORD_BOUNDARY:
1452  case OP_WORD_BOUNDARY:
1453  {
1454 
1455  /* Find out if the previous and current characters are "word" characters.
1456  It takes a bit more work in UTF-8 mode. Characters > 255 are assumed to
1457  be "non-word" characters. */
1458 
1459 #ifdef SUPPORT_UTF8
1460  if (utf8)
1461  {
1462  if (eptr == md->start_subject) prev_is_word = FALSE; else
1463  {
1464  USPTR lastptr = eptr - 1;
1465  while((*lastptr & 0xc0) == 0x80) lastptr--;
1466  GETCHAR(c, lastptr);
1467  prev_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1468  }
1469  if (eptr >= md->end_subject) cur_is_word = FALSE; else
1470  {
1471  GETCHAR(c, eptr);
1472  cur_is_word = c < 256 && (md->ctypes[c] & ctype_word) != 0;
1473  }
1474  }
1475  else
1476 #endif
1477 
1478  /* More streamlined when not in UTF-8 mode */
1479 
1480  {
1481  prev_is_word = (eptr != md->start_subject) &&
1482  ((md->ctypes[eptr[-1]] & ctype_word) != 0);
1483  cur_is_word = (eptr < md->end_subject) &&
1484  ((md->ctypes[*eptr] & ctype_word) != 0);
1485  }
1486 
1487  /* Now see if the situation is what we want */
1488 
1489  if ((*ecode++ == OP_WORD_BOUNDARY)?
1490  cur_is_word == prev_is_word : cur_is_word != prev_is_word)
1492  }
1493  break;
1494 
1495  /* Match a single character type; inline for speed */
1496 
1497  case OP_ANY:
1498  if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
1499  /* Fall through */
1500 
1501  case OP_ALLANY:
1502  if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1503  if (utf8) while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
1504  ecode++;
1505  break;
1506 
1507  /* Match a single byte, even in UTF-8 mode. This opcode really does match
1508  any byte, even newline, independent of the setting of PCRE_DOTALL. */
1509 
1510  case OP_ANYBYTE:
1511  if (eptr++ >= md->end_subject) RRETURN(MATCH_NOMATCH);
1512  ecode++;
1513  break;
1514 
1515  case OP_NOT_DIGIT:
1516  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1517  GETCHARINCTEST(c, eptr);
1518  if (
1519 #ifdef SUPPORT_UTF8
1520  c < 256 &&
1521 #endif
1522  (md->ctypes[c] & ctype_digit) != 0
1523  )
1525  ecode++;
1526  break;
1527 
1528  case OP_DIGIT:
1529  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1530  GETCHARINCTEST(c, eptr);
1531  if (
1532 #ifdef SUPPORT_UTF8
1533  c >= 256 ||
1534 #endif
1535  (md->ctypes[c] & ctype_digit) == 0
1536  )
1538  ecode++;
1539  break;
1540 
1541  case OP_NOT_WHITESPACE:
1542  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1543  GETCHARINCTEST(c, eptr);
1544  if (
1545 #ifdef SUPPORT_UTF8
1546  c < 256 &&
1547 #endif
1548  (md->ctypes[c] & ctype_space) != 0
1549  )
1551  ecode++;
1552  break;
1553 
1554  case OP_WHITESPACE:
1555  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1556  GETCHARINCTEST(c, eptr);
1557  if (
1558 #ifdef SUPPORT_UTF8
1559  c >= 256 ||
1560 #endif
1561  (md->ctypes[c] & ctype_space) == 0
1562  )
1564  ecode++;
1565  break;
1566 
1567  case OP_NOT_WORDCHAR:
1568  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1569  GETCHARINCTEST(c, eptr);
1570  if (
1571 #ifdef SUPPORT_UTF8
1572  c < 256 &&
1573 #endif
1574  (md->ctypes[c] & ctype_word) != 0
1575  )
1577  ecode++;
1578  break;
1579 
1580  case OP_WORDCHAR:
1581  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1582  GETCHARINCTEST(c, eptr);
1583  if (
1584 #ifdef SUPPORT_UTF8
1585  c >= 256 ||
1586 #endif
1587  (md->ctypes[c] & ctype_word) == 0
1588  )
1590  ecode++;
1591  break;
1592 
1593  case OP_ANYNL:
1594  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1595  GETCHARINCTEST(c, eptr);
1596  switch(c)
1597  {
1598  default: RRETURN(MATCH_NOMATCH);
1599  case 0x000d:
1600  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
1601  break;
1602 
1603  case 0x000a:
1604  break;
1605 
1606  case 0x000b:
1607  case 0x000c:
1608  case 0x0085:
1609  case 0x2028:
1610  case 0x2029:
1611  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
1612  break;
1613  }
1614  ecode++;
1615  break;
1616 
1617  case OP_NOT_HSPACE:
1618  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1619  GETCHARINCTEST(c, eptr);
1620  switch(c)
1621  {
1622  default: break;
1623  case 0x09: /* HT */
1624  case 0x20: /* SPACE */
1625  case 0xa0: /* NBSP */
1626  case 0x1680: /* OGHAM SPACE MARK */
1627  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1628  case 0x2000: /* EN QUAD */
1629  case 0x2001: /* EM QUAD */
1630  case 0x2002: /* EN SPACE */
1631  case 0x2003: /* EM SPACE */
1632  case 0x2004: /* THREE-PER-EM SPACE */
1633  case 0x2005: /* FOUR-PER-EM SPACE */
1634  case 0x2006: /* SIX-PER-EM SPACE */
1635  case 0x2007: /* FIGURE SPACE */
1636  case 0x2008: /* PUNCTUATION SPACE */
1637  case 0x2009: /* THIN SPACE */
1638  case 0x200A: /* HAIR SPACE */
1639  case 0x202f: /* NARROW NO-BREAK SPACE */
1640  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1641  case 0x3000: /* IDEOGRAPHIC SPACE */
1643  }
1644  ecode++;
1645  break;
1646 
1647  case OP_HSPACE:
1648  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1649  GETCHARINCTEST(c, eptr);
1650  switch(c)
1651  {
1652  default: RRETURN(MATCH_NOMATCH);
1653  case 0x09: /* HT */
1654  case 0x20: /* SPACE */
1655  case 0xa0: /* NBSP */
1656  case 0x1680: /* OGHAM SPACE MARK */
1657  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
1658  case 0x2000: /* EN QUAD */
1659  case 0x2001: /* EM QUAD */
1660  case 0x2002: /* EN SPACE */
1661  case 0x2003: /* EM SPACE */
1662  case 0x2004: /* THREE-PER-EM SPACE */
1663  case 0x2005: /* FOUR-PER-EM SPACE */
1664  case 0x2006: /* SIX-PER-EM SPACE */
1665  case 0x2007: /* FIGURE SPACE */
1666  case 0x2008: /* PUNCTUATION SPACE */
1667  case 0x2009: /* THIN SPACE */
1668  case 0x200A: /* HAIR SPACE */
1669  case 0x202f: /* NARROW NO-BREAK SPACE */
1670  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
1671  case 0x3000: /* IDEOGRAPHIC SPACE */
1672  break;
1673  }
1674  ecode++;
1675  break;
1676 
1677  case OP_NOT_VSPACE:
1678  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1679  GETCHARINCTEST(c, eptr);
1680  switch(c)
1681  {
1682  default: break;
1683  case 0x0a: /* LF */
1684  case 0x0b: /* VT */
1685  case 0x0c: /* FF */
1686  case 0x0d: /* CR */
1687  case 0x85: /* NEL */
1688  case 0x2028: /* LINE SEPARATOR */
1689  case 0x2029: /* PARAGRAPH SEPARATOR */
1691  }
1692  ecode++;
1693  break;
1694 
1695  case OP_VSPACE:
1696  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1697  GETCHARINCTEST(c, eptr);
1698  switch(c)
1699  {
1700  default: RRETURN(MATCH_NOMATCH);
1701  case 0x0a: /* LF */
1702  case 0x0b: /* VT */
1703  case 0x0c: /* FF */
1704  case 0x0d: /* CR */
1705  case 0x85: /* NEL */
1706  case 0x2028: /* LINE SEPARATOR */
1707  case 0x2029: /* PARAGRAPH SEPARATOR */
1708  break;
1709  }
1710  ecode++;
1711  break;
1712 
1713 #ifdef SUPPORT_UCP
1714  /* Check the next character by Unicode property. We will get here only
1715  if the support is in the binary; otherwise a compile-time error occurs. */
1716 
1717  case OP_PROP:
1718  case OP_NOTPROP:
1719  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1720  GETCHARINCTEST(c, eptr);
1721  {
1722  const ucd_record *prop = GET_UCD(c);
1723 
1724  switch(ecode[1])
1725  {
1726  case PT_ANY:
1727  if (op == OP_NOTPROP) RRETURN(MATCH_NOMATCH);
1728  break;
1729 
1730  case PT_LAMP:
1731  if ((prop->chartype == ucp_Lu ||
1732  prop->chartype == ucp_Ll ||
1733  prop->chartype == ucp_Lt) == (op == OP_NOTPROP))
1735  break;
1736 
1737  case PT_GC:
1738  if ((ecode[2] != _pcre_ucp_gentype[prop->chartype]) == (op == OP_PROP))
1740  break;
1741 
1742  case PT_PC:
1743  if ((ecode[2] != prop->chartype) == (op == OP_PROP))
1745  break;
1746 
1747  case PT_SC:
1748  if ((ecode[2] != prop->script) == (op == OP_PROP))
1750  break;
1751 
1752  default:
1754  }
1755 
1756  ecode += 3;
1757  }
1758  break;
1759 
1760  /* Match an extended Unicode sequence. We will get here only if the support
1761  is in the binary; otherwise a compile-time error occurs. */
1762 
1763  case OP_EXTUNI:
1764  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1765  GETCHARINCTEST(c, eptr);
1766  {
1767  int category = UCD_CATEGORY(c);
1768  if (category == ucp_M) RRETURN(MATCH_NOMATCH);
1769  while (eptr < md->end_subject)
1770  {
1771  int len = 1;
1772  if (!utf8) c = *eptr; else
1773  {
1774  GETCHARLEN(c, eptr, len);
1775  }
1776  category = UCD_CATEGORY(c);
1777  if (category != ucp_M) break;
1778  eptr += len;
1779  }
1780  }
1781  ecode++;
1782  break;
1783 #endif
1784 
1785 
1786  /* Match a back reference, possibly repeatedly. Look past the end of the
1787  item to see if there is repeat information following. The code is similar
1788  to that for character classes, but repeated for efficiency. Then obey
1789  similar code to character type repeats - written out again for speed.
1790  However, if the referenced string is the empty string, always treat
1791  it as matched, any number of times (otherwise there could be infinite
1792  loops). */
1793 
1794  case OP_REF:
1795  {
1796  offset = GET2(ecode, 1) << 1; /* Doubled ref number */
1797  ecode += 3;
1798 
1799  /* If the reference is unset, there are two possibilities:
1800 
1801  (a) In the default, Perl-compatible state, set the length to be longer
1802  than the amount of subject left; this ensures that every attempt at a
1803  match fails. We can't just fail here, because of the possibility of
1804  quantifiers with zero minima.
1805 
1806  (b) If the JavaScript compatibility flag is set, set the length to zero
1807  so that the back reference matches an empty string.
1808 
1809  Otherwise, set the length to the length of what was matched by the
1810  referenced subpattern. */
1811 
1812  if (offset >= offset_top || md->offset_vector[offset] < 0)
1813  length = (md->jscript_compat)? 0 : md->end_subject - eptr + 1;
1814  else
1815  length = md->offset_vector[offset+1] - md->offset_vector[offset];
1816 
1817  /* Set up for repetition, or handle the non-repeated case */
1818 
1819  switch (*ecode)
1820  {
1821  case OP_CRSTAR:
1822  case OP_CRMINSTAR:
1823  case OP_CRPLUS:
1824  case OP_CRMINPLUS:
1825  case OP_CRQUERY:
1826  case OP_CRMINQUERY:
1827  c = *ecode++ - OP_CRSTAR;
1828  minimize = (c & 1) != 0;
1829  min = rep_min[c]; /* Pick up values from tables; */
1830  max = rep_max[c]; /* zero for max => infinity */
1831  if (max == 0) max = INT_MAX;
1832  break;
1833 
1834  case OP_CRRANGE:
1835  case OP_CRMINRANGE:
1836  minimize = (*ecode == OP_CRMINRANGE);
1837  min = GET2(ecode, 1);
1838  max = GET2(ecode, 3);
1839  if (max == 0) max = INT_MAX;
1840  ecode += 5;
1841  break;
1842 
1843  default: /* No repeat follows */
1844  if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1845  eptr += length;
1846  continue; /* With the main loop */
1847  }
1848 
1849  /* If the length of the reference is zero, just continue with the
1850  main loop. */
1851 
1852  if (length == 0) continue;
1853 
1854  /* First, ensure the minimum number of matches are present. We get back
1855  the length of the reference string explicitly rather than passing the
1856  address of eptr, so that eptr can be a register variable. */
1857 
1858  for (i = 1; i <= min; i++)
1859  {
1860  if (!match_ref(offset, eptr, length, md, ims)) RRETURN(MATCH_NOMATCH);
1861  eptr += length;
1862  }
1863 
1864  /* If min = max, continue at the same level without recursion.
1865  They are not both allowed to be zero. */
1866 
1867  if (min == max) continue;
1868 
1869  /* If minimizing, keep trying and advancing the pointer */
1870 
1871  if (minimize)
1872  {
1873  for (fi = min;; fi++)
1874  {
1875  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM14);
1876  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1877  if (fi >= max || !match_ref(offset, eptr, length, md, ims))
1879  eptr += length;
1880  }
1881  /* Control never gets here */
1882  }
1883 
1884  /* If maximizing, find the longest string and work backwards */
1885 
1886  else
1887  {
1888  pp = eptr;
1889  for (i = min; i < max; i++)
1890  {
1891  if (!match_ref(offset, eptr, length, md, ims)) break;
1892  eptr += length;
1893  }
1894  while (eptr >= pp)
1895  {
1896  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM15);
1897  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
1898  eptr -= length;
1899  }
1901  }
1902  }
1903  /* Control never gets here */
1904 
1905 
1906 
1907  /* Match a bit-mapped character class, possibly repeatedly. This op code is
1908  used when all the characters in the class have values in the range 0-255,
1909  and either the matching is caseful, or the characters are in the range
1910  0-127 when UTF-8 processing is enabled. The only difference between
1911  OP_CLASS and OP_NCLASS occurs when a data character outside the range is
1912  encountered.
1913 
1914  First, look past the end of the item to see if there is repeat information
1915  following. Then obey similar code to character type repeats - written out
1916  again for speed. */
1917 
1918  case OP_NCLASS:
1919  case OP_CLASS:
1920  {
1921  data = ecode + 1; /* Save for matching */
1922  ecode += 33; /* Advance past the item */
1923 
1924  switch (*ecode)
1925  {
1926  case OP_CRSTAR:
1927  case OP_CRMINSTAR:
1928  case OP_CRPLUS:
1929  case OP_CRMINPLUS:
1930  case OP_CRQUERY:
1931  case OP_CRMINQUERY:
1932  c = *ecode++ - OP_CRSTAR;
1933  minimize = (c & 1) != 0;
1934  min = rep_min[c]; /* Pick up values from tables; */
1935  max = rep_max[c]; /* zero for max => infinity */
1936  if (max == 0) max = INT_MAX;
1937  break;
1938 
1939  case OP_CRRANGE:
1940  case OP_CRMINRANGE:
1941  minimize = (*ecode == OP_CRMINRANGE);
1942  min = GET2(ecode, 1);
1943  max = GET2(ecode, 3);
1944  if (max == 0) max = INT_MAX;
1945  ecode += 5;
1946  break;
1947 
1948  default: /* No repeat follows */
1949  min = max = 1;
1950  break;
1951  }
1952 
1953  /* First, ensure the minimum number of matches are present. */
1954 
1955 #ifdef SUPPORT_UTF8
1956  /* UTF-8 mode */
1957  if (utf8)
1958  {
1959  for (i = 1; i <= min; i++)
1960  {
1961  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1962  GETCHARINC(c, eptr);
1963  if (c > 255)
1964  {
1965  if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
1966  }
1967  else
1968  {
1969  if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1970  }
1971  }
1972  }
1973  else
1974 #endif
1975  /* Not UTF-8 mode */
1976  {
1977  for (i = 1; i <= min; i++)
1978  {
1979  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
1980  c = *eptr++;
1981  if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
1982  }
1983  }
1984 
1985  /* If max == min we can continue with the main loop without the
1986  need to recurse. */
1987 
1988  if (min == max) continue;
1989 
1990  /* If minimizing, keep testing the rest of the expression and advancing
1991  the pointer while it matches the class. */
1992 
1993  if (minimize)
1994  {
1995 #ifdef SUPPORT_UTF8
1996  /* UTF-8 mode */
1997  if (utf8)
1998  {
1999  for (fi = min;; fi++)
2000  {
2001  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM16);
2002  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2003  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2004  GETCHARINC(c, eptr);
2005  if (c > 255)
2006  {
2007  if (op == OP_CLASS) RRETURN(MATCH_NOMATCH);
2008  }
2009  else
2010  {
2011  if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2012  }
2013  }
2014  }
2015  else
2016 #endif
2017  /* Not UTF-8 mode */
2018  {
2019  for (fi = min;; fi++)
2020  {
2021  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM17);
2022  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2023  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2024  c = *eptr++;
2025  if ((data[c/8] & (1 << (c&7))) == 0) RRETURN(MATCH_NOMATCH);
2026  }
2027  }
2028  /* Control never gets here */
2029  }
2030 
2031  /* If maximizing, find the longest possible run, then work backwards. */
2032 
2033  else
2034  {
2035  pp = eptr;
2036 
2037 #ifdef SUPPORT_UTF8
2038  /* UTF-8 mode */
2039  if (utf8)
2040  {
2041  for (i = min; i < max; i++)
2042  {
2043  int len = 1;
2044  if (eptr >= md->end_subject) break;
2045  GETCHARLEN(c, eptr, len);
2046  if (c > 255)
2047  {
2048  if (op == OP_CLASS) break;
2049  }
2050  else
2051  {
2052  if ((data[c/8] & (1 << (c&7))) == 0) break;
2053  }
2054  eptr += len;
2055  }
2056  for (;;)
2057  {
2058  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM18);
2059  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2060  if (eptr-- == pp) break; /* Stop if tried at original pos */
2061  BACKCHAR(eptr);
2062  }
2063  }
2064  else
2065 #endif
2066  /* Not UTF-8 mode */
2067  {
2068  for (i = min; i < max; i++)
2069  {
2070  if (eptr >= md->end_subject) break;
2071  c = *eptr;
2072  if ((data[c/8] & (1 << (c&7))) == 0) break;
2073  eptr++;
2074  }
2075  while (eptr >= pp)
2076  {
2077  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM19);
2078  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2079  eptr--;
2080  }
2081  }
2082 
2084  }
2085  }
2086  /* Control never gets here */
2087 
2088 
2089  /* Match an extended character class. This opcode is encountered only
2090  when UTF-8 mode is supported. Nevertheless, we may not be in UTF-8
2091  mode, because Unicode properties are supported in non-UTF-8 mode. */
2092 
2093 #ifdef SUPPORT_UTF8
2094  case OP_XCLASS:
2095  {
2096  data = ecode + 1 + LINK_SIZE; /* Save for matching */
2097  ecode += GET(ecode, 1); /* Advance past the item */
2098 
2099  switch (*ecode)
2100  {
2101  case OP_CRSTAR:
2102  case OP_CRMINSTAR:
2103  case OP_CRPLUS:
2104  case OP_CRMINPLUS:
2105  case OP_CRQUERY:
2106  case OP_CRMINQUERY:
2107  c = *ecode++ - OP_CRSTAR;
2108  minimize = (c & 1) != 0;
2109  min = rep_min[c]; /* Pick up values from tables; */
2110  max = rep_max[c]; /* zero for max => infinity */
2111  if (max == 0) max = INT_MAX;
2112  break;
2113 
2114  case OP_CRRANGE:
2115  case OP_CRMINRANGE:
2116  minimize = (*ecode == OP_CRMINRANGE);
2117  min = GET2(ecode, 1);
2118  max = GET2(ecode, 3);
2119  if (max == 0) max = INT_MAX;
2120  ecode += 5;
2121  break;
2122 
2123  default: /* No repeat follows */
2124  min = max = 1;
2125  break;
2126  }
2127 
2128  /* First, ensure the minimum number of matches are present. */
2129 
2130  for (i = 1; i <= min; i++)
2131  {
2132  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2133  GETCHARINCTEST(c, eptr);
2134  if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2135  }
2136 
2137  /* If max == min we can continue with the main loop without the
2138  need to recurse. */
2139 
2140  if (min == max) continue;
2141 
2142  /* If minimizing, keep testing the rest of the expression and advancing
2143  the pointer while it matches the class. */
2144 
2145  if (minimize)
2146  {
2147  for (fi = min;; fi++)
2148  {
2149  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM20);
2150  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2151  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2152  GETCHARINCTEST(c, eptr);
2153  if (!_pcre_xclass(c, data)) RRETURN(MATCH_NOMATCH);
2154  }
2155  /* Control never gets here */
2156  }
2157 
2158  /* If maximizing, find the longest possible run, then work backwards. */
2159 
2160  else
2161  {
2162  pp = eptr;
2163  for (i = min; i < max; i++)
2164  {
2165  int len = 1;
2166  if (eptr >= md->end_subject) break;
2167  GETCHARLENTEST(c, eptr, len);
2168  if (!_pcre_xclass(c, data)) break;
2169  eptr += len;
2170  }
2171  for(;;)
2172  {
2173  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM21);
2174  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2175  if (eptr-- == pp) break; /* Stop if tried at original pos */
2176  if (utf8) BACKCHAR(eptr);
2177  }
2179  }
2180 
2181  /* Control never gets here */
2182  }
2183 #endif /* End of XCLASS */
2184 
2185  /* Match a single character, casefully */
2186 
2187  case OP_CHAR:
2188 #ifdef SUPPORT_UTF8
2189  if (utf8)
2190  {
2191  length = 1;
2192  ecode++;
2193  GETCHARLEN(fc, ecode, length);
2194  if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2195  while (length-- > 0) if (*ecode++ != *eptr++) RRETURN(MATCH_NOMATCH);
2196  }
2197  else
2198 #endif
2199 
2200  /* Non-UTF-8 mode */
2201  {
2202  if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2203  if (ecode[1] != *eptr++) RRETURN(MATCH_NOMATCH);
2204  ecode += 2;
2205  }
2206  break;
2207 
2208  /* Match a single character, caselessly */
2209 
2210  case OP_CHARNC:
2211 #ifdef SUPPORT_UTF8
2212  if (utf8)
2213  {
2214  length = 1;
2215  ecode++;
2216  GETCHARLEN(fc, ecode, length);
2217 
2218  if (length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2219 
2220  /* If the pattern character's value is < 128, we have only one byte, and
2221  can use the fast lookup table. */
2222 
2223  if (fc < 128)
2224  {
2225  if (md->lcc[*ecode++] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2226  }
2227 
2228  /* Otherwise we must pick up the subject character */
2229 
2230  else
2231  {
2232  unsigned int dc;
2233  GETCHARINC(dc, eptr);
2234  ecode += length;
2235 
2236  /* If we have Unicode property support, we can use it to test the other
2237  case of the character, if there is one. */
2238 
2239  if (fc != dc)
2240  {
2241 #ifdef SUPPORT_UCP
2242  if (dc != UCD_OTHERCASE(fc))
2243 #endif
2245  }
2246  }
2247  }
2248  else
2249 #endif /* SUPPORT_UTF8 */
2250 
2251  /* Non-UTF-8 mode */
2252  {
2253  if (md->end_subject - eptr < 1) RRETURN(MATCH_NOMATCH);
2254  if (md->lcc[ecode[1]] != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2255  ecode += 2;
2256  }
2257  break;
2258 
2259  /* Match a single character repeatedly. */
2260 
2261  case OP_EXACT:
2262  min = max = GET2(ecode, 1);
2263  ecode += 3;
2264  goto REPEATCHAR;
2265 
2266  case OP_POSUPTO:
2267  possessive = TRUE;
2268  /* Fall through */
2269 
2270  case OP_UPTO:
2271  case OP_MINUPTO:
2272  min = 0;
2273  max = GET2(ecode, 1);
2274  minimize = *ecode == OP_MINUPTO;
2275  ecode += 3;
2276  goto REPEATCHAR;
2277 
2278  case OP_POSSTAR:
2279  possessive = TRUE;
2280  min = 0;
2281  max = INT_MAX;
2282  ecode++;
2283  goto REPEATCHAR;
2284 
2285  case OP_POSPLUS:
2286  possessive = TRUE;
2287  min = 1;
2288  max = INT_MAX;
2289  ecode++;
2290  goto REPEATCHAR;
2291 
2292  case OP_POSQUERY:
2293  possessive = TRUE;
2294  min = 0;
2295  max = 1;
2296  ecode++;
2297  goto REPEATCHAR;
2298 
2299  case OP_STAR:
2300  case OP_MINSTAR:
2301  case OP_PLUS:
2302  case OP_MINPLUS:
2303  case OP_QUERY:
2304  case OP_MINQUERY:
2305  c = *ecode++ - OP_STAR;
2306  minimize = (c & 1) != 0;
2307  min = rep_min[c]; /* Pick up values from tables; */
2308  max = rep_max[c]; /* zero for max => infinity */
2309  if (max == 0) max = INT_MAX;
2310 
2311  /* Common code for all repeated single-character matches. We can give
2312  up quickly if there are fewer than the minimum number of characters left in
2313  the subject. */
2314 
2315  REPEATCHAR:
2316 #ifdef SUPPORT_UTF8
2317  if (utf8)
2318  {
2319  length = 1;
2320  charptr = ecode;
2321  GETCHARLEN(fc, ecode, length);
2322  if (min * length > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2323  ecode += length;
2324 
2325  /* Handle multibyte character matching specially here. There is
2326  support for caseless matching if UCP support is present. */
2327 
2328  if (length > 1)
2329  {
2330 #ifdef SUPPORT_UCP
2331  unsigned int othercase;
2332  if ((ims & PCRE_CASELESS) != 0 &&
2333  (othercase = UCD_OTHERCASE(fc)) != fc)
2334  oclength = _pcre_ord2utf8(othercase, occhars);
2335  else oclength = 0;
2336 #endif /* SUPPORT_UCP */
2337 
2338  for (i = 1; i <= min; i++)
2339  {
2340  if (memcmp(eptr, charptr, length) == 0) eptr += length;
2341 #ifdef SUPPORT_UCP
2342  /* Need braces because of following else */
2343  else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2344  else
2345  {
2346  if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2347  eptr += oclength;
2348  }
2349 #else /* without SUPPORT_UCP */
2350  else { RRETURN(MATCH_NOMATCH); }
2351 #endif /* SUPPORT_UCP */
2352  }
2353 
2354  if (min == max) continue;
2355 
2356  if (minimize)
2357  {
2358  for (fi = min;; fi++)
2359  {
2360  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM22);
2361  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2362  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2363  if (memcmp(eptr, charptr, length) == 0) eptr += length;
2364 #ifdef SUPPORT_UCP
2365  /* Need braces because of following else */
2366  else if (oclength == 0) { RRETURN(MATCH_NOMATCH); }
2367  else
2368  {
2369  if (memcmp(eptr, occhars, oclength) != 0) RRETURN(MATCH_NOMATCH);
2370  eptr += oclength;
2371  }
2372 #else /* without SUPPORT_UCP */
2373  else { RRETURN (MATCH_NOMATCH); }
2374 #endif /* SUPPORT_UCP */
2375  }
2376  /* Control never gets here */
2377  }
2378 
2379  else /* Maximize */
2380  {
2381  pp = eptr;
2382  for (i = min; i < max; i++)
2383  {
2384  if (eptr > md->end_subject - length) break;
2385  if (memcmp(eptr, charptr, length) == 0) eptr += length;
2386 #ifdef SUPPORT_UCP
2387  else if (oclength == 0) break;
2388  else
2389  {
2390  if (memcmp(eptr, occhars, oclength) != 0) break;
2391  eptr += oclength;
2392  }
2393 #else /* without SUPPORT_UCP */
2394  else break;
2395 #endif /* SUPPORT_UCP */
2396  }
2397 
2398  if (possessive) continue;
2399  for(;;)
2400  {
2401  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM23);
2402  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2403  if (eptr == pp) RRETURN(MATCH_NOMATCH);
2404 #ifdef SUPPORT_UCP
2405  eptr--;
2406  BACKCHAR(eptr);
2407 #else /* without SUPPORT_UCP */
2408  eptr -= length;
2409 #endif /* SUPPORT_UCP */
2410  }
2411  }
2412  /* Control never gets here */
2413  }
2414 
2415  /* If the length of a UTF-8 character is 1, we fall through here, and
2416  obey the code as for non-UTF-8 characters below, though in this case the
2417  value of fc will always be < 128. */
2418  }
2419  else
2420 #endif /* SUPPORT_UTF8 */
2421 
2422  /* When not in UTF-8 mode, load a single-byte character. */
2423  {
2424  if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2425  fc = *ecode++;
2426  }
2427 
2428  /* The value of fc at this point is always less than 256, though we may or
2429  may not be in UTF-8 mode. The code is duplicated for the caseless and
2430  caseful cases, for speed, since matching characters is likely to be quite
2431  common. First, ensure the minimum number of matches are present. If min =
2432  max, continue at the same level without recursing. Otherwise, if
2433  minimizing, keep trying the rest of the expression and advancing one
2434  matching character if failing, up to the maximum. Alternatively, if
2435  maximizing, find the maximum number of characters and work backwards. */
2436 
2437  DPRINTF(("matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2438  max, eptr));
2439 
2440  if ((ims & PCRE_CASELESS) != 0)
2441  {
2442  fc = md->lcc[fc];
2443  for (i = 1; i <= min; i++)
2444  if (fc != md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2445  if (min == max) continue;
2446  if (minimize)
2447  {
2448  for (fi = min;; fi++)
2449  {
2450  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM24);
2451  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2452  if (fi >= max || eptr >= md->end_subject ||
2453  fc != md->lcc[*eptr++])
2455  }
2456  /* Control never gets here */
2457  }
2458  else /* Maximize */
2459  {
2460  pp = eptr;
2461  for (i = min; i < max; i++)
2462  {
2463  if (eptr >= md->end_subject || fc != md->lcc[*eptr]) break;
2464  eptr++;
2465  }
2466  if (possessive) continue;
2467  while (eptr >= pp)
2468  {
2469  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM25);
2470  eptr--;
2471  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2472  }
2474  }
2475  /* Control never gets here */
2476  }
2477 
2478  /* Caseful comparisons (includes all multi-byte characters) */
2479 
2480  else
2481  {
2482  for (i = 1; i <= min; i++) if (fc != *eptr++) RRETURN(MATCH_NOMATCH);
2483  if (min == max) continue;
2484  if (minimize)
2485  {
2486  for (fi = min;; fi++)
2487  {
2488  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM26);
2489  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2490  if (fi >= max || eptr >= md->end_subject || fc != *eptr++)
2492  }
2493  /* Control never gets here */
2494  }
2495  else /* Maximize */
2496  {
2497  pp = eptr;
2498  for (i = min; i < max; i++)
2499  {
2500  if (eptr >= md->end_subject || fc != *eptr) break;
2501  eptr++;
2502  }
2503  if (possessive) continue;
2504  while (eptr >= pp)
2505  {
2506  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM27);
2507  eptr--;
2508  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2509  }
2511  }
2512  }
2513  /* Control never gets here */
2514 
2515  /* Match a negated single one-byte character. The character we are
2516  checking can be multibyte. */
2517 
2518  case OP_NOT:
2519  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2520  ecode++;
2521  GETCHARINCTEST(c, eptr);
2522  if ((ims & PCRE_CASELESS) != 0)
2523  {
2524 #ifdef SUPPORT_UTF8
2525  if (c < 256)
2526 #endif
2527  c = md->lcc[c];
2528  if (md->lcc[*ecode++] == c) RRETURN(MATCH_NOMATCH);
2529  }
2530  else
2531  {
2532  if (*ecode++ == c) RRETURN(MATCH_NOMATCH);
2533  }
2534  break;
2535 
2536  /* Match a negated single one-byte character repeatedly. This is almost a
2537  repeat of the code for a repeated single character, but I haven't found a
2538  nice way of commoning these up that doesn't require a test of the
2539  positive/negative option for each character match. Maybe that wouldn't add
2540  very much to the time taken, but character matching *is* what this is all
2541  about... */
2542 
2543  case OP_NOTEXACT:
2544  min = max = GET2(ecode, 1);
2545  ecode += 3;
2546  goto REPEATNOTCHAR;
2547 
2548  case OP_NOTUPTO:
2549  case OP_NOTMINUPTO:
2550  min = 0;
2551  max = GET2(ecode, 1);
2552  minimize = *ecode == OP_NOTMINUPTO;
2553  ecode += 3;
2554  goto REPEATNOTCHAR;
2555 
2556  case OP_NOTPOSSTAR:
2557  possessive = TRUE;
2558  min = 0;
2559  max = INT_MAX;
2560  ecode++;
2561  goto REPEATNOTCHAR;
2562 
2563  case OP_NOTPOSPLUS:
2564  possessive = TRUE;
2565  min = 1;
2566  max = INT_MAX;
2567  ecode++;
2568  goto REPEATNOTCHAR;
2569 
2570  case OP_NOTPOSQUERY:
2571  possessive = TRUE;
2572  min = 0;
2573  max = 1;
2574  ecode++;
2575  goto REPEATNOTCHAR;
2576 
2577  case OP_NOTPOSUPTO:
2578  possessive = TRUE;
2579  min = 0;
2580  max = GET2(ecode, 1);
2581  ecode += 3;
2582  goto REPEATNOTCHAR;
2583 
2584  case OP_NOTSTAR:
2585  case OP_NOTMINSTAR:
2586  case OP_NOTPLUS:
2587  case OP_NOTMINPLUS:
2588  case OP_NOTQUERY:
2589  case OP_NOTMINQUERY:
2590  c = *ecode++ - OP_NOTSTAR;
2591  minimize = (c & 1) != 0;
2592  min = rep_min[c]; /* Pick up values from tables; */
2593  max = rep_max[c]; /* zero for max => infinity */
2594  if (max == 0) max = INT_MAX;
2595 
2596  /* Common code for all repeated single-byte matches. We can give up quickly
2597  if there are fewer than the minimum number of bytes left in the
2598  subject. */
2599 
2600  REPEATNOTCHAR:
2601  if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2602  fc = *ecode++;
2603 
2604  /* The code is duplicated for the caseless and caseful cases, for speed,
2605  since matching characters is likely to be quite common. First, ensure the
2606  minimum number of matches are present. If min = max, continue at the same
2607  level without recursing. Otherwise, if minimizing, keep trying the rest of
2608  the expression and advancing one matching character if failing, up to the
2609  maximum. Alternatively, if maximizing, find the maximum number of
2610  characters and work backwards. */
2611 
2612  DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", fc, min, max,
2613  max, eptr));
2614 
2615  if ((ims & PCRE_CASELESS) != 0)
2616  {
2617  fc = md->lcc[fc];
2618 
2619 #ifdef SUPPORT_UTF8
2620  /* UTF-8 mode */
2621  if (utf8)
2622  {
2623  register unsigned int d;
2624  for (i = 1; i <= min; i++)
2625  {
2626  GETCHARINC(d, eptr);
2627  if (d < 256) d = md->lcc[d];
2628  if (fc == d) RRETURN(MATCH_NOMATCH);
2629  }
2630  }
2631  else
2632 #endif
2633 
2634  /* Not UTF-8 mode */
2635  {
2636  for (i = 1; i <= min; i++)
2637  if (fc == md->lcc[*eptr++]) RRETURN(MATCH_NOMATCH);
2638  }
2639 
2640  if (min == max) continue;
2641 
2642  if (minimize)
2643  {
2644 #ifdef SUPPORT_UTF8
2645  /* UTF-8 mode */
2646  if (utf8)
2647  {
2648  register unsigned int d;
2649  for (fi = min;; fi++)
2650  {
2651  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM28);
2652  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2653  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2654  GETCHARINC(d, eptr);
2655  if (d < 256) d = md->lcc[d];
2656  if (fc == d) RRETURN(MATCH_NOMATCH);
2657 
2658  }
2659  }
2660  else
2661 #endif
2662  /* Not UTF-8 mode */
2663  {
2664  for (fi = min;; fi++)
2665  {
2666  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM29);
2667  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2668  if (fi >= max || eptr >= md->end_subject || fc == md->lcc[*eptr++])
2670  }
2671  }
2672  /* Control never gets here */
2673  }
2674 
2675  /* Maximize case */
2676 
2677  else
2678  {
2679  pp = eptr;
2680 
2681 #ifdef SUPPORT_UTF8
2682  /* UTF-8 mode */
2683  if (utf8)
2684  {
2685  register unsigned int d;
2686  for (i = min; i < max; i++)
2687  {
2688  int len = 1;
2689  if (eptr >= md->end_subject) break;
2690  GETCHARLEN(d, eptr, len);
2691  if (d < 256) d = md->lcc[d];
2692  if (fc == d) break;
2693  eptr += len;
2694  }
2695  if (possessive) continue;
2696  for(;;)
2697  {
2698  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM30);
2699  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2700  if (eptr-- == pp) break; /* Stop if tried at original pos */
2701  BACKCHAR(eptr);
2702  }
2703  }
2704  else
2705 #endif
2706  /* Not UTF-8 mode */
2707  {
2708  for (i = min; i < max; i++)
2709  {
2710  if (eptr >= md->end_subject || fc == md->lcc[*eptr]) break;
2711  eptr++;
2712  }
2713  if (possessive) continue;
2714  while (eptr >= pp)
2715  {
2716  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM31);
2717  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2718  eptr--;
2719  }
2720  }
2721 
2723  }
2724  /* Control never gets here */
2725  }
2726 
2727  /* Caseful comparisons */
2728 
2729  else
2730  {
2731 #ifdef SUPPORT_UTF8
2732  /* UTF-8 mode */
2733  if (utf8)
2734  {
2735  register unsigned int d;
2736  for (i = 1; i <= min; i++)
2737  {
2738  GETCHARINC(d, eptr);
2739  if (fc == d) RRETURN(MATCH_NOMATCH);
2740  }
2741  }
2742  else
2743 #endif
2744  /* Not UTF-8 mode */
2745  {
2746  for (i = 1; i <= min; i++)
2747  if (fc == *eptr++) RRETURN(MATCH_NOMATCH);
2748  }
2749 
2750  if (min == max) continue;
2751 
2752  if (minimize)
2753  {
2754 #ifdef SUPPORT_UTF8
2755  /* UTF-8 mode */
2756  if (utf8)
2757  {
2758  register unsigned int d;
2759  for (fi = min;; fi++)
2760  {
2761  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM32);
2762  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2763  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2764  GETCHARINC(d, eptr);
2765  if (fc == d) RRETURN(MATCH_NOMATCH);
2766  }
2767  }
2768  else
2769 #endif
2770  /* Not UTF-8 mode */
2771  {
2772  for (fi = min;; fi++)
2773  {
2774  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM33);
2775  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2776  if (fi >= max || eptr >= md->end_subject || fc == *eptr++)
2778  }
2779  }
2780  /* Control never gets here */
2781  }
2782 
2783  /* Maximize case */
2784 
2785  else
2786  {
2787  pp = eptr;
2788 
2789 #ifdef SUPPORT_UTF8
2790  /* UTF-8 mode */
2791  if (utf8)
2792  {
2793  register unsigned int d;
2794  for (i = min; i < max; i++)
2795  {
2796  int len = 1;
2797  if (eptr >= md->end_subject) break;
2798  GETCHARLEN(d, eptr, len);
2799  if (fc == d) break;
2800  eptr += len;
2801  }
2802  if (possessive) continue;
2803  for(;;)
2804  {
2805  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM34);
2806  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2807  if (eptr-- == pp) break; /* Stop if tried at original pos */
2808  BACKCHAR(eptr);
2809  }
2810  }
2811  else
2812 #endif
2813  /* Not UTF-8 mode */
2814  {
2815  for (i = min; i < max; i++)
2816  {
2817  if (eptr >= md->end_subject || fc == *eptr) break;
2818  eptr++;
2819  }
2820  if (possessive) continue;
2821  while (eptr >= pp)
2822  {
2823  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM35);
2824  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
2825  eptr--;
2826  }
2827  }
2828 
2830  }
2831  }
2832  /* Control never gets here */
2833 
2834  /* Match a single character type repeatedly; several different opcodes
2835  share code. This is very similar to the code for single characters, but we
2836  repeat it in the interests of efficiency. */
2837 
2838  case OP_TYPEEXACT:
2839  min = max = GET2(ecode, 1);
2840  minimize = TRUE;
2841  ecode += 3;
2842  goto REPEATTYPE;
2843 
2844  case OP_TYPEUPTO:
2845  case OP_TYPEMINUPTO:
2846  min = 0;
2847  max = GET2(ecode, 1);
2848  minimize = *ecode == OP_TYPEMINUPTO;
2849  ecode += 3;
2850  goto REPEATTYPE;
2851 
2852  case OP_TYPEPOSSTAR:
2853  possessive = TRUE;
2854  min = 0;
2855  max = INT_MAX;
2856  ecode++;
2857  goto REPEATTYPE;
2858 
2859  case OP_TYPEPOSPLUS:
2860  possessive = TRUE;
2861  min = 1;
2862  max = INT_MAX;
2863  ecode++;
2864  goto REPEATTYPE;
2865 
2866  case OP_TYPEPOSQUERY:
2867  possessive = TRUE;
2868  min = 0;
2869  max = 1;
2870  ecode++;
2871  goto REPEATTYPE;
2872 
2873  case OP_TYPEPOSUPTO:
2874  possessive = TRUE;
2875  min = 0;
2876  max = GET2(ecode, 1);
2877  ecode += 3;
2878  goto REPEATTYPE;
2879 
2880  case OP_TYPESTAR:
2881  case OP_TYPEMINSTAR:
2882  case OP_TYPEPLUS:
2883  case OP_TYPEMINPLUS:
2884  case OP_TYPEQUERY:
2885  case OP_TYPEMINQUERY:
2886  c = *ecode++ - OP_TYPESTAR;
2887  minimize = (c & 1) != 0;
2888  min = rep_min[c]; /* Pick up values from tables; */
2889  max = rep_max[c]; /* zero for max => infinity */
2890  if (max == 0) max = INT_MAX;
2891 
2892  /* Common code for all repeated single character type matches. Note that
2893  in UTF-8 mode, '.' matches a character of any length, but for the other
2894  character types, the valid characters are all one-byte long. */
2895 
2896  REPEATTYPE:
2897  ctype = *ecode++; /* Code for the character type */
2898 
2899 #ifdef SUPPORT_UCP
2900  if (ctype == OP_PROP || ctype == OP_NOTPROP)
2901  {
2902  prop_fail_result = ctype == OP_NOTPROP;
2903  prop_type = *ecode++;
2904  prop_value = *ecode++;
2905  }
2906  else prop_type = -1;
2907 #endif
2908 
2909  /* First, ensure the minimum number of matches are present. Use inline
2910  code for maximizing the speed, and do the type test once at the start
2911  (i.e. keep it out of the loop). Also we can test that there are at least
2912  the minimum number of bytes before we start. This isn't as effective in
2913  UTF-8 mode, but it does no harm. Separate the UTF-8 code completely as that
2914  is tidier. Also separate the UCP code, which can be the same for both UTF-8
2915  and single-bytes. */
2916 
2917  if (min > md->end_subject - eptr) RRETURN(MATCH_NOMATCH);
2918  if (min > 0)
2919  {
2920 #ifdef SUPPORT_UCP
2921  if (prop_type >= 0)
2922  {
2923  switch(prop_type)
2924  {
2925  case PT_ANY:
2926  if (prop_fail_result) RRETURN(MATCH_NOMATCH);
2927  for (i = 1; i <= min; i++)
2928  {
2929  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2930  GETCHARINCTEST(c, eptr);
2931  }
2932  break;
2933 
2934  case PT_LAMP:
2935  for (i = 1; i <= min; i++)
2936  {
2937  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2938  GETCHARINCTEST(c, eptr);
2939  prop_chartype = UCD_CHARTYPE(c);
2940  if ((prop_chartype == ucp_Lu ||
2941  prop_chartype == ucp_Ll ||
2942  prop_chartype == ucp_Lt) == prop_fail_result)
2944  }
2945  break;
2946 
2947  case PT_GC:
2948  for (i = 1; i <= min; i++)
2949  {
2950  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2951  GETCHARINCTEST(c, eptr);
2952  prop_category = UCD_CATEGORY(c);
2953  if ((prop_category == prop_value) == prop_fail_result)
2955  }
2956  break;
2957 
2958  case PT_PC:
2959  for (i = 1; i <= min; i++)
2960  {
2961  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2962  GETCHARINCTEST(c, eptr);
2963  prop_chartype = UCD_CHARTYPE(c);
2964  if ((prop_chartype == prop_value) == prop_fail_result)
2966  }
2967  break;
2968 
2969  case PT_SC:
2970  for (i = 1; i <= min; i++)
2971  {
2972  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
2973  GETCHARINCTEST(c, eptr);
2974  prop_script = UCD_SCRIPT(c);
2975  if ((prop_script == prop_value) == prop_fail_result)
2977  }
2978  break;
2979 
2980  default:
2982  }
2983  }
2984 
2985  /* Match extended Unicode sequences. We will get here only if the
2986  support is in the binary; otherwise a compile-time error occurs. */
2987 
2988  else if (ctype == OP_EXTUNI)
2989  {
2990  for (i = 1; i <= min; i++)
2991  {
2992  GETCHARINCTEST(c, eptr);
2993  prop_category = UCD_CATEGORY(c);
2994  if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
2995  while (eptr < md->end_subject)
2996  {
2997  int len = 1;
2998  if (!utf8) c = *eptr; else
2999  {
3000  GETCHARLEN(c, eptr, len);
3001  }
3002  prop_category = UCD_CATEGORY(c);
3003  if (prop_category != ucp_M) break;
3004  eptr += len;
3005  }
3006  }
3007  }
3008 
3009  else
3010 #endif /* SUPPORT_UCP */
3011 
3012 /* Handle all other cases when the coding is UTF-8 */
3013 
3014 #ifdef SUPPORT_UTF8
3015  if (utf8) switch(ctype)
3016  {
3017  case OP_ANY:
3018  for (i = 1; i <= min; i++)
3019  {
3020  if (eptr >= md->end_subject || IS_NEWLINE(eptr))
3022  eptr++;
3023  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3024  }
3025  break;
3026 
3027  case OP_ALLANY:
3028  for (i = 1; i <= min; i++)
3029  {
3030  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3031  eptr++;
3032  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3033  }
3034  break;
3035 
3036  case OP_ANYBYTE:
3037  eptr += min;
3038  break;
3039 
3040  case OP_ANYNL:
3041  for (i = 1; i <= min; i++)
3042  {
3043  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3044  GETCHARINC(c, eptr);
3045  switch(c)
3046  {
3047  default: RRETURN(MATCH_NOMATCH);
3048  case 0x000d:
3049  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3050  break;
3051 
3052  case 0x000a:
3053  break;
3054 
3055  case 0x000b:
3056  case 0x000c:
3057  case 0x0085:
3058  case 0x2028:
3059  case 0x2029:
3060  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3061  break;
3062  }
3063  }
3064  break;
3065 
3066  case OP_NOT_HSPACE:
3067  for (i = 1; i <= min; i++)
3068  {
3069  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3070  GETCHARINC(c, eptr);
3071  switch(c)
3072  {
3073  default: break;
3074  case 0x09: /* HT */
3075  case 0x20: /* SPACE */
3076  case 0xa0: /* NBSP */
3077  case 0x1680: /* OGHAM SPACE MARK */
3078  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3079  case 0x2000: /* EN QUAD */
3080  case 0x2001: /* EM QUAD */
3081  case 0x2002: /* EN SPACE */
3082  case 0x2003: /* EM SPACE */
3083  case 0x2004: /* THREE-PER-EM SPACE */
3084  case 0x2005: /* FOUR-PER-EM SPACE */
3085  case 0x2006: /* SIX-PER-EM SPACE */
3086  case 0x2007: /* FIGURE SPACE */
3087  case 0x2008: /* PUNCTUATION SPACE */
3088  case 0x2009: /* THIN SPACE */
3089  case 0x200A: /* HAIR SPACE */
3090  case 0x202f: /* NARROW NO-BREAK SPACE */
3091  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3092  case 0x3000: /* IDEOGRAPHIC SPACE */
3094  }
3095  }
3096  break;
3097 
3098  case OP_HSPACE:
3099  for (i = 1; i <= min; i++)
3100  {
3101  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3102  GETCHARINC(c, eptr);
3103  switch(c)
3104  {
3105  default: RRETURN(MATCH_NOMATCH);
3106  case 0x09: /* HT */
3107  case 0x20: /* SPACE */
3108  case 0xa0: /* NBSP */
3109  case 0x1680: /* OGHAM SPACE MARK */
3110  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3111  case 0x2000: /* EN QUAD */
3112  case 0x2001: /* EM QUAD */
3113  case 0x2002: /* EN SPACE */
3114  case 0x2003: /* EM SPACE */
3115  case 0x2004: /* THREE-PER-EM SPACE */
3116  case 0x2005: /* FOUR-PER-EM SPACE */
3117  case 0x2006: /* SIX-PER-EM SPACE */
3118  case 0x2007: /* FIGURE SPACE */
3119  case 0x2008: /* PUNCTUATION SPACE */
3120  case 0x2009: /* THIN SPACE */
3121  case 0x200A: /* HAIR SPACE */
3122  case 0x202f: /* NARROW NO-BREAK SPACE */
3123  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3124  case 0x3000: /* IDEOGRAPHIC SPACE */
3125  break;
3126  }
3127  }
3128  break;
3129 
3130  case OP_NOT_VSPACE:
3131  for (i = 1; i <= min; i++)
3132  {
3133  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3134  GETCHARINC(c, eptr);
3135  switch(c)
3136  {
3137  default: break;
3138  case 0x0a: /* LF */
3139  case 0x0b: /* VT */
3140  case 0x0c: /* FF */
3141  case 0x0d: /* CR */
3142  case 0x85: /* NEL */
3143  case 0x2028: /* LINE SEPARATOR */
3144  case 0x2029: /* PARAGRAPH SEPARATOR */
3146  }
3147  }
3148  break;
3149 
3150  case OP_VSPACE:
3151  for (i = 1; i <= min; i++)
3152  {
3153  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3154  GETCHARINC(c, eptr);
3155  switch(c)
3156  {
3157  default: RRETURN(MATCH_NOMATCH);
3158  case 0x0a: /* LF */
3159  case 0x0b: /* VT */
3160  case 0x0c: /* FF */
3161  case 0x0d: /* CR */
3162  case 0x85: /* NEL */
3163  case 0x2028: /* LINE SEPARATOR */
3164  case 0x2029: /* PARAGRAPH SEPARATOR */
3165  break;
3166  }
3167  }
3168  break;
3169 
3170  case OP_NOT_DIGIT:
3171  for (i = 1; i <= min; i++)
3172  {
3173  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3174  GETCHARINC(c, eptr);
3175  if (c < 128 && (md->ctypes[c] & ctype_digit) != 0)
3177  }
3178  break;
3179 
3180  case OP_DIGIT:
3181  for (i = 1; i <= min; i++)
3182  {
3183  if (eptr >= md->end_subject ||
3184  *eptr >= 128 || (md->ctypes[*eptr++] & ctype_digit) == 0)
3186  /* No need to skip more bytes - we know it's a 1-byte character */
3187  }
3188  break;
3189 
3190  case OP_NOT_WHITESPACE:
3191  for (i = 1; i <= min; i++)
3192  {
3193  if (eptr >= md->end_subject ||
3194  (*eptr < 128 && (md->ctypes[*eptr] & ctype_space) != 0))
3196  while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3197  }
3198  break;
3199 
3200  case OP_WHITESPACE:
3201  for (i = 1; i <= min; i++)
3202  {
3203  if (eptr >= md->end_subject ||
3204  *eptr >= 128 || (md->ctypes[*eptr++] & ctype_space) == 0)
3206  /* No need to skip more bytes - we know it's a 1-byte character */
3207  }
3208  break;
3209 
3210  case OP_NOT_WORDCHAR:
3211  for (i = 1; i <= min; i++)
3212  {
3213  if (eptr >= md->end_subject ||
3214  (*eptr < 128 && (md->ctypes[*eptr] & ctype_word) != 0))
3216  while (++eptr < md->end_subject && (*eptr & 0xc0) == 0x80);
3217  }
3218  break;
3219 
3220  case OP_WORDCHAR:
3221  for (i = 1; i <= min; i++)
3222  {
3223  if (eptr >= md->end_subject ||
3224  *eptr >= 128 || (md->ctypes[*eptr++] & ctype_word) == 0)
3226  /* No need to skip more bytes - we know it's a 1-byte character */
3227  }
3228  break;
3229 
3230  default:
3232  } /* End switch(ctype) */
3233 
3234  else
3235 #endif /* SUPPORT_UTF8 */
3236 
3237  /* Code for the non-UTF-8 case for minimum matching of operators other
3238  than OP_PROP and OP_NOTPROP. We can assume that there are the minimum
3239  number of bytes present, as this was tested above. */
3240 
3241  switch(ctype)
3242  {
3243  case OP_ANY:
3244  for (i = 1; i <= min; i++)
3245  {
3246  if (IS_NEWLINE(eptr)) RRETURN(MATCH_NOMATCH);
3247  eptr++;
3248  }
3249  break;
3250 
3251  case OP_ALLANY:
3252  eptr += min;
3253  break;
3254 
3255  case OP_ANYBYTE:
3256  eptr += min;
3257  break;
3258 
3259  /* Because of the CRLF case, we can't assume the minimum number of
3260  bytes are present in this case. */
3261 
3262  case OP_ANYNL:
3263  for (i = 1; i <= min; i++)
3264  {
3265  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3266  switch(*eptr++)
3267  {
3268  default: RRETURN(MATCH_NOMATCH);
3269  case 0x000d:
3270  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3271  break;
3272  case 0x000a:
3273  break;
3274 
3275  case 0x000b:
3276  case 0x000c:
3277  case 0x0085:
3278  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3279  break;
3280  }
3281  }
3282  break;
3283 
3284  case OP_NOT_HSPACE:
3285  for (i = 1; i <= min; i++)
3286  {
3287  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3288  switch(*eptr++)
3289  {
3290  default: break;
3291  case 0x09: /* HT */
3292  case 0x20: /* SPACE */
3293  case 0xa0: /* NBSP */
3295  }
3296  }
3297  break;
3298 
3299  case OP_HSPACE:
3300  for (i = 1; i <= min; i++)
3301  {
3302  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3303  switch(*eptr++)
3304  {
3305  default: RRETURN(MATCH_NOMATCH);
3306  case 0x09: /* HT */
3307  case 0x20: /* SPACE */
3308  case 0xa0: /* NBSP */
3309  break;
3310  }
3311  }
3312  break;
3313 
3314  case OP_NOT_VSPACE:
3315  for (i = 1; i <= min; i++)
3316  {
3317  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3318  switch(*eptr++)
3319  {
3320  default: break;
3321  case 0x0a: /* LF */
3322  case 0x0b: /* VT */
3323  case 0x0c: /* FF */
3324  case 0x0d: /* CR */
3325  case 0x85: /* NEL */
3327  }
3328  }
3329  break;
3330 
3331  case OP_VSPACE:
3332  for (i = 1; i <= min; i++)
3333  {
3334  if (eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3335  switch(*eptr++)
3336  {
3337  default: RRETURN(MATCH_NOMATCH);
3338  case 0x0a: /* LF */
3339  case 0x0b: /* VT */
3340  case 0x0c: /* FF */
3341  case 0x0d: /* CR */
3342  case 0x85: /* NEL */
3343  break;
3344  }
3345  }
3346  break;
3347 
3348  case OP_NOT_DIGIT:
3349  for (i = 1; i <= min; i++)
3350  if ((md->ctypes[*eptr++] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3351  break;
3352 
3353  case OP_DIGIT:
3354  for (i = 1; i <= min; i++)
3355  if ((md->ctypes[*eptr++] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3356  break;
3357 
3358  case OP_NOT_WHITESPACE:
3359  for (i = 1; i <= min; i++)
3360  if ((md->ctypes[*eptr++] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3361  break;
3362 
3363  case OP_WHITESPACE:
3364  for (i = 1; i <= min; i++)
3365  if ((md->ctypes[*eptr++] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3366  break;
3367 
3368  case OP_NOT_WORDCHAR:
3369  for (i = 1; i <= min; i++)
3370  if ((md->ctypes[*eptr++] & ctype_word) != 0)
3372  break;
3373 
3374  case OP_WORDCHAR:
3375  for (i = 1; i <= min; i++)
3376  if ((md->ctypes[*eptr++] & ctype_word) == 0)
3378  break;
3379 
3380  default:
3382  }
3383  }
3384 
3385  /* If min = max, continue at the same level without recursing */
3386 
3387  if (min == max) continue;
3388 
3389  /* If minimizing, we have to test the rest of the pattern before each
3390  subsequent match. Again, separate the UTF-8 case for speed, and also
3391  separate the UCP cases. */
3392 
3393  if (minimize)
3394  {
3395 #ifdef SUPPORT_UCP
3396  if (prop_type >= 0)
3397  {
3398  switch(prop_type)
3399  {
3400  case PT_ANY:
3401  for (fi = min;; fi++)
3402  {
3403  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM36);
3404  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3405  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3406  GETCHARINC(c, eptr);
3407  if (prop_fail_result) RRETURN(MATCH_NOMATCH);
3408  }
3409  /* Control never gets here */
3410 
3411  case PT_LAMP:
3412  for (fi = min;; fi++)
3413  {
3414  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM37);
3415  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3416  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3417  GETCHARINC(c, eptr);
3418  prop_chartype = UCD_CHARTYPE(c);
3419  if ((prop_chartype == ucp_Lu ||
3420  prop_chartype == ucp_Ll ||
3421  prop_chartype == ucp_Lt) == prop_fail_result)
3423  }
3424  /* Control never gets here */
3425 
3426  case PT_GC:
3427  for (fi = min;; fi++)
3428  {
3429  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM38);
3430  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3431  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3432  GETCHARINC(c, eptr);
3433  prop_category = UCD_CATEGORY(c);
3434  if ((prop_category == prop_value) == prop_fail_result)
3436  }
3437  /* Control never gets here */
3438 
3439  case PT_PC:
3440  for (fi = min;; fi++)
3441  {
3442  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM39);
3443  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3444  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3445  GETCHARINC(c, eptr);
3446  prop_chartype = UCD_CHARTYPE(c);
3447  if ((prop_chartype == prop_value) == prop_fail_result)
3449  }
3450  /* Control never gets here */
3451 
3452  case PT_SC:
3453  for (fi = min;; fi++)
3454  {
3455  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM40);
3456  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3457  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3458  GETCHARINC(c, eptr);
3459  prop_script = UCD_SCRIPT(c);
3460  if ((prop_script == prop_value) == prop_fail_result)
3462  }
3463  /* Control never gets here */
3464 
3465  default:
3467  }
3468  }
3469 
3470  /* Match extended Unicode sequences. We will get here only if the
3471  support is in the binary; otherwise a compile-time error occurs. */
3472 
3473  else if (ctype == OP_EXTUNI)
3474  {
3475  for (fi = min;; fi++)
3476  {
3477  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM41);
3478  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3479  if (fi >= max || eptr >= md->end_subject) RRETURN(MATCH_NOMATCH);
3480  GETCHARINCTEST(c, eptr);
3481  prop_category = UCD_CATEGORY(c);
3482  if (prop_category == ucp_M) RRETURN(MATCH_NOMATCH);
3483  while (eptr < md->end_subject)
3484  {
3485  int len = 1;
3486  if (!utf8) c = *eptr; else
3487  {
3488  GETCHARLEN(c, eptr, len);
3489  }
3490  prop_category = UCD_CATEGORY(c);
3491  if (prop_category != ucp_M) break;
3492  eptr += len;
3493  }
3494  }
3495  }
3496 
3497  else
3498 #endif /* SUPPORT_UCP */
3499 
3500 #ifdef SUPPORT_UTF8
3501  /* UTF-8 mode */
3502  if (utf8)
3503  {
3504  for (fi = min;; fi++)
3505  {
3506  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM42);
3507  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3508  if (fi >= max || eptr >= md->end_subject ||
3509  (ctype == OP_ANY && IS_NEWLINE(eptr)))
3511 
3512  GETCHARINC(c, eptr);
3513  switch(ctype)
3514  {
3515  case OP_ANY: /* This is the non-NL case */
3516  case OP_ALLANY:
3517  case OP_ANYBYTE:
3518  break;
3519 
3520  case OP_ANYNL:
3521  switch(c)
3522  {
3523  default: RRETURN(MATCH_NOMATCH);
3524  case 0x000d:
3525  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3526  break;
3527  case 0x000a:
3528  break;
3529 
3530  case 0x000b:
3531  case 0x000c:
3532  case 0x0085:
3533  case 0x2028:
3534  case 0x2029:
3535  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3536  break;
3537  }
3538  break;
3539 
3540  case OP_NOT_HSPACE:
3541  switch(c)
3542  {
3543  default: break;
3544  case 0x09: /* HT */
3545  case 0x20: /* SPACE */
3546  case 0xa0: /* NBSP */
3547  case 0x1680: /* OGHAM SPACE MARK */
3548  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3549  case 0x2000: /* EN QUAD */
3550  case 0x2001: /* EM QUAD */
3551  case 0x2002: /* EN SPACE */
3552  case 0x2003: /* EM SPACE */
3553  case 0x2004: /* THREE-PER-EM SPACE */
3554  case 0x2005: /* FOUR-PER-EM SPACE */
3555  case 0x2006: /* SIX-PER-EM SPACE */
3556  case 0x2007: /* FIGURE SPACE */
3557  case 0x2008: /* PUNCTUATION SPACE */
3558  case 0x2009: /* THIN SPACE */
3559  case 0x200A: /* HAIR SPACE */
3560  case 0x202f: /* NARROW NO-BREAK SPACE */
3561  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3562  case 0x3000: /* IDEOGRAPHIC SPACE */
3564  }
3565  break;
3566 
3567  case OP_HSPACE:
3568  switch(c)
3569  {
3570  default: RRETURN(MATCH_NOMATCH);
3571  case 0x09: /* HT */
3572  case 0x20: /* SPACE */
3573  case 0xa0: /* NBSP */
3574  case 0x1680: /* OGHAM SPACE MARK */
3575  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
3576  case 0x2000: /* EN QUAD */
3577  case 0x2001: /* EM QUAD */
3578  case 0x2002: /* EN SPACE */
3579  case 0x2003: /* EM SPACE */
3580  case 0x2004: /* THREE-PER-EM SPACE */
3581  case 0x2005: /* FOUR-PER-EM SPACE */
3582  case 0x2006: /* SIX-PER-EM SPACE */
3583  case 0x2007: /* FIGURE SPACE */
3584  case 0x2008: /* PUNCTUATION SPACE */
3585  case 0x2009: /* THIN SPACE */
3586  case 0x200A: /* HAIR SPACE */
3587  case 0x202f: /* NARROW NO-BREAK SPACE */
3588  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
3589  case 0x3000: /* IDEOGRAPHIC SPACE */
3590  break;
3591  }
3592  break;
3593 
3594  case OP_NOT_VSPACE:
3595  switch(c)
3596  {
3597  default: break;
3598  case 0x0a: /* LF */
3599  case 0x0b: /* VT */
3600  case 0x0c: /* FF */
3601  case 0x0d: /* CR */
3602  case 0x85: /* NEL */
3603  case 0x2028: /* LINE SEPARATOR */
3604  case 0x2029: /* PARAGRAPH SEPARATOR */
3606  }
3607  break;
3608 
3609  case OP_VSPACE:
3610  switch(c)
3611  {
3612  default: RRETURN(MATCH_NOMATCH);
3613  case 0x0a: /* LF */
3614  case 0x0b: /* VT */
3615  case 0x0c: /* FF */
3616  case 0x0d: /* CR */
3617  case 0x85: /* NEL */
3618  case 0x2028: /* LINE SEPARATOR */
3619  case 0x2029: /* PARAGRAPH SEPARATOR */
3620  break;
3621  }
3622  break;
3623 
3624  case OP_NOT_DIGIT:
3625  if (c < 256 && (md->ctypes[c] & ctype_digit) != 0)
3627  break;
3628 
3629  case OP_DIGIT:
3630  if (c >= 256 || (md->ctypes[c] & ctype_digit) == 0)
3632  break;
3633 
3634  case OP_NOT_WHITESPACE:
3635  if (c < 256 && (md->ctypes[c] & ctype_space) != 0)
3637  break;
3638 
3639  case OP_WHITESPACE:
3640  if (c >= 256 || (md->ctypes[c] & ctype_space) == 0)
3642  break;
3643 
3644  case OP_NOT_WORDCHAR:
3645  if (c < 256 && (md->ctypes[c] & ctype_word) != 0)
3647  break;
3648 
3649  case OP_WORDCHAR:
3650  if (c >= 256 || (md->ctypes[c] & ctype_word) == 0)
3652  break;
3653 
3654  default:
3656  }
3657  }
3658  }
3659  else
3660 #endif
3661  /* Not UTF-8 mode */
3662  {
3663  for (fi = min;; fi++)
3664  {
3665  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM43);
3666  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3667  if (fi >= max || eptr >= md->end_subject ||
3668  (ctype == OP_ANY && IS_NEWLINE(eptr)))
3670 
3671  c = *eptr++;
3672  switch(ctype)
3673  {
3674  case OP_ANY: /* This is the non-NL case */
3675  case OP_ALLANY:
3676  case OP_ANYBYTE:
3677  break;
3678 
3679  case OP_ANYNL:
3680  switch(c)
3681  {
3682  default: RRETURN(MATCH_NOMATCH);
3683  case 0x000d:
3684  if (eptr < md->end_subject && *eptr == 0x0a) eptr++;
3685  break;
3686 
3687  case 0x000a:
3688  break;
3689 
3690  case 0x000b:
3691  case 0x000c:
3692  case 0x0085:
3693  if (md->bsr_anycrlf) RRETURN(MATCH_NOMATCH);
3694  break;
3695  }
3696  break;
3697 
3698  case OP_NOT_HSPACE:
3699  switch(c)
3700  {
3701  default: break;
3702  case 0x09: /* HT */
3703  case 0x20: /* SPACE */
3704  case 0xa0: /* NBSP */
3706  }
3707  break;
3708 
3709  case OP_HSPACE:
3710  switch(c)
3711  {
3712  default: RRETURN(MATCH_NOMATCH);
3713  case 0x09: /* HT */
3714  case 0x20: /* SPACE */
3715  case 0xa0: /* NBSP */
3716  break;
3717  }
3718  break;
3719 
3720  case OP_NOT_VSPACE:
3721  switch(c)
3722  {
3723  default: break;
3724  case 0x0a: /* LF */
3725  case 0x0b: /* VT */
3726  case 0x0c: /* FF */
3727  case 0x0d: /* CR */
3728  case 0x85: /* NEL */
3730  }
3731  break;
3732 
3733  case OP_VSPACE:
3734  switch(c)
3735  {
3736  default: RRETURN(MATCH_NOMATCH);
3737  case 0x0a: /* LF */
3738  case 0x0b: /* VT */
3739  case 0x0c: /* FF */
3740  case 0x0d: /* CR */
3741  case 0x85: /* NEL */
3742  break;
3743  }
3744  break;
3745 
3746  case OP_NOT_DIGIT:
3747  if ((md->ctypes[c] & ctype_digit) != 0) RRETURN(MATCH_NOMATCH);
3748  break;
3749 
3750  case OP_DIGIT:
3751  if ((md->ctypes[c] & ctype_digit) == 0) RRETURN(MATCH_NOMATCH);
3752  break;
3753 
3754  case OP_NOT_WHITESPACE:
3755  if ((md->ctypes[c] & ctype_space) != 0) RRETURN(MATCH_NOMATCH);
3756  break;
3757 
3758  case OP_WHITESPACE:
3759  if ((md->ctypes[c] & ctype_space) == 0) RRETURN(MATCH_NOMATCH);
3760  break;
3761 
3762  case OP_NOT_WORDCHAR:
3763  if ((md->ctypes[c] & ctype_word) != 0) RRETURN(MATCH_NOMATCH);
3764  break;
3765 
3766  case OP_WORDCHAR:
3767  if ((md->ctypes[c] & ctype_word) == 0) RRETURN(MATCH_NOMATCH);
3768  break;
3769 
3770  default:
3772  }
3773  }
3774  }
3775  /* Control never gets here */
3776  }
3777 
3778  /* If maximizing, it is worth using inline code for speed, doing the type
3779  test once at the start (i.e. keep it out of the loop). Again, keep the
3780  UTF-8 and UCP stuff separate. */
3781 
3782  else
3783  {
3784  pp = eptr; /* Remember where we started */
3785 
3786 #ifdef SUPPORT_UCP
3787  if (prop_type >= 0)
3788  {
3789  switch(prop_type)
3790  {
3791  case PT_ANY:
3792  for (i = min; i < max; i++)
3793  {
3794  int len = 1;
3795  if (eptr >= md->end_subject) break;
3796  GETCHARLEN(c, eptr, len);
3797  if (prop_fail_result) break;
3798  eptr+= len;
3799  }
3800  break;
3801 
3802  case PT_LAMP:
3803  for (i = min; i < max; i++)
3804  {
3805  int len = 1;
3806  if (eptr >= md->end_subject) break;
3807  GETCHARLEN(c, eptr, len);
3808  prop_chartype = UCD_CHARTYPE(c);
3809  if ((prop_chartype == ucp_Lu ||
3810  prop_chartype == ucp_Ll ||
3811  prop_chartype == ucp_Lt) == prop_fail_result)
3812  break;
3813  eptr+= len;
3814  }
3815  break;
3816 
3817  case PT_GC:
3818  for (i = min; i < max; i++)
3819  {
3820  int len = 1;
3821  if (eptr >= md->end_subject) break;
3822  GETCHARLEN(c, eptr, len);
3823  prop_category = UCD_CATEGORY(c);
3824  if ((prop_category == prop_value) == prop_fail_result)
3825  break;
3826  eptr+= len;
3827  }
3828  break;
3829 
3830  case PT_PC:
3831  for (i = min; i < max; i++)
3832  {
3833  int len = 1;
3834  if (eptr >= md->end_subject) break;
3835  GETCHARLEN(c, eptr, len);
3836  prop_chartype = UCD_CHARTYPE(c);
3837  if ((prop_chartype == prop_value) == prop_fail_result)
3838  break;
3839  eptr+= len;
3840  }
3841  break;
3842 
3843  case PT_SC:
3844  for (i = min; i < max; i++)
3845  {
3846  int len = 1;
3847  if (eptr >= md->end_subject) break;
3848  GETCHARLEN(c, eptr, len);
3849  prop_script = UCD_SCRIPT(c);
3850  if ((prop_script == prop_value) == prop_fail_result)
3851  break;
3852  eptr+= len;
3853  }
3854  break;
3855  }
3856 
3857  /* eptr is now past the end of the maximum run */
3858 
3859  if (possessive) continue;
3860  for(;;)
3861  {
3862  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM44);
3863  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3864  if (eptr-- == pp) break; /* Stop if tried at original pos */
3865  if (utf8) BACKCHAR(eptr);
3866  }
3867  }
3868 
3869  /* Match extended Unicode sequences. We will get here only if the
3870  support is in the binary; otherwise a compile-time error occurs. */
3871 
3872  else if (ctype == OP_EXTUNI)
3873  {
3874  for (i = min; i < max; i++)
3875  {
3876  if (eptr >= md->end_subject) break;
3877  GETCHARINCTEST(c, eptr);
3878  prop_category = UCD_CATEGORY(c);
3879  if (prop_category == ucp_M) break;
3880  while (eptr < md->end_subject)
3881  {
3882  int len = 1;
3883  if (!utf8) c = *eptr; else
3884  {
3885  GETCHARLEN(c, eptr, len);
3886  }
3887  prop_category = UCD_CATEGORY(c);
3888  if (prop_category != ucp_M) break;
3889  eptr += len;
3890  }
3891  }
3892 
3893  /* eptr is now past the end of the maximum run */
3894 
3895  if (possessive) continue;
3896  for(;;)
3897  {
3898  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM45);
3899  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
3900  if (eptr-- == pp) break; /* Stop if tried at original pos */
3901  for (;;) /* Move back over one extended */
3902  {
3903  int len = 1;
3904  if (!utf8) c = *eptr; else
3905  {
3906  BACKCHAR(eptr);
3907  GETCHARLEN(c, eptr, len);
3908  }
3909  prop_category = UCD_CATEGORY(c);
3910  if (prop_category != ucp_M) break;
3911  eptr--;
3912  }
3913  }
3914  }
3915 
3916  else
3917 #endif /* SUPPORT_UCP */
3918 
3919 #ifdef SUPPORT_UTF8
3920  /* UTF-8 mode */
3921 
3922  if (utf8)
3923  {
3924  switch(ctype)
3925  {
3926  case OP_ANY:
3927  if (max < INT_MAX)
3928  {
3929  for (i = min; i < max; i++)
3930  {
3931  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3932  eptr++;
3933  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3934  }
3935  }
3936 
3937  /* Handle unlimited UTF-8 repeat */
3938 
3939  else
3940  {
3941  for (i = min; i < max; i++)
3942  {
3943  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
3944  eptr++;
3945  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3946  }
3947  }
3948  break;
3949 
3950  case OP_ALLANY:
3951  if (max < INT_MAX)
3952  {
3953  for (i = min; i < max; i++)
3954  {
3955  if (eptr >= md->end_subject) break;
3956  eptr++;
3957  while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
3958  }
3959  }
3960  else eptr = md->end_subject; /* Unlimited UTF-8 repeat */
3961  break;
3962 
3963  /* The byte case is the same as non-UTF8 */
3964 
3965  case OP_ANYBYTE:
3966  c = max - min;
3967  if (c > (unsigned int)(md->end_subject - eptr))
3968  c = md->end_subject - eptr;
3969  eptr += c;
3970  break;
3971 
3972  case OP_ANYNL:
3973  for (i = min; i < max; i++)
3974  {
3975  int len = 1;
3976  if (eptr >= md->end_subject) break;
3977  GETCHARLEN(c, eptr, len);
3978  if (c == 0x000d)
3979  {
3980  if (++eptr >= md->end_subject) break;
3981  if (*eptr == 0x000a) eptr++;
3982  }
3983  else
3984  {
3985  if (c != 0x000a &&
3986  (md->bsr_anycrlf ||
3987  (c != 0x000b && c != 0x000c &&
3988  c != 0x0085 && c != 0x2028 && c != 0x2029)))
3989  break;
3990  eptr += len;
3991  }
3992  }
3993  break;
3994 
3995  case OP_NOT_HSPACE:
3996  case OP_HSPACE:
3997  for (i = min; i < max; i++)
3998  {
3999  BOOL gotspace;
4000  int len = 1;
4001  if (eptr >= md->end_subject) break;
4002  GETCHARLEN(c, eptr, len);
4003  switch(c)
4004  {
4005  default: gotspace = FALSE; break;
4006  case 0x09: /* HT */
4007  case 0x20: /* SPACE */
4008  case 0xa0: /* NBSP */
4009  case 0x1680: /* OGHAM SPACE MARK */
4010  case 0x180e: /* MONGOLIAN VOWEL SEPARATOR */
4011  case 0x2000: /* EN QUAD */
4012  case 0x2001: /* EM QUAD */
4013  case 0x2002: /* EN SPACE */
4014  case 0x2003: /* EM SPACE */
4015  case 0x2004: /* THREE-PER-EM SPACE */
4016  case 0x2005: /* FOUR-PER-EM SPACE */
4017  case 0x2006: /* SIX-PER-EM SPACE */
4018  case 0x2007: /* FIGURE SPACE */
4019  case 0x2008: /* PUNCTUATION SPACE */
4020  case 0x2009: /* THIN SPACE */
4021  case 0x200A: /* HAIR SPACE */
4022  case 0x202f: /* NARROW NO-BREAK SPACE */
4023  case 0x205f: /* MEDIUM MATHEMATICAL SPACE */
4024  case 0x3000: /* IDEOGRAPHIC SPACE */
4025  gotspace = TRUE;
4026  break;
4027  }
4028  if (gotspace == (ctype == OP_NOT_HSPACE)) break;
4029  eptr += len;
4030  }
4031  break;
4032 
4033  case OP_NOT_VSPACE:
4034  case OP_VSPACE:
4035  for (i = min; i < max; i++)
4036  {
4037  BOOL gotspace;
4038  int len = 1;
4039  if (eptr >= md->end_subject) break;
4040  GETCHARLEN(c, eptr, len);
4041  switch(c)
4042  {
4043  default: gotspace = FALSE; break;
4044  case 0x0a: /* LF */
4045  case 0x0b: /* VT */
4046  case 0x0c: /* FF */
4047  case 0x0d: /* CR */
4048  case 0x85: /* NEL */
4049  case 0x2028: /* LINE SEPARATOR */
4050  case 0x2029: /* PARAGRAPH SEPARATOR */
4051  gotspace = TRUE;
4052  break;
4053  }
4054  if (gotspace == (ctype == OP_NOT_VSPACE)) break;
4055  eptr += len;
4056  }
4057  break;
4058 
4059  case OP_NOT_DIGIT:
4060  for (i = min; i < max; i++)
4061  {
4062  int len = 1;
4063  if (eptr >= md->end_subject) break;
4064  GETCHARLEN(c, eptr, len);
4065  if (c < 256 && (md->ctypes[c] & ctype_digit) != 0) break;
4066  eptr+= len;
4067  }
4068  break;
4069 
4070  case OP_DIGIT:
4071  for (i = min; i < max; i++)
4072  {
4073  int len = 1;
4074  if (eptr >= md->end_subject) break;
4075  GETCHARLEN(c, eptr, len);
4076  if (c >= 256 ||(md->ctypes[c] & ctype_digit) == 0) break;
4077  eptr+= len;
4078  }
4079  break;
4080 
4081  case OP_NOT_WHITESPACE:
4082  for (i = min; i < max; i++)
4083  {
4084  int len = 1;
4085  if (eptr >= md->end_subject) break;
4086  GETCHARLEN(c, eptr, len);
4087  if (c < 256 && (md->ctypes[c] & ctype_space) != 0) break;
4088  eptr+= len;
4089  }
4090  break;
4091 
4092  case OP_WHITESPACE:
4093  for (i = min; i < max; i++)
4094  {
4095  int len = 1;
4096  if (eptr >= md->end_subject) break;
4097  GETCHARLEN(c, eptr, len);
4098  if (c >= 256 ||(md->ctypes[c] & ctype_space) == 0) break;
4099  eptr+= len;
4100  }
4101  break;
4102 
4103  case OP_NOT_WORDCHAR:
4104  for (i = min; i < max; i++)
4105  {
4106  int len = 1;
4107  if (eptr >= md->end_subject) break;
4108  GETCHARLEN(c, eptr, len);
4109  if (c < 256 && (md->ctypes[c] & ctype_word) != 0) break;
4110  eptr+= len;
4111  }
4112  break;
4113 
4114  case OP_WORDCHAR:
4115  for (i = min; i < max; i++)
4116  {
4117  int len = 1;
4118  if (eptr >= md->end_subject) break;
4119  GETCHARLEN(c, eptr, len);
4120  if (c >= 256 || (md->ctypes[c] & ctype_word) == 0) break;
4121  eptr+= len;
4122  }
4123  break;
4124 
4125  default:
4127  }
4128 
4129  /* eptr is now past the end of the maximum run */
4130 
4131  if (possessive) continue;
4132  for(;;)
4133  {
4134  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM46);
4135  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4136  if (eptr-- == pp) break; /* Stop if tried at original pos */
4137  BACKCHAR(eptr);
4138  }
4139  }
4140  else
4141 #endif /* SUPPORT_UTF8 */
4142 
4143  /* Not UTF-8 mode */
4144  {
4145  switch(ctype)
4146  {
4147  case OP_ANY:
4148  for (i = min; i < max; i++)
4149  {
4150  if (eptr >= md->end_subject || IS_NEWLINE(eptr)) break;
4151  eptr++;
4152  }
4153  break;
4154 
4155  case OP_ALLANY:
4156  case OP_ANYBYTE:
4157  c = max - min;
4158  if (c > (unsigned int)(md->end_subject - eptr))
4159  c = md->end_subject - eptr;
4160  eptr += c;
4161  break;
4162 
4163  case OP_ANYNL:
4164  for (i = min; i < max; i++)
4165  {
4166  if (eptr >= md->end_subject) break;
4167  c = *eptr;
4168  if (c == 0x000d)
4169  {
4170  if (++eptr >= md->end_subject) break;
4171  if (*eptr == 0x000a) eptr++;
4172  }
4173  else
4174  {
4175  if (c != 0x000a &&
4176  (md->bsr_anycrlf ||
4177  (c != 0x000b && c != 0x000c && c != 0x0085)))
4178  break;
4179  eptr++;
4180  }
4181  }
4182  break;
4183 
4184  case OP_NOT_HSPACE:
4185  for (i = min; i < max; i++)
4186  {
4187  if (eptr >= md->end_subject) break;
4188  c = *eptr;
4189  if (c == 0x09 || c == 0x20 || c == 0xa0) break;
4190  eptr++;
4191  }
4192  break;
4193 
4194  case OP_HSPACE:
4195  for (i = min; i < max; i++)
4196  {
4197  if (eptr >= md->end_subject) break;
4198  c = *eptr;
4199  if (c != 0x09 && c != 0x20 && c != 0xa0) break;
4200  eptr++;
4201  }
4202  break;
4203 
4204  case OP_NOT_VSPACE:
4205  for (i = min; i < max; i++)
4206  {
4207  if (eptr >= md->end_subject) break;
4208  c = *eptr;
4209  if (c == 0x0a || c == 0x0b || c == 0x0c || c == 0x0d || c == 0x85)
4210  break;
4211  eptr++;
4212  }
4213  break;
4214 
4215  case OP_VSPACE:
4216  for (i = min; i < max; i++)
4217  {
4218  if (eptr >= md->end_subject) break;
4219  c = *eptr;
4220  if (c != 0x0a && c != 0x0b && c != 0x0c && c != 0x0d && c != 0x85)
4221  break;
4222  eptr++;
4223  }
4224  break;
4225 
4226  case OP_NOT_DIGIT:
4227  for (i = min; i < max; i++)
4228  {
4229  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
4230  break;
4231  eptr++;
4232  }
4233  break;
4234 
4235  case OP_DIGIT:
4236  for (i = min; i < max; i++)
4237  {
4238  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
4239  break;
4240  eptr++;
4241  }
4242  break;
4243 
4244  case OP_NOT_WHITESPACE:
4245  for (i = min; i < max; i++)
4246  {
4247  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
4248  break;
4249  eptr++;
4250  }
4251  break;
4252 
4253  case OP_WHITESPACE:
4254  for (i = min; i < max; i++)
4255  {
4256  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
4257  break;
4258  eptr++;
4259  }
4260  break;
4261 
4262  case OP_NOT_WORDCHAR:
4263  for (i = min; i < max; i++)
4264  {
4265  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
4266  break;
4267  eptr++;
4268  }
4269  break;
4270 
4271  case OP_WORDCHAR:
4272  for (i = min; i < max; i++)
4273  {
4274  if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
4275  break;
4276  eptr++;
4277  }
4278  break;
4279 
4280  default:
4282  }
4283 
4284  /* eptr is now past the end of the maximum run */
4285 
4286  if (possessive) continue;
4287  while (eptr >= pp)
4288  {
4289  RMATCH(eptr, ecode, offset_top, md, ims, eptrb, 0, RM47);
4290  eptr--;
4291  if (rrc != MATCH_NOMATCH) RRETURN(rrc);
4292  }
4293  }
4294 
4295  /* Get here if we can't make it match with any permitted repetitions */
4296 
4298  }
4299  /* Control never gets here */
4300 
4301  /* There's been some horrible disaster. Arrival here can only mean there is
4302  something seriously wrong in the code above or the OP_xxx definitions. */
4303 
4304  default:
4305  DPRINTF(("Unknown opcode %d\n", *ecode));
4307  }
4308 
4309  /* Do not stick any code in here without much thought; it is assumed
4310  that "continue" in the code above comes out to here to repeat the main
4311  loop. */
4312 
4313  } /* End of main loop */
4314 /* Control never reaches here */
4315 
4316 
4317 /* When compiling to use the heap rather than the stack for recursive calls to
4318 match(), the RRETURN() macro jumps here. The number that is saved in
4319 frame->Xwhere indicates which label we actually want to return to. */
4320 
4321 #ifdef NO_RECURSE
4322 #define LBL(val) case val: goto L_RM##val;
4323 HEAP_RETURN:
4324 switch (frame->Xwhere)
4325  {
4326  LBL( 1) LBL( 2) LBL( 3) LBL( 4) LBL( 5) LBL( 6) LBL( 7) LBL( 8)
4327  LBL( 9) LBL(10) LBL(11) LBL(12) LBL(13) LBL(14) LBL(15) LBL(17)
4328  LBL(19) LBL(24) LBL(25) LBL(26) LBL(27) LBL(29) LBL(31) LBL(33)
4329  LBL(35) LBL(43) LBL(47) LBL(48) LBL(49) LBL(50) LBL(51) LBL(52)
4330  LBL(53) LBL(54)
4331 #ifdef SUPPORT_UTF8
4332  LBL(16) LBL(18) LBL(20) LBL(21) LBL(22) LBL(23) LBL(28) LBL(30)
4333  LBL(32) LBL(34) LBL(42) LBL(46)
4334 #ifdef SUPPORT_UCP
4335  LBL(36) LBL(37) LBL(38) LBL(39) LBL(40) LBL(41) LBL(44) LBL(45)
4336 #endif /* SUPPORT_UCP */
4337 #endif /* SUPPORT_UTF8 */
4338  default:
4339  DPRINTF(("jump error in pcre match: label %d non-existent\n", frame->Xwhere));
4340  return PCRE_ERROR_INTERNAL;
4341  }
4342 #undef LBL
4343 #endif /* NO_RECURSE */
4344 }
4345 
4346 
4347 /***************************************************************************
4348 ****************************************************************************
4349  RECURSION IN THE match() FUNCTION
4350 
4351 Undefine all the macros that were defined above to handle this. */
4352 
/* Macro cleanup after match(). This part applies only when NO_RECURSE is
defined, i.e. when recursive calls to match() use the heap rather than the
stack. In that build the names below exist as macros; they are removed here so
that the code that follows can use them as ordinary identifiers (pcre_exec()
below declares its own "length", "ims" and "flags", for example).
NOTE(review): the macro definitions themselves are not in this part of the
file; presumably each name maps to a field of the heap frame (cf.
frame->Xwhere) - confirm against the definitions earlier in the file. */

4353 #ifdef NO_RECURSE
4354 #undef eptr
4355 #undef ecode
4356 #undef mstart
4357 #undef offset_top
4358 #undef ims
4359 #undef eptrb
4360 #undef flags
4361 
4362 #undef callpat
4363 #undef charptr
4364 #undef data
4365 #undef next
4366 #undef pp
4367 #undef prev
4368 #undef saved_eptr
4369 
4370 #undef new_recursive
4371 
4372 #undef cur_is_word
4373 #undef condition
4374 #undef prev_is_word
4375 
4376 #undef original_ims
4377 
4378 #undef ctype
4379 #undef length
4380 #undef max
4381 #undef min
4382 #undef number
4383 #undef offset
4384 #undef op
4385 #undef save_capture_last
4386 #undef save_offset1
4387 #undef save_offset2
4388 #undef save_offset3
4389 #undef stacksave
4390 
4391 #undef newptrb
4392 
4393 #endif
4394 
4395 /* These two are defined as macros in both cases (presumably: with and
without NO_RECURSE), so they sit outside the #ifdef and are always removed. */
4396 
4397 #undef fc
4398 #undef fi
4399 
4400 /***************************************************************************
4401 ***************************************************************************/
4402 
4403 
4404 
4405 /*************************************************
4406 * Execute a Regular Expression *
4407 *************************************************/
4408 
4409 /* This function applies a compiled re to a subject string and picks out
4410 portions of the string if it matches. Two elements in the vector are set for
4411 each substring: the offsets to the start and end of the substring.
4412 
4413 Arguments:
4414  argument_re points to the compiled expression
4415  extra_data points to extra data or is NULL
4416  subject points to the subject string
4417  length length of subject string (may contain binary zeros)
4418  start_offset where to start in the subject string
4419  options option bits
4420  offsets points to a vector of ints to be filled in with offsets
4421  offsetcount the number of elements in the vector
4422 
4423 Returns: > 0 => success; value is the number of elements filled in
4424  = 0 => success, but offsets is not big enough
4425  -1 => failed to match
4426  < -1 => some kind of unexpected problem
4427 */
4428 
4430 pcre_exec(const pcre *argument_re, const pcre_extra *extra_data,
4431  PCRE_SPTR subject, int length, int start_offset, int options, int *offsets,
4432  int offsetcount)
4433 {
4434 int rc, resetcount, ocount;
4435 int first_byte = -1;
4436 int req_byte = -1;
4437 int req_byte2 = -1;
4438 int newline;
4439 unsigned long int ims;
4440 BOOL using_temporary_offsets = FALSE;
4441 BOOL anchored;
4442 BOOL startline;
4443 BOOL firstline;
4444 BOOL first_byte_caseless = FALSE;
4445 BOOL req_byte_caseless = FALSE;
4446 BOOL utf8;
4447 match_data match_block;
4448 match_data *md = &match_block;
4449 const uschar *tables;
4450 const uschar *start_bits = NULL;
4451 USPTR start_match = (USPTR)subject + start_offset;
4452 USPTR end_subject;
4453 USPTR req_byte_ptr = start_match - 1;
4454 
4455 pcre_study_data internal_study;
4456 const pcre_study_data *study;
4457 
4458 real_pcre internal_re;
4459 const real_pcre *external_re = (const real_pcre *)argument_re;
4460 const real_pcre *re = external_re;
4461 
4462 /* Plausibility checks */
4463 
4464 if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
4465 if (re == NULL || subject == NULL ||
4466  (offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
4467 if (offsetcount < 0) return PCRE_ERROR_BADCOUNT;
4468 
4469 /* Fish out the optional data from the extra_data structure, first setting
4470 the default values. */
4471 
4472 study = NULL;
4473 md->match_limit = MATCH_LIMIT;
4475 md->callout_data = NULL;
4476 
4477 /* The table pointer is always in native byte order. */
4478 
4479 tables = external_re->tables;
4480 
4481 if (extra_data != NULL)
4482  {
4483  register unsigned int flags = extra_data->flags;
4484  if ((flags & PCRE_EXTRA_STUDY_DATA) != 0)
4485  study = (const pcre_study_data *)extra_data->study_data;
4486  if ((flags & PCRE_EXTRA_MATCH_LIMIT) != 0)
4487  md->match_limit = extra_data->match_limit;
4488  if ((flags & PCRE_EXTRA_MATCH_LIMIT_RECURSION) != 0)
4489  md->match_limit_recursion = extra_data->match_limit_recursion;
4490  if ((flags & PCRE_EXTRA_CALLOUT_DATA) != 0)
4491  md->callout_data = extra_data->callout_data;
4492  if ((flags & PCRE_EXTRA_TABLES) != 0) tables = extra_data->tables;
4493  }
4494 
4495 /* If the exec call supplied NULL for tables, use the inbuilt ones. This
4496 is a feature that makes it possible to save compiled regex and re-use them
4497 in other programs later. */
4498 
4499 if (tables == NULL) tables = _pcre_default_tables;
4500 
4501 /* Check that the first field in the block is the magic number. If it is not,
4502 test for a regex that was compiled on a host of opposite endianness. If this is
4503 the case, flipped values are put in internal_re and internal_study if there was
4504 study data too. */
4505 
4506 if (re->magic_number != MAGIC_NUMBER)
4507  {
4508  re = _pcre_try_flipped(re, &internal_re, study, &internal_study);
4509  if (re == NULL) return PCRE_ERROR_BADMAGIC;
4510  if (study != NULL) study = &internal_study;
4511  }
4512 
4513 /* Set up other data */
4514 
4515 anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
4516 startline = (re->flags & PCRE_STARTLINE) != 0;
4517 firstline = (re->options & PCRE_FIRSTLINE) != 0;
4518 
4519 /* The code starts after the real_pcre block and the capture name table. */
4520 
4521 md->start_code = (const uschar *)external_re + re->name_table_offset +
4522  re->name_count * re->name_entry_size;
4523 
4524 md->start_subject = (USPTR)subject;
4525 md->start_offset = start_offset;
4526 md->end_subject = md->start_subject + length;
4527 end_subject = md->end_subject;
4528 
4529 md->endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
4530 utf8 = md->utf8 = (re->options & PCRE_UTF8) != 0;
4531 md->jscript_compat = (re->options & PCRE_JAVASCRIPT_COMPAT) != 0;
4532 
4533 md->notbol = (options & PCRE_NOTBOL) != 0;
4534 md->noteol = (options & PCRE_NOTEOL) != 0;
4535 md->notempty = (options & PCRE_NOTEMPTY) != 0;
4536 md->partial = (options & PCRE_PARTIAL) != 0;
4537 md->hitend = FALSE;
4538 
4539 md->recursive = NULL; /* No recursion at top level */
4540 
4541 md->lcc = tables + lcc_offset;
4542 md->ctypes = tables + ctypes_offset;
4543 
4544 /* Handle different \R options. */
4545 
4546 switch (options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE))
4547  {
4548  case 0:
4549  if ((re->options & (PCRE_BSR_ANYCRLF|PCRE_BSR_UNICODE)) != 0)
4550  md->bsr_anycrlf = (re->options & PCRE_BSR_ANYCRLF) != 0;
4551  else
4552 #ifdef BSR_ANYCRLF
4553  md->bsr_anycrlf = TRUE;
4554 #else
4555  md->bsr_anycrlf = FALSE;
4556 #endif
4557  break;
4558 
4559  case PCRE_BSR_ANYCRLF:
4560  md->bsr_anycrlf = TRUE;
4561  break;
4562 
4563  case PCRE_BSR_UNICODE:
4564  md->bsr_anycrlf = FALSE;
4565  break;
4566 
4567  default: return PCRE_ERROR_BADNEWLINE;
4568  }
4569 
4570 /* Handle different types of newline. The three bits give eight cases. If
4571 nothing is set at run time, whatever was used at compile time applies. */
4572 
4573 switch ((((options & PCRE_NEWLINE_BITS) == 0)? re->options :
4574  (pcre_uint32)options) & PCRE_NEWLINE_BITS)
4575  {
4576  case 0: newline = NEWLINE; break; /* Compile-time default */
4577  case PCRE_NEWLINE_CR: newline = CHAR_CR; break;
4578  case PCRE_NEWLINE_LF: newline = CHAR_NL; break;
4579  case PCRE_NEWLINE_CR+
4580  PCRE_NEWLINE_LF: newline = (CHAR_CR << 8) | CHAR_NL; break;
4581  case PCRE_NEWLINE_ANY: newline = -1; break;
4582  case PCRE_NEWLINE_ANYCRLF: newline = -2; break;
4583  default: return PCRE_ERROR_BADNEWLINE;
4584  }
4585 
4586 if (newline == -2)
4587  {
4588  md->nltype = NLTYPE_ANYCRLF;
4589  }
4590 else if (newline < 0)
4591  {
4592  md->nltype = NLTYPE_ANY;
4593  }
4594 else
4595  {
4596  md->nltype = NLTYPE_FIXED;
4597  if (newline > 255)
4598  {
4599  md->nllen = 2;
4600  md->nl[0] = (newline >> 8) & 255;
4601  md->nl[1] = newline & 255;
4602  }
4603  else
4604  {
4605  md->nllen = 1;
4606  md->nl[0] = newline;
4607  }
4608  }
4609 
4610 /* Partial matching is supported only for a restricted set of regexes at the
4611 moment. */
4612 
4613 if (md->partial && (re->flags & PCRE_NOPARTIAL) != 0)
4614  return PCRE_ERROR_BADPARTIAL;
4615 
4616 /* Check a UTF-8 string if required. Unfortunately there's no way of passing
4617 back the character offset. */
4618 
4619 #ifdef SUPPORT_UTF8
4620 if (utf8 && (options & PCRE_NO_UTF8_CHECK) == 0)
4621  {
4622  if (_pcre_valid_utf8((USPTR)subject, length) >= 0)
4623  return PCRE_ERROR_BADUTF8;
4624  if (start_offset > 0 && start_offset < length)
4625  {
4626  int tb = ((USPTR)subject)[start_offset];
4627  if (tb > 127)
4628  {
4629  tb &= 0xc0;
4630  if (tb != 0 && tb != 0xc0) return PCRE_ERROR_BADUTF8_OFFSET;
4631  }
4632  }
4633  }
4634 #endif
4635 
4636 /* The ims options can vary during the matching as a result of the presence
4637 of (?ims) items in the pattern. They are kept in a local variable so that
4638 restoring at the exit of a group is easy. */
4639 
4641 
4642 /* If the expression has got more back references than the offsets supplied can
4643 hold, we get a temporary chunk of working store to use during the matching.
4644 Otherwise, we can use the vector supplied, rounding down its size to a multiple
4645 of 3. */
4646 
4647 ocount = offsetcount - (offsetcount % 3);
4648 
4649 if (re->top_backref > 0 && re->top_backref >= ocount/3)
4650  {
4651  ocount = re->top_backref * 3 + 3;
4652  md->offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
4653  if (md->offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
4654  using_temporary_offsets = TRUE;
4655  DPRINTF(("Got memory to hold back references\n"));
4656  }
4657 else md->offset_vector = offsets;
4658 
4659 md->offset_end = ocount;
4660 md->offset_max = (2*ocount)/3;
4661 md->offset_overflow = FALSE;
4662 md->capture_last = -1;
4663 
4664 /* Compute the minimum number of offsets that we need to reset each time. Doing
4665 this makes a huge difference to execution time when there aren't many brackets
4666 in the pattern. */
4667 
4668 resetcount = 2 + re->top_bracket * 2;
4669 if (resetcount > offsetcount) resetcount = ocount;
4670 
4671 /* Reset the working variable associated with each extraction. These should
4672 never be used unless previously set, but they get saved and restored, and so we
4673 initialize them to avoid reading uninitialized locations. */
4674 
4675 if (md->offset_vector != NULL)
4676  {
4677  register int *iptr = md->offset_vector + ocount;
4678  register int *iend = iptr - resetcount/2 + 1;
4679  while (--iptr >= iend) *iptr = -1;
4680  }
4681 
4682 /* Set up the first character to match, if available. The first_byte value is
4683 never set for an anchored regular expression, but the anchoring may be forced
4684 at run time, so we have to test for anchoring. The first char may be unset for
4685 an unanchored pattern, of course. If there's no first char and the pattern was
4686 studied, there may be a bitmap of possible first characters. */
4687 
4688 if (!anchored)
4689  {
4690  if ((re->flags & PCRE_FIRSTSET) != 0)
4691  {
4692  first_byte = re->first_byte & 255;
4693  if ((first_byte_caseless = ((re->first_byte & REQ_CASELESS) != 0)) == TRUE)
4694  first_byte = md->lcc[first_byte];
4695  }
4696  else
4697  if (!startline && study != NULL &&
4698  (study->options & PCRE_STUDY_MAPPED) != 0)
4699  start_bits = study->start_bits;
4700  }
4701 
4702 /* For anchored or unanchored matches, there may be a "last known required
4703 character" set. */
4704 
4705 if ((re->flags & PCRE_REQCHSET) != 0)
4706  {
4707  req_byte = re->req_byte & 255;
4708  req_byte_caseless = (re->req_byte & REQ_CASELESS) != 0;
4709  req_byte2 = (tables + fcc_offset)[req_byte]; /* case flipped */
4710  }
4711 
4712 
4713 /* ==========================================================================*/
4714 
4715 /* Loop for handling unanchored repeated matching attempts; for anchored regexs
4716 the loop runs just once. */
4717 
4718 for(;;)
4719  {
4720  USPTR save_end_subject = end_subject;
4721  USPTR new_start_match;
4722 
4723  /* Reset the maximum number of extractions we might see. */
4724 
4725  if (md->offset_vector != NULL)
4726  {
4727  register int *iptr = md->offset_vector;
4728  register int *iend = iptr + resetcount;
4729  while (iptr < iend) *iptr++ = -1;
4730  }
4731 
4732  /* If firstline is TRUE, the start of the match is constrained to the first
4733  line of a multiline string. That is, the match must be before or at the first
4734  newline. Implement this by temporarily adjusting end_subject so that we stop
4735  scanning at a newline. If the match fails at the newline, later code breaks
4736  this loop. */
4737 
4738  if (firstline)
4739  {
4740  USPTR t = start_match;
4741 #ifdef SUPPORT_UTF8
4742  if (utf8)
4743  {
4744  while (t < md->end_subject && !IS_NEWLINE(t))
4745  {
4746  t++;
4747  while (t < end_subject && (*t & 0xc0) == 0x80) t++;
4748  }
4749  }
4750  else
4751 #endif
4752  while (t < md->end_subject && !IS_NEWLINE(t)) t++;
4753  end_subject = t;
4754  }
4755 
4756  /* There are some optimizations that avoid running the match if a known
4757  starting point is not found, or if a known later character is not present.
4758  However, there is an option that disables these, for testing and for ensuring
4759  that all callouts do actually occur. */
4760 
4761  if ((options & PCRE_NO_START_OPTIMIZE) == 0)
4762  {
4763  /* Advance to a unique first byte if there is one. */
4764 
4765  if (first_byte >= 0)
4766  {
4767  if (first_byte_caseless)
4768  while (start_match < end_subject && md->lcc[*start_match] != first_byte)
4769  start_match++;
4770  else
4771  while (start_match < end_subject && *start_match != first_byte)
4772  start_match++;
4773  }
4774 
4775  /* Or to just after a linebreak for a multiline match */
4776 
4777  else if (startline)
4778  {
4779  if (start_match > md->start_subject + start_offset)
4780  {
4781 #ifdef SUPPORT_UTF8
4782  if (utf8)
4783  {
4784  while (start_match < end_subject && !WAS_NEWLINE(start_match))
4785  {
4786  start_match++;
4787  while(start_match < end_subject && (*start_match & 0xc0) == 0x80)
4788  start_match++;
4789  }
4790  }
4791  else
4792 #endif
4793  while (start_match < end_subject && !WAS_NEWLINE(start_match))
4794  start_match++;
4795 
4796  /* If we have just passed a CR and the newline option is ANY or ANYCRLF,
4797  and we are now at a LF, advance the match position by one more character.
4798  */
4799 
4800  if (start_match[-1] == CHAR_CR &&
4801  (md->nltype == NLTYPE_ANY || md->nltype == NLTYPE_ANYCRLF) &&
4802  start_match < end_subject &&
4803  *start_match == CHAR_NL)
4804  start_match++;
4805  }
4806  }
4807 
4808  /* Or to a non-unique first byte after study */
4809 
4810  else if (start_bits != NULL)
4811  {
4812  while (start_match < end_subject)
4813  {
4814  register unsigned int c = *start_match;
4815  if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++;
4816  else break;
4817  }
4818  }
4819  } /* Starting optimizations */
4820 
4821  /* Restore fudged end_subject */
4822 
4823  end_subject = save_end_subject;
4824 
4825 #ifdef DEBUG /* Sigh. Some compilers never learn. */
4826  printf(">>>> Match against: ");
4827  pchars(start_match, end_subject - start_match, TRUE, md);
4828  printf("\n");
4829 #endif
4830 
4831  /* If req_byte is set, we know that that character must appear in the
4832  subject for the match to succeed. If the first character is set, req_byte
4833  must be later in the subject; otherwise the test starts at the match point.
4834  This optimization can save a huge amount of backtracking in patterns with
4835  nested unlimited repeats that aren't going to match. Writing separate code
4836  for cased/caseless versions makes it go faster, as does using an
4837  autoincrement and backing off on a match.
4838 
4839  HOWEVER: when the subject string is very, very long, searching to its end
4840  can take a long time, and give bad performance on quite ordinary patterns.
4841  This showed up when somebody was matching something like /^\d+C/ on a
4842  32-megabyte string... so we don't do this when the string is sufficiently
4843  long.
4844 
4845  ALSO: this processing is disabled when partial matching is requested, or if
4846  disabling is explicitly requested. */
4847 
4848  if ((options & PCRE_NO_START_OPTIMIZE) == 0 &&
4849  req_byte >= 0 &&
4850  end_subject - start_match < REQ_BYTE_MAX &&
4851  !md->partial)
4852  {
4853  register USPTR p = start_match + ((first_byte >= 0)? 1 : 0);
4854 
4855  /* We don't need to repeat the search if we haven't yet reached the
4856  place we found it at last time. */
4857 
4858  if (p > req_byte_ptr)
4859  {
4860  if (req_byte_caseless)
4861  {
4862  while (p < end_subject)
4863  {
4864  register int pp = *p++;
4865  if (pp == req_byte || pp == req_byte2) { p--; break; }
4866  }
4867  }
4868  else
4869  {
4870  while (p < end_subject)
4871  {
4872  if (*p++ == req_byte) { p--; break; }
4873  }
4874  }
4875 
4876  /* If we can't find the required character, break the matching loop,
4877  forcing a match failure. */
4878 
4879  if (p >= end_subject)
4880  {
4881  rc = MATCH_NOMATCH;
4882  break;
4883  }
4884 
4885  /* If we have found the required character, save the point where we
4886  found it, so that we don't search again next time round the loop if
4887  the start hasn't passed this character yet. */
4888 
4889  req_byte_ptr = p;
4890  }
4891  }
4892 
4893  /* OK, we can now run the match. */
4894 
4895  md->start_match_ptr = start_match;
4896  md->match_call_count = 0;
4897  rc = match(start_match, md->start_code, start_match, 2, md, ims, NULL, 0, 0);
4898 
4899  switch(rc)
4900  {
4901  /* NOMATCH and PRUNE advance by one character. THEN at this level acts
4902  exactly like PRUNE. */
4903 
4904  case MATCH_NOMATCH:
4905  case MATCH_PRUNE:
4906  case MATCH_THEN:
4907  new_start_match = start_match + 1;
4908 #ifdef SUPPORT_UTF8
4909  if (utf8)
4910  while(new_start_match < end_subject && (*new_start_match & 0xc0) == 0x80)
4911  new_start_match++;
4912 #endif
4913  break;
4914 
4915  /* SKIP passes back the next starting point explicitly. */
4916 
4917  case MATCH_SKIP:
4918  new_start_match = md->start_match_ptr;
4919  break;
4920 
4921  /* COMMIT disables the bumpalong, but otherwise behaves as NOMATCH. */
4922 
4923  case MATCH_COMMIT:
4924  rc = MATCH_NOMATCH;
4925  goto ENDLOOP;
4926 
4927  /* Any other return is some kind of error. */
4928 
4929  default:
4930  goto ENDLOOP;
4931  }
4932 
4933  /* Control reaches here for the various types of "no match at this point"
4934  result. Reset the code to MATCH_NOMATCH for subsequent checking. */
4935 
4936  rc = MATCH_NOMATCH;
4937 
4938  /* If PCRE_FIRSTLINE is set, the match must happen before or at the first
4939  newline in the subject (though it may continue over the newline). Therefore,
4940  if we have just failed to match, starting at a newline, do not continue. */
4941 
4942  if (firstline && IS_NEWLINE(start_match)) break;
4943 
4944  /* Advance to new matching position */
4945 
4946  start_match = new_start_match;
4947 
4948  /* Break the loop if the pattern is anchored or if we have passed the end of
4949  the subject. */
4950 
4951  if (anchored || start_match > end_subject) break;
4952 
4953  /* If we have just passed a CR and we are now at a LF, and the pattern does
4954  not contain any explicit matches for \r or \n, and the newline option is CRLF
4955  or ANY or ANYCRLF, advance the match position by one more character. */
4956 
4957  if (start_match[-1] == CHAR_CR &&
4958  start_match < end_subject &&
4959  *start_match == CHAR_NL &&
4960  (re->flags & PCRE_HASCRORLF) == 0 &&
4961  (md->nltype == NLTYPE_ANY ||
4962  md->nltype == NLTYPE_ANYCRLF ||
4963  md->nllen == 2))
4964  start_match++;
4965 
4966  } /* End of for(;;) "bumpalong" loop */
4967 
4968 /* ==========================================================================*/
4969 
4970 /* We reach here when rc is not MATCH_NOMATCH, or if one of the stopping
4971 conditions is true:
4972 
4973 (1) The pattern is anchored or the match was failed by (*COMMIT);
4974 
4975 (2) We are past the end of the subject;
4976 
4977 (3) PCRE_FIRSTLINE is set and we have failed to match at a newline, because
4978  this option requests that a match occur at or before the first newline in
4979  the subject.
4980 
4981 When we have a match and the offset vector is big enough to deal with any
4982 backreferences, captured substring offsets will already be set up. In the case
4983 where we had to get some local store to hold offsets for backreference
4984 processing, copy those that we can. In this case there need not be overflow if
4985 certain parts of the pattern were not used, even though there are more
4986 capturing parentheses than vector slots. */
4987 
4988 ENDLOOP:
4989 
4990 if (rc == MATCH_MATCH)
4991  {
4992  if (using_temporary_offsets)
4993  {
4994  if (offsetcount >= 4)
4995  {
4996  memcpy(offsets + 2, md->offset_vector + 2,
4997  (offsetcount - 2) * sizeof(int));
4998  DPRINTF(("Copied offsets from temporary memory\n"));
4999  }
5000  if (md->end_offset_top > offsetcount) md->offset_overflow = TRUE;
5001  DPRINTF(("Freeing temporary memory\n"));
5002  (pcre_free)(md->offset_vector);
5003  }
5004 
5005  /* Set the return code to the number of captured strings, or 0 if there are
5006  too many to fit into the vector. */
5007 
5008  rc = md->offset_overflow? 0 : md->end_offset_top/2;
5009 
5010  /* If there is space, set up the whole thing as substring 0. The value of
5011  md->start_match_ptr might be modified if \K was encountered on the success
5012  matching path. */
5013 
5014  if (offsetcount < 2) rc = 0; else
5015  {
5016  offsets[0] = md->start_match_ptr - md->start_subject;
5017  offsets[1] = md->end_match_ptr - md->start_subject;
5018  }
5019 
5020  DPRINTF((">>>> returning %d\n", rc));
5021  return rc;
5022  }
5023 
5024 /* Control gets here if there has been an error, or if the overall match
5025 attempt has failed at all permitted starting positions. */
5026 
5027 if (using_temporary_offsets)
5028  {
5029  DPRINTF(("Freeing temporary memory\n"));
5030  (pcre_free)(md->offset_vector);
5031  }
5032 
5033 if (rc != MATCH_NOMATCH)
5034  {
5035  DPRINTF((">>>> error: returning %d\n", rc));
5036  return rc;
5037  }
5038 else if (md->partial && md->hitend)
5039  {
5040  DPRINTF((">>>> returning PCRE_ERROR_PARTIAL\n"));
5041  return PCRE_ERROR_PARTIAL;
5042  }
5043 else
5044  {
5045  DPRINTF((">>>> returning PCRE_ERROR_NOMATCH\n"));
5046  return PCRE_ERROR_NOMATCH;
5047  }
5048 }
5049 
5050 /* End of pcre_exec.c */
#define REQ_CASELESS
pcre_uint16 name_entry_size
Definition: ucp.h:36
#define REGISTER
Definition: pcre_exec.c:255
uschar nl[4]
#define PCRE_PARTIAL
Definition: pcre.h:119
#define memmove(a, b, c)
#define NLTYPE_ANY
#define UCD_CHARTYPE(ch)
#define PCRE_EXTRA_MATCH_LIMIT_RECURSION
Definition: pcre.h:201
#define PCRE_STARTLINE
#define IS_NEWLINE(p)
real_pcre * _pcre_try_flipped(const real_pcre *, real_pcre *, const pcre_study_data *, pcre_study_data *)
struct recursion_info * prevrec
const unsigned char * tables
#define PT_ANY
Definition: pcre_exec.c:243
#define _pcre_OP_lengths
Definition: pcretest.c:115
#define USPTR
int offset
#define PCRE_ERROR_NOMATCH
Definition: pcre.h:137
#define MATCH_THEN
Definition: pcre_exec.c:77
void * callout_data
Definition: pcre.h:245
#define ctype_digit
#define PCRE_ERROR_BADUTF8_OFFSET
Definition: pcre.h:148
#define PCRE_IMS
pcre_uint16 top_bracket
int * offset_vector
#define PCRE_NOPARTIAL
#define GETCHARLEN(c, eptr, len)
#define fi
const unsigned char * epb_saved_eptr
#define PCRE_NOTBOL
Definition: pcre.h:111
#define PCRE_EXP_DEFN
#define TRUE
bool replacement for C indicating true.
Definition: ncbi_std.h:95
unsigned long int flags
Definition: pcre.h:221
unsigned long int match_limit_recursion
int pattern_position
Definition: pcre.h:247
Definition: pcre_exec.c:243
#define NEWLINE
const unsigned char _pcre_default_tables[]
#define PT_PC
#define PCRE_NOTEOL
Definition: pcre.h:112
int(* pcre_callout)(pcre_callout_block *)
Definition: pcre_globals.c:60
#define PCRE_JAVASCRIPT_COMPAT
Definition: pcre.h:131
#define PCRE_ANCHORED
Definition: pcre.h:108
#define MATCH_SKIP
Definition: pcre_exec.c:76
const unsigned char * start_subject
int next_item_length
Definition: pcre.h:248
int pcre_exec(const pcre *argument_re, const pcre_extra *extra_data, const char *subject, int length, int start_offset, int options, int *offsets, int offsetcount)
Definition: pcre_exec.c:4430
int BOOL
Definition: sybdb.h:150
#define DPRINTF(args)
Definition: mdb.c:485
#define CHAR_CR
#define PCRE_FIRSTSET
Definition: pcre_exec.c:243
#define PCRE_ERROR_BADMAGIC
Definition: pcre.h:140
#define NULL
Definition: ncbistd.hpp:225
#define PCRE_ERROR_NULL
Definition: pcre.h:138
Definition: pcre_exec.c:243
static const char rep_min[]
Definition: pcre_exec.c:87
const int _pcre_ucp_gentype[]
#define NLTYPE_ANYCRLF
#define PCRE_ERROR_BADOPTION
Definition: pcre.h:139
#define fc
#define PCRE_FIRSTLINE
Definition: pcre.h:122
const uschar * start_code
unsigned char uschar
int * offset_vector
Definition: pcre.h:238
int i
pcre_uint16 name_table_offset
#define PCRE_EXTRA_STUDY_DATA
Definition: pcre.h:197
#define PCRE_SPTR
Definition: pcre.h:213
#define PCRE_EXTRA_CALLOUT_DATA
Definition: pcre.h:199
uschar start_bits[32]
#define UCD_CATEGORY(ch)
#define PCRE_ERROR_MATCHLIMIT
Definition: pcre.h:145
#define PCRE_ERROR_BADUTF8
Definition: pcre.h:147
#define MATCH_PRUNE
Definition: pcre_exec.c:75
BOOL offset_overflow
#define MATCH_LIMIT_RECURSION
Definition: config.h:211
const unsigned char * tables
Definition: pcre.h:225
#define PCRE_NEWLINE_BITS
#define PCRE_STUDY_MAPPED
static DLIST_TYPE *DLIST_NAME() prev(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:61
switch(yytype)
Definition: newick.tab.cpp:737
const unsigned char * end_match_ptr
#define ctype_space
Definition: pcre_exec.c:243
Definition: pcre_exec.c:243
static char * newline
Definition: pcregrep.c:130
#define PCRE_DOTALL
Definition: pcre.h:106
const unsigned char * end_subject
static const char rep_max[]
Definition: pcre_exec.c:88
#define PT_LAMP
int callout_number
Definition: pcre.h:237
#define GETCHAR(c, eptr)
int current_position
Definition: pcre.h:242
#define lcc_offset
pcre_uint16 name_count
#define PCRE_MULTILINE
Definition: pcre.h:105
unsigned long int match_limit
const unsigned char * save_start
#define PCRE_BSR_ANYCRLF
Definition: pcre.h:129
#define PCRE_NEWLINE_LF
Definition: pcre.h:125
#define PCRE_EXTRA_MATCH_LIMIT
Definition: pcre.h:198
pcre_uint32 options
void *(* pcre_stack_malloc)(size_t)
Definition: pcre_globals.c:58
#define FALSE
bool replacement for C indicating false.
Definition: ncbi_std.h:99
#define CHAR_NL
#define ctypes_offset
struct eptrblock * epb_prev
pcre_uint16 req_byte
pcre_uint32 options
#define PCRE_REQCHSET
Definition: pcre_exec.c:243
#define PUBLIC_EXEC_OPTIONS
static int match(register const unsigned char *eptr, register const uschar *ecode, const unsigned char *mstart, int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb, int flags, unsigned int rdepth)
Definition: pcre_exec.c:431
#define MATCH_COMMIT
Definition: pcre_exec.c:74
unsigned long int match_limit
Definition: pcre.h:223
void *(* pcre_malloc)(size_t)
Definition: pcre_globals.c:56
#define UCD_OTHERCASE(ch)
#define PCRE_ERROR_PARTIAL
Definition: pcre.h:149
#define PCRE_ERROR_BADCOUNT
Definition: pcre.h:152
int _pcre_ord2utf8(int, uschar *)
Definition: pcre_ord2utf8.c:66
#define PCRE_NOTEMPTY
Definition: pcre.h:114
#define REQ_BYTE_MAX
#define PCRE_ERROR_BADPARTIAL
Definition: pcre.h:150
static int pchars(unsigned char *p, int length, FILE *f)
Definition: pcretest.c:437
int isprint(Uchar c)
Definition: ncbictype.hpp:67
BOOL jscript_compat
void * callout_data
Definition: pcre.h:224
T max(T x_, T y_)
#define GET_UCD(ch)
#define PT_SC
#define REC_STACK_SAVE_MAX
Definition: pcre_exec.c:83
if(yy_accept[yy_current_state])
static string subject
const uschar * ctypes
void * study_data
Definition: pcre.h:222
T min(T x_, T y_)
#define NLTYPE_FIXED
#define GETCHARINCTEST(c, eptr)
#define PCRE_BSR_UNICODE
Definition: pcre.h:130
#define MATCH_NOMATCH
Definition: pcre_exec.c:69
unsigned long int match_call_count
recursion_info * recursive
#define match_condassert
Definition: pcre_exec.c:62
#define PCRE_NEWLINE_ANY
Definition: pcre.h:127
#define PCRE_ERROR_INTERNAL
Definition: pcre.h:151
#define RRETURN(ra)
Definition: pcre_exec.c:272
#define PCRE_NO_UTF8_CHECK
Definition: pcre.h:117
Definition: pcre_exec.c:243
#define SUPPORT_UTF8
Definition: config.h:319
#define LINK_SIZE
Definition: config.h:187
static BOOL utf8
Definition: pcregrep.c:171
void * callout_data
#define PCRE_ERROR_RECURSIONLIMIT
Definition: pcre.h:158
#define PCRE_NO_START_OPTIMIZE
Definition: pcre.h:132
int len
static uch flags
#define ctype_word
#define match_cbegroup
Definition: pcre_exec.c:63
unsigned int
A callback function used to compare two keys in a database.
Definition: types.hpp:1153
#define PCRE_CALL_CONVENTION
const char * subject
Definition: pcre.h:239
const uschar * lcc
#define RREF_ANY
Definition: ucp.h:32
pcre_uint16 first_byte
#define MAGIC_NUMBER
#define PCRE_NEWLINE_CR
Definition: pcre.h:124
void(* pcre_free)(void *)
Definition: pcre_globals.c:57
#define WAS_NEWLINE(p)
#define PCRE_HASCRORLF
#define PCRE_CASELESS
Definition: pcre.h:104
#define PCRE_NEWLINE_ANYCRLF
Definition: pcre.h:128
static BOOL number
Definition: pcregrep.c:167
pcre_uint32 magic_number
#define GET2(a, n)
#define UCD_SCRIPT(ch)
#define GETCHARINC(c, eptr)
#define PCRE_ERROR_UNKNOWN_OPCODE
Definition: pcre.h:141
static DLIST_TYPE *DLIST_NAME() next(DLIST_LIST_TYPE *list, DLIST_TYPE *item)
Definition: dlist.tmpl.h:56
const uschar * after_call
int capture_last
Definition: pcre.h:244
#define MATCH_LIMIT
Definition: config.h:198
uschar chartype
#define RMATCH(ra, rb, rc, rd, re, rf, rg, rw)
Definition: pcre_exec.c:270
#define fcc_offset
unsigned long int match_limit_recursion
Definition: pcre.h:226
const unsigned char * start_match_ptr
Definition: ucp.h:17
int _pcre_valid_utf8(const uschar *, int)
Definition: pcre_exec.c:243
#define PCRE_UTF8
Definition: pcre.h:115
Definition: ucp.h:35
#define PCRE_ERROR_BADNEWLINE
Definition: pcre.h:160
#define PCRE_DOLLAR_ENDONLY
Definition: pcre.h:109
BOOL _pcre_xclass(int, const uschar *)
Definition: pcre_xclass.c:67
#define PCRE_ERROR_NOMEMORY
Definition: pcre.h:143
#define PT_GC
pcre_uint16 flags
pcre_uint16 top_backref
static BOOL match_ref(int offset, register const unsigned char *eptr, int length, match_data *md, unsigned long int ims)
Definition: pcre_exec.c:139
#define MATCH_MATCH
Definition: pcre_exec.c:68
#define PCRE_EXTRA_TABLES
Definition: pcre.h:200
int subject_length
Definition: pcre.h:240
Modified on Tue Jan 16 15:44:51 2018 by modify_doxy.py rev. 546573