php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
pcre2_dfa_match.c
Go to the documentation of this file.
1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 Original API code Copyright (c) 1997-2012 University of Cambridge
10 New API code Copyright (c) 2016-2023 University of Cambridge
11
12-----------------------------------------------------------------------------
13Redistribution and use in source and binary forms, with or without
14modification, are permitted provided that the following conditions are met:
15
16 * Redistributions of source code must retain the above copyright notice,
17 this list of conditions and the following disclaimer.
18
19 * Redistributions in binary form must reproduce the above copyright
20 notice, this list of conditions and the following disclaimer in the
21 documentation and/or other materials provided with the distribution.
22
23 * Neither the name of the University of Cambridge nor the names of its
24 contributors may be used to endorse or promote products derived from
25 this software without specific prior written permission.
26
27THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
28AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
30ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
31LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
32CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
33SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
34INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
35CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
36ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
37POSSIBILITY OF SUCH DAMAGE.
38-----------------------------------------------------------------------------
39*/
40
41
42/* This module contains the external function pcre2_dfa_match(), which is an
43alternative matching function that uses a sort of DFA algorithm (not a true
44FSM). This is NOT Perl-compatible, but it has advantages in certain
45applications. */
46
47
48/* NOTE ABOUT PERFORMANCE: A user of this function sent some code that improved
49the performance of his patterns greatly. I could not use it as it stood, as it
50was not thread safe, and made assumptions about pattern sizes. Also, it caused
51test 7 to loop, and test 9 to crash with a segfault.
52
53The issue is the check for duplicate states, which is done by a simple linear
54search up the state list. (Grep for "duplicate" below to find the code.) For
55many patterns, there will never be many states active at one time, so a simple
56linear search is fine. In patterns that have many active states, it might be a
57bottleneck. The suggested code used an indexing scheme to remember which states
58had previously been used for each character, and avoided the linear search when
59it knew there was no chance of a duplicate. This was implemented when adding
60states to the state lists.
61
62I wrote some thread-safe, not-limited code to try something similar at the time
63of checking for duplicates (instead of when adding states), using index vectors
64on the stack. It did give a 13% improvement with one specially constructed
65pattern for certain subject strings, but on other strings and on many of the
66simpler patterns in the test suite it did worse. The major problem, I think,
67was the extra time to initialize the index. This had to be done for each call
68of internal_dfa_match(). (The supplied patch used a static vector, initialized
69only once - I suspect this was the cause of the problems with the tests.)
70
71Overall, I concluded that the gains in some cases did not outweigh the losses
72in others, so I abandoned this code. */
73
74
75#ifdef HAVE_CONFIG_H
76#include "config.h"
77#endif
78
/* Names used by shared macros to reach per-match data. They expand to the
identifiers mb, start_subject and end_subject, so they are only meaningful in a
scope where those names are declared (as they are in internal_dfa_match()).
NOTE(review): presumably these must stay ahead of the pcre2_internal.h include
so that header's macros pick them up - confirm. */

#define NLBLOCK mb             /* Block containing newline information */
#define PSSTART start_subject  /* Field containing processed string start */
#define PSEND   end_subject    /* Field containing processed string end */
82
83#include "pcre2_internal.h"
84
/* Bit mask made by OR-ing together the match-time option bits listed below. */

#define PUBLIC_DFA_MATCH_OPTIONS \
  (PCRE2_ANCHORED|PCRE2_ENDANCHORED|PCRE2_NOTBOL|PCRE2_NOTEOL|PCRE2_NOTEMPTY| \
   PCRE2_NOTEMPTY_ATSTART|PCRE2_NO_UTF_CHECK|PCRE2_PARTIAL_HARD| \
   PCRE2_PARTIAL_SOFT|PCRE2_DFA_SHORTEST|PCRE2_DFA_RESTART| \
   PCRE2_COPY_MATCHED_SUBJECT)
90
91
92/*************************************************
93* Code parameters and static tables *
94*************************************************/
95
/* These are offsets that are used to turn the OP_TYPESTAR and friends opcodes
into others, under special conditions. A gap of 20 between the blocks should be
enough. The resulting opcodes don't have to be less than 256 because they are
never stored, so we push them well clear of the normal opcodes. */

#define OP_PROP_EXTRA       300
#define OP_EXTUNI_EXTRA     320
#define OP_ANYNL_EXTRA      340
#define OP_HSPACE_EXTRA     360
#define OP_VSPACE_EXTRA     380
106
107
108/* This table identifies those opcodes that are followed immediately by a
109character that is to be tested in some way. This makes it possible to
110centralize the loading of these characters. In the case of Type * etc, the
111"character" is the opcode for \D, \d, \S, \s, \W, or \w, which will always be a
112small value. Non-zero values in the table are the offsets from the opcode where
113the character is to be found. ***NOTE*** If the start of this table is
114modified, the three tables that follow must also be modified. */
115
116static const uint8_t coptable[] = {
117 0, /* End */
118 0, 0, 0, 0, 0, /* \A, \G, \K, \B, \b */
119 0, 0, 0, 0, 0, 0, /* \D, \d, \S, \s, \W, \w */
120 0, 0, 0, /* Any, AllAny, Anybyte */
121 0, 0, /* \P, \p */
122 0, 0, 0, 0, 0, /* \R, \H, \h, \V, \v */
123 0, /* \X */
124 0, 0, 0, 0, 0, 0, /* \Z, \z, $, $M, ^, ^M */
125 1, /* Char */
126 1, /* Chari */
127 1, /* not */
128 1, /* noti */
129 /* Positive single-char repeats */
130 1, 1, 1, 1, 1, 1, /* *, *?, +, +?, ?, ?? */
131 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto, minupto */
132 1+IMM2_SIZE, /* exact */
133 1, 1, 1, 1+IMM2_SIZE, /* *+, ++, ?+, upto+ */
134 1, 1, 1, 1, 1, 1, /* *I, *?I, +I, +?I, ?I, ??I */
135 1+IMM2_SIZE, 1+IMM2_SIZE, /* upto I, minupto I */
136 1+IMM2_SIZE, /* exact I */
137 1, 1, 1, 1+IMM2_SIZE, /* *+I, ++I, ?+I, upto+I */
138 /* Negative single-char repeats - only for chars < 256 */
139 1, 1, 1, 1, 1, 1, /* NOT *, *?, +, +?, ?, ?? */
140 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto, minupto */
141 1+IMM2_SIZE, /* NOT exact */
142 1, 1, 1, 1+IMM2_SIZE, /* NOT *+, ++, ?+, upto+ */
143 1, 1, 1, 1, 1, 1, /* NOT *I, *?I, +I, +?I, ?I, ??I */
144 1+IMM2_SIZE, 1+IMM2_SIZE, /* NOT upto I, minupto I */
145 1+IMM2_SIZE, /* NOT exact I */
146 1, 1, 1, 1+IMM2_SIZE, /* NOT *+I, ++I, ?+I, upto+I */
147 /* Positive type repeats */
148 1, 1, 1, 1, 1, 1, /* Type *, *?, +, +?, ?, ?? */
149 1+IMM2_SIZE, 1+IMM2_SIZE, /* Type upto, minupto */
150 1+IMM2_SIZE, /* Type exact */
151 1, 1, 1, 1+IMM2_SIZE, /* Type *+, ++, ?+, upto+ */
152 /* Character class & ref repeats */
153 0, 0, 0, 0, 0, 0, /* *, *?, +, +?, ?, ?? */
154 0, 0, /* CRRANGE, CRMINRANGE */
155 0, 0, 0, 0, /* Possessive *+, ++, ?+, CRPOSRANGE */
156 0, /* CLASS */
157 0, /* NCLASS */
158 0, /* XCLASS - variable length */
159 0, /* REF */
160 0, /* REFI */
161 0, /* DNREF */
162 0, /* DNREFI */
163 0, /* RECURSE */
164 0, /* CALLOUT */
165 0, /* CALLOUT_STR */
166 0, /* Alt */
167 0, /* Ket */
168 0, /* KetRmax */
169 0, /* KetRmin */
170 0, /* KetRpos */
171 0, 0, /* Reverse, Vreverse */
172 0, /* Assert */
173 0, /* Assert not */
174 0, /* Assert behind */
175 0, /* Assert behind not */
176 0, /* NA assert */
177 0, /* NA assert behind */
178 0, /* ONCE */
179 0, /* SCRIPT_RUN */
180 0, 0, 0, 0, 0, /* BRA, BRAPOS, CBRA, CBRAPOS, COND */
181 0, 0, 0, 0, 0, /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND */
182 0, 0, /* CREF, DNCREF */
183 0, 0, /* RREF, DNRREF */
184 0, 0, /* FALSE, TRUE */
185 0, 0, 0, /* BRAZERO, BRAMINZERO, BRAPOSZERO */
186 0, 0, 0, /* MARK, PRUNE, PRUNE_ARG */
187 0, 0, 0, 0, /* SKIP, SKIP_ARG, THEN, THEN_ARG */
188 0, 0, /* COMMIT, COMMIT_ARG */
189 0, 0, 0, /* FAIL, ACCEPT, ASSERT_ACCEPT */
190 0, 0, 0, /* CLOSE, SKIPZERO, DEFINE */
191 0, 0 /* \B and \b in UCP mode */
192};
193
/* This table identifies those opcodes that inspect a character. It is used to
remember the fact that a character could have been inspected when the end of
the subject is reached. ***NOTE*** If the start of this table is modified, the
two tables that follow must also be modified. */

static const uint8_t poptable[] = {
  0,                             /* End                                    */
  0, 0, 0, 1, 1,                 /* \A, \G, \K, \B, \b                     */
  1, 1, 1, 1, 1, 1,              /* \D, \d, \S, \s, \W, \w                 */
  1, 1, 1,                       /* Any, AllAny, Anybyte                   */
  1, 1,                          /* \P, \p                                 */
  1, 1, 1, 1, 1,                 /* \R, \H, \h, \V, \v                     */
  1,                             /* \X                                     */
  0, 0, 0, 0, 0, 0,              /* \Z, \z, $, $M, ^, ^M                   */
  1,                             /* Char                                   */
  1,                             /* Chari                                  */
  1,                             /* not                                    */
  1,                             /* noti                                   */
  /* Positive single-char repeats                                          */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1, 1,                       /* upto, minupto, exact                   */
  1, 1, 1, 1,                    /* *+, ++, ?+, upto+                      */
  1, 1, 1, 1, 1, 1,              /* *I, *?I, +I, +?I, ?I, ??I              */
  1, 1, 1,                       /* upto I, minupto I, exact I             */
  1, 1, 1, 1,                    /* *+I, ++I, ?+I, upto+I                  */
  /* Negative single-char repeats - only for chars < 256                   */
  1, 1, 1, 1, 1, 1,              /* NOT *, *?, +, +?, ?, ??                */
  1, 1, 1,                       /* NOT upto, minupto, exact               */
  1, 1, 1, 1,                    /* NOT *+, ++, ?+, upto+                  */
  1, 1, 1, 1, 1, 1,              /* NOT *I, *?I, +I, +?I, ?I, ??I          */
  1, 1, 1,                       /* NOT upto I, minupto I, exact I         */
  1, 1, 1, 1,                    /* NOT *+I, ++I, ?+I, upto+I              */
  /* Positive type repeats                                                 */
  1, 1, 1, 1, 1, 1,              /* Type *, *?, +, +?, ?, ??               */
  1, 1, 1,                       /* Type upto, minupto, exact              */
  1, 1, 1, 1,                    /* Type *+, ++, ?+, upto+                 */
  /* Character class & ref repeats                                         */
  1, 1, 1, 1, 1, 1,              /* *, *?, +, +?, ?, ??                    */
  1, 1,                          /* CRRANGE, CRMINRANGE                    */
  1, 1, 1, 1,                    /* Possessive *+, ++, ?+, CRPOSRANGE      */
  1,                             /* CLASS                                  */
  1,                             /* NCLASS                                 */
  1,                             /* XCLASS - variable length               */
  0,                             /* REF                                    */
  0,                             /* REFI                                   */
  0,                             /* DNREF                                  */
  0,                             /* DNREFI                                 */
  0,                             /* RECURSE                                */
  0,                             /* CALLOUT                                */
  0,                             /* CALLOUT_STR                            */
  0,                             /* Alt                                    */
  0,                             /* Ket                                    */
  0,                             /* KetRmax                                */
  0,                             /* KetRmin                                */
  0,                             /* KetRpos                                */
  0, 0,                          /* Reverse, Vreverse                      */
  0,                             /* Assert                                 */
  0,                             /* Assert not                             */
  0,                             /* Assert behind                          */
  0,                             /* Assert behind not                      */
  0,                             /* NA assert                              */
  0,                             /* NA assert behind                       */
  0,                             /* ONCE                                   */
  0,                             /* SCRIPT_RUN                             */
  0, 0, 0, 0, 0,                 /* BRA, BRAPOS, CBRA, CBRAPOS, COND       */
  0, 0, 0, 0, 0,                 /* SBRA, SBRAPOS, SCBRA, SCBRAPOS, SCOND  */
  0, 0,                          /* CREF, DNCREF                           */
  0, 0,                          /* RREF, DNRREF                           */
  0, 0,                          /* FALSE, TRUE                            */
  0, 0, 0,                       /* BRAZERO, BRAMINZERO, BRAPOSZERO        */
  0, 0, 0,                       /* MARK, PRUNE, PRUNE_ARG                 */
  0, 0, 0, 0,                    /* SKIP, SKIP_ARG, THEN, THEN_ARG         */
  0, 0,                          /* COMMIT, COMMIT_ARG                     */
  0, 0, 0,                       /* FAIL, ACCEPT, ASSERT_ACCEPT            */
  0, 0, 0,                       /* CLOSE, SKIPZERO, DEFINE                */
  1, 1                           /* \B and \b in UCP mode                  */
};
271
272/* These 2 tables allow for compact code for testing for \D, \d, \S, \s, \W,
273and \w */
274
275static const uint8_t toptable1[] = {
276 0, 0, 0, 0, 0, 0,
280 0, 0 /* OP_ANY, OP_ALLANY */
281};
282
283static const uint8_t toptable2[] = {
284 0, 0, 0, 0, 0, 0,
285 ctype_digit, 0,
286 ctype_space, 0,
287 ctype_word, 0,
288 1, 1 /* OP_ANY, OP_ALLANY */
289};
290
291
/* Structure for holding data about a particular state, which is in effect the
current data for an active path through the match tree. It must consist
entirely of ints because the working vector we are passed, and which we put
these structures in, is a vector of ints. */

typedef struct stateblock {
  int offset;                     /* Offset to opcode (-ve has meaning) */
  int count;                      /* Count for repeats */
  int data;                       /* Some use extra data */
} stateblock;

/* Number of ints of workspace occupied by one stateblock. */

#define INTS_PER_STATEBLOCK  (int)(sizeof(stateblock)/sizeof(int))
304
305
/* Before version 10.32 the recursive calls of internal_dfa_match() were passed
local working space and output vectors that were created on the stack. This has
caused issues for some patterns, especially in small-stack environments such as
Windows. A new scheme is now in use which sets up a vector on the stack, but if
this is too small, heap memory is used, up to the heap_limit. The main
parameters are all numbers of ints because the workspace is a vector of ints.

The size of the starting stack vector, DFA_START_RWS_SIZE, is in bytes, and is
defined in pcre2_internal.h so as to be available to pcre2test when it is
finding the minimum heap requirement for a match. */

#define OVEC_UNIT  (sizeof(PCRE2_SIZE)/sizeof(int))

#define RWS_BASE_SIZE   (DFA_START_RWS_SIZE/sizeof(int))  /* Stack vector */
#define RWS_RSIZE       1000                              /* Work size for recursion */
#define RWS_OVEC_RSIZE  (1000*OVEC_UNIT)                  /* Ovector for recursion */
#define RWS_OVEC_OSIZE  (2*OVEC_UNIT)                     /* Ovector in other cases */
323
/* This structure is at the start of each workspace block. Blocks are chained
through the next field; size and free are both counted in ints.
NOTE(review): the next member and the closing typedef name were missing from
this listing; they are restored from their uses in more_workspace() - confirm
the member order against upstream PCRE2. */

typedef struct RWS_anchor {
  struct RWS_anchor *next;        /* Following block in the chain, or NULL */
  uint32_t size;                  /* Number of ints */
  uint32_t free;                  /* Number of ints */
} RWS_anchor;

/* Number of ints occupied by the anchor at the start of a block. */

#define RWS_ANCHOR_SIZE (sizeof(RWS_anchor)/sizeof(int))
333
334
335
336/*************************************************
337* Process a callout *
338*************************************************/
339
340/* This function is called to perform a callout.
341
342Arguments:
343 code current code pointer
344 offsets points to current capture offsets
345 current_subject start of current subject match
346 ptr current position in subject
347 mb the match block
348 extracode extra code offset when called from condition
349 lengthptr where to return the callout length
350
351Returns: the return from the callout
352*/
353
354static int
355do_callout_dfa(PCRE2_SPTR code, PCRE2_SIZE *offsets, PCRE2_SPTR current_subject,
357 PCRE2_SIZE *lengthptr)
358{
360
361*lengthptr = (code[extracode] == OP_CALLOUT)?
363 (PCRE2_SIZE)GET(code, 1 + 2*LINK_SIZE + extracode);
364
365if (mb->callout == NULL) return 0; /* No callout provided */
366
367/* Fixed fields in the callout block are set once and for all at the start of
368matching. */
369
370cb->offset_vector = offsets;
371cb->start_match = (PCRE2_SIZE)(current_subject - mb->start_subject);
372cb->current_position = (PCRE2_SIZE)(ptr - mb->start_subject);
373cb->pattern_position = GET(code, 1 + extracode);
374cb->next_item_length = GET(code, 1 + LINK_SIZE + extracode);
375
376if (code[extracode] == OP_CALLOUT)
377 {
378 cb->callout_number = code[1 + 2*LINK_SIZE + extracode];
379 cb->callout_string_offset = 0;
380 cb->callout_string = NULL;
381 cb->callout_string_length = 0;
382 }
383else
384 {
385 cb->callout_number = 0;
386 cb->callout_string_offset = GET(code, 1 + 3*LINK_SIZE + extracode);
387 cb->callout_string = code + (1 + 4*LINK_SIZE + extracode) + 1;
388 cb->callout_string_length = *lengthptr - (1 + 4*LINK_SIZE) - 2;
389 }
390
391return (mb->callout)(cb, mb->callout_data);
392}
393
394
395
396/*************************************************
397* Expand local workspace memory *
398*************************************************/
399
400/* This function is called when internal_dfa_match() is about to be called
401recursively and there is insufficient working space left in the current
402workspace block. If there's an existing next block, use it; otherwise get a new
403block unless the heap limit is reached.
404
405Arguments:
406 rwsptr pointer to block pointer (updated)
407 ovecsize space needed for an ovector
408 mb the match block
409
410Returns: 0 rwsptr has been updated
411 !0 an error code
412*/
413
414static int
415more_workspace(RWS_anchor **rwsptr, unsigned int ovecsize, dfa_match_block *mb)
416{
417RWS_anchor *rws = *rwsptr;
418RWS_anchor *new;
419
420if (rws->next != NULL)
421 {
422 new = rws->next;
423 }
424
425/* Sizes in the RWS_anchor blocks are in units of sizeof(int), but
426mb->heap_limit and mb->heap_used are in kibibytes. Play carefully, to avoid
427overflow. */
428
429else
430 {
431 uint32_t newsize = (rws->size >= UINT32_MAX/(sizeof(int)*2))? UINT32_MAX/sizeof(int) : rws->size * 2;
432 uint32_t newsizeK = newsize/(1024/sizeof(int));
433
434 if (newsizeK + mb->heap_used > mb->heap_limit)
435 newsizeK = (uint32_t)(mb->heap_limit - mb->heap_used);
436 newsize = newsizeK*(1024/sizeof(int));
437
438 if (newsize < RWS_RSIZE + ovecsize + RWS_ANCHOR_SIZE)
440 new = mb->memctl.malloc(newsize*sizeof(int), mb->memctl.memory_data);
441 if (new == NULL) return PCRE2_ERROR_NOMEMORY;
442 mb->heap_used += newsizeK;
443 new->next = NULL;
444 new->size = newsize;
445 rws->next = new;
446 }
447
448new->free = new->size - RWS_ANCHOR_SIZE;
449*rwsptr = new;
450return 0;
451}
452
453
454
455/*************************************************
456* Match a Regular Expression - DFA engine *
457*************************************************/
458
459/* This internal function applies a compiled pattern to a subject string,
460starting at a given point, using a DFA engine. This function is called from the
461external one, possibly multiple times if the pattern is not anchored. The
462function calls itself recursively for some kinds of subpattern.
463
464Arguments:
465 mb the match_data block with fixed information
466 this_start_code the opening bracket of this subexpression's code
467 current_subject where we currently are in the subject string
468 start_offset start offset in the subject string
469 offsets vector to contain the matching string offsets
470 offsetcount size of same
471 workspace vector of workspace
472 wscount size of same
473 rlevel function call recursion level
474
475Returns: > 0 => number of match offset pairs placed in offsets
476 = 0 => offsets overflowed; longest matches are present
477 -1 => failed to match
478 < -1 => some kind of unexpected problem
479
480The following macros are used for adding states to the two state vectors (one
481for the current character, one for the following character). */
482
/* Append a state with opcode offset x and repeat count y to the active list
(the one for the current character). Relies on the locals active_count,
wscount and next_active_state being in scope. When the list is already full,
the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. The do/while wrapper
makes the expansion a single statement, so its "else" cannot pair with a
caller's "if". */

#define ADD_ACTIVE(x,y) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
491
/* As ADD_ACTIVE, but also stores z in the new state's data field. Relies on
the locals active_count, wscount and next_active_state being in scope; when
the list is already full, the enclosing function returns
PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_ACTIVE_DATA(x,y,z) \
  do \
    { \
    if (active_count++ < wscount) \
      { \
      next_active_state->offset = (x); \
      next_active_state->count  = (y); \
      next_active_state->data   = (z); \
      next_active_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
501
/* Append a state with opcode offset x and repeat count y to the new list (the
one for the following character). Relies on the locals new_count, wscount and
next_new_state being in scope. When the list is already full, the enclosing
function returns PCRE2_ERROR_DFA_WSSIZE. The do/while wrapper makes the
expansion a single statement, so its "else" cannot pair with a caller's "if". */

#define ADD_NEW(x,y) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
510
/* As ADD_NEW, but also stores z in the new state's data field. Relies on the
locals new_count, wscount and next_new_state being in scope; when the list is
already full, the enclosing function returns PCRE2_ERROR_DFA_WSSIZE. */

#define ADD_NEW_DATA(x,y,z) \
  do \
    { \
    if (new_count++ < wscount) \
      { \
      next_new_state->offset = (x); \
      next_new_state->count  = (y); \
      next_new_state->data   = (z); \
      next_new_state++; \
      } \
    else return PCRE2_ERROR_DFA_WSSIZE; \
    } \
  while (0)
520
521/* And now, here is the code */
522
523static int
524internal_dfa_match(
525 dfa_match_block *mb,
526 PCRE2_SPTR this_start_code,
527 PCRE2_SPTR current_subject,
528 PCRE2_SIZE start_offset,
529 PCRE2_SIZE *offsets,
530 uint32_t offsetcount,
531 int *workspace,
532 int wscount,
533 uint32_t rlevel,
534 int *RWS)
535{
536stateblock *active_states, *new_states, *temp_states;
537stateblock *next_active_state, *next_new_state;
538const uint8_t *ctypes, *lcc, *fcc;
540PCRE2_SPTR end_code;
541dfa_recursion_info new_recursive;
542int active_count, new_count, match_count;
543
544/* Some fields in the mb block are frequently referenced, so we load them into
545independent variables in the hope that this will perform better. */
546
547PCRE2_SPTR start_subject = mb->start_subject;
548PCRE2_SPTR end_subject = mb->end_subject;
549PCRE2_SPTR start_code = mb->start_code;
550
551#ifdef SUPPORT_UNICODE
552BOOL utf = (mb->poptions & PCRE2_UTF) != 0;
553BOOL utf_or_ucp = utf || (mb->poptions & PCRE2_UCP) != 0;
554#else
555BOOL utf = FALSE;
556#endif
557
558BOOL reset_could_continue = FALSE;
559
561if (rlevel++ > mb->match_limit_depth) return PCRE2_ERROR_DEPTHLIMIT;
562offsetcount &= (uint32_t)(-2); /* Round down */
563
564wscount -= 2;
565wscount = (wscount - (wscount % (INTS_PER_STATEBLOCK * 2))) /
567
568ctypes = mb->tables + ctypes_offset;
569lcc = mb->tables + lcc_offset;
570fcc = mb->tables + fcc_offset;
571
572match_count = PCRE2_ERROR_NOMATCH; /* A negative number */
573
574active_states = (stateblock *)(workspace + 2);
575next_new_state = new_states = active_states + wscount;
576new_count = 0;
577
578/* The first thing in any (sub) pattern is a bracket of some sort. Push all
579the alternative states onto the list, and find out where the end is. This
580makes it possible to use this function recursively, when we want to stop at a
581matching internal ket rather than at the end.
582
583If we are dealing with a backward assertion we have to find out the maximum
584amount to move back, and set up each alternative appropriately. */
585
586if (*this_start_code == OP_ASSERTBACK || *this_start_code == OP_ASSERTBACK_NOT)
587 {
588 size_t max_back = 0;
589 size_t gone_back;
590
591 end_code = this_start_code;
592 do
593 {
594 size_t back = (size_t)GET2(end_code, 2+LINK_SIZE);
595 if (back > max_back) max_back = back;
596 end_code += GET(end_code, 1);
597 }
598 while (*end_code == OP_ALT);
599
600 /* If we can't go back the amount required for the longest lookbehind
601 pattern, go back as far as we can; some alternatives may still be viable. */
602
603#ifdef SUPPORT_UNICODE
604 /* In character mode we have to step back character by character */
605
606 if (utf)
607 {
608 for (gone_back = 0; gone_back < max_back; gone_back++)
609 {
610 if (current_subject <= start_subject) break;
611 current_subject--;
612 ACROSSCHAR(current_subject > start_subject, current_subject,
613 current_subject--);
614 }
615 }
616 else
617#endif
618
619 /* In byte-mode we can do this quickly. */
620
621 {
622 size_t current_offset = (size_t)(current_subject - start_subject);
623 gone_back = (current_offset < max_back)? current_offset : max_back;
624 current_subject -= gone_back;
625 }
626
627 /* Save the earliest consulted character */
628
629 if (current_subject < mb->start_used_ptr)
630 mb->start_used_ptr = current_subject;
631
632 /* Now we can process the individual branches. There will be an OP_REVERSE at
633 the start of each branch, except when the length of the branch is zero. */
634
635 end_code = this_start_code;
636 do
637 {
638 uint32_t revlen = (end_code[1+LINK_SIZE] == OP_REVERSE)? 1 + IMM2_SIZE : 0;
639 size_t back = (revlen == 0)? 0 : (size_t)GET2(end_code, 2+LINK_SIZE);
640 if (back <= gone_back)
641 {
642 int bstate = (int)(end_code - start_code + 1 + LINK_SIZE + revlen);
643 ADD_NEW_DATA(-bstate, 0, (int)(gone_back - back));
644 }
645 end_code += GET(end_code, 1);
646 }
647 while (*end_code == OP_ALT);
648 }
649
650/* This is the code for a "normal" subpattern (not a backward assertion). The
651start of a whole pattern is always one of these. If we are at the top level,
652we may be asked to restart matching from the same point that we reached for a
653previous partial match. We still have to scan through the top-level branches to
654find the end state. */
655
656else
657 {
658 end_code = this_start_code;
659
660 /* Restarting */
661
662 if (rlevel == 1 && (mb->moptions & PCRE2_DFA_RESTART) != 0)
663 {
664 do { end_code += GET(end_code, 1); } while (*end_code == OP_ALT);
665 new_count = workspace[1];
666 if (!workspace[0])
667 memcpy(new_states, active_states, (size_t)new_count * sizeof(stateblock));
668 }
669
670 /* Not restarting */
671
672 else
673 {
674 int length = 1 + LINK_SIZE +
675 ((*this_start_code == OP_CBRA || *this_start_code == OP_SCBRA ||
676 *this_start_code == OP_CBRAPOS || *this_start_code == OP_SCBRAPOS)
677 ? IMM2_SIZE:0);
678 do
679 {
680 ADD_NEW((int)(end_code - start_code + length), 0);
681 end_code += GET(end_code, 1);
682 length = 1 + LINK_SIZE;
683 }
684 while (*end_code == OP_ALT);
685 }
686 }
687
688workspace[0] = 0; /* Bit indicating which vector is current */
689
690/* Loop for scanning the subject */
691
692ptr = current_subject;
693for (;;)
694 {
695 int i, j;
696 int clen, dlen;
697 uint32_t c, d;
698 int forced_fail = 0;
699 BOOL partial_newline = FALSE;
700 BOOL could_continue = reset_could_continue;
701 reset_could_continue = FALSE;
702
703 if (ptr > mb->last_used_ptr) mb->last_used_ptr = ptr;
704
705 /* Make the new state list into the active state list and empty the
706 new state list. */
707
708 temp_states = active_states;
709 active_states = new_states;
710 new_states = temp_states;
711 active_count = new_count;
712 new_count = 0;
713
714 workspace[0] ^= 1; /* Remember for the restarting feature */
715 workspace[1] = active_count;
716
717 /* Set the pointers for adding new states */
718
719 next_active_state = active_states + active_count;
720 next_new_state = new_states;
721
722 /* Load the current character from the subject outside the loop, as many
723 different states may want to look at it, and we assume that at least one
724 will. */
725
726 if (ptr < end_subject)
727 {
728 clen = 1; /* Number of data items in the character */
729#ifdef SUPPORT_UNICODE
730 GETCHARLENTEST(c, ptr, clen);
731#else
732 c = *ptr;
733#endif /* SUPPORT_UNICODE */
734 }
735 else
736 {
737 clen = 0; /* This indicates the end of the subject */
738 c = NOTACHAR; /* This value should never actually be used */
739 }
740
741 /* Scan up the active states and act on each one. The result of an action
742 may be to add more states to the currently active list (e.g. on hitting a
743 parenthesis) or it may be to put states on the new list, for considering
744 when we move the character pointer on. */
745
746 for (i = 0; i < active_count; i++)
747 {
748 stateblock *current_state = active_states + i;
749 BOOL caseless = FALSE;
750 PCRE2_SPTR code;
751 uint32_t codevalue;
752 int state_offset = current_state->offset;
753 int rrc;
754 int count;
755
756 /* A negative offset is a special case meaning "hold off going to this
757 (negated) state until the number of characters in the data field have
758 been skipped". If the could_continue flag was passed over from a previous
759 state, arrange for it to be passed on. */
760
761 if (state_offset < 0)
762 {
763 if (current_state->data > 0)
764 {
765 ADD_NEW_DATA(state_offset, current_state->count,
766 current_state->data - 1);
767 if (could_continue) reset_could_continue = TRUE;
768 continue;
769 }
770 else
771 {
772 current_state->offset = state_offset = -state_offset;
773 }
774 }
775
776 /* Check for a duplicate state with the same count, and skip if found.
777 See the note at the head of this module about the possibility of improving
778 performance here. */
779
780 for (j = 0; j < i; j++)
781 {
782 if (active_states[j].offset == state_offset &&
783 active_states[j].count == current_state->count)
784 goto NEXT_ACTIVE_STATE;
785 }
786
787 /* The state offset is the offset to the opcode */
788
789 code = start_code + state_offset;
790 codevalue = *code;
791
792 /* If this opcode inspects a character, but we are at the end of the
793 subject, remember the fact for use when testing for a partial match. */
794
795 if (clen == 0 && poptable[codevalue] != 0)
796 could_continue = TRUE;
797
798 /* If this opcode is followed by an inline character, load it. It is
799 tempting to test for the presence of a subject character here, but that
800 is wrong, because sometimes zero repetitions of the subject are
801 permitted.
802
803 We also use this mechanism for opcodes such as OP_TYPEPLUS that take an
804 argument that is not a data character - but is always one byte long because
805 the values are small. We have to take special action to deal with \P, \p,
806 \H, \h, \V, \v and \X in this case. To keep the other cases fast, convert
807 these ones to new opcodes. */
808
809 if (coptable[codevalue] > 0)
810 {
811 dlen = 1;
812#ifdef SUPPORT_UNICODE
813 if (utf) { GETCHARLEN(d, (code + coptable[codevalue]), dlen); } else
814#endif /* SUPPORT_UNICODE */
815 d = code[coptable[codevalue]];
816 if (codevalue >= OP_TYPESTAR)
817 {
818 switch(d)
819 {
821 case OP_NOTPROP:
822 case OP_PROP: codevalue += OP_PROP_EXTRA; break;
823 case OP_ANYNL: codevalue += OP_ANYNL_EXTRA; break;
824 case OP_EXTUNI: codevalue += OP_EXTUNI_EXTRA; break;
825 case OP_NOT_HSPACE:
826 case OP_HSPACE: codevalue += OP_HSPACE_EXTRA; break;
827 case OP_NOT_VSPACE:
828 case OP_VSPACE: codevalue += OP_VSPACE_EXTRA; break;
829 default: break;
830 }
831 }
832 }
833 else
834 {
835 dlen = 0; /* Not strictly necessary, but compilers moan */
836 d = NOTACHAR; /* if these variables are not set. */
837 }
838
839
840 /* Now process the individual opcodes */
841
842 switch (codevalue)
843 {
844/* ========================================================================== */
845 /* These cases are never obeyed. This is a fudge that causes a compile-
846 time error if the vectors coptable or poptable, which are indexed by
847 opcode, are not the correct length. It seems to be the only way to do
848 such a check at compile time, as the sizeof() operator does not work
849 in the C preprocessor. */
850
851 case OP_TABLE_LENGTH:
852 case OP_TABLE_LENGTH +
853 ((sizeof(coptable) == OP_TABLE_LENGTH) &&
854 (sizeof(poptable) == OP_TABLE_LENGTH)):
855 return 0;
856
857/* ========================================================================== */
858 /* Reached a closing bracket. If not at the end of the pattern, carry
859 on with the next opcode. For repeating opcodes, also add the repeat
860 state. Note that KETRPOS will always be encountered at the end of the
861 subpattern, because the possessive subpattern repeats are always handled
862 using recursive calls. Thus, it never adds any new states.
863
864 At the end of the (sub)pattern, unless we have an empty string and
865 PCRE2_NOTEMPTY is set, or PCRE2_NOTEMPTY_ATSTART is set and we are at the
866 start of the subject, save the match data, shifting up all previous
867 matches so we always have the longest first. */
868
869 case OP_KET:
870 case OP_KETRMIN:
871 case OP_KETRMAX:
872 case OP_KETRPOS:
873 if (code != end_code)
874 {
875 ADD_ACTIVE(state_offset + 1 + LINK_SIZE, 0);
876 if (codevalue != OP_KET)
877 {
878 ADD_ACTIVE(state_offset - (int)GET(code, 1), 0);
879 }
880 }
881 else
882 {
883 if (ptr > current_subject ||
884 ((mb->moptions & PCRE2_NOTEMPTY) == 0 &&
885 ((mb->moptions & PCRE2_NOTEMPTY_ATSTART) == 0 ||
886 current_subject > start_subject + mb->start_offset)))
887 {
888 if (match_count < 0) match_count = (offsetcount >= 2)? 1 : 0;
889 else if (match_count > 0 && ++match_count * 2 > (int)offsetcount)
890 match_count = 0;
891 count = ((match_count == 0)? (int)offsetcount : match_count * 2) - 2;
892 if (count > 0) (void)memmove(offsets + 2, offsets,
893 (size_t)count * sizeof(PCRE2_SIZE));
894 if (offsetcount >= 2)
895 {
896 offsets[0] = (PCRE2_SIZE)(current_subject - start_subject);
897 offsets[1] = (PCRE2_SIZE)(ptr - start_subject);
898 }
899 if ((mb->moptions & PCRE2_DFA_SHORTEST) != 0) return match_count;
900 }
901 }
902 break;
903
904/* ========================================================================== */
905 /* These opcodes add to the current list of states without looking
906 at the current character. */
907
908 /*-----------------------------------------------------------------*/
909 case OP_ALT:
910 do { code += GET(code, 1); } while (*code == OP_ALT);
911 ADD_ACTIVE((int)(code - start_code), 0);
912 break;
913
914 /*-----------------------------------------------------------------*/
915 case OP_BRA:
916 case OP_SBRA:
917 do
918 {
919 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
920 code += GET(code, 1);
921 }
922 while (*code == OP_ALT);
923 break;
924
925 /*-----------------------------------------------------------------*/
926 case OP_CBRA:
927 case OP_SCBRA:
928 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE + IMM2_SIZE), 0);
929 code += GET(code, 1);
930 while (*code == OP_ALT)
931 {
932 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
933 code += GET(code, 1);
934 }
935 break;
936
937 /*-----------------------------------------------------------------*/
938 case OP_BRAZERO:
939 case OP_BRAMINZERO:
940 ADD_ACTIVE(state_offset + 1, 0);
941 code += 1 + GET(code, 2);
942 while (*code == OP_ALT) code += GET(code, 1);
943 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
944 break;
945
946 /*-----------------------------------------------------------------*/
947 case OP_SKIPZERO:
948 code += 1 + GET(code, 2);
949 while (*code == OP_ALT) code += GET(code, 1);
950 ADD_ACTIVE((int)(code - start_code + 1 + LINK_SIZE), 0);
951 break;
952
953 /*-----------------------------------------------------------------*/
954 case OP_CIRC:
955 if (ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0)
956 { ADD_ACTIVE(state_offset + 1, 0); }
957 break;
958
959 /*-----------------------------------------------------------------*/
960 case OP_CIRCM:
961 if ((ptr == start_subject && (mb->moptions & PCRE2_NOTBOL) == 0) ||
962 ((ptr != end_subject || (mb->poptions & PCRE2_ALT_CIRCUMFLEX) != 0 )
963 && WAS_NEWLINE(ptr)))
964 { ADD_ACTIVE(state_offset + 1, 0); }
965 break;
966
967 /*-----------------------------------------------------------------*/
968 case OP_EOD:
969 if (ptr >= end_subject)
970 {
971 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
972 return PCRE2_ERROR_PARTIAL;
973 else { ADD_ACTIVE(state_offset + 1, 0); }
974 }
975 break;
976
977 /*-----------------------------------------------------------------*/
978 case OP_SOD:
979 if (ptr == start_subject) { ADD_ACTIVE(state_offset + 1, 0); }
980 break;
981
982 /*-----------------------------------------------------------------*/
983 case OP_SOM:
984 if (ptr == start_subject + start_offset) { ADD_ACTIVE(state_offset + 1, 0); }
985 break;
986
987
988/* ========================================================================== */
989 /* These opcodes inspect the next subject character, and sometimes
990 the previous one as well, but do not have an argument. The variable
991 clen contains the length of the current character and is zero if we are
992 at the end of the subject. */
993
994 /*-----------------------------------------------------------------*/
995 case OP_ANY:
996 if (clen > 0 && !IS_NEWLINE(ptr))
997 {
998 if (ptr + 1 >= mb->end_subject &&
999 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1000 NLBLOCK->nltype == NLTYPE_FIXED &&
1001 NLBLOCK->nllen == 2 &&
1002 c == NLBLOCK->nl[0])
1003 {
1004 could_continue = partial_newline = TRUE;
1005 }
1006 else
1007 {
1008 ADD_NEW(state_offset + 1, 0);
1009 }
1010 }
1011 break;
1012
1013 /*-----------------------------------------------------------------*/
1014 case OP_ALLANY:
1015 if (clen > 0)
1016 { ADD_NEW(state_offset + 1, 0); }
1017 break;
1018
1019 /*-----------------------------------------------------------------*/
1020 case OP_EODN:
1021 if (clen == 0 || (IS_NEWLINE(ptr) && ptr == end_subject - mb->nllen))
1022 {
1023 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1024 return PCRE2_ERROR_PARTIAL;
1025 ADD_ACTIVE(state_offset + 1, 0);
1026 }
1027 break;
1028
1029 /*-----------------------------------------------------------------*/
1030 case OP_DOLL:
1031 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1032 {
1033 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1034 could_continue = TRUE;
1035 else if (clen == 0 ||
1036 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr) &&
1037 (ptr == end_subject - mb->nllen)
1038 ))
1039 { ADD_ACTIVE(state_offset + 1, 0); }
1040 else if (ptr + 1 >= mb->end_subject &&
1042 NLBLOCK->nltype == NLTYPE_FIXED &&
1043 NLBLOCK->nllen == 2 &&
1044 c == NLBLOCK->nl[0])
1045 {
1046 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1047 {
1048 reset_could_continue = TRUE;
1049 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1050 }
1051 else could_continue = partial_newline = TRUE;
1052 }
1053 }
1054 break;
1055
1056 /*-----------------------------------------------------------------*/
1057 case OP_DOLLM:
1058 if ((mb->moptions & PCRE2_NOTEOL) == 0)
1059 {
1060 if (clen == 0 && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1061 could_continue = TRUE;
1062 else if (clen == 0 ||
1063 ((mb->poptions & PCRE2_DOLLAR_ENDONLY) == 0 && IS_NEWLINE(ptr)))
1064 { ADD_ACTIVE(state_offset + 1, 0); }
1065 else if (ptr + 1 >= mb->end_subject &&
1067 NLBLOCK->nltype == NLTYPE_FIXED &&
1068 NLBLOCK->nllen == 2 &&
1069 c == NLBLOCK->nl[0])
1070 {
1071 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
1072 {
1073 reset_could_continue = TRUE;
1074 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
1075 }
1076 else could_continue = partial_newline = TRUE;
1077 }
1078 }
1079 else if (IS_NEWLINE(ptr))
1080 { ADD_ACTIVE(state_offset + 1, 0); }
1081 break;
1082
1083 /*-----------------------------------------------------------------*/
1084
1085 case OP_DIGIT:
1086 case OP_WHITESPACE:
1087 case OP_WORDCHAR:
1088 if (clen > 0 && c < 256 &&
1089 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0)
1090 { ADD_NEW(state_offset + 1, 0); }
1091 break;
1092
1093 /*-----------------------------------------------------------------*/
1094 case OP_NOT_DIGIT:
1095 case OP_NOT_WHITESPACE:
1096 case OP_NOT_WORDCHAR:
1097 if (clen > 0 && (c >= 256 ||
1098 ((ctypes[c] & toptable1[codevalue]) ^ toptable2[codevalue]) != 0))
1099 { ADD_NEW(state_offset + 1, 0); }
1100 break;
1101
1102 /*-----------------------------------------------------------------*/
1103 case OP_WORD_BOUNDARY:
1107 {
1108 int left_word, right_word;
1109
1110 if (ptr > start_subject)
1111 {
1112 PCRE2_SPTR temp = ptr - 1;
1113 if (temp < mb->start_used_ptr) mb->start_used_ptr = temp;
1114#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1115 if (utf) { BACKCHAR(temp); }
1116#endif
1117 GETCHARTEST(d, temp);
1118#ifdef SUPPORT_UNICODE
1119 if (codevalue == OP_UCP_WORD_BOUNDARY ||
1120 codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1121 {
1122 int chartype = UCD_CHARTYPE(d);
1123 int category = PRIV(ucp_gentype)[chartype];
1124 left_word = (category == ucp_L || category == ucp_N ||
1125 chartype == ucp_Mn || chartype == ucp_Pc);
1126 }
1127 else
1128#endif
1129 left_word = d < 256 && (ctypes[d] & ctype_word) != 0;
1130 }
1131 else left_word = FALSE;
1132
1133 if (clen > 0)
1134 {
1135 if (ptr >= mb->last_used_ptr)
1136 {
1137 PCRE2_SPTR temp = ptr + 1;
1138#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1139 if (utf) { FORWARDCHARTEST(temp, mb->end_subject); }
1140#endif
1141 mb->last_used_ptr = temp;
1142 }
1143#ifdef SUPPORT_UNICODE
1144 if (codevalue == OP_UCP_WORD_BOUNDARY ||
1145 codevalue == OP_NOT_UCP_WORD_BOUNDARY)
1146 {
1147 int chartype = UCD_CHARTYPE(c);
1148 int category = PRIV(ucp_gentype)[chartype];
1149 right_word = (category == ucp_L || category == ucp_N ||
1150 chartype == ucp_Mn || chartype == ucp_Pc);
1151 }
1152 else
1153#endif
1154 right_word = c < 256 && (ctypes[c] & ctype_word) != 0;
1155 }
1156 else right_word = FALSE;
1157
1158 if ((left_word == right_word) ==
1159 (codevalue == OP_NOT_WORD_BOUNDARY ||
1160 codevalue == OP_NOT_UCP_WORD_BOUNDARY))
1161 { ADD_ACTIVE(state_offset + 1, 0); }
1162 }
1163 break;
1164
1165
1166 /*-----------------------------------------------------------------*/
1167 /* Check the next character by Unicode property. We will get here only
1168 if the support is in the binary; otherwise a compile-time error occurs.
1169 */
1170
1171#ifdef SUPPORT_UNICODE
1172 case OP_PROP:
1173 case OP_NOTPROP:
1174 if (clen > 0)
1175 {
1176 BOOL OK;
1177 int chartype;
1178 const uint32_t *cp;
1179 const ucd_record * prop = GET_UCD(c);
1180 switch(code[1])
1181 {
1182 case PT_ANY:
1183 OK = TRUE;
1184 break;
1185
1186 case PT_LAMP:
1187 chartype = prop->chartype;
1188 OK = chartype == ucp_Lu || chartype == ucp_Ll ||
1189 chartype == ucp_Lt;
1190 break;
1191
1192 case PT_GC:
1193 OK = PRIV(ucp_gentype)[prop->chartype] == code[2];
1194 break;
1195
1196 case PT_PC:
1197 OK = prop->chartype == code[2];
1198 break;
1199
1200 case PT_SC:
1201 OK = prop->script == code[2];
1202 break;
1203
1204 case PT_SCX:
1205 OK = (prop->script == code[2] ||
1206 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[2]) != 0);
1207 break;
1208
1209 /* These are specials for combination cases. */
1210
1211 case PT_ALNUM:
1212 chartype = prop->chartype;
1213 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1214 PRIV(ucp_gentype)[chartype] == ucp_N;
1215 break;
1216
1217 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1218 which means that Perl space and POSIX space are now identical. PCRE
1219 was changed at release 8.34. */
1220
1221 case PT_SPACE: /* Perl space */
1222 case PT_PXSPACE: /* POSIX space */
1223 switch(c)
1224 {
1227 OK = TRUE;
1228 break;
1229
1230 default:
1231 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1232 break;
1233 }
1234 break;
1235
1236 case PT_WORD:
1237 chartype = prop->chartype;
1238 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1239 PRIV(ucp_gentype)[chartype] == ucp_N ||
1240 chartype == ucp_Mn || chartype == ucp_Pc;
1241 break;
1242
1243 case PT_CLIST:
1244#if PCRE2_CODE_UNIT_WIDTH == 32
1245 if (c > MAX_UTF_CODE_POINT)
1246 {
1247 OK = FALSE;
1248 break;
1249 }
1250#endif
1251 cp = PRIV(ucd_caseless_sets) + code[2];
1252 for (;;)
1253 {
1254 if (c < *cp) { OK = FALSE; break; }
1255 if (c == *cp++) { OK = TRUE; break; }
1256 }
1257 break;
1258
1259 case PT_UCNC:
1260 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1261 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1262 c >= 0xe000;
1263 break;
1264
1265 case PT_BIDICL:
1266 OK = UCD_BIDICLASS(c) == code[2];
1267 break;
1268
1269 case PT_BOOL:
1270 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1271 UCD_BPROPS_PROP(prop), code[2]) != 0;
1272 break;
1273
1274 /* Should never occur, but keep compilers from grumbling. */
1275
1276 default:
1277 OK = codevalue != OP_PROP;
1278 break;
1279 }
1280
1281 if (OK == (codevalue == OP_PROP)) { ADD_NEW(state_offset + 3, 0); }
1282 }
1283 break;
1284#endif
1285
1286
1287
1288/* ========================================================================== */
1289 /* These opcodes likewise inspect the subject character, but have an
1290 argument that is not a data character. It is one of these opcodes:
1291 OP_ANY, OP_ALLANY, OP_DIGIT, OP_NOT_DIGIT, OP_WHITESPACE, OP_NOT_WHITESPACE,
1292 OP_WORDCHAR, OP_NOT_WORDCHAR. The value is loaded into d. */
1293
1294 case OP_TYPEPLUS:
1295 case OP_TYPEMINPLUS:
1296 case OP_TYPEPOSPLUS:
1297 count = current_state->count; /* Already matched */
1298 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1299 if (clen > 0)
1300 {
1301 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1302 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1303 NLBLOCK->nltype == NLTYPE_FIXED &&
1304 NLBLOCK->nllen == 2 &&
1305 c == NLBLOCK->nl[0])
1306 {
1307 could_continue = partial_newline = TRUE;
1308 }
1309 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1310 (c < 256 &&
1311 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1312 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1313 {
1314 if (count > 0 && codevalue == OP_TYPEPOSPLUS)
1315 {
1316 active_count--; /* Remove non-match possibility */
1317 next_active_state--;
1318 }
1319 count++;
1320 ADD_NEW(state_offset, count);
1321 }
1322 }
1323 break;
1324
1325 /*-----------------------------------------------------------------*/
1326 case OP_TYPEQUERY:
1327 case OP_TYPEMINQUERY:
1328 case OP_TYPEPOSQUERY:
1329 ADD_ACTIVE(state_offset + 2, 0);
1330 if (clen > 0)
1331 {
1332 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1333 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1334 NLBLOCK->nltype == NLTYPE_FIXED &&
1335 NLBLOCK->nllen == 2 &&
1336 c == NLBLOCK->nl[0])
1337 {
1338 could_continue = partial_newline = TRUE;
1339 }
1340 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1341 (c < 256 &&
1342 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1343 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1344 {
1345 if (codevalue == OP_TYPEPOSQUERY)
1346 {
1347 active_count--; /* Remove non-match possibility */
1348 next_active_state--;
1349 }
1350 ADD_NEW(state_offset + 2, 0);
1351 }
1352 }
1353 break;
1354
1355 /*-----------------------------------------------------------------*/
1356 case OP_TYPESTAR:
1357 case OP_TYPEMINSTAR:
1358 case OP_TYPEPOSSTAR:
1359 ADD_ACTIVE(state_offset + 2, 0);
1360 if (clen > 0)
1361 {
1362 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1363 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1364 NLBLOCK->nltype == NLTYPE_FIXED &&
1365 NLBLOCK->nllen == 2 &&
1366 c == NLBLOCK->nl[0])
1367 {
1368 could_continue = partial_newline = TRUE;
1369 }
1370 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1371 (c < 256 &&
1372 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1373 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1374 {
1375 if (codevalue == OP_TYPEPOSSTAR)
1376 {
1377 active_count--; /* Remove non-match possibility */
1378 next_active_state--;
1379 }
1380 ADD_NEW(state_offset, 0);
1381 }
1382 }
1383 break;
1384
1385 /*-----------------------------------------------------------------*/
1386 case OP_TYPEEXACT:
1387 count = current_state->count; /* Number already matched */
1388 if (clen > 0)
1389 {
1390 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1391 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1392 NLBLOCK->nltype == NLTYPE_FIXED &&
1393 NLBLOCK->nllen == 2 &&
1394 c == NLBLOCK->nl[0])
1395 {
1396 could_continue = partial_newline = TRUE;
1397 }
1398 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1399 (c < 256 &&
1400 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1401 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1402 {
1403 if (++count >= (int)GET2(code, 1))
1404 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 1, 0); }
1405 else
1406 { ADD_NEW(state_offset, count); }
1407 }
1408 }
1409 break;
1410
1411 /*-----------------------------------------------------------------*/
1412 case OP_TYPEUPTO:
1413 case OP_TYPEMINUPTO:
1414 case OP_TYPEPOSUPTO:
1415 ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0);
1416 count = current_state->count; /* Number already matched */
1417 if (clen > 0)
1418 {
1419 if (d == OP_ANY && ptr + 1 >= mb->end_subject &&
1420 (mb->moptions & (PCRE2_PARTIAL_HARD)) != 0 &&
1421 NLBLOCK->nltype == NLTYPE_FIXED &&
1422 NLBLOCK->nllen == 2 &&
1423 c == NLBLOCK->nl[0])
1424 {
1425 could_continue = partial_newline = TRUE;
1426 }
1427 else if ((c >= 256 && d != OP_DIGIT && d != OP_WHITESPACE && d != OP_WORDCHAR) ||
1428 (c < 256 &&
1429 (d != OP_ANY || !IS_NEWLINE(ptr)) &&
1430 ((ctypes[c] & toptable1[d]) ^ toptable2[d]) != 0))
1431 {
1432 if (codevalue == OP_TYPEPOSUPTO)
1433 {
1434 active_count--; /* Remove non-match possibility */
1435 next_active_state--;
1436 }
1437 if (++count >= (int)GET2(code, 1))
1438 { ADD_NEW(state_offset + 2 + IMM2_SIZE, 0); }
1439 else
1440 { ADD_NEW(state_offset, count); }
1441 }
1442 }
1443 break;
1444
1445/* ========================================================================== */
1446 /* These are virtual opcodes that are used when something like
1447 OP_TYPEPLUS has OP_PROP, OP_NOTPROP, OP_ANYNL, OP_EXTUNI, or one of the
1448 OP_HSPACE/OP_VSPACE types as its argument. This keeps the code above fast
1449 for the other cases. The argument is in the d variable. */
1450
1451#ifdef SUPPORT_UNICODE
1455 count = current_state->count; /* Already matched */
1456 if (count > 0) { ADD_ACTIVE(state_offset + 4, 0); }
1457 if (clen > 0)
1458 {
1459 BOOL OK;
1460 int chartype;
1461 const uint32_t *cp;
1462 const ucd_record * prop = GET_UCD(c);
1463 switch(code[2])
1464 {
1465 case PT_ANY:
1466 OK = TRUE;
1467 break;
1468
1469 case PT_LAMP:
1470 chartype = prop->chartype;
1471 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1472 break;
1473
1474 case PT_GC:
1475 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1476 break;
1477
1478 case PT_PC:
1479 OK = prop->chartype == code[3];
1480 break;
1481
1482 case PT_SC:
1483 OK = prop->script == code[3];
1484 break;
1485
1486 case PT_SCX:
1487 OK = (prop->script == code[3] ||
1488 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1489 break;
1490
1491 /* These are specials for combination cases. */
1492
1493 case PT_ALNUM:
1494 chartype = prop->chartype;
1495 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1496 PRIV(ucp_gentype)[chartype] == ucp_N;
1497 break;
1498
1499 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1500 which means that Perl space and POSIX space are now identical. PCRE
1501 was changed at release 8.34. */
1502
1503 case PT_SPACE: /* Perl space */
1504 case PT_PXSPACE: /* POSIX space */
1505 switch(c)
1506 {
1509 OK = TRUE;
1510 break;
1511
1512 default:
1513 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1514 break;
1515 }
1516 break;
1517
1518 case PT_WORD:
1519 chartype = prop->chartype;
1520 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1521 PRIV(ucp_gentype)[chartype] == ucp_N ||
1522 chartype == ucp_Mn || chartype == ucp_Pc;
1523 break;
1524
1525 case PT_CLIST:
1526#if PCRE2_CODE_UNIT_WIDTH == 32
1527 if (c > MAX_UTF_CODE_POINT)
1528 {
1529 OK = FALSE;
1530 break;
1531 }
1532#endif
1533 cp = PRIV(ucd_caseless_sets) + code[3];
1534 for (;;)
1535 {
1536 if (c < *cp) { OK = FALSE; break; }
1537 if (c == *cp++) { OK = TRUE; break; }
1538 }
1539 break;
1540
1541 case PT_UCNC:
1542 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1543 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1544 c >= 0xe000;
1545 break;
1546
1547 case PT_BIDICL:
1548 OK = UCD_BIDICLASS(c) == code[3];
1549 break;
1550
1551 case PT_BOOL:
1552 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1553 UCD_BPROPS_PROP(prop), code[3]) != 0;
1554 break;
1555
1556 /* Should never occur, but keep compilers from grumbling. */
1557
1558 default:
1559 OK = codevalue != OP_PROP;
1560 break;
1561 }
1562
1563 if (OK == (d == OP_PROP))
1564 {
1565 if (count > 0 && codevalue == OP_PROP_EXTRA + OP_TYPEPOSPLUS)
1566 {
1567 active_count--; /* Remove non-match possibility */
1568 next_active_state--;
1569 }
1570 count++;
1571 ADD_NEW(state_offset, count);
1572 }
1573 }
1574 break;
1575
1576 /*-----------------------------------------------------------------*/
1580 count = current_state->count; /* Already matched */
1581 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1582 if (clen > 0)
1583 {
1584 int ncount = 0;
1585 if (count > 0 && codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSPLUS)
1586 {
1587 active_count--; /* Remove non-match possibility */
1588 next_active_state--;
1589 }
1590 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1591 &ncount);
1592 count++;
1593 ADD_NEW_DATA(-state_offset, count, ncount);
1594 }
1595 break;
1596#endif
1597
1598 /*-----------------------------------------------------------------*/
1602 count = current_state->count; /* Already matched */
1603 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1604 if (clen > 0)
1605 {
1606 int ncount = 0;
1607 switch (c)
1608 {
1609 case CHAR_VT:
1610 case CHAR_FF:
1611 case CHAR_NEL:
1612#ifndef EBCDIC
1613 case 0x2028:
1614 case 0x2029:
1615#endif /* Not EBCDIC */
1616 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1617 goto ANYNL01;
1618
1619 case CHAR_CR:
1620 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1621 /* Fall through */
1622
1623 ANYNL01:
1624 case CHAR_LF:
1625 if (count > 0 && codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSPLUS)
1626 {
1627 active_count--; /* Remove non-match possibility */
1628 next_active_state--;
1629 }
1630 count++;
1631 ADD_NEW_DATA(-state_offset, count, ncount);
1632 break;
1633
1634 default:
1635 break;
1636 }
1637 }
1638 break;
1639
1640 /*-----------------------------------------------------------------*/
1644 count = current_state->count; /* Already matched */
1645 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1646 if (clen > 0)
1647 {
1648 BOOL OK;
1649 switch (c)
1650 {
1652 OK = TRUE;
1653 break;
1654
1655 default:
1656 OK = FALSE;
1657 break;
1658 }
1659
1660 if (OK == (d == OP_VSPACE))
1661 {
1662 if (count > 0 && codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSPLUS)
1663 {
1664 active_count--; /* Remove non-match possibility */
1665 next_active_state--;
1666 }
1667 count++;
1668 ADD_NEW_DATA(-state_offset, count, 0);
1669 }
1670 }
1671 break;
1672
1673 /*-----------------------------------------------------------------*/
1677 count = current_state->count; /* Already matched */
1678 if (count > 0) { ADD_ACTIVE(state_offset + 2, 0); }
1679 if (clen > 0)
1680 {
1681 BOOL OK;
1682 switch (c)
1683 {
1685 OK = TRUE;
1686 break;
1687
1688 default:
1689 OK = FALSE;
1690 break;
1691 }
1692
1693 if (OK == (d == OP_HSPACE))
1694 {
1695 if (count > 0 && codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSPLUS)
1696 {
1697 active_count--; /* Remove non-match possibility */
1698 next_active_state--;
1699 }
1700 count++;
1701 ADD_NEW_DATA(-state_offset, count, 0);
1702 }
1703 }
1704 break;
1705
1706 /*-----------------------------------------------------------------*/
1707#ifdef SUPPORT_UNICODE
1711 count = 4;
1712 goto QS1;
1713
1717 count = 0;
1718
1719 QS1:
1720
1721 ADD_ACTIVE(state_offset + 4, 0);
1722 if (clen > 0)
1723 {
1724 BOOL OK;
1725 int chartype;
1726 const uint32_t *cp;
1727 const ucd_record * prop = GET_UCD(c);
1728 switch(code[2])
1729 {
1730 case PT_ANY:
1731 OK = TRUE;
1732 break;
1733
1734 case PT_LAMP:
1735 chartype = prop->chartype;
1736 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
1737 break;
1738
1739 case PT_GC:
1740 OK = PRIV(ucp_gentype)[prop->chartype] == code[3];
1741 break;
1742
1743 case PT_PC:
1744 OK = prop->chartype == code[3];
1745 break;
1746
1747 case PT_SC:
1748 OK = prop->script == code[3];
1749 break;
1750
1751 case PT_SCX:
1752 OK = (prop->script == code[3] ||
1753 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop), code[3]) != 0);
1754 break;
1755
1756 /* These are specials for combination cases. */
1757
1758 case PT_ALNUM:
1759 chartype = prop->chartype;
1760 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1761 PRIV(ucp_gentype)[chartype] == ucp_N;
1762 break;
1763
1764 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
1765 which means that Perl space and POSIX space are now identical. PCRE
1766 was changed at release 8.34. */
1767
1768 case PT_SPACE: /* Perl space */
1769 case PT_PXSPACE: /* POSIX space */
1770 switch(c)
1771 {
1774 OK = TRUE;
1775 break;
1776
1777 default:
1778 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
1779 break;
1780 }
1781 break;
1782
1783 case PT_WORD:
1784 chartype = prop->chartype;
1785 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
1786 PRIV(ucp_gentype)[chartype] == ucp_N ||
1787 chartype == ucp_Mn || chartype == ucp_Pc;
1788 break;
1789
1790 case PT_CLIST:
1791#if PCRE2_CODE_UNIT_WIDTH == 32
1792 if (c > MAX_UTF_CODE_POINT)
1793 {
1794 OK = FALSE;
1795 break;
1796 }
1797#endif
1798 cp = PRIV(ucd_caseless_sets) + code[3];
1799 for (;;)
1800 {
1801 if (c < *cp) { OK = FALSE; break; }
1802 if (c == *cp++) { OK = TRUE; break; }
1803 }
1804 break;
1805
1806 case PT_UCNC:
1807 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
1808 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
1809 c >= 0xe000;
1810 break;
1811
1812 case PT_BIDICL:
1813 OK = UCD_BIDICLASS(c) == code[3];
1814 break;
1815
1816 case PT_BOOL:
1817 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
1818 UCD_BPROPS_PROP(prop), code[3]) != 0;
1819 break;
1820
1821 /* Should never occur, but keep compilers from grumbling. */
1822
1823 default:
1824 OK = codevalue != OP_PROP;
1825 break;
1826 }
1827
1828 if (OK == (d == OP_PROP))
1829 {
1830 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSSTAR ||
1831 codevalue == OP_PROP_EXTRA + OP_TYPEPOSQUERY)
1832 {
1833 active_count--; /* Remove non-match possibility */
1834 next_active_state--;
1835 }
1836 ADD_NEW(state_offset + count, 0);
1837 }
1838 }
1839 break;
1840
1841 /*-----------------------------------------------------------------*/
1845 count = 2;
1846 goto QS2;
1847
1851 count = 0;
1852
1853 QS2:
1854
1855 ADD_ACTIVE(state_offset + 2, 0);
1856 if (clen > 0)
1857 {
1858 int ncount = 0;
1859 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSSTAR ||
1860 codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSQUERY)
1861 {
1862 active_count--; /* Remove non-match possibility */
1863 next_active_state--;
1864 }
1865 (void)PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
1866 &ncount);
1867 ADD_NEW_DATA(-(state_offset + count), 0, ncount);
1868 }
1869 break;
1870#endif
1871
1872 /*-----------------------------------------------------------------*/
1876 count = 2;
1877 goto QS3;
1878
1882 count = 0;
1883
1884 QS3:
1885 ADD_ACTIVE(state_offset + 2, 0);
1886 if (clen > 0)
1887 {
1888 int ncount = 0;
1889 switch (c)
1890 {
1891 case CHAR_VT:
1892 case CHAR_FF:
1893 case CHAR_NEL:
1894#ifndef EBCDIC
1895 case 0x2028:
1896 case 0x2029:
1897#endif /* Not EBCDIC */
1898 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
1899 goto ANYNL02;
1900
1901 case CHAR_CR:
1902 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
1903 /* Fall through */
1904
1905 ANYNL02:
1906 case CHAR_LF:
1907 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSSTAR ||
1908 codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSQUERY)
1909 {
1910 active_count--; /* Remove non-match possibility */
1911 next_active_state--;
1912 }
1913 ADD_NEW_DATA(-(state_offset + (int)count), 0, ncount);
1914 break;
1915
1916 default:
1917 break;
1918 }
1919 }
1920 break;
1921
1922 /*-----------------------------------------------------------------*/
1926 count = 2;
1927 goto QS4;
1928
1932 count = 0;
1933
1934 QS4:
1935 ADD_ACTIVE(state_offset + 2, 0);
1936 if (clen > 0)
1937 {
1938 BOOL OK;
1939 switch (c)
1940 {
1942 OK = TRUE;
1943 break;
1944
1945 default:
1946 OK = FALSE;
1947 break;
1948 }
1949 if (OK == (d == OP_VSPACE))
1950 {
1951 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSSTAR ||
1952 codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSQUERY)
1953 {
1954 active_count--; /* Remove non-match possibility */
1955 next_active_state--;
1956 }
1957 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1958 }
1959 }
1960 break;
1961
1962 /*-----------------------------------------------------------------*/
1966 count = 2;
1967 goto QS5;
1968
1972 count = 0;
1973
1974 QS5:
1975 ADD_ACTIVE(state_offset + 2, 0);
1976 if (clen > 0)
1977 {
1978 BOOL OK;
1979 switch (c)
1980 {
1982 OK = TRUE;
1983 break;
1984
1985 default:
1986 OK = FALSE;
1987 break;
1988 }
1989
1990 if (OK == (d == OP_HSPACE))
1991 {
1992 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSSTAR ||
1993 codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSQUERY)
1994 {
1995 active_count--; /* Remove non-match possibility */
1996 next_active_state--;
1997 }
1998 ADD_NEW_DATA(-(state_offset + (int)count), 0, 0);
1999 }
2000 }
2001 break;
2002
2003 /*-----------------------------------------------------------------*/
2004#ifdef SUPPORT_UNICODE
2009 if (codevalue != OP_PROP_EXTRA + OP_TYPEEXACT)
2010 { ADD_ACTIVE(state_offset + 1 + IMM2_SIZE + 3, 0); }
2011 count = current_state->count; /* Number already matched */
2012 if (clen > 0)
2013 {
2014 BOOL OK;
2015 int chartype;
2016 const uint32_t *cp;
2017 const ucd_record * prop = GET_UCD(c);
2018 switch(code[1 + IMM2_SIZE + 1])
2019 {
2020 case PT_ANY:
2021 OK = TRUE;
2022 break;
2023
2024 case PT_LAMP:
2025 chartype = prop->chartype;
2026 OK = chartype == ucp_Lu || chartype == ucp_Ll || chartype == ucp_Lt;
2027 break;
2028
2029 case PT_GC:
2030 OK = PRIV(ucp_gentype)[prop->chartype] == code[1 + IMM2_SIZE + 2];
2031 break;
2032
2033 case PT_PC:
2034 OK = prop->chartype == code[1 + IMM2_SIZE + 2];
2035 break;
2036
2037 case PT_SC:
2038 OK = prop->script == code[1 + IMM2_SIZE + 2];
2039 break;
2040
2041 case PT_SCX:
2042 OK = (prop->script == code[1 + IMM2_SIZE + 2] ||
2043 MAPBIT(PRIV(ucd_script_sets) + UCD_SCRIPTX_PROP(prop),
2044 code[1 + IMM2_SIZE + 2]) != 0);
2045 break;
2046
2047 /* These are specials for combination cases. */
2048
2049 case PT_ALNUM:
2050 chartype = prop->chartype;
2051 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2052 PRIV(ucp_gentype)[chartype] == ucp_N;
2053 break;
2054
2055 /* Perl space used to exclude VT, but from Perl 5.18 it is included,
2056 which means that Perl space and POSIX space are now identical. PCRE
2057 was changed at release 8.34. */
2058
2059 case PT_SPACE: /* Perl space */
2060 case PT_PXSPACE: /* POSIX space */
2061 switch(c)
2062 {
2065 OK = TRUE;
2066 break;
2067
2068 default:
2069 OK = PRIV(ucp_gentype)[prop->chartype] == ucp_Z;
2070 break;
2071 }
2072 break;
2073
2074 case PT_WORD:
2075 chartype = prop->chartype;
2076 OK = PRIV(ucp_gentype)[chartype] == ucp_L ||
2077 PRIV(ucp_gentype)[chartype] == ucp_N ||
2078 chartype == ucp_Mn || chartype == ucp_Pc;
2079 break;
2080
2081 case PT_CLIST:
2082#if PCRE2_CODE_UNIT_WIDTH == 32
2083 if (c > MAX_UTF_CODE_POINT)
2084 {
2085 OK = FALSE;
2086 break;
2087 }
2088#endif
2089 cp = PRIV(ucd_caseless_sets) + code[1 + IMM2_SIZE + 2];
2090 for (;;)
2091 {
2092 if (c < *cp) { OK = FALSE; break; }
2093 if (c == *cp++) { OK = TRUE; break; }
2094 }
2095 break;
2096
2097 case PT_UCNC:
2098 OK = c == CHAR_DOLLAR_SIGN || c == CHAR_COMMERCIAL_AT ||
2099 c == CHAR_GRAVE_ACCENT || (c >= 0xa0 && c <= 0xd7ff) ||
2100 c >= 0xe000;
2101 break;
2102
2103 case PT_BIDICL:
2104 OK = UCD_BIDICLASS(c) == code[1 + IMM2_SIZE + 2];
2105 break;
2106
2107 case PT_BOOL:
2108 OK = MAPBIT(PRIV(ucd_boolprop_sets) +
2109 UCD_BPROPS_PROP(prop), code[1 + IMM2_SIZE + 2]) != 0;
2110 break;
2111
2112 /* Should never occur, but keep compilers from grumbling. */
2113
2114 default:
2115 OK = codevalue != OP_PROP;
2116 break;
2117 }
2118
2119 if (OK == (d == OP_PROP))
2120 {
2121 if (codevalue == OP_PROP_EXTRA + OP_TYPEPOSUPTO)
2122 {
2123 active_count--; /* Remove non-match possibility */
2124 next_active_state--;
2125 }
2126 if (++count >= (int)GET2(code, 1))
2127 { ADD_NEW(state_offset + 1 + IMM2_SIZE + 3, 0); }
2128 else
2129 { ADD_NEW(state_offset, count); }
2130 }
2131 }
2132 break;
2133
2134 /*-----------------------------------------------------------------*/
2139 if (codevalue != OP_EXTUNI_EXTRA + OP_TYPEEXACT)
2140 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2141 count = current_state->count; /* Number already matched */
2142 if (clen > 0)
2143 {
2144 PCRE2_SPTR nptr;
2145 int ncount = 0;
2146 if (codevalue == OP_EXTUNI_EXTRA + OP_TYPEPOSUPTO)
2147 {
2148 active_count--; /* Remove non-match possibility */
2149 next_active_state--;
2150 }
2151 nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject, end_subject, utf,
2152 &ncount);
2153 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2154 reset_could_continue = TRUE;
2155 if (++count >= (int)GET2(code, 1))
2156 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2157 else
2158 { ADD_NEW_DATA(-state_offset, count, ncount); }
2159 }
2160 break;
2161#endif
2162
2163 /*-----------------------------------------------------------------*/
2168 if (codevalue != OP_ANYNL_EXTRA + OP_TYPEEXACT)
2169 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2170 count = current_state->count; /* Number already matched */
2171 if (clen > 0)
2172 {
2173 int ncount = 0;
2174 switch (c)
2175 {
2176 case CHAR_VT:
2177 case CHAR_FF:
2178 case CHAR_NEL:
2179#ifndef EBCDIC
2180 case 0x2028:
2181 case 0x2029:
2182#endif /* Not EBCDIC */
2183 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2184 goto ANYNL03;
2185
2186 case CHAR_CR:
2187 if (ptr + 1 < end_subject && UCHAR21TEST(ptr + 1) == CHAR_LF) ncount = 1;
2188 /* Fall through */
2189
2190 ANYNL03:
2191 case CHAR_LF:
2192 if (codevalue == OP_ANYNL_EXTRA + OP_TYPEPOSUPTO)
2193 {
2194 active_count--; /* Remove non-match possibility */
2195 next_active_state--;
2196 }
2197 if (++count >= (int)GET2(code, 1))
2198 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, ncount); }
2199 else
2200 { ADD_NEW_DATA(-state_offset, count, ncount); }
2201 break;
2202
2203 default:
2204 break;
2205 }
2206 }
2207 break;
2208
2209 /*-----------------------------------------------------------------*/
2214 if (codevalue != OP_VSPACE_EXTRA + OP_TYPEEXACT)
2215 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2216 count = current_state->count; /* Number already matched */
2217 if (clen > 0)
2218 {
2219 BOOL OK;
2220 switch (c)
2221 {
2223 OK = TRUE;
2224 break;
2225
2226 default:
2227 OK = FALSE;
2228 }
2229
2230 if (OK == (d == OP_VSPACE))
2231 {
2232 if (codevalue == OP_VSPACE_EXTRA + OP_TYPEPOSUPTO)
2233 {
2234 active_count--; /* Remove non-match possibility */
2235 next_active_state--;
2236 }
2237 if (++count >= (int)GET2(code, 1))
2238 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2239 else
2240 { ADD_NEW_DATA(-state_offset, count, 0); }
2241 }
2242 }
2243 break;
2244
2245 /*-----------------------------------------------------------------*/
2250 if (codevalue != OP_HSPACE_EXTRA + OP_TYPEEXACT)
2251 { ADD_ACTIVE(state_offset + 2 + IMM2_SIZE, 0); }
2252 count = current_state->count; /* Number already matched */
2253 if (clen > 0)
2254 {
2255 BOOL OK;
2256 switch (c)
2257 {
2259 OK = TRUE;
2260 break;
2261
2262 default:
2263 OK = FALSE;
2264 break;
2265 }
2266
2267 if (OK == (d == OP_HSPACE))
2268 {
2269 if (codevalue == OP_HSPACE_EXTRA + OP_TYPEPOSUPTO)
2270 {
2271 active_count--; /* Remove non-match possibility */
2272 next_active_state--;
2273 }
2274 if (++count >= (int)GET2(code, 1))
2275 { ADD_NEW_DATA(-(state_offset + 2 + IMM2_SIZE), 0, 0); }
2276 else
2277 { ADD_NEW_DATA(-state_offset, count, 0); }
2278 }
2279 }
2280 break;
2281
2282/* ========================================================================== */
2283 /* These opcodes are followed by a character that is usually compared
2284 to the current subject character; it is loaded into d. We still get
2285 here even if there is no subject character, because in some cases zero
2286 repetitions are permitted. */
2287
2288 /*-----------------------------------------------------------------*/
2289 case OP_CHAR:
2290 if (clen > 0 && c == d) { ADD_NEW(state_offset + dlen + 1, 0); }
2291 break;
2292
2293 /*-----------------------------------------------------------------*/
2294 case OP_CHARI:
2295 if (clen == 0) break;
2296
2297#ifdef SUPPORT_UNICODE
2298 if (utf_or_ucp)
2299 {
2300 if (c == d) { ADD_NEW(state_offset + dlen + 1, 0); } else
2301 {
2302 unsigned int othercase;
2303 if (c < 128)
2304 othercase = fcc[c];
2305 else
2306 othercase = UCD_OTHERCASE(c);
2307 if (d == othercase) { ADD_NEW(state_offset + dlen + 1, 0); }
2308 }
2309 }
2310 else
2311#endif /* SUPPORT_UNICODE */
2312 /* Not UTF or UCP mode */
2313 {
2314 if (TABLE_GET(c, lcc, c) == TABLE_GET(d, lcc, d))
2315 { ADD_NEW(state_offset + 2, 0); }
2316 }
2317 break;
2318
2319
2320#ifdef SUPPORT_UNICODE
2321 /*-----------------------------------------------------------------*/
2322 /* This is a tricky one because it can match more than one character.
2323 Find out how many characters to skip, and then set up a negative state
2324 to wait for them to pass before continuing. */
2325
2326 case OP_EXTUNI:
2327 if (clen > 0)
2328 {
2329 int ncount = 0;
2330 PCRE2_SPTR nptr = PRIV(extuni)(c, ptr + clen, mb->start_subject,
2331 end_subject, utf, &ncount);
2332 if (nptr >= end_subject && (mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2333 reset_could_continue = TRUE;
2334 ADD_NEW_DATA(-(state_offset + 1), 0, ncount);
2335 }
2336 break;
2337#endif
2338
2339 /*-----------------------------------------------------------------*/
2340 /* This is a tricky one, like EXTUNI, because it too can match more than one
2341 character (when CR is followed by LF). In this case, set up a negative
2342 state to wait for one character to pass before continuing. */
2343
2344 case OP_ANYNL:
2345 if (clen > 0) switch(c)
2346 {
2347 case CHAR_VT:
2348 case CHAR_FF:
2349 case CHAR_NEL:
2350#ifndef EBCDIC
2351 case 0x2028:
2352 case 0x2029:
2353#endif /* Not EBCDIC */
2354 if (mb->bsr_convention == PCRE2_BSR_ANYCRLF) break;
2355 /* Fall through */
2356
2357 case CHAR_LF:
2358 ADD_NEW(state_offset + 1, 0);
2359 break;
2360
2361 case CHAR_CR:
2362 if (ptr + 1 >= end_subject)
2363 {
2364 ADD_NEW(state_offset + 1, 0);
2365 if ((mb->moptions & PCRE2_PARTIAL_HARD) != 0)
2366 reset_could_continue = TRUE;
2367 }
2368 else if (UCHAR21TEST(ptr + 1) == CHAR_LF)
2369 {
2370 ADD_NEW_DATA(-(state_offset + 1), 0, 1);
2371 }
2372 else
2373 {
2374 ADD_NEW(state_offset + 1, 0);
2375 }
2376 break;
2377 }
2378 break;
2379
2380 /*-----------------------------------------------------------------*/
2381 case OP_NOT_VSPACE:
2382 if (clen > 0) switch(c)
2383 {
2385 break;
2386
2387 default:
2388 ADD_NEW(state_offset + 1, 0);
2389 break;
2390 }
2391 break;
2392
2393 /*-----------------------------------------------------------------*/
2394 case OP_VSPACE:
2395 if (clen > 0) switch(c)
2396 {
2398 ADD_NEW(state_offset + 1, 0);
2399 break;
2400
2401 default:
2402 break;
2403 }
2404 break;
2405
2406 /*-----------------------------------------------------------------*/
2407 case OP_NOT_HSPACE:
2408 if (clen > 0) switch(c)
2409 {
2411 break;
2412
2413 default:
2414 ADD_NEW(state_offset + 1, 0);
2415 break;
2416 }
2417 break;
2418
2419 /*-----------------------------------------------------------------*/
2420 case OP_HSPACE:
2421 if (clen > 0) switch(c)
2422 {
2424 ADD_NEW(state_offset + 1, 0);
2425 break;
2426
2427 default:
2428 break;
2429 }
2430 break;
2431
2432 /*-----------------------------------------------------------------*/
2433 /* Match a negated single character casefully. */
2434
2435 case OP_NOT:
2436 if (clen > 0 && c != d) { ADD_NEW(state_offset + dlen + 1, 0); }
2437 break;
2438
2439 /*-----------------------------------------------------------------*/
2440 /* Match a negated single character caselessly. */
2441
2442 case OP_NOTI:
2443 if (clen > 0)
2444 {
2445 uint32_t otherd;
2446#ifdef SUPPORT_UNICODE
2447 if (utf_or_ucp && d >= 128)
2448 otherd = UCD_OTHERCASE(d);
2449 else
2450#endif /* SUPPORT_UNICODE */
2451 otherd = TABLE_GET(d, fcc, d);
2452 if (c != d && c != otherd)
2453 { ADD_NEW(state_offset + dlen + 1, 0); }
2454 }
2455 break;
2456
2457 /*-----------------------------------------------------------------*/
2458 case OP_PLUSI:
2459 case OP_MINPLUSI:
2460 case OP_POSPLUSI:
2461 case OP_NOTPLUSI:
2462 case OP_NOTMINPLUSI:
2463 case OP_NOTPOSPLUSI:
2464 caseless = TRUE;
2465 codevalue -= OP_STARI - OP_STAR;
2466
2467 /* Fall through */
2468 case OP_PLUS:
2469 case OP_MINPLUS:
2470 case OP_POSPLUS:
2471 case OP_NOTPLUS:
2472 case OP_NOTMINPLUS:
2473 case OP_NOTPOSPLUS:
2474 count = current_state->count; /* Already matched */
2475 if (count > 0) { ADD_ACTIVE(state_offset + dlen + 1, 0); }
2476 if (clen > 0)
2477 {
2478 uint32_t otherd = NOTACHAR;
2479 if (caseless)
2480 {
2481#ifdef SUPPORT_UNICODE
2482 if (utf_or_ucp && d >= 128)
2483 otherd = UCD_OTHERCASE(d);
2484 else
2485#endif /* SUPPORT_UNICODE */
2486 otherd = TABLE_GET(d, fcc, d);
2487 }
2488 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2489 {
2490 if (count > 0 &&
2491 (codevalue == OP_POSPLUS || codevalue == OP_NOTPOSPLUS))
2492 {
2493 active_count--; /* Remove non-match possibility */
2494 next_active_state--;
2495 }
2496 count++;
2497 ADD_NEW(state_offset, count);
2498 }
2499 }
2500 break;
2501
2502 /*-----------------------------------------------------------------*/
2503 case OP_QUERYI:
2504 case OP_MINQUERYI:
2505 case OP_POSQUERYI:
2506 case OP_NOTQUERYI:
2507 case OP_NOTMINQUERYI:
2508 case OP_NOTPOSQUERYI:
2509 caseless = TRUE;
2510 codevalue -= OP_STARI - OP_STAR;
2511 /* Fall through */
2512 case OP_QUERY:
2513 case OP_MINQUERY:
2514 case OP_POSQUERY:
2515 case OP_NOTQUERY:
2516 case OP_NOTMINQUERY:
2517 case OP_NOTPOSQUERY:
2518 ADD_ACTIVE(state_offset + dlen + 1, 0);
2519 if (clen > 0)
2520 {
2521 uint32_t otherd = NOTACHAR;
2522 if (caseless)
2523 {
2524#ifdef SUPPORT_UNICODE
2525 if (utf_or_ucp && d >= 128)
2526 otherd = UCD_OTHERCASE(d);
2527 else
2528#endif /* SUPPORT_UNICODE */
2529 otherd = TABLE_GET(d, fcc, d);
2530 }
2531 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2532 {
2533 if (codevalue == OP_POSQUERY || codevalue == OP_NOTPOSQUERY)
2534 {
2535 active_count--; /* Remove non-match possibility */
2536 next_active_state--;
2537 }
2538 ADD_NEW(state_offset + dlen + 1, 0);
2539 }
2540 }
2541 break;
2542
2543 /*-----------------------------------------------------------------*/
2544 case OP_STARI:
2545 case OP_MINSTARI:
2546 case OP_POSSTARI:
2547 case OP_NOTSTARI:
2548 case OP_NOTMINSTARI:
2549 case OP_NOTPOSSTARI:
2550 caseless = TRUE;
2551 codevalue -= OP_STARI - OP_STAR;
2552 /* Fall through */
2553 case OP_STAR:
2554 case OP_MINSTAR:
2555 case OP_POSSTAR:
2556 case OP_NOTSTAR:
2557 case OP_NOTMINSTAR:
2558 case OP_NOTPOSSTAR:
2559 ADD_ACTIVE(state_offset + dlen + 1, 0);
2560 if (clen > 0)
2561 {
2562 uint32_t otherd = NOTACHAR;
2563 if (caseless)
2564 {
2565#ifdef SUPPORT_UNICODE
2566 if (utf_or_ucp && d >= 128)
2567 otherd = UCD_OTHERCASE(d);
2568 else
2569#endif /* SUPPORT_UNICODE */
2570 otherd = TABLE_GET(d, fcc, d);
2571 }
2572 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2573 {
2574 if (codevalue == OP_POSSTAR || codevalue == OP_NOTPOSSTAR)
2575 {
2576 active_count--; /* Remove non-match possibility */
2577 next_active_state--;
2578 }
2579 ADD_NEW(state_offset, 0);
2580 }
2581 }
2582 break;
2583
2584 /*-----------------------------------------------------------------*/
2585 case OP_EXACTI:
2586 case OP_NOTEXACTI:
2587 caseless = TRUE;
2588 codevalue -= OP_STARI - OP_STAR;
2589 /* Fall through */
2590 case OP_EXACT:
2591 case OP_NOTEXACT:
2592 count = current_state->count; /* Number already matched */
2593 if (clen > 0)
2594 {
2595 uint32_t otherd = NOTACHAR;
2596 if (caseless)
2597 {
2598#ifdef SUPPORT_UNICODE
2599 if (utf_or_ucp && d >= 128)
2600 otherd = UCD_OTHERCASE(d);
2601 else
2602#endif /* SUPPORT_UNICODE */
2603 otherd = TABLE_GET(d, fcc, d);
2604 }
2605 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2606 {
2607 if (++count >= (int)GET2(code, 1))
2608 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2609 else
2610 { ADD_NEW(state_offset, count); }
2611 }
2612 }
2613 break;
2614
2615 /*-----------------------------------------------------------------*/
2616 case OP_UPTOI:
2617 case OP_MINUPTOI:
2618 case OP_POSUPTOI:
2619 case OP_NOTUPTOI:
2620 case OP_NOTMINUPTOI:
2621 case OP_NOTPOSUPTOI:
2622 caseless = TRUE;
2623 codevalue -= OP_STARI - OP_STAR;
2624 /* Fall through */
2625 case OP_UPTO:
2626 case OP_MINUPTO:
2627 case OP_POSUPTO:
2628 case OP_NOTUPTO:
2629 case OP_NOTMINUPTO:
2630 case OP_NOTPOSUPTO:
2631 ADD_ACTIVE(state_offset + dlen + 1 + IMM2_SIZE, 0);
2632 count = current_state->count; /* Number already matched */
2633 if (clen > 0)
2634 {
2635 uint32_t otherd = NOTACHAR;
2636 if (caseless)
2637 {
2638#ifdef SUPPORT_UNICODE
2639 if (utf_or_ucp && d >= 128)
2640 otherd = UCD_OTHERCASE(d);
2641 else
2642#endif /* SUPPORT_UNICODE */
2643 otherd = TABLE_GET(d, fcc, d);
2644 }
2645 if ((c == d || c == otherd) == (codevalue < OP_NOTSTAR))
2646 {
2647 if (codevalue == OP_POSUPTO || codevalue == OP_NOTPOSUPTO)
2648 {
2649 active_count--; /* Remove non-match possibility */
2650 next_active_state--;
2651 }
2652 if (++count >= (int)GET2(code, 1))
2653 { ADD_NEW(state_offset + dlen + 1 + IMM2_SIZE, 0); }
2654 else
2655 { ADD_NEW(state_offset, count); }
2656 }
2657 }
2658 break;
2659
2660
2661/* ========================================================================== */
2662 /* These are the class-handling opcodes */
2663
2664 case OP_CLASS:
2665 case OP_NCLASS:
2666 case OP_XCLASS:
2667 {
2668 BOOL isinclass = FALSE;
2669 int next_state_offset;
2670 PCRE2_SPTR ecode;
2671
2672 /* For a simple class, there is always just a 32-byte table, and we
2673 can set isinclass from it. */
2674
2675 if (codevalue != OP_XCLASS)
2676 {
2677 ecode = code + 1 + (32 / sizeof(PCRE2_UCHAR));
2678 if (clen > 0)
2679 {
2680 isinclass = (c > 255)? (codevalue == OP_NCLASS) :
2681 ((((uint8_t *)(code + 1))[c/8] & (1u << (c&7))) != 0);
2682 }
2683 }
2684
2685 /* An extended class may have a table or a list of single characters,
2686 ranges, or both, and it may be positive or negative. There's a
2687 function that sorts all this out. */
2688
2689 else
2690 {
2691 ecode = code + GET(code, 1);
2692 if (clen > 0) isinclass = PRIV(xclass)(c, code + 1 + LINK_SIZE, utf);
2693 }
2694
2695 /* At this point, isinclass is set for all kinds of class, and ecode
2696 points to the byte after the end of the class. If there is a
2697 quantifier, this is where it will be. */
2698
2699 next_state_offset = (int)(ecode - start_code);
2700
2701 switch (*ecode)
2702 {
2703 case OP_CRSTAR:
2704 case OP_CRMINSTAR:
2705 case OP_CRPOSSTAR:
2706 ADD_ACTIVE(next_state_offset + 1, 0);
2707 if (isinclass)
2708 {
2709 if (*ecode == OP_CRPOSSTAR)
2710 {
2711 active_count--; /* Remove non-match possibility */
2712 next_active_state--;
2713 }
2714 ADD_NEW(state_offset, 0);
2715 }
2716 break;
2717
2718 case OP_CRPLUS:
2719 case OP_CRMINPLUS:
2720 case OP_CRPOSPLUS:
2721 count = current_state->count; /* Already matched */
2722 if (count > 0) { ADD_ACTIVE(next_state_offset + 1, 0); }
2723 if (isinclass)
2724 {
2725 if (count > 0 && *ecode == OP_CRPOSPLUS)
2726 {
2727 active_count--; /* Remove non-match possibility */
2728 next_active_state--;
2729 }
2730 count++;
2731 ADD_NEW(state_offset, count);
2732 }
2733 break;
2734
2735 case OP_CRQUERY:
2736 case OP_CRMINQUERY:
2737 case OP_CRPOSQUERY:
2738 ADD_ACTIVE(next_state_offset + 1, 0);
2739 if (isinclass)
2740 {
2741 if (*ecode == OP_CRPOSQUERY)
2742 {
2743 active_count--; /* Remove non-match possibility */
2744 next_active_state--;
2745 }
2746 ADD_NEW(next_state_offset + 1, 0);
2747 }
2748 break;
2749
2750 case OP_CRRANGE:
2751 case OP_CRMINRANGE:
2752 case OP_CRPOSRANGE:
2753 count = current_state->count; /* Already matched */
2754 if (count >= (int)GET2(ecode, 1))
2755 { ADD_ACTIVE(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2756 if (isinclass)
2757 {
2758 int max = (int)GET2(ecode, 1 + IMM2_SIZE);
2759
2760 if (*ecode == OP_CRPOSRANGE && count >= (int)GET2(ecode, 1))
2761 {
2762 active_count--; /* Remove non-match possibility */
2763 next_active_state--;
2764 }
2765
2766 if (++count >= max && max != 0) /* Max 0 => no limit */
2767 { ADD_NEW(next_state_offset + 1 + 2 * IMM2_SIZE, 0); }
2768 else
2769 { ADD_NEW(state_offset, count); }
2770 }
2771 break;
2772
2773 default:
2774 if (isinclass) { ADD_NEW(next_state_offset, 0); }
2775 break;
2776 }
2777 }
2778 break;
2779
2780/* ========================================================================== */
2781 /* These are the opcodes for fancy brackets of various kinds. We have
2782 to use recursion in order to handle them. The "always failing" assertion
2783 (?!) is optimised to OP_FAIL when compiling, so we have to support that,
2784 though the other "backtracking verbs" are not supported. */
2785
2786 case OP_FAIL:
2787 forced_fail++; /* Count FAILs for multiple states */
2788 break;
2789
2790 case OP_ASSERT:
2791 case OP_ASSERT_NOT:
2792 case OP_ASSERTBACK:
2793 case OP_ASSERTBACK_NOT:
2794 {
2795 int rc;
2796 int *local_workspace;
2797 PCRE2_SIZE *local_offsets;
2798 PCRE2_SPTR endasscode = code + GET(code, 1);
2799 RWS_anchor *rws = (RWS_anchor *)RWS;
2800
2801 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2802 {
2803 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2804 if (rc != 0) return rc;
2805 RWS = (int *)rws;
2806 }
2807
2808 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2809 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2810 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2811
2812 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2813
2814 rc = internal_dfa_match(
2815 mb, /* static match data */
2816 code, /* this subexpression's code */
2817 ptr, /* where we currently are */
2818 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2819 local_offsets, /* offset vector */
2820 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2821 local_workspace, /* workspace vector */
2822 RWS_RSIZE, /* size of same */
2823 rlevel, /* function recursion level */
2824 RWS); /* recursion workspace */
2825
2826 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2827
2828 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2829 if ((rc >= 0) == (codevalue == OP_ASSERT || codevalue == OP_ASSERTBACK))
2830 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2831 }
2832 break;
2833
2834 /*-----------------------------------------------------------------*/
2835 case OP_COND:
2836 case OP_SCOND:
2837 {
2838 int codelink = (int)GET(code, 1);
2839 PCRE2_UCHAR condcode;
2840
2841 /* Because of the way auto-callout works during compile, a callout item
2842 is inserted between OP_COND and an assertion condition. This does not
2843 happen for the other conditions. */
2844
2845 if (code[LINK_SIZE + 1] == OP_CALLOUT
2846 || code[LINK_SIZE + 1] == OP_CALLOUT_STR)
2847 {
2848 PCRE2_SIZE callout_length;
2849 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb,
2850 1 + LINK_SIZE, &callout_length);
2851 if (rrc < 0) return rrc; /* Abandon */
2852 if (rrc > 0) break; /* Fail this thread */
2853 code += callout_length; /* Skip callout data */
2854 }
2855
2856 condcode = code[LINK_SIZE+1];
2857
2858 /* Back reference conditions and duplicate named recursion conditions
2859 are not supported */
2860
2861 if (condcode == OP_CREF || condcode == OP_DNCREF ||
2862 condcode == OP_DNRREF)
2863 return PCRE2_ERROR_DFA_UCOND;
2864
2865 /* The DEFINE condition is always false, and the assertion (?!) is
2866 converted to OP_FAIL. */
2867
2868 if (condcode == OP_FALSE || condcode == OP_FAIL)
2869 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2870
2871 /* There is also an always-true condition */
2872
2873 else if (condcode == OP_TRUE)
2874 { ADD_ACTIVE(state_offset + LINK_SIZE + 2, 0); }
2875
2876 /* The only supported version of OP_RREF is for the value RREF_ANY,
2877 which means "test if in any recursion". We can't test for specifically
2878 recursed groups. */
2879
2880 else if (condcode == OP_RREF)
2881 {
2882 unsigned int value = GET2(code, LINK_SIZE + 2);
2883 if (value != RREF_ANY) return PCRE2_ERROR_DFA_UCOND;
2884 if (mb->recursive != NULL)
2885 { ADD_ACTIVE(state_offset + LINK_SIZE + 2 + IMM2_SIZE, 0); }
2886 else { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2887 }
2888
2889 /* Otherwise, the condition is an assertion */
2890
2891 else
2892 {
2893 int rc;
2894 int *local_workspace;
2895 PCRE2_SIZE *local_offsets;
2896 PCRE2_SPTR asscode = code + LINK_SIZE + 1;
2897 PCRE2_SPTR endasscode = asscode + GET(asscode, 1);
2898 RWS_anchor *rws = (RWS_anchor *)RWS;
2899
2900 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
2901 {
2902 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
2903 if (rc != 0) return rc;
2904 RWS = (int *)rws;
2905 }
2906
2907 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2908 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
2909 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
2910
2911 while (*endasscode == OP_ALT) endasscode += GET(endasscode, 1);
2912
2913 rc = internal_dfa_match(
2914 mb, /* fixed match data */
2915 asscode, /* this subexpression's code */
2916 ptr, /* where we currently are */
2917 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2918 local_offsets, /* offset vector */
2919 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
2920 local_workspace, /* workspace vector */
2921 RWS_RSIZE, /* size of same */
2922 rlevel, /* function recursion level */
2923 RWS); /* recursion workspace */
2924
2925 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
2926
2927 if (rc < 0 && rc != PCRE2_ERROR_NOMATCH) return rc;
2928 if ((rc >= 0) ==
2929 (condcode == OP_ASSERT || condcode == OP_ASSERTBACK))
2930 { ADD_ACTIVE((int)(endasscode + LINK_SIZE + 1 - start_code), 0); }
2931 else
2932 { ADD_ACTIVE(state_offset + codelink + LINK_SIZE + 1, 0); }
2933 }
2934 }
2935 break;
2936
2937 /*-----------------------------------------------------------------*/
2938 case OP_RECURSE:
2939 {
2940 int rc;
2941 int *local_workspace;
2942 PCRE2_SIZE *local_offsets;
2943 RWS_anchor *rws = (RWS_anchor *)RWS;
2944 PCRE2_SPTR callpat = start_code + GET(code, 1);
2945 uint32_t recno = (callpat == mb->start_code)? 0 :
2946 GET2(callpat, 1 + LINK_SIZE);
2947
2948 if (rws->free < RWS_RSIZE + RWS_OVEC_RSIZE)
2949 {
2950 rc = more_workspace(&rws, RWS_OVEC_RSIZE, mb);
2951 if (rc != 0) return rc;
2952 RWS = (int *)rws;
2953 }
2954
2955 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
2956 local_workspace = ((int *)local_offsets) + RWS_OVEC_RSIZE;
2957 rws->free -= RWS_RSIZE + RWS_OVEC_RSIZE;
2958
2959 /* Check for repeating a recursion without advancing the subject
2960 pointer or last used character. This should catch convoluted mutual
2961 recursions. (Some simple cases are caught at compile time.) */
2962
2963 for (dfa_recursion_info *ri = mb->recursive;
2964 ri != NULL;
2965 ri = ri->prevrec)
2966 {
2967 if (recno == ri->group_num && ptr == ri->subject_position &&
2968 mb->last_used_ptr == ri->last_used_ptr)
2970 }
2971
2972 /* Remember this recursion and where we started it so as to
2973 catch infinite loops. */
2974
2975 new_recursive.group_num = recno;
2976 new_recursive.subject_position = ptr;
2977 new_recursive.last_used_ptr = mb->last_used_ptr;
2978 new_recursive.prevrec = mb->recursive;
2979 mb->recursive = &new_recursive;
2980
2981 rc = internal_dfa_match(
2982 mb, /* fixed match data */
2983 callpat, /* this subexpression's code */
2984 ptr, /* where we currently are */
2985 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
2986 local_offsets, /* offset vector */
2987 RWS_OVEC_RSIZE/OVEC_UNIT, /* size of same */
2988 local_workspace, /* workspace vector */
2989 RWS_RSIZE, /* size of same */
2990 rlevel, /* function recursion level */
2991 RWS); /* recursion workspace */
2992
2993 rws->free += RWS_RSIZE + RWS_OVEC_RSIZE;
2994 mb->recursive = new_recursive.prevrec; /* Done this recursion */
2995
2996 /* Ran out of internal offsets */
2997
2998 if (rc == 0) return PCRE2_ERROR_DFA_RECURSE;
2999
3000 /* For each successful matched substring, set up the next state with a
3001 count of characters to skip before trying it. Note that the count is in
3002 characters, not bytes. */
3003
3004 if (rc > 0)
3005 {
3006 for (rc = rc*2 - 2; rc >= 0; rc -= 2)
3007 {
3008 PCRE2_SIZE charcount = local_offsets[rc+1] - local_offsets[rc];
3009#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3010 if (utf)
3011 {
3012 PCRE2_SPTR p = start_subject + local_offsets[rc];
3013 PCRE2_SPTR pp = start_subject + local_offsets[rc+1];
3014 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3015 }
3016#endif
3017 if (charcount > 0)
3018 {
3019 ADD_NEW_DATA(-(state_offset + LINK_SIZE + 1), 0,
3020 (int)(charcount - 1));
3021 }
3022 else
3023 {
3024 ADD_ACTIVE(state_offset + LINK_SIZE + 1, 0);
3025 }
3026 }
3027 }
3028 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3029 }
3030 break;
3031
3032 /*-----------------------------------------------------------------*/
3033 case OP_BRAPOS:
3034 case OP_SBRAPOS:
3035 case OP_CBRAPOS:
3036 case OP_SCBRAPOS:
3037 case OP_BRAPOSZERO:
3038 {
3039 int rc;
3040 int *local_workspace;
3041 PCRE2_SIZE *local_offsets;
3042 PCRE2_SIZE charcount, matched_count;
3043 PCRE2_SPTR local_ptr = ptr;
3044 RWS_anchor *rws = (RWS_anchor *)RWS;
3045 BOOL allow_zero;
3046
3047 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3048 {
3049 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3050 if (rc != 0) return rc;
3051 RWS = (int *)rws;
3052 }
3053
3054 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3055 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3056 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3057
3058 if (codevalue == OP_BRAPOSZERO)
3059 {
3060 allow_zero = TRUE;
3061 codevalue = *(++code); /* Codevalue will be one of above BRAs */
3062 }
3063 else allow_zero = FALSE;
3064
3065 /* Loop to match the subpattern as many times as possible as if it were
3066 a complete pattern. */
3067
3068 for (matched_count = 0;; matched_count++)
3069 {
3070 rc = internal_dfa_match(
3071 mb, /* fixed match data */
3072 code, /* this subexpression's code */
3073 local_ptr, /* where we currently are */
3074 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3075 local_offsets, /* offset vector */
3076 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3077 local_workspace, /* workspace vector */
3078 RWS_RSIZE, /* size of same */
3079 rlevel, /* function recursion level */
3080 RWS); /* recursion workspace */
3081
3082 /* Failed to match */
3083
3084 if (rc < 0)
3085 {
3086 if (rc != PCRE2_ERROR_NOMATCH) return rc;
3087 break;
3088 }
3089
3090 /* Matched: break the loop if zero characters matched. */
3091
3092 charcount = local_offsets[1] - local_offsets[0];
3093 if (charcount == 0) break;
3094 local_ptr += charcount; /* Advance temporary position ptr */
3095 }
3096
3097 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3098
3099 /* At this point we have matched the subpattern matched_count
3100 times, and local_ptr is pointing to the character after the end of the
3101 last match. */
3102
3103 if (matched_count > 0 || allow_zero)
3104 {
3105 PCRE2_SPTR end_subpattern = code;
3106 int next_state_offset;
3107
3108 do { end_subpattern += GET(end_subpattern, 1); }
3109 while (*end_subpattern == OP_ALT);
3110 next_state_offset =
3111 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3112
3113 /* Optimization: if there are no more active states, and there
3114 are no new states yet set up, then skip over the subject string
3115 right here, to save looping. Otherwise, set up the new state to swing
3116 into action when the end of the matched substring is reached. */
3117
3118 if (i + 1 >= active_count && new_count == 0)
3119 {
3120 ptr = local_ptr;
3121 clen = 0;
3122 ADD_NEW(next_state_offset, 0);
3123 }
3124 else
3125 {
3126 PCRE2_SPTR p = ptr;
3127 PCRE2_SPTR pp = local_ptr;
3128 charcount = (PCRE2_SIZE)(pp - p);
3129#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3130 if (utf) while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3131#endif
3132 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3133 }
3134 }
3135 }
3136 break;
3137
3138 /*-----------------------------------------------------------------*/
3139 case OP_ONCE:
3140 {
3141 int rc;
3142 int *local_workspace;
3143 PCRE2_SIZE *local_offsets;
3144 RWS_anchor *rws = (RWS_anchor *)RWS;
3145
3146 if (rws->free < RWS_RSIZE + RWS_OVEC_OSIZE)
3147 {
3148 rc = more_workspace(&rws, RWS_OVEC_OSIZE, mb);
3149 if (rc != 0) return rc;
3150 RWS = (int *)rws;
3151 }
3152
3153 local_offsets = (PCRE2_SIZE *)(RWS + rws->size - rws->free);
3154 local_workspace = ((int *)local_offsets) + RWS_OVEC_OSIZE;
3155 rws->free -= RWS_RSIZE + RWS_OVEC_OSIZE;
3156
3157 rc = internal_dfa_match(
3158 mb, /* fixed match data */
3159 code, /* this subexpression's code */
3160 ptr, /* where we currently are */
3161 (PCRE2_SIZE)(ptr - start_subject), /* start offset */
3162 local_offsets, /* offset vector */
3163 RWS_OVEC_OSIZE/OVEC_UNIT, /* size of same */
3164 local_workspace, /* workspace vector */
3165 RWS_RSIZE, /* size of same */
3166 rlevel, /* function recursion level */
3167 RWS); /* recursion workspace */
3168
3169 rws->free += RWS_RSIZE + RWS_OVEC_OSIZE;
3170
3171 if (rc >= 0)
3172 {
3173 PCRE2_SPTR end_subpattern = code;
3174 PCRE2_SIZE charcount = local_offsets[1] - local_offsets[0];
3175 int next_state_offset, repeat_state_offset;
3176
3177 do { end_subpattern += GET(end_subpattern, 1); }
3178 while (*end_subpattern == OP_ALT);
3179 next_state_offset =
3180 (int)(end_subpattern - start_code + LINK_SIZE + 1);
3181
3182 /* If the end of this subpattern is KETRMAX or KETRMIN, we must
3183 arrange for the repeat state also to be added to the relevant list.
3184 Calculate the offset, or set -1 for no repeat. */
3185
3186 repeat_state_offset = (*end_subpattern == OP_KETRMAX ||
3187 *end_subpattern == OP_KETRMIN)?
3188 (int)(end_subpattern - start_code - GET(end_subpattern, 1)) : -1;
3189
3190 /* If we have matched an empty string, add the next state at the
3191 current character pointer. This is important so that the duplicate
3192 checking kicks in, which is what breaks infinite loops that match an
3193 empty string. */
3194
3195 if (charcount == 0)
3196 {
3197 ADD_ACTIVE(next_state_offset, 0);
3198 }
3199
3200 /* Optimization: if there are no more active states, and there
3201 are no new states yet set up, then skip over the subject string
3202 right here, to save looping. Otherwise, set up the new state to swing
3203 into action when the end of the matched substring is reached. */
3204
3205 else if (i + 1 >= active_count && new_count == 0)
3206 {
3207 ptr += charcount;
3208 clen = 0;
3209 ADD_NEW(next_state_offset, 0);
3210
3211 /* If we are adding a repeat state at the new character position,
3212 we must fudge things so that it is the only current state.
3213 Otherwise, it might be a duplicate of one we processed before, and
3214 that would cause it to be skipped. */
3215
3216 if (repeat_state_offset >= 0)
3217 {
3218 next_active_state = active_states;
3219 active_count = 0;
3220 i = -1;
3221 ADD_ACTIVE(repeat_state_offset, 0);
3222 }
3223 }
3224 else
3225 {
3226#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
3227 if (utf)
3228 {
3229 PCRE2_SPTR p = start_subject + local_offsets[0];
3230 PCRE2_SPTR pp = start_subject + local_offsets[1];
3231 while (p < pp) if (NOT_FIRSTCU(*p++)) charcount--;
3232 }
3233#endif
3234 ADD_NEW_DATA(-next_state_offset, 0, (int)(charcount - 1));
3235 if (repeat_state_offset >= 0)
3236 { ADD_NEW_DATA(-repeat_state_offset, 0, (int)(charcount - 1)); }
3237 }
3238 }
3239 else if (rc != PCRE2_ERROR_NOMATCH) return rc;
3240 }
3241 break;
3242
3243
3244/* ========================================================================== */
3245 /* Handle callouts */
3246
3247 case OP_CALLOUT:
3248 case OP_CALLOUT_STR:
3249 {
3250 PCRE2_SIZE callout_length;
3251 rrc = do_callout_dfa(code, offsets, current_subject, ptr, mb, 0,
3252 &callout_length);
3253 if (rrc < 0) return rrc; /* Abandon */
3254 if (rrc == 0)
3255 { ADD_ACTIVE(state_offset + (int)callout_length, 0); }
3256 }
3257 break;
3258
3259
3260/* ========================================================================== */
3261 default: /* Unsupported opcode */
3262 return PCRE2_ERROR_DFA_UITEM;
3263 }
3264
3265 NEXT_ACTIVE_STATE: continue;
3266
3267 } /* End of loop scanning active states */
3268
3269 /* We have finished the processing at the current subject character. If no
3270 new states have been set for the next character, we have found all the
3271 matches that we are going to find. If partial matching has been requested,
3272 check for appropriate conditions.
3273
3274 The "forced_fail" variable counts the number of (*F) encountered for the
3275 character. If it is equal to the original active_count (saved in
3276 workspace[1]) it means that (*F) was found on every active state. In this
3277 case we don't want to give a partial match.
3278
3279 The "could_continue" variable is true if a state could have continued but
3280 for the fact that the end of the subject was reached. */
3281
3282 if (new_count <= 0)
3283 {
3284 if (could_continue && /* Some could go on, and */
3285 forced_fail != workspace[1] && /* Not all forced fail & */
3286 ( /* either... */
3287 (mb->moptions & PCRE2_PARTIAL_HARD) != 0 /* Hard partial */
3288 || /* or... */
3289 ((mb->moptions & PCRE2_PARTIAL_SOFT) != 0 && /* Soft partial and */
3290 match_count < 0) /* no matches */
3291 ) && /* And... */
3292 (
3293 partial_newline || /* Either partial NL */
3294 ( /* or ... */
3295 ptr >= end_subject && /* End of subject and */
3296 ( /* either */
3297 ptr > mb->start_used_ptr || /* Inspected non-empty string */
3298 mb->allowemptypartial /* or pattern has lookbehind */
3299 ) /* or could match empty */
3300 )
3301 ))
3302 match_count = PCRE2_ERROR_PARTIAL;
3303 break; /* Exit from loop along the subject string */
3304 }
3305
3306 /* One or more states are active for the next character. */
3307
3308 ptr += clen; /* Advance to next subject character */
3309 } /* Loop to move along the subject string */
3310
3311/* Control gets here from "break" a few lines above. If we have a match and
3312PCRE2_ENDANCHORED is set, the match fails unless ptr reached end_subject. */
3313
3314if (match_count >= 0 &&
3315 ((mb->moptions | mb->poptions) & PCRE2_ENDANCHORED) != 0 &&
3316 ptr < end_subject)
3317 match_count = PCRE2_ERROR_NOMATCH;
3318
3319return match_count;
3320}
3321
3322
3323
3324/*************************************************
3325* Match a pattern using the DFA algorithm *
3326*************************************************/
3327
3328/* This function matches a compiled pattern to a subject string, using the
3329alternate matching algorithm that finds all matches at once.
3330
3331Arguments:
3332 code points to the compiled pattern
3333 subject subject string
3334 length length of subject string
3335 start_offset where to start matching in the subject
3336 options option bits
3337 match_data points to a match data structure
3338 mcontext points to a match context
3339 workspace pointer to workspace
3340 wscount size of workspace
3341
3342Returns: > 0 => number of match offset pairs placed in offsets
3343 = 0 => offsets overflowed; longest matches are present
3344 -1 => failed to match
3345 < -1 => some kind of unexpected problem
3346*/
3347
3350 PCRE2_SIZE start_offset, uint32_t options, pcre2_match_data *match_data,
3351 pcre2_match_context *mcontext, int *workspace, PCRE2_SIZE wscount)
3352{
3353int rc;
3354int was_zero_terminated = 0;
3355
3356const pcre2_real_code *re = (const pcre2_real_code *)code;
3357
3358PCRE2_SPTR start_match;
3359PCRE2_SPTR end_subject;
3360PCRE2_SPTR bumpalong_limit;
3361PCRE2_SPTR req_cu_ptr;
3362
3363BOOL utf, anchored, startline, firstline;
3364BOOL has_first_cu = FALSE;
3365BOOL has_req_cu = FALSE;
3366
3367#if PCRE2_CODE_UNIT_WIDTH == 8
3368PCRE2_SPTR memchr_found_first_cu = NULL;
3369PCRE2_SPTR memchr_found_first_cu2 = NULL;
3370#endif
3371
3372PCRE2_UCHAR first_cu = 0;
3373PCRE2_UCHAR first_cu2 = 0;
3374PCRE2_UCHAR req_cu = 0;
3375PCRE2_UCHAR req_cu2 = 0;
3376
3377const uint8_t *start_bits = NULL;
3378
3379/* We need to have mb pointing to a match block, because the IS_NEWLINE macro
3380is used below, and it expects NLBLOCK to be defined as a pointer. */
3381
3383dfa_match_block actual_match_block;
3384dfa_match_block *mb = &actual_match_block;
3385
3386/* Set up a starting block of memory for use during recursive calls to
3387internal_dfa_match(). By putting this on the stack, it minimizes resource use
3388in the case when it is not needed. If this is too small, more memory is
3389obtained from the heap. At the start of each block is an anchor structure.*/
3390
3391int base_recursion_workspace[RWS_BASE_SIZE];
3392RWS_anchor *rws = (RWS_anchor *)base_recursion_workspace;
3393rws->next = NULL;
3394rws->size = RWS_BASE_SIZE;
3396
3397/* Recognize NULL, length 0 as an empty string. */
3398
3399if (subject == NULL && length == 0) subject = (PCRE2_SPTR)"";
3400
3401/* Plausibility checks */
3402
3404if (re == NULL || subject == NULL || workspace == NULL || match_data == NULL)
3405 return PCRE2_ERROR_NULL;
3406
3407if (length == PCRE2_ZERO_TERMINATED)
3408 {
3409 length = PRIV(strlen)(subject);
3410 was_zero_terminated = 1;
3411 }
3412
3413if (wscount < 20) return PCRE2_ERROR_DFA_WSSIZE;
3414if (start_offset > length) return PCRE2_ERROR_BADOFFSET;
3415
3416/* Partial matching and PCRE2_ENDANCHORED are currently not allowed at the same
3417time. */
3418
3420 ((re->overall_options | options) & PCRE2_ENDANCHORED) != 0)
3421 return PCRE2_ERROR_BADOPTION;
3422
3423/* Invalid UTF support is not available for DFA matching. */
3424
3427
3428/* Check that the first field in the block is the magic number. If it is not,
3429return with PCRE2_ERROR_BADMAGIC. */
3430
3432
3433/* Check the code unit width. */
3434
3435if ((re->flags & PCRE2_MODE_MASK) != PCRE2_CODE_UNIT_WIDTH/8)
3436 return PCRE2_ERROR_BADMODE;
3437
3438/* PCRE2_NOTEMPTY and PCRE2_NOTEMPTY_ATSTART are match-time flags in the
3439options variable for this function. Users of PCRE2 who are not calling the
3440function directly would like to have a way of setting these flags, in the same
3441way that they can set pcre2_compile() flags like PCRE2_NO_AUTOPOSSESS with
3442constructions like (*NO_AUTOPOSSESS). To enable this, (*NOTEMPTY) and
3443(*NOTEMPTY_ATSTART) set bits in the pattern's "flags" field which can now be
3444transferred to the options for this function. The bits are guaranteed to be
3445adjacent, but do not have the same values. This bit of Boolean trickery assumes
3446that the match-time bits are not more significant than the flag bits. If by
3447accident this is not the case, a compile-time division by zero error will
3448occur. */
3449
3450#define FF (PCRE2_NOTEMPTY_SET|PCRE2_NE_ATST_SET)
3451#define OO (PCRE2_NOTEMPTY|PCRE2_NOTEMPTY_ATSTART)
3452options |= (re->flags & FF) / ((FF & (~FF+1)) / (OO & (~OO+1)));
3453#undef FF
3454#undef OO
3455
3456/* If restarting after a partial match, do some sanity checks on the contents
3457of the workspace. */
3458
3459if ((options & PCRE2_DFA_RESTART) != 0)
3460 {
3461 if ((workspace[0] & (-2)) != 0 || workspace[1] < 1 ||
3462 workspace[1] > (int)((wscount - 2)/INTS_PER_STATEBLOCK))
3464 }
3465
3466/* Set some local values */
3467
3468utf = (re->overall_options & PCRE2_UTF) != 0;
3469start_match = subject + start_offset;
3470end_subject = subject + length;
3471req_cu_ptr = start_match - 1;
3472anchored = (options & (PCRE2_ANCHORED|PCRE2_DFA_RESTART)) != 0 ||
3473 (re->overall_options & PCRE2_ANCHORED) != 0;
3474
3475/* The "must be at the start of a line" flags are used in a loop when finding
3476where to start. */
3477
3478startline = (re->flags & PCRE2_STARTLINE) != 0;
3479firstline = !anchored && (re->overall_options & PCRE2_FIRSTLINE) != 0;
3480bumpalong_limit = end_subject;
3481
3482/* Initialize and set up the fixed fields in the callout block, with a pointer
3483in the match block. */
3484
3485mb->cb = &cb;
3486cb.version = 2;
3487cb.subject = subject;
3488cb.subject_length = (PCRE2_SIZE)(end_subject - subject);
3489cb.callout_flags = 0;
3490cb.capture_top = 1; /* No capture support */
3491cb.capture_last = 0;
3492cb.mark = NULL; /* No (*MARK) support */
3493
3494/* Get data from the match context, if present, and fill in the remaining
3495fields in the match block. It is an error to set an offset limit without
3496setting the flag at compile time. */
3497
3498if (mcontext == NULL)
3499 {
3500 mb->callout = NULL;
3501 mb->memctl = re->memctl;
3502 mb->match_limit = PRIV(default_match_context).match_limit;
3503 mb->match_limit_depth = PRIV(default_match_context).depth_limit;
3504 mb->heap_limit = PRIV(default_match_context).heap_limit;
3505 }
3506else
3507 {
3508 if (mcontext->offset_limit != PCRE2_UNSET)
3509 {
3510 if ((re->overall_options & PCRE2_USE_OFFSET_LIMIT) == 0)
3512 bumpalong_limit = subject + mcontext->offset_limit;
3513 }
3514 mb->callout = mcontext->callout;
3515 mb->callout_data = mcontext->callout_data;
3516 mb->memctl = mcontext->memctl;
3517 mb->match_limit = mcontext->match_limit;
3518 mb->match_limit_depth = mcontext->depth_limit;
3519 mb->heap_limit = mcontext->heap_limit;
3520 }
3521
3522if (mb->match_limit > re->limit_match)
3523 mb->match_limit = re->limit_match;
3524
3525if (mb->match_limit_depth > re->limit_depth)
3527
3528if (mb->heap_limit > re->limit_heap)
3529 mb->heap_limit = re->limit_heap;
3530
3531mb->start_code = (PCRE2_UCHAR *)((uint8_t *)re + sizeof(pcre2_real_code)) +
3532 re->name_count * re->name_entry_size;
3533mb->tables = re->tables;
3534mb->start_subject = subject;
3535mb->end_subject = end_subject;
3536mb->start_offset = start_offset;
3537mb->allowemptypartial = (re->max_lookbehind > 0) ||
3538 (re->flags & PCRE2_MATCH_EMPTY) != 0;
3539mb->moptions = options;
3540mb->poptions = re->overall_options;
3541mb->match_call_count = 0;
3542mb->heap_used = 0;
3543
3544/* Process the \R and newline settings. */
3545
3547mb->nltype = NLTYPE_FIXED;
3548switch(re->newline_convention)
3549 {
3550 case PCRE2_NEWLINE_CR:
3551 mb->nllen = 1;
3552 mb->nl[0] = CHAR_CR;
3553 break;
3554
3555 case PCRE2_NEWLINE_LF:
3556 mb->nllen = 1;
3557 mb->nl[0] = CHAR_NL;
3558 break;
3559
3560 case PCRE2_NEWLINE_NUL:
3561 mb->nllen = 1;
3562 mb->nl[0] = CHAR_NUL;
3563 break;
3564
3565 case PCRE2_NEWLINE_CRLF:
3566 mb->nllen = 2;
3567 mb->nl[0] = CHAR_CR;
3568 mb->nl[1] = CHAR_NL;
3569 break;
3570
3571 case PCRE2_NEWLINE_ANY:
3572 mb->nltype = NLTYPE_ANY;
3573 break;
3574
3576 mb->nltype = NLTYPE_ANYCRLF;
3577 break;
3578
3579 default: return PCRE2_ERROR_INTERNAL;
3580 }
3581
3582/* Check a UTF string for validity if required. For 8-bit and 16-bit strings,
3583we must also check that a starting offset does not point into the middle of a
3584multiunit character. We check only the portion of the subject that is going to
3585be inspected during matching - from the offset minus the maximum lookbehind
3586to the given length. This saves time when a small part of a large subject is
3587being matched by the use of a starting offset. Note that the maximum lookbehind
3588is a number of characters, not code units. */
3589
3590#ifdef SUPPORT_UNICODE
3591if (utf && (options & PCRE2_NO_UTF_CHECK) == 0)
3592 {
3593 PCRE2_SPTR check_subject = start_match; /* start_match includes offset */
3594
3595 if (start_offset > 0)
3596 {
3597#if PCRE2_CODE_UNIT_WIDTH != 32
3598 unsigned int i;
3599 if (start_match < end_subject && NOT_FIRSTCU(*start_match))
3601 for (i = re->max_lookbehind; i > 0 && check_subject > subject; i--)
3602 {
3603 check_subject--;
3604 while (check_subject > subject &&
3605#if PCRE2_CODE_UNIT_WIDTH == 8
3606 (*check_subject & 0xc0) == 0x80)
3607#else /* 16-bit */
3608 (*check_subject & 0xfc00) == 0xdc00)
3609#endif /* PCRE2_CODE_UNIT_WIDTH == 8 */
3610 check_subject--;
3611 }
3612#else /* In the 32-bit library, one code unit equals one character. */
3613 check_subject -= re->max_lookbehind;
3614 if (check_subject < subject) check_subject = subject;
3615#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
3616 }
3617
3618 /* Validate the relevant portion of the subject. After an error, adjust the
3619 offset to be an absolute offset in the whole string. */
3620
3621 match_data->rc = PRIV(valid_utf)(check_subject,
3622 length - (PCRE2_SIZE)(check_subject - subject), &(match_data->startchar));
3623 if (match_data->rc != 0)
3624 {
3625 match_data->startchar += (PCRE2_SIZE)(check_subject - subject);
3626 return match_data->rc;
3627 }
3628 }
3629#endif /* SUPPORT_UNICODE */
3630
3631/* Set up the first code unit to match, if available. If there's no first code
3632unit there may be a bitmap of possible first characters. */
3633
3634if ((re->flags & PCRE2_FIRSTSET) != 0)
3635 {
3636 has_first_cu = TRUE;
3637 first_cu = first_cu2 = (PCRE2_UCHAR)(re->first_codeunit);
3638 if ((re->flags & PCRE2_FIRSTCASELESS) != 0)
3639 {
3640 first_cu2 = TABLE_GET(first_cu, mb->tables + fcc_offset, first_cu);
3641#ifdef SUPPORT_UNICODE
3642#if PCRE2_CODE_UNIT_WIDTH == 8
3643 if (first_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3644 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3645#else
3646 if (first_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3647 first_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(first_cu);
3648#endif
3649#endif /* SUPPORT_UNICODE */
3650 }
3651 }
3652else
3653 if (!startline && (re->flags & PCRE2_FIRSTMAPSET) != 0)
3654 start_bits = re->start_bitmap;
3655
3656/* There may be a "last known required code unit" set. */
3657
3658if ((re->flags & PCRE2_LASTSET) != 0)
3659 {
3660 has_req_cu = TRUE;
3661 req_cu = req_cu2 = (PCRE2_UCHAR)(re->last_codeunit);
3662 if ((re->flags & PCRE2_LASTCASELESS) != 0)
3663 {
3664 req_cu2 = TABLE_GET(req_cu, mb->tables + fcc_offset, req_cu);
3665#ifdef SUPPORT_UNICODE
3666#if PCRE2_CODE_UNIT_WIDTH == 8
3667 if (req_cu > 127 && !utf && (re->overall_options & PCRE2_UCP) != 0)
3668 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3669#else
3670 if (req_cu > 127 && (utf || (re->overall_options & PCRE2_UCP) != 0))
3671 req_cu2 = (PCRE2_UCHAR)UCD_OTHERCASE(req_cu);
3672#endif
3673#endif /* SUPPORT_UNICODE */
3674 }
3675 }
3676
3677/* If the match data block was previously used with PCRE2_COPY_MATCHED_SUBJECT,
3678free the memory that was obtained. */
3679
3680if ((match_data->flags & PCRE2_MD_COPIED_SUBJECT) != 0)
3681 {
3682 match_data->memctl.free((void *)match_data->subject,
3683 match_data->memctl.memory_data);
3684 match_data->flags &= ~PCRE2_MD_COPIED_SUBJECT;
3685 }
3686
3687/* Fill in fields that are always returned in the match data. */
3688
3689match_data->code = re;
3690match_data->subject = NULL; /* Default for no match */
3691match_data->mark = NULL;
3692match_data->matchedby = PCRE2_MATCHEDBY_DFA_INTERPRETER;
3693
3694/* Call the main matching function, looping for a non-anchored regex after a
3695failed match. If not restarting, perform certain optimizations at the start of
3696a match. */
3697
3698for (;;)
3699 {
3700 /* ----------------- Start of match optimizations ---------------- */
3701
3702 /* There are some optimizations that avoid running the match if a known
3703 starting point is not found, or if a known later code unit is not present.
3704 However, there is an option (settable at compile time) that disables
3705 these, for testing and for ensuring that all callouts do actually occur.
3706 The optimizations must also be avoided when restarting a DFA match. */
3707
3708 if ((re->overall_options & PCRE2_NO_START_OPTIMIZE) == 0 &&
3709 (options & PCRE2_DFA_RESTART) == 0)
3710 {
3711 /* If firstline is TRUE, the start of the match is constrained to the first
3712 line of a multiline string. That is, the match must be before or at the
3713 first newline following the start of matching. Temporarily adjust
3714 end_subject so that we stop the optimization scans for a first code unit
3715 immediately after the first character of a newline (the first code unit can
3716 legitimately be a newline). If the match fails at the newline, later code
3717 breaks this loop. */
3718
3719 if (firstline)
3720 {
3721 PCRE2_SPTR t = start_match;
3722#ifdef SUPPORT_UNICODE
3723 if (utf)
3724 {
3725 while (t < end_subject && !IS_NEWLINE(t))
3726 {
3727 t++;
3728 ACROSSCHAR(t < end_subject, t, t++);
3729 }
3730 }
3731 else
3732#endif
3733 while (t < end_subject && !IS_NEWLINE(t)) t++;
3734 end_subject = t;
3735 }
3736
3737 /* Anchored: check the first code unit if one is recorded. This may seem
3738 pointless but it can help in detecting a no match case without scanning for
3739 the required code unit. */
3740
3741 if (anchored)
3742 {
3743 if (has_first_cu || start_bits != NULL)
3744 {
3745 BOOL ok = start_match < end_subject;
3746 if (ok)
3747 {
3748 PCRE2_UCHAR c = UCHAR21TEST(start_match);
3749 ok = has_first_cu && (c == first_cu || c == first_cu2);
3750 if (!ok && start_bits != NULL)
3751 {
3752#if PCRE2_CODE_UNIT_WIDTH != 8
3753 if (c > 255) c = 255;
3754#endif
3755 ok = (start_bits[c/8] & (1u << (c&7))) != 0;
3756 }
3757 }
3758 if (!ok) break;
3759 }
3760 }
3761
3762 /* Not anchored. Advance to a unique first code unit if there is one. */
3763
3764 else
3765 {
3766 if (has_first_cu)
3767 {
3768 if (first_cu != first_cu2) /* Caseless */
3769 {
3770 /* In 16-bit and 32-bit modes we have to do our own search, so can
3771 look for both cases at once. */
3772
3773#if PCRE2_CODE_UNIT_WIDTH != 8
3774 PCRE2_UCHAR smc;
3775 while (start_match < end_subject &&
3776 (smc = UCHAR21TEST(start_match)) != first_cu &&
3777 smc != first_cu2)
3778 start_match++;
3779#else
3780 /* In 8-bit mode, the use of memchr() gives a big speed up, even
3781 though we have to call it twice in order to find the earliest
3782 occurrence of the code unit in either of its cases. Caching is used
3783 to remember the positions of previously found code units. This can
3784 make a huge difference when the strings are very long and only one
3785 case is actually present. */
3786
3787 PCRE2_SPTR pp1 = NULL;
3788 PCRE2_SPTR pp2 = NULL;
3789 PCRE2_SIZE searchlength = end_subject - start_match;
3790
3791 /* If we haven't got a previously found position for first_cu, or if
3792 the current starting position is later, we need to do a search. If
3793 the code unit is not found, set it to the end. */
3794
3795 if (memchr_found_first_cu == NULL ||
3796 start_match > memchr_found_first_cu)
3797 {
3798 pp1 = memchr(start_match, first_cu, searchlength);
3799 memchr_found_first_cu = (pp1 == NULL)? end_subject : pp1;
3800 }
3801
3802 /* If the start is before a previously found position, use the
3803 previous position, or NULL if a previous search failed. */
3804
3805 else pp1 = (memchr_found_first_cu == end_subject)? NULL :
3806 memchr_found_first_cu;
3807
3808 /* Do the same thing for the other case. */
3809
3810 if (memchr_found_first_cu2 == NULL ||
3811 start_match > memchr_found_first_cu2)
3812 {
3813 pp2 = memchr(start_match, first_cu2, searchlength);
3814 memchr_found_first_cu2 = (pp2 == NULL)? end_subject : pp2;
3815 }
3816
3817 else pp2 = (memchr_found_first_cu2 == end_subject)? NULL :
3818 memchr_found_first_cu2;
3819
3820 /* Set the start to the end of the subject if neither case was found.
3821 Otherwise, use the earlier found point. */
3822
3823 if (pp1 == NULL)
3824 start_match = (pp2 == NULL)? end_subject : pp2;
3825 else
3826 start_match = (pp2 == NULL || pp1 < pp2)? pp1 : pp2;
3827
3828#endif /* 8-bit handling */
3829 }
3830
3831 /* The caseful case is much simpler. */
3832
3833 else
3834 {
3835#if PCRE2_CODE_UNIT_WIDTH != 8
3836 while (start_match < end_subject && UCHAR21TEST(start_match) !=
3837 first_cu)
3838 start_match++;
3839#else /* 8-bit code units */
3840 start_match = memchr(start_match, first_cu, end_subject - start_match);
3841 if (start_match == NULL) start_match = end_subject;
3842#endif
3843 }
3844
3845 /* If we can't find the first code unit, having reached the true end
3846 of the subject, break the bumpalong loop, to force a match failure,
3847 except when doing partial matching, when we let the next cycle run at
3848 the end of the subject. To see why, consider the pattern /(?<=abc)def/,
3849 which partially matches "abc", even though the string does not contain
3850 the starting character "d". If we have not reached the true end of the
3851 subject (PCRE2_FIRSTLINE caused end_subject to be temporarily modified)
3852 we also let the cycle run, because the matching string is legitimately
3853 allowed to start with the first code unit of a newline. */
3854
3855 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3856 start_match >= mb->end_subject)
3857 break;
3858 }
3859
3860 /* If there's no first code unit, advance to just after a linebreak for a
3861 multiline match if required. */
3862
3863 else if (startline)
3864 {
3865 if (start_match > mb->start_subject + start_offset)
3866 {
3867#ifdef SUPPORT_UNICODE
3868 if (utf)
3869 {
3870 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3871 {
3872 start_match++;
3873 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
3874 }
3875 }
3876 else
3877#endif
3878 while (start_match < end_subject && !WAS_NEWLINE(start_match))
3879 start_match++;
3880
3881 /* If we have just passed a CR and the newline option is ANY or
3882 ANYCRLF, and we are now at a LF, advance the match position by one
3883 more code unit. */
3884
3885 if (start_match[-1] == CHAR_CR &&
3886 (mb->nltype == NLTYPE_ANY || mb->nltype == NLTYPE_ANYCRLF) &&
3887 start_match < end_subject &&
3888 UCHAR21TEST(start_match) == CHAR_NL)
3889 start_match++;
3890 }
3891 }
3892
3893 /* If there's no first code unit or a requirement for a multiline line
3894 start, advance to a non-unique first code unit if any have been
3895 identified. The bitmap contains only 256 bits. When code units are 16 or
3896 32 bits wide, all code units greater than 254 set the 255 bit. */
3897
3898 else if (start_bits != NULL)
3899 {
3900 while (start_match < end_subject)
3901 {
3902 uint32_t c = UCHAR21TEST(start_match);
3903#if PCRE2_CODE_UNIT_WIDTH != 8
3904 if (c > 255) c = 255;
3905#endif
3906 if ((start_bits[c/8] & (1u << (c&7))) != 0) break;
3907 start_match++;
3908 }
3909
3910 /* See comment above in first_cu checking about the next line. */
3911
3912 if ((mb->moptions & (PCRE2_PARTIAL_HARD|PCRE2_PARTIAL_SOFT)) == 0 &&
3913 start_match >= mb->end_subject)
3914 break;
3915 }
3916 } /* End of first code unit handling */
3917
3918 /* Restore fudged end_subject */
3919
3920 end_subject = mb->end_subject;
3921
3922 /* The following two optimizations are disabled for partial matching. */
3923
3925 {
3926 PCRE2_SPTR p;
3927
3928 /* The minimum matching length is a lower bound; no actual string of that
3929 length may actually match the pattern. Although the value is, strictly,
3930 in characters, we treat it as code units to avoid spending too much time
3931 in this optimization. */
3932
3933 if (end_subject - start_match < re->minlength) goto NOMATCH_EXIT;
3934
3935 /* If req_cu is set, we know that that code unit must appear in the
3936 subject for the match to succeed. If the first code unit is set, req_cu
3937 must be later in the subject; otherwise the test starts at the match
3938 point. This optimization can save a huge amount of backtracking in
3939 patterns with nested unlimited repeats that aren't going to match.
3940 Writing separate code for cased/caseless versions makes it go faster, as
3941 does using an autoincrement and backing off on a match. As in the case of
3942 the first code unit, using memchr() in the 8-bit library gives a big
3943 speed up. Unlike the first_cu check above, we do not need to call
3944 memchr() twice in the caseless case because we only need to check for the
3945 presence of the character in either case, not find the first occurrence.
3946
3947 The search can be skipped if the code unit was found later than the
3948 current starting point in a previous iteration of the bumpalong loop.
3949
3950 HOWEVER: when the subject string is very, very long, searching to its end
3951 can take a long time, and give bad performance on quite ordinary
3952 patterns. This showed up when somebody was matching something like
3953 /^\d+C/ on a 32-megabyte string... so we don't do this when the string is
3954 sufficiently long, but it's worth searching a lot more for unanchored
3955 patterns. */
3956
3957 p = start_match + (has_first_cu? 1:0);
3958 if (has_req_cu && p > req_cu_ptr)
3959 {
3960 PCRE2_SIZE check_length = end_subject - start_match;
3961
3962 if (check_length < REQ_CU_MAX ||
3963 (!anchored && check_length < REQ_CU_MAX * 1000))
3964 {
3965 if (req_cu != req_cu2) /* Caseless */
3966 {
3967#if PCRE2_CODE_UNIT_WIDTH != 8
3968 while (p < end_subject)
3969 {
3970 uint32_t pp = UCHAR21INCTEST(p);
3971 if (pp == req_cu || pp == req_cu2) { p--; break; }
3972 }
3973#else /* 8-bit code units */
3974 PCRE2_SPTR pp = p;
3975 p = memchr(pp, req_cu, end_subject - pp);
3976 if (p == NULL)
3977 {
3978 p = memchr(pp, req_cu2, end_subject - pp);
3979 if (p == NULL) p = end_subject;
3980 }
3981#endif /* PCRE2_CODE_UNIT_WIDTH != 8 */
3982 }
3983
3984 /* The caseful case */
3985
3986 else
3987 {
3988#if PCRE2_CODE_UNIT_WIDTH != 8
3989 while (p < end_subject)
3990 {
3991 if (UCHAR21INCTEST(p) == req_cu) { p--; break; }
3992 }
3993
3994#else /* 8-bit code units */
3995 p = memchr(p, req_cu, end_subject - p);
3996 if (p == NULL) p = end_subject;
3997#endif
3998 }
3999
4000 /* If we can't find the required code unit, break the matching loop,
4001 forcing a match failure. */
4002
4003 if (p >= end_subject) break;
4004
4005 /* If we have found the required code unit, save the point where we
4006 found it, so that we don't search again next time round the loop if
4007 the start hasn't passed this code unit yet. */
4008
4009 req_cu_ptr = p;
4010 }
4011 }
4012 }
4013 }
4014
4015 /* ------------ End of start of match optimizations ------------ */
4016
4017 /* Give no match if we have passed the bumpalong limit. */
4018
4019 if (start_match > bumpalong_limit) break;
4020
4021 /* OK, now we can do the business */
4022
4023 mb->start_used_ptr = start_match;
4024 mb->last_used_ptr = start_match;
4025 mb->recursive = NULL;
4026
4027 rc = internal_dfa_match(
4028 mb, /* fixed match data */
4029 mb->start_code, /* this subexpression's code */
4030 start_match, /* where we currently are */
4031 start_offset, /* start offset in subject */
4032 match_data->ovector, /* offset vector */
4033 (uint32_t)match_data->oveccount * 2, /* actual size of same */
4034 workspace, /* workspace vector */
4035 (int)wscount, /* size of same */
4036 0, /* function recurse level */
4037 base_recursion_workspace); /* initial workspace for recursion */
4038
4039 /* Anything other than "no match" means we are done, always; otherwise, carry
4040 on only if not anchored. */
4041
4042 if (rc != PCRE2_ERROR_NOMATCH || anchored)
4043 {
4044 if (rc == PCRE2_ERROR_PARTIAL && match_data->oveccount > 0)
4045 {
4046 match_data->ovector[0] = (PCRE2_SIZE)(start_match - subject);
4047 match_data->ovector[1] = (PCRE2_SIZE)(end_subject - subject);
4048 }
4049 match_data->subject_length = length;
4050 match_data->leftchar = (PCRE2_SIZE)(mb->start_used_ptr - subject);
4051 match_data->rightchar = (PCRE2_SIZE)(mb->last_used_ptr - subject);
4052 match_data->startchar = (PCRE2_SIZE)(start_match - subject);
4053 match_data->rc = rc;
4054
4055 if (rc >= 0 &&(options & PCRE2_COPY_MATCHED_SUBJECT) != 0)
4056 {
4057 length = CU2BYTES(length + was_zero_terminated);
4058 match_data->subject = match_data->memctl.malloc(length,
4059 match_data->memctl.memory_data);
4060 if (match_data->subject == NULL) return PCRE2_ERROR_NOMEMORY;
4061 memcpy((void *)match_data->subject, subject, length);
4062 match_data->flags |= PCRE2_MD_COPIED_SUBJECT;
4063 }
4064 else
4065 {
4066 if (rc >= 0 || rc == PCRE2_ERROR_PARTIAL) match_data->subject = subject;
4067 }
4068 goto EXIT;
4069 }
4070
4071 /* Advance to the next subject character unless we are at the end of a line
4072 and firstline is set. */
4073
4074 if (firstline && IS_NEWLINE(start_match)) break;
4075 start_match++;
4076#ifdef SUPPORT_UNICODE
4077 if (utf)
4078 {
4079 ACROSSCHAR(start_match < end_subject, start_match, start_match++);
4080 }
4081#endif
4082 if (start_match > end_subject) break;
4083
4084 /* If we have just passed a CR and we are now at a LF, and the pattern does
4085 not contain any explicit matches for \r or \n, and the newline option is CRLF
4086 or ANY or ANYCRLF, advance the match position by one more character. */
4087
4088 if (UCHAR21TEST(start_match - 1) == CHAR_CR &&
4089 start_match < end_subject &&
4090 UCHAR21TEST(start_match) == CHAR_NL &&
4091 (re->flags & PCRE2_HASCRORLF) == 0 &&
4092 (mb->nltype == NLTYPE_ANY ||
4093 mb->nltype == NLTYPE_ANYCRLF ||
4094 mb->nllen == 2))
4095 start_match++;
4096
4097 } /* "Bumpalong" loop */
4098
4099NOMATCH_EXIT:
4101
4102EXIT:
4103while (rws->next != NULL)
4104 {
4105 RWS_anchor *next = rws->next;
4106 rws->next = next->next;
4107 mb->memctl.free(next, mb->memctl.memory_data);
4108 }
4109
4110return rc;
4111}
4112
4113/* These #undefs are here to enable unity builds with CMake. */
4114
4115#undef NLBLOCK /* Block containing newline information */
4116#undef PSSTART /* Field containing processed string start */
4117#undef PSEND /* Field containing processed string end */
4118
4119/* End of pcre2_dfa_match.c */
char * cb
Definition assert.c:26
count(Countable|array $value, int $mode=COUNT_NORMAL)
@ OK
Definition bcmath.h:168
#define max(a, b)
Definition exif.c:60
void * ptr
Definition ffi.c:3814
memcpy(ptr1, ptr2, size)
zend_long offset
#define TRUE
Definition gd_gd.c:7
#define FALSE
Definition gd_gd.c:8
#define NULL
Definition gdcache.h:45
again j
#define GET(n)
Definition md5.c:178
#define next(ls)
Definition minilua.c:2661
@ OP_NOT
Definition minilua.c:83
#define pcre2_real_code
Definition pcre2.h:826
#define PCRE2_DOLLAR_ENDONLY
Definition pcre2.h:123
#define PCRE2_ZERO_TERMINATED
Definition pcre2.h:481
#define PCRE2_ANCHORED
Definition pcre2.h:105
#define PCRE2_NOTEMPTY_ATSTART
Definition pcre2.h:179
#define PCRE2_ENDANCHORED
Definition pcre2.h:107
#define PCRE2_ERROR_BADMODE
Definition pcre2.h:373
#define PCRE2_ERROR_DFA_UINVALID_UTF
Definition pcre2.h:408
#define PCRE2_ERROR_DFA_UCOND
Definition pcre2.h:381
#define PCRE2_BSR_ANYCRLF
Definition pcre2.h:218
#define PCRE2_ERROR_HEAPLIMIT
Definition pcre2.h:405
#define PCRE2_ERROR_BADOPTION
Definition pcre2.h:375
#define PCRE2_UNSET
Definition pcre2.h:482
#define PCRE2_ERROR_NOMATCH
Definition pcre2.h:327
#define PCRE2_UCP
Definition pcre2.h:136
#define PCRE2_ERROR_INTERNAL
Definition pcre2.h:385
#define PCRE2_UCHAR
Definition pcre2.h:819
#define PCRE2_ERROR_DFA_UITEM
Definition pcre2.h:383
#define pcre2_code
Definition pcre2.h:822
#define PCRE2_MATCH_INVALID_UTF
Definition pcre2.h:145
#define PCRE2_NEWLINE_ANYCRLF
Definition pcre2.h:214
#define PCRE2_COPY_MATCHED_SUBJECT
Definition pcre2.h:190
#define PCRE2_ERROR_BADMAGIC
Definition pcre2.h:372
#define PCRE2_NEWLINE_CR
Definition pcre2.h:210
#define PCRE2_ERROR_BADOFFSET
Definition pcre2.h:374
#define PCRE2_ERROR_RECURSELOOP
Definition pcre2.h:393
#define PCRE2_ERROR_BADOFFSETLIMIT
Definition pcre2.h:398
#define PCRE2_SIZE
Definition pcre2.h:479
#define PCRE2_PARTIAL_SOFT
Definition pcre2.h:180
#define PCRE2_ERROR_DFA_BADRESTART
Definition pcre2.h:379
#define PCRE2_NOTBOL
Definition pcre2.h:176
#define PCRE2_SPTR
Definition pcre2.h:820
#define PCRE2_PARTIAL_HARD
Definition pcre2.h:181
#define pcre2_match_data
Definition pcre2.h:844
#define PCRE2_ERROR_NULL
Definition pcre2.h:392
#define pcre2_match_context
Definition pcre2.h:843
#define PCRE2_ERROR_DEPTHLIMIT
Definition pcre2.h:394
#define PCRE2_DFA_RESTART
Definition pcre2.h:182
#define PCRE2_USE_OFFSET_LIMIT
Definition pcre2.h:142
#define PCRE2_ERROR_MATCHLIMIT
Definition pcre2.h:388
#define PCRE2_ERROR_DFA_RECURSE
Definition pcre2.h:380
#define PCRE2_ERROR_DFA_WSSIZE
Definition pcre2.h:384
#define pcre2_dfa_match
Definition pcre2.h:862
#define PCRE2_CALL_CONVENTION
Definition pcre2.h:81
#define PCRE2_ALT_CIRCUMFLEX
Definition pcre2.h:140
#define PCRE2_NEWLINE_CRLF
Definition pcre2.h:212
#define PCRE2_UTF
Definition pcre2.h:138
#define PCRE2_NEWLINE_NUL
Definition pcre2.h:215
#define PCRE2_NO_UTF_CHECK
Definition pcre2.h:106
#define PCRE2_NO_START_OPTIMIZE
Definition pcre2.h:135
#define pcre2_callout_block
Definition pcre2.h:837
#define PCRE2_ERROR_PARTIAL
Definition pcre2.h:328
#define PCRE2_DFA_SHORTEST
Definition pcre2.h:183
#define PCRE2_FIRSTLINE
Definition pcre2.h:127
#define PCRE2_NOTEMPTY
Definition pcre2.h:178
#define PCRE2_NEWLINE_ANY
Definition pcre2.h:213
#define PCRE2_NEWLINE_LF
Definition pcre2.h:211
#define PCRE2_NOTEOL
Definition pcre2.h:177
#define PCRE2_ERROR_BADUTFOFFSET
Definition pcre2.h:377
#define PCRE2_ERROR_NOMEMORY
Definition pcre2.h:389
#define NLBLOCK
#define RWS_OVEC_RSIZE
#define FF
#define OP_VSPACE_EXTRA
#define OVEC_UNIT
#define ADD_ACTIVE(x, y)
#define RWS_RSIZE
#define OP_PROP_EXTRA
#define OP_HSPACE_EXTRA
#define INTS_PER_STATEBLOCK
#define PUBLIC_DFA_MATCH_OPTIONS
#define OP_ANYNL_EXTRA
#define ADD_NEW(x, y)
#define OP_EXTUNI_EXTRA
#define RWS_OVEC_OSIZE
#define OO
#define RWS_BASE_SIZE
#define RWS_ANCHOR_SIZE
#define ADD_NEW_DATA(x, y, z)
PCRE2_SPTR PRIV extuni(uint32_t c, PCRE2_SPTR eptr, PCRE2_SPTR start_subject, PCRE2_SPTR end_subject, BOOL utf, int *xcount)
int BOOL
#define PT_PC
@ OP_STARI
@ OP_NOTMINSTARI
@ OP_ANYNL
@ OP_SKIPZERO
@ OP_CHAR
@ OP_CRMINQUERY
@ OP_NOTPOSUPTOI
@ OP_SBRA
@ OP_ONCE
@ OP_NOTPROP
@ OP_NOTPLUS
@ OP_TYPEMINPLUS
@ OP_TYPEQUERY
@ OP_SBRAPOS
@ OP_SCOND
@ OP_ASSERTBACK
@ OP_CIRCM
@ OP_CLASS
@ OP_TYPEPLUS
@ OP_HSPACE
@ OP_NOT_WORDCHAR
@ OP_MINSTARI
@ OP_CRMINPLUS
@ OP_CRRANGE
@ OP_DOLL
@ OP_BRAPOS
@ OP_ASSERT_NOT
@ OP_NOTSTARI
@ OP_DNCREF
@ OP_ASSERT
@ OP_NOTMINPLUSI
@ OP_TYPEPOSSTAR
@ OP_TYPEMINUPTO
@ OP_TYPEPOSPLUS
@ OP_POSSTAR
@ OP_NOTUPTO
@ OP_TYPESTAR
@ OP_BRAMINZERO
@ OP_EXACTI
@ OP_NOTPLUSI
@ OP_NOTQUERYI
@ OP_CRQUERY
@ OP_ASSERTBACK_NOT
@ OP_RREF
@ OP_DNRREF
@ OP_DIGIT
@ OP_KETRPOS
@ OP_EXACT
@ OP_TYPEEXACT
@ OP_PLUS
@ OP_WHITESPACE
@ OP_CRMINSTAR
@ OP_NOTPOSPLUSI
@ OP_NOT_WORD_BOUNDARY
@ OP_KET
@ OP_NOT_DIGIT
@ OP_CALLOUT
@ OP_UCP_WORD_BOUNDARY
@ OP_CRMINRANGE
@ OP_RECURSE
@ OP_BRA
@ OP_FALSE
@ OP_CREF
@ OP_TABLE_LENGTH
@ OP_POSUPTO
@ OP_MINUPTOI
@ OP_NOTPOSUPTO
@ OP_REVERSE
@ OP_NCLASS
@ OP_KETRMIN
@ OP_COND
@ OP_MINPLUS
@ OP_TYPEPOSUPTO
@ OP_WORDCHAR
@ OP_MINQUERY
@ OP_TRUE
@ OP_EODN
@ OP_UPTOI
@ OP_CRPOSRANGE
@ OP_ALT
@ OP_UPTO
@ OP_QUERY
@ OP_POSQUERYI
@ OP_NOTPOSSTARI
@ OP_PROP
@ OP_NOTPOSSTAR
@ OP_PLUSI
@ OP_KETRMAX
@ OP_NOTMINPLUS
@ OP_CBRAPOS
@ OP_BRAZERO
@ OP_QUERYI
@ OP_NOT_UCP_WORD_BOUNDARY
@ OP_POSPLUSI
@ OP_ANYBYTE
@ OP_SCBRAPOS
@ OP_CHARI
@ OP_NOTMINQUERYI
@ OP_TYPEMINQUERY
@ OP_NOT_WHITESPACE
@ OP_NOTMINSTAR
@ OP_NOTSTAR
@ OP_SCBRA
@ OP_CRPOSSTAR
@ OP_MINUPTO
@ OP_NOTPOSQUERYI
@ OP_NOT_VSPACE
@ OP_CRSTAR
@ OP_VSPACE
@ OP_POSQUERY
@ OP_MINSTAR
@ OP_STAR
@ OP_ALLANY
@ OP_DOLLM
@ OP_CRPOSQUERY
@ OP_TYPEMINSTAR
@ OP_NOTMINUPTO
@ OP_NOTMINQUERY
@ OP_CRPLUS
@ OP_TYPEPOSQUERY
@ OP_POSPLUS
@ OP_SOD
@ OP_NOTPOSQUERY
@ OP_TYPEUPTO
@ OP_SOM
@ OP_ANY
@ OP_XCLASS
@ OP_POSSTARI
@ OP_NOT_HSPACE
@ OP_FAIL
@ OP_MINQUERYI
@ OP_MINPLUSI
@ OP_NOTMINUPTOI
@ OP_NOTI
@ OP_NOTQUERY
@ OP_POSUPTOI
@ OP_CALLOUT_STR
@ OP_CBRA
@ OP_BRAPOSZERO
@ OP_NOTUPTOI
@ OP_CRPOSPLUS
@ OP_EXTUNI
@ OP_EOD
@ OP_NOTEXACTI
@ OP_NOTEXACT
@ OP_WORD_BOUNDARY
@ OP_NOTPOSPLUS
@ OP_CIRC
#define CHAR_DOLLAR_SIGN
#define PCRE2_FIRSTMAPSET
#define NLTYPE_ANYCRLF
#define VSPACE_CASES
#define PT_ANY
#define PT_BIDICL
#define PT_GC
#define ctype_digit
#define REQ_CU_MAX
#define ctype_word
#define PCRE2_FIRSTCASELESS
#define PT_CLIST
#define CHAR_GRAVE_ACCENT
#define NLTYPE_FIXED
#define CHAR_NUL
#define MAGIC_NUMBER
#define PCRE2_HASCRORLF
#define ctype_space
#define NOTACHAR
#define PCRE2_LASTCASELESS
#define PCRE2_MODE_MASK
#define MAX_UTF_CODE_POINT
#define CHAR_COMMERCIAL_AT
#define PCRE2_STARTLINE
#define RREF_ANY
#define PT_UCNC
#define PT_WORD
#define CHAR_NL
#define PCRE2_MD_COPIED_SUBJECT
#define PT_SC
@ PCRE2_MATCHEDBY_DFA_INTERPRETER
#define UCD_CHARTYPE(ch)
#define CHAR_CR
#define PCRE2_LASTSET
#define PT_SPACE
#define CHAR_NEL
#define IS_NEWLINE(p)
#define PT_ALNUM
#define PT_PXSPACE
#define PCRE2_EXP_DEFN
#define PT_BOOL
#define CHAR_LF
#define WAS_NEWLINE(p)
#define UCD_BIDICLASS(ch)
#define PT_LAMP
#define fcc_offset
#define CHAR_FF
#define NLTYPE_ANY
#define UCD_OTHERCASE(ch)
#define memmove(a, b, c)
#define PCRE2_FIRSTSET
#define PCRE2_MATCH_EMPTY
#define lcc_offset
#define PRIV(name)
#define UCD_SCRIPTX_PROP(prop)
#define ctypes_offset
#define GET_UCD(ch)
#define CHAR_VT
#define HSPACE_CASES
#define MAPBIT(map, n)
#define UCD_BPROPS_PROP(prop)
#define PT_SCX
#define UCHAR21TEST(eptr)
#define TABLE_GET(c, table, default)
#define UCHAR21INCTEST(eptr)
#define CU2BYTES(x)
#define IMM2_SIZE
#define GETCHARLEN(c, eptr, len)
#define GETCHARTEST(c, eptr)
#define GET2(a, n)
#define OP_lengths
@ ucp_Z
Definition pcre2_ucp.h:65
@ ucp_N
Definition pcre2_ucp.h:62
@ ucp_L
Definition pcre2_ucp.h:60
@ ucp_Mn
Definition pcre2_ucp.h:83
@ ucp_Lu
Definition pcre2_ucp.h:80
@ ucp_Lt
Definition pcre2_ucp.h:79
@ ucp_Ll
Definition pcre2_ucp.h:76
@ ucp_Pc
Definition pcre2_ucp.h:87
int PRIV valid_utf(PCRE2_SPTR string, PCRE2_SIZE length, PCRE2_SIZE *erroroffset)
BOOL PRIV xclass(uint32_t c, PCRE2_SPTR data, BOOL utf)
#define LINK_SIZE
Definition config.h:98
PHP_JSON_API size_t int options
Definition php_json.h:102
#define UINT32_MAX
p
Definition session.c:1105
struct RWS_anchor * next
int(* callout)(pcre2_callout_block *, void *)
PCRE2_SPTR start_subject
dfa_recursion_info * recursive
PCRE2_SPTR start_used_ptr
pcre2_memctl memctl
PCRE2_SPTR last_used_ptr
pcre2_callout_block * cb
PCRE2_UCHAR nl[4]
const uint8_t * tables
PCRE2_SIZE start_offset
struct dfa_recursion_info * prevrec
void *(* malloc)(size_t, void *)
void(* free)(void *, void *)
const uint8_t * tables
uint16_t newline_convention
uint8_t start_bitmap[32]
pcre2_memctl memctl
strlen(string $string)
ZEND_API void(ZEND_FASTCALL *zend_touch_vm_stack_data)(void *vm_stack_data)
value