php-internal-docs 8.4.8
Unofficial docs for php/php-src
Loading...
Searching...
No Matches
pcre2_jit_simd_inc.h
Go to the documentation of this file.
1/*************************************************
2* Perl-Compatible Regular Expressions *
3*************************************************/
4
5/* PCRE is a library of functions to support regular expressions whose syntax
6and semantics are as close as possible to those of the Perl 5 language.
7
8 Written by Philip Hazel
9 This module by Zoltan Herczeg
10 Original API code Copyright (c) 1997-2012 University of Cambridge
11 New API code Copyright (c) 2016-2019 University of Cambridge
12
13-----------------------------------------------------------------------------
14Redistribution and use in source and binary forms, with or without
15modification, are permitted provided that the following conditions are met:
16
17 * Redistributions of source code must retain the above copyright notice,
18 this list of conditions and the following disclaimer.
19
20 * Redistributions in binary form must reproduce the above copyright
21 notice, this list of conditions and the following disclaimer in the
22 documentation and/or other materials provided with the distribution.
23
24 * Neither the name of the University of Cambridge nor the names of its
25 contributors may be used to endorse or promote products derived from
26 this software without specific prior written permission.
27
28THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
29AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
30IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
31ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
32LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
33CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
34SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
35INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
36CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
37ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
38POSSIBILITY OF SUCH DAMAGE.
39-----------------------------------------------------------------------------
40*/
41
42#if !(defined SUPPORT_VALGRIND)
43
44#if ((defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86) \
45 || (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X) \
46 || (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64))
47
/* Selects how fast_forward_char_pair_sse2_compare() tests a vector of subject
   data against its comparison registers. The callers in this file pick the
   mode from the two characters being searched for. */
48typedef enum {
 /* Both characters are equal: one equality compare against cmp1. */
49 vector_compare_match1,
 /* The characters differ in exactly one bit: OR that bit (cmp2) into the
    data first, then do one equality compare against cmp1. */
50 vector_compare_match1i,
 /* Any other pair: compare against cmp1 and cmp2 separately and OR the
    two results. */
51 vector_compare_match2,
52} vector_compare_type;
53
54#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
/* Returns the upper bound, in code units, used for the distance between the
   two character offsets handled by fast_forward_char_pair_simd() (that
   function asserts its offset difference against this value).
   The constants equal (16 / bytes-per-code-unit) - 1. The commented-out
   AVX2 variants show the values a 32-byte vector would use. */
55static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
56{
57#if PCRE2_CODE_UNIT_WIDTH == 8
58/* The AVX2 code path is currently disabled. */
59/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 31 : 15; */
60return 15;
61#elif PCRE2_CODE_UNIT_WIDTH == 16
62/* The AVX2 code path is currently disabled. */
63/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 15 : 7; */
64return 7;
65#elif PCRE2_CODE_UNIT_WIDTH == 32
66/* The AVX2 code path is currently disabled. */
67/* return sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? 7 : 3; */
68return 3;
69#else
70#error "Unsupported unit width"
71#endif
72}
73#else /* !SLJIT_CONFIG_X86 */
/* Non-x86 variant (selected by the enclosing #else): returns the same
   per-width constants as the x86 variant, i.e.
   (16 / bytes-per-code-unit) - 1 code units. */
74static SLJIT_INLINE sljit_s32 max_fast_forward_char_pair_offset(void)
75{
76#if PCRE2_CODE_UNIT_WIDTH == 8
77return 15;
78#elif PCRE2_CODE_UNIT_WIDTH == 16
79return 7;
80#elif PCRE2_CODE_UNIT_WIDTH == 32
81return 3;
82#else
83#error "Unsupported unit width"
84#endif
85}
86#endif /* SLJIT_CONFIG_X86 */
87
88#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Emits a test on the code unit held in 'reg' and returns the resulting
   conditional jump. The jump is taken when the unit is NOT a trailing unit
   of a multi-unit character:
     8-bit:  (unit & 0xc0)   != 0x80   (not a UTF-8 continuation byte)
     16-bit: (unit & 0xfc00) != 0xdc00 (not a UTF-16 low surrogate)
   'reg' is overwritten with the masked value.
   NOTE(review): 'compiler' is not referenced by name here; presumably the
   OP2/CMP macros expand to calls that use it - confirm. */
89static struct sljit_jump *jump_if_utf_char_start(struct sljit_compiler *compiler, sljit_s32 reg)
90{
91#if PCRE2_CODE_UNIT_WIDTH == 8
92OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xc0);
93return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0x80);
94#elif PCRE2_CODE_UNIT_WIDTH == 16
95OP2(SLJIT_AND, reg, 0, reg, 0, SLJIT_IMM, 0xfc00);
96return CMP(SLJIT_NOT_EQUAL, reg, 0, SLJIT_IMM, 0xdc00);
97#else
98#error "Unknown code width"
99#endif
100}
101#endif
102
103#endif /* SLJIT_CONFIG_X86 || SLJIT_CONFIG_S390X || SLJIT_CONFIG_LOONGARCH_64 */
104
105#if (defined SLJIT_CONFIG_X86 && SLJIT_CONFIG_X86)
106
107static sljit_s32 character_to_int32(PCRE2_UCHAR chr)
108{
110#if PCRE2_CODE_UNIT_WIDTH == 8
111#define SIMD_COMPARE_TYPE_INDEX 0
112return (sljit_s32)((value << 24) | (value << 16) | (value << 8) | value);
113#elif PCRE2_CODE_UNIT_WIDTH == 16
114#define SIMD_COMPARE_TYPE_INDEX 1
115return (sljit_s32)((value << 16) | value);
116#elif PCRE2_CODE_UNIT_WIDTH == 32
117#define SIMD_COMPARE_TYPE_INDEX 2
118return (sljit_s32)(value);
119#else
120#error "Unsupported unit width"
121#endif
122}
123
/* Emits at most one hand-encoded SSE2/AVX2 instruction for one step of a
   vector character comparison. Callers invoke it for step = 0, 1, 2, 3 in
   order; the steps that emit something depend on compare_type:

     vector_compare_match1   step 2: PCMPEQ dst, cmp1
     vector_compare_match1i  step 0: POR    dst, cmp2
                             step 2: PCMPEQ dst, cmp1
     vector_compare_match2   128-bit registers:
                             step 0: MOVDQA tmp, dst
                             step 1: PCMPEQ dst, cmp1
                             step 2: PCMPEQ tmp, cmp2
                             step 3: POR    dst, tmp
                             256-bit registers: step 0 is redirected to the
                             "case 2" compare with dst as the extra source
                             operand, and the real step 2 emits nothing.

   compiler      SLJIT compiler that receives the raw instruction bytes
   compare_type  comparison mode, see vector_compare_type
   reg_type      SLJIT_SIMD_REG_128 selects the 0x66 0x0f prefix; any other
                 value selects the two byte VEX prefix
   step          0..3 (asserted)
   dst_ind       machine index of the data/result register
   cmp1_ind      machine index of the first comparison register
   cmp2_ind      machine index of the second comparison (or bit mask) register
   tmp_ind       machine index of a scratch register (match2 only)

   The last byte written is a register-direct ModRM byte:
   0xc0 | (reg << 3) | rm.
   NOTE(review): the indices are used unmasked in a 3-bit field, so they are
   presumably all below 8 - confirm against the callers. */
124static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
125 sljit_s32 reg_type, int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
126{
127sljit_u8 instruction[4];
128
/* Bytes 0-1 hold the prefix; bytes 2-3 (opcode, ModRM) are filled per step. */
129if (reg_type == SLJIT_SIMD_REG_128)
130 {
131 instruction[0] = 0x66;
132 instruction[1] = 0x0f;
133 }
134else
135 {
136 /* Two byte VEX prefix. */
137 instruction[0] = 0xc5;
138 instruction[1] = 0xfd;
139 }
140
141SLJIT_ASSERT(step >= 0 && step <= 3);
142
/* match1 / match1i: at most two instructions, emitted in steps 0 and 2. */
143if (compare_type != vector_compare_match2)
144 {
145 if (step == 0)
146 {
147 if (compare_type == vector_compare_match1i)
148 {
149 /* POR xmm1, xmm2/m128 */
 /* For the VEX form, XOR the destination index into the second prefix
    byte (the inverted extra-source field), making dst the first source. */
150 if (reg_type == SLJIT_SIMD_REG_256)
151 instruction[1] ^= (dst_ind << 3);
152
153 /* Prefix is filled. */
154 instruction[2] = 0xeb;
155 instruction[3] = 0xc0 | (dst_ind << 3) | cmp2_ind;
156 sljit_emit_op_custom(compiler, instruction, 4);
157 }
158 return;
159 }
160
 /* Steps 1 and 3 emit nothing in these modes. */
161 if (step != 2)
162 return;
163
164 /* PCMPEQB/W/D xmm1, xmm2/m128 */
165 if (reg_type == SLJIT_SIMD_REG_256)
166 instruction[1] ^= (dst_ind << 3);
167
168 /* Prefix is filled. */
 /* 0x74 is PCMPEQB; adding SIMD_COMPARE_TYPE_INDEX selects the W/D forms. */
169 instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
170 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
171 sljit_emit_op_custom(compiler, instruction, 4);
172 return;
173 }
174
/* match2 with VEX encoding: no register copy is emitted. Step 0 runs the
   "case 2" compare with dst folded into the prefix, so the original step 2
   must emit nothing. */
175if (reg_type == SLJIT_SIMD_REG_256)
176 {
177 if (step == 2)
178 return;
179
180 if (step == 0)
181 {
182 step = 2;
183 instruction[1] ^= (dst_ind << 3);
184 }
185 }
186
187switch (step)
188 {
189 case 0:
190 SLJIT_ASSERT(reg_type == SLJIT_SIMD_REG_128);
191
192 /* MOVDQA xmm1, xmm2/m128 */
193 /* Prefix is filled. */
 /* Copy the data into tmp so it can be compared against cmp2 later. */
194 instruction[2] = 0x6f;
195 instruction[3] = 0xc0 | (tmp_ind << 3) | dst_ind;
196 sljit_emit_op_custom(compiler, instruction, 4);
197 return;
198
199 case 1:
200 /* PCMPEQB/W/D xmm1, xmm2/m128 */
201 if (reg_type == SLJIT_SIMD_REG_256)
202 instruction[1] ^= (dst_ind << 3);
203
204 /* Prefix is filled. */
205 instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
206 instruction[3] = 0xc0 | (dst_ind << 3) | cmp1_ind;
207 sljit_emit_op_custom(compiler, instruction, 4);
208 return;
209
210 case 2:
211 /* PCMPEQB/W/D xmm1, xmm2/m128 */
212 /* Prefix is filled. */
 /* The result goes to tmp, compared against cmp2. */
213 instruction[2] = 0x74 + SIMD_COMPARE_TYPE_INDEX;
214 instruction[3] = 0xc0 | (tmp_ind << 3) | cmp2_ind;
215 sljit_emit_op_custom(compiler, instruction, 4);
216 return;
217
218 case 3:
219 /* POR xmm1, xmm2/m128 */
220 if (reg_type == SLJIT_SIMD_REG_256)
221 instruction[1] ^= (dst_ind << 3);
222
223 /* Prefix is filled. */
 /* Merge both compare results into dst. */
224 instruction[2] = 0xeb;
225 instruction[3] = 0xc0 | (dst_ind << 3) | tmp_ind;
226 sljit_emit_op_custom(compiler, instruction, 4);
227 return;
228 }
229}
230
231#define JIT_HAS_FAST_FORWARD_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
232
233static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
234{
235DEFINE_COMPILER;
236sljit_u8 instruction[8];
237/* The AVX2 code path is currently disabled. */
238/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
241struct sljit_label *start;
242#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
243struct sljit_label *restart;
244#endif
245struct sljit_jump *quit;
246struct sljit_jump *partial_quit[2];
247vector_compare_type compare_type = vector_compare_match1;
253sljit_u32 bit = 0;
254int i;
255
257
258if (char1 != char2)
259 {
260 bit = char1 ^ char2;
261 compare_type = vector_compare_match1i;
262
263 if (!is_powerof2(bit))
264 {
265 bit = 0;
266 compare_type = vector_compare_match2;
267 }
268 }
269
270partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
271if (common->mode == PCRE2_JIT_COMPLETE)
272 add_jump(compiler, &common->failed_match, partial_quit[0]);
273
274/* First part (unaligned start) */
276sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
277
278if (char1 != char2)
279 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
280
281OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
282
284
285if (char1 != char2)
287
288#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
289restart = LABEL();
290#endif
291
292value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
293OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
294OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
295
297sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
298
299for (i = 0; i < 4; i++)
300 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
301
302sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
303OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
304OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
305
306quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
307
308OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
309
310/* Second part (aligned) */
311start = LABEL();
312
313value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
314OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
315
316partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
317if (common->mode == PCRE2_JIT_COMPLETE)
318 add_jump(compiler, &common->failed_match, partial_quit[1]);
319
321sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
322for (i = 0; i < 4; i++)
323 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
324
325sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
326CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
327
328JUMPHERE(quit);
329
330SLJIT_ASSERT(tmp1_reg_ind < 8);
331/* BSF r32, r/m32 */
332instruction[0] = 0x0f;
333instruction[1] = 0xbc;
334instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
335sljit_emit_op_custom(compiler, instruction, 3);
336
337OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
338
339if (common->mode != PCRE2_JIT_COMPLETE)
340 {
341 JUMPHERE(partial_quit[0]);
342 JUMPHERE(partial_quit[1]);
343 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
344 SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
345 }
346else
347 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
348
349#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
350if (common->utf && offset > 0)
351 {
352 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
353
354 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
355
356 quit = jump_if_utf_char_start(compiler, TMP1);
357
358 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
359 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
360 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
361 JUMPTO(SLJIT_JUMP, restart);
362
363 JUMPHERE(quit);
364 }
365#endif
366}
367
368#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
369
370static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
371{
372DEFINE_COMPILER;
373sljit_u8 instruction[8];
374/* The AVX2 code path is currently disabled. */
375/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
378struct sljit_label *start;
379struct sljit_jump *quit;
380jump_list *not_found = NULL;
381vector_compare_type compare_type = vector_compare_match1;
387sljit_u32 bit = 0;
388int i;
389
390if (char1 != char2)
391 {
392 bit = char1 ^ char2;
393 compare_type = vector_compare_match1i;
394
395 if (!is_powerof2(bit))
396 {
397 bit = 0;
398 compare_type = vector_compare_match2;
399 }
400 }
401
402add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
403OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
404OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
405
406/* First part (unaligned start) */
407
409sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR1, 0, SLJIT_IMM, character_to_int32(char1 | bit));
410
411if (char1 != char2)
412 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, SLJIT_IMM, character_to_int32(bit != 0 ? bit : char2));
413
414OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
415
417
418if (char1 != char2)
420
421value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
422OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, ~value);
423OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
424
426sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
427
428for (i = 0; i < 4; i++)
429 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
430
431sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
432OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
433OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
434
435quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
436
437OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
438
439/* Second part (aligned) */
440start = LABEL();
441
442value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
443OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
444
445add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
446
448sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
449
450for (i = 0; i < 4; i++)
451 fast_forward_char_pair_sse2_compare(compiler, compare_type, reg_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
452
453sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
454CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
455
456JUMPHERE(quit);
457
458SLJIT_ASSERT(tmp1_reg_ind < 8);
459/* BSF r32, r/m32 */
460instruction[0] = 0x0f;
461instruction[1] = 0xbc;
462instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
463sljit_emit_op_custom(compiler, instruction, 3);
464
465OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
466add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
467
468OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
469return not_found;
470}
471
472#ifndef _WIN64
473
474#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD (sljit_has_cpu_feature(SLJIT_HAS_SIMD))
475
476static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
477 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
478{
479DEFINE_COMPILER;
480sljit_u8 instruction[8];
481/* The AVX2 code path is currently disabled. */
482/* sljit_s32 reg_type = sljit_has_cpu_feature(SLJIT_HAS_AVX2) ? SLJIT_SIMD_REG_256 : SLJIT_SIMD_REG_128; */
485vector_compare_type compare1_type = vector_compare_match1;
486vector_compare_type compare2_type = vector_compare_match1;
487sljit_u32 bit1 = 0;
488sljit_u32 bit2 = 0;
489sljit_u32 diff = IN_UCHARS(offs1 - offs2);
499struct sljit_label *start;
500#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
501struct sljit_label *restart;
502#endif
503struct sljit_jump *jump[2];
504int i;
505
506SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2 && offs2 >= 0);
507SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));
508
509/* Initialize. */
510if (common->match_end_ptr != 0)
511 {
512 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
513 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
514 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
515
516 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
517 SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
518 }
519
520OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
521add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
522
523if (char1a == char1b)
524 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
525else
526 {
527 bit1 = char1a ^ char1b;
528 if (is_powerof2(bit1))
529 {
530 compare1_type = vector_compare_match1i;
531 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a | bit1));
532 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit1));
533 }
534 else
535 {
536 compare1_type = vector_compare_match2;
537 bit1 = 0;
538 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char1a));
539 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char1b));
540 }
541 }
542
544sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR2, 0, TMP1, 0);
545
546if (char1a != char1b)
547 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR4, 0, TMP2, 0);
548
549if (char2a == char2b)
550 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
551else
552 {
553 bit2 = char2a ^ char2b;
554 if (is_powerof2(bit2))
555 {
556 compare2_type = vector_compare_match1i;
557 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a | bit2));
558 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(bit2));
559 }
560 else
561 {
562 compare2_type = vector_compare_match2;
563 bit2 = 0;
564 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, character_to_int32(char2a));
565 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, character_to_int32(char2b));
566 }
567 }
568
569sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR3, 0, TMP1, 0);
570
571if (char2a != char2b)
572 sljit_emit_simd_lane_mov(compiler, value, SLJIT_FR5, 0, TMP2, 0);
573
575if (char1a != char1b)
577
579if (char2a != char2b)
581
582#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
583restart = LABEL();
584#endif
585
586OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
587OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
588value = (reg_type == SLJIT_SIMD_REG_256) ? ~0x1f : ~0xf;
589OP2(SLJIT_AND, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
590
592sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
593
594jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
595
596sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
597jump[1] = JUMP(SLJIT_JUMP);
598
599JUMPHERE(jump[0]);
600
601if (reg_type == SLJIT_SIMD_REG_256)
602 {
603 if (diff != 16)
604 {
605 /* PSLLDQ ymm1, ymm2, imm8 */
606 instruction[0] = 0xc5;
607 instruction[1] = (sljit_u8)(0xf9 ^ (data2_ind << 3));
608 instruction[2] = 0x73;
609 instruction[3] = 0xc0 | (7 << 3) | data1_ind;
610 instruction[4] = diff & 0xf;
611 sljit_emit_op_custom(compiler, instruction, 5);
612 }
613
614 instruction[0] = 0xc4;
615 instruction[1] = 0xe3;
616 if (diff < 16)
617 {
618 /* VINSERTI128 xmm1, xmm2, xmm3/m128 */
619 /* instruction[0] = 0xc4; */
620 /* instruction[1] = 0xe3; */
621 instruction[2] = (sljit_u8)(0x7d ^ (data2_ind << 3));
622 instruction[3] = 0x38;
624 instruction[4] = 0x40 | (data2_ind << 3) | sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
625 instruction[5] = (sljit_u8)(16 - diff);
626 instruction[6] = 1;
627 sljit_emit_op_custom(compiler, instruction, 7);
628 }
629 else
630 {
631 /* VPERM2I128 xmm1, xmm2, xmm3/m128 */
632 /* instruction[0] = 0xc4; */
633 /* instruction[1] = 0xe3; */
634 value = (diff == 16) ? data1_ind : data2_ind;
635 instruction[2] = (sljit_u8)(0x7d ^ (value << 3));
636 instruction[3] = 0x46;
637 instruction[4] = 0xc0 | (data2_ind << 3) | value;
638 instruction[5] = 0x08;
639 sljit_emit_op_custom(compiler, instruction, 6);
640 }
641 }
642else
643 {
644 /* MOVDQA xmm1, xmm2/m128 */
645 instruction[0] = 0x66;
646 instruction[1] = 0x0f;
647 instruction[2] = 0x6f;
648 instruction[3] = 0xc0 | (data2_ind << 3) | data1_ind;
649 sljit_emit_op_custom(compiler, instruction, 4);
650
651 /* PSLLDQ xmm1, imm8 */
652 /* instruction[0] = 0x66; */
653 /* instruction[1] = 0x0f; */
654 instruction[2] = 0x73;
655 instruction[3] = 0xc0 | (7 << 3) | data2_ind;
656 instruction[4] = diff;
657 sljit_emit_op_custom(compiler, instruction, 5);
658 }
659
660JUMPHERE(jump[1]);
661
662value = (reg_type == SLJIT_SIMD_REG_256) ? 0x1f : 0xf;
663OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, value);
664
665for (i = 0; i < 4; i++)
666 {
667 fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
668 fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
669 }
670
672sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
673
674/* Ignore matches before the first STR_PTR. */
675OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
676OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
677
678jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
679
680OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
681
682/* Main loop. */
683start = LABEL();
684
685value = (reg_type == SLJIT_SIMD_REG_256) ? 32 : 16;
686OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, value);
687add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
688
690sljit_emit_simd_mov(compiler, reg_type | value, SLJIT_FR0, SLJIT_MEM1(STR_PTR), 0);
691sljit_emit_simd_mov(compiler, reg_type, SLJIT_FR1, SLJIT_MEM1(STR_PTR), -(sljit_sw)diff);
692
693for (i = 0; i < 4; i++)
694 {
695 fast_forward_char_pair_sse2_compare(compiler, compare1_type, reg_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
696 fast_forward_char_pair_sse2_compare(compiler, compare2_type, reg_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
697 }
698
700sljit_emit_simd_sign(compiler, SLJIT_SIMD_STORE | reg_type | SLJIT_SIMD_ELEM_8, SLJIT_FR0, TMP1, 0);
701
702CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
703
704JUMPHERE(jump[0]);
705
706SLJIT_ASSERT(tmp1_reg_ind < 8);
707/* BSF r32, r/m32 */
708instruction[0] = 0x0f;
709instruction[1] = 0xbc;
710instruction[2] = 0xc0 | (tmp1_reg_ind << 3) | tmp1_reg_ind;
711sljit_emit_op_custom(compiler, instruction, 3);
712
713OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
714
715add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
716
717#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
718if (common->utf)
719 {
720 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
721
722 jump[0] = jump_if_utf_char_start(compiler, TMP1);
723
724 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
725 CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
726
727 add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
728
729 JUMPHERE(jump[0]);
730 }
731#endif
732
733OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
734
735if (common->match_end_ptr != 0)
736 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
737}
738
739#endif /* !_WIN64 */
740
741#undef SIMD_COMPARE_TYPE_INDEX
742
743#endif /* SLJIT_CONFIG_X86 */
744
745#if (defined SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 && (defined __ARM_NEON || defined __ARM_NEON__))
746
747#include <arm_neon.h>
748
/* Packs up to four single-byte characters into one unsigned int. The ARM64
   code below fills the c1..c4 members and passes 'x' as an immediate.
   NOTE(review): the members are unsigned char, so only values that fit in
   one byte survive the assignment - confirm for 16/32-bit code units.
   NOTE(review): which byte of 'x' each member occupies depends on the
   target's byte order. */
749typedef union {
750 unsigned int x;
751 struct { unsigned char c1, c2, c3, c4; } c;
752} int_char;
753
754#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
/* Returns non-zero when the code unit at 's' is a trailing unit of a
   multi-unit character, zero otherwise:
     8-bit:  (*s & 0xc0)   == 0x80   (UTF-8 continuation byte)
     16-bit: (*s & 0xfc00) == 0xdc00 (UTF-16 low surrogate)
   Only compiled for 8- and 16-bit code units (see the enclosing #if). */
755static SLJIT_INLINE int utf_continue(PCRE2_SPTR s)
756{
757#if PCRE2_CODE_UNIT_WIDTH == 8
758return (*s & 0xc0) == 0x80;
759#elif PCRE2_CODE_UNIT_WIDTH == 16
760return (*s & 0xfc00) == 0xdc00;
761#else
762#error "Unknown code width"
763#endif
764}
765#endif /* SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32 */
766
/* Width-specific aliases for the NEON types and intrinsics used below.
   VECTOR_FACTOR matches the lane count of vect_t (16, 8 or 4), and
   quad_word overlays those lanes with two 64-bit words.
   The final #else is taken for every width other than 8 and 16; it is
   written for 32-bit code units and does not check the width itself. */
767#if PCRE2_CODE_UNIT_WIDTH == 8
768# define VECTOR_FACTOR 16
769# define vect_t uint8x16_t
770# define VLD1Q(X) vld1q_u8((sljit_u8 *)(X))
771# define VCEQQ vceqq_u8
772# define VORRQ vorrq_u8
773# define VST1Q vst1q_u8
774# define VDUPQ vdupq_n_u8
775# define VEXTQ vextq_u8
776# define VANDQ vandq_u8
777typedef union {
778 uint8_t mem[16];
779 uint64_t dw[2];
780} quad_word;
781#elif PCRE2_CODE_UNIT_WIDTH == 16
782# define VECTOR_FACTOR 8
783# define vect_t uint16x8_t
784# define VLD1Q(X) vld1q_u16((sljit_u16 *)(X))
785# define VCEQQ vceqq_u16
786# define VORRQ vorrq_u16
787# define VST1Q vst1q_u16
788# define VDUPQ vdupq_n_u16
789# define VEXTQ vextq_u16
790# define VANDQ vandq_u16
791typedef union {
792 uint16_t mem[8];
793 uint64_t dw[2];
794} quad_word;
795#else
796# define VECTOR_FACTOR 4
797# define vect_t uint32x4_t
798# define VLD1Q(X) vld1q_u32((sljit_u32 *)(X))
799# define VCEQQ vceqq_u32
800# define VORRQ vorrq_u32
801# define VST1Q vst1q_u32
802# define VDUPQ vdupq_n_u32
803# define VEXTQ vextq_u32
804# define VANDQ vandq_u32
805typedef union {
806 uint32_t mem[4];
807 uint64_t dw[2];
808} quad_word;
809#endif
810
/* pcre2_jit_neon_inc.h is included repeatedly as a template. Each stanza
   defines one selector macro (FFCS, FFCS_2, FFCS_MASK), includes the header,
   includes it again with FF_UTF defined when Unicode support is enabled for
   8/16-bit code units, and then undefines the selector.
   NOTE(review): presumably these includes generate the ffcs, ffcs_2 and
   ffcs_mask functions and their _utf variants called by
   fast_forward_char_simd() below - confirm in pcre2_jit_neon_inc.h. */
811#define FFCS
812#include "pcre2_jit_neon_inc.h"
813#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
814# define FF_UTF
815# include "pcre2_jit_neon_inc.h"
816# undef FF_UTF
817#endif
818#undef FFCS
819
820#define FFCS_2
821#include "pcre2_jit_neon_inc.h"
822#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
823# define FF_UTF
824# include "pcre2_jit_neon_inc.h"
825# undef FF_UTF
826#endif
827#undef FFCS_2
828
829#define FFCS_MASK
830#include "pcre2_jit_neon_inc.h"
831#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
832# define FF_UTF
833# include "pcre2_jit_neon_inc.h"
834# undef FF_UTF
835#endif
836#undef FFCS_MASK
837
838#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
839
840static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
841{
842DEFINE_COMPILER;
843int_char ic;
844struct sljit_jump *partial_quit, *quit;
845/* Save temporary registers. */
846OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
847OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS1, TMP3, 0);
848
849/* Prepare function arguments */
850OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
851GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
853
854if (char1 == char2)
855 {
856 ic.c.c1 = char1;
857 ic.c.c2 = char2;
858 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
859
860#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
861 if (common->utf && offset > 0)
862 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
863 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_utf));
864 else
865 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
867#else
868 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
870#endif
871 }
872else
873 {
874 PCRE2_UCHAR mask = char1 ^ char2;
875 if (is_powerof2(mask))
876 {
877 ic.c.c1 = char1 | mask;
878 ic.c.c2 = mask;
879 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
880
881#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
882 if (common->utf && offset > 0)
883 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
884 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask_utf));
885 else
886 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
887 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
888#else
889 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
890 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_mask));
891#endif
892 }
893 else
894 {
895 ic.c.c1 = char1;
896 ic.c.c2 = char2;
897 OP1(SLJIT_MOV, SLJIT_R4, 0, SLJIT_IMM, ic.x);
898
899#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
900 if (common->utf && offset > 0)
901 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
902 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2_utf));
903 else
904 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
905 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
906#else
907 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
908 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcs_2));
909#endif
910 }
911 }
912/* Restore registers. */
913OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
914OP1(SLJIT_MOV, TMP3, 0, SLJIT_MEM1(SLJIT_SP), LOCALS1);
915
916/* Check return value. */
917partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
918if (common->mode == PCRE2_JIT_COMPLETE)
919 add_jump(compiler, &common->failed_match, partial_quit);
920
921/* Fast forward STR_PTR to the result of memchr. */
922OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
923if (common->mode != PCRE2_JIT_COMPLETE)
924 {
926 JUMPHERE(partial_quit);
927 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
928 SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
929 JUMPHERE(quit);
930 }
931}
932
/* Comparison mode for fast_forward_char_pair_compare(). */
933typedef enum {
 /* One equality compare against cmp1. */
934 compare_match1,
 /* OR cmp2 into the data, then one equality compare against cmp1. */
935 compare_match1i,
 /* Compare against cmp1 and cmp2 separately and OR the results. */
936 compare_match2,
937} compare_type;
938
/* Compares every lane of 'dst' according to 'ctype' and returns the
   lane-wise result produced by VCEQQ:
     compare_match2   (dst == cmp1) | (dst == cmp2)
     compare_match1i  (dst | cmp2) == cmp1
     compare_match1   dst == cmp1   (cmp2 is not used) */
939static inline vect_t fast_forward_char_pair_compare(compare_type ctype, vect_t dst, vect_t cmp1, vect_t cmp2)
940{
941if (ctype == compare_match2)
942 {
 /* Keep a copy of the data for the second compare. */
943 vect_t tmp = dst;
944 dst = VCEQQ(dst, cmp1);
945 tmp = VCEQQ(tmp, cmp2);
946 dst = VORRQ(dst, tmp);
947 return dst;
948 }
949
/* match1i: OR cmp2 into the data before the single compare shared with
   match1 below. */
950if (ctype == compare_match1i)
951 dst = VORRQ(dst, cmp2);
952dst = VCEQQ(dst, cmp1);
953return dst;
954}
955
/* ARM64 variant: returns VECTOR_FACTOR - 1 for the configured code unit
   width (15, 7 or 3), written as literals.
   NOTE(review): this variant returns sljit_u32, whereas the variants earlier
   in this file return sljit_s32 - confirm the difference is intentional. */
956static SLJIT_INLINE sljit_u32 max_fast_forward_char_pair_offset(void)
957{
958#if PCRE2_CODE_UNIT_WIDTH == 8
959return 15;
960#elif PCRE2_CODE_UNIT_WIDTH == 16
961return 7;
962#elif PCRE2_CODE_UNIT_WIDTH == 32
963return 3;
964#else
965#error "Unsupported unit width"
966#endif
967}
968
969/* ARM doesn't have a shift left across lanes. */
/* Returns 'a' with its lanes moved up by n positions: the n lowest lanes of
   the result are zero and the n highest lanes of 'a' are dropped.
   n must satisfy 0 < n < VECTOR_FACTOR (asserted); for any other value 'a'
   is returned unchanged by the default branch.
   The switch exists because VEXTQ needs a compile-time constant lane index,
   so every supported n gets its own case.
   NOTE(review): the helper macro C is not #undef'd within this function. */
970static SLJIT_INLINE vect_t shift_left_n_lanes(vect_t a, sljit_u8 n)
971{
972vect_t zero = VDUPQ(0);
973SLJIT_ASSERT(0 < n && n < VECTOR_FACTOR);
974/* VEXTQ takes an immediate as last argument. */
/* VEXTQ(zero, a, VECTOR_FACTOR - X) takes the top X lanes of 'zero' followed
   by the low VECTOR_FACTOR - X lanes of 'a'. */
975#define C(X) case X: return VEXTQ(zero, a, VECTOR_FACTOR - X);
976switch (n)
977 {
978 C(1); C(2); C(3);
979#if PCRE2_CODE_UNIT_WIDTH != 32
980 C(4); C(5); C(6); C(7);
981# if PCRE2_CODE_UNIT_WIDTH != 16
982 C(8); C(9); C(10); C(11); C(12); C(13); C(14); C(15);
983# endif
984#endif
985 default:
986 /* Based on the ASSERT(0 < n && n < VECTOR_FACTOR) above, this won't
987 happen. The return is still here for compilers to not warn. */
988 return a;
989 }
990}
991
/* Second round of template includes of pcre2_jit_neon_inc.h, for the
   character-pair search. FFCPS stays defined for the whole stanza.
   FFCPS_DIFF1 covers the FFCPS_0 and FFCPS_1 includes, and FFCPS_CHAR1A2A
   covers only the FFCPS_0 include. Each include is repeated with FF_UTF
   when Unicode support is enabled for 8/16-bit code units.
   NOTE(review): presumably these includes generate ffcps_0, ffcps_1 and
   ffcps_default and their _utf variants referenced later in this file -
   confirm in pcre2_jit_neon_inc.h.
   NOTE(review): FFCPS_DEFAULT is not #undef'd here, unlike FFCPS_0 and
   FFCPS_1 - confirm that leaving it defined is intended. */
992#define FFCPS
993#define FFCPS_DIFF1
994#define FFCPS_CHAR1A2A
995
996#define FFCPS_0
997#include "pcre2_jit_neon_inc.h"
998#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
999# define FF_UTF
1000# include "pcre2_jit_neon_inc.h"
1001# undef FF_UTF
1002#endif
1003#undef FFCPS_0
1004
1005#undef FFCPS_CHAR1A2A
1006
1007#define FFCPS_1
1008#include "pcre2_jit_neon_inc.h"
1009#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1010# define FF_UTF
1011# include "pcre2_jit_neon_inc.h"
1012# undef FF_UTF
1013#endif
1014#undef FFCPS_1
1015
1016#undef FFCPS_DIFF1
1017
1018#define FFCPS_DEFAULT
1019#include "pcre2_jit_neon_inc.h"
1020#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1021# define FF_UTF
1022# include "pcre2_jit_neon_inc.h"
1023# undef FF_UTF
1024#endif
1025#undef FFCPS
1026
1027#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1028
1029static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1030 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1031{
1032DEFINE_COMPILER;
1033sljit_u32 diff = IN_UCHARS(offs1 - offs2);
1034struct sljit_jump *partial_quit;
1035int_char ic;
1036SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1037SLJIT_ASSERT(diff <= IN_UCHARS(max_fast_forward_char_pair_offset()));
1038SLJIT_ASSERT(compiler->scratches == 5);
1039
1040/* Save temporary register STR_PTR. */
1041OP1(SLJIT_MOV, SLJIT_MEM1(SLJIT_SP), LOCALS0, STR_PTR, 0);
1042
1043/* Prepare arguments for the function call. */
1044if (common->match_end_ptr == 0)
1045 OP1(SLJIT_MOV, SLJIT_R0, 0, STR_END, 0);
1046else
1047 {
1048 OP1(SLJIT_MOV, SLJIT_R0, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1049 OP2(SLJIT_ADD, SLJIT_R0, 0, SLJIT_R0, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1050
1051 OP2U(SLJIT_SUB | SLJIT_SET_LESS, STR_END, 0, SLJIT_R0, 0);
1052 SELECT(SLJIT_LESS, SLJIT_R0, STR_END, 0, SLJIT_R0);
1053 }
1054
1055GET_LOCAL_BASE(SLJIT_R1, 0, LOCALS0);
1056OP1(SLJIT_MOV_S32, SLJIT_R2, 0, SLJIT_IMM, offs1);
1057OP1(SLJIT_MOV_S32, SLJIT_R3, 0, SLJIT_IMM, offs2);
1058ic.c.c1 = char1a;
1059ic.c.c2 = char1b;
1060ic.c.c3 = char2a;
1061ic.c.c4 = char2b;
1062OP1(SLJIT_MOV_U32, SLJIT_R4, 0, SLJIT_IMM, ic.x);
1063
1064if (diff == 1) {
1065 if (char1a == char1b && char2a == char2b) {
1066#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1067 if (common->utf)
1068 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1069 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0_utf));
1070 else
1071#endif
1072 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1073 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_0));
1074 } else {
1075#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1076 if (common->utf)
1077 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1078 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1_utf));
1079 else
1080#endif
1081 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1082 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_1));
1083 }
1084} else {
1085#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1086 if (common->utf)
1087 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1088 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default_utf));
1089 else
1090#endif
1091 sljit_emit_icall(compiler, SLJIT_CALL, SLJIT_ARGS4(W, W, W, W, W),
1092 SLJIT_IMM, SLJIT_FUNC_ADDR(ffcps_default));
1093}
1094
1095/* Restore STR_PTR register. */
1096OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_MEM1(SLJIT_SP), LOCALS0);
1097
1098/* Check return value. */
1099partial_quit = CMP(SLJIT_EQUAL, SLJIT_RETURN_REG, 0, SLJIT_IMM, 0);
1100add_jump(compiler, &common->failed_match, partial_quit);
1101
1102/* Fast forward STR_PTR to the result of memchr. */
1103OP1(SLJIT_MOV, STR_PTR, 0, SLJIT_RETURN_REG, 0);
1104
1105JUMPHERE(partial_quit);
1106}
1107
1108#endif /* SLJIT_CONFIG_ARM_64 && SLJIT_CONFIG_ARM_64 */
1109
1110#if (defined SLJIT_CONFIG_S390X && SLJIT_CONFIG_S390X)
1111
1112#if PCRE2_CODE_UNIT_WIDTH == 8
1113#define VECTOR_ELEMENT_SIZE 0
1114#elif PCRE2_CODE_UNIT_WIDTH == 16
1115#define VECTOR_ELEMENT_SIZE 1
1116#elif PCRE2_CODE_UNIT_WIDTH == 32
1117#define VECTOR_ELEMENT_SIZE 2
1118#else
1119#error "Unsupported unit width"
1120#endif
1121
1122static void load_from_mem_vector(struct sljit_compiler *compiler, BOOL vlbb, sljit_s32 dst_vreg,
1123 sljit_s32 base_reg, sljit_s32 index_reg)
1124{
1125sljit_u16 instruction[3];
1126
1127instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | index_reg);
1128instruction[1] = (sljit_u16)(base_reg << 12);
1129instruction[2] = (sljit_u16)((0x8 << 8) | (vlbb ? 0x07 : 0x06));
1130
1131sljit_emit_op_custom(compiler, instruction, 6);
1132}
1133
1134#if PCRE2_CODE_UNIT_WIDTH == 32
1135
1136static void replicate_imm_vector(struct sljit_compiler *compiler, int step, sljit_s32 dst_vreg,
1137 PCRE2_UCHAR chr, sljit_s32 tmp_general_reg)
1138{
1139sljit_u16 instruction[3];
1140
1141SLJIT_ASSERT(step >= 0 && step <= 1);
1142
1143if (chr < 0x7fff)
1144 {
1145 if (step == 1)
1146 return;
1147
1148 /* VREPI */
1149 instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4));
1150 instruction[1] = (sljit_u16)chr;
1151 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1152 sljit_emit_op_custom(compiler, instruction, 6);
1153 return;
1154 }
1155
1156if (step == 0)
1157 {
1158 OP1(SLJIT_MOV, tmp_general_reg, 0, SLJIT_IMM, chr);
1159
1160 /* VLVG */
1161 instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | sljit_get_register_index(SLJIT_GP_REGISTER, tmp_general_reg));
1162 instruction[1] = 0;
1163 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x22);
1164 sljit_emit_op_custom(compiler, instruction, 6);
1165 return;
1166 }
1167
1168/* VREP */
1169instruction[0] = (sljit_u16)(0xe700 | (dst_vreg << 4) | dst_vreg);
1170instruction[1] = 0;
1171instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xc << 8) | 0x4d);
1172sljit_emit_op_custom(compiler, instruction, 6);
1173}
1174
1175#endif
1176
1177static void fast_forward_char_pair_sse2_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
1178 int step, sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
1179{
1180sljit_u16 instruction[3];
1181
1182SLJIT_ASSERT(step >= 0 && step <= 2);
1183
1184if (step == 1)
1185 {
1186 /* VCEQ */
1187 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1188 instruction[1] = (sljit_u16)(cmp1_ind << 12);
1189 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1190 sljit_emit_op_custom(compiler, instruction, 6);
1191 return;
1192 }
1193
1194if (compare_type != vector_compare_match2)
1195 {
1196 if (step == 0 && compare_type == vector_compare_match1i)
1197 {
1198 /* VO */
1199 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1200 instruction[1] = (sljit_u16)(cmp2_ind << 12);
1201 instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1202 sljit_emit_op_custom(compiler, instruction, 6);
1203 }
1204 return;
1205 }
1206
1207switch (step)
1208 {
1209 case 0:
1210 /* VCEQ */
1211 instruction[0] = (sljit_u16)(0xe700 | (tmp_ind << 4) | dst_ind);
1212 instruction[1] = (sljit_u16)(cmp2_ind << 12);
1213 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0xf8);
1214 sljit_emit_op_custom(compiler, instruction, 6);
1215 return;
1216
1217 case 2:
1218 /* VO */
1219 instruction[0] = (sljit_u16)(0xe700 | (dst_ind << 4) | dst_ind);
1220 instruction[1] = (sljit_u16)(tmp_ind << 12);
1221 instruction[2] = (sljit_u16)((0xe << 8) | 0x6a);
1222 sljit_emit_op_custom(compiler, instruction, 6);
1223 return;
1224 }
1225}
1226
1227#define JIT_HAS_FAST_FORWARD_CHAR_SIMD 1
1228
1229static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1230{
1231DEFINE_COMPILER;
1232sljit_u16 instruction[3];
1233struct sljit_label *start;
1234#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1235struct sljit_label *restart;
1236#endif
1237struct sljit_jump *quit;
1238struct sljit_jump *partial_quit[2];
1239vector_compare_type compare_type = vector_compare_match1;
1241sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1242sljit_s32 data_ind = 0;
1243sljit_s32 tmp_ind = 1;
1244sljit_s32 cmp1_ind = 2;
1245sljit_s32 cmp2_ind = 3;
1246sljit_s32 zero_ind = 4;
1247sljit_u32 bit = 0;
1248int i;
1249
1251
1252if (char1 != char2)
1253 {
1254 bit = char1 ^ char2;
1255 compare_type = vector_compare_match1i;
1256
1257 if (!is_powerof2(bit))
1258 {
1259 bit = 0;
1260 compare_type = vector_compare_match2;
1261 }
1262 }
1263
1264partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1265if (common->mode == PCRE2_JIT_COMPLETE)
1266 add_jump(compiler, &common->failed_match, partial_quit[0]);
1267
1268/* First part (unaligned start) */
1269
1270OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1271
1272#if PCRE2_CODE_UNIT_WIDTH != 32
1273
1274/* VREPI */
1275instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1276instruction[1] = (sljit_u16)(char1 | bit);
1277instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1278sljit_emit_op_custom(compiler, instruction, 6);
1279
1280if (char1 != char2)
1281 {
1282 /* VREPI */
1283 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1284 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1285 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1286 sljit_emit_op_custom(compiler, instruction, 6);
1287 }
1288
1289#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1290
1291for (int i = 0; i < 2; i++)
1292 {
1293 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP1);
1294
1295 if (char1 != char2)
1296 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP1);
1297 }
1298
1299#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1300
1301if (compare_type == vector_compare_match2)
1302 {
1303 /* VREPI */
1304 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1305 instruction[1] = 0;
1306 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1307 sljit_emit_op_custom(compiler, instruction, 6);
1308 }
1309
1310#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1311restart = LABEL();
1312#endif
1313
1314load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1315OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1316
1317if (compare_type != vector_compare_match2)
1318 {
1319 if (compare_type == vector_compare_match1i)
1320 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1321
1322 /* VFEE */
1323 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1324 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1325 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1326 sljit_emit_op_custom(compiler, instruction, 6);
1327 }
1328else
1329 {
1330 for (i = 0; i < 3; i++)
1331 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1332
1333 /* VFENE */
1334 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1335 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1336 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1337 sljit_emit_op_custom(compiler, instruction, 6);
1338 }
1339
1340/* VLGVB */
1341instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1342instruction[1] = 7;
1343instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1344sljit_emit_op_custom(compiler, instruction, 6);
1345
1346OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1347quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1348
1349OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1350
1351/* Second part (aligned) */
1352start = LABEL();
1353
1354OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1355
1356partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1357if (common->mode == PCRE2_JIT_COMPLETE)
1358 add_jump(compiler, &common->failed_match, partial_quit[1]);
1359
1360load_from_mem_vector(compiler, TRUE, data_ind, str_ptr_reg_ind, 0);
1361
1362if (compare_type != vector_compare_match2)
1363 {
1364 if (compare_type == vector_compare_match1i)
1365 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1366
1367 /* VFEE */
1368 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1369 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1370 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1371 sljit_emit_op_custom(compiler, instruction, 6);
1372 }
1373else
1374 {
1375 for (i = 0; i < 3; i++)
1376 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1377
1378 /* VFENE */
1379 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1380 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1381 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1382 sljit_emit_op_custom(compiler, instruction, 6);
1383 }
1384
1386JUMPTO(SLJIT_OVERFLOW, start);
1387
1388/* VLGVB */
1389instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data_ind);
1390instruction[1] = 7;
1391instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1392sljit_emit_op_custom(compiler, instruction, 6);
1393
1394OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1395
1396JUMPHERE(quit);
1397
1398if (common->mode != PCRE2_JIT_COMPLETE)
1399 {
1400 JUMPHERE(partial_quit[0]);
1401 JUMPHERE(partial_quit[1]);
1402 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
1403 SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
1404 }
1405else
1406 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1407
1408#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1409if (common->utf && offset > 0)
1410 {
1411 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1412
1413 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
1414
1415 quit = jump_if_utf_char_start(compiler, TMP1);
1416
1417 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1418 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1419
1420 OP2(SLJIT_ADD, TMP2, 0, STR_PTR, 0, SLJIT_IMM, 16);
1421 JUMPTO(SLJIT_JUMP, restart);
1422
1423 JUMPHERE(quit);
1424 }
1425#endif
1426}
1427
1428#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD 1
1429
1430static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
1431{
1432DEFINE_COMPILER;
1433sljit_u16 instruction[3];
1434struct sljit_label *start;
1435struct sljit_jump *quit;
1436jump_list *not_found = NULL;
1437vector_compare_type compare_type = vector_compare_match1;
1440sljit_s32 data_ind = 0;
1441sljit_s32 tmp_ind = 1;
1442sljit_s32 cmp1_ind = 2;
1443sljit_s32 cmp2_ind = 3;
1444sljit_s32 zero_ind = 4;
1445sljit_u32 bit = 0;
1446int i;
1447
1448if (char1 != char2)
1449 {
1450 bit = char1 ^ char2;
1451 compare_type = vector_compare_match1i;
1452
1453 if (!is_powerof2(bit))
1454 {
1455 bit = 0;
1456 compare_type = vector_compare_match2;
1457 }
1458 }
1459
1460add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1461
1462/* First part (unaligned start) */
1463
1464OP2(SLJIT_ADD, TMP2, 0, TMP1, 0, SLJIT_IMM, 16);
1465
1466#if PCRE2_CODE_UNIT_WIDTH != 32
1467
1468/* VREPI */
1469instruction[0] = (sljit_u16)(0xe700 | (cmp1_ind << 4));
1470instruction[1] = (sljit_u16)(char1 | bit);
1471instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1472sljit_emit_op_custom(compiler, instruction, 6);
1473
1474if (char1 != char2)
1475 {
1476 /* VREPI */
1477 instruction[0] = (sljit_u16)(0xe700 | (cmp2_ind << 4));
1478 instruction[1] = (sljit_u16)(bit != 0 ? bit : char2);
1479 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1480 sljit_emit_op_custom(compiler, instruction, 6);
1481 }
1482
1483#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1484
1485for (int i = 0; i < 2; i++)
1486 {
1487 replicate_imm_vector(compiler, i, cmp1_ind, char1 | bit, TMP3);
1488
1489 if (char1 != char2)
1490 replicate_imm_vector(compiler, i, cmp2_ind, bit != 0 ? bit : char2, TMP3);
1491 }
1492
1493#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1494
1495if (compare_type == vector_compare_match2)
1496 {
1497 /* VREPI */
1498 instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1499 instruction[1] = 0;
1500 instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1501 sljit_emit_op_custom(compiler, instruction, 6);
1502 }
1503
1504load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1505OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, ~15);
1506
1507if (compare_type != vector_compare_match2)
1508 {
1509 if (compare_type == vector_compare_match1i)
1510 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1511
1512 /* VFEE */
1513 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1514 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1515 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1516 sljit_emit_op_custom(compiler, instruction, 6);
1517 }
1518else
1519 {
1520 for (i = 0; i < 3; i++)
1521 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1522
1523 /* VFENE */
1524 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1525 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1526 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1527 sljit_emit_op_custom(compiler, instruction, 6);
1528 }
1529
1530/* VLGVB */
1531instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1532instruction[1] = 7;
1533instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1534sljit_emit_op_custom(compiler, instruction, 6);
1535
1536OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1537quit = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1538
1539OP2(SLJIT_SUB, TMP1, 0, TMP2, 0, SLJIT_IMM, 16);
1540
1541/* Second part (aligned) */
1542start = LABEL();
1543
1544OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, 16);
1545
1546add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1547
1548load_from_mem_vector(compiler, TRUE, data_ind, tmp1_reg_ind, 0);
1549
1550if (compare_type != vector_compare_match2)
1551 {
1552 if (compare_type == vector_compare_match1i)
1553 fast_forward_char_pair_sse2_compare(compiler, compare_type, 0, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1554
1555 /* VFEE */
1556 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1557 instruction[1] = (sljit_u16)((cmp1_ind << 12) | (1 << 4));
1558 instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0xe << 8) | 0x80);
1559 sljit_emit_op_custom(compiler, instruction, 6);
1560 }
1561else
1562 {
1563 for (i = 0; i < 3; i++)
1564 fast_forward_char_pair_sse2_compare(compiler, compare_type, i, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1565
1566 /* VFENE */
1567 instruction[0] = (sljit_u16)(0xe700 | (data_ind << 4) | data_ind);
1568 instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1569 instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1570 sljit_emit_op_custom(compiler, instruction, 6);
1571 }
1572
1574JUMPTO(SLJIT_OVERFLOW, start);
1575
1576/* VLGVB */
1577instruction[0] = (sljit_u16)(0xe700 | (tmp3_reg_ind << 4) | data_ind);
1578instruction[1] = 7;
1579instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1580sljit_emit_op_custom(compiler, instruction, 6);
1581
1582OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, TMP3, 0);
1583
1584JUMPHERE(quit);
1585add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
1586
1587return not_found;
1588}
1589
1590#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD 1
1591
1592static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
1593 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
1594{
1595DEFINE_COMPILER;
1596sljit_u16 instruction[3];
1597struct sljit_label *start;
1598#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1599struct sljit_label *restart;
1600#endif
1601struct sljit_jump *quit;
1602struct sljit_jump *jump[2];
1603vector_compare_type compare1_type = vector_compare_match1;
1604vector_compare_type compare2_type = vector_compare_match1;
1605sljit_u32 bit1 = 0;
1606sljit_u32 bit2 = 0;
1607sljit_s32 diff = IN_UCHARS(offs2 - offs1);
1610sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1611sljit_s32 data1_ind = 0;
1612sljit_s32 data2_ind = 1;
1613sljit_s32 tmp1_ind = 2;
1614sljit_s32 tmp2_ind = 3;
1615sljit_s32 cmp1a_ind = 4;
1616sljit_s32 cmp1b_ind = 5;
1617sljit_s32 cmp2a_ind = 6;
1618sljit_s32 cmp2b_ind = 7;
1619sljit_s32 zero_ind = 8;
1620int i;
1621
1622SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
1623SLJIT_ASSERT(-diff <= (sljit_s32)IN_UCHARS(max_fast_forward_char_pair_offset()));
1624SLJIT_ASSERT(tmp1_reg_ind != 0 && tmp2_reg_ind != 0);
1625
1626if (char1a != char1b)
1627 {
1628 bit1 = char1a ^ char1b;
1629 compare1_type = vector_compare_match1i;
1630
1631 if (!is_powerof2(bit1))
1632 {
1633 bit1 = 0;
1634 compare1_type = vector_compare_match2;
1635 }
1636 }
1637
1638if (char2a != char2b)
1639 {
1640 bit2 = char2a ^ char2b;
1641 compare2_type = vector_compare_match1i;
1642
1643 if (!is_powerof2(bit2))
1644 {
1645 bit2 = 0;
1646 compare2_type = vector_compare_match2;
1647 }
1648 }
1649
1650/* Initialize. */
1651if (common->match_end_ptr != 0)
1652 {
1653 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
1654 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
1655 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
1656
1657 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
1658 SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
1659 }
1660
1661OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1662add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1663OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1664
1665#if PCRE2_CODE_UNIT_WIDTH != 32
1666
1667OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1668
1669/* VREPI */
1670instruction[0] = (sljit_u16)(0xe700 | (cmp1a_ind << 4));
1671instruction[1] = (sljit_u16)(char1a | bit1);
1672instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45);
1673sljit_emit_op_custom(compiler, instruction, 6);
1674
1675if (char1a != char1b)
1676 {
1677 /* VREPI */
1678 instruction[0] = (sljit_u16)(0xe700 | (cmp1b_ind << 4));
1679 instruction[1] = (sljit_u16)(bit1 != 0 ? bit1 : char1b);
1680 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1681 sljit_emit_op_custom(compiler, instruction, 6);
1682 }
1683
1684/* VREPI */
1685instruction[0] = (sljit_u16)(0xe700 | (cmp2a_ind << 4));
1686instruction[1] = (sljit_u16)(char2a | bit2);
1687/* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1688sljit_emit_op_custom(compiler, instruction, 6);
1689
1690if (char2a != char2b)
1691 {
1692 /* VREPI */
1693 instruction[0] = (sljit_u16)(0xe700 | (cmp2b_ind << 4));
1694 instruction[1] = (sljit_u16)(bit2 != 0 ? bit2 : char2b);
1695 /* instruction[2] = (sljit_u16)((VECTOR_ELEMENT_SIZE << 12) | (0x8 << 8) | 0x45); */
1696 sljit_emit_op_custom(compiler, instruction, 6);
1697 }
1698
1699#else /* PCRE2_CODE_UNIT_WIDTH == 32 */
1700
1701for (int i = 0; i < 2; i++)
1702 {
1703 replicate_imm_vector(compiler, i, cmp1a_ind, char1a | bit1, TMP1);
1704
1705 if (char1a != char1b)
1706 replicate_imm_vector(compiler, i, cmp1b_ind, bit1 != 0 ? bit1 : char1b, TMP1);
1707
1708 replicate_imm_vector(compiler, i, cmp2a_ind, char2a | bit2, TMP1);
1709
1710 if (char2a != char2b)
1711 replicate_imm_vector(compiler, i, cmp2b_ind, bit2 != 0 ? bit2 : char2b, TMP1);
1712 }
1713
1714OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1715
1716#endif /* PCRE2_CODE_UNIT_WIDTH != 32 */
1717
1718/* VREPI */
1719instruction[0] = (sljit_u16)(0xe700 | (zero_ind << 4));
1720instruction[1] = 0;
1721instruction[2] = (sljit_u16)((0x8 << 8) | 0x45);
1722sljit_emit_op_custom(compiler, instruction, 6);
1723
1724#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1725restart = LABEL();
1726#endif
1727
1728jump[0] = CMP(SLJIT_LESS, TMP1, 0, TMP2, 0);
1729load_from_mem_vector(compiler, TRUE, data2_ind, tmp1_reg_ind, 0);
1730jump[1] = JUMP(SLJIT_JUMP);
1731JUMPHERE(jump[0]);
1732load_from_mem_vector(compiler, FALSE, data2_ind, tmp1_reg_ind, 0);
1733JUMPHERE(jump[1]);
1734
1735load_from_mem_vector(compiler, TRUE, data1_ind, str_ptr_reg_ind, 0);
1736OP2(SLJIT_ADD, TMP2, 0, TMP2, 0, SLJIT_IMM, 16);
1737
1738for (i = 0; i < 3; i++)
1739 {
1740 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1741 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1742 }
1743
1744/* VN */
1745instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1746instruction[1] = (sljit_u16)(data2_ind << 12);
1747instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1748sljit_emit_op_custom(compiler, instruction, 6);
1749
1750/* VFENE */
1751instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1752instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1753instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1754sljit_emit_op_custom(compiler, instruction, 6);
1755
1756/* VLGVB */
1757instruction[0] = (sljit_u16)(0xe700 | (tmp1_reg_ind << 4) | data1_ind);
1758instruction[1] = 7;
1759instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1760sljit_emit_op_custom(compiler, instruction, 6);
1761
1762OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
1763quit = CMP(SLJIT_LESS, STR_PTR, 0, TMP2, 0);
1764
1765OP2(SLJIT_SUB, STR_PTR, 0, TMP2, 0, SLJIT_IMM, 16);
1766OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, diff);
1767
1768/* Main loop. */
1769start = LABEL();
1770
1771OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1772add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1773
1774load_from_mem_vector(compiler, FALSE, data1_ind, str_ptr_reg_ind, 0);
1775load_from_mem_vector(compiler, FALSE, data2_ind, str_ptr_reg_ind, tmp1_reg_ind);
1776
1777for (i = 0; i < 3; i++)
1778 {
1779 fast_forward_char_pair_sse2_compare(compiler, compare1_type, i, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
1780 fast_forward_char_pair_sse2_compare(compiler, compare2_type, i, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
1781 }
1782
1783/* VN */
1784instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1785instruction[1] = (sljit_u16)(data2_ind << 12);
1786instruction[2] = (sljit_u16)((0xe << 8) | 0x68);
1787sljit_emit_op_custom(compiler, instruction, 6);
1788
1789/* VFENE */
1790instruction[0] = (sljit_u16)(0xe700 | (data1_ind << 4) | data1_ind);
1791instruction[1] = (sljit_u16)((zero_ind << 12) | (1 << 4));
1792instruction[2] = (sljit_u16)((0xe << 8) | 0x81);
1793sljit_emit_op_custom(compiler, instruction, 6);
1794
1796JUMPTO(SLJIT_OVERFLOW, start);
1797
1798/* VLGVB */
1799instruction[0] = (sljit_u16)(0xe700 | (tmp2_reg_ind << 4) | data1_ind);
1800instruction[1] = 7;
1801instruction[2] = (sljit_u16)((0x4 << 8) | 0x21);
1802sljit_emit_op_custom(compiler, instruction, 6);
1803
1804OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1805
1806JUMPHERE(quit);
1807
1808add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1809
1810#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1811if (common->utf)
1812 {
1813 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
1814
1815 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
1816
1817 quit = jump_if_utf_char_start(compiler, TMP1);
1818
1819 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
1820 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
1821
1822 /* TMP1 contains diff. */
1823 OP2(SLJIT_AND, TMP2, 0, STR_PTR, 0, SLJIT_IMM, ~15);
1824 OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, -diff);
1825 JUMPTO(SLJIT_JUMP, restart);
1826
1827 JUMPHERE(quit);
1828 }
1829#endif
1830
1831OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
1832
1833if (common->match_end_ptr != 0)
1834 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
1835}
1836
1837#endif /* SLJIT_CONFIG_S390X */
1838
1839#if (defined SLJIT_CONFIG_LOONGARCH_64 && SLJIT_CONFIG_LOONGARCH_64)
1840
1841#ifdef __linux__
1842/* Using getauxval(AT_HWCAP) under Linux for detecting whether LSX is available */
1843#include <sys/auxv.h>
1844#define LOONGARCH_HWCAP_LSX (1 << 4)
1845#define HAS_LSX_SUPPORT ((getauxval(AT_HWCAP) & LOONGARCH_HWCAP_LSX) != 0)
1846#else
1847#define HAS_LSX_SUPPORT 0
1848#endif
1849
1850typedef sljit_ins sljit_u32;
1851
/* Masks of the immediate fields inside a 32 bit instruction word. */
#define SI12_IMM_MASK 0x003ffc00
#define UI5_IMM_MASK 0x00007c00
#define UI2_IMM_MASK 0x00000c00

/* Register operand fields: destination at bit 0, first source at bit 5,
second source at bit 10. The argument is parenthesized so that an expression
argument is cast and shifted as a whole. */
#define VD(vd) ((sljit_ins)(vd) << 0)
#define VJ(vj) ((sljit_ins)(vj) << 5)
#define VK(vk) ((sljit_ins)(vk) << 10)
#define RD_V(rd) ((sljit_ins)(rd) << 0)
#define RJ_V(rj) ((sljit_ins)(rj) << 5)

/* Immediate operands: shifted to bit 10 and truncated to the field width. */
#define IMM_SI12(imm) (((sljit_ins)(imm) << 10) & SI12_IMM_MASK)
#define IMM_UI5(imm) (((sljit_ins)(imm) << 10) & UI5_IMM_MASK)
#define IMM_UI2(imm) (((sljit_ins)(imm) << 10) & UI2_IMM_MASK)
1865
1866// LSX OPCODES:
1867#define VLD 0x2c000000
1868#define VOR_V 0x71268000
1869#define VAND_V 0x71260000
1870#define VBSLL_V 0x728e0000
1871#define VMSKLTZ_B 0x729c4000
1872#define VPICKVE2GR_WU 0x72f3e000
1873
1874#if PCRE2_CODE_UNIT_WIDTH == 8
1875#define VREPLGR2VR 0x729f0000
1876#define VSEQ 0x70000000
1877#elif PCRE2_CODE_UNIT_WIDTH == 16
1878#define VREPLGR2VR 0x729f0400
1879#define VSEQ 0x70008000
1880#else
1881#define VREPLGR2VR 0x729f0800
1882#define VSEQ 0x70010000
1883#endif
1884
1885static void fast_forward_char_pair_lsx_compare(struct sljit_compiler *compiler, vector_compare_type compare_type,
1886 sljit_s32 dst_ind, sljit_s32 cmp1_ind, sljit_s32 cmp2_ind, sljit_s32 tmp_ind)
1887{
1888if (compare_type != vector_compare_match2)
1889 {
1890 if (compare_type == vector_compare_match1i)
1891 {
1892 /* VOR.V vd, vj, vk */
1893 push_inst(compiler, VOR_V | VD(dst_ind) | VJ(cmp2_ind) | VK(dst_ind));
1894 }
1895
1896 /* VSEQ.B/H/W vd, vj, vk */
1897 push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
1898 return;
1899 }
1900
1901/* VBSLL.V vd, vj, ui5 */
1902push_inst(compiler, VBSLL_V | VD(tmp_ind) | VJ(dst_ind) | IMM_UI5(0));
1903
1904/* VSEQ.B/H/W vd, vj, vk */
1905push_inst(compiler, VSEQ | VD(dst_ind) | VJ(dst_ind) | VK(cmp1_ind));
1906
1907/* VSEQ.B/H/W vd, vj, vk */
1908push_inst(compiler, VSEQ | VD(tmp_ind) | VJ(tmp_ind) | VK(cmp2_ind));
1909
1910/* VOR vd, vj, vk */
1911push_inst(compiler, VOR_V | VD(dst_ind) | VJ(tmp_ind) | VK(dst_ind));
1912return;
1913}
1914
1915#define JIT_HAS_FAST_FORWARD_CHAR_SIMD HAS_LSX_SUPPORT
1916
1917static void fast_forward_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2, sljit_s32 offset)
1918{
1919DEFINE_COMPILER;
1920struct sljit_label *start;
1921#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1922struct sljit_label *restart;
1923#endif
1924struct sljit_jump *quit;
1925struct sljit_jump *partial_quit[2];
1926vector_compare_type compare_type = vector_compare_match1;
1928sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
1929sljit_s32 data_ind = 0;
1930sljit_s32 tmp_ind = 1;
1931sljit_s32 cmp1_ind = 2;
1932sljit_s32 cmp2_ind = 3;
1933sljit_u32 bit = 0;
1934
1936
1937if (char1 != char2)
1938 {
1939 bit = char1 ^ char2;
1940 compare_type = vector_compare_match1i;
1941
1942 if (!is_powerof2(bit))
1943 {
1944 bit = 0;
1945 compare_type = vector_compare_match2;
1946 }
1947 }
1948
1949partial_quit[0] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
1950if (common->mode == PCRE2_JIT_COMPLETE)
1951 add_jump(compiler, &common->failed_match, partial_quit[0]);
1952
1953/* First part (unaligned start) */
1954
1955OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
1956
1957/* VREPLGR2VR.B/H/W vd, rj */
1958push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
1959
1960if (char1 != char2)
1961 {
1962 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
1963
1964 /* VREPLGR2VR.B/H/W vd, rj */
1965 push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
1966 }
1967
1968OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
1969
1970#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
1971restart = LABEL();
1972#endif
1973
1974OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
1975OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1976
1977/* VLD vd, rj, si12 */
1978push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
1979fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
1980
1981/* VMSKLTZ.B vd, vj */
1982push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
1983
1984/* VPICKVE2GR.WU rd, vj, ui2 */
1985push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
1986
1987OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1988OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
1989
1990quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
1991
1992OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
1993
1994/* Second part (aligned) */
1995start = LABEL();
1996
1997OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
1998
1999partial_quit[1] = CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0);
2000if (common->mode == PCRE2_JIT_COMPLETE)
2001 add_jump(compiler, &common->failed_match, partial_quit[1]);
2002
2003/* VLD vd, rj, si12 */
2004push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2005fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2006
2007/* VMSKLTZ.B vd, vj */
2008push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2009
2010/* VPICKVE2GR.WU rd, vj, ui2 */
2011push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2012
2013CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2014
2015JUMPHERE(quit);
2016
2017/* CTZ.W rd, rj */
2018push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2019
2020OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
2021
2022if (common->mode != PCRE2_JIT_COMPLETE)
2023 {
2024 JUMPHERE(partial_quit[0]);
2025 JUMPHERE(partial_quit[1]);
2026 OP2U(SLJIT_SUB | SLJIT_SET_GREATER, STR_PTR, 0, STR_END, 0);
2027 SELECT(SLJIT_GREATER, STR_PTR, STR_END, 0, STR_PTR);
2028 }
2029else
2030 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2031
2032#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2033if (common->utf && offset > 0)
2034 {
2035 SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE);
2036
2037 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offset));
2038
2039 quit = jump_if_utf_char_start(compiler, TMP1);
2040
2041 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
2042 add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2043 OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
2044 JUMPTO(SLJIT_JUMP, restart);
2045
2046 JUMPHERE(quit);
2047 }
2048#endif
2049}
2050
2051#define JIT_HAS_FAST_REQUESTED_CHAR_SIMD HAS_LSX_SUPPORT
2052
2053static jump_list *fast_requested_char_simd(compiler_common *common, PCRE2_UCHAR char1, PCRE2_UCHAR char2)
2054{
2055DEFINE_COMPILER;
2056struct sljit_label *start;
2057struct sljit_jump *quit;
2058jump_list *not_found = NULL;
2059vector_compare_type compare_type = vector_compare_match1;
2061sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
2062sljit_s32 data_ind = 0;
2063sljit_s32 tmp_ind = 1;
2064sljit_s32 cmp1_ind = 2;
2065sljit_s32 cmp2_ind = 3;
2066sljit_u32 bit = 0;
2067
2068if (char1 != char2)
2069 {
2070 bit = char1 ^ char2;
2071 compare_type = vector_compare_match1i;
2072
2073 if (!is_powerof2(bit))
2074 {
2075 bit = 0;
2076 compare_type = vector_compare_match2;
2077 }
2078 }
2079
2080add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2081OP1(SLJIT_MOV, TMP2, 0, TMP1, 0);
2082OP1(SLJIT_MOV, TMP3, 0, STR_PTR, 0);
2083
2084/* First part (unaligned start) */
2085
2086OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1 | bit);
2087
2088/* VREPLGR2VR vd, rj */
2089push_inst(compiler, VREPLGR2VR | VD(cmp1_ind) | RJ_V(tmp1_reg_ind));
2090
2091if (char1 != char2)
2092 {
2093 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, bit != 0 ? bit : char2);
2094 /* VREPLGR2VR vd, rj */
2095 push_inst(compiler, VREPLGR2VR | VD(cmp2_ind) | RJ_V(tmp1_reg_ind));
2096 }
2097
2098OP1(SLJIT_MOV, STR_PTR, 0, TMP2, 0);
2099OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
2100OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2101
2102/* VLD vd, rj, si12 */
2103push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2104fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2105
2106/* VMSKLTZ.B vd, vj */
2107push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2108
2109/* VPICKVE2GR.WU rd, vj, ui2 */
2110push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2111
2112OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2113OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
2114
2115quit = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
2116
2117OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2118
2119/* Second part (aligned) */
2120start = LABEL();
2121
2122OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
2123
2124add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2125
2126/* VLD vd, rj, si12 */
2127push_inst(compiler, VLD | VD(data_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2128fast_forward_char_pair_lsx_compare(compiler, compare_type, data_ind, cmp1_ind, cmp2_ind, tmp_ind);
2129
2130/* VMSKLTZ.B vd, vj */
2131push_inst(compiler, VMSKLTZ_B | VD(tmp_ind) | VJ(data_ind));
2132
2133/* VPICKVE2GR.WU rd, vj, ui2 */
2134push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp_ind) | IMM_UI2(0));
2135
2136CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2137
2138JUMPHERE(quit);
2139
2140/* CTZ.W rd, rj */
2141push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2142
2143OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, STR_PTR, 0);
2144add_jump(compiler, &not_found, CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_END, 0));
2145
2146OP1(SLJIT_MOV, STR_PTR, 0, TMP3, 0);
2147return not_found;
2148}
2149
2150#define JIT_HAS_FAST_FORWARD_CHAR_PAIR_SIMD HAS_LSX_SUPPORT
2151
2152static void fast_forward_char_pair_simd(compiler_common *common, sljit_s32 offs1,
2153 PCRE2_UCHAR char1a, PCRE2_UCHAR char1b, sljit_s32 offs2, PCRE2_UCHAR char2a, PCRE2_UCHAR char2b)
2154{
2155DEFINE_COMPILER;
2156vector_compare_type compare1_type = vector_compare_match1;
2157vector_compare_type compare2_type = vector_compare_match1;
2158sljit_u32 bit1 = 0;
2159sljit_u32 bit2 = 0;
2160sljit_u32 diff = IN_UCHARS(offs1 - offs2);
2163sljit_s32 str_ptr_reg_ind = sljit_get_register_index(SLJIT_GP_REGISTER, STR_PTR);
2164sljit_s32 data1_ind = 0;
2165sljit_s32 data2_ind = 1;
2166sljit_s32 tmp1_ind = 2;
2167sljit_s32 tmp2_ind = 3;
2168sljit_s32 cmp1a_ind = 4;
2169sljit_s32 cmp1b_ind = 5;
2170sljit_s32 cmp2a_ind = 6;
2171sljit_s32 cmp2b_ind = 7;
2172struct sljit_label *start;
2173#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2174struct sljit_label *restart;
2175#endif
2176struct sljit_jump *jump[2];
2177
2178SLJIT_ASSERT(common->mode == PCRE2_JIT_COMPLETE && offs1 > offs2);
2179SLJIT_ASSERT(diff <= (unsigned)IN_UCHARS(max_fast_forward_char_pair_offset()));
2180
2181/* Initialize. */
2182if (common->match_end_ptr != 0)
2183 {
2184 OP1(SLJIT_MOV, TMP1, 0, SLJIT_MEM1(SLJIT_SP), common->match_end_ptr);
2185 OP2(SLJIT_ADD, TMP1, 0, TMP1, 0, SLJIT_IMM, IN_UCHARS(offs1 + 1));
2186 OP1(SLJIT_MOV, TMP3, 0, STR_END, 0);
2187
2188 OP2U(SLJIT_SUB | SLJIT_SET_LESS, TMP1, 0, STR_END, 0);
2189 SELECT(SLJIT_LESS, STR_END, TMP1, 0, STR_END);
2190 }
2191
2192OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
2193add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2194
2195if (char1a == char1b)
2196 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
2197else
2198 {
2199 bit1 = char1a ^ char1b;
2200 if (is_powerof2(bit1))
2201 {
2202 compare1_type = vector_compare_match1i;
2203 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a | bit1);
2204 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit1);
2205 }
2206 else
2207 {
2208 compare1_type = vector_compare_match2;
2209 bit1 = 0;
2210 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char1a);
2211 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char1b);
2212 }
2213 }
2214
2215/* VREPLGR2VR vd, rj */
2216push_inst(compiler, VREPLGR2VR | VD(cmp1a_ind) | RJ_V(tmp1_reg_ind));
2217
2218if (char1a != char1b)
2219 {
2220 /* VREPLGR2VR vd, rj */
2221 push_inst(compiler, VREPLGR2VR | VD(cmp1b_ind) | RJ_V(tmp2_reg_ind));
2222 }
2223
2224if (char2a == char2b)
2225 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
2226else
2227 {
2228 bit2 = char2a ^ char2b;
2229 if (is_powerof2(bit2))
2230 {
2231 compare2_type = vector_compare_match1i;
2232 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a | bit2);
2233 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, bit2);
2234 }
2235 else
2236 {
2237 compare2_type = vector_compare_match2;
2238 bit2 = 0;
2239 OP1(SLJIT_MOV, TMP1, 0, SLJIT_IMM, char2a);
2240 OP1(SLJIT_MOV, TMP2, 0, SLJIT_IMM, char2b);
2241 }
2242 }
2243
2244/* VREPLGR2VR vd, rj */
2245push_inst(compiler, VREPLGR2VR | VD(cmp2a_ind) | RJ_V(tmp1_reg_ind));
2246
2247if (char2a != char2b)
2248 {
2249 /* VREPLGR2VR vd, rj */
2250 push_inst(compiler, VREPLGR2VR | VD(cmp2b_ind) | RJ_V(tmp2_reg_ind));
2251 }
2252
2253#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2254restart = LABEL();
2255#endif
2256
2257OP2(SLJIT_SUB, TMP1, 0, STR_PTR, 0, SLJIT_IMM, diff);
2258OP1(SLJIT_MOV, TMP2, 0, STR_PTR, 0);
2259OP2(SLJIT_AND, TMP2, 0, TMP2, 0, SLJIT_IMM, 0xf);
2260OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2261
2262/* VLD vd, rj, si12 */
2263push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2264
2265jump[0] = CMP(SLJIT_GREATER_EQUAL, TMP1, 0, STR_PTR, 0);
2266
2267/* VLD vd, rj, si12 */
2268push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
2269jump[1] = JUMP(SLJIT_JUMP);
2270
2271JUMPHERE(jump[0]);
2272
2273/* VBSLL.V vd, vj, ui5 */
2274push_inst(compiler, VBSLL_V | VD(data2_ind) | VJ(data1_ind) | IMM_UI5(diff));
2275
2276JUMPHERE(jump[1]);
2277
2278fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp2_ind);
2279fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp1_ind);
2280
2281/* VAND vd, vj, vk */
2282push_inst(compiler, VOR_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));
2283
2284/* VMSKLTZ.B vd, vj */
2285push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));
2286
2287/* VPICKVE2GR.WU rd, vj, ui2 */
2288push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));
2289
2290/* Ignore matches before the first STR_PTR. */
2291OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2292OP2(SLJIT_LSHR, TMP1, 0, TMP1, 0, TMP2, 0);
2293
2294jump[0] = CMP(SLJIT_NOT_ZERO, TMP1, 0, SLJIT_IMM, 0);
2295
2296OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, TMP2, 0);
2297
2298/* Main loop. */
2299start = LABEL();
2300
2301OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, 16);
2302add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2303
2304/* VLD vd, rj, si12 */
2305push_inst(compiler, VLD | VD(data1_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(0));
2306push_inst(compiler, VLD | VD(data2_ind) | RJ_V(str_ptr_reg_ind) | IMM_SI12(-(sljit_s8)diff));
2307
2308fast_forward_char_pair_lsx_compare(compiler, compare1_type, data1_ind, cmp1a_ind, cmp1b_ind, tmp2_ind);
2309fast_forward_char_pair_lsx_compare(compiler, compare2_type, data2_ind, cmp2a_ind, cmp2b_ind, tmp1_ind);
2310
2311/* VAND.V vd, vj, vk */
2312push_inst(compiler, VAND_V | VD(data1_ind) | VJ(data1_ind) | VK(data2_ind));
2313
2314/* VMSKLTZ.B vd, vj */
2315push_inst(compiler, VMSKLTZ_B | VD(tmp1_ind) | VJ(data1_ind));
2316
2317/* VPICKVE2GR.WU rd, vj, ui2 */
2318push_inst(compiler, VPICKVE2GR_WU | RD_V(tmp1_reg_ind) | VJ(tmp1_ind) | IMM_UI2(0));
2319
2320CMPTO(SLJIT_ZERO, TMP1, 0, SLJIT_IMM, 0, start);
2321
2322JUMPHERE(jump[0]);
2323
2324/* CTZ.W rd, rj */
2325push_inst(compiler, CTZ_W | RD_V(tmp1_reg_ind) | RJ_V(tmp1_reg_ind));
2326
2327OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, TMP1, 0);
2328
2329add_jump(compiler, &common->failed_match, CMP(SLJIT_GREATER_EQUAL, STR_PTR, 0, STR_END, 0));
2330
2331#if defined SUPPORT_UNICODE && PCRE2_CODE_UNIT_WIDTH != 32
2332if (common->utf)
2333 {
2334 OP1(MOV_UCHAR, TMP1, 0, SLJIT_MEM1(STR_PTR), IN_UCHARS(-offs1));
2335
2336 jump[0] = jump_if_utf_char_start(compiler, TMP1);
2337
2338 OP2(SLJIT_ADD, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(1));
2339 CMPTO(SLJIT_LESS, STR_PTR, 0, STR_END, 0, restart);
2340
2341 add_jump(compiler, &common->failed_match, JUMP(SLJIT_JUMP));
2342
2343 JUMPHERE(jump[0]);
2344 }
2345#endif
2346
2347OP2(SLJIT_SUB, STR_PTR, 0, STR_PTR, 0, SLJIT_IMM, IN_UCHARS(offs1));
2348
2349if (common->match_end_ptr != 0)
2350 OP1(SLJIT_MOV, STR_END, 0, TMP3, 0);
2351}
2352
2353#endif /* SLJIT_CONFIG_LOONGARCH_64 */
2354
2355#endif /* !SUPPORT_VALGRIND */
chr(int $codepoint)
char s[4]
Definition cdf.c:77
zend_long n
Definition ffi.c:4979
ctype
Definition ffi.c:4208
buf start
Definition ffi.c:4687
zend_long offset
#define TRUE
Definition gd_gd.c:7
#define FALSE
Definition gd_gd.c:8
#define NULL
Definition gdcache.h:45
#define C(x)
Definition hash_gost.c:111
@ VK
Definition minilua.c:114
#define PCRE2_UCHAR
Definition pcre2.h:819
#define PCRE2_SPTR
Definition pcre2.h:820
#define PCRE2_JIT_COMPLETE
Definition pcre2.h:165
int BOOL
#define W(i)
Definition sha1.c:125
#define SLJIT_TMP_FR0
unsigned short int sljit_u16
unsigned char sljit_u8
signed int sljit_s32
unsigned int sljit_u32
signed char sljit_s8
#define SLJIT_ASSERT(x)
#define SLJIT_UNUSED_ARG(arg)
#define SLJIT_INLINE
int sljit_sw
#define SLJIT_FR0
Definition sljitLir.h:229
#define SLJIT_SIMD_MEM_ALIGNED_256
Definition sljitLir.h:1935
#define SLJIT_SIMD_OP2_AND
Definition sljitLir.h:2094
#define SLJIT_FR3
Definition sljitLir.h:232
#define SLJIT_OVERFLOW
Definition sljitLir.h:1574
#define SLJIT_SIMD_ELEM_32
Definition sljitLir.h:1914
#define SLJIT_SIMD_STORE
Definition sljitLir.h:1896
#define SLJIT_R1
Definition sljitLir.h:169
#define SLJIT_SET_OVERFLOW
Definition sljitLir.h:1575
#define SLJIT_SP
Definition sljitLir.h:214
#define SLJIT_ADD
Definition sljitLir.h:1203
#define SLJIT_FUNC_ADDR(func_name)
Definition sljitLir.h:2428
#define SLJIT_RETURN_REG
Definition sljitLir.h:218
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_get_register_index(sljit_s32 type, sljit_s32 reg)
#define SLJIT_NOT_EQUAL
Definition sljitLir.h:1554
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_op2(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 dst_freg, sljit_s32 src1_freg, sljit_s32 src2_freg)
#define SLJIT_NOT_ZERO
Definition sljitLir.h:1555
#define SLJIT_SIMD_ELEM_8
Definition sljitLir.h:1910
#define SLJIT_FR5
Definition sljitLir.h:234
#define SLJIT_FR6
Definition sljitLir.h:235
#define SLJIT_SUB
Definition sljitLir.h:1211
#define SLJIT_SET_LESS
Definition sljitLir.h:1558
SLJIT_API_FUNC_ATTRIBUTE void sljit_set_current_flags(struct sljit_compiler *compiler, sljit_s32 current_flags)
#define SLJIT_IMM
Definition sljitLir.h:931
#define SLJIT_CALL
Definition sljitLir.h:1647
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_mov(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 lane_index, sljit_s32 srcdst, sljit_sw srcdstw)
#define SLJIT_R3
Definition sljitLir.h:176
#define SLJIT_R2
Definition sljitLir.h:170
#define SLJIT_FR1
Definition sljitLir.h:230
#define SLJIT_MEM1(r1)
Definition sljitLir.h:929
#define SLJIT_EQUAL
Definition sljitLir.h:1552
#define SLJIT_JUMP
Definition sljitLir.h:1643
#define SLJIT_GREATER
Definition sljitLir.h:1561
#define SLJIT_AND
Definition sljitLir.h:1221
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_mov(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 srcdst, sljit_sw srcdstw)
#define SLJIT_SIMD_LANE_ZERO
Definition sljitLir.h:1986
#define SLJIT_FLOAT_REGISTER
Definition sljitLir.h:2214
#define SLJIT_R4
Definition sljitLir.h:177
#define SLJIT_MOV_U32
Definition sljitLir.h:1140
#define SLJIT_GREATER_EQUAL
Definition sljitLir.h:1559
#define SLJIT_GP_REGISTER
Definition sljitLir.h:2212
#define SLJIT_SIMD_REG_256
Definition sljitLir.h:1906
#define SLJIT_ARGS4(ret, arg1, arg2, arg3, arg4)
Definition sljitLir.h:407
#define SLJIT_SIMD_MEM_ALIGNED_128
Definition sljitLir.h:1933
#define SLJIT_SIMD_REG_128
Definition sljitLir.h:1904
#define SLJIT_SET_GREATER
Definition sljitLir.h:1562
#define SLJIT_LSHR
Definition sljitLir.h:1246
#define SLJIT_MOV_S32
Definition sljitLir.h:1143
#define SLJIT_ZERO
Definition sljitLir.h:1553
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_sign(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 dst, sljit_sw dstw)
#define SLJIT_R0
Definition sljitLir.h:168
#define SLJIT_FR4
Definition sljitLir.h:233
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_op_custom(struct sljit_compiler *compiler, void *instruction, sljit_u32 size)
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_icall(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 arg_types, sljit_s32 src, sljit_sw srcw)
#define SLJIT_LESS
Definition sljitLir.h:1557
#define SLJIT_MOV
Definition sljitLir.h:1125
#define SLJIT_FR2
Definition sljitLir.h:231
SLJIT_API_FUNC_ATTRIBUTE sljit_s32 sljit_emit_simd_lane_replicate(struct sljit_compiler *compiler, sljit_s32 type, sljit_s32 freg, sljit_s32 src, sljit_s32 src_lane_index)
#define CMP
#define VD(vd)
sljit_u32 sljit_ins
#define VOR_V
#define CTZ_W
#define VREPLGR2VR
#define VLD
#define VAND_V
sljit_s32 scratches
Definition sljitLir.h:479
$obj a
Definition test.php:84
value