#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
# include <nmmintrin.h>
# include <wmmintrin.h>
#endif

#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
# include "Zend/zend_cpuinfo.h"
#endif

#if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER)
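/*
 * The folding constants and reduction steps below follow the approach in
 * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
 * (V. Gopal, E. Ozturk, et al., Intel white paper, 2009).
 */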
typedef struct _crc32_pclmul_bit_consts {
    uint64_t k1k2[2];
    uint64_t k3k4[2];
    uint64_t k5k6[2];
    uint64_t uPx[2];
} crc32_pclmul_consts;
static const crc32_pclmul_consts crc32_pclmul_consts_maps[X86_CRC32_MAX] = {
    { /* X86_CRC32, polynomial 0x04C11DB7 */
        {0x00e6228b11, 0x008833794c}, /* k1k2 */
        {0x00e8a45605, 0x00c5b9cd4c}, /* k3k4 */
        {0x00490d678d, 0x00f200aa66}, /* k5k6 */
        {0x0104d101df, 0x0104c11db7}  /* uPx */
    },
    { /* X86_CRC32B, polynomial 0x04C11DB7, reflected */
        {0x0154442bd4, 0x01c6e41596}, /* k1k2 */
        {0x01751997d0, 0x00ccaa009e}, /* k3k4 */
        {0x0163cd6124, 0x01db710640}, /* k5k6 */
        {0x01f7011641, 0x01db710641}  /* uPx */
    },
    { /* X86_CRC32C, polynomial 0x1EDC6F41, reflected */
        {0x00740eef02, 0x009e4addf8}, /* k1k2 */
        {0x00f20c0dfe, 0x014cd00bd6}, /* k3k4 */
        {0x00dd45aab8, 0x0000000000}, /* k5k6 */
        {0x00dea713f1, 0x0105ec76f0}  /* uPx */
    }
};
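/*
 * Each kN above is a precomputed x^T mod P(x) for the bit distance T spanned
 * by the corresponding fold step, and uPx pairs the Barrett constant u with
 * the polynomial P(x) itself (e.g. 0x104c11db7 in the X86_CRC32 entry).
 */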
/* pshufb mask that reverses the byte order of a 128-bit lane */
static uint8_t pclmul_shuf_mask_table[16] = {
    0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
    0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00,
};
/* Folding works on 128-bit (16-byte) data chunks */
#define CRC32_FOLDING_BLOCK_SIZE (16)
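/*
 * Both batch routines below consume only whole 16-byte blocks and return the
 * number of bytes actually folded; the caller finishes any tail (including
 * buffers shorter than one block) with its scalar CRC32 loop.
 */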
/* PCLMUL version of the non-reflected crc32 */
size_t crc32_pclmul_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, x3, x4, k, shuf_mask;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }
    shuf_mask = _mm_loadu_si128((__m128i *)(pclmul_shuf_mask_table));

    /* Load the first 16 bytes and fold in the initial CRC value */
    x0 = _mm_cvtsi32_si128(*crc);
    x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_slli_si128(x0, 12);
    x1 = _mm_shuffle_epi8(x1, shuf_mask); /* BE */
    x0 = _mm_xor_si128(x1, x0);
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        /* Load the next 48 bytes so four 128-bit accumulators are in flight */
        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_shuffle_epi8(x1, shuf_mask); /* BE */
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* BE */
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        x3 = _mm_shuffle_epi8(x3, shuf_mask); /* BE */
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
        k = _mm_loadu_si128((__m128i *)consts->k1k2);
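        /* Main fold loop: four independent accumulators, 64 bytes per iteration */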
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;

            /* Multiply each accumulator's low half by k1, high half by k2 */
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);

            /* Load and byte-reverse the next 64 bytes */
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x8 = _mm_shuffle_epi8(x8, shuf_mask); /* BE */
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x9 = _mm_shuffle_epi8(x9, shuf_mask); /* BE */
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x10 = _mm_shuffle_epi8(x10, shuf_mask); /* BE */
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x11 = _mm_shuffle_epi8(x11, shuf_mask); /* BE */

            /* XOR the folded halves into the fresh data */
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);

            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }
        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* Fold the four accumulators down to one: [x1, x2, x3] -> x0 */
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }
    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* Fold any remaining whole 16-byte blocks one at a time */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_shuffle_epi8(x2, shuf_mask); /* BE */
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }
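    /*
     * Everything is now folded into x0. Reduce 128 -> 64 -> 32 bits, then
     * finish with a Barrett reduction to obtain the final CRC value.
     */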
    /* 128 -> 64 fold */
    k = _mm_loadu_si128((__m128i *)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x0, k, 0x11);
    x0 = _mm_slli_si128(x0, 8);
    x0 = _mm_srli_si128(x0, 4);
    x0 = _mm_xor_si128(x0, x1);

    /* 64 -> 32 fold */
    x1 = _mm_clmulepi64_si128(x0, k, 0x01);
    x0 = _mm_xor_si128(x0, x1);
    /* Barrett reduction */
    k = _mm_loadu_si128((__m128i *)consts->uPx);
    x1 = _mm_move_epi64(x0);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_srli_si128(x1, 4);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 0);

    return (nr_in - nr);
}
/* PCLMUL version of the reflected (bit-reversed) crc32 */
size_t crc32_pclmul_reflected_batch(uint32_t *crc, const unsigned char *p, size_t nr, const crc32_pclmul_consts *consts)
{
    size_t nr_in = nr;
    __m128i x0, x1, x2, x3, x4, k;

    if (nr < CRC32_FOLDING_BLOCK_SIZE) {
        return 0;
    }
    /* Load the first 16 bytes and fold in the initial CRC; no byte swap needed */
    x0 = _mm_loadu_si128((__m128i *)(p + 0x00));
    x0 = _mm_xor_si128(x0, _mm_cvtsi32_si128(*crc));
    p += CRC32_FOLDING_BLOCK_SIZE;
    nr -= CRC32_FOLDING_BLOCK_SIZE;
    if (nr >= (CRC32_FOLDING_BLOCK_SIZE * 3)) {
        /* Load the next 48 bytes */
        x1 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x2 = _mm_loadu_si128((__m128i *)(p + 0x10));
        x3 = _mm_loadu_si128((__m128i *)(p + 0x20));
        p += CRC32_FOLDING_BLOCK_SIZE * 3;
        nr -= CRC32_FOLDING_BLOCK_SIZE * 3;
        k = _mm_loadu_si128((__m128i *)consts->k1k2);
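        /* Main fold loop: 64 bytes per iteration, as in the non-reflected path */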
        while (nr >= (CRC32_FOLDING_BLOCK_SIZE * 4)) {
            __m128i x5, x6, x7, x8, x9, x10, x11;
            x4 = _mm_clmulepi64_si128(x0, k, 0x00);
            x5 = _mm_clmulepi64_si128(x1, k, 0x00);
            x6 = _mm_clmulepi64_si128(x2, k, 0x00);
            x7 = _mm_clmulepi64_si128(x3, k, 0x00);
            x0 = _mm_clmulepi64_si128(x0, k, 0x11);
            x1 = _mm_clmulepi64_si128(x1, k, 0x11);
            x2 = _mm_clmulepi64_si128(x2, k, 0x11);
            x3 = _mm_clmulepi64_si128(x3, k, 0x11);
            x8 = _mm_loadu_si128((__m128i *)(p + 0x00));
            x9 = _mm_loadu_si128((__m128i *)(p + 0x10));
            x10 = _mm_loadu_si128((__m128i *)(p + 0x20));
            x11 = _mm_loadu_si128((__m128i *)(p + 0x30));
            x0 = _mm_xor_si128(x0, x4);
            x1 = _mm_xor_si128(x1, x5);
            x2 = _mm_xor_si128(x2, x6);
            x3 = _mm_xor_si128(x3, x7);
            x0 = _mm_xor_si128(x0, x8);
            x1 = _mm_xor_si128(x1, x9);
            x2 = _mm_xor_si128(x2, x10);
            x3 = _mm_xor_si128(x3, x11);

            p += CRC32_FOLDING_BLOCK_SIZE * 4;
            nr -= CRC32_FOLDING_BLOCK_SIZE * 4;
        }
        k = _mm_loadu_si128((__m128i *)consts->k3k4);
        /* Fold the four accumulators down to one: [x1, x2, x3] -> x0 */
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x1);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x4);
        x4 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x3);
        x0 = _mm_xor_si128(x0, x4);
    }
    k = _mm_loadu_si128((__m128i *)consts->k3k4);
    /* Fold any remaining whole 16-byte blocks one at a time */
    while (nr >= CRC32_FOLDING_BLOCK_SIZE) {
        x2 = _mm_loadu_si128((__m128i *)(p + 0x00));
        x1 = _mm_clmulepi64_si128(x0, k, 0x00);
        x0 = _mm_clmulepi64_si128(x0, k, 0x11);
        x0 = _mm_xor_si128(x0, x2);
        x0 = _mm_xor_si128(x0, x1);
        p += CRC32_FOLDING_BLOCK_SIZE;
        nr -= CRC32_FOLDING_BLOCK_SIZE;
    }
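    /*
     * Everything is now folded into x0. The reflected domain reduces via
     * 128 -> 96 -> 64 bits before its Barrett step.
     */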
    /* Reduce 128 bits to 96 */
    x1 = _mm_clmulepi64_si128(x0, k, 0x10);
    x0 = _mm_srli_si128(x0, 8);
    x0 = _mm_xor_si128(x0, x1);

    /* Reduce 96 bits to 64 */
    x1 = _mm_shuffle_epi32(x0, 0xfc);
    x0 = _mm_shuffle_epi32(x0, 0xf9);
    k = _mm_loadu_si128((__m128i *)consts->k5k6);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x0 = _mm_xor_si128(x0, x1);
    /* Barrett reduction */
    x1 = _mm_shuffle_epi32(x0, 0xf3);
    x0 = _mm_slli_si128(x0, 4);
    k = _mm_loadu_si128((__m128i *)consts->uPx);
    x1 = _mm_clmulepi64_si128(x1, k, 0x00);
    x1 = _mm_clmulepi64_si128(x1, k, 0x10);
    x0 = _mm_xor_si128(x1, x0);
    *crc = _mm_extract_epi32(x0, 2);

    return (nr_in - nr);
}
# if defined(ZEND_INTRIN_SSE4_2_PCLMUL_NATIVE)
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# elif defined(ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO) || defined(ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR)
size_t crc32_sse42_pclmul_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
# endif
{
    if (type > X86_CRC32_MAX) {
        return 0;
    }
    const crc32_pclmul_consts *consts = &crc32_pclmul_consts_maps[type];

    switch (type) {
    case X86_CRC32:
        return crc32_pclmul_batch(crc, p, nr, consts);
    case X86_CRC32B:
    case X86_CRC32C:
        return crc32_pclmul_reflected_batch(crc, p, nr, consts);
    default:
        return 0;
    }
}
#endif
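/*
 * Usage sketch (illustrative only; the real caller lives in the scalar crc32
 * code): fold the bulk of the buffer here, then let a table-driven loop
 * finish the tail. crc32tab below stands for the usual 256-entry reflected
 * CRC-32 lookup table.
 *
 *   uint32_t crc = ~0u;
 *   size_t done = crc32_x86_simd_update(X86_CRC32B, &crc, p, nr);
 *   p += done;
 *   nr -= done;
 *   while (nr--) {
 *       crc = ((crc >> 8) & 0x00FFFFFF) ^ crc32tab[(crc ^ *p++) & 0xFF];
 *   }
 *   crc = ~crc;
 */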
#ifdef ZEND_INTRIN_SSE4_2_PCLMUL_RESOLVER
/* Fallback for CPUs without SSE4.2/PCLMUL: fold nothing and report zero
 * bytes consumed, so the caller's scalar loop handles the whole buffer. */
static size_t crc32_x86_simd_update_default(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
{
    return 0;
}
# ifdef ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PROTO
size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr)
    __attribute__((ifunc("resolve_crc32_x86_simd_update")));

typedef size_t (*crc32_x86_simd_func_t)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr);

/* ifunc resolver: runs once at load time and picks the implementation */
ZEND_NO_SANITIZE_ADDRESS
ZEND_ATTRIBUTE_UNUSED /* clang mistakenly warns about this */
static crc32_x86_simd_func_t resolve_crc32_x86_simd_update(void) {
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        return crc32_sse42_pclmul_update;
    }
    return crc32_x86_simd_update_default;
}
# else /* ZEND_INTRIN_SSE4_2_PCLMUL_FUNC_PTR */
static size_t (*crc32_x86_simd_ptr)(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) = crc32_x86_simd_update_default;

size_t crc32_x86_simd_update(X86_CRC32_TYPE type, uint32_t *crc, const unsigned char *p, size_t nr) {
    return crc32_x86_simd_ptr(type, crc, p, nr);
}
/* {{{ PHP_MINIT_FUNCTION */
PHP_MINIT_FUNCTION(crc32_x86_intrin)
{
    if (zend_cpu_supports_sse42() && zend_cpu_supports_pclmul()) {
        crc32_x86_simd_ptr = crc32_sse42_pclmul_update;
    }
    return SUCCESS;
}
/* }}} */
# endif
#endif