#include <stdint.h>
#include <string.h>

#include <emmintrin.h>
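
/* Byte-swap each 32-bit lane, using only SSE2 instructions. */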
static inline __m128i
mm_bswap_epi32(__m128i a)
{

	/* Swap bytes within each 16-bit lane. */
	a = _mm_or_si128(_mm_slli_epi16(a, 8), _mm_srli_epi16(a, 8));

	/* Swap the 16-bit halves of each 32-bit lane. */
	a = _mm_shufflelo_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));
	a = _mm_shufflehi_epi16(a, _MM_SHUFFLE(2, 3, 0, 1));

	return (a);
}
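
/* SHA-256 round constants. */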
static const uint32_t Krnd[64] = {
	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
};
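
/* Elementary functions used by SHA-256. */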
#define Ch(x, y, z) ((x & (y ^ z)) ^ z)
#define Maj(x, y, z) ((x & (y | z)) | (y & z))
#define ROTR(x, n) ((x >> n) | (x << (32 - n)))
#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22))
#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25))
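
/* SHA-256 round function. */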
#define RND(a, b, c, d, e, f, g, h, k)		\
	h += S1(e) + Ch(e, f, g) + k;		\
	d += h;					\
	h += S0(a) + Maj(a, b, c)
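
/* Adjusted round function for rotating state. */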
#define RNDr(S, W, i, ii)			\
	RND(S[(64 - i) % 8], S[(65 - i) % 8],	\
	    S[(66 - i) % 8], S[(67 - i) % 8],	\
	    S[(68 - i) % 8], S[(69 - i) % 8],	\
	    S[(70 - i) % 8], S[(71 - i) % 8],	\
	    W[i + ii] + Krnd[i + ii])
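
/* Vectorized versions of SHR and ROTR, and s0() applied to four words at once. */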
#define SHR32(x, n) (_mm_srli_epi32(x, n))
#define ROTR32(x, n) (_mm_or_si128(SHR32(x, n), _mm_slli_epi32(x, (32 - n))))
#define s0_128(x) _mm_xor_si128(_mm_xor_si128(		\
	ROTR32(x, 7), ROTR32(x, 18)), SHR32(x, 3))
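
/*
 * Compute s1() of the two low words of a; the results end up in the two
 * high 32-bit lanes, with the low lanes zero.  Duplicating each word
 * across a 64-bit lane lets 64-bit shifts by 17 and 19 stand in for the
 * 32-bit rotations in s1().
 */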
static inline __m128i
s1_128_high(__m128i a)
{
	__m128i b, c;

	/* Duplicate words 0 and 1 across 64-bit lanes. */
	b = _mm_shuffle_epi32(a, _MM_SHUFFLE(1, 1, 0, 0));

	/* 64-bit shifts of the duplicated words give ROTR(x, 17) ^ ROTR(x, 19). */
	c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19));

	/* XOR in (x >> 10). */
	c = _mm_xor_si128(c, _mm_srli_epi32(b, 10));

	/* Gather the two results and move them into the high 64 bits. */
	c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0));
	c = _mm_slli_si128(c, 8);

	return (c);
}

/*
 * Compute s1() of the two high words of a; the results end up in the two
 * low 32-bit lanes, with the high lanes zero.
 */
static inline __m128i
s1_128_low(__m128i a)
{
	__m128i b, c;

	/* Duplicate words 2 and 3 across 64-bit lanes. */
	b = _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 2, 2));

	/* 64-bit shifts of the duplicated words give ROTR(x, 17) ^ ROTR(x, 19). */
	c = _mm_xor_si128(_mm_srli_epi64(b, 17), _mm_srli_epi64(b, 19));

	/* XOR in (x >> 10). */
	c = _mm_xor_si128(c, _mm_srli_epi32(b, 10));

	/* Gather the two results and move them into the low 64 bits. */
	c = _mm_shuffle_epi32(c, _MM_SHUFFLE(2, 0, 2, 0));
	c = _mm_srli_si128(c, 8);

	return (c);
}
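
/*
 * SPAN_ONE_THREE(a, b) yields { a[1], a[2], a[3], b[0] }: the last three
 * words of a followed by the first word of b.
 */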
#define SPAN_ONE_THREE(a, b) (_mm_shuffle_epi32(_mm_castps_si128(	\
	_mm_move_ss(_mm_castsi128_ps(a), _mm_castsi128_ps(b))),	\
	_MM_SHUFFLE(0, 3, 2, 1)))
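
/*
 * Given four vectors X0 = W[j-16..j-13], X1 = W[j-12..j-9],
 * X2 = W[j-8..j-5], and X3 = W[j-4..j-1], compute the next four message
 * schedule words W[j..j+3].
 */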
static inline __m128i
MSG4(__m128i X0, __m128i X1, __m128i X2, __m128i X3)
{
	__m128i X4;
	__m128i Xj_minus_seven, Xj_minus_fifteen;

	/* Gather W[j-7..j-4] and W[j-15..j-12]. */
	Xj_minus_seven = SPAN_ONE_THREE(X2, X3);
	Xj_minus_fifteen = SPAN_ONE_THREE(X0, X1);

	/* W[j] = W[j-16] + W[j-7] + s0(W[j-15]) + s1(W[j-2]). */
	X4 = _mm_add_epi32(X0, Xj_minus_seven);
	X4 = _mm_add_epi32(X4, s0_128(Xj_minus_fifteen));

	/* s1() of the two high words of X3 completes the two low lanes. */
	X4 = _mm_add_epi32(X4, s1_128_low(X3));

	/* The two high lanes need s1() of the words just computed. */
	X4 = _mm_add_epi32(X4, s1_128_high(X4));

	return (X4);
}

/*
 * SHA-256 block compression function.  The 256-bit state is transformed
 * via the 512-bit input block to produce a new state.
 */
static void
SHA256_Transform(uint32_t state[8], const uint8_t block[64])
{
	uint32_t W[64];
	uint32_t S[8];
	__m128i Y[4];
	int i;

	/* 1. Prepare the first part of the message schedule W. */
	Y[0] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[0]));
	_mm_storeu_si128((__m128i *)&W[0], Y[0]);
	Y[1] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[16]));
	_mm_storeu_si128((__m128i *)&W[4], Y[1]);
	Y[2] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[32]));
	_mm_storeu_si128((__m128i *)&W[8], Y[2]);
	Y[3] = mm_bswap_epi32(_mm_loadu_si128((const __m128i *)&block[48]));
	_mm_storeu_si128((__m128i *)&W[12], Y[3]);

	/* 2. Initialize working variables. */
	memcpy(S, state, 32);

	/* 3. Mix. */
	for (i = 0; i < 64; i += 16) {
		RNDr(S, W, 0, i);
		RNDr(S, W, 1, i);
		RNDr(S, W, 2, i);
		RNDr(S, W, 3, i);
		RNDr(S, W, 4, i);
		RNDr(S, W, 5, i);
		RNDr(S, W, 6, i);
		RNDr(S, W, 7, i);
		RNDr(S, W, 8, i);
		RNDr(S, W, 9, i);
		RNDr(S, W, 10, i);
		RNDr(S, W, 11, i);
		RNDr(S, W, 12, i);
		RNDr(S, W, 13, i);
		RNDr(S, W, 14, i);
		RNDr(S, W, 15, i);

		/* The message schedule ends at W[63]. */
		if (i == 48)
			break;

		/* Compute the next 16 words of the message schedule. */
		Y[0] = MSG4(Y[0], Y[1], Y[2], Y[3]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 0], Y[0]);
		Y[1] = MSG4(Y[1], Y[2], Y[3], Y[0]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 4], Y[1]);
		Y[2] = MSG4(Y[2], Y[3], Y[0], Y[1]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 8], Y[2]);
		Y[3] = MSG4(Y[3], Y[0], Y[1], Y[2]);
		_mm_storeu_si128((__m128i *)&W[16 + i + 12], Y[3]);
	}

	/* 4. Mix local working variables into global state. */
	for (i = 0; i < 8; i++)
		state[i] += S[i];
}
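
/*
 * Usage sketch (illustrative; not part of the original file): compress one
 * already-padded 64-byte block against the standard SHA-256 initial state
 * from FIPS 180-4.  A complete hash function would also handle padding and
 * multi-block messages; the function name here is hypothetical.
 */
static void
sha256_one_block(uint32_t state[8], const uint8_t block[64])
{
	static const uint32_t iv[8] = {
		0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
		0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19
	};

	memcpy(state, iv, sizeof(iv));
	SHA256_Transform(state, block);
}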