22#if (defined(__x86_64__) || defined(_M_X64)) && defined(__ADX__) && defined(__BMI2__)
// GCC only: suppress -Wpedantic from here until the matching
// `#pragma GCC diagnostic pop`, because this file uses compiler extensions
// (unsigned __int128, __restrict__, extended inline assembly).
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif

// FE4X64_FORCE_INLINE expands to the strongest inlining request the active
// compiler understands, falling back to plain `inline` elsewhere.
#if defined(__GNUC__) || defined(__clang__)
  #define FE4X64_FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
  #define FE4X64_FORCE_INLINE __forceinline
#else
  #define FE4X64_FORCE_INLINE inline
#endif
38namespace secp256k1::fast::fe4x64 {
// Field prime p = 2^256 - 2^32 - 977, stored as four 64-bit limbs with the
// least significant limb first (P0 holds bits 0..63, P3 holds bits 192..255).
static constexpr std::uint64_t P0 = 0xFFFFFFFEFFFFFC2FULL;
static constexpr std::uint64_t P1 = 0xFFFFFFFFFFFFFFFFULL;
static constexpr std::uint64_t P2 = 0xFFFFFFFFFFFFFFFFULL;
static constexpr std::uint64_t P3 = 0xFFFFFFFFFFFFFFFFULL;

// K = 2^256 - p = 2^32 + 977. Because 2^256 == K (mod p), anything that
// overflows 256 bits is folded back in by multiplying it by K.
static constexpr std::uint64_t K = 0x1000003D1ULL;
// r = a * b mod p, where p = 2^256 - 0x1000003D1.
//
// a, b and r each point to four 64-bit limbs, least significant first. Any
// 256-bit inputs are accepted; the result is fully reduced (0 <= r < p).
// Both inputs are read completely before r is written. The code is
// straight-line (no branches, no data-dependent addressing).
//
// Requires BMI2 (mulx) and ADX (adcx/adox).
void mul(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a,
         const std::uint64_t* __restrict__ b) noexcept {
    __asm__ __volatile__(
        // r8..r15 accumulate the 512-bit product, least significant limb first.
        "xorl %%r8d, %%r8d\n\t"
        "xorl %%r9d, %%r9d\n\t"
        "xorl %%r10d, %%r10d\n\t"
        "xorl %%r11d, %%r11d\n\t"
        "xorl %%r12d, %%r12d\n\t"
        "xorl %%r13d, %%r13d\n\t"
        "xorl %%r14d, %%r14d\n\t"
        "xorl %%r15d, %%r15d\n\t"

        // Row 0: a[0] * b into r8..r12. The xorl zeroes rcx and clears CF and
        // OF; low halves ride the CF chain (adcx), high halves the OF chain
        // (adox). mulx itself leaves the flags untouched.
        "movq (%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adoxq %%rbx, %%r9\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "adcxq %%rcx, %%r12\n\t"   // flush CF
        "adoxq %%rcx, %%r13\n\t"   // flush OF

        // Row 1: a[1] * b into r9..r13.
        "movq 8(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "adcxq %%rcx, %%r13\n\t"
        "adoxq %%rcx, %%r14\n\t"

        // Row 2: a[2] * b into r10..r14.
        "movq 16(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "adcxq %%rcx, %%r14\n\t"
        "adoxq %%rcx, %%r15\n\t"

        // Row 3: a[3] * b into r11..r15. The product fits in 512 bits, so
        // nothing can carry out of r15.
        "movq 24(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r14\n\t"
        "adoxq %%rbx, %%r15\n\t"
        "adcxq %%rcx, %%r15\n\t"

        // Reduction, step 1: 2^256 == K (mod p), so add (r12..r15) * K onto
        // r8..r11. rcx collects everything that spills past bit 255.
        "movabsq $0x1000003D1, %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"

        "mulxq %%r12, %%rax, %%rbx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r13, %%rax, %%rbx\n\t"
        "addq %%rax, %%r9\n\t"
        "adcq %%rbx, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r14, %%rax, %%rbx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rbx, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r15, %%rax, %%rbx\n\t"
        "addq %%rax, %%r11\n\t"
        "adcq %%rbx, %%rcx\n\t"

        // Step 2: fold the spill limb (below 2^34) the same way. rcx is then
        // reused to capture the single possible carry out of bit 255.
        "mulxq %%rcx, %%rax, %%rbx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        // Step 3: if that carry fired, add K once more. Turn the 0/1 carry
        // into a 0/all-ones mask first. This cannot overflow again because
        // the wrapped value is below 2^67.
        "negq %%rcx\n\t"
        "movabsq $0x1000003D1, %%rax\n\t"
        "andq %%rcx, %%rax\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq $0, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"

        // Step 4: canonicalise. Compute v + K; it carries out of bit 255
        // exactly when v >= p, and its low 256 bits then equal v - p.
        // movq leaves the flags alone, so the carry chain survives the copies.
        "movabsq $0x1000003D1, %%rcx\n\t"
        "movq %%r8, %%rax\n\t"
        "addq %%rcx, %%rax\n\t"
        "movq %%r9, %%rbx\n\t"
        "adcq $0, %%rbx\n\t"
        "movq %%r10, %%r12\n\t"
        "adcq $0, %%r12\n\t"
        "movq %%r11, %%r13\n\t"
        "adcq $0, %%r13\n\t"
        "cmovcq %%rax, %%r8\n\t"
        "cmovcq %%rbx, %%r9\n\t"
        "cmovcq %%r12, %%r10\n\t"
        "cmovcq %%r13, %%r11\n\t"

        // Store. The destination pointer is a memory operand because twelve
        // general-purpose registers are already clobbered.
        "movq %[r_ptr], %%rax\n\t"
        "movq %%r8, (%%rax)\n\t"
        "movq %%r9, 8(%%rax)\n\t"
        "movq %%r10, 16(%%rax)\n\t"
        "movq %%r11, 24(%%rax)\n\t"
        : /* no register outputs; the result is written through r */
        : [a] "r" (a), [b] "r" (b), [r_ptr] "m" (r)
        : "rax", "rbx", "rcx", "rdx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "cc", "memory");
}
// r = a * a mod p, where p = 2^256 - 0x1000003D1.
//
// a and r each point to four 64-bit limbs, least significant first. Any
// 256-bit input is accepted; the result is fully reduced (0 <= r < p).
// The input is read completely before r is written. The code is
// straight-line (no branches, no data-dependent addressing).
//
// Requires BMI2 (mulx) and ADX (adcx/adox).
void sqr(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a) noexcept {
    __asm__ __volatile__(
        // r8..r15 accumulate the 512-bit square, least significant limb first.
        "xorl %%r8d, %%r8d\n\t"
        "xorl %%r9d, %%r9d\n\t"
        "xorl %%r10d, %%r10d\n\t"
        "xorl %%r11d, %%r11d\n\t"
        "xorl %%r12d, %%r12d\n\t"
        "xorl %%r13d, %%r13d\n\t"
        "xorl %%r14d, %%r14d\n\t"
        "xorl %%r15d, %%r15d\n\t"

        // Off-diagonal products a[i]*a[j] with i < j, each computed once.
        // Every xorl zeroes rcx and clears CF/OF for the adcx/adox chains.
        // a[0] * (a[1], a[2], a[3]) into r9..r12.
        "movq (%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 8(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 16(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "adcxq %%rcx, %%r12\n\t"
        "adoxq %%rcx, %%r13\n\t"

        // a[1] * (a[2], a[3]) into r11..r13.
        "movq 8(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 16(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "adcxq %%rcx, %%r13\n\t"
        "adoxq %%rcx, %%r14\n\t"

        // a[2] * a[3] into r13..r14.
        "movq 16(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "adcxq %%rcx, %%r14\n\t"
        "adoxq %%rcx, %%r15\n\t"

        // Double the off-diagonal sum: each cross term occurs twice in a^2.
        "addq %%r9, %%r9\n\t"
        "adcq %%r10, %%r10\n\t"
        "adcq %%r11, %%r11\n\t"
        "adcq %%r12, %%r12\n\t"
        "adcq %%r13, %%r13\n\t"
        "adcq %%r14, %%r14\n\t"
        "adcq %%r15, %%r15\n\t"

        // Add the diagonal squares a[i]^2 along one CF chain. Neither movq
        // nor mulx touches the flags, so the carry runs through all 8 limbs.
        "xorl %%ecx, %%ecx\n\t"
        "movq (%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rbx, %%r9\n\t"
        "movq 8(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adcxq %%rbx, %%r11\n\t"
        "movq 16(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adcxq %%rbx, %%r13\n\t"
        "movq 24(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r14\n\t"
        "adcxq %%rbx, %%r15\n\t"

        // Reduction, step 1: 2^256 == K (mod p), so add (r12..r15) * K onto
        // r8..r11. rcx collects everything that spills past bit 255.
        "movabsq $0x1000003D1, %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"

        "mulxq %%r12, %%rax, %%rbx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r13, %%rax, %%rbx\n\t"
        "addq %%rax, %%r9\n\t"
        "adcq %%rbx, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r14, %%rax, %%rbx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rbx, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r15, %%rax, %%rbx\n\t"
        "addq %%rax, %%r11\n\t"
        "adcq %%rbx, %%rcx\n\t"

        // Step 2: fold the spill limb (below 2^34) the same way. rcx is then
        // reused to capture the single possible carry out of bit 255.
        "mulxq %%rcx, %%rax, %%rbx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        // Step 3: if that carry fired, add K once more. Turn the 0/1 carry
        // into a 0/all-ones mask first. This cannot overflow again because
        // the wrapped value is below 2^67.
        "negq %%rcx\n\t"
        "movabsq $0x1000003D1, %%rax\n\t"
        "andq %%rcx, %%rax\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq $0, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"

        // Step 4: canonicalise. Compute v + K; it carries out of bit 255
        // exactly when v >= p, and its low 256 bits then equal v - p.
        // movq leaves the flags alone, so the carry chain survives the copies.
        "movabsq $0x1000003D1, %%rcx\n\t"
        "movq %%r8, %%rax\n\t"
        "addq %%rcx, %%rax\n\t"
        "movq %%r9, %%rbx\n\t"
        "adcq $0, %%rbx\n\t"
        "movq %%r10, %%r12\n\t"
        "adcq $0, %%r12\n\t"
        "movq %%r11, %%r13\n\t"
        "adcq $0, %%r13\n\t"
        "cmovcq %%rax, %%r8\n\t"
        "cmovcq %%rbx, %%r9\n\t"
        "cmovcq %%r12, %%r10\n\t"
        "cmovcq %%r13, %%r11\n\t"

        // Store. The destination pointer is a memory operand because twelve
        // general-purpose registers are already clobbered.
        "movq %[r_ptr], %%rax\n\t"
        "movq %%r8, (%%rax)\n\t"
        "movq %%r9, 8(%%rax)\n\t"
        "movq %%r10, 16(%%rax)\n\t"
        "movq %%r11, 24(%%rax)\n\t"
        : /* no register outputs; the result is written through r */
        : [a] "r" (a), [r_ptr] "m" (r)
        : "rax", "rbx", "rcx", "rdx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "cc", "memory");
}
387void add(std::uint64_t* __restrict__ r,
388 const std::uint64_t* __restrict__ a,
389 const std::uint64_t* __restrict__ b)
noexcept {
391 unsigned __int128 s = (
unsigned __int128)a[0] + b[0];
392 std::uint64_t r0 = (std::uint64_t)s; s >>= 64;
393 s += (
unsigned __int128)a[1] + b[1];
394 std::uint64_t r1 = (std::uint64_t)s; s >>= 64;
395 s += (
unsigned __int128)a[2] + b[2];
396 std::uint64_t r2 = (std::uint64_t)s; s >>= 64;
397 s += (
unsigned __int128)a[3] + b[3];
398 std::uint64_t r3 = (std::uint64_t)s;
399 std::uint64_t carry_s = (std::uint64_t)(s >> 64);
402 unsigned __int128 t = (
unsigned __int128)r0 + K;
403 std::uint64_t t0 = (std::uint64_t)t; t >>= 64;
405 std::uint64_t t1 = (std::uint64_t)t; t >>= 64;
407 std::uint64_t t2 = (std::uint64_t)t; t >>= 64;
409 std::uint64_t t3 = (std::uint64_t)t;
410 std::uint64_t carry_t = (std::uint64_t)(t >> 64);
413 std::uint64_t use_t = carry_s | carry_t;
414 std::uint64_t mask = (std::uint64_t)0 - use_t;
416 r[0] = (r0 & ~mask) | (t0 & mask);
417 r[1] = (r1 & ~mask) | (t1 & mask);
418 r[2] = (r2 & ~mask) | (t2 & mask);
419 r[3] = (r3 & ~mask) | (t3 & mask);
426void sub(std::uint64_t* __restrict__ r,
427 const std::uint64_t* __restrict__ a,
428 const std::uint64_t* __restrict__ b)
noexcept {
430 unsigned __int128 d = (
unsigned __int128)a[0] - b[0];
431 std::uint64_t r0 = (std::uint64_t)d;
432 std::uint64_t borrow = (d >> 127) & 1;
434 d = (
unsigned __int128)a[1] - b[1] - borrow;
435 std::uint64_t r1 = (std::uint64_t)d;
436 borrow = (d >> 127) & 1;
438 d = (
unsigned __int128)a[2] - b[2] - borrow;
439 std::uint64_t r2 = (std::uint64_t)d;
440 borrow = (d >> 127) & 1;
442 d = (
unsigned __int128)a[3] - b[3] - borrow;
443 std::uint64_t r3 = (std::uint64_t)d;
444 borrow = (d >> 127) & 1;
449 std::uint64_t mask = (std::uint64_t)0 - borrow;
450 std::uint64_t corr = K & mask;
452 d = (
unsigned __int128)r0 - corr;
453 r[0] = (std::uint64_t)d;
454 borrow = (d >> 127) & 1;
456 d = (
unsigned __int128)r1 - borrow;
457 r[1] = (std::uint64_t)d;
458 borrow = (d >> 127) & 1;
460 d = (
unsigned __int128)r2 - borrow;
461 r[2] = (std::uint64_t)d;
462 borrow = (d >> 127) & 1;
471void negate(std::uint64_t* __restrict__ r,
472 const std::uint64_t* __restrict__ a)
noexcept {
475 unsigned __int128 d = (
unsigned __int128)P0 - a[0];
476 r[0] = (std::uint64_t)d;
477 std::uint64_t borrow = (d >> 127) & 1;
479 d = (
unsigned __int128)P1 - a[1] - borrow;
480 r[1] = (std::uint64_t)d;
481 borrow = (d >> 127) & 1;
483 d = (
unsigned __int128)P2 - a[2] - borrow;
484 r[2] = (std::uint64_t)d;
485 borrow = (d >> 127) & 1;
487 r[3] =
P3 - a[3] - borrow;
490 std::uint64_t nonzero = a[0] | a[1] | a[2] | a[3];
491 std::uint64_t mask = (std::uint64_t)0 - (std::uint64_t)(nonzero != 0);
502void mul_int(std::uint64_t* __restrict__ r,
503 const std::uint64_t* __restrict__ a,
504 std::uint32_t k)
noexcept {
505 using u128 =
unsigned __int128;
506 u128 c = (u128)a[0] * k;
507 r[0] = (std::uint64_t)c; c >>= 64;
509 r[1] = (std::uint64_t)c; c >>= 64;
511 r[2] = (std::uint64_t)c; c >>= 64;
513 r[3] = (std::uint64_t)c;
514 std::uint64_t overflow = (std::uint64_t)(c >> 64);
518 u128 f = (u128)overflow * K + r[0];
519 r[0] = (std::uint64_t)f;
520 std::uint64_t carry = (std::uint64_t)(f >> 64);
522 carry = (r[1] < carry) ? 1ULL : 0ULL;
524 carry = (r[2] < carry) ? 1ULL : 0ULL;
535void half(std::uint64_t* __restrict__ r,
536 const std::uint64_t* __restrict__ a)
noexcept {
538 std::uint64_t mask = (std::uint64_t)0 - (a[0] & 1);
539 unsigned __int128 s = (
unsigned __int128)a[0] + (P0 & mask);
540 std::uint64_t t0 = (std::uint64_t)s; s >>= 64;
541 s += (
unsigned __int128)a[1] + (P1 & mask);
542 std::uint64_t t1 = (std::uint64_t)s; s >>= 64;
543 s += (
unsigned __int128)a[2] + (P2 & mask);
544 std::uint64_t t2 = (std::uint64_t)s; s >>= 64;
545 s += (
unsigned __int128)a[3] + (P3 & mask);
546 std::uint64_t t3 = (std::uint64_t)s;
547 std::uint64_t carry = (std::uint64_t)(s >> 64);
550 r[0] = (t0 >> 1) | (t1 << 63);
551 r[1] = (t1 >> 1) | (t2 << 63);
552 r[2] = (t2 >> 1) | (t3 << 63);
553 r[3] = (t3 >> 1) | (carry << 63);
// Copies the four 64-bit limbs of src into dst.
// dst and src must not overlap (both are declared __restrict__).
void copy(std::uint64_t* __restrict__ dst,
          const std::uint64_t* __restrict__ src) noexcept {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
}
// Conditional move: if flag is non-zero, dst takes the four limbs of src;
// otherwise dst keeps its value. Both arrays are always read and dst is
// always written, so the selection is done with a mask rather than a branch.
void cmov(std::uint64_t* __restrict__ dst,
          const std::uint64_t* __restrict__ src,
          std::uint64_t flag) noexcept {
    // (flag | -flag) has its top bit set exactly when flag != 0; stretch
    // that bit into an all-ones or all-zero mask.
    const std::uint64_t take =
        std::uint64_t{0} - ((flag | (std::uint64_t{0} - flag)) >> 63);
    const std::uint64_t keep = ~take;

    dst[0] = (dst[0] & keep) | (src[0] & take);
    dst[1] = (dst[1] & keep) | (src[1] & take);
    dst[2] = (dst[2] & keep) | (src[2] & take);
    dst[3] = (dst[3] & keep) | (src[3] & take);
}
// Returns true when all four 64-bit limbs of a are zero.
// Only the literal all-zero pattern is detected; a value equal to p is a
// non-reduced representation of zero and reports false.
bool is_zero(const std::uint64_t* a) noexcept {
    // OR the limbs together so all four are always examined.
    const std::uint64_t any = a[0] | a[1] | a[2] | a[3];
    return any == 0;
}
// GCC only: restore the diagnostic state saved by the earlier
// `#pragma GCC diagnostic push`.
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif
constexpr std::uint32_t P0
constexpr std::uint32_t P2
constexpr std::uint32_t P1
constexpr std::uint32_t P3