UltrafastSecp256k1 3.50.0
Ultra high-performance secp256k1 elliptic curve cryptography library
field_4x64_inline.hpp
#pragma once
// ===========================================================================
// Inline 4x64 Field Operations for secp256k1 hot paths
// ===========================================================================
// These functions operate on uint64_t[4] field elements in 4x64 representation.
// They use MULX + ADCX/ADOX inline assembly for maximum throughput.
//
// Advantages over 5x52 compiler path:
//   - 4 limbs vs 5: lower register pressure (critical in hot loops)
//   - 20 MULX per mul vs 30: fewer multiplications
//   - 14 MULX per sqr vs 20: fewer multiplications
//   - Hand-scheduled ADCX/ADOX dual carry chains
//   - No function call overhead (inline)
//
// Reduction constant: p = 2^256 - K, where K = 0x1000003D1
// All results are fully normalized (< p).
// ===========================================================================

#include <cstdint>
#include <cstring>

#if (defined(__x86_64__) || defined(_M_X64)) && defined(__ADX__) && defined(__BMI2__)

// Suppress GCC -Wpedantic for unsigned __int128 (ISO C++ extension, required here)
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wpedantic"
#endif

#if defined(__GNUC__) || defined(__clang__)
    #define FE4X64_FORCE_INLINE __attribute__((always_inline)) inline
#elif defined(_MSC_VER)
    #define FE4X64_FORCE_INLINE __forceinline
#else
    #define FE4X64_FORCE_INLINE inline
#endif

namespace secp256k1::fast::fe4x64 {

// ---------------------------------------------------------------------------
// Constants
// ---------------------------------------------------------------------------
// p = FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFEFFFFFC2F
static constexpr std::uint64_t P0 = 0xFFFFFFFEFFFFFC2FULL;
static constexpr std::uint64_t P1 = 0xFFFFFFFFFFFFFFFFULL;
static constexpr std::uint64_t P2 = 0xFFFFFFFFFFFFFFFFULL;
static constexpr std::uint64_t P3 = 0xFFFFFFFFFFFFFFFFULL;
static constexpr std::uint64_t K = 0x1000003D1ULL; // 2^256 - p

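// The fast reduction below relies on 2^256 = p + K, i.e. 2^256 == K (mod p):
// a 512-bit product H*2^256 + L reduces to L + H*K (mod p), and one further
// fold of the small overflow word brings the value under 2^256.
// Illustrative compile-time sanity checks of the constants:
static_assert(P0 + K == 0, "low limb of p must equal 2^64 - K");
static_assert((P1 & P2 & P3) == 0xFFFFFFFFFFFFFFFFULL, "upper limbs of p must be all ones");
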
// ---------------------------------------------------------------------------
// fe4x64_mul: Modular multiplication (a * b) mod p
// ---------------------------------------------------------------------------
// Uses MULX + ADCX/ADOX for 256x256 -> 512-bit product, then secp256k1
// fast reduction (high limbs * K folded into low limbs).
// Output is strictly < p (branchless mod-p normalization).
//
// Register usage:
//   r8..r15   : 512-bit accumulator (all clobbered)
//   rdx       : MULX implicit source
//   rax, rbx  : MULX lo/hi outputs
//   rcx       : zero register / carry
//   [a], [b]  : input pointers (compiler-chosen registers)
//   [r_ptr]   : output pointer (memory operand, loaded into rax for the store)
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void mul(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a,
         const std::uint64_t* __restrict__ b) noexcept {
    __asm__ __volatile__(
        // ---- 256x256 -> 512 multiplication (MULX + ADCX/ADOX) ----
        "xorl %%r8d, %%r8d\n\t"
        "xorl %%r9d, %%r9d\n\t"
        "xorl %%r10d, %%r10d\n\t"
        "xorl %%r11d, %%r11d\n\t"
        "xorl %%r12d, %%r12d\n\t"
        "xorl %%r13d, %%r13d\n\t"
        "xorl %%r14d, %%r14d\n\t"
        "xorl %%r15d, %%r15d\n\t"

        // Row 0: a[0] * b[0..3]
        "movq (%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adoxq %%rbx, %%r9\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "adcxq %%rcx, %%r12\n\t"
        "adoxq %%rcx, %%r13\n\t"

        // Row 1: a[1] * b[0..3]
        "movq 8(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "adcxq %%rcx, %%r13\n\t"
        "adoxq %%rcx, %%r14\n\t"

        // Row 2: a[2] * b[0..3]
        "movq 16(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "adcxq %%rcx, %%r14\n\t"
        "adoxq %%rcx, %%r15\n\t"

        // Row 3: a[3] * b[0..3]
        "movq 24(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq (%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 8(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "mulxq 16(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "mulxq 24(%[b]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r14\n\t"
        "adoxq %%rbx, %%r15\n\t"
        "adcxq %%rcx, %%r15\n\t"

        // ---- Reduction (512 -> 256) ----
        // K = 0x1000003D1; high[i] * K added to low[i]
        "movabsq $0x1000003D1, %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"

        "mulxq %%r12, %%rax, %%rbx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r13, %%rax, %%rbx\n\t"
        "addq %%rax, %%r9\n\t"
        "adcq %%rbx, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r14, %%rax, %%rbx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rbx, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r15, %%rax, %%rbx\n\t"
        "addq %%rax, %%r11\n\t"
        "adcq %%rbx, %%rcx\n\t"

        // Pass 1: reduce overflow (~34 bits max)
        "mulxq %%rcx, %%rax, %%rbx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        // Pass 2: reduce final 1-bit overflow (branchless)
        "movabsq $0x1000003D1, %%rax\n\t"
        "andq %%rcx, %%rax\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq $0, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"

        // Branchless mod-p: if result >= p, subtract p
        "movabsq $0x1000003D1, %%rcx\n\t"
        "movq %%r8, %%rax\n\t"
        "addq %%rcx, %%rax\n\t"
        "movq %%r9, %%rbx\n\t"
        "adcq $0, %%rbx\n\t"
        "movq %%r10, %%r12\n\t"
        "adcq $0, %%r12\n\t"
        "movq %%r11, %%r13\n\t"
        "adcq $0, %%r13\n\t"
        "cmovcq %%rax, %%r8\n\t"
        "cmovcq %%rbx, %%r9\n\t"
        "cmovcq %%r12, %%r10\n\t"
        "cmovcq %%r13, %%r11\n\t"

        // Store result (load output pointer from stack)
        "movq %[r_ptr], %%rax\n\t"
        "movq %%r8, (%%rax)\n\t"
        "movq %%r9, 8(%%rax)\n\t"
        "movq %%r10, 16(%%rax)\n\t"
        "movq %%r11, 24(%%rax)\n\t"

        : /* no output operands */
        : [a] "r" (a), [b] "r" (b), [r_ptr] "m" (r)
        : "rax", "rbx", "rcx", "rdx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "cc", "memory"
    );
}
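
// ---------------------------------------------------------------------------
// Portable reference for mul() (illustrative sketch only; mul_portable_ref is
// a hypothetical helper, not part of the original hot path). It expresses the
// same multiply-then-fold algorithm with unsigned __int128 and is useful as a
// mental model or as the "known good" side of a differential test.
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void mul_portable_ref(std::uint64_t* __restrict__ r,
                      const std::uint64_t* __restrict__ a,
                      const std::uint64_t* __restrict__ b) noexcept {
    using u128 = unsigned __int128;
    // 256x256 -> 512-bit schoolbook product w[0..7].
    std::uint64_t w[8] = {0, 0, 0, 0, 0, 0, 0, 0};
    for (int i = 0; i < 4; ++i) {
        std::uint64_t carry = 0;
        for (int j = 0; j < 4; ++j) {
            u128 t = (u128)a[i] * b[j] + w[i + j] + carry;
            w[i + j] = (std::uint64_t)t;
            carry    = (std::uint64_t)(t >> 64);
        }
        w[i + 4] = carry;
    }
    // Fold the high half: H*2^256 == H*K (mod p).
    u128 acc = (u128)w[0] + (u128)w[4] * K;
    std::uint64_t t0 = (std::uint64_t)acc; acc >>= 64;
    acc += (u128)w[1] + (u128)w[5] * K;
    std::uint64_t t1 = (std::uint64_t)acc; acc >>= 64;
    acc += (u128)w[2] + (u128)w[6] * K;
    std::uint64_t t2 = (std::uint64_t)acc; acc >>= 64;
    acc += (u128)w[3] + (u128)w[7] * K;
    std::uint64_t t3 = (std::uint64_t)acc;
    std::uint64_t hi = (std::uint64_t)(acc >> 64);   // small overflow word
    // Fold the leftover word, then the possible 1-bit overflow.
    acc = (u128)t0 + (u128)hi * K;
    t0 = (std::uint64_t)acc; acc >>= 64;
    acc += t1; t1 = (std::uint64_t)acc; acc >>= 64;
    acc += t2; t2 = (std::uint64_t)acc; acc >>= 64;
    acc += t3; t3 = (std::uint64_t)acc;
    std::uint64_t bit = (std::uint64_t)(acc >> 64);  // 0 or 1
    acc = (u128)t0 + bit * K;
    t0 = (std::uint64_t)acc; acc >>= 64;
    acc += t1; t1 = (std::uint64_t)acc; acc >>= 64;
    acc += t2; t2 = (std::uint64_t)acc; acc >>= 64;
    t3 += (std::uint64_t)acc;
    // Final conditional subtraction of p, done as "add K and keep on carry".
    u128 s = (u128)t0 + K;
    std::uint64_t u0 = (std::uint64_t)s; s >>= 64;
    s += t1; std::uint64_t u1 = (std::uint64_t)s; s >>= 64;
    s += t2; std::uint64_t u2 = (std::uint64_t)s; s >>= 64;
    s += t3; std::uint64_t u3 = (std::uint64_t)s;
    std::uint64_t mask = (std::uint64_t)0 - (std::uint64_t)(s >> 64);  // all-1s if t >= p
    r[0] = (t0 & ~mask) | (u0 & mask);
    r[1] = (t1 & ~mask) | (u1 & mask);
    r[2] = (t2 & ~mask) | (u2 & mask);
    r[3] = (t3 & ~mask) | (u3 & mask);
}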

// ---------------------------------------------------------------------------
// fe4x64_sqr: Modular squaring (a^2) mod p
// ---------------------------------------------------------------------------
// Optimized: 6 cross-product MULX (doubled via an add chain) + 4 square MULX
// = 10 MULX for the 512-bit square, then secp256k1 fast reduction (5 MULX):
// ~15 MULX total (vs ~20 for the 5x52 sqr via __int128).
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void sqr(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a) noexcept {
    __asm__ __volatile__(
        // ---- Cross products ----
        "xorl %%r8d, %%r8d\n\t"
        "xorl %%r9d, %%r9d\n\t"
        "xorl %%r10d, %%r10d\n\t"
        "xorl %%r11d, %%r11d\n\t"
        "xorl %%r12d, %%r12d\n\t"
        "xorl %%r13d, %%r13d\n\t"
        "xorl %%r14d, %%r14d\n\t"
        "xorl %%r15d, %%r15d\n\t"

        // Pass A: a[0] * (a[1], a[2], a[3])
        "movq (%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 8(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r9\n\t"
        "adoxq %%rbx, %%r10\n\t"
        "mulxq 16(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adoxq %%rbx, %%r11\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "adcxq %%rcx, %%r12\n\t"
        "adoxq %%rcx, %%r13\n\t"

        // Pass B: a[1] * (a[2], a[3])
        "movq 8(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 16(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r11\n\t"
        "adoxq %%rbx, %%r12\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adoxq %%rbx, %%r13\n\t"
        "adcxq %%rcx, %%r13\n\t"
        "adoxq %%rcx, %%r14\n\t"

        // Pass C: a[2] * a[3]
        "movq 16(%[a]), %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "mulxq 24(%[a]), %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r13\n\t"
        "adoxq %%rbx, %%r14\n\t"
        "adcxq %%rcx, %%r14\n\t"
        "adoxq %%rcx, %%r15\n\t"

        // ---- Double the cross products ----
        "addq %%r9, %%r9\n\t"
        "adcq %%r10, %%r10\n\t"
        "adcq %%r11, %%r11\n\t"
        "adcq %%r12, %%r12\n\t"
        "adcq %%r13, %%r13\n\t"
        "adcq %%r14, %%r14\n\t"
        "adcq %%r15, %%r15\n\t"

        // ---- Add squares (ADCX chain) ----
        "xorl %%ecx, %%ecx\n\t"
        "movq (%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rbx, %%r9\n\t"
        "movq 8(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r10\n\t"
        "adcxq %%rbx, %%r11\n\t"
        "movq 16(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r12\n\t"
        "adcxq %%rbx, %%r13\n\t"
        "movq 24(%[a]), %%rdx\n\t"
        "mulxq %%rdx, %%rax, %%rbx\n\t"
        "adcxq %%rax, %%r14\n\t"
        "adcxq %%rbx, %%r15\n\t"

        // ---- Reduction (512 -> 256) ----
        "movabsq $0x1000003D1, %%rdx\n\t"
        "xorl %%ecx, %%ecx\n\t"

        "mulxq %%r12, %%rax, %%rbx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r13, %%rax, %%rbx\n\t"
        "addq %%rax, %%r9\n\t"
        "adcq %%rbx, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r14, %%rax, %%rbx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rbx, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        "mulxq %%r15, %%rax, %%rbx\n\t"
        "addq %%rax, %%r11\n\t"
        "adcq %%rbx, %%rcx\n\t"

        // Pass 1: reduce overflow
        "mulxq %%rcx, %%rax, %%rbx\n\t"
        "xorl %%ecx, %%ecx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rbx, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"
        "adcq $0, %%rcx\n\t"

        // Pass 2: branchless final reduction
        "movabsq $0x1000003D1, %%rax\n\t"
        "andq %%rcx, %%rax\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq $0, %%r9\n\t"
        "adcq $0, %%r10\n\t"
        "adcq $0, %%r11\n\t"

        // Branchless mod-p
        "movabsq $0x1000003D1, %%rcx\n\t"
        "movq %%r8, %%rax\n\t"
        "addq %%rcx, %%rax\n\t"
        "movq %%r9, %%rbx\n\t"
        "adcq $0, %%rbx\n\t"
        "movq %%r10, %%r12\n\t"
        "adcq $0, %%r12\n\t"
        "movq %%r11, %%r13\n\t"
        "adcq $0, %%r13\n\t"
        "cmovcq %%rax, %%r8\n\t"
        "cmovcq %%rbx, %%r9\n\t"
        "cmovcq %%r12, %%r10\n\t"
        "cmovcq %%r13, %%r11\n\t"

        // Store (load output pointer from stack)
        "movq %[r_ptr], %%rax\n\t"
        "movq %%r8, (%%rax)\n\t"
        "movq %%r9, 8(%%rax)\n\t"
        "movq %%r10, 16(%%rax)\n\t"
        "movq %%r11, 24(%%rax)\n\t"

        : /* no output operands */
        : [a] "r" (a), [r_ptr] "m" (r)
        : "rax", "rbx", "rcx", "rdx",
          "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15",
          "cc", "memory"
    );
}
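
// ---------------------------------------------------------------------------
// Illustrative self-check (sketch; sqr_matches_mul is a hypothetical helper,
// not part of the original API): sqr(r, a) must agree with mul(r, a, a) for
// every input, which makes a cheap differential test of the two asm paths.
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
bool sqr_matches_mul(const std::uint64_t* a) noexcept {
    std::uint64_t s[4], m[4];
    sqr(s, a);
    mul(m, a, a);
    return s[0] == m[0] && s[1] == m[1] && s[2] == m[2] && s[3] == m[3];
}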

// ---------------------------------------------------------------------------
// fe4x64_add: Modular addition (a + b) mod p (branchless)
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void add(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a,
         const std::uint64_t* __restrict__ b) noexcept {
    // Sum S = a + b
    unsigned __int128 s = (unsigned __int128)a[0] + b[0];
    std::uint64_t r0 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[1] + b[1];
    std::uint64_t r1 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[2] + b[2];
    std::uint64_t r2 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[3] + b[3];
    std::uint64_t r3 = (std::uint64_t)s;
    std::uint64_t carry_s = (std::uint64_t)(s >> 64);

    // T = S + K (to check if S >= p)
    unsigned __int128 t = (unsigned __int128)r0 + K;
    std::uint64_t t0 = (std::uint64_t)t; t >>= 64;
    t += r1;
    std::uint64_t t1 = (std::uint64_t)t; t >>= 64;
    t += r2;
    std::uint64_t t2 = (std::uint64_t)t; t >>= 64;
    t += r3;
    std::uint64_t t3 = (std::uint64_t)t;
    std::uint64_t carry_t = (std::uint64_t)(t >> 64);

    // If S overflowed (carry_s) OR T overflowed (carry_t), use T
    std::uint64_t use_t = carry_s | carry_t;
    std::uint64_t mask = (std::uint64_t)0 - use_t; // all-1s if use_t, all-0s if not

    r[0] = (r0 & ~mask) | (t0 & mask);
    r[1] = (r1 & ~mask) | (t1 & mask);
    r[2] = (r2 & ~mask) | (t2 & mask);
    r[3] = (r3 & ~mask) | (t3 & mask);
}
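
// Worked example for add() (illustrative): a = p - 1, b = 1. S = p does not
// overflow 2^256 (carry_s = 0), but T = S + K = 2^256 sets carry_t, so T's
// low 256 bits {0,0,0,0} are selected: (p - 1) + 1 == 0 (mod p).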

// ---------------------------------------------------------------------------
// fe4x64_sub: Modular subtraction (a - b) mod p (branchless)
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void sub(std::uint64_t* __restrict__ r,
         const std::uint64_t* __restrict__ a,
         const std::uint64_t* __restrict__ b) noexcept {
    // D = a - b (with borrow tracking)
    unsigned __int128 d = (unsigned __int128)a[0] - b[0];
    std::uint64_t r0 = (std::uint64_t)d;
    std::uint64_t borrow = (d >> 127) & 1; // 1 if borrow

    d = (unsigned __int128)a[1] - b[1] - borrow;
    std::uint64_t r1 = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    d = (unsigned __int128)a[2] - b[2] - borrow;
    std::uint64_t r2 = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    d = (unsigned __int128)a[3] - b[3] - borrow;
    std::uint64_t r3 = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    // If there was a final borrow, the wrapped value equals a - b + 2^256.
    // The correct result is a - b + p = a - b + 2^256 - K: the 2^256 is
    // already supplied by the wraparound, so only K remains to be subtracted.
    std::uint64_t mask = (std::uint64_t)0 - borrow; // all-1s if borrow
    std::uint64_t corr = K & mask;

    d = (unsigned __int128)r0 - corr;
    r[0] = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    d = (unsigned __int128)r1 - borrow;
    r[1] = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    d = (unsigned __int128)r2 - borrow;
    r[2] = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    r[3] = r3 - borrow;
}
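
// Worked example for sub() (illustrative): a = 0, b = 1. The limb subtraction
// wraps to 2^256 - 1 with a final borrow, so K is subtracted, giving
// 2^256 - 1 - K = p - 1, i.e. 0 - 1 == p - 1 (mod p).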

// ---------------------------------------------------------------------------
// fe4x64_negate: Modular negation (-a) mod p (branchless)
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void negate(std::uint64_t* __restrict__ r,
            const std::uint64_t* __restrict__ a) noexcept {
    // -a mod p = p - a (if a != 0), 0 (if a == 0)
    // Branchless: compute p - a, then mask to 0 if a was 0
    unsigned __int128 d = (unsigned __int128)P0 - a[0];
    r[0] = (std::uint64_t)d;
    std::uint64_t borrow = (d >> 127) & 1;

    d = (unsigned __int128)P1 - a[1] - borrow;
    r[1] = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    d = (unsigned __int128)P2 - a[2] - borrow;
    r[2] = (std::uint64_t)d;
    borrow = (d >> 127) & 1;

    r[3] = P3 - a[3] - borrow;

    // If a == 0, result should also be 0
    std::uint64_t nonzero = a[0] | a[1] | a[2] | a[3];
    std::uint64_t mask = (std::uint64_t)0 - (std::uint64_t)(nonzero != 0);
    r[0] &= mask;
    r[1] &= mask;
    r[2] &= mask;
    r[3] &= mask;
}
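
// Worked example for negate() (illustrative): a = 1 yields p - 1; a = 0 first
// yields p - 0 = p, which the zero mask then clears back to 0.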

// ---------------------------------------------------------------------------
// fe4x64_mul_int: Multiply by small integer (a * k) mod p
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void mul_int(std::uint64_t* __restrict__ r,
             const std::uint64_t* __restrict__ a,
             std::uint32_t k) noexcept {
    using u128 = unsigned __int128;
    u128 c = (u128)a[0] * k;
    r[0] = (std::uint64_t)c; c >>= 64;
    c += (u128)a[1] * k;
    r[1] = (std::uint64_t)c; c >>= 64;
    c += (u128)a[2] * k;
    r[2] = (std::uint64_t)c; c >>= 64;
    c += (u128)a[3] * k;
    r[3] = (std::uint64_t)c;
    std::uint64_t overflow = (std::uint64_t)(c >> 64);

    // Reduce overflow: overflow * K back into limb 0
    if (overflow) {
        u128 f = (u128)overflow * K + r[0];
        r[0] = (std::uint64_t)f;
        std::uint64_t carry = (std::uint64_t)(f >> 64);
        r[1] += carry;
        carry = (r[1] < carry) ? 1ULL : 0ULL;
        r[2] += carry;
        carry = (r[2] < carry) ? 1ULL : 0ULL;
        r[3] += carry;
    }
}
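
// Worked example for the overflow fold (illustrative): a = p - 1, k = 5.
// a*k = 5p - 5 = 4*2^256 + (2^256 - 5K - 5), so overflow = 4; folding 4*K
// back into limb 0 gives 2^256 - K - 5 = p - 5, which is indeed 5*(p - 1) mod p.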

// ---------------------------------------------------------------------------
// fe4x64_half: Modular halving (a / 2) mod p (branchless)
// ---------------------------------------------------------------------------
// If a is even: shift right by 1.
// If a is odd: (a + p) / 2 (p is odd, so a+p is even).
FE4X64_FORCE_INLINE
void half(std::uint64_t* __restrict__ r,
          const std::uint64_t* __restrict__ a) noexcept {
    // Conditionally add p if odd
    std::uint64_t mask = (std::uint64_t)0 - (a[0] & 1); // all-1s if odd
    unsigned __int128 s = (unsigned __int128)a[0] + (P0 & mask);
    std::uint64_t t0 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[1] + (P1 & mask);
    std::uint64_t t1 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[2] + (P2 & mask);
    std::uint64_t t2 = (std::uint64_t)s; s >>= 64;
    s += (unsigned __int128)a[3] + (P3 & mask);
    std::uint64_t t3 = (std::uint64_t)s;
    std::uint64_t carry = (std::uint64_t)(s >> 64);

    // Shift right by 1 (divides the even result by 2)
    r[0] = (t0 >> 1) | (t1 << 63);
    r[1] = (t1 >> 1) | (t2 << 63);
    r[2] = (t2 >> 1) | (t3 << 63);
    r[3] = (t3 >> 1) | (carry << 63);
}
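
// Worked example for half() (illustrative): a = 1 is odd, so p is added and
// the shift returns (p + 1) / 2; doubling that value gives p + 1 == 1 (mod p),
// as required.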

// ---------------------------------------------------------------------------
// fe4x64_copy: Copy field element
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void copy(std::uint64_t* __restrict__ dst,
          const std::uint64_t* __restrict__ src) noexcept {
    dst[0] = src[0];
    dst[1] = src[1];
    dst[2] = src[2];
    dst[3] = src[3];
}

// ---------------------------------------------------------------------------
// fe4x64_cmov: Conditional move (branchless)
// If flag != 0: dst = src. If flag == 0: dst unchanged.
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
void cmov(std::uint64_t* __restrict__ dst,
          const std::uint64_t* __restrict__ src,
          std::uint64_t flag) noexcept {
    std::uint64_t mask = (std::uint64_t)0 - (std::uint64_t)(flag != 0);
    dst[0] = (dst[0] & ~mask) | (src[0] & mask);
    dst[1] = (dst[1] & ~mask) | (src[1] & mask);
    dst[2] = (dst[2] & ~mask) | (src[2] & mask);
    dst[3] = (dst[3] & ~mask) | (src[3] & mask);
}

// ---------------------------------------------------------------------------
// fe4x64_is_zero: Check if field element is zero (constant-time)
// ---------------------------------------------------------------------------
FE4X64_FORCE_INLINE
bool is_zero(const std::uint64_t* a) noexcept {
    return (a[0] | a[1] | a[2] | a[3]) == 0;
}

} // namespace secp256k1::fast::fe4x64

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic pop
#endif

#endif // x86_64 + ADX + BMI2