UltrafastSecp256k1 3.50.0
Ultra high-performance secp256k1 elliptic curve cryptography library
Loading...
Searching...
No Matches
field_52_impl.hpp
Go to the documentation of this file.
1// ============================================================================
2// 5x52 Field Element -- Inline Hot-Path Implementations
3// ============================================================================
4//
5// All performance-critical 5x52 operations are FORCE-INLINED to eliminate
6// function-call overhead in ECC point operations (the #1 bottleneck).
7//
8// On x86-64 with -march=native, Clang/GCC generate MULX (BMI2) assembly
9// from the __int128 C code -- identical to hand-written assembly but with
10// superior register allocation (no callee-save push/pop overhead).
11//
12// This matches the strategy of bitcoin-core/secp256k1, which uses
13// SECP256K1_INLINE static in field_5x52_int128_impl.h.
14//
15// Impact: eliminates ~2-3ns per field-mul call -> cumulative ~30-50ns
16// savings per point double/add (which has 7+ field mul/sqr calls).
17//
18// Adaptation from bitcoin-core/secp256k1 field_5x52_int128_impl.h
19// (MIT license, Copyright (c) 2013-2024 Pieter Wuille and contributors)
20// ============================================================================
21
22#ifndef SECP256K1_FIELD_52_IMPL_HPP
23#define SECP256K1_FIELD_52_IMPL_HPP
24#pragma once
25
26#include <cstdint>
27
28// Guard: __int128 required for the 5x52 kernels
29// __SIZEOF_INT128__ is the canonical check -- defined on 64-bit GCC/Clang,
30// NOT on 32-bit (ESP32 Xtensa, Cortex-M, etc.) even though __GNUC__ is set.
31#if defined(__SIZEOF_INT128__)
32
33// Suppress GCC -Wpedantic for __int128 (universally supported on 64-bit GCC/Clang)
34#if defined(__GNUC__)
35#pragma GCC diagnostic push
36#pragma GCC diagnostic ignored "-Wpedantic"
37#endif
38
39// -- RISC-V 64-bit optimized FE52 kernels ---------------------------------
40// On SiFive U74 (in-order dual-issue), hand-scheduled MUL/MULHU assembly
41// for 5x52 Comba multiply with integrated secp256k1 reduction outperforms
42// __int128 C++ code because:
43// 1) Explicit register allocation avoids spills (25+ MUL ops)
44// 2) Carry chain scheduling hides MUL latency on in-order pipeline
45// 3) Branchless reduction integrated without separate passes
46//
47// Enabled by default when SECP256K1_HAS_RISCV_FE52_ASM is set by CMake.
48// To disable and fall back to __int128 C++: -DSECP256K1_RISCV_FE52_DISABLE=1
49#if defined(__riscv) && (__riscv_xlen == 64) && defined(SECP256K1_HAS_RISCV_FE52_ASM) \
50 && !defined(SECP256K1_RISCV_FE52_DISABLE)
51 #define SECP256K1_RISCV_FE52_V1 1
52extern "C" {
53 void fe52_mul_inner_riscv64(std::uint64_t* r, const std::uint64_t* a, const std::uint64_t* b);
54 void fe52_sqr_inner_riscv64(std::uint64_t* r, const std::uint64_t* a);
55}
56#endif
57
58// -- 4x64 assembly bridge for boundary-level FE52 optimizations ---------------
59// Provides access to 4x64 ADCX/ADOX field_mul/sqr assembly from FE52 code.
60// Used for pure sqr/mul chains (inverse, sqrt) where conversion at boundaries
61// is negligible (~6ns) compared to per-op savings (~2ns x 269 ops = ~538ns).
62// NOT used per-mul/sqr (GPU-style hybrid: same pointer, no conversion).
63// Requires: SECP256K1_HAS_ASM + x86-64 (4x64 assembly always linked)
64#if defined(SECP256K1_HAS_ASM) && (defined(__x86_64__) || defined(_M_X64))
65 #define SECP256K1_HYBRID_4X64_ACTIVE 1
66 #if defined(_WIN32)
67 extern "C" __attribute__((sysv_abi)) void field_mul_full_asm(
68 const std::uint64_t* a, const std::uint64_t* b, std::uint64_t* result);
69 extern "C" __attribute__((sysv_abi)) void field_sqr_full_asm(
70 const std::uint64_t* a, std::uint64_t* result);
71 #else
72 extern "C" {
73 void field_mul_full_asm(
74 const std::uint64_t* a, const std::uint64_t* b, std::uint64_t* result);
75 void field_sqr_full_asm(
76 const std::uint64_t* a, std::uint64_t* result);
77 }
78 #endif
79#endif // SECP256K1_HAS_ASM && x86-64
80
81// Force-inline attribute -- ensures zero call overhead for field ops.
82// The compiler generates MULX assembly automatically with -mbmi2.
83#if defined(__GNUC__) || defined(__clang__)
84 #define SECP256K1_FE52_FORCE_INLINE __attribute__((always_inline)) inline
85#elif defined(_MSC_VER)
86 #define SECP256K1_FE52_FORCE_INLINE __forceinline
87#else
88 #define SECP256K1_FE52_FORCE_INLINE inline
89#endif
90
91// -- Hybrid 4x64 helper functions (placed after SECP256K1_FE52_FORCE_INLINE) --
92#if defined(SECP256K1_HYBRID_4X64_ACTIVE)
93
94 // Fused normalize_weak + 5x52->4x64 pack (single function, minimal overhead)
95 SECP256K1_FE52_FORCE_INLINE
96 void fe52_normalize_and_pack_4x64(const std::uint64_t* n, std::uint64_t* out) noexcept {
97 constexpr std::uint64_t M = 0xFFFFFFFFFFFFFULL; // 52-bit mask
98 constexpr std::uint64_t M48v = 0xFFFFFFFFFFFFULL; // 48-bit mask
99 std::uint64_t t0 = n[0], t1 = n[1], t2 = n[2], t3 = n[3], t4 = n[4];
100 // Pass 1: carry propagation
101 t1 += (t0 >> 52); t0 &= M;
102 t2 += (t1 >> 52); t1 &= M;
103 t3 += (t2 >> 52); t2 &= M;
104 t4 += (t3 >> 52); t3 &= M;
105 // Overflow fold: x * 2^256 == x * 0x1000003D1 (mod p)
106 std::uint64_t const x = t4 >> 48;
107 t4 &= M48v;
108 t0 += x * 0x1000003D1ULL;
109 // Pass 2: re-propagate carry from fold
110 t1 += (t0 >> 52); t0 &= M;
111 t2 += (t1 >> 52); t1 &= M;
112 t3 += (t2 >> 52); t2 &= M;
113 t4 += (t3 >> 52); t3 &= M;
114 // Pack to 4x64
115 out[0] = t0 | (t1 << 52);
116 out[1] = (t1 >> 12) | (t2 << 40);
117 out[2] = (t2 >> 24) | (t3 << 28);
118 out[3] = (t3 >> 36) | (t4 << 16);
119 }
120
121 // 4x64 -> 5x52 unpack (no normalization needed, output is magnitude 1)
122 SECP256K1_FE52_FORCE_INLINE
123 void fe64_unpack_to_fe52(const std::uint64_t* L, std::uint64_t* r) noexcept {
124 constexpr std::uint64_t M = 0xFFFFFFFFFFFFFULL;
125 r[0] = L[0] & M;
126 r[1] = (L[0] >> 52) | ((L[1] & 0xFFFFFFFFFFULL) << 12);
127 r[2] = (L[1] >> 40) | ((L[2] & 0xFFFFFFFULL) << 24);
128 r[3] = (L[2] >> 28) | ((L[3] & 0xFFFFULL) << 36);
129 r[4] = L[3] >> 16;
130 }
131
132#endif // SECP256K1_HYBRID_4X64_ACTIVE
133
134namespace secp256k1::fast {
135
136using namespace fe52_constants;
137
138// ===========================================================================
139// Core Multiplication Kernel
140// ===========================================================================
141//
142// 5x52 field multiplication with inline secp256k1 reduction.
143// p = 2^256 - 0x1000003D1, so 2^260 == R = 0x1000003D10 (mod p).
144//
145// Product columns 5-8 are reduced by multiplying by R (or R>>4, R<<12)
146// and adding to columns 0-3. Columns processed out of order (3,4,0,1,2)
147// to keep 128-bit accumulators from overflowing.
148//
149// With -mbmi2 -O3: compiles to MULX + ADD/ADC chains (verified).
150// With always_inline: zero function-call overhead.
151
// ---------------------------------------------------------------------------
// fe52_mul_inner: r = a * b mod p over 5x52-bit limbs.
//
// @param r  output, five 52-bit limbs (limb 4 holds <= 48 significant bits
//           plus carry headroom), weakly normalized per this reduction scheme
// @param a  left operand, five 52-bit limbs
// @param b  right operand, five 52-bit limbs
//
// Adapted from bitcoin-core/secp256k1 fe_mul_inner (see file header); input
// limb-magnitude requirements follow that upstream contract — TODO confirm
// the exact caller-side magnitude bound against upstream's VERIFY checks.
//
// Four implementations are selected by the preprocessor:
//   1) RISCV_FE52_V1      — external hand-scheduled RV64 assembly (active
//                           when CMake enables it)
//   2) MONOLITHIC_MUL_ASM — single x86-64 MULX/ADCX/ADOX asm block
//                           (DISABLED via #elif 0; kept as reference)
//   3) INLINE_ADX         — per-column asm blocks with __int128 glue
//                           (DISABLED via #elif 0; kept as reference)
//   4) generic            — portable __int128 code (the usual active path)
// ---------------------------------------------------------------------------
SECP256K1_FE52_FORCE_INLINE
void fe52_mul_inner(std::uint64_t* r,
                    const std::uint64_t* a,
                    const std::uint64_t* b) noexcept {
#if defined(SECP256K1_RISCV_FE52_V1)
    // RISC-V: Comba 5x52 multiply with integrated reduction in asm.
    // On U74 in-order core, explicit register scheduling + carry hiding
    // outperforms __int128 C++ (which Clang compiles to MUL/MULHU pairs
    // with suboptimal register allocation for 25+ multiplications).
    fe52_mul_inner_riscv64(r, a, b);
#elif 0 // MONOLITHIC_MUL_ASM: Disabled -- with -march=native, Clang __int128
        // already emits optimal MULX+ADCX/ADOX code. The inline asm prevents
        // cross-operation scheduling and increases register pressure (~30% slower
        // point_add when enabled). Kept for reference/non-native builds.
    // ========================================================================
    // Monolithic x86-64 MULX + ADCX/ADOX field multiply (single asm block)
    // ========================================================================
    // Single asm block = ZERO optimization barriers between columns.
    // ADCX/ADOX dual carry chains enable ILP in columns 1 & 2.
    //
    // Register layout — ALL clobbered registers are volatile on Win64:
    //   [a0]-[a4] = a limbs (read-only inputs, compiler picks registers)
    //   [bp]      = b pointer (read-only input)
    //   r8        = d_lo accumulator (volatile, clobbered)
    //   r9        = d_hi accumulator (volatile, clobbered)
    //   r10       = c_lo accumulator (volatile, clobbered)
    //   r11       = c_hi accumulator (volatile, clobbered)
    //   rdx       = MULX source (volatile, clobbered)
    //   rax       = MULX low / scratch (volatile, clobbered)
    //   rcx       = MULX high / scratch (volatile, clobbered)
    //   rbp,rsi,rbx,rdi,r12-r15 = FREE for compiler (not touched by asm)
    // ========================================================================
    // out3/out4 are zero-initialized because the asm reads them ("+m") as
    // spill slots before their final values are stored.
    std::uint64_t out0, out1, out2, out3 = 0, out4 = 0;
    const std::uint64_t a0_v = a[0], a1_v = a[1], a2_v = a[2];
    const std::uint64_t a3_v = a[3], a4_v = a[4];
    __asm__ __volatile__ (
        // ---- Column 3 + reduced column 8 ----
        "xorl %%r8d, %%r8d\n\t"      // d_lo=0, clears CF+OF
        "xorl %%r9d, %%r9d\n\t"      // d_hi=0

        "movq %[a0], %%rdx\n\t"
        "mulxq 24(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a1], %%rdx\n\t"
        "mulxq 16(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a2], %%rdx\n\t"
        "mulxq 8(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a3], %%rdx\n\t"
        "mulxq (%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"

        // c = a4*b4
        "movq %[a4], %%rdx\n\t"
        "mulxq 32(%[bp]), %%r10, %%r11\n\t"

        // d += R52 * c_lo
        "movabsq $0x1000003D10, %%rdx\n\t"
        "mulxq %%r10, %%rax, %%rcx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rcx, %%r9\n\t"
        // c >>= 64
        "movq %%r11, %%r10\n\t"

        // t3 = d & M52 → store to out3; d >>= 52
        "movq %%r8, %%rax\n\t"
        "movq $0xFFFFFFFFFFFFF, %%rcx\n\t"
        "andq %%rcx, %%rax\n\t"
        "movq %%rax, %[o3]\n\t"
        "shrdq $52, %%r9, %%r8\n\t"
        "shrq $52, %%r9\n\t"

        // ---- Column 4 + column 8 carry ----
        "xorl %%eax, %%eax\n\t"      // clears CF+OF for the next adcx chain
        "movq %[a0], %%rdx\n\t"
        "mulxq 32(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a1], %%rdx\n\t"
        "mulxq 24(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a2], %%rdx\n\t"
        "mulxq 16(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a3], %%rdx\n\t"
        "mulxq 8(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a4], %%rdx\n\t"
        "mulxq (%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"

        // d += (R52 << 12) * c_lo
        "movabsq $0x1000003D10000, %%rdx\n\t"
        "mulxq %%r10, %%rax, %%rcx\n\t"
        "addq %%rax, %%r8\n\t"
        "adcq %%rcx, %%r9\n\t"

        // t4_full = d & M52 → r10; d >>= 52
        "movq %%r8, %%r10\n\t"
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "andq %%rax, %%r10\n\t"
        "shrdq $52, %%r9, %%r8\n\t"
        "shrq $52, %%r9\n\t"

        // ---- Column 0 + reduced column 5 ----
        "xorl %%eax, %%eax\n\t"
        "movq %[a1], %%rdx\n\t"
        "mulxq 32(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a2], %%rdx\n\t"
        "mulxq 24(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a3], %%rdx\n\t"
        "mulxq 16(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a4], %%rdx\n\t"
        "mulxq 8(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"

        // u0 = ((d & M52) << 4) | (t4_full >> 48)
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r8, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "shrdq $52, %%r9, %%r8\n\t"
        "shrq $52, %%r9\n\t"
        "shlq $4, %%rcx\n\t"
        "movq %%r10, %%rax\n\t"
        "shrq $48, %%rax\n\t"
        "orq %%rax, %%rcx\n\t"
        // t4 = t4_full & M48 → store to out4
        "movq $0xFFFFFFFFFFFF, %%rax\n\t"
        "andq %%rax, %%r10\n\t"
        "movq %%r10, %[o4]\n\t"

        // c = a0*b0 + u0 * (R52 >> 4)
        "movq %[a0], %%rdx\n\t"
        "mulxq (%[bp]), %%r10, %%r11\n\t"
        "movabsq $0x1000003D1, %%rdx\n\t"
        "mulxq %%rcx, %%rax, %%rcx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rcx, %%r11\n\t"

        // r[0] = c & M52; c >>= 52
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r10, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "movq %%rcx, %[o0]\n\t"
        "shrdq $52, %%r11, %%r10\n\t"
        "shrq $52, %%r11\n\t"

        // ---- Column 1 + reduced column 6 (dual chain) ----
        // adox (OF chain) feeds c in r10:r11, adcx (CF chain) feeds d in r8:r9
        "xorl %%eax, %%eax\n\t"
        "movq %[a0], %%rdx\n\t"
        "mulxq 8(%[bp]), %%rax, %%rcx\n\t"
        "adoxq %%rax, %%r10\n\t"
        "adoxq %%rcx, %%r11\n\t"
        "movq %[a2], %%rdx\n\t"
        "mulxq 32(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a1], %%rdx\n\t"
        "mulxq (%[bp]), %%rax, %%rcx\n\t"
        "adoxq %%rax, %%r10\n\t"
        "adoxq %%rcx, %%r11\n\t"
        "movq %[a3], %%rdx\n\t"
        "mulxq 24(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a4], %%rdx\n\t"
        "mulxq 16(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"

        // c += (d & M52) * R52; d >>= 52
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r8, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "shrdq $52, %%r9, %%r8\n\t"
        "shrq $52, %%r9\n\t"
        "movabsq $0x1000003D10, %%rdx\n\t"
        "mulxq %%rcx, %%rax, %%rcx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rcx, %%r11\n\t"

        // r[1] = c & M52; c >>= 52
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r10, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "movq %%rcx, %[o1]\n\t"
        "shrdq $52, %%r11, %%r10\n\t"
        "shrq $52, %%r11\n\t"

        // ---- Column 2 + reduced column 7 (dual chain) ----
        "xorl %%eax, %%eax\n\t"
        "movq %[a0], %%rdx\n\t"
        "mulxq 16(%[bp]), %%rax, %%rcx\n\t"
        "adoxq %%rax, %%r10\n\t"
        "adoxq %%rcx, %%r11\n\t"
        "movq %[a3], %%rdx\n\t"
        "mulxq 32(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a1], %%rdx\n\t"
        "mulxq 8(%[bp]), %%rax, %%rcx\n\t"
        "adoxq %%rax, %%r10\n\t"
        "adoxq %%rcx, %%r11\n\t"
        "movq %[a4], %%rdx\n\t"
        "mulxq 24(%[bp]), %%rax, %%rcx\n\t"
        "adcxq %%rax, %%r8\n\t"
        "adcxq %%rcx, %%r9\n\t"
        "movq %[a2], %%rdx\n\t"
        "mulxq (%[bp]), %%rax, %%rcx\n\t"
        "adoxq %%rax, %%r10\n\t"
        "adoxq %%rcx, %%r11\n\t"

        // c += R52 * d_lo; d = d_hi
        "movabsq $0x1000003D10, %%rdx\n\t"
        "mulxq %%r8, %%rax, %%rcx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rcx, %%r11\n\t"
        "movq %%r9, %%r8\n\t"
        "xorl %%r9d, %%r9d\n\t"

        // r[2] = c & M52; c >>= 52
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r10, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "movq %%rcx, %[o2]\n\t"
        "shrdq $52, %%r11, %%r10\n\t"
        "shrq $52, %%r11\n\t"

        // ---- Finalize columns 3 and 4 ----
        "movabsq $0x1000003D10000, %%rdx\n\t"
        "mulxq %%r8, %%rax, %%rcx\n\t"
        "addq %%rax, %%r10\n\t"
        "adcq %%rcx, %%r11\n\t"
        "addq %[o3], %%r10\n\t"
        "adcq $0, %%r11\n\t"

        // r[3] = c & M52; c >>= 52
        "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
        "movq %%r10, %%rcx\n\t"
        "andq %%rax, %%rcx\n\t"
        "movq %%rcx, %[o3]\n\t"
        "shrdq $52, %%r11, %%r10\n\t"

        // r[4] = c + t4
        "addq %[o4], %%r10\n\t"
        "movq %%r10, %[o4]\n\t"

        : [o0] "=m"(out0), [o1] "=m"(out1), [o2] "=m"(out2),
          [o3] "+m"(out3), [o4] "+m"(out4)
        : [a0] "r"(a0_v), [a1] "r"(a1_v), [a2] "r"(a2_v),
          [a3] "r"(a3_v), [a4] "r"(a4_v), [bp] "r"(b)
        : "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "cc", "memory"
    );
    r[0] = out0; r[1] = out1; r[2] = out2; r[3] = out3; r[4] = out4;
#elif 0 // INLINE_ADX disabled: asm barriers prevent ILP, __int128 is 6% faster
    // ------------------------------------------------------------------
    // x86-64 inline MULX + ADCX/ADOX dual carry chain path (OPT-IN)
    // NOTE: opt-in only. In benchmarks, the overhead of asm-block
    // optimization barriers outweighs the ADCX/ADOX parallel benefit.
    // The __int128 fallback lets the compiler schedule across column
    // boundaries, giving ~6% better throughput on Rocket Lake.
    // ------------------------------------------------------------------
    // ADCX uses CF flag, ADOX uses OF flag -- truly independent chains.
    // When both c and d accumulators accumulate products in the same
    // column, we interleave ADCX (d) and ADOX (c) to overlap execution.
    //
    // High-word carry invariant: sum of N products where each product
    // < 2^104 (52x52 bits) gives total < N*2^104. For N<=5:
    // 5*2^104 < 2^107 < 2^128. The 64-bit high word never overflows,
    // so carry-out from adcx/adox on the high part is always 0.
    // This keeps the continuous flag chain correct.
    //
    // Reduction multiplies between columns use __int128 C code (single
    // MULX+ADD+ADC pair, compiler-optimal for isolated operations).
    // ------------------------------------------------------------------
    using u128 = unsigned __int128;
    std::uint64_t d_lo = 0, d_hi = 0;
    std::uint64_t c_lo = 0, c_hi = 0;
    std::uint64_t t3, t4, tx, u0;
    std::uint64_t sl, sh;    // MULX scratch outputs shared by all asm blocks
    const std::uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];

    // -- Column 3 + reduced column 8 ---------------------------------
    // d = a0*b3 + a1*b2 + a2*b1 + a3*b0 (4 products, ADCX/CF)
    // c = a4*b4 (1 product, ADOX/OF)
    __asm__ __volatile__(
        "xor %%ecx, %%ecx\n\t"       // zeroes rcx AND clears CF+OF for both chains
        "mov %[a0], %%rdx\n\t"
        "mulxq 24(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a4], %%rdx\n\t"
        "mulxq 32(%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a1], %%rdx\n\t"
        "mulxq 16(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a2], %%rdx\n\t"
        "mulxq 8(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a3], %%rdx\n\t"
        "mulxq (%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
          [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
          [sl] "=&r"(sl), [sh] "=&r"(sh)
        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4),
          [bp] "r"(b)
        : "rdx", "rcx", "cc"
    );
    // d += R52 * (uint64_t)c
    { u128 dv = ((u128)d_hi << 64) | d_lo;
      dv += (u128)R52 * c_lo;
      d_lo = (std::uint64_t)dv; d_hi = (std::uint64_t)(dv >> 64); }
    c_lo = c_hi; c_hi = 0;
    t3 = d_lo & M52;
    d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;

    // -- Column 4 + column 8 carry -----------------------------------
    // d += a0*b4 + a1*b3 + a2*b2 + a3*b1 + a4*b0 (5 products, ADCX only)
    __asm__ __volatile__(
        "xor %%ecx, %%ecx\n\t"
        "mov %[a0], %%rdx\n\t"
        "mulxq 32(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a1], %%rdx\n\t"
        "mulxq 24(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a2], %%rdx\n\t"
        "mulxq 16(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a3], %%rdx\n\t"
        "mulxq 8(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a4], %%rdx\n\t"
        "mulxq (%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
          [sl] "=&r"(sl), [sh] "=&r"(sh)
        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4),
          [bp] "r"(b)
        : "rdx", "rcx", "cc"
    );
    // d += (R52 << 12) * c_lo (c_lo carries column 3's c_hi)
    { u128 dv = ((u128)d_hi << 64) | d_lo;
      dv += (u128)(R52 << 12) * c_lo;
      d_lo = (std::uint64_t)dv; d_hi = (std::uint64_t)(dv >> 64); }
    t4 = d_lo & M52;
    d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
    tx = (t4 >> 48); t4 &= (M52 >> 4);

    // -- Column 0 + reduced column 5 ---------------------------------
    // c = a0*b0 (1 product, ADOX/OF)
    // d += a1*b4 + a2*b3 + a3*b2 + a4*b1 (4 products, ADCX/CF)
    c_lo = 0; c_hi = 0;
    __asm__ __volatile__(
        "xor %%ecx, %%ecx\n\t"
        "mov %[a0], %%rdx\n\t"
        "mulxq (%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a1], %%rdx\n\t"
        "mulxq 32(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a2], %%rdx\n\t"
        "mulxq 24(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a3], %%rdx\n\t"
        "mulxq 16(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a4], %%rdx\n\t"
        "mulxq 8(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
          [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
          [sl] "=&r"(sl), [sh] "=&r"(sh)
        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4),
          [bp] "r"(b)
        : "rdx", "rcx", "cc"
    );
    u0 = d_lo & M52;
    d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
    u0 = (u0 << 4) | tx;
    // c += u0 * (R52 >> 4)
    { u128 cv = ((u128)c_hi << 64) | c_lo;
      cv += (u128)u0 * (R52 >> 4);
      c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
    r[0] = c_lo & M52;
    c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;

    // -- Column 1 + reduced column 6 ---------------------------------
    // c += a0*b1 + a1*b0 (2 products, ADOX/OF)
    // d += a2*b4 + a3*b3 + a4*b2 (3 products, ADCX/CF)
    __asm__ __volatile__(
        "xor %%ecx, %%ecx\n\t"
        "mov %[a0], %%rdx\n\t"
        "mulxq 8(%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a2], %%rdx\n\t"
        "mulxq 32(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a1], %%rdx\n\t"
        "mulxq (%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a3], %%rdx\n\t"
        "mulxq 24(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a4], %%rdx\n\t"
        "mulxq 16(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
          [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
          [sl] "=&r"(sl), [sh] "=&r"(sh)
        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4),
          [bp] "r"(b)
        : "rdx", "rcx", "cc"
    );
    // c += ((uint64_t)d & M52) * R52
    { std::uint64_t d_masked = d_lo & M52;
      u128 cv = ((u128)c_hi << 64) | c_lo;
      cv += (u128)d_masked * R52;
      c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
    d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
    r[1] = c_lo & M52;
    c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;

    // -- Column 2 + reduced column 7 ---------------------------------
    // c += a0*b2 + a1*b1 + a2*b0 (3 products, ADOX/OF)
    // d += a3*b4 + a4*b3 (2 products, ADCX/CF)
    __asm__ __volatile__(
        "xor %%ecx, %%ecx\n\t"
        "mov %[a0], %%rdx\n\t"
        "mulxq 16(%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a3], %%rdx\n\t"
        "mulxq 32(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a1], %%rdx\n\t"
        "mulxq 8(%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        "mov %[a4], %%rdx\n\t"
        "mulxq 24(%[bp]), %[sl], %[sh]\n\t"
        "adcx %[sl], %[dl]\n\t"
        "adcx %[sh], %[dh]\n\t"
        "mov %[a2], %%rdx\n\t"
        "mulxq (%[bp]), %[sl], %[sh]\n\t"
        "adox %[sl], %[cl]\n\t"
        "adox %[sh], %[ch]\n\t"
        : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
          [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
          [sl] "=&r"(sl), [sh] "=&r"(sh)
        : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4),
          [bp] "r"(b)
        : "rdx", "rcx", "cc"
    );
    // c += R52 * (uint64_t)d
    { u128 cv = ((u128)c_hi << 64) | c_lo;
      cv += (u128)R52 * d_lo;
      c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
    d_lo = d_hi; d_hi = 0; // d >>= 64
    r[2] = c_lo & M52;
    c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;

    // -- Finalize columns 3 and 4 ------------------------------------
    { u128 cv = ((u128)c_hi << 64) | c_lo;
      cv += (u128)(R52 << 12) * d_lo;
      cv += t3;
      c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
    r[3] = c_lo & M52;
    c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;
    c_lo += t4;
    r[4] = c_lo;
#else
    // Generic portable path: the compiler lowers u128 products to
    // MUL/MULX pairs and is free to schedule across column boundaries.
    using u128 = unsigned __int128;
    u128 c = 0, d = 0;
    std::uint64_t t3 = 0, t4 = 0, tx = 0, u0 = 0;
    const std::uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];

    // -- Column 3 + reduced column 8 ---------------------------------
    d = (u128)a0 * b[3]
      + (u128)a1 * b[2]
      + (u128)a2 * b[1]
      + (u128)a3 * b[0];
    c = (u128)a4 * b[4];
    // Fold the low half of column 8 (scaled by R52 = 0x1000003D10) into d.
    d += (u128)R52 * (std::uint64_t)c;
    c >>= 64;
    t3 = (std::uint64_t)d & M52;
    d >>= 52;

    // -- Column 4 + column 8 carry -----------------------------------
    d += (u128)a0 * b[4]
       + (u128)a1 * b[3]
       + (u128)a2 * b[2]
       + (u128)a3 * b[1]
       + (u128)a4 * b[0];
    d += (u128)(R52 << 12) * (std::uint64_t)c;
    t4 = (std::uint64_t)d & M52;
    d >>= 52;
    // Split t4: tx keeps the 4 bits above position 48, t4 keeps 48 bits.
    tx = (t4 >> 48); t4 &= (M52 >> 4);

    // -- Column 0 + reduced column 5 ---------------------------------
    c = (u128)a0 * b[0];
    d += (u128)a1 * b[4]
       + (u128)a2 * b[3]
       + (u128)a3 * b[2]
       + (u128)a4 * b[1];
    u0 = (std::uint64_t)d & M52;
    d >>= 52;
    u0 = (u0 << 4) | tx;
    c += (u128)u0 * (R52 >> 4);
    r[0] = (std::uint64_t)c & M52;
    c >>= 52;

    // -- Column 1 + reduced column 6 ---------------------------------
    c += (u128)a0 * b[1]
       + (u128)a1 * b[0];
    d += (u128)a2 * b[4]
       + (u128)a3 * b[3]
       + (u128)a4 * b[2];
    c += (u128)((std::uint64_t)d & M52) * R52;
    d >>= 52;
    r[1] = (std::uint64_t)c & M52;
    c >>= 52;

    // -- Column 2 + reduced column 7 ---------------------------------
    c += (u128)a0 * b[2]
       + (u128)a1 * b[1]
       + (u128)a2 * b[0];
    d += (u128)a3 * b[4]
       + (u128)a4 * b[3];
    c += (u128)R52 * (std::uint64_t)d;
    d >>= 64;
    r[2] = (std::uint64_t)c & M52;
    c >>= 52;

    // -- Finalize columns 3 and 4 ------------------------------------
    c += (u128)(R52 << 12) * (std::uint64_t)d;
    c += t3;
    r[3] = (std::uint64_t)c & M52;
    c >>= 52;
    c += t4;
    r[4] = (std::uint64_t)c;
#endif // RISCV_FE52 / monolithic ASM (disabled) / inline ADX (disabled) / generic (mul)
}
733
734// ===========================================================================
735// Core Squaring Kernel (symmetry-optimized)
736// ===========================================================================
737//
738// Uses a[i]*a[j] == a[j]*a[i] symmetry to halve cross-product count.
739// Cross-products computed once and doubled via (a[i]*2) trick.
740
741SECP256K1_FE52_FORCE_INLINE
742void fe52_sqr_inner(std::uint64_t* r,
743 const std::uint64_t* a) noexcept {
744#if defined(SECP256K1_RISCV_FE52_V1)
745 // RISC-V: Symmetry-optimized squaring in asm.
746 // Cross-products doubled via shift, halving multiplication count.
747 fe52_sqr_inner_riscv64(r, a);
748#elif 0 // MONOLITHIC_SQR_ASM: Disabled -- same rationale as mul ASM above.
749 // ========================================================================
750 // Monolithic x86-64 MULX + ADCX/ADOX field squaring (single asm block)
751 // ========================================================================
752 // Cross-products doubled via LEA (flags-neutral), only 15 MULXes vs 25.
753 // ALL clobbered registers are volatile on Win64 (r8-r11, rax, rcx, rdx).
754 // ========================================================================
755 std::uint64_t out0, out1, out2, out3 = 0, out4 = 0;
756 const std::uint64_t a0_v = a[0], a1_v = a[1], a2_v = a[2];
757 const std::uint64_t a3_v = a[3], a4_v = a[4];
758 __asm__ __volatile__ (
759 // ---- Column 3 + reduced column 8 ----
760 // d = 2*a0*a3 + 2*a1*a2; c = a4^2
761 "xorl %%r8d, %%r8d\n\t"
762 "xorl %%r9d, %%r9d\n\t"
763
764 "leaq (%[a0], %[a0]), %%rdx\n\t"
765 "mulxq %[a3], %%rax, %%rcx\n\t"
766 "adcxq %%rax, %%r8\n\t"
767 "adcxq %%rcx, %%r9\n\t"
768 "leaq (%[a1], %[a1]), %%rdx\n\t"
769 "mulxq %[a2], %%rax, %%rcx\n\t"
770 "adcxq %%rax, %%r8\n\t"
771 "adcxq %%rcx, %%r9\n\t"
772
773 // c = a4*a4
774 "movq %[a4], %%rdx\n\t"
775 "mulxq %[a4], %%r10, %%r11\n\t"
776
777 // d += R52 * c_lo
778 "movabsq $0x1000003D10, %%rdx\n\t"
779 "mulxq %%r10, %%rax, %%rcx\n\t"
780 "addq %%rax, %%r8\n\t"
781 "adcq %%rcx, %%r9\n\t"
782 // c >>= 64
783 "movq %%r11, %%r10\n\t"
784
785 // t3 = d & M52; d >>= 52
786 "movq %%r8, %%rax\n\t"
787 "movq $0xFFFFFFFFFFFFF, %%rcx\n\t"
788 "andq %%rcx, %%rax\n\t"
789 "movq %%rax, %[o3]\n\t"
790 "shrdq $52, %%r9, %%r8\n\t"
791 "shrq $52, %%r9\n\t"
792
793 // ---- Column 4 ----
794 // d += 2*a0*a4 + 2*a1*a3 + a2^2
795 "xorl %%eax, %%eax\n\t"
796 "leaq (%[a0], %[a0]), %%rdx\n\t"
797 "mulxq %[a4], %%rax, %%rcx\n\t"
798 "adcxq %%rax, %%r8\n\t"
799 "adcxq %%rcx, %%r9\n\t"
800 "leaq (%[a1], %[a1]), %%rdx\n\t"
801 "mulxq %[a3], %%rax, %%rcx\n\t"
802 "adcxq %%rax, %%r8\n\t"
803 "adcxq %%rcx, %%r9\n\t"
804 "movq %[a2], %%rdx\n\t"
805 "mulxq %[a2], %%rax, %%rcx\n\t"
806 "adcxq %%rax, %%r8\n\t"
807 "adcxq %%rcx, %%r9\n\t"
808
809 // d += (R52 << 12) * c_lo
810 "movabsq $0x1000003D10000, %%rdx\n\t"
811 "mulxq %%r10, %%rax, %%rcx\n\t"
812 "addq %%rax, %%r8\n\t"
813 "adcq %%rcx, %%r9\n\t"
814
815 // t4_full = d & M52; d >>= 52
816 "movq %%r8, %%r10\n\t"
817 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
818 "andq %%rax, %%r10\n\t"
819 "shrdq $52, %%r9, %%r8\n\t"
820 "shrq $52, %%r9\n\t"
821
822 // ---- Column 0 + reduced column 5 ----
823 // c = a0^2; d += 2*a1*a4 + 2*a2*a3
824 "xorl %%eax, %%eax\n\t"
825 "leaq (%[a1], %[a1]), %%rdx\n\t"
826 "mulxq %[a4], %%rax, %%rcx\n\t"
827 "adcxq %%rax, %%r8\n\t"
828 "adcxq %%rcx, %%r9\n\t"
829 "leaq (%[a2], %[a2]), %%rdx\n\t"
830 "mulxq %[a3], %%rax, %%rcx\n\t"
831 "adcxq %%rax, %%r8\n\t"
832 "adcxq %%rcx, %%r9\n\t"
833
834 // u0 = ((d & M52) << 4) | (t4_full >> 48)
835 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
836 "movq %%r8, %%rcx\n\t"
837 "andq %%rax, %%rcx\n\t"
838 "shrdq $52, %%r9, %%r8\n\t"
839 "shrq $52, %%r9\n\t"
840 "shlq $4, %%rcx\n\t"
841 "movq %%r10, %%rax\n\t"
842 "shrq $48, %%rax\n\t"
843 "orq %%rax, %%rcx\n\t"
844 // t4 = t4_full & M48
845 "movq $0xFFFFFFFFFFFF, %%rax\n\t"
846 "andq %%rax, %%r10\n\t"
847 "movq %%r10, %[o4]\n\t"
848
849 // c = a0*a0 + u0 * (R52 >> 4)
850 "movq %[a0], %%rdx\n\t"
851 "mulxq %[a0], %%r10, %%r11\n\t"
852 "movabsq $0x1000003D1, %%rdx\n\t"
853 "mulxq %%rcx, %%rax, %%rcx\n\t"
854 "addq %%rax, %%r10\n\t"
855 "adcq %%rcx, %%r11\n\t"
856
857 // r[0] = c & M52; c >>= 52
858 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
859 "movq %%r10, %%rcx\n\t"
860 "andq %%rax, %%rcx\n\t"
861 "movq %%rcx, %[o0]\n\t"
862 "shrdq $52, %%r11, %%r10\n\t"
863 "shrq $52, %%r11\n\t"
864
865 // ---- Column 1 + reduced column 6 (dual chain) ----
866 // c += 2*a0*a1 (ADOX); d += 2*a2*a4 + a3^2 (ADCX)
867 "xorl %%eax, %%eax\n\t"
868 "leaq (%[a0], %[a0]), %%rdx\n\t"
869 "mulxq %[a1], %%rax, %%rcx\n\t"
870 "adoxq %%rax, %%r10\n\t"
871 "adoxq %%rcx, %%r11\n\t"
872 "leaq (%[a2], %[a2]), %%rdx\n\t"
873 "mulxq %[a4], %%rax, %%rcx\n\t"
874 "adcxq %%rax, %%r8\n\t"
875 "adcxq %%rcx, %%r9\n\t"
876 "movq %[a3], %%rdx\n\t"
877 "mulxq %[a3], %%rax, %%rcx\n\t"
878 "adcxq %%rax, %%r8\n\t"
879 "adcxq %%rcx, %%r9\n\t"
880
881 // c += (d & M52) * R52; d >>= 52
882 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
883 "movq %%r8, %%rcx\n\t"
884 "andq %%rax, %%rcx\n\t"
885 "shrdq $52, %%r9, %%r8\n\t"
886 "shrq $52, %%r9\n\t"
887 "movabsq $0x1000003D10, %%rdx\n\t"
888 "mulxq %%rcx, %%rax, %%rcx\n\t"
889 "addq %%rax, %%r10\n\t"
890 "adcq %%rcx, %%r11\n\t"
891
892 // r[1] = c & M52; c >>= 52
893 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
894 "movq %%r10, %%rcx\n\t"
895 "andq %%rax, %%rcx\n\t"
896 "movq %%rcx, %[o1]\n\t"
897 "shrdq $52, %%r11, %%r10\n\t"
898 "shrq $52, %%r11\n\t"
899
900 // ---- Column 2 + reduced column 7 (dual chain) ----
901 // c += 2*a0*a2 + a1^2 (ADOX); d += 2*a3*a4 (ADCX)
902 "xorl %%eax, %%eax\n\t"
903 "leaq (%[a0], %[a0]), %%rdx\n\t"
904 "mulxq %[a2], %%rax, %%rcx\n\t"
905 "adoxq %%rax, %%r10\n\t"
906 "adoxq %%rcx, %%r11\n\t"
907 "leaq (%[a3], %[a3]), %%rdx\n\t"
908 "mulxq %[a4], %%rax, %%rcx\n\t"
909 "adcxq %%rax, %%r8\n\t"
910 "adcxq %%rcx, %%r9\n\t"
911 "movq %[a1], %%rdx\n\t"
912 "mulxq %[a1], %%rax, %%rcx\n\t"
913 "adoxq %%rax, %%r10\n\t"
914 "adoxq %%rcx, %%r11\n\t"
915
916 // c += R52 * d_lo; d = d_hi
917 "movabsq $0x1000003D10, %%rdx\n\t"
918 "mulxq %%r8, %%rax, %%rcx\n\t"
919 "addq %%rax, %%r10\n\t"
920 "adcq %%rcx, %%r11\n\t"
921 "movq %%r9, %%r8\n\t"
922 "xorl %%r9d, %%r9d\n\t"
923
924 // r[2] = c & M52; c >>= 52
925 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
926 "movq %%r10, %%rcx\n\t"
927 "andq %%rax, %%rcx\n\t"
928 "movq %%rcx, %[o2]\n\t"
929 "shrdq $52, %%r11, %%r10\n\t"
930 "shrq $52, %%r11\n\t"
931
932 // ---- Finalize columns 3 and 4 ----
933 "movabsq $0x1000003D10000, %%rdx\n\t"
934 "mulxq %%r8, %%rax, %%rcx\n\t"
935 "addq %%rax, %%r10\n\t"
936 "adcq %%rcx, %%r11\n\t"
937 "addq %[o3], %%r10\n\t"
938 "adcq $0, %%r11\n\t"
939
940 // r[3] = c & M52; c >>= 52
941 "movq $0xFFFFFFFFFFFFF, %%rax\n\t"
942 "movq %%r10, %%rcx\n\t"
943 "andq %%rax, %%rcx\n\t"
944 "movq %%rcx, %[o3]\n\t"
945 "shrdq $52, %%r11, %%r10\n\t"
946
947 // r[4] = c + t4
948 "addq %[o4], %%r10\n\t"
949 "movq %%r10, %[o4]\n\t"
950
951 : [o0] "=m"(out0), [o1] "=m"(out1), [o2] "=m"(out2),
952 [o3] "+m"(out3), [o4] "+m"(out4)
953 : [a0] "r"(a0_v), [a1] "r"(a1_v), [a2] "r"(a2_v),
954 [a3] "r"(a3_v), [a4] "r"(a4_v)
955 : "rax", "rcx", "rdx", "r8", "r9", "r10", "r11", "cc", "memory"
956 );
957 r[0] = out0; r[1] = out1; r[2] = out2; r[3] = out3; r[4] = out4;
958#elif 0 // INLINE_ADX disabled: asm barriers prevent ILP, __int128 is 6% faster
959 // ------------------------------------------------------------------
960 // x86-64 inline MULX + ADCX/ADOX squaring (OPT-IN) -- see mul note
961 // ------------------------------------------------------------------
962 // Cross-products doubled via LEA (flags-neutral) then accumulated
963 // with ADCX/ADOX dual carry chains. Square terms use plain MULX.
964 // Same high-word carry invariant as fe52_mul_inner (sum < 2^128).
965 // ------------------------------------------------------------------
966 using u128 = unsigned __int128;
967 std::uint64_t d_lo = 0, d_hi = 0;
968 std::uint64_t c_lo = 0, c_hi = 0;
969 std::uint64_t t3, t4, tx, u0;
970 std::uint64_t sl, sh;
971 const std::uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
972
973 // -- Column 3 + reduced column 8 ---------------------------------
974 // d = (a0*2)*a3 + (a1*2)*a2 (2 cross-products, ADCX/CF)
975 // c = a4*a4 (1 square, ADOX/OF)
976 __asm__ __volatile__(
977 "xor %%ecx, %%ecx\n\t"
978 "lea (%[a0], %[a0]), %%rdx\n\t"
979 "mulxq %[a3], %[sl], %[sh]\n\t"
980 "adcx %[sl], %[dl]\n\t"
981 "adcx %[sh], %[dh]\n\t"
982 "mov %[a4], %%rdx\n\t"
983 "mulxq %[a4], %[sl], %[sh]\n\t"
984 "adox %[sl], %[cl]\n\t"
985 "adox %[sh], %[ch]\n\t"
986 "lea (%[a1], %[a1]), %%rdx\n\t"
987 "mulxq %[a2], %[sl], %[sh]\n\t"
988 "adcx %[sl], %[dl]\n\t"
989 "adcx %[sh], %[dh]\n\t"
990 : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
991 [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
992 [sl] "=&r"(sl), [sh] "=&r"(sh)
993 : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4)
994 : "rdx", "rcx", "cc"
995 );
996 { u128 dv = ((u128)d_hi << 64) | d_lo;
997 dv += (u128)R52 * c_lo;
998 d_lo = (std::uint64_t)dv; d_hi = (std::uint64_t)(dv >> 64); }
999 c_lo = c_hi; c_hi = 0;
1000 t3 = d_lo & M52;
1001 d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
1002
1003 // -- Column 4 ----------------------------------------------------
1004 // d += (a0*2)*a4 + (a1*2)*a3 + a2*a2 (3 products, ADCX only)
1005 __asm__ __volatile__(
1006 "xor %%ecx, %%ecx\n\t"
1007 "lea (%[a0], %[a0]), %%rdx\n\t"
1008 "mulxq %[a4], %[sl], %[sh]\n\t"
1009 "adcx %[sl], %[dl]\n\t"
1010 "adcx %[sh], %[dh]\n\t"
1011 "lea (%[a1], %[a1]), %%rdx\n\t"
1012 "mulxq %[a3], %[sl], %[sh]\n\t"
1013 "adcx %[sl], %[dl]\n\t"
1014 "adcx %[sh], %[dh]\n\t"
1015 "mov %[a2], %%rdx\n\t"
1016 "mulxq %[a2], %[sl], %[sh]\n\t"
1017 "adcx %[sl], %[dl]\n\t"
1018 "adcx %[sh], %[dh]\n\t"
1019 : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
1020 [sl] "=&r"(sl), [sh] "=&r"(sh)
1021 : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4)
1022 : "rdx", "rcx", "cc"
1023 );
1024 { u128 dv = ((u128)d_hi << 64) | d_lo;
1025 dv += (u128)(R52 << 12) * c_lo;
1026 d_lo = (std::uint64_t)dv; d_hi = (std::uint64_t)(dv >> 64); }
1027 t4 = d_lo & M52;
1028 d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
1029 tx = (t4 >> 48); t4 &= (M52 >> 4);
1030
1031 // -- Column 0 + reduced column 5 ---------------------------------
1032 // c = a0*a0 (1 square, ADOX/OF)
1033 // d += (a1*2)*a4 + (a2*2)*a3 (2 cross-products, ADCX/CF)
1034 c_lo = 0; c_hi = 0;
1035 __asm__ __volatile__(
1036 "xor %%ecx, %%ecx\n\t"
1037 "mov %[a0], %%rdx\n\t"
1038 "mulxq %[a0], %[sl], %[sh]\n\t"
1039 "adox %[sl], %[cl]\n\t"
1040 "adox %[sh], %[ch]\n\t"
1041 "lea (%[a1], %[a1]), %%rdx\n\t"
1042 "mulxq %[a4], %[sl], %[sh]\n\t"
1043 "adcx %[sl], %[dl]\n\t"
1044 "adcx %[sh], %[dh]\n\t"
1045 "lea (%[a2], %[a2]), %%rdx\n\t"
1046 "mulxq %[a3], %[sl], %[sh]\n\t"
1047 "adcx %[sl], %[dl]\n\t"
1048 "adcx %[sh], %[dh]\n\t"
1049 : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
1050 [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
1051 [sl] "=&r"(sl), [sh] "=&r"(sh)
1052 : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4)
1053 : "rdx", "rcx", "cc"
1054 );
1055 u0 = d_lo & M52;
1056 d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
1057 u0 = (u0 << 4) | tx;
1058 { u128 cv = ((u128)c_hi << 64) | c_lo;
1059 cv += (u128)u0 * (R52 >> 4);
1060 c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
1061 r[0] = c_lo & M52;
1062 c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;
1063
1064 // -- Column 1 + reduced column 6 ---------------------------------
1065 // c += (a0*2)*a1 (1 cross-product, ADOX/OF)
1066 // d += (a2*2)*a4 + a3*a3 (2 products, ADCX/CF)
1067 __asm__ __volatile__(
1068 "xor %%ecx, %%ecx\n\t"
1069 "lea (%[a0], %[a0]), %%rdx\n\t"
1070 "mulxq %[a1], %[sl], %[sh]\n\t"
1071 "adox %[sl], %[cl]\n\t"
1072 "adox %[sh], %[ch]\n\t"
1073 "lea (%[a2], %[a2]), %%rdx\n\t"
1074 "mulxq %[a4], %[sl], %[sh]\n\t"
1075 "adcx %[sl], %[dl]\n\t"
1076 "adcx %[sh], %[dh]\n\t"
1077 "mov %[a3], %%rdx\n\t"
1078 "mulxq %[a3], %[sl], %[sh]\n\t"
1079 "adcx %[sl], %[dl]\n\t"
1080 "adcx %[sh], %[dh]\n\t"
1081 : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
1082 [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
1083 [sl] "=&r"(sl), [sh] "=&r"(sh)
1084 : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4)
1085 : "rdx", "rcx", "cc"
1086 );
1087 { std::uint64_t d_masked = d_lo & M52;
1088 u128 cv = ((u128)c_hi << 64) | c_lo;
1089 cv += (u128)d_masked * R52;
1090 c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
1091 d_lo = (d_lo >> 52) | (d_hi << 12); d_hi >>= 52;
1092 r[1] = c_lo & M52;
1093 c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;
1094
1095 // -- Column 2 + reduced column 7 ---------------------------------
1096 // c += (a0*2)*a2 + a1*a1 (2 products, ADOX/OF)
1097 // d += (a3*2)*a4 (1 cross-product, ADCX/CF)
1098 __asm__ __volatile__(
1099 "xor %%ecx, %%ecx\n\t"
1100 "lea (%[a0], %[a0]), %%rdx\n\t"
1101 "mulxq %[a2], %[sl], %[sh]\n\t"
1102 "adox %[sl], %[cl]\n\t"
1103 "adox %[sh], %[ch]\n\t"
1104 "lea (%[a3], %[a3]), %%rdx\n\t"
1105 "mulxq %[a4], %[sl], %[sh]\n\t"
1106 "adcx %[sl], %[dl]\n\t"
1107 "adcx %[sh], %[dh]\n\t"
1108 "mov %[a1], %%rdx\n\t"
1109 "mulxq %[a1], %[sl], %[sh]\n\t"
1110 "adox %[sl], %[cl]\n\t"
1111 "adox %[sh], %[ch]\n\t"
1112 : [dl] "+&r"(d_lo), [dh] "+&r"(d_hi),
1113 [cl] "+&r"(c_lo), [ch] "+&r"(c_hi),
1114 [sl] "=&r"(sl), [sh] "=&r"(sh)
1115 : [a0] "r"(a0), [a1] "r"(a1), [a2] "r"(a2), [a3] "r"(a3), [a4] "r"(a4)
1116 : "rdx", "rcx", "cc"
1117 );
1118 { u128 cv = ((u128)c_hi << 64) | c_lo;
1119 cv += (u128)R52 * d_lo;
1120 c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
1121 d_lo = d_hi; d_hi = 0;
1122 r[2] = c_lo & M52;
1123 c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;
1124
1125 // -- Finalize columns 3 and 4 ------------------------------------
1126 { u128 cv = ((u128)c_hi << 64) | c_lo;
1127 cv += (u128)(R52 << 12) * d_lo;
1128 cv += t3;
1129 c_lo = (std::uint64_t)cv; c_hi = (std::uint64_t)(cv >> 64); }
1130 r[3] = c_lo & M52;
1131 c_lo = (c_lo >> 52) | (c_hi << 12); c_hi >>= 52;
1132 c_lo += t4;
1133 r[4] = c_lo;
1134#else
1135 using u128 = unsigned __int128;
1136 u128 c = 0, d = 0;
1137 std::uint64_t t3 = 0, t4 = 0, tx = 0, u0 = 0;
1138 const std::uint64_t a0 = a[0], a1 = a[1], a2 = a[2], a3 = a[3], a4 = a[4];
1139
1140 // -- Column 3 + reduced column 8 ---------------------------------
1141 d = (u128)(a0 * 2) * a3
1142 + (u128)(a1 * 2) * a2;
1143 c = (u128)a4 * a4;
1144 d += (u128)R52 * (std::uint64_t)c;
1145 c >>= 64;
1146 t3 = (std::uint64_t)d & M52;
1147 d >>= 52;
1148
1149 // -- Column 4 ----------------------------------------------------
1150 d += (u128)(a0 * 2) * a4
1151 + (u128)(a1 * 2) * a3
1152 + (u128)a2 * a2;
1153 d += (u128)(R52 << 12) * (std::uint64_t)c;
1154 t4 = (std::uint64_t)d & M52;
1155 d >>= 52;
1156 tx = (t4 >> 48); t4 &= (M52 >> 4);
1157
1158 // -- Column 0 + reduced column 5 ---------------------------------
1159 c = (u128)a0 * a0;
1160 d += (u128)(a1 * 2) * a4
1161 + (u128)(a2 * 2) * a3;
1162 u0 = (std::uint64_t)d & M52;
1163 d >>= 52;
1164 u0 = (u0 << 4) | tx;
1165 c += (u128)u0 * (R52 >> 4);
1166 r[0] = (std::uint64_t)c & M52;
1167 c >>= 52;
1168
1169 // -- Column 1 + reduced column 6 ---------------------------------
1170 c += (u128)(a0 * 2) * a1;
1171 d += (u128)(a2 * 2) * a4
1172 + (u128)a3 * a3;
1173 c += (u128)((std::uint64_t)d & M52) * R52;
1174 d >>= 52;
1175 r[1] = (std::uint64_t)c & M52;
1176 c >>= 52;
1177
1178 // -- Column 2 + reduced column 7 ---------------------------------
1179 c += (u128)(a0 * 2) * a2
1180 + (u128)a1 * a1;
1181 d += (u128)(a3 * 2) * a4;
1182 c += (u128)R52 * (std::uint64_t)d;
1183 d >>= 64;
1184 r[2] = (std::uint64_t)c & M52;
1185 c >>= 52;
1186
1187 // -- Finalize columns 3 and 4 ------------------------------------
1188 c += (u128)(R52 << 12) * (std::uint64_t)d;
1189 c += t3;
1190 r[3] = (std::uint64_t)c & M52;
1191 c >>= 52;
1192 c += t4;
1193 r[4] = (std::uint64_t)c;
1194#endif // ARM64_FE52 / RISCV_FE52 / x64_ADX / generic (sqr)
1195}
1196
1197// ===========================================================================
1198// Weak Normalization (inline for half() hot path)
1199// ===========================================================================
1200
1201SECP256K1_FE52_FORCE_INLINE
1202void fe52_normalize_weak(std::uint64_t* r) noexcept {
1203 std::uint64_t t0 = r[0], t1 = r[1], t2 = r[2], t3 = r[3], t4 = r[4];
1204 // Pass 1: propagate carries bottom-to-top to get true t4 value.
1205 // Required because our negate convention (1*(m+1)*P, not 2*(m+1)*P)
1206 // allows lower-limb carries that propagate to t4.
1207 t1 += (t0 >> 52); t0 &= M52;
1208 t2 += (t1 >> 52); t1 &= M52;
1209 t3 += (t2 >> 52); t2 &= M52;
1210 t4 += (t3 >> 52); t3 &= M52;
1211 // Fold t4 overflow: x * 2^256 == x * R (mod p)
1212 std::uint64_t const x = t4 >> 48;
1213 t4 &= M48;
1214 t0 += x * 0x1000003D1ULL;
1215 // Pass 2: re-propagate carry from fold
1216 t1 += (t0 >> 52); t0 &= M52;
1217 t2 += (t1 >> 52); t1 &= M52;
1218 t3 += (t2 >> 52); t2 &= M52;
1219 t4 += (t3 >> 52); t3 &= M52;
1220 r[0] = t0; r[1] = t1; r[2] = t2; r[3] = t3; r[4] = t4;
1221}
1222
1223// ===========================================================================
1224// FieldElement52 Method Implementations (all force-inlined)
1225// ===========================================================================
1226
1227// -- Multiplication -------------------------------------------------------
1228
1229SECP256K1_FE52_FORCE_INLINE
1230FieldElement52 FieldElement52::operator*(const FieldElement52& rhs) const noexcept {
1231 FieldElement52 r;
1232 fe52_mul_inner(r.n, n, rhs.n);
1233 return r;
1234}
1235
1236SECP256K1_FE52_FORCE_INLINE
1237FieldElement52 FieldElement52::square() const noexcept {
1238 FieldElement52 r;
1239 fe52_sqr_inner(r.n, n);
1240 return r;
1241}
1242
SECP256K1_FE52_FORCE_INLINE
void FieldElement52::mul_assign(const FieldElement52& rhs) noexcept {
    // In-place field multiplication: *this = *this * rhs.
    // The 5x52 kernel supports aliased output, so n may be both input and output.
    fe52_mul_inner(n, n, rhs.n);
}
1247
SECP256K1_FE52_FORCE_INLINE
void FieldElement52::square_inplace() noexcept {
    // In-place field squaring: *this = (*this)^2, using the dedicated
    // squaring kernel with aliased input/output.
    fe52_sqr_inner(n, n);
}
1252
1253// -- Lazy Addition (NO carry propagation!) --------------------------------
1254
1255SECP256K1_FE52_FORCE_INLINE
1256FieldElement52 FieldElement52::operator+(const FieldElement52& rhs) const noexcept {
1257 FieldElement52 r;
1258 r.n[0] = n[0] + rhs.n[0];
1259 r.n[1] = n[1] + rhs.n[1];
1260 r.n[2] = n[2] + rhs.n[2];
1261 r.n[3] = n[3] + rhs.n[3];
1262 r.n[4] = n[4] + rhs.n[4];
1263 return r;
1264}
1265
1266SECP256K1_FE52_FORCE_INLINE
1267void FieldElement52::add_assign(const FieldElement52& rhs) noexcept {
1268 n[0] += rhs.n[0];
1269 n[1] += rhs.n[1];
1270 n[2] += rhs.n[2];
1271 n[3] += rhs.n[3];
1272 n[4] += rhs.n[4];
1273}
1274
1275// -- Negate: (M+1)*p - a -------------------------------------------------
1276
1277SECP256K1_FE52_FORCE_INLINE
1278FieldElement52 FieldElement52::negate(unsigned magnitude) const noexcept {
1279 using namespace fe52_constants;
1280 FieldElement52 r;
1281 const std::uint64_t m1 = static_cast<std::uint64_t>(magnitude) + 1ULL;
1282 r.n[0] = m1 * P0 - n[0];
1283 r.n[1] = m1 * P1 - n[1];
1284 r.n[2] = m1 * P2 - n[2];
1285 r.n[3] = m1 * P3 - n[3];
1286 r.n[4] = m1 * P4 - n[4];
1287 return r;
1288}
1289
1290SECP256K1_FE52_FORCE_INLINE
1291void FieldElement52::negate_assign(unsigned magnitude) noexcept {
1292 const std::uint64_t m1 = static_cast<std::uint64_t>(magnitude) + 1ULL;
1293 n[0] = m1 * P0 - n[0];
1294 n[1] = m1 * P1 - n[1];
1295 n[2] = m1 * P2 - n[2];
1296 n[3] = m1 * P3 - n[3];
1297 n[4] = m1 * P4 - n[4];
1298}
1299
1300// -- Branchless conditional negate (magnitude 1) --------------------------
1301// sign_mask: 0 = keep original, -1 (0xFFFFFFFF) = negate.
1302// Uses XOR-select to avoid branches on unpredictable sign bits.
1303SECP256K1_FE52_FORCE_INLINE
1304void FieldElement52::conditional_negate_assign(std::int32_t sign_mask) noexcept {
1305 const auto mask = static_cast<std::uint64_t>(static_cast<std::int64_t>(sign_mask));
1306 // Compute negated limbs (magnitude 1: 2*P - n)
1307 const std::uint64_t neg0 = 2ULL * P0 - n[0];
1308 const std::uint64_t neg1 = 2ULL * P1 - n[1];
1309 const std::uint64_t neg2 = 2ULL * P2 - n[2];
1310 const std::uint64_t neg3 = 2ULL * P3 - n[3];
1311 const std::uint64_t neg4 = 2ULL * P4 - n[4];
1312 // Branchless select: mask=0 → keep n[i]; mask=~0 → use neg[i]
1313 n[0] ^= (n[0] ^ neg0) & mask;
1314 n[1] ^= (n[1] ^ neg1) & mask;
1315 n[2] ^= (n[2] ^ neg2) & mask;
1316 n[3] ^= (n[3] ^ neg3) & mask;
1317 n[4] ^= (n[4] ^ neg4) & mask;
1318}
1319
1320// -- Weak Normalization (member) ------------------------------------------
1321
SECP256K1_FE52_FORCE_INLINE
void FieldElement52::normalize_weak() noexcept {
    // Member wrapper over the free-function weak normalization
    // (magnitude reduced to 1; result may still be >= p).
    fe52_normalize_weak(n);
}
1326
1327// -- Half (a/2 mod p) -- branchless ---------------------------------------
1328// libsecp-style: mask trick avoids carry propagation entirely.
1329// If odd, add p; then right-shift by 1. The mask is (-(t0 & 1)) >> 12
1330// which produces a 52-bit all-ones mask (0xFFFFFFFFFFFFF) when odd, 0 when even.
1331// Since P1=P2=P3 = M52 = 0xFFFFFFFFFFFFF, and the mask has exactly 52 set bits,
1332// adding mask to P1..P3 limbs can never exceed 2*M52 < 2^53 (fits in 64 bits).
1333// No carry propagation needed!
1334
1335SECP256K1_FE52_FORCE_INLINE
1336FieldElement52 FieldElement52::half() const noexcept {
1337 const std::uint64_t* src = n;
1338 const std::uint64_t one = 1ULL;
1339 const std::uint64_t mask = (0ULL - (src[0] & one)) >> 12; // 52-bit mask if odd
1340
1341 // Conditionally add p (limb-wise, no carry propagation needed)
1342 const std::uint64_t t0 = src[0] + (0xFFFFEFFFFFC2FULL & mask);
1343 const std::uint64_t t1 = src[1] + mask; // P1 = M52 = mask
1344 const std::uint64_t t2 = src[2] + mask; // P2 = M52 = mask
1345 const std::uint64_t t3 = src[3] + mask; // P3 = M52 = mask
1346 const std::uint64_t t4 = src[4] + (mask >> 4); // P4 = 48-bit
1347
1348 // Right shift by 1 (divide by 2)
1349 // MUST use + (not |): without carry propagation, t_i can exceed M52,
1350 // so bit 51 of (t_i >> 1) can be set, overlapping with (t_{i+1} & 1) << 51.
1351 // Addition correctly carries; OR would silently drop the carry.
1352 FieldElement52 r;
1353 r.n[0] = (t0 >> 1) + ((t1 & one) << 51);
1354 r.n[1] = (t1 >> 1) + ((t2 & one) << 51);
1355 r.n[2] = (t2 >> 1) + ((t3 & one) << 51);
1356 r.n[3] = (t3 >> 1) + ((t4 & one) << 51);
1357 r.n[4] = (t4 >> 1);
1358 return r;
1359}
1360
1361SECP256K1_FE52_FORCE_INLINE
1362void FieldElement52::half_assign() noexcept {
1363 const std::uint64_t one = 1ULL;
1364 const std::uint64_t mask = (0ULL - (n[0] & one)) >> 12;
1365
1366 const std::uint64_t t0 = n[0] + (0xFFFFEFFFFFC2FULL & mask);
1367 const std::uint64_t t1 = n[1] + mask;
1368 const std::uint64_t t2 = n[2] + mask;
1369 const std::uint64_t t3 = n[3] + mask;
1370 const std::uint64_t t4 = n[4] + (mask >> 4);
1371
1372 // MUST use + (not |): see half() comment above.
1373 n[0] = (t0 >> 1) + ((t1 & one) << 51);
1374 n[1] = (t1 >> 1) + ((t2 & one) << 51);
1375 n[2] = (t2 >> 1) + ((t3 & one) << 51);
1376 n[3] = (t3 >> 1) + ((t4 & one) << 51);
1377 n[4] = (t4 >> 1);
1378}
1379
1380// -- Multiply by small integer (no carry propagation) ---------------------
1381// Each limb is multiplied by a (scalar <= 32).
1382// Safe as long as magnitude * a * 2^52 < 2^64, i.e. magnitude * a < 4096.
1383
1384SECP256K1_FE52_FORCE_INLINE
1385void FieldElement52::mul_int_assign(std::uint32_t a) noexcept {
1386 n[0] *= a;
1387 n[1] *= a;
1388 n[2] *= a;
1389 n[3] *= a;
1390 n[4] *= a;
1391}
1392
1393// -- Full Normalization: canonical result in [0, p) ----------------------
1394// Fold-first approach (matches libsecp256k1): fold t4 overflow BEFORE carry
1395// propagation so only 2 carry chains are needed instead of 3.
1396
SECP256K1_FE52_FORCE_INLINE
static void fe52_normalize_inline(std::uint64_t* r) noexcept {
    // Fully normalize r in place to the unique canonical representative
    // in [0, p). Fold-first strategy (matches libsecp256k1): reducing the
    // t4 overflow BEFORE carry propagation means only 2 carry chains are
    // needed instead of 3.
    std::uint64_t t0 = r[0], t1 = r[1], t2 = r[2], t3 = r[3], t4 = r[4];

    // Reduce t4 overflow first (before carry propagation).
    // This ensures at most a single carry from the first pass.
    std::uint64_t m = 0;
    std::uint64_t x = t4 >> 48; t4 &= M48;

    // Single carry propagation pass with m accumulation for the >= p check:
    // m ends up equal to M52 only if t1, t2 AND t3 are all saturated.
    t0 += x * 0x1000003D1ULL;
    t1 += (t0 >> 52); t0 &= M52;
    t2 += (t1 >> 52); t1 &= M52; m = t1;
    t3 += (t2 >> 52); t2 &= M52; m &= t2;
    t4 += (t3 >> 52); t3 &= M52; m &= t3;

    // At most a single bit of overflow at bit 48 of t4 (bit 256 of value).
    // Check if result >= p:
    //   bit 48 of t4 set (value >= 2^256), OR
    //   all limbs at max (t1&t2&t3 == M52, t4 == M48, t0 >= p's low 52 bits)
    x = (t4 >> 48) | ((t4 == M48) & (m == M52)
        & (t0 >= 0xFFFFEFFFFFC2FULL));

    // Conditional final reduction. x is 0 or 1; the multiply-add plus carry
    // chain is always executed so the function stays constant-time.
    t0 += x * 0x1000003D1ULL;
    t1 += (t0 >> 52); t0 &= M52;
    t2 += (t1 >> 52); t1 &= M52;
    t3 += (t2 >> 52); t2 &= M52;
    t4 += (t3 >> 52); t3 &= M52;
    // Drop the bit-256 overflow bit if the reduction fired.
    t4 &= M48;

    r[0] = t0; r[1] = t1; r[2] = t2; r[3] = t3; r[4] = t4;
}
1430
1431// -- Inline Normalization Method -----------------------------------------
1432
SECP256K1_FE52_FORCE_INLINE
void FieldElement52::normalize() noexcept {
    // Full canonical normalization: result is the unique value in [0, p).
    fe52_normalize_inline(n);
}
1437
1438// -- Variable-time Zero Check (full normalize) ----------------------------
1439// Uses fe52_normalize_inline (fold-first carry + conditional p-subtraction)
1440// then checks canonical zero. The previous single-pass implementation
1441// could produce false negatives at magnitude >= 25 (e.g. h = u2 +
1442// negate(23) in mixed-add) because one pass can leave the value in
1443// [p, 2p) -- neither raw-0 nor raw-p.
1444//
1445// Variable-time: safe for non-secret values (point coordinates in ECC).
1446
1447SECP256K1_FE52_FORCE_INLINE
1448bool FieldElement52::normalizes_to_zero() const noexcept {
1449 std::uint64_t t[5] = {n[0], n[1], n[2], n[3], n[4]};
1450 fe52_normalize_inline(t);
1451 return (t[0] | t[1] | t[2] | t[3] | t[4]) == 0;
1452}
1453
1454// -- Variable-time Zero Check with Early Exit ------------------------------
1455// Performs a single normalize_weak pass (carry + overflow reduction + carry),
1456// then checks for raw-zero and p. Avoids the expensive conditional
1457// p-subtraction + branchless-select of fe52_normalize_inline.
1458//
1459// After one normalize_weak pass at any magnitude <= ~4000, the value is
1460// in [0, 2p). The only representations of 0 mod p in [0, 2p) are
1461// raw-zero (all limbs 0) and p itself.
1462//
1463// In the ecmult hot loop, h == 0 occurs with probability ~2^-256,
1464// so the fast non-zero path fires in essentially 100% of calls.
1465// This replaces the old normalize_weak() + normalizes_to_zero() pair
1466// in jac52_add_mixed*, saving ~40 limb ops per mixed add.
1467
SECP256K1_FE52_FORCE_INLINE
bool FieldElement52::normalizes_to_zero_var() const noexcept {
    // Variable-time test for "value == 0 (mod p)" with an early exit.
    // After one weak-normalize-style pass, the value lies in [0, 2p); the
    // only representations of 0 mod p in that range are raw zero and p
    // itself, so it suffices to test for those two patterns.
    using namespace fe52_constants;
    std::uint64_t t0 = n[0], t4 = n[4];

    // Reduce t4 overflow into t0 first (at most one carry fold).
    // This ensures the first full carry pass has at most one carry
    // propagation step from the injected overflow.
    const std::uint64_t x = t4 >> 48;
    t0 += x * 0x1000003D1ULL;

    // z0 tracks "could be raw zero", z1 tracks "could be p".
    // z1 starts as t0's low bits XOR'd with p's low-52 complement pattern
    // (0x1000003D0 = M52 - P0-low), so z1 == M52 iff t0 matches p's limb 0.
    std::uint64_t z0 = t0 & M52;
    std::uint64_t z1 = z0 ^ 0x1000003D0ULL;

    // Fast return: if limb 0 is clearly neither 0 nor p's limb 0, the value
    // is non-zero mod p -- exits without touching n[1..3]. In the ecmult hot
    // loop this path fires in essentially 100% of calls (h == 0 has
    // probability ~2^-256). Variable-time: only safe for non-secret values.
    if ((z0 != 0ULL) & (z1 != M52)) {
        return false;
    }

    // Slow path: full carry propagation for the remaining cases,
    // accumulating "all limbs zero" into z0 and "all limbs match p" into z1.
    std::uint64_t t1 = n[1], t2 = n[2], t3 = n[3];
    t4 &= M48;

    t1 += (t0 >> 52);
    t2 += (t1 >> 52); t1 &= M52; z0 |= t1; z1 &= t1;
    t3 += (t2 >> 52); t2 &= M52; z0 |= t2; z1 &= t2;
    t4 += (t3 >> 52); t3 &= M52; z0 |= t3; z1 &= t3;
    // Top limb of p is M48 = 0xFFFFFFFFFFFF; XOR with 0xF000000000000
    // maps that value to M52 so the z1 accumulation stays uniform.
    z0 |= t4; z1 &= t4 ^ 0xF000000000000ULL;

    return (z0 == 0) | (z1 == M52);
}
1502
1503// -- Conversion: 4x64 -> 5x52 (inline) -----------------------------------
1504
1505SECP256K1_FE52_FORCE_INLINE
1506FieldElement52 FieldElement52::from_fe(const FieldElement& fe) noexcept {
1507 const auto& L = fe.limbs();
1508 FieldElement52 r;
1509 r.n[0] = L[0] & M52;
1510 r.n[1] = (L[0] >> 52) | ((L[1] & 0xFFFFFFFFFFULL) << 12);
1511 r.n[2] = (L[1] >> 40) | ((L[2] & 0xFFFFFFFULL) << 24);
1512 r.n[3] = (L[2] >> 28) | ((L[3] & 0xFFFFULL) << 36);
1513 r.n[4] = L[3] >> 16;
1514 return r;
1515}
1516
1517// -- Conversion: 5x52 -> 4x64 (inline, includes full normalize) ----------
1518
1519SECP256K1_FE52_FORCE_INLINE
1520FieldElement FieldElement52::to_fe() const noexcept {
1521 FieldElement52 tmp = *this;
1522 fe52_normalize_inline(tmp.n);
1523
1525 L[0] = tmp.n[0] | (tmp.n[1] << 52);
1526 L[1] = (tmp.n[1] >> 12) | (tmp.n[2] << 40);
1527 L[2] = (tmp.n[2] >> 24) | (tmp.n[3] << 28);
1528 L[3] = (tmp.n[3] >> 36) | (tmp.n[4] << 16);
1529 return FieldElement::from_limbs_raw(L); // already canonical -- skip redundant normalize
1530}
1531
1532// Convenience serialization: FE52 -> bytes in one call
1533SECP256K1_FE52_FORCE_INLINE
1534void FieldElement52::to_bytes_into(std::uint8_t* out) const noexcept {
1535 // Direct 5x52 -> 32 big-endian bytes (skip intermediate 4x64 conversion).
1536 // Same approach as libsecp256k1's secp256k1_fe_impl_get_b32.
1537 FieldElement52 tmp = *this;
1538 fe52_normalize_inline(tmp.n);
1539
1540 out[ 0] = static_cast<std::uint8_t>(tmp.n[4] >> 40);
1541 out[ 1] = static_cast<std::uint8_t>(tmp.n[4] >> 32);
1542 out[ 2] = static_cast<std::uint8_t>(tmp.n[4] >> 24);
1543 out[ 3] = static_cast<std::uint8_t>(tmp.n[4] >> 16);
1544 out[ 4] = static_cast<std::uint8_t>(tmp.n[4] >> 8);
1545 out[ 5] = static_cast<std::uint8_t>(tmp.n[4] );
1546 out[ 6] = static_cast<std::uint8_t>(tmp.n[3] >> 44);
1547 out[ 7] = static_cast<std::uint8_t>(tmp.n[3] >> 36);
1548 out[ 8] = static_cast<std::uint8_t>(tmp.n[3] >> 28);
1549 out[ 9] = static_cast<std::uint8_t>(tmp.n[3] >> 20);
1550 out[10] = static_cast<std::uint8_t>(tmp.n[3] >> 12);
1551 out[11] = static_cast<std::uint8_t>(tmp.n[3] >> 4);
1552 out[12] = static_cast<std::uint8_t>(((tmp.n[2] >> 48) & 0xF) | ((tmp.n[3] & 0xF) << 4));
1553 out[13] = static_cast<std::uint8_t>(tmp.n[2] >> 40);
1554 out[14] = static_cast<std::uint8_t>(tmp.n[2] >> 32);
1555 out[15] = static_cast<std::uint8_t>(tmp.n[2] >> 24);
1556 out[16] = static_cast<std::uint8_t>(tmp.n[2] >> 16);
1557 out[17] = static_cast<std::uint8_t>(tmp.n[2] >> 8);
1558 out[18] = static_cast<std::uint8_t>(tmp.n[2] );
1559 out[19] = static_cast<std::uint8_t>(tmp.n[1] >> 44);
1560 out[20] = static_cast<std::uint8_t>(tmp.n[1] >> 36);
1561 out[21] = static_cast<std::uint8_t>(tmp.n[1] >> 28);
1562 out[22] = static_cast<std::uint8_t>(tmp.n[1] >> 20);
1563 out[23] = static_cast<std::uint8_t>(tmp.n[1] >> 12);
1564 out[24] = static_cast<std::uint8_t>(tmp.n[1] >> 4);
1565 out[25] = static_cast<std::uint8_t>(((tmp.n[0] >> 48) & 0xF) | ((tmp.n[1] & 0xF) << 4));
1566 out[26] = static_cast<std::uint8_t>(tmp.n[0] >> 40);
1567 out[27] = static_cast<std::uint8_t>(tmp.n[0] >> 32);
1568 out[28] = static_cast<std::uint8_t>(tmp.n[0] >> 24);
1569 out[29] = static_cast<std::uint8_t>(tmp.n[0] >> 16);
1570 out[30] = static_cast<std::uint8_t>(tmp.n[0] >> 8);
1571 out[31] = static_cast<std::uint8_t>(tmp.n[0] );
1572}
1573
1574// Fast serialize for pre-normalized limbs: 5x52 -> 32 big-endian bytes.
1575// Skips fe52_normalize_inline (saves ~10 ns per call on pre-normalized inputs).
1576// Uses bit-slicing to 4x64 intermediary + byte-swap stores.
SECP256K1_FE52_FORCE_INLINE
void FieldElement52::store_b32_prenorm(std::uint8_t* out) const noexcept {
    // Fast serialize for ALREADY-normalized limbs: 5x52 -> 32 big-endian
    // bytes without the fe52_normalize_inline pass. Caller must guarantee
    // limbs are canonical, otherwise the bit-slicing below loses high bits.
    // 5x52 -> 4x64 bit-slicing (same logic as to_fe, no normalize)
    std::uint64_t L0 = n[0] | (n[1] << 52);
    std::uint64_t L1 = (n[1] >> 12) | (n[2] << 40);
    std::uint64_t L2 = (n[2] >> 24) | (n[3] << 28);
    std::uint64_t L3 = (n[3] >> 36) | (n[4] << 16);

    // Big-endian store: 4 bswap + 4 unaligned writes (L3 = MSB)
#if defined(__GNUC__) || defined(__clang__)
    L3 = __builtin_bswap64(L3); L2 = __builtin_bswap64(L2);
    L1 = __builtin_bswap64(L1); L0 = __builtin_bswap64(L0);
#elif defined(_MSC_VER)
    L3 = _byteswap_uint64(L3); L2 = _byteswap_uint64(L2);
    L1 = _byteswap_uint64(L1); L0 = _byteswap_uint64(L0);
#else
    // Portable bswap fallback (pairwise swap of 8/16/32-bit groups)
    auto bswap64 = [](std::uint64_t v) -> std::uint64_t {
        v = ((v >> 8) & 0x00FF00FF00FF00FFULL) | ((v & 0x00FF00FF00FF00FFULL) << 8);
        v = ((v >> 16) & 0x0000FFFF0000FFFFULL) | ((v & 0x0000FFFF0000FFFFULL) << 16);
        return (v >> 32) | (v << 32);
    };
    L3 = bswap64(L3); L2 = bswap64(L2);
    L1 = bswap64(L1); L0 = bswap64(L0);
#endif
    // memcpy handles unaligned output safely (compilers lower to plain stores).
    // NOTE(review): std::memcpy needs <cstring>; only <cstdint> is visible in
    // this file's header section -- confirm <cstring> is included upstream.
    std::memcpy(out, &L3, 8);
    std::memcpy(out + 8, &L2, 8);
    std::memcpy(out + 16, &L1, 8);
    std::memcpy(out + 24, &L0, 8);
}
1607
1608// -- Direct 4x64 limbs -> 5x52 (no FieldElement construction) -------------
1609// Same bit-slicing as from_fe but takes raw uint64_t[4] pointer.
1610// Avoids FieldElement copy + normalization when caller knows value < p.
1611
1612SECP256K1_FE52_FORCE_INLINE
1613FieldElement52 FieldElement52::from_4x64_limbs(const std::uint64_t* L) noexcept {
1614 FieldElement52 r;
1615 r.n[0] = L[0] & M52;
1616 r.n[1] = (L[0] >> 52) | ((L[1] & 0xFFFFFFFFFFULL) << 12);
1617 r.n[2] = (L[1] >> 40) | ((L[2] & 0xFFFFFFFULL) << 24);
1618 r.n[3] = (L[2] >> 28) | ((L[3] & 0xFFFFULL) << 36);
1619 r.n[4] = L[3] >> 16;
1620 return r;
1621}
1622
1623// -- Direct bytes (big-endian) -> 5x52 conversion ------------------------
1624// Combines FieldElement::from_bytes + from_fe into a single step.
1625
1626SECP256K1_FE52_FORCE_INLINE
1627FieldElement52 FieldElement52::from_bytes(const std::uint8_t* bytes) noexcept {
1628 // Read 4 uint64_t limbs from big-endian bytes (same layout as FieldElement::from_bytes)
1629 std::uint64_t L[4];
1630 for (int i = 0; i < 4; ++i) {
1631 std::uint64_t limb = 0;
1632 for (int j = 0; j < 8; ++j) {
1633 limb = (limb << 8) | static_cast<std::uint64_t>(bytes[i * 8 + j]);
1634 }
1635 L[3 - i] = limb;
1636 }
1637 // Reduce mod p if value >= p.
1638 // p = {0xFFFFFFFEFFFFFC2F, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF}
1639 static constexpr std::uint64_t P[4] = {
1640 0xFFFFFFFEFFFFFC2FULL, 0xFFFFFFFFFFFFFFFFULL,
1641 0xFFFFFFFFFFFFFFFFULL, 0xFFFFFFFFFFFFFFFFULL
1642 };
1643 // ge(L, P): check L >= P lexicographically from high limb.
1644 // NOTE: Variable-time comparison -- acceptable because input bytes
1645 // are public data (from wire / serialized keys), not secret.
1646 bool ge_p = true;
1647 for (int i = 3; i >= 0; --i) {
1648 if (L[i] < P[i]) { ge_p = false; break; }
1649 if (L[i] > P[i]) { break; }
1650 }
1651 if (ge_p) {
1652 // L -= P (with borrow)
1653 unsigned __int128 acc = static_cast<unsigned __int128>(L[0]) + (~P[0]) + 1;
1654 L[0] = static_cast<std::uint64_t>(acc);
1655 acc = static_cast<unsigned __int128>(L[1]) + (~P[1]) + (acc >> 64);
1656 L[1] = static_cast<std::uint64_t>(acc);
1657 acc = static_cast<unsigned __int128>(L[2]) + (~P[2]) + (acc >> 64);
1658 L[2] = static_cast<std::uint64_t>(acc);
1659 L[3] = L[3] + (~P[3]) + static_cast<std::uint64_t>(acc >> 64);
1660 }
1661 return from_4x64_limbs(L);
1662}
1663
SECP256K1_FE52_FORCE_INLINE
FieldElement52 FieldElement52::from_bytes(const std::array<std::uint8_t, 32>& bytes) noexcept {
    // Convenience overload: forwards to the raw-pointer parser.
    return from_bytes(bytes.data());
}
1668
1669// -- Inverse via safegcd (4x64 round-trip, single wrapper) ---------------
1670// Replaces the common pattern: FieldElement52::from_fe(x.to_fe().inverse())
1671// Returns zero for zero input (consistent with noexcept contract + embedded).
1672
1673SECP256K1_FE52_FORCE_INLINE
1674FieldElement52 FieldElement52::inverse_safegcd() const noexcept {
1675 if (SECP256K1_UNLIKELY(normalizes_to_zero_var())) {
1676 return FieldElement52::zero();
1677 }
1678 // Direct 5x52 → signed62 → SafeGCD → signed62 → 5x52.
1679 // Bypasses the old FE52→to_fe()→inverse()→from_fe() chain
1680 // which had 4 intermediate format conversions (5x52↔4x64↔signed62).
1681 FieldElement52 tmp = *this;
1682 fe52_normalize_inline(tmp.n);
1683 FieldElement52 r;
1684 fe52_inverse_safegcd_var(tmp.n, r.n);
1685 return r;
1686}
1687
1688} // namespace secp256k1::fast
1689
1690#if defined(__GNUC__)
1691#pragma GCC diagnostic pop
1692#endif
1693#endif // __int128 guard
1694
1695#undef SECP256K1_FE52_FORCE_INLINE
1696
1697#endif // SECP256K1_FIELD_52_IMPL_HPP
std::array< std::uint64_t, 4 > limbs_type
Definition field.hpp:32
static FieldElement from_limbs_raw(const limbs_type &limbs) noexcept
Definition field.hpp:66
secp256k1::fast::FieldElement FieldElement
Definition field.hpp:33
constexpr std::uint32_t P0
Definition field_26.hpp:59
constexpr std::uint32_t P4
Definition field_26.hpp:63
constexpr std::uint32_t P2
Definition field_26.hpp:61
constexpr std::uint32_t P1
Definition field_26.hpp:60
constexpr std::uint32_t P3
Definition field_26.hpp:62
constexpr std::uint64_t M48
Definition field_52.hpp:41
constexpr std::uint64_t M52
Definition field_52.hpp:39
void add_assign(const FieldElement52 &rhs) noexcept
static FieldElement52 from_fe(const FieldElement &fe) noexcept
void store_b32_prenorm(std::uint8_t *out) const noexcept
void mul_assign(const FieldElement52 &rhs) noexcept
FieldElement52 inverse_safegcd() const noexcept
static FieldElement52 from_bytes(const std::array< std::uint8_t, 32 > &bytes) noexcept
FieldElement to_fe() const noexcept
void mul_int_assign(std::uint32_t a) noexcept
void negate_assign(unsigned magnitude) noexcept
bool normalizes_to_zero() const noexcept
static FieldElement52 from_4x64_limbs(const std::uint64_t *limbs) noexcept
static FieldElement52 zero() noexcept
void conditional_negate_assign(std::int32_t sign_mask) noexcept
FieldElement52 operator*(const FieldElement52 &rhs) const noexcept
FieldElement52 half() const noexcept
void to_bytes_into(std::uint8_t *out) const noexcept
bool normalizes_to_zero_var() const noexcept
FieldElement52 square() const noexcept
FieldElement52 negate(unsigned magnitude) const noexcept
static FieldElement52 one() noexcept
FieldElement52 operator+(const FieldElement52 &rhs) const noexcept