UltrafastSecp256k1 3.50.0
Ultra high-performance secp256k1 elliptic curve cryptography library
Loading...
Searching...
No Matches
field_asm.hpp
Go to the documentation of this file.
1// Optimized field operations using BMI2 intrinsics (MULX/ADCX/ADOX)
2
3#ifndef C237C0F0_BF55_4453_9221_DE161395FF08
4#define C237C0F0_BF55_4453_9221_DE161395FF08
5// Target: 7-8x speedup on field multiplication/squaring
6// For K constant + Q variable optimization
7
8
9#include "field.hpp"
10#if defined(__x86_64__) || defined(_M_X64)
11#include <immintrin.h> // BMI2 intrinsics
12#endif
13
14namespace secp256k1::fast {
15
16// Check if BMI2 is available at runtime
18// Check if ADX (ADCX/ADOX) is available at runtime
20
21// ===================================================================
22// BMI2-optimized field operations
23// ===================================================================
24
25// Multiply two field elements using BMI2 instructions
26// Expected: ~10-15 ns (vs current ~70-80 ns)
28
29// Square a field element using BMI2 instructions
30// Expected: ~8-10 ns (vs current ~50-60 ns)
32
33// ===================================================================
34// ARM64 (AArch64) assembly optimizations
35// ===================================================================
36
#if (defined(__aarch64__) || defined(_M_ARM64)) && !defined(SECP256K1_NO_ASM)
// Raw ARM64 assembly entry points (direct pointer interface for hot paths).
namespace arm64 {
    void field_mul_arm64(uint64_t out[4], const uint64_t a[4], const uint64_t b[4]) noexcept;
    void field_sqr_arm64(uint64_t out[4], const uint64_t a[4]) noexcept;
    void field_add_arm64(uint64_t out[4], const uint64_t a[4], const uint64_t b[4]) noexcept;
    void field_sub_arm64(uint64_t out[4], const uint64_t a[4], const uint64_t b[4]) noexcept;
    void field_neg_arm64(uint64_t out[4], const uint64_t a[4]) noexcept;
} // namespace arm64

// FieldElement-typed wrappers over the raw assembly routines.

// Field multiplication via ARM64 MUL/UMULH assembly
// (~50-80 cycles on Cortex-A76, ~80-120 on Cortex-A55).
FieldElement field_mul_arm64(const FieldElement& a, const FieldElement& b);

// Field squaring via the squaring-specialized kernel (10 muls vs 16).
FieldElement field_square_arm64(const FieldElement& a);

// Field add/sub with branchless normalization.
FieldElement field_add_arm64(const FieldElement& a, const FieldElement& b);
FieldElement field_sub_arm64(const FieldElement& a, const FieldElement& b);

// Field negation, computed branchlessly as p - a.
FieldElement field_negate_arm64(const FieldElement& a);

#endif // __aarch64__ || _M_ARM64
62
63// ===================================================================
64// RISC-V assembly optimizations (RV64GC)
65// ===================================================================
66
#ifdef SECP256K1_HAS_RISCV_ASM
// RISC-V (RV64GC) assembly-backed field operations.
// Expected roughly 2-3x faster than the portable C++ versions.

// Field multiplication using RISC-V assembly.
FieldElement field_mul_riscv(const FieldElement& a, const FieldElement& b);

// Field squaring using RISC-V assembly.
FieldElement field_square_riscv(const FieldElement& a);

// Field addition using RISC-V assembly.
FieldElement field_add_riscv(const FieldElement& a, const FieldElement& b);

// Field subtraction using RISC-V assembly.
FieldElement field_sub_riscv(const FieldElement& a, const FieldElement& b);

// Field negation using RISC-V assembly.
FieldElement field_negate_riscv(const FieldElement& a);
#endif // SECP256K1_HAS_RISCV_ASM
84
85// Square using Karatsuba algorithm (recursive decomposition)
86// ~9 multiplications vs 10 in standard, may be faster for some CPUs
88
89// Add two field elements (already fast, but optimize with ADCX)
90// Expected: ~5-8 ns (vs current ~15-20 ns)
92
93// Negate field element (conditional subtraction)
94// Expected: ~3-5 ns (vs current ~10-15 ns)
96
97// ===================================================================
98// Internal implementation details
99// ===================================================================
100
101namespace detail {
102
// 64x64 -> 128-bit multiplication.
// On x86-64 with BMI2 this maps to MULX (multiplicand in RDX, result in
// two registers, no flag updates); otherwise a compiler-supported wide
// multiply or a portable 32x32 decomposition is used.
//
// Outputs: lo = low 64 bits of a*b, hi = high 64 bits of a*b.
inline void mulx64(uint64_t a, uint64_t b, uint64_t& lo, uint64_t& hi) {
    #if defined(_MSC_VER)
    // MSVC intrinsic (unsigned __int64 == uint64_t on MSVC targets).
    lo = _mulx_u64(a, b, &hi);
    #elif (defined(__GNUC__) || defined(__clang__)) && defined(__BMI2__)
    // GCC/Clang intrinsic, valid only when compiling with BMI2 enabled.
    lo = _mulx_u64(a, b, reinterpret_cast<unsigned long long*>(&hi));
    #elif (defined(__GNUC__) || defined(__clang__)) && !defined(SECP256K1_NO_INT128)
    // 128-bit wide multiply (GCC/Clang extension, 64-bit targets).
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wpedantic"
    __uint128_t result = static_cast<__uint128_t>(a) * b;
    lo = static_cast<uint64_t>(result);
    hi = static_cast<uint64_t>(result >> 64);
    #pragma GCC diagnostic pop
    #else
    // Portable schoolbook 32x32 decomposition. Used for SECP256K1_NO_INT128
    // builds and for compilers without the __uint128_t extension (the old
    // final fallback used __uint128_t, which is GCC/Clang-only and would
    // not compile elsewhere).
    uint64_t a_lo = a & 0xFFFFFFFFULL;
    uint64_t a_hi = a >> 32;
    uint64_t b_lo = b & 0xFFFFFFFFULL;
    uint64_t b_hi = b >> 32;

    uint64_t p0 = a_lo * b_lo;  // contributes to bits 0..63
    uint64_t p1 = a_lo * b_hi;  // contributes to bits 32..95
    uint64_t p2 = a_hi * b_lo;  // contributes to bits 32..95
    uint64_t p3 = a_hi * b_hi;  // contributes to bits 64..127

    // Carry out of bit 63 when the three partials aligned at bit 32 are summed.
    uint64_t carry = ((p0 >> 32) + (p1 & 0xFFFFFFFFULL) + (p2 & 0xFFFFFFFFULL)) >> 32;

    lo = p0 + (p1 << 32) + (p2 << 32);
    hi = p3 + (p1 >> 32) + (p2 >> 32) + carry;
    #endif
}
158
// Add-with-carry primitive (ADCX-style carry chain).
// Computes a + b + carry; stores the low 64 bits in `result` and
// returns the carry-out (0 or 1).
#if defined(__x86_64__) || defined(_M_X64)
// x86-64: _addcarry_u64 is available on MSVC and GCC/Clang alike
// (the previous version had two byte-identical branches for them).
inline uint8_t adcx64(uint64_t a, uint64_t b, uint8_t carry, uint64_t& result) {
    return _addcarry_u64(carry, a, b, reinterpret_cast<unsigned long long*>(&result));
}
#else
// Portable fallback for non-x86 (RISC-V, ARM, ESP32, etc.)
inline uint8_t adcx64(uint64_t a, uint64_t b, uint8_t carry, uint64_t& result) {
    #ifdef SECP256K1_NO_INT128
    // 32-bit safe implementation: detect wraparound on each addition.
    result = a + b;
    uint8_t new_carry = (result < a) ? 1 : 0;
    if (carry) {
        uint64_t temp = result + 1;
        new_carry |= (temp < result) ? 1 : 0;
        result = temp;
    }
    return new_carry;
    #else
    #if defined(__GNUC__)
    #pragma GCC diagnostic push
    #pragma GCC diagnostic ignored "-Wpedantic"
    #endif
    unsigned __int128 sum = static_cast<unsigned __int128>(a) +
                            static_cast<unsigned __int128>(b) +
                            static_cast<unsigned __int128>(carry);
    // Bit 64 of the 128-bit sum is the carry-out.
    uint64_t carry_out = static_cast<uint64_t>(sum >> 64);
    #if defined(__GNUC__)
    #pragma GCC diagnostic pop
    #endif
    result = static_cast<uint64_t>(sum);
    return static_cast<uint8_t>(carry_out);
    #endif
}
#endif
200
201// Add with overflow using ADOX (overflow flag chain, independent of ADCX!)
202inline uint8_t adox64(uint64_t a, uint64_t b, uint8_t overflow, uint64_t& result) {
203 // Note: ADOX not directly available as intrinsic, use ADCX for now
204 // In inline assembly, we'd use separate flag chains
205 return adcx64(a, b, overflow, result);
206}
207
// Full 4x4 limb multiplication (256-bit x 256-bit -> 512-bit)
// Fully unrolled, using MULX/ADCX/ADOX in parallel
void mul_4x4_bmi2(
    const uint64_t a[4],
    const uint64_t b[4],
    uint64_t result[8]
);

// Full 4-limb squaring (optimized, fewer multiplications)
void square_4_bmi2(
    const uint64_t a[4],
    uint64_t result[8]
);

// Karatsuba squaring algorithm (recursive decomposition)
// ~9 multiplications vs 10 in standard approach
void square_4_karatsuba(
    const uint64_t a[4],
    uint64_t result[8]
);

// Montgomery reduction modulo p (secp256k1 prime)
// Input: 512-bit value in result[8]
// Output: 256-bit reduced value
void montgomery_reduce_bmi2(uint64_t result[8]);
233
234} // namespace detail
235
236} // namespace secp256k1::fast
237
238
239#endif /* C237C0F0_BF55_4453_9221_DE161395FF08 */
void square_4_karatsuba(const uint64_t a[4], uint64_t result[8])
void mul_4x4_bmi2(const uint64_t a[4], const uint64_t b[4], uint64_t result[8])
void montgomery_reduce_bmi2(uint64_t result[8])
uint8_t adcx64(uint64_t a, uint64_t b, uint8_t carry, uint64_t &result)
uint8_t adox64(uint64_t a, uint64_t b, uint8_t overflow, uint64_t &result)
void mulx64(uint64_t a, uint64_t b, uint64_t &lo, uint64_t &hi)
void square_4_bmi2(const uint64_t a[4], uint64_t result[8])
bool has_bmi2_support()
FieldElement field_negate_bmi2(const FieldElement &a)
FieldElement field_square_bmi2(const FieldElement &a)
FieldElement field_add_bmi2(const FieldElement &a, const FieldElement &b)
FieldElement field_square_karatsuba(const FieldElement &a)
FieldElement field_mul_bmi2(const FieldElement &a, const FieldElement &b)
bool has_adx_support()