UltrafastSecp256k1 3.50.0
Ultra high-performance secp256k1 elliptic curve cryptography library
Loading...
Searching...
No Matches
precompute.hpp
Go to the documentation of this file.
1#ifndef EE840AA6_AA0C_4E9D_B58A_701AC4A267D0
2#define EE840AA6_AA0C_4E9D_B58A_701AC4A267D0
3
4#include <cstddef>
5#include <cstdint>
6#include <string>
7#include <vector>
8
9#include "secp256k1/point.hpp"
10#include "secp256k1/scalar.hpp"
11
12namespace secp256k1::fast {
13
14
15// No-alloc variant: write wNAF digits into caller-provided buffer.
16// - Digits are written LSB-first at indices [0 .. out_len-1]
17// - Valid window_bits range: [2, 16] (practically we use 4..7)
18// - Throws std::runtime_error if output buffer is too small
19void compute_wnaf_into(const Scalar& scalar,
20 unsigned window_bits,
21 int32_t* out,
22 std::size_t max,
23 std::size_t& out_len);
24// Progress callback: (current_points, total_points, window_index, total_windows)
25using ProgressCallback = void(*)(size_t, size_t, unsigned, unsigned);
26
28 unsigned window_bits = 18U; // Default to 18-bit (511 MB cache, OPTIMAL: 11.755us)
29 bool enable_glv = false; // GLV disabled by default (decomposition overhead ~14us makes it 2x slower!)
30 bool use_jsf = false; // Use JSF-based Shamir for GLV (off by default)
31 bool adaptive_glv = false; // If true, automatically disable GLV for small window sizes
32 unsigned glv_min_window_bits = 14U; // Minimum window_bits required to keep GLV enabled when adaptive_glv=true
33 bool use_comb = false; // Use comb method instead of windowed (smaller cache, L1-friendly)
34 unsigned comb_width = 6U; // Comb width in bits (5 or 6 recommended for L1/L2 cache)
35 unsigned thread_count = 0U; // 0 = auto-detect
36
37 // Cache configuration
38 bool use_cache = true; // Enable cache system
39 std::string cache_path{}; // Empty = auto-detect cache path from cache_dir
40 std::string cache_dir = ""; // Default cache directory with all precomputed tables
41 unsigned max_windows_to_load = 0U; // Load all windows for optimal performance
42
43 // Progress reporting
44 ProgressCallback progress_callback = nullptr; // Optional progress reporting
45
46 // Auto-tune configuration (file-based, no environment variables)
47 bool autotune = false; // If true, run auto-tuning once on startup then rewrite config with best settings
48 unsigned autotune_iters = 2000; // Iterations per candidate during tuning
49 unsigned autotune_min_w = 2; // Minimum window_bits to consider
50 unsigned autotune_max_w = 20; // Maximum window_bits to consider (avoid huge caches)
51 std::string autotune_log_path; // Optional: write human-readable tuning report (e.g. "autotune.log")
52
53 // Application-level integration (optional keys from config.ini)
54 // These are not used by core precomputation logic but allow apps to pull
55 // database path and external Bloom filter path from the same unified file.
56 std::string db_path; // e.g. R:\\SmallDB
57 std::string bloom_filter_path; // e.g. R:\\SmallDB\\bloom_full.blmf or "none"
58 std::string fulldb_path; // Secondary full key/value DB (used only when collision occurs)
59};
60
64 bool neg1{false};
65 bool neg2{false};
66};
67
71
72// ----------------------------------------------------------------------------
73// Library-level helpers to reduce per-app boilerplate
74// ----------------------------------------------------------------------------
75// Load FixedBase settings from a simple INI-style file and configure globally.
76// Supported keys (case-insensitive):
77// cache_dir, cache_path, window_bits, enable_glv, use_jsf, use_cache,
78// max_windows, thread_count, use_comb, comb_width
79// Lines starting with '#' or ';' are comments. Empty lines ignored.
80// Example:
81// cache_dir=F:\\EccTables
82// window_bits=18
83// enable_glv=true
84// use_jsf=true
85bool load_fixed_base_config_file(const std::string& path, FixedBaseConfig& out);
86
87// Parse file and immediately apply via configure_fixed_base().
88bool configure_fixed_base_from_file(const std::string& path);
89
90// If env var SECP256K1_CONFIG is set, load that file and configure.
91// Returns true if loaded and applied, false if env not set or load failed.
93
94// Write a default INI with sensible values if file does not exist.
95// Returns true if the file was created, false if already existed or on error.
96bool write_default_fixed_base_config(const std::string& path);
97
98// Ensure config file exists; if missing, create with defaults.
99// Returns true if the file exists after this call (created or pre-existing).
100bool ensure_fixed_base_config_file(const std::string& path);
101
102// Auto: prefer env (SECP256K1_CONFIG), otherwise use "config.ini" in CWD.
103// Creates a default file if missing, then loads and applies it.
105
106// ----------------------------------------------------------------------------
107// Auto-tuning helpers
108// ----------------------------------------------------------------------------
109// Run a quick benchmark sweep across available fixed-base cache windows
110// in the configured cache directory and pick the fastest configuration
111// for this machine. It tries combinations of:
112// - window_bits: discovered from files like cache_w{bits}[ _glv].bin in cache_dir
113// - enable_glv: true if GLV cache exists for that window, false if non-GLV exists
114// - use_jsf: both true/false when GLV is enabled (JSF vs Shamir)
115// It measures average time of scalar_mul_generator across 'iterations'
116// and returns the best FixedBaseConfig in 'best_out'. 'report_out' (optional)
117// receives a human-readable summary of tried candidates.
118// Returns true on success, false if no candidates were found.
120 std::string* report_out = nullptr,
121 unsigned iterations = 5000,
122 unsigned min_w = 2,
123 unsigned max_w = 30);
124
125// Write a FixedBaseConfig to an INI file (overwrites existing).
126// Returns true if write succeeded.
127bool write_fixed_base_config(const std::string& path, const FixedBaseConfig& cfg);
128
129// Convenience: run auto-tune and immediately write the resulting configuration
130// to 'path' (e.g., "config.ini"). Returns true if tuning and write succeed.
131bool auto_tune_and_write_config(const std::string& path,
132 unsigned iterations = 5000,
133 unsigned min_w = 2,
134 unsigned max_w = 30);
135
138
139// GLV with pre-decomposed constants (for benchmarking when decomposition is done once)
140Point scalar_mul_generator_glv_predecomposed(const Scalar& k1, const Scalar& k2, bool neg1, bool neg2);
141
142// wNAF (width-w Non-Adjacent Form) computation
143// Returns wNAF representation with non-adjacent digits
144// window_bits: typically 4-6 for optimal performance
145std::vector<int32_t> compute_wnaf(const Scalar& scalar, unsigned window_bits);
146
147// Optimized scalar multiplication for arbitrary points (non-generator)
148// Uses fixed-window method with precomputed multiples [Q, 3Q, 5Q, ..., (2^w-1)Q]
149// window_bits: typically 4-6 for optimal trade-off (4=16 points, 5=32 points, 6=64 points)
150// For variable-base multiplication (e.g., Q*k where Q changes frequently)
151Point scalar_mul_arbitrary(const Point& base, const Scalar& scalar, unsigned window_bits = 5);
152
153// Multi-scalar multiplication using Shamir's trick (Straus/Shamir algorithm)
154// Computes k1*P + k2*Q efficiently by sharing point doublings
155// Expected: 60-70% faster than two separate scalar_mul_arbitrary() calls
156// Primary use case: ECDSA signature verification (s^-1*u*G + s^-1*r*PublicKey)
157// window_bits: typically 4-5 for joint table (4=16 combinations, 5=64 combinations)
158Point multi_scalar_mul(const Scalar& k1, const Point& P,
159 const Scalar& k2, const Point& Q,
160 unsigned window_bits = 4);
161
162// ============================================================================
163// Optimized API for CONSTANT SCALAR (K fixed, Q variable)
164// ============================================================================
165// Use case: Point navigation with fixed jump distances
166// - K is constant (reused many times)
167// - Q changes every operation
168//
169// Performance: 336 us -> 190-260 us (1.3-1.8x speedup)
170//
171// Usage:
172// 1) ONCE (offline): PrecomputedScalar precomp = precompute_scalar_for_arbitrary(K);
173// 2) MANY TIMES: Point result = scalar_mul_arbitrary_precomputed(Q, precomp);
174
176 // GLV decomposition: K -> (k_1, k_2) where K == k_1 + lambda*k_2 (mod n)
179 bool neg1; // k_1 sign
180 bool neg2; // k_2 sign
181
182 // Precomputed wNAF digits for k_1 and k_2
183 std::vector<int32_t> wnaf1; // wNAF(k_1)
184 std::vector<int32_t> wnaf2; // wNAF(k_2)
185
186 unsigned window_bits; // Window size used (typically 5)
187
188 // Verify this was properly initialized
189 bool is_valid() const { return window_bits >= 4 && window_bits <= 7; }
190};
191
192// Optimized precomputed scalar - ZERO runtime calculations + RLE compression!
193// Uses Run-Length Encoding to skip consecutive zeros (doubles without adds)
195 // Each step: N doubles followed by 0-2 additions
196 struct Step {
197 uint16_t num_doubles; // How many consecutive doubles before additions
198
199 // Addition info for k_1 (Q table)
200 uint8_t idx1; // Point index [0..table_size-1], 0xFF = skip
201 bool neg1; // Negate Y coordinate?
202
203 // Addition info for k_2 (psi(Q) table)
204 uint8_t idx2; // Point index [0..table_size-1], 0xFF = skip
205 bool neg2; // Negate Y coordinate?
206
207 Step() : num_doubles(0), idx1(0xFF), neg1(false), idx2(0xFF), neg2(false) {}
208 };
209
211 bool neg1, neg2;
212 std::vector<Step> steps; // ~100-130 steps (instead of 256 iterations!)
213 unsigned window_bits;
214
215 bool is_valid() const { return window_bits >= 4 && window_bits <= 7 && !steps.empty(); }
216};
217
218// Precompute constant scalar K for repeated use with different points Q
219// This performs GLV decomposition and wNAF generation ONCE
220// Cost: ~14 us one-time overhead, amortized over many operations
221// window_bits: typically 4-5 for optimal performance (4 recommended for speed)
222PrecomputedScalar precompute_scalar_for_arbitrary(const Scalar& K, unsigned window_bits = 4);
223
224// OPTIMIZED: Precompute with ALL calculations done once
225// Eliminates ALL runtime conditionals and arithmetic in main loop!
226// Additional cost: ~2 us one-time, but removes ~30% overhead from main loop
227// Use this for MAXIMUM performance when K is truly constant
229
230// Scalar multiplication with precomputed constant K
231// Cost: NO decomposition overhead, only psi(Q) + 2D loop
232// Expected: 190-260 us (1.3-1.8x faster than scalar_mul_arbitrary)
234
235// OPTIMIZED: Uses PrecomputedScalarOptimized for zero-overhead main loop
236// Expected: 120-160 us with w=4 (2-3x faster than standard precomputed)
237// Target with assembly field ops: 18-24 us (12x faster!)
239
240// NO-TABLE MODE: For single-use Q (eliminates table building overhead!)
241// Uses direct +/-Q and +/-psi(Q) without precomputed tables
242// Expected: ~199 us (saves 12 us from table building)
243// Use when Q is used only once with constant K
245
246// Cache management
247bool save_precompute_cache(const std::string& path);
248bool load_precompute_cache(const std::string& path, unsigned max_windows = 0);
249
250} // namespace secp256k1::fast
251
252#endif /* EE840AA6_AA0C_4E9D_B58A_701AC4A267D0 */
bool load_precompute_cache(const std::string &path, unsigned max_windows=0)
Point scalar_mul_arbitrary_precomputed_optimized(const Point &Q, const PrecomputedScalarOptimized &precomp)
bool write_fixed_base_config(const std::string &path, const FixedBaseConfig &cfg)
std::vector< int32_t > compute_wnaf(const Scalar &scalar, unsigned window_bits)
bool configure_fixed_base_auto()
void ensure_fixed_base_ready()
void compute_wnaf_into(const Scalar &scalar, unsigned window_bits, int32_t *out, std::size_t max, std::size_t &out_len)
Point scalar_mul_arbitrary_precomputed_notable(const Point &Q, const PrecomputedScalarOptimized &precomp)
void(*)(size_t, size_t, unsigned, unsigned) ProgressCallback
Point scalar_mul_generator_glv_predecomposed(const Scalar &k1, const Scalar &k2, bool neg1, bool neg2)
bool fixed_base_ready()
Point scalar_mul_arbitrary_precomputed(const Point &Q, const PrecomputedScalar &precomp)
bool configure_fixed_base_from_file(const std::string &path)
void configure_fixed_base(const FixedBaseConfig &config)
bool load_fixed_base_config_file(const std::string &path, FixedBaseConfig &out)
bool ensure_fixed_base_config_file(const std::string &path)
bool auto_tune_and_write_config(const std::string &path, unsigned iterations=5000, unsigned min_w=2, unsigned max_w=30)
Point scalar_mul_generator(const Scalar &scalar)
PrecomputedScalarOptimized precompute_scalar_optimized(const Scalar &K, unsigned window_bits=4)
bool configure_fixed_base_from_env()
PrecomputedScalar precompute_scalar_for_arbitrary(const Scalar &K, unsigned window_bits=4)
bool auto_tune_fixed_base(FixedBaseConfig &best_out, std::string *report_out=nullptr, unsigned iterations=5000, unsigned min_w=2, unsigned max_w=30)
ScalarDecomposition split_scalar_glv(const Scalar &scalar)
bool write_default_fixed_base_config(const std::string &path)
Point multi_scalar_mul(const Scalar &k1, const Point &P, const Scalar &k2, const Point &Q, unsigned window_bits=4)
bool save_precompute_cache(const std::string &path)
Point scalar_mul_arbitrary(const Point &base, const Scalar &scalar, unsigned window_bits=5)