UltrafastSecp256k1 3.50.0
Ultra high-performance secp256k1 elliptic curve cryptography library
Loading...
Searching...
No Matches
benchmark_harness.hpp
Go to the documentation of this file.
1// ============================================================================
2// benchmark_harness.hpp -- Production-Grade Benchmark Infrastructure
3// ============================================================================
4//
5// Standards:
6// * RDTSC cycle counter on x86/x64 (sub-ns precision)
7// * std::chrono::high_resolution_clock fallback (RISC-V, ARM, etc.)
8// * Configurable warm-up iterations
9// * Multi-pass measurement (default: 11)
10// * Median filtering
11// * IQR-based outlier removal
12// * Min/Avg/Median/StdDev tracking
13// * DoNotOptimize / ClobberMemory compiler barriers
14// * Optional thread pinning + priority elevation (Windows)
15//
16// Usage:
17// #include "secp256k1/benchmark_harness.hpp"
18//
19// bench::Harness h; // default: 500 warmup, 11 passes
20// double ns = h.run(1000, [&]() { ... }); // 1000 iters per pass
21// h.run_and_print("Field Mul", 100000, [&]() { fe_a * fe_b; });
22//
23// ============================================================================
24
25#pragma once
26
#include <algorithm>
#include <array>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

#if defined(_WIN32)
# define NOMINMAX
# include <windows.h>
#endif

#if defined(__linux__) && !defined(__ANDROID__)
# include <sched.h>  // cpu_set_t, CPU_ZERO, CPU_SET, sched_setaffinity
#endif
40
41namespace bench {
42
43// -- DoNotOptimize / ClobberMemory --------------------------------------------
44// Prevents the compiler from optimizing away benchmark payloads.
45
#if defined(__GNUC__) || defined(__clang__)

// GCC/Clang: an empty inline-asm statement that takes `value` as an input
// ("r,m" lets the compiler satisfy the operand from a register or from
// memory) forces the value to be materialized; the "memory" clobber stops
// the optimizer from reordering or dead-store-eliminating the payload
// around this point. No actual CPU instruction is emitted.
template <typename T>
inline __attribute__((always_inline)) void DoNotOptimize(T const& value) {
    asm volatile("" : : "r,m"(value) : "memory");
}

// Mutable overload: "+r,m" marks the operand as read AND written, so the
// compiler must assume the benchmark both consumed and produced `value`.
template <typename T>
inline __attribute__((always_inline)) void DoNotOptimize(T& value) {
    asm volatile("" : "+r,m"(value) : : "memory");
}

// Compiler-level memory barrier: forces all pending writes to be visible
// before and re-read after this point. Emits no CPU fence instruction.
inline __attribute__((always_inline)) void ClobberMemory() {
    asm volatile("" : : : "memory");
}

#elif defined(_MSC_VER)

// MSVC (no inline asm on x64): reading one byte of the object through a
// volatile pointer plus _ReadWriteBarrier() approximates the GCC trick.
template <typename T>
__forceinline void DoNotOptimize(T const& value) {
    // Use _ReadWriteBarrier + volatile to prevent optimization
    volatile auto sink = *reinterpret_cast<const volatile char*>(&value);
    (void)sink;
    _ReadWriteBarrier();
}

template <typename T>
__forceinline void DoNotOptimize(T& value) {
    volatile auto sink = *reinterpret_cast<volatile char*>(&value);
    (void)sink;
    _ReadWriteBarrier();
}

// Compiler-only barrier (the intrinsic is documented as deprecated by
// Microsoft but remains effective for benchmark fencing).
__forceinline void ClobberMemory() {
    _ReadWriteBarrier();
}

#else

// Unknown compiler: best-effort fallback. Taking the object's address
// through a volatile local usually keeps `value` alive, but there is no
// ordering guarantee -- verify results on exotic toolchains.
template <typename T>
inline void DoNotOptimize(T const& value) {
    volatile auto sink = reinterpret_cast<uintptr_t>(&value);
    (void)sink;
}

template <typename T>
inline void DoNotOptimize(T& value) {
    volatile auto sink = reinterpret_cast<uintptr_t>(&value);
    (void)sink;
}

// No-op: no portable way to express a compiler barrier here.
inline void ClobberMemory() {}

#endif
100
101// -- High-Resolution Timer ----------------------------------------------------
102
103#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
104# define BENCH_HAS_RDTSC 1
105#else
106# define BENCH_HAS_RDTSC 0
107#endif
108
109#if BENCH_HAS_RDTSC
110# if defined(_MSC_VER)
111# include <intrin.h>
112# endif
113#endif
114
struct Timer {
    // Returns a monotonic tick count.
    // On x86: RDTSC cycles (serialized with RDTSCP or CPUID+RDTSC).
    // Elsewhere: nanoseconds from chrono.
    static inline uint64_t now() noexcept {
#if BENCH_HAS_RDTSC
# if defined(_MSC_VER)
        // __rdtscp waits for prior instructions to retire before sampling
        // the TSC; `aux` receives IA32_TSC_AUX and is ignored here.
        unsigned int aux = 0;
        return __rdtscp(&aux);
# else
        uint32_t lo = 0; // NOLINT(misc-const-correctness) -- asm output operand
        uint32_t hi = 0; // NOLINT(misc-const-correctness) -- asm output operand
        // RDTSCP serializes instruction stream and reads TSC.
        // EDX:EAX receive the 64-bit counter; ECX is clobbered (TSC_AUX).
        asm volatile("rdtscp" : "=a"(lo), "=d"(hi) : : "%ecx");
        return (static_cast<uint64_t>(hi) << 32) | lo;
# endif
#else
        // Portable fallback: raw chrono tick count. ticks_to_ns() below
        // scales these by the clock's period, so the unit here is opaque.
        return static_cast<uint64_t>(
            std::chrono::high_resolution_clock::now().time_since_epoch().count());
#endif
    }

    // Ticks -> nanoseconds conversion.
    // On x86 with RDTSC: calibrates TSC frequency at first call (the
    // function-local static makes calibration run exactly once, with
    // thread-safe initialization; costs ~10ms that one time).
    // On non-x86: ticks are chrono ticks, scaled by the clock's period.
    static double ticks_to_ns(uint64_t ticks) noexcept {
#if BENCH_HAS_RDTSC
        static const double ns_per_tick = calibrate_tsc();
        return static_cast<double>(ticks) * ns_per_tick;
#else
        // chrono ticks -- convert to nanoseconds using the clock's period
        using Period = std::chrono::high_resolution_clock::period;
        constexpr double ratio = static_cast<double>(Period::num) / static_cast<double>(Period::den) * 1e9;
        return static_cast<double>(ticks) * ratio;
#endif
    }

    // Returns the timer type name for display
    static const char* timer_name() noexcept {
#if BENCH_HAS_RDTSC
        return "RDTSCP";
#else
        return "chrono::high_resolution_clock";
#endif
    }

private:
#if BENCH_HAS_RDTSC
    // Calibrate TSC: busy-wait ~10ms measured with chrono, then divide
    // elapsed nanoseconds by elapsed TSC ticks to obtain ns/tick.
    // NOTE(review): this assumes the TSC ticks at a constant rate over the
    // calibration window (invariant TSC) -- confirm on the target CPU.
    static double calibrate_tsc() noexcept {
        using Clock = std::chrono::high_resolution_clock;
        constexpr int CALIBRATE_MS = 10;

        // Warm up: execute the timer path once before measuring.
        volatile uint64_t const warm = now();
        (void)warm;

        auto chrono_start = Clock::now();
        uint64_t const tsc_start = now();

        // Spin for ~10ms (busy-wait rather than sleep, so the core stays
        // active and the clocks are sampled under load).
        auto target = chrono_start + std::chrono::milliseconds(CALIBRATE_MS);
        while (Clock::now() < target) {
            // busy-wait
        }

        uint64_t const tsc_end = now();
        auto chrono_end = Clock::now();

        double const ns_elapsed = static_cast<double>(
            std::chrono::duration_cast<std::chrono::nanoseconds>(chrono_end - chrono_start).count());
        auto const tsc_elapsed = static_cast<double>(tsc_end - tsc_start);

        // Guard: a zero tick delta would yield inf; 1.0 degrades to
        // treating ticks as raw nanoseconds instead.
        return (tsc_elapsed > 0.0) ? (ns_elapsed / tsc_elapsed) : 1.0;
    }
#endif
};
192
193// -- Statistics ---------------------------------------------------------------
194
// Aggregate result of one benchmark run. All *_ns values are nanoseconds
// per iteration, computed over the samples that survived outlier removal.
struct Stats {
    double min_ns = 0.0;     // fastest surviving sample
    double max_ns = 0.0;     // slowest surviving sample
    double median_ns = 0.0;  // upper median (sorted element at index n/2)
    double mean_ns = 0.0;    // arithmetic mean
    double stddev_ns = 0.0;  // population standard deviation
    int samples = 0; // after outlier removal
    int outliers = 0; // removed samples
};

// Compute stats with IQR (interquartile range) outlier removal.
// NOTE: sorts `data` in place -- callers pass a scratch vector.
// Samples outside the Tukey fences [Q1 - 1.5*IQR, Q3 + 1.5*IQR] are
// dropped; with fewer than 4 samples quartiles are meaningless, so all
// samples are kept. Fix: stddev_ns is now also computed for the small-n
// (< 4 samples) path, where it was previously left at 0.
inline Stats compute_stats(std::vector<double>& data) {
    Stats s{};
    if (data.empty()) return s;

    std::sort(data.begin(), data.end());
    const std::size_t n = data.size();

    // Select the surviving samples.
    std::vector<double> kept;
    if (n < 4) {
        // Too few samples for IQR -- use all of them.
        kept = data;
    } else {
        double const q1 = data[n / 4];
        double const q3 = data[(3 * n) / 4];
        double const iqr = q3 - q1;
        double const lower = q1 - 1.5 * iqr;
        double const upper = q3 + 1.5 * iqr;

        kept.reserve(n);
        for (double v : data) {
            if (v >= lower && v <= upper) {
                kept.push_back(v);
            }
        }
        if (kept.empty()) {
            // Degenerate case (everything outside the fences) -- fall back
            // to the raw samples rather than returning all-zero stats.
            kept = data;
        }
    }

    // One uniform computation path for every sample count.
    const std::size_t fn = kept.size();
    s.samples = static_cast<int>(fn);
    s.outliers = static_cast<int>(n - fn);
    s.min_ns = kept.front();
    s.max_ns = kept.back();
    s.median_ns = kept[fn / 2];  // upper median when fn is even

    double const sum = std::accumulate(kept.begin(), kept.end(), 0.0);
    s.mean_ns = sum / static_cast<double>(fn);

    double var = 0.0;
    for (double v : kept) {
        double const d = v - s.mean_ns;
        var += d * d;
    }
    s.stddev_ns = std::sqrt(var / static_cast<double>(fn));

    return s;
}
267
268// -- Platform Setup -----------------------------------------------------------
269
// Pin the calling thread to one core and (on Windows) raise scheduling
// priority, so timing passes are not perturbed by core migration or
// preemption. Best-effort: return codes are ignored -- on failure the
// benchmarks still run, just with more noise. No-op on other platforms.
// NOTE(review): the declaration line was missing from the extracted
// source; reconstructed to match the file's own symbol index.
inline void pin_thread_and_elevate() {
#if defined(_WIN32)
    SetThreadAffinityMask(GetCurrentThread(), 1ULL);
    SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
    SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
#elif defined(__linux__) && !defined(__ANDROID__)
    // Pin to CPU 0 on Linux (cpu_set_t et al. come from <sched.h>)
    cpu_set_t cpuset;
    CPU_ZERO(&cpuset);
    CPU_SET(0, &cpuset);
    sched_setaffinity(0, sizeof(cpuset), &cpuset);
#endif
}
283
284// -- Main Harness -------------------------------------------------------------
285
286class Harness {
287public:
288 int warmup_iters = 500; // warmup iterations per function call
289 std::size_t passes = 11; // measurement passes (odd number for clean median)
290
291 Harness() = default;
292 Harness(int warmup, std::size_t p) : warmup_iters(warmup), passes(p) {}
293
294 // Run benchmark: returns median ns/iter after IQR outlier removal.
295 // Func is called `iters` times per pass.
296 template <typename Func>
297 double run(int iters, Func&& func) const {
298 return run_stats(iters, std::forward<Func>(func)).median_ns;
299 }
300
301 // Run benchmark and return full statistics.
302 template <typename Func>
303 Stats run_stats(int iters, Func&& func) const {
304 // Warmup
305 for (int i = 0; i < warmup_iters; ++i) {
306 func();
308 }
309
310 // Measurement passes
311 std::vector<double> ns_per_iter;
312 ns_per_iter.reserve(passes);
313
314 for (std::size_t p = 0; p < passes; ++p) {
315 uint64_t const t0 = Timer::now();
316 for (int i = 0; i < iters; ++i) {
317 func();
318 }
320 uint64_t const t1 = Timer::now();
321
322 double const total_ns = Timer::ticks_to_ns(t1 - t0);
323 ns_per_iter.push_back(total_ns / iters);
324 }
325
326 return compute_stats(ns_per_iter);
327 }
328
329 // Convenience: run + print one-liner result
330 template <typename Func>
331 double run_and_print(const char* name, int iters, Func&& func) const {
332 Stats st = run_stats(iters, func);
333 (void)std::printf(" %-28s %9.2f ns (min=%6.2f median=%6.2f stddev=%5.2f n=%d-%d)\n",
334 name, st.median_ns, st.min_ns, st.median_ns, st.stddev_ns,
335 st.samples, st.outliers);
336 return st.median_ns;
337 }
338
339 // Print harness configuration info
340 void print_config() const {
341 (void)std::printf(" Timer: %s\n", Timer::timer_name());
342 (void)std::printf(" Warmup: %d iterations\n", warmup_iters);
343 (void)std::printf(" Passes: %zu (IQR outlier removal + median)\n", passes);
344 }
345};
346
347// -- Formatting helpers -------------------------------------------------------
348
// Render a nanosecond value into `buf` with an auto-selected unit
// (ns below 1e3, us below 1e6, ms otherwise), two decimal places.
// Returns `buf` so the call can be used inline in printf arguments.
inline const char* format_ns(double ns, char* buf, std::size_t buflen) {
    const char* unit;
    double scaled;
    if (ns < 1000.0) {
        unit = "ns";
        scaled = ns;
    } else if (ns < 1000000.0) {
        unit = "us";
        scaled = ns / 1000.0;
    } else {
        unit = "ms";
        scaled = ns / 1000000.0;
    }
    (void)std::snprintf(buf, buflen, "%.2f %s", scaled, unit);
    return buf;
}
359
360// Overload that uses a static buffer (not thread-safe, fine for sequential bench output)
361inline const char* format_ns(double ns) {
362 static char buf[64];
363 return format_ns(ns, buf, sizeof(buf));
364}
365
366} // namespace bench
Stats run_stats(int iters, Func &&func) const
Harness()=default
double run(int iters, Func &&func) const
void print_config() const
Harness(int warmup, std::size_t p)
double run_and_print(const char *name, int iters, Func &&func) const
void ClobberMemory()
void DoNotOptimize(T const &value)
void pin_thread_and_elevate()
const char * format_ns(double ns, char *buf, std::size_t buflen)
Stats compute_stats(std::vector< double > &data)
static const char * timer_name() noexcept
static double ticks_to_ns(uint64_t ticks) noexcept
static uint64_t now() noexcept