46#if defined(__GNUC__) || defined(__clang__)
// GCC/Clang variant for read-only values. An empty `asm volatile` statement
// takes `value` as an input operand (register or memory, "r,m") and declares a
// "memory" clobber, so the compiler has to treat `value` as used at this point.
// NOTE(review): the template header and closing brace are not visible in this
// excerpt.
49inline __attribute__((always_inline))
void DoNotOptimize(T
const& value) {
50 asm volatile(
"" : :
"r,m"(value) :
"memory");
// GCC/Clang variant for mutable values. `value` is passed as a read-write
// operand ("+r,m"), so the compiler must assume the (empty) asm may have
// changed it, in addition to the "memory" clobber.
54inline __attribute__((always_inline))
void DoNotOptimize(T& value) {
55 asm volatile(
"" :
"+r,m"(value) : :
"memory");
// Stand-alone empty asm with no operands and only a "memory" clobber: it tells
// the compiler that memory may have been read or written here.
// NOTE(review): the signature of the function enclosing this statement is not
// visible in this excerpt.
59 asm volatile(
"" : : :
"memory");
62#elif defined(_MSC_VER)
// MSVC branch: no inline asm is used here; each fragment below writes to a
// `volatile` local named `sink` instead.
// NOTE(review): the enclosing function signatures are not visible in this
// excerpt.

// Reads the first byte of `value` through a const volatile char pointer.
67 volatile auto sink = *
reinterpret_cast<const volatile char*
>(&value);
// Same single-byte read, through a non-const volatile char pointer.
74 volatile auto sink = *
reinterpret_cast<volatile char*
>(&value);
// Stores the address of `value` (as an integer) rather than its contents.
87 volatile auto sink =
reinterpret_cast<uintptr_t
>(&value);
// Same address-as-integer store as above.
93 volatile auto sink =
reinterpret_cast<uintptr_t
>(&value);
// BENCH_HAS_RDTSC is defined to 1 when any of the x86 / x86-64 target macros
// tested below is defined, and to 0 in the other branch.
// NOTE(review): the `#else` / `#endif` lines separating the two definitions are
// not visible in this excerpt.
103#if (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86))
104# define BENCH_HAS_RDTSC 1
106# define BENCH_HAS_RDTSC 0
110# if defined(_MSC_VER)
// Returns the current raw timer reading as a 64-bit count. Three variants are
// visible below; the preprocessor lines selecting between the second and third
// are not shown in this excerpt.
119 static inline uint64_t
now() noexcept {
121# if defined(_MSC_VER)
// MSVC: use the __rdtscp intrinsic. `aux` only exists to satisfy the
// intrinsic's out-parameter; its value is not used afterwards.
122 unsigned int aux = 0;
123 return __rdtscp(&aux);
// Inline-asm variant: execute RDTSCP, taking the low half from EAX (`lo`) and
// the high half from EDX (`hi`). ECX is listed as clobbered.
128 asm volatile(
"rdtscp" :
"=a"(lo),
"=d"(hi) : :
"%ecx");
// Combine the two halves into a single 64-bit tick count.
129 return (
static_cast<uint64_t
>(hi) << 32) | lo;
// Fallback: the tick count of std::chrono::high_resolution_clock since its
// epoch, expressed in that clock's own period (not necessarily nanoseconds).
132 return static_cast<uint64_t
>(
133 std::chrono::high_resolution_clock::now().time_since_epoch().count());
// Calibrated path: the ns-per-tick factor comes from calibrate_tsc() and is
// held in a `static`, so the calibration result is reused on later calls.
// NOTE(review): presumably this is the body of ticks_to_ns(); the signature is
// not visible in this excerpt.
142 static const double ns_per_tick = calibrate_tsc();
143 return static_cast<double>(ticks) * ns_per_tick;
// Chrono path: convert using the clock's compile-time period. num/den is
// seconds per tick, so multiplying by 1e9 yields nanoseconds per tick.
146 using Period = std::chrono::high_resolution_clock::period;
147 constexpr double ratio =
static_cast<double>(Period::num) /
static_cast<double>(Period::den) * 1e9;
148 return static_cast<double>(ticks) * ratio;
157 return "chrono::high_resolution_clock";
// Estimates how many nanoseconds one now() tick represents by measuring the
// same interval with both now() and std::chrono::high_resolution_clock.
// Returns ns_elapsed / tsc_elapsed, or 1.0 if no ticks elapsed.
// NOTE(review): high_resolution_clock is not guaranteed to be steady by the
// standard; if it aliases the system clock, a wall-clock adjustment during the
// window would skew the result. Consider std::chrono::steady_clock -- confirm.
164 static double calibrate_tsc() noexcept {
165 using Clock = std::chrono::high_resolution_clock;
// Length of the calibration window in milliseconds.
166 constexpr int CALIBRATE_MS = 10;
// One reading taken before the measured window; stored in a volatile and not
// referenced by any other line visible in this excerpt.
169 volatile uint64_t
const warm =
now();
// Start of the window: chrono timestamp first, tick counter second.
172 auto chrono_start = Clock::now();
173 uint64_t
const tsc_start =
now();
// Poll the chrono clock until CALIBRATE_MS has elapsed (the loop body is not
// visible in this excerpt).
176 auto target = chrono_start + std::chrono::milliseconds(CALIBRATE_MS);
177 while (Clock::now() < target) {
// End of the window: tick counter first, chrono timestamp second (the reverse
// of the order used at the start).
181 uint64_t
const tsc_end =
now();
182 auto chrono_end = Clock::now();
// Elapsed wall time in nanoseconds and elapsed ticks, both as double.
184 double const ns_elapsed =
static_cast<double>(
185 std::chrono::duration_cast<std::chrono::nanoseconds>(chrono_end - chrono_start).count());
186 auto const tsc_elapsed =
static_cast<double>(tsc_end - tsc_start);
// Guard against dividing by zero when the tick counter did not advance.
188 return (tsc_elapsed > 0.0) ? (ns_elapsed / tsc_elapsed) : 1.0;
// Summary statistics over `data`, in two stages: raw statistics first, then
// the same statistics recomputed after dropping IQR outliers.
// NOTE(review): presumably the body of compute_stats(); the signature and the
// declarations of `s`, `sum` and `var` are not visible in this excerpt.

// Nothing to summarize: return `s` as it stands.
208 if (data.empty())
return s;
// Sorts `data` in place (ascending); the index-based picks below rely on it.
210 std::sort(data.begin(), data.end());
212 const std::size_t n = data.size();
// Stage 1: statistics over all samples.
216 s.min_ns = data.front();
217 s.max_ns = data.back();
// Element at index n/2: for an even n this is the upper of the two middle
// elements, not their average.
218 s.median_ns = data[n / 2];
220 for (
auto v : data) sum += v;
221 s.mean_ns = sum /
static_cast<double>(n);
222 s.samples =
static_cast<int>(n);
// Stage 2: quartiles picked by index from the sorted data, with fences at
// 1.5 * IQR below q1 and above q3.
228 double const q1 = data[n / 4];
229 double const q3 = data[(3 * n) / 4];
230 double const iqr = q3 - q1;
231 double const lower = q1 - 1.5 * iqr;
232 double const upper = q3 + 1.5 * iqr;
// Keep only values inside [lower, upper]. `data` is sorted, so `filtered` is
// sorted as well.
234 std::vector<double> filtered;
236 for (
auto v : data) {
237 if (v >= lower && v <= upper) {
238 filtered.push_back(v);
// Handling of the "everything was filtered out" case is not visible here.
242 if (filtered.empty()) {
// Overwrite the stage-1 results with statistics of the filtered set.
247 const std::size_t fn = filtered.size();
248 s.outliers =
static_cast<int>(n - fn);
249 s.samples =
static_cast<int>(fn);
250 s.min_ns = filtered.front();
251 s.max_ns = filtered.back();
252 s.median_ns = filtered[fn / 2];
// NOTE(review): `sum` was already accumulated above; whether it is reset before
// this loop is not visible in this excerpt -- verify.
255 for (
auto v : filtered) sum += v;
256 s.mean_ns = sum /
static_cast<double>(fn);
// Deviation of each kept sample from the filtered mean (the accumulation into
// `var` is not visible in this excerpt).
259 for (
auto v : filtered) {
260 double const d = v - s.mean_ns;
// Divides by fn, not fn - 1.
263 s.stddev_ns = std::sqrt(var /
static_cast<double>(fn));
// Windows branch: restrict the current thread to affinity mask 1 (bit 0 only),
// then raise the process priority class and the thread priority.
// NOTE(review): the return values of all three calls are ignored, so a failure
// goes unnoticed. Presumably this is the body of pin_thread_and_elevate(); the
// signature is not visible in this excerpt.
272 SetThreadAffinityMask(GetCurrentThread(), 1ULL);
273 SetPriorityClass(GetCurrentProcess(), HIGH_PRIORITY_CLASS);
274 SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_HIGHEST);
275#elif defined(__linux__) && !defined(__ANDROID__)
// Linux (non-Android) branch: apply `cpuset` via sched_setaffinity with pid 0.
// How `cpuset` is populated is not visible in this excerpt, and the return
// value is ignored here as well.
280 sched_setaffinity(0,
sizeof(cpuset), &cpuset);
296 template <
typename Func>
297 double run(
int iters, Func&& func)
const {
// Collects one nanoseconds-per-iteration sample per pass.
// NOTE(review): presumably the body of run_stats(); the signature, the timing
// calls and the definition of `total_ns` are not visible in this excerpt.
302 template <
typename Func>
// One slot per pass, reserved up front.
311 std::vector<double> ns_per_iter;
312 ns_per_iter.reserve(
passes);
314 for (std::size_t p = 0; p <
passes; ++p) {
// Inner loop runs `iters` times per pass (its body is not visible here).
316 for (
int i = 0; i < iters; ++i) {
// Normalize the pass total to a per-iteration figure.
// NOTE(review): no guard against iters == 0 is visible before this division.
323 ns_per_iter.push_back(total_ns / iters);
// Prints one result row: a name left-justified in 28 columns, a value in ns,
// then min / median / stddev and two integers rendered as "n=<a>-<b>".
// NOTE(review): presumably part of run_and_print(); the printf arguments are
// not visible in this excerpt.
330 template <
typename Func>
333 (void)std::printf(
" %-28s %9.2f ns (min=%6.2f median=%6.2f stddev=%5.2f n=%d-%d)\n",
// Prints the harness settings: `warmup_iters` with %d and `passes` with %zu.
// The printf return values are deliberately discarded via the (void) cast.
// NOTE(review): presumably part of print_config(); the signature is not visible
// in this excerpt.
342 (void)std::printf(
" Warmup: %d iterations\n",
warmup_iters);
343 (void)std::printf(
" Passes: %zu (IQR outlier removal + median)\n",
passes);
// Formats a duration given in nanoseconds into `buf` (at most `buflen` bytes,
// via snprintf) with two decimals and a unit suffix: "ns", "us" or "ms".
// The snprintf results are discarded, so truncation is not reported.
// NOTE(review): the first condition (the "ns" threshold) and the return
// statement are not visible in this excerpt.
349inline const char*
format_ns(
double ns,
char* buf, std::size_t buflen) {
// Value printed as-is, in nanoseconds.
351 (void)std::snprintf(buf, buflen,
"%.2f ns", ns);
352 }
else if (ns < 1000000.0) {
// Below one million ns: scale to microseconds.
353 (void)std::snprintf(buf, buflen,
"%.2f us", ns / 1000.0);
// Remaining case: scale to milliseconds.
355 (void)std::snprintf(buf, buflen,
"%.2f ms", ns / 1000000.0);
Stats run_stats(int iters, Func &&func) const
double run(int iters, Func &&func) const
void print_config() const
Harness(int warmup, std::size_t p)
double run_and_print(const char *name, int iters, Func &&func) const
void DoNotOptimize(T const &value)
void pin_thread_and_elevate()
const char * format_ns(double ns, char *buf, std::size_t buflen)
Stats compute_stats(std::vector< double > &data)
static const char * timer_name() noexcept
static double ticks_to_ns(uint64_t ticks) noexcept
static uint64_t now() noexcept