From 696eaa156eae83c4389c9c54ef5e803ab9f92247 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Fri, 13 Mar 2026 19:37:03 +0100 Subject: [PATCH 1/3] fix: exclude paused sections from instrumentation measurement PauseTiming/ResumeTiming now stop/start Callgrind instrumentation in CODSPEED_ANALYSIS mode, ensuring setup/teardown code is not included in benchmark measurements. --- google_benchmark/src/benchmark.cc | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/google_benchmark/src/benchmark.cc b/google_benchmark/src/benchmark.cc index 0358c9a..472c1ce 100644 --- a/google_benchmark/src/benchmark.cc +++ b/google_benchmark/src/benchmark.cc @@ -19,7 +19,7 @@ #include "codspeed.h" #include "internal_macros.h" -#ifdef CODSPEED_WALLTIME +#if defined(CODSPEED_WALLTIME) || defined(CODSPEED_ANALYSIS) #include "measurement.hpp" #endif @@ -272,6 +272,9 @@ void State::PauseTiming() { #ifdef CODSPEED_WALLTIME uint64_t pause_timestamp = measurement_current_timestamp(); #endif +#ifdef CODSPEED_ANALYSIS + callgrind_stop_instrumentation(); +#endif // Add in time accumulated so far BM_CHECK(started_ && !finished_ && !skipped()); @@ -310,6 +313,9 @@ void State::ResumeTiming() { BM_CHECK(resume_timestamp_ == 0); resume_timestamp_ = measurement_current_timestamp(); #endif +#ifdef CODSPEED_ANALYSIS + callgrind_start_instrumentation(); +#endif } void State::SkipWithMessage(const std::string& msg) { From eafc113ddc9bf50ed29d059ba9ae1ff8b72f97a2 Mon Sep 17 00:00:00 2001 From: not-matthias Date: Fri, 5 Jun 2026 18:15:18 +0200 Subject: [PATCH 2/3] fix: exclude paused sections via callgrind collection toggling Replace the per-pause CALLGRIND_STOP/START_INSTRUMENTATION calls with CALLGRIND_TOGGLE_COLLECT. Toggling instrumentation flushes callgrind's simulated cache, so every benchmark measured an artificial cold-cache warmup phase and the cache-warming repetition in RunAnalysis was wasted, regressing 76 benchmarks. Toggling collection excludes paused sections without touching simulator state. Gate the toggles to explicit user PauseTiming()/ResumeTiming() calls: TOGGLE_COLLECT is parity-based and collection starts enabled, so the implicit StartKeepRunning()/FinishKeepRunning() bracket must not toggle or it would invert the collection state for the whole run. Bump instrument-hooks for the new callgrind_toggle_collect helper. Fixes COD-2033 --- core/CMakeLists.txt | 2 +- core/instrument-hooks | 2 +- .../include/benchmark/benchmark.h | 9 +++++++ google_benchmark/src/benchmark.cc | 26 +++++++++++++++++-- 4 files changed, 35 insertions(+), 4 deletions(-) diff --git a/core/CMakeLists.txt b/core/CMakeLists.txt index 8a7eca0..ff9053c 100644 --- a/core/CMakeLists.txt +++ b/core/CMakeLists.txt @@ -15,7 +15,7 @@ include(FetchContent) FetchContent_Declare( instrument_hooks_repo GIT_REPOSITORY https://github.com/CodSpeedHQ/instrument-hooks - GIT_TAG 89fb72a076ec71c9eca6eee9bca98bada4b4dfb4 + GIT_TAG b6c4a75ecd81462bfc4d975e13060217799cd6ef ) FetchContent_MakeAvailable(instrument_hooks_repo) FetchContent_GetProperties(instrument_hooks_repo) diff --git a/core/instrument-hooks b/core/instrument-hooks index 89fb72a..b6c4a75 160000 --- a/core/instrument-hooks +++ b/core/instrument-hooks @@ -1 +1 @@ -Subproject commit 89fb72a076ec71c9eca6eee9bca98bada4b4dfb4 +Subproject commit b6c4a75ecd81462bfc4d975e13060217799cd6ef diff --git a/google_benchmark/include/benchmark/benchmark.h b/google_benchmark/include/benchmark/benchmark.h index f55288d..d023cd9 100644 --- a/google_benchmark/include/benchmark/benchmark.h +++ b/google_benchmark/include/benchmark/benchmark.h @@ -949,6 +949,15 @@ class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State { #if defined(CODSPEED_ANALYSIS) || defined(CODSPEED_WALLTIME) codspeed::CodSpeed* codspeed_; #endif +#ifdef CODSPEED_ANALYSIS + // True between the implicit ResumeTiming() in StartKeepRunning() and the + // implicit PauseTiming() in FinishKeepRunning(). Only explicit user + // PauseTiming()/ResumeTiming() calls (inside the benchmark loop) may + // toggle callgrind collection: CALLGRIND_TOGGLE_COLLECT is parity-based, + // and collection starts enabled, so the implicit bracket calls must not + // toggle or they would invert the collection state for the whole run. + bool codspeed_in_benchmark_loop_; +#endif #ifdef CODSPEED_WALLTIME uint64_t resume_timestamp_; #endif diff --git a/google_benchmark/src/benchmark.cc b/google_benchmark/src/benchmark.cc index 472c1ce..3223805 100644 --- a/google_benchmark/src/benchmark.cc +++ b/google_benchmark/src/benchmark.cc @@ -199,6 +199,9 @@ State::State(std::string name, IterationCount max_iters, #if defined(CODSPEED_ANALYSIS) || defined(CODSPEED_WALLTIME) codspeed_(codspeed), #endif +#ifdef CODSPEED_ANALYSIS + codspeed_in_benchmark_loop_(false), +#endif #ifdef CODSPEED_WALLTIME resume_timestamp_(0), #endif @@ -273,7 +276,13 @@ void State::PauseTiming() { uint64_t pause_timestamp = measurement_current_timestamp(); #endif #ifdef CODSPEED_ANALYSIS - callgrind_stop_instrumentation(); + // Suspend callgrind cost collection for the paused section. Toggling + // collection (unlike stopping instrumentation) does not flush the + // simulated cache, so it adds no artificial cold-cache cost to the + // measured region. + if (codspeed_in_benchmark_loop_) { + callgrind_toggle_collect(); + } #endif // Add in time accumulated so far @@ -314,7 +323,10 @@ void State::ResumeTiming() { resume_timestamp_ = measurement_current_timestamp(); #endif #ifdef CODSPEED_ANALYSIS - callgrind_start_instrumentation(); + // Re-enable callgrind cost collection after a paused section. + if (codspeed_in_benchmark_loop_) { + callgrind_toggle_collect(); + } #endif } @@ -367,12 +379,22 @@ void State::StartKeepRunning() { manager_->StartStopBarrier(); if (!skipped()) { ResumeTiming(); +#ifdef CODSPEED_ANALYSIS + // Arm collection toggling only after the implicit ResumeTiming() above, + // so that only explicit user pauses inside the loop toggle collection. + codspeed_in_benchmark_loop_ = true; +#endif } } void State::FinishKeepRunning() { BM_CHECK(started_ && (!finished_ || skipped())); if (!skipped()) { +#ifdef CODSPEED_ANALYSIS + // Disarm before the implicit PauseTiming() below so it does not toggle + // collection off for the rest of the process. + codspeed_in_benchmark_loop_ = false; +#endif PauseTiming(); } // Total iterations has now wrapped around past 0. Fix this. From 4dd226e242b019518d7a19487451d6b64c756a9d Mon Sep 17 00:00:00 2001 From: not-matthias Date: Fri, 5 Jun 2026 18:59:12 +0200 Subject: [PATCH 3/3] ref: exclude benchmark harness from measurement via collection steady-state Toggle callgrind collection off once at process start and make the PauseTiming()/ResumeTiming() toggles unconditional. Collection is now only enabled inside the benchmark loop, so State setup, timer reads and instrument-hooks zero/dump requests no longer appear in the measurement, and the codspeed_in_benchmark_loop_ gating flag is no longer needed. The toggle is inlined via CALLGRIND_TOGGLE_COLLECT directly (instead of calling the instrument-hooks wrapper) so no toggle frame shows up in flamegraphs; the counted boundary shrinks to the ResumeTiming() epilogue (~6 instructions). SkipWithMessage/SkipWithError restore the toggle parity when a benchmark is skipped mid-loop without pausing. Refs COD-2033 --- .../include/benchmark/benchmark.h | 9 ---- google_benchmark/src/benchmark.cc | 52 +++++++++---------- 2 files changed, 24 insertions(+), 37 deletions(-) diff --git a/google_benchmark/include/benchmark/benchmark.h b/google_benchmark/include/benchmark/benchmark.h index d023cd9..f55288d 100644 --- a/google_benchmark/include/benchmark/benchmark.h +++ b/google_benchmark/include/benchmark/benchmark.h @@ -949,15 +949,6 @@ class BENCHMARK_EXPORT BENCHMARK_INTERNAL_CACHELINE_ALIGNED State { #if defined(CODSPEED_ANALYSIS) || defined(CODSPEED_WALLTIME) codspeed::CodSpeed* codspeed_; #endif -#ifdef CODSPEED_ANALYSIS - // True between the implicit ResumeTiming() in StartKeepRunning() and the - // implicit PauseTiming() in FinishKeepRunning(). Only explicit user - // PauseTiming()/ResumeTiming() calls (inside the benchmark loop) may - // toggle callgrind collection: CALLGRIND_TOGGLE_COLLECT is parity-based, - // and collection starts enabled, so the implicit bracket calls must not - // toggle or they would invert the collection state for the whole run. - bool codspeed_in_benchmark_loop_; -#endif #ifdef CODSPEED_WALLTIME uint64_t resume_timestamp_; #endif diff --git a/google_benchmark/src/benchmark.cc b/google_benchmark/src/benchmark.cc index 3223805..aa464ec 100644 --- a/google_benchmark/src/benchmark.cc +++ b/google_benchmark/src/benchmark.cc @@ -61,6 +61,24 @@ #include "thread_timer.h" namespace benchmark { + +#ifdef CODSPEED_ANALYSIS +// Callgrind cost collection is kept off outside the benchmark loop (one +// initial toggle below, inverting the --collect-atstart=yes default) and +// toggled on/off by ResumeTiming()/PauseTiming(). This keeps harness code and +// paused sections out of the measurement without flushing the simulated cache +// (unlike CALLGRIND_STOP_INSTRUMENTATION). The toggle must be the first +// statement in PauseTiming() and the last in ResumeTiming(). +#define CODSPEED_TOGGLE_COLLECT() CALLGRIND_TOGGLE_COLLECT +namespace { +struct CodSpeedCollectInit { + CodSpeedCollectInit() { CODSPEED_TOGGLE_COLLECT(); } +} const codspeed_collect_init; +} // namespace +#else +#define CODSPEED_TOGGLE_COLLECT() +#endif + // Print a list of benchmarks. This option overrides all other options. BM_DEFINE_bool(benchmark_list_tests, false); @@ -199,9 +217,6 @@ State::State(std::string name, IterationCount max_iters, #if defined(CODSPEED_ANALYSIS) || defined(CODSPEED_WALLTIME) codspeed_(codspeed), #endif -#ifdef CODSPEED_ANALYSIS - codspeed_in_benchmark_loop_(false), -#endif #ifdef CODSPEED_WALLTIME resume_timestamp_(0), #endif @@ -275,15 +290,7 @@ void State::PauseTiming() { #ifdef CODSPEED_WALLTIME uint64_t pause_timestamp = measurement_current_timestamp(); #endif -#ifdef CODSPEED_ANALYSIS - // Suspend callgrind cost collection for the paused section. Toggling - // collection (unlike stopping instrumentation) does not flush the - // simulated cache, so it adds no artificial cold-cache cost to the - // measured region. - if (codspeed_in_benchmark_loop_) { - callgrind_toggle_collect(); - } -#endif + CODSPEED_TOGGLE_COLLECT(); // Add in time accumulated so far BM_CHECK(started_ && !finished_ && !skipped()); @@ -322,12 +329,7 @@ void State::ResumeTiming() { BM_CHECK(resume_timestamp_ == 0); resume_timestamp_ = measurement_current_timestamp(); #endif -#ifdef CODSPEED_ANALYSIS - // Re-enable callgrind cost collection after a paused section. - if (codspeed_in_benchmark_loop_) { - callgrind_toggle_collect(); - } -#endif + CODSPEED_TOGGLE_COLLECT(); } void State::SkipWithMessage(const std::string& msg) { @@ -342,6 +344,8 @@ void State::SkipWithMessage(const std::string& msg) { total_iterations_ = 0; if (timer_->running()) { timer_->StopTimer(); + // Skipped mid-loop without pausing: restore the collection-off state. + CODSPEED_TOGGLE_COLLECT(); } } @@ -357,6 +361,8 @@ void State::SkipWithError(const std::string& msg) { total_iterations_ = 0; if (timer_->running()) { timer_->StopTimer(); + // Skipped mid-loop without pausing: restore the collection-off state. + CODSPEED_TOGGLE_COLLECT(); } } @@ -379,22 +385,12 @@ void State::StartKeepRunning() { manager_->StartStopBarrier(); if (!skipped()) { ResumeTiming(); -#ifdef CODSPEED_ANALYSIS - // Arm collection toggling only after the implicit ResumeTiming() above, - // so that only explicit user pauses inside the loop toggle collection. - codspeed_in_benchmark_loop_ = true; -#endif } } void State::FinishKeepRunning() { BM_CHECK(started_ && (!finished_ || skipped())); if (!skipped()) { -#ifdef CODSPEED_ANALYSIS - // Disarm before the implicit PauseTiming() below so it does not toggle - // collection off for the rest of the process. - codspeed_in_benchmark_loop_ = false; -#endif PauseTiming(); } // Total iterations has now wrapped around past 0. Fix this.