guanaqo 1.0.0-alpha.26
Utilities for scientific software
Loading...
Searching...
No Matches
counters.cpp
Go to the documentation of this file.
#include <atomic>

#if GUANAQO_WITH_PCM

#include <array>
#include <bit>
#include <cmath>
#include <cstdint>
#include <format>
#include <memory>
#include <mutex>
#include <optional>
#include <stdexcept>
#include <utility>
#include <vector>

#include <msr.h>
#include <sched.h>
#include <types.h>
#include <unistd.h>
17
18namespace guanaqo::pcm {
19
20using namespace ::pcm;
21
23 static thread_local ThreadPerfCounters counters;
24 return counters;
25}
26
27namespace {
28
/// Get the current CPU ID.
/// A non-negative @p cpu is passed through unchanged; a negative value
/// (the default) queries the calling thread's current CPU.
int get_cpu(int cpu = -1) {
    return cpu >= 0 ? cpu : sched_getcpu();
}
35
/// Implementation follows cpucounters.cpp:4288-4303 (setEvent lambda)
/// Creates an IA32_PERFEVTSELx register value from event number and umask.
/// Counting is enabled for both user and kernel mode; all other features
/// (edge detect, pin control, APIC interrupt, cmask, TSX qualifiers) are
/// left disabled.
constexpr uint64 makeEventSelect(uint8_t event, uint8_t umask) {
    // Field order must match the declaration order of
    // EventSelectRegister::fields (designated initializers).
    const decltype(EventSelectRegister::fields) fields{
        .event_select = event, // Bits 7:0 - Event select
        .umask        = umask, // Bits 15:8 - Unit mask
        .usr          = 1,     // Bit 16 - Count in user mode (CPL > 0)
        .os           = 1,     // Bit 17 - Count in kernel mode (CPL = 0)
        .edge         = 0,     // Bit 18 - Edge detect (0=count cycles)
        .pin_control  = 0,     // Bit 19 - Pin control
        .apic_int     = 0,     // Bit 20 - APIC interrupt enable
        .any_thread   = 0,     // Bit 21 - Any thread (0=this thread only)
        .enable       = 1,     // Bit 22 - Enable counter
        .invert       = 0,     // Bit 23 - Invert counter mask
        .cmask        = 0,     // Bits 31:24 - Counter mask
        .in_tx        = 0,     // Bit 32 - In TSX transaction
        .in_txcp      = 0,     // Bit 33 - In TSX abort handler
        .reservedX    = 0,     // Bits 63:34 - Reserved must be zero
    };
    // Reinterpret the bit-field struct as the raw 64-bit MSR value.
    return std::bit_cast<uint64>(fields);
}
57
58/// Helper to extract a range of bits from a 64-bit value.
59constexpr uint64 extract_bits(uint64 value, int start, int end) {
60 uint64 mask = (uint64{1} << (end - start + 1)) - 1;
61 return (value >> start) & mask;
62}
63
/// Minimal per-CPU hardware performance counter driver.
/// Programs the fixed and programmable PMU counters of a single CPU
/// directly through its MSRs, bypassing the heavier PCM initialization
/// machinery. Not thread-safe by itself: callers serialize access per
/// CPU (see CountersEntry::mtx).
class LightweightPerfCounters {
  private:
    /// MSR access handle bound to one CPU; null only after being moved from.
    std::unique_ptr<MsrHandle> msr;

    /// Event select configuration for different counters
    struct CounterConfig {
        uint64 evtsel_addr; ///< IA32_PERFEVTSELx MSR address
        uint64 pmc_addr;    ///< Corresponding IA32_PMCx counter MSR address
        uint64 config;      ///< Event select value to program
    };

    /// Configure 4 programmable counters with Skylake+ events
    /// Event configuration follows cpucounters.cpp:3735-3738 and 4318-4319
    /// Uses `SKL_MEM_LOAD_RETIRED_*` constants from types.h:127-134
    static constexpr std::array<CounterConfig, 4> counters{{
        // Counter 0: L2 cache misses
        // Event: MEM_LOAD_RETIRED.L2_MISS (0xD1:0x10)
        {.evtsel_addr = IA32_PERFEVTSEL0_ADDR,
         .pmc_addr    = IA32_PMC0,
         .config      = makeEventSelect(SKL_MEM_LOAD_RETIRED_L2_MISS_EVTNR,
                                        SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK)},
        // Counter 1: L2 cache hits
        // Event: MEM_LOAD_RETIRED.L2_HIT (0xD1:0x02)
        {.evtsel_addr = IA32_PERFEVTSEL1_ADDR,
         .pmc_addr    = IA32_PMC1,
         .config      = makeEventSelect(SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR,
                                        SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK)},
        // Counter 2: L3 cache misses
        // Event: MEM_LOAD_RETIRED.L3_MISS (0xD1:0x20)
        {.evtsel_addr = IA32_PERFEVTSEL2_ADDR,
         .pmc_addr    = IA32_PMC2,
         .config      = makeEventSelect(SKL_MEM_LOAD_RETIRED_L3_MISS_EVTNR,
                                        SKL_MEM_LOAD_RETIRED_L3_MISS_UMASK)},
        // Counter 3: Branch mispredictions
        // Event: BR_MISP_RETIRED.ALL_BRANCHES (0xC5:0x00)
        {.evtsel_addr = IA32_PERFEVTSEL3_ADDR,
         .pmc_addr    = IA32_PMC3,
         .config      = makeEventSelect(0xC5, 0x00)},
    }};

  public:
    LightweightPerfCounters(uint32_t cpu)
        : msr(std::make_unique<MsrHandle>(cpu)) {}
    LightweightPerfCounters(const LightweightPerfCounters &) = delete;
    LightweightPerfCounters &
    operator=(const LightweightPerfCounters &) = delete;
    // NOTE: the defaulted moves leave the source's `msr` null; stop()
    // guards against that so destroying a moved-from object is safe.
    // (Move-assigning over an *active* instance replaces its handle
    // without stopping its counters — avoid that pattern.)
    LightweightPerfCounters(LightweightPerfCounters &&) noexcept = default;
    LightweightPerfCounters &
    operator=(LightweightPerfCounters &&) noexcept = default;
    ~LightweightPerfCounters() { stop(); }

    // Start counting
    // Implementation follows cpucounters.cpp:4252-4422
    void start() {
        // Step 1: Disable counters during programming (cpucounters.cpp:4252)
        // This prevents spurious counts while we're configuring
        msr->write(pcm::IA32_CR_PERF_GLOBAL_CTRL, 0);

        // Step 2: Configure fixed counter control register (cpucounters.cpp:4138-4162)
        // IA32_CR_FIXED_CTR_CTRL (MSR 0x38D) controls 3 fixed counters:
        //   Counter 0: Instructions Retired (INST_RETIRED.ANY)
        //   Counter 1: Core Clocks Unhalted (CPU_CLK_UNHALTED.THREAD)
        //   Counter 2: Reference Clocks Unhalted (CPU_CLK_UNHALTED.REF_TSC)
        // Each counter uses 4 bits: [1:0]=enable user+kernel, [2]=any_thread, [3]=pmi
        // We set os=1 (kernel) and usr=1 (user) for each counter
        FixedEventControlRegister fixed_ctrl_reg;
        fixed_ctrl_reg.value      = 0;
        fixed_ctrl_reg.fields.os0 = 1; // Counter 0: count in kernel mode
        fixed_ctrl_reg.fields.usr0 = 1; // Counter 0: count in user mode
        fixed_ctrl_reg.fields.os1  = 1; // Counter 1: count in kernel mode
        fixed_ctrl_reg.fields.usr1 = 1; // Counter 1: count in user mode
        fixed_ctrl_reg.fields.os2  = 1; // Counter 2: count in kernel mode
        fixed_ctrl_reg.fields.usr2 = 1; // Counter 2: count in user mode

        // Check if TOPDOWN.SLOTS is supported (Ice Lake+) and enable fixed counter 3
        // NOTE: On hybrid CPUs (12th gen+), this only works on P-cores, not E-cores
        // Use taskset to pin to a P-core: taskset -c 0 ./lightweight_perf_example
        uint64 test_slots = 0;
        if (msr->read(TOPDOWN_SLOTS_ADDR, &test_slots) == sizeof(uint64)) {
            // Counter 3: count in kernel mode (TOPDOWN.SLOTS)
            fixed_ctrl_reg.fields.os3  = 1;
            fixed_ctrl_reg.fields.usr3 = 1; // Counter 3: count in user mode
        }

        // Step 3: Reset all fixed counters BEFORE writing control register (cpucounters.cpp:4257-4260)
        msr->write(INST_RETIRED_ADDR, 0);            // MSR 0x309
        msr->write(CPU_CLK_UNHALTED_THREAD_ADDR, 0); // MSR 0x30A
        msr->write(CPU_CLK_UNHALTED_REF_ADDR, 0);    // MSR 0x30B
        msr->write(IA32_CR_FIXED_CTR_CTRL, fixed_ctrl_reg.value);

        // Step 4: Reset programmable counters BEFORE writing event selectors (cpucounters.cpp:4381)
        for (const auto &counter : counters)
            msr->write(counter.pmc_addr, 0);

        // Step 5: Program programmable counter event selectors (cpucounters.cpp:4382)
        for (const auto &counter : counters)
            msr->write(counter.evtsel_addr, counter.config);

        // Step 6: Reset Top-Down metrics MSRs (cpucounters.cpp:4393-4400)
        // TOPDOWN_SLOTS (MSR 0x30C): Counts total pipeline slots
        // PERF_METRICS (MSR 0x329): Accumulated TMA metrics (Ice Lake+)
        // Writes to unsupported MSRs are best-effort (errors ignored).
        msr->write(TOPDOWN_SLOTS_ADDR, 0); // MSR 0x30C (types.h:53)
        msr->write(PERF_METRICS_ADDR, 0);  // MSR 0x329 (types.h:54)

        // Step 7: Enable all counters globally (cpucounters.cpp:4386-4422)
        // IA32_CR_PERF_GLOBAL_CTRL (MSR 0x38F):
        //   Bits 0-3: Enable PMC0-3
        //   Bits 32-34: Enable fixed counters 0-2
        // Start with basic counters only (cpucounters.cpp:4388)
        uint64 global_ctrl = (uint64{0xF} << 0) | // PMC0-3
                             (uint64{0x7} << 32); // Fixed 0-2

        // Try to enable TOPDOWN.SLOTS (Ice Lake+) - if CPU doesn't support it, reading will fail
        // On hybrid CPUs, this check will fail on E-cores but succeed on P-cores
        uint64 test_val = 0;
        if (msr->read(TOPDOWN_SLOTS_ADDR, &test_val) == sizeof(uint64))
            // Enable fixed counter 3 (TOPDOWN.SLOTS)
            global_ctrl |= uint64{1} << 35;

        // Try to enable PERF_METRICS (Ice Lake+)
        if (msr->read(PERF_METRICS_ADDR, &test_val) == sizeof(uint64))
            // Enable PERF_METRICS
            global_ctrl |= uint64{1} << 48;

        // Clear any overflow status bits (cpucounters.cpp:4420)
        msr->write(IA32_PERF_GLOBAL_OVF_CTRL, global_ctrl);
        // Enable all configured counters (cpucounters.cpp:4421)
        msr->write(IA32_CR_PERF_GLOBAL_CTRL, global_ctrl);
    }

    // Stop counting (safe to call on moved-from objects)
    void stop() {
        // BUG FIX: the defaulted move operations leave `msr` null, and the
        // destructor calls stop() unconditionally — dereferencing the null
        // handle here was undefined behavior for moved-from objects.
        if (!msr)
            return;
        // (cpucounters.cpp:5264-5297)
        msr->write(IA32_CR_PERF_GLOBAL_CTRL, 0);
        // Disable event selectors
        for (const auto &counter : counters)
            msr->write(counter.evtsel_addr, 0);
        msr->write(IA32_CR_FIXED_CTR_CTRL, 0);
    }

    /// Raw counter values at one point in time (abs_*) plus Top-Down slot
    /// deltas accumulated since the previous read() (delta_*).
    struct Snapshot {
        uint64 abs_instructions;  // Fixed counter 0
        uint64 abs_cycles;        // Fixed counter 1
        uint64 abs_ref_cycles;    // Fixed counter 2
        uint64 abs_l2_misses;     // PMC0
        uint64 abs_l2_hits;       // PMC1
        uint64 abs_l3_misses;     // PMC2
        uint64 abs_branch_misses; // PMC3

        // Top-Down Microarchitecture Analysis (Ice Lake+)
        uint64 delta_slots;
        uint64 delta_frontend_bound_slots;
        uint64 delta_backend_bound_slots;
        uint64 delta_bad_speculation_slots;
        uint64 delta_retiring_slots;
        uint64 delta_mem_bound_slots; // Level 2 metric
        uint64 delta_fetch_lat_slots; // Level 2 metric
    };

    /// Add the difference between two snapshots to @p ctr.
    /// The abs_* fields are differenced (before → after); the delta_* fields
    /// of @p after are already deltas (read() resets the Top-Down MSRs) and
    /// are added directly.
    static void accumulate(ThreadPerfCounters &ctr, const Snapshot &before,
                           const Snapshot &after) {
        ctr.instructions += after.abs_instructions - before.abs_instructions;
        ctr.cycles += after.abs_cycles - before.abs_cycles;
        ctr.ref_cycles += after.abs_ref_cycles - before.abs_ref_cycles;
        ctr.l2_misses += after.abs_l2_misses - before.abs_l2_misses;
        ctr.l2_hits += after.abs_l2_hits - before.abs_l2_hits;
        ctr.l3_misses += after.abs_l3_misses - before.abs_l3_misses;
        ctr.branch_misses += after.abs_branch_misses - before.abs_branch_misses;

        ctr.all_slots += after.delta_slots;
        ctr.frontend_bound_slots += after.delta_frontend_bound_slots;
        ctr.backend_bound_slots += after.delta_backend_bound_slots;
        ctr.bad_speculation_slots += after.delta_bad_speculation_slots;
        ctr.retiring_slots += after.delta_retiring_slots;
        ctr.mem_bound_slots += after.delta_mem_bound_slots;
        ctr.fetch_lat_slots += after.delta_fetch_lat_slots;
    }

    /// Read all counters of this CPU.
    /// @throws std::runtime_error if the calling thread has migrated away
    ///         from the CPU this instance was created for.
    Snapshot read() {
        // Check for CPU migration
        auto current_cpu = static_cast<int32_t>(sched_getcpu());
        if (current_cpu != msr->getCoreId())
            throw std::runtime_error(std::format(
                "CPU migration detected: was on CPU {} , now on CPU {}",
                msr->getCoreId(), current_cpu));

        Snapshot s{};
        msr->read(INST_RETIRED_ADDR, &s.abs_instructions);
        msr->read(CPU_CLK_UNHALTED_THREAD_ADDR, &s.abs_cycles);
        msr->read(CPU_CLK_UNHALTED_REF_ADDR, &s.abs_ref_cycles);
        msr->read(counters[0].pmc_addr, &s.abs_l2_misses);
        msr->read(counters[1].pmc_addr, &s.abs_l2_hits);
        msr->read(counters[2].pmc_addr, &s.abs_l3_misses);
        msr->read(counters[3].pmc_addr, &s.abs_branch_misses);

        // Read Top-Down metrics (Ice Lake+)
        // (cpucounters.cpp:5702-5741)
        // IMPORTANT: These MSRs accumulate since the last reset - they are STATEFUL
        // PERF_METRICS: Contains accumulated ratios (8-bit per metric)
        // TOPDOWN_SLOTS: Contains accumulated slot count
        // Both must be reset after reading to get deltas on the next read
        uint64 perf_metrics  = 0;
        uint64 topdown_slots = 0;

        if (msr->read(PERF_METRICS_ADDR, &perf_metrics) == sizeof(uint64) &&
            msr->read(TOPDOWN_SLOTS_ADDR, &topdown_slots) == sizeof(uint64)) {

            // Reset counters for next measurement period (read+reset as delta)
            msr->write(PERF_METRICS_ADDR, 0);
            msr->write(TOPDOWN_SLOTS_ADDR, 0);

            // Extract Level 1 metric ratios (8-bit values, 0-255)
            uint64 retiring_ratio = extract_bits(perf_metrics, 0, 7);
            uint64 bad_spec_ratio = extract_bits(perf_metrics, 8, 15);
            uint64 frontend_ratio = extract_bits(perf_metrics, 16, 23);
            uint64 backend_ratio  = extract_bits(perf_metrics, 24, 31);

            // Extract Level 2 metric ratios (Sapphire Rapids+)
            uint64 mem_bound_ratio = extract_bits(perf_metrics, 56, 63);
            uint64 fetch_lat_ratio = extract_bits(perf_metrics, 48, 55);

            // Scale ratios to actual slot counts (DELTA values since last read)
            // The ratio represents accumulated behavior since last reset
            const auto total_ratio = retiring_ratio + bad_spec_ratio +
                                     frontend_ratio + backend_ratio;
            const auto inv_total_ratio =
                total_ratio > 0 ? 1.0 / static_cast<double>(total_ratio) : 0.0;
            auto delta_slots = [&](uint64 ratio) {
                return static_cast<uint64_t>(
                    std::round((static_cast<double>(ratio) * inv_total_ratio) *
                               static_cast<double>(topdown_slots)));
            };
            s.delta_slots                 = topdown_slots;
            s.delta_frontend_bound_slots  = delta_slots(frontend_ratio);
            s.delta_backend_bound_slots   = delta_slots(backend_ratio);
            s.delta_bad_speculation_slots = delta_slots(bad_spec_ratio);
            s.delta_retiring_slots        = delta_slots(retiring_ratio);
            s.delta_mem_bound_slots       = delta_slots(mem_bound_ratio);
            s.delta_fetch_lat_slots       = delta_slots(fetch_lat_ratio);
        }

        return s;
    }
};
308
/// Per-CPU slot holding the lazily-initialized counters for one CPU.
/// alignas(128) keeps each entry on its own cache line(s) to avoid false
/// sharing between threads using adjacent CPUs' entries.
struct alignas(128) CountersEntry {
    // Should be uncontended since each CPU is accessed by only one thread at a time
    std::mutex mtx;
    std::optional<LightweightPerfCounters> counters; // engaged on first use
};
314
315GUANAQO_EXPORT std::vector<CountersEntry> &get_all_perf_counters() {
316 static std::vector<CountersEntry> instances = [] {
317 long max_cpus = sysconf(_SC_NPROCESSORS_CONF);
318 return std::vector<CountersEntry>(static_cast<size_t>(max_cpus));
319 }();
320 return instances;
321}
322
323std::pair<std::unique_lock<std::mutex>, LightweightPerfCounters &>
324get_perf_counters(int cpu) {
325 auto &inst = get_all_perf_counters()[static_cast<size_t>(cpu)];
326 std::unique_lock lck{inst.mtx};
327 if (!inst.counters)
328 inst.counters.emplace(static_cast<uint32_t>(cpu)).start();
329 return {std::move(lck), *inst.counters};
330}
331
332void stop_all_perf_counters() {
333 for (auto &entry : get_all_perf_counters()) {
334 std::lock_guard lck{entry.mtx};
335 entry.counters.reset();
336 }
337}
338
339} // namespace
340
341namespace detail {
342namespace {
343
/// RAII guard that pins the calling thread to a single CPU and restores
/// the previous affinity mask on destruction.
struct ScopedThreadAffinity {
    cpu_set_t original_set;
    bool saved; ///< whether original_set holds a valid mask to restore
    ScopedThreadAffinity(int cpu) {
        // Save original affinity. BUG FIX: the return value was ignored
        // before — on failure, original_set is indeterminate and the
        // destructor would have restored garbage. Track success instead.
        saved = sched_getaffinity(0, sizeof(cpu_set_t), &original_set) == 0;
        cpu_set_t set;
        CPU_ZERO(&set);
        CPU_SET(cpu, &set); // Set affinity to target CPU
        // Best effort: if pinning fails, measurements may run on another
        // CPU, but LightweightPerfCounters::read() detects migration.
        sched_setaffinity(0, sizeof(cpu_set_t), &set);
    }
    ScopedThreadAffinity(const ScopedThreadAffinity &) = delete;
    ScopedThreadAffinity &operator=(const ScopedThreadAffinity &) = delete;
    ~ScopedThreadAffinity() {
        // Restore original affinity (only if it was saved successfully)
        if (saved)
            sched_setaffinity(0, sizeof(cpu_set_t), &original_set);
    }
};
361
/// RAII collector: pins the calling thread to a CPU, snapshots that CPU's
/// hardware counters on construction, and accumulates the delta into the
/// thread-local ThreadPerfCounters when stop() is called.
/// Member order matters: `affinity` must pin the thread before
/// `cpu_counters` acquires (and lazily starts) the per-CPU counters.
struct PCMScopedCounters : ScopedCounters {
    ScopedThreadAffinity affinity; // keeps the thread on `cpu` for this scope
    // Holds the per-CPU lock for the lifetime of this object, so no other
    // thread can reprogram this CPU's counters while we measure.
    std::pair<std::unique_lock<std::mutex>, LightweightPerfCounters &>
        cpu_counters;
    LightweightPerfCounters::Snapshot start; // counter values at construction

    PCMScopedCounters(int cpu)
        : affinity(cpu), cpu_counters(get_perf_counters(cpu)) {
        start = cpu_counters.second.read();
    }
    ThreadPerfCounters &get() override { return get_thread_perf_counters(); }
    // Read the counters again and fold the difference since construction
    // into the calling thread's accumulated totals.
    ThreadPerfCounters &stop() override {
        auto after = cpu_counters.second.read();
        auto &thread_counters = get_thread_perf_counters();
        LightweightPerfCounters::accumulate(thread_counters, start, after);
        return thread_counters;
    }
};
380
381} // namespace
382} // namespace detail
383
384GUANAQO_EXPORT std::atomic_bool &get_counters_enabled_flag() {
385 static std::atomic_bool enabled{false};
386 return enabled;
387}
388
390 if (get_counters_enabled_flag().exchange(false, std::memory_order_relaxed))
391 stop_all_perf_counters();
392}
393
395 get_counters_enabled_flag().store(true, std::memory_order_relaxed);
396}
397
398std::unique_ptr<detail::ScopedCounters> start_counters() {
399 if (get_counters_enabled_flag().load(std::memory_order_relaxed))
400 return std::make_unique<detail::PCMScopedCounters>(get_cpu());
401 return {};
402}
403
404} // namespace guanaqo::pcm
405
406#else // Fallback implementation when PCM is not available
407
408namespace guanaqo::pcm {
409
410std::unique_ptr<detail::ScopedCounters> start_counters() { return {}; }
// NOTE(review): these were bare declarations in the non-PCM branch, which
// leaves the public symbols undefined when GUANAQO_WITH_PCM is off —
// callers of the API would fail to link. Define them as no-ops instead;
// confirm no other translation unit provides fallback definitions.
void disable_counters() {}
void enable_counters() {}
413
414} // namespace guanaqo::pcm
415
416#endif
Performance counter snapshots and scoped collectors.
void disable_counters()
Disables performance counters globally. Blocks until all active counters have stopped.
Definition counters.cpp:389
std::unique_ptr< detail::ScopedCounters > start_counters()
May return null if PCM is not available.
Definition counters.cpp:398
void enable_counters()
Enables performance counters globally.
Definition counters.cpp:394
ThreadPerfCounters & get_thread_perf_counters()
Definition counters.cpp:22
std::atomic_bool & get_counters_enabled_flag()
Definition counters.cpp:384