// NOTE(review): fragment — only the signature of get_cpu is visible in this
// chunk; the body is scraped out. Presumably returns `cpu` when >= 0 and the
// current CPU (sched_getcpu?) otherwise — confirm against the full file.
30int get_cpu(
int cpu = -1) {
// Build the value to program into an IA32_PERFEVTSELx MSR from an event
// number and unit mask, by filling the register's bit-field struct and
// bit_cast-ing it to a raw 64-bit value.
// NOTE(review): fragment — the designated initializers after .event_select
// (umask, and presumably usr/os/enable bits) are scraped out of this chunk.
38constexpr uint64 makeEventSelect(uint8_t event, uint8_t umask) {
39 const decltype(EventSelectRegister::fields) fields{
40 .event_select = event,
55 return std::bit_cast<uint64>(fields);
59constexpr uint64 extract_bits(uint64 value,
int start,
int end) {
60 uint64 mask = (uint64{1} << (end - start + 1)) - 1;
61 return (value >> start) & mask;
// Per-core performance-counter access via raw MSR reads/writes, holding an
// MsrHandle for one logical CPU. NOTE(review): fragment — several members
// between the scraped original line numbers are missing from this chunk.
64class LightweightPerfCounters {
// RAII handle used for every MSR read/write on this core.
66 std::unique_ptr<MsrHandle> msr;
// One programmable counter: its event-select MSR address, its PMC MSR
// address, and the event-select value to program. (The field declarations
// themselves are scraped out of this chunk.)
69 struct CounterConfig {
// The four fixed counter assignments programmed by start():
78 static constexpr std::array<CounterConfig, 4> counters{{
// PMC0: retired loads that missed L2.
81 {.evtsel_addr = IA32_PERFEVTSEL0_ADDR,
82 .pmc_addr = IA32_PMC0,
83 .config = makeEventSelect(SKL_MEM_LOAD_RETIRED_L2_MISS_EVTNR,
84 SKL_MEM_LOAD_RETIRED_L2_MISS_UMASK)},
// PMC1: retired loads that hit L2.
87 {.evtsel_addr = IA32_PERFEVTSEL1_ADDR,
88 .pmc_addr = IA32_PMC1,
89 .config = makeEventSelect(SKL_MEM_LOAD_RETIRED_L2_HIT_EVTNR,
90 SKL_MEM_LOAD_RETIRED_L2_HIT_UMASK)},
// PMC2: retired loads that missed L3.
93 {.evtsel_addr = IA32_PERFEVTSEL2_ADDR,
94 .pmc_addr = IA32_PMC2,
95 .config = makeEventSelect(SKL_MEM_LOAD_RETIRED_L3_MISS_EVTNR,
96 SKL_MEM_LOAD_RETIRED_L3_MISS_UMASK)},
// PMC3: raw event 0xC5 with umask 0 — read into abs_branch_misses below,
// i.e. mispredicted branches retired (magic constant: no named macro used).
99 {.evtsel_addr = IA32_PERFEVTSEL3_ADDR,
100 .pmc_addr = IA32_PMC3,
101 .config = makeEventSelect(0xC5, 0x00)},
// Open the MSR device for the given logical CPU; the hardware counters are
// not programmed until start() is called.
105 LightweightPerfCounters(uint32_t cpu)
106 : msr(std::make_unique<MsrHandle>(cpu)) {}
// Non-copyable (unique ownership of the MSR handle) but movable.
107 LightweightPerfCounters(
const LightweightPerfCounters &) =
delete;
108 LightweightPerfCounters &
109 operator=(
const LightweightPerfCounters &) =
delete;
110 LightweightPerfCounters(LightweightPerfCounters &&) noexcept = default;
111 LightweightPerfCounters &
112 operator=(LightweightPerfCounters &&) noexcept = default;
// RAII: make sure the counters are disabled when this handle goes away.
113 ~LightweightPerfCounters() { stop(); }
// --- body of the (scraped) start() method: program and enable counters ---
// Globally disable everything before reprogramming.
120 msr->write(pcm::IA32_CR_PERF_GLOBAL_CTRL, 0);
// Enable counting in both OS and user mode on fixed counters 0-2
// (instructions retired, core cycles, reference cycles — matching the
// INST_RETIRED / CPU_CLK_UNHALTED_* writes below).
129 FixedEventControlRegister fixed_ctrl_reg;
130 fixed_ctrl_reg.value = 0;
131 fixed_ctrl_reg.fields.os0 = 1;
132 fixed_ctrl_reg.fields.usr0 = 1;
133 fixed_ctrl_reg.fields.os1 = 1;
134 fixed_ctrl_reg.fields.usr1 = 1;
135 fixed_ctrl_reg.fields.os2 = 1;
136 fixed_ctrl_reg.fields.usr2 = 1;
// Probe whether the top-down slots MSR is readable on this CPU; if so,
// also enable fixed counter 3 (slots).
141 uint64 test_slots = 0;
142 if (msr->read(TOPDOWN_SLOTS_ADDR, &test_slots) ==
sizeof(uint64)) {
144 fixed_ctrl_reg.fields.os3 = 1;
145 fixed_ctrl_reg.fields.usr3 = 1;
// Zero the fixed counters, then commit the fixed-counter control register.
149 msr->write(INST_RETIRED_ADDR, 0);
150 msr->write(CPU_CLK_UNHALTED_THREAD_ADDR, 0);
151 msr->write(CPU_CLK_UNHALTED_REF_ADDR, 0);
152 msr->write(IA32_CR_FIXED_CTR_CTRL, fixed_ctrl_reg.value);
// Zero each programmable counter ...
155 for (
const auto &counter : counters)
156 msr->write(counter.pmc_addr, 0);
// ... then program its event select from the static table.
159 for (
const auto &counter : counters)
160 msr->write(counter.evtsel_addr, counter.config);
// Reset the top-down slot/metric MSRs so the first read starts from zero.
165 msr->write(TOPDOWN_SLOTS_ADDR, 0);
166 msr->write(PERF_METRICS_ADDR, 0);
// Enable mask: low 4 bits = the four programmable counters.
// NOTE(review): the scraped continuation of this expression presumably ORs
// in the fixed-counter enable bits — confirm against the full file.
173 uint64 global_ctrl = (uint64{0xF} << 0) |
// If the top-down MSRs are readable, also set bit 35 (fixed counter 3 /
// slots per Intel SDM) and bit 48 (PERF_METRICS) in the enable mask.
179 if (msr->read(TOPDOWN_SLOTS_ADDR, &test_val) ==
sizeof(uint64))
181 global_ctrl |= uint64{1} << 35;
184 if (msr->read(PERF_METRICS_ADDR, &test_val) ==
sizeof(uint64))
186 global_ctrl |= uint64{1} << 48;
// Clear any stale overflow status for the counters being enabled, then
// enable them all at once.
189 msr->write(IA32_PERF_GLOBAL_OVF_CTRL, global_ctrl);
191 msr->write(IA32_CR_PERF_GLOBAL_CTRL, global_ctrl);
// --- body of the (scraped) stop() method: disable and deprogram ---
// Globally disable first so nothing counts while deprogramming below.
197 msr->write(IA32_CR_PERF_GLOBAL_CTRL, 0);
// Clear every programmable event select, then the fixed-counter control.
199 for (
const auto &counter : counters)
200 msr->write(counter.evtsel_addr, 0);
201 msr->write(IA32_CR_FIXED_CTR_CTRL, 0);
// --- fields of the (scraped) Snapshot struct ---
// Absolute counter values as read from the MSRs at snapshot time.
// NOTE(review): abs_cycles, abs_l2_hits and delta_slots are also members
// (they are used by read()/accumulate()) but their lines are scraped out.
205 uint64 abs_instructions;
207 uint64 abs_ref_cycles;
208 uint64 abs_l2_misses;
210 uint64 abs_l3_misses;
211 uint64 abs_branch_misses;
// Top-down slot counts covering the interval since the previous read();
// per-read deltas, not absolute values (read() resets the MSRs each time).
215 uint64 delta_frontend_bound_slots;
216 uint64 delta_backend_bound_slots;
217 uint64 delta_bad_speculation_slots;
218 uint64 delta_retiring_slots;
219 uint64 delta_mem_bound_slots;
220 uint64 delta_fetch_lat_slots;
223 static void accumulate(ThreadPerfCounters &ctr,
const Snapshot &before,
224 const Snapshot &after) {
225 ctr.instructions += after.abs_instructions - before.abs_instructions;
226 ctr.cycles += after.abs_cycles - before.abs_cycles;
227 ctr.ref_cycles += after.abs_ref_cycles - before.abs_ref_cycles;
228 ctr.l2_misses += after.abs_l2_misses - before.abs_l2_misses;
229 ctr.l2_hits += after.abs_l2_hits - before.abs_l2_hits;
230 ctr.l3_misses += after.abs_l3_misses - before.abs_l3_misses;
231 ctr.branch_misses += after.abs_branch_misses - before.abs_branch_misses;
233 ctr.all_slots += after.delta_slots;
234 ctr.frontend_bound_slots += after.delta_frontend_bound_slots;
235 ctr.backend_bound_slots += after.delta_backend_bound_slots;
236 ctr.bad_speculation_slots += after.delta_bad_speculation_slots;
237 ctr.retiring_slots += after.delta_retiring_slots;
238 ctr.mem_bound_slots += after.delta_mem_bound_slots;
239 ctr.fetch_lat_slots += after.delta_fetch_lat_slots;
// --- body of the (scraped) read() method: snapshot all counters ---
// MSR reads target one specific core, so the calling thread must still be
// running on that core; detect migration and fail loudly.
244 auto current_cpu =
static_cast<int32_t
>(sched_getcpu());
245 if (current_cpu != msr->getCoreId())
// NOTE(review): stray space before the comma in this message — runtime
// string, so it cannot be corrected in a comment-only edit.
246 throw std::runtime_error(std::format(
247 "CPU migration detected: was on CPU {} , now on CPU {}",
248 msr->getCoreId(), current_cpu));
// Read the fixed counters and the four programmable counters into the
// snapshot `s` (its declaration line is scraped out of this chunk).
251 msr->read(INST_RETIRED_ADDR, &s.abs_instructions);
252 msr->read(CPU_CLK_UNHALTED_THREAD_ADDR, &s.abs_cycles);
253 msr->read(CPU_CLK_UNHALTED_REF_ADDR, &s.abs_ref_cycles);
254 msr->read(counters[0].pmc_addr, &s.abs_l2_misses);
255 msr->read(counters[1].pmc_addr, &s.abs_l2_hits);
256 msr->read(counters[2].pmc_addr, &s.abs_l3_misses);
257 msr->read(counters[3].pmc_addr, &s.abs_branch_misses);
// Top-down analysis: read PERF_METRICS and the slots counter together;
// both reads must succeed for the deltas to be meaningful.
265 uint64 perf_metrics = 0;
266 uint64 topdown_slots = 0;
268 if (msr->read(PERF_METRICS_ADDR, &perf_metrics) ==
sizeof(uint64) &&
269 msr->read(TOPDOWN_SLOTS_ADDR, &topdown_slots) ==
sizeof(uint64)) {
// Immediately reset both MSRs so each read() yields per-interval deltas.
272 msr->write(PERF_METRICS_ADDR, 0);
273 msr->write(TOPDOWN_SLOTS_ADDR, 0);
// PERF_METRICS packs one 8-bit fractional ratio per top-down category.
276 uint64 retiring_ratio = extract_bits(perf_metrics, 0, 7);
277 uint64 bad_spec_ratio = extract_bits(perf_metrics, 8, 15);
278 uint64 frontend_ratio = extract_bits(perf_metrics, 16, 23);
279 uint64 backend_ratio = extract_bits(perf_metrics, 24, 31);
// Level-2 sub-category ratios from the upper bytes.
282 uint64 mem_bound_ratio = extract_bits(perf_metrics, 56, 63);
283 uint64 fetch_lat_ratio = extract_bits(perf_metrics, 48, 55);
// Normalize by the sum of the four level-1 ratios (guarding against a
// zero sum) and apportion the slot count to each category.
287 const auto total_ratio = retiring_ratio + bad_spec_ratio +
288 frontend_ratio + backend_ratio;
289 const auto inv_total_ratio =
290 total_ratio > 0 ? 1.0 /
static_cast<double>(total_ratio) : 0.0;
// ratio/total * slots, rounded to the nearest whole slot.
291 auto delta_slots = [&](uint64 ratio) {
292 return static_cast<uint64_t
>(
293 std::round((
static_cast<double>(ratio) * inv_total_ratio) *
294 static_cast<double>(topdown_slots)));
296 s.delta_slots = topdown_slots;
297 s.delta_frontend_bound_slots = delta_slots(frontend_ratio);
298 s.delta_backend_bound_slots = delta_slots(backend_ratio);
299 s.delta_bad_speculation_slots = delta_slots(bad_spec_ratio);
300 s.delta_retiring_slots = delta_slots(retiring_ratio);
301 s.delta_mem_bound_slots = delta_slots(mem_bound_ratio);
302 s.delta_fetch_lat_slots = delta_slots(fetch_lat_ratio);
// Per-CPU slot: a mutex (its declaration line is scraped out — `mtx` is
// used by the callers below) plus lazily-constructed counters.
// alignas(128) gives each entry its own cache line(s), presumably to avoid
// false sharing between CPUs — confirm against the full file.
309struct alignas(128) CountersEntry {
312 std::optional<LightweightPerfCounters> counters;
// Singleton table with one entry per configured processor; constructed on
// first use via an immediately-invoked lambda.
315GUANAQO_EXPORT std::vector<CountersEntry> &get_all_perf_counters() {
316 static std::vector<CountersEntry> instances = [] {
317 long max_cpus = sysconf(_SC_NPROCESSORS_CONF);
318 return std::vector<CountersEntry>(
static_cast<size_t>(max_cpus));
// Return the per-CPU counter instance for `cpu` together with its held
// lock, constructing and starting the counters on demand.
323std::pair<std::unique_lock<std::mutex>, LightweightPerfCounters &>
324get_perf_counters(
int cpu) {
325 auto &inst = get_all_perf_counters()[
static_cast<size_t>(cpu)];
326 std::unique_lock lck{inst.mtx};
// NOTE(review): a line before emplace() is scraped out — presumably an
// `if (!inst.counters)` guard; without it every call would recreate and
// restart the counters. Confirm against the full file.
328 inst.counters.emplace(
static_cast<uint32_t
>(cpu)).start();
// Hand the still-held lock and a reference to the live instance back.
329 return {std::move(lck), *inst.counters};
// Destroy every per-CPU counter instance, taking each entry's lock first;
// destroying a LightweightPerfCounters stops its hardware counters
// (~LightweightPerfCounters calls stop()).
332void stop_all_perf_counters() {
333 for (
auto &entry : get_all_perf_counters()) {
334 std::lock_guard lck{entry.mtx};
335 entry.counters.reset();
// RAII helper: pins the calling thread to one CPU for its lifetime and
// restores the original affinity mask on destruction.
344struct ScopedThreadAffinity {
// Affinity mask saved at construction, restored by the destructor.
345 cpu_set_t original_set;
346 ScopedThreadAffinity(
int cpu) {
// Save the current mask, then pin to `cpu`. The lines building `set`
// (presumably CPU_ZERO/CPU_SET) are scraped out of this chunk.
// NOTE(review): the sched_getaffinity/sched_setaffinity return values are
// ignored — a failure to pin goes unnoticed.
348 sched_getaffinity(0,
sizeof(cpu_set_t), &original_set);
352 sched_setaffinity(0,
sizeof(cpu_set_t), &set);
// Non-copyable: the saved mask must be restored exactly once.
354 ScopedThreadAffinity(
const ScopedThreadAffinity &) =
delete;
355 ScopedThreadAffinity &operator=(
const ScopedThreadAffinity &) =
delete;
// Restore the affinity mask that was in effect at construction.
356 ~ScopedThreadAffinity() {
358 sched_setaffinity(0,
sizeof(cpu_set_t), &original_set);
// --- interior of the (scraped) PCMScopedCounters class ---
// Declared first so the thread is pinned before the counters are acquired
// and read (member construction order matters here).
363 ScopedThreadAffinity affinity;
// Lock + reference pair returned by get_perf_counters(); the member name
// line is scraped out (it is used as `cpu_counters` below).
364 std::pair<std::unique_lock<std::mutex>, LightweightPerfCounters &>
// Snapshot taken at construction; stop() diffs against it.
366 LightweightPerfCounters::Snapshot start;
// Pin to `cpu`, acquire that CPU's (locked) counters, then take the
// starting snapshot.
368 PCMScopedCounters(
int cpu)
369 : affinity(cpu), cpu_counters(get_perf_counters(cpu)) {
370 start = cpu_counters.second.read();
// Take the final snapshot and fold the interval into the per-thread
// accumulator `thread_counters` (declared outside this chunk).
373 ThreadPerfCounters &stop()
override {
374 auto after = cpu_counters.second.read();
376 LightweightPerfCounters::accumulate(thread_counters, start, after);
377 return thread_counters;
// NOTE(review): the lines below are fragments of three different scraped
// functions; each is kept verbatim.
// Process-wide enable flag — presumably the static behind
// get_counters_enabled_flag() (see the scraped API index at file end).
385 static std::atomic_bool enabled{
false};
// From disable_counters(): tear down every per-CPU counter instance.
391 stop_all_perf_counters();
// From the PCM-enabled start_counters(): scoped counters for this CPU.
400 return std::make_unique<detail::PCMScopedCounters>(get_cpu());
410std::unique_ptr<detail::ScopedCounters>
start_counters() {
return {}; }
Performance counter snapshots and scoped collectors.
void disable_counters()
Disables performance counters globally. Blocks until all active counters have stopped.
std::unique_ptr< detail::ScopedCounters > start_counters()
May return null if PCM is not available.
void enable_counters()
Enables performance counters globally.
ThreadPerfCounters & get_thread_perf_counters()
std::atomic_bool & get_counters_enabled_flag()