batmat develop
Batched linear algebra routines
Loading...
Searching...
No Matches
avx2.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <batmat/config.hpp>
4#include <batmat/simd.hpp>
5
6namespace batmat {
8
9/// Register block size of the matrix-matrix multiplication micro-kernels.
10/// AVX2 has 16 vector registers; we use 9 of them for a 3×3 accumulator
11/// block of matrix C, leaving the remaining registers for loading A and B.
12/// @note A block size of 4×4 is slightly faster than 3×3 for large matrices, because the even
13/// block size results in full cache lines being consumed. For small matrices, 3×3 is faster
14/// because it does not spill any registers in the micro-kernels. 2×2 is slower than 3×3 for
15/// both small and large matrices (tested using GCC 15.1 on an i7-10750H).
16template <class T, class Abi>
17inline constexpr index_t RowsReg = 3;
18// Vectors wider than 32 bytes (the physical AVX2 vector length) use more registers, so decrease the block size to 2.
19template <class T, class Abi>
20 requires(datapar::simd_size<T, Abi>::value * sizeof(T) > 32)
21inline constexpr index_t RowsReg<T, Abi> = 2;
22
23} // namespace linalg::micro_kernels::gemm
24namespace ops {
25
26template <class T>
27inline constexpr index_t RowsRegTranspose = 4; // Same value (4) for every T in this header. NOTE(review): presumably the row count of the register block used by the transpose kernels — confirm against its users.
28template <class T>
29inline constexpr index_t ColsRegTranspose = 4; // Same value (4) for every T in this header. NOTE(review): presumably the column count of that same register block — confirm against its users.
30
31} // namespace ops
32} // namespace batmat
std::ptrdiff_t index_t
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition avx-512.hpp:13
constexpr index_t RowsRegTranspose
Definition avx-512.hpp:23
constexpr index_t ColsRegTranspose
Definition avx-512.hpp:25