batmat
0.0.14
Batched linear algebra routines
Loading...
Searching...
No Matches
avx2.hpp
Go to the documentation of this file.
1
#pragma once
2
3
#include <batmat/config.hpp>
4
#include <
batmat/simd.hpp
>
5
6
namespace
batmat
{
7
namespace
linalg::micro_kernels::gemm
{
8
9
/// Register block size of the matrix-matrix multiplication micro-kernels.
10
/// AVX2 has 16 vector registers, we use 9 registers for a 3×3 accumulator
11
/// block of matrix C (leaving some registers for loading A and B):
12
/// @note A block size of 4×4 is slightly faster than 3×3 for large matrices, because the even
13
/// block size results in full cache lines being consumed. For small matrices, 3×3 is faster
14
/// because it does not spill any registers in the micro-kernels. 2×2 is slower than 3×3 for
15
/// both small and large matrices (tested using GCC 15.1 on an i7-10750H).
16
template
<
class
T,
class
Abi>
17
inline
constexpr
index_t
RowsReg
= 3;
18
// Vectors greater than the physical vector length use more registers, so decrease the block size.
19
template
<
class
T,
class
Abi>
20
requires
(datapar::simd_size<T, Abi>::value *
sizeof
(T) > 32)
21
inline
constexpr
index_t
RowsReg<T, Abi>
= 2;
22
23
}
// namespace linalg::micro_kernels::gemm
24
namespace
ops
{
25
26
template
<
class
T>
27
inline
constexpr
index_t
RowsRegTranspose
= 4;
28
template
<
class
T>
29
inline
constexpr
index_t
ColsRegTranspose
= 4;
30
31
}
// namespace ops
32
}
// namespace batmat
guanaqo::linalg::index_t
std::ptrdiff_t index_t
batmat::linalg::micro_kernels::gemm
Definition
gemm.hpp:9
batmat::linalg::micro_kernels::gemm::RowsReg
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition
avx-512.hpp:13
batmat::ops
Definition
cneg.hpp:11
batmat::ops::RowsRegTranspose
constexpr index_t RowsRegTranspose
Definition
avx-512.hpp:23
batmat::ops::ColsRegTranspose
constexpr index_t ColsRegTranspose
Definition
avx-512.hpp:25
batmat
Definition
kib.hpp:5
simd.hpp
batmat
include
batmat
platform
avx2.hpp
Generated on
for batmat by
1.16.1