batmat
0.0.17
Batched linear algebra routines
Loading...
Searching...
No Matches
neon.hpp
Go to the documentation of this file.
1
#pragma once
2
3
#include <batmat/config.hpp>
4
#include <
batmat/simd.hpp
>
5
6
namespace
batmat
{
7
namespace
linalg::micro_kernels::gemm
{
8
9
/// Register block size of the matrix-matrix multiplication micro-kernels.
10
/// NEON has 32 vector registers, we use 16 registers for a 4×4 accumulator
11
/// block of matrix C (leaving plenty of registers for loading A and B):
12
/// @todo On the Raspberry Pi 3B+ (Cortex A53) I used for testing, a 5×5 accumulator
13
/// was >6% slower for 15×15 matrix-matrix multiplication, and >5% slower for
14
/// 20×20 matrices.
15
/// My conjecture is that since pre-loading the elements of A and B requires
16
/// RowsReg+ColsReg registers, the total number of registers required is then 35
17
/// for the 5×5 case, and the compiler prevents spilling those three extra
18
/// registers by interleaving the loads of A and B with FMA instructions, and
19
/// this is suboptimal because of the higher instruction latencies.
20
template
<
class
T,
class
Abi>
21
inline
constexpr
index_t
RowsReg
= 4;
22
// Vectors greater than the physical vector length use more registers, so decrease the block size.
23
template
<
class
T,
class
Abi>
24
requires
(datapar::simd_size<T, Abi>::value *
sizeof
(T) > 16)
25
inline
constexpr
index_t
RowsReg<T, Abi>
= 3;
26
27
}
// namespace linalg::micro_kernels::gemm
28
namespace
ops
{
29
30
template
<
class
T>
31
inline
constexpr
index_t
RowsRegTranspose
= 4;
32
template
<
class
T>
33
inline
constexpr
index_t
ColsRegTranspose
= 4;
34
35
}
// namespace ops
36
}
// namespace batmat
guanaqo::linalg::index_t
std::ptrdiff_t index_t
batmat::linalg::micro_kernels::gemm
Definition
gemm.hpp:10
batmat::linalg::micro_kernels::gemm::RowsReg
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition
avx-512.hpp:13
batmat::ops
Definition
cneg.hpp:11
batmat::ops::RowsRegTranspose
constexpr index_t RowsRegTranspose
Definition
avx-512.hpp:23
batmat::ops::ColsRegTranspose
constexpr index_t ColsRegTranspose
Definition
avx-512.hpp:25
batmat
Definition
dtypes.hpp:9
simd.hpp
batmat
include
batmat
platform
neon.hpp
Generated on
for batmat by
1.16.1