batmat main
Batched linear algebra routines
Loading...
Searching...
No Matches
neon.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <batmat/config.hpp>
4#include <batmat/simd.hpp>
5
6namespace batmat {
7namespace linalg::micro_kernels::gemm {
8
9/// Register block size of the matrix-matrix multiplication micro-kernels.
10/// NEON has 32 vector registers; we use 16 of them for a 4×4 accumulator
11/// block of matrix C, leaving plenty of registers for loading A and B.
12/// @todo On the Raspberry Pi 3B+ (Cortex A53) I used for testing, a 5×5 accumulator
13/// was >6% slower for 15×15 matrix-matrix multiplication, and >5% slower for
14/// 20×20 matrices.
15/// My conjecture is that since pre-loading the elements of A and B requires
16/// RowsReg+ColsReg registers, the total number of registers required is then
17/// 25+5+5 = 35 for the 5×5 case, and the compiler avoids spilling the three extra
18/// registers by interleaving the loads of A and B with FMA instructions, which
19/// is suboptimal because of the higher instruction latencies.
20template <class T, class Abi>
21inline constexpr index_t RowsReg = 4;
22// Vectors larger than the physical vector length (more than 16 bytes) use more registers, so decrease the block size to 3.
23template <class T, class Abi>
24 requires(datapar::simd_size<T, Abi>::value * sizeof(T) > 16)
25inline constexpr index_t RowsReg<T, Abi> = 3;
26
27} // namespace linalg::micro_kernels::gemm
28namespace ops {
29
30template <class T>
31inline constexpr index_t RowsRegTranspose = 4; ///< Presumably the row count of the register block used for transposes (inferred from the name) — TODO confirm.
32template <class T>
33inline constexpr index_t ColsRegTranspose = 4; ///< Presumably the column count of the register block used for transposes (inferred from the name) — TODO confirm.
34
35} // namespace ops
36} // namespace batmat
std::ptrdiff_t index_t
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition avx-512.hpp:13
constexpr index_t RowsRegTranspose
Definition avx-512.hpp:23
constexpr index_t ColsRegTranspose
Definition avx-512.hpp:25