develop/Doxygen/avx2_8hpp_source.html

#pragma once


#include <batmat/config.hpp>

#include <batmat/simd.hpp>


namespace batmat {

namespace linalg::micro_kernels::gemm {


/// Register block size of the matrix-matrix multiplication micro-kernels.
///
/// AVX2 provides 16 vector registers. Nine of them hold a 3×3 accumulator block of matrix C,
/// which leaves the remaining registers available for loading A and B.
///
/// @note   For large matrices, a 4×4 block is slightly faster than 3×3, because the even block
///         size results in full cache lines being consumed. For small matrices, 3×3 is faster
///         because the micro-kernels do not spill any registers. 2×2 is slower than 3×3 for both
///         small and large matrices (tested using GCC 15.1 on an i7-10750H).
template <typename T, typename Abi>
inline constexpr index_t RowsReg = 3;

// A SIMD vector whose total size (element count × sizeof(T)) exceeds 32 bytes is wider than the
// physical vector length and therefore uses more registers, so the block size is reduced to 2.
template <typename T, typename Abi>
    requires(datapar::simd_size<T, Abi>::value * sizeof(T) > 32)
inline constexpr index_t RowsReg<T, Abi> = 2;


} // namespace linalg::micro_kernels::gemm

namespace ops {


/// Row count of the register block used for transposition; 4 for every element type @p T.
/// NOTE(review): meaning inferred from the name, the code consuming this constant is not visible
/// in this header — confirm against the transpose kernels.
template <typename T>
inline constexpr index_t RowsRegTranspose = 4;

/// Column count of the register block used for transposition; 4 for every element type @p T.
/// NOTE(review): meaning inferred from the name, the code consuming this constant is not visible
/// in this header — confirm against the transpose kernels.
template <typename T>
inline constexpr index_t ColsRegTranspose = 4;


} // namespace ops

} // namespace batmat

config.hpp

batmat::linalg::micro_kernels::gemm
Definition gemm.hpp:10

batmat::linalg::micro_kernels::gemm::RowsReg
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition avx-512.hpp:13

batmat::ops
Definition cneg.hpp:11

batmat::ops::RowsRegTranspose
constexpr index_t RowsRegTranspose
Definition avx-512.hpp:23

batmat::ops::ColsRegTranspose
constexpr index_t ColsRegTranspose
Definition avx-512.hpp:25

batmat
Definition dtypes.hpp:9

batmat::index_t
int index_t
Definition config.hpp:13

simd.hpp