batmat/Doxygen/trsm_8tpp_source.html

#pragma once


#include <batmat/assume.hpp>

#include <batmat/linalg/micro-kernels/trsm.hpp>

#include <batmat/linalg/structure.hpp>

#include <batmat/linalg/uview.hpp>

#include <batmat/loop.hpp>

#include <batmat/lut.hpp>

#include <batmat/ops/rotate.hpp>


// Local shorthand: forwards its arguments unchanged to BATMAT_FULLY_UNROLLED_FOR. Used below for
// the loops whose trip counts are the compile-time register block sizes.
#define UNROLL_FOR(...) BATMAT_FULLY_UNROLLED_FOR (__VA_ARGS__)


namespace batmat::linalg::micro_kernels::trsm {


/// Lookup table of @ref trsm_copy_microkernel instantiations, built by make_2d_lut over
/// RowsReg<T, Abi> × ColsReg<T, Abi>. The generator maps the zero-based index pair (Row, Col) to
/// the micro-kernel instantiated for `Row + 1` rows and `Col + 1` columns, so callers index the
/// table with `[rows - 1][cols - 1]`.
template <class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
inline const constinit auto trsm_copy_lut = make_2d_lut<RowsReg<T, Abi>, ColsReg<T, Abi>>(
    []<index_t Row, index_t Col>(index_constant<Row>, index_constant<Col>) {
        // Table indices are zero-based, block sizes are one-based.
        constexpr index_t block_rows = Row + 1;
        constexpr index_t block_cols = Col + 1;
        return trsm_copy_microkernel<T, Abi, Conf, block_rows, block_cols, OA, OB, OD>;
    });


/// @param  A Lower or upper trapezoidal RowsReg×(k+RowsReg).

/// @param  B RowsReg×ColsReg.

/// @param  D (k+RowsReg)×ColsReg.

/// @param  k Number of columns in the non-triangular part of A.

template <class T, class Abi, KernelConfig Conf, index_t RowsReg, index_t ColsReg, StorageOrder OA,

          StorageOrder OB, StorageOrder OD>

[[gnu::hot, gnu::flatten]] void


trsm_copy_microkernel(const uview<const T, Abi, OA> A, const uview<const T, Abi, OB> B,

                      const uview<T, Abi, OD> D, const index_t k) noexcept {

    static_assert(Conf.struc_A == MatrixStructure::LowerTriangular ||

                  Conf.struc_A == MatrixStructure::UpperTriangular);

    constexpr bool lower = Conf.struc_A == MatrixStructure::LowerTriangular;

    static_assert(RowsReg > 0 && ColsReg > 0);

    using namespace ops;

    using simd = datapar::simd<T, Abi>;

    // Pre-compute the offsets of the columns/rows of B

    const auto B_cached = with_cached_access<RowsReg, ColsReg>(B);

    // Load accumulator into registers

    simd B_reg[RowsReg][ColsReg]; // NOLINT(*-c-arrays)

    UNROLL_FOR (index_t ii = 0; ii < RowsReg; ++ii)

        UNROLL_FOR (index_t jj = 0; jj < ColsReg; ++jj)

            B_reg[ii][jj] = rotl<Conf.rotate_B>(B_cached.load(ii, jj));

    // Matrix multiplication

    const auto D_cached = with_cached_access<0, ColsReg>(D);

    const index_t l0 = lower ? 0 : RowsReg, l1 = lower ? k : k + RowsReg;

    for (index_t l = l0; l < l1; ++l)

        UNROLL_FOR (index_t jj = 0; jj < ColsReg; ++jj) {

            simd Xlj = D_cached.load(l, jj);

            UNROLL_FOR (index_t ii = 0; ii < RowsReg; ++ii) {

                simd Ail  = A.load(ii, l);

                simd &Bij = B_reg[ii][jj];

                Bij -= Ail * Xlj;

            }

        }

    // Triangular solve

    if constexpr (lower) {

        UNROLL_FOR (index_t ii = 0; ii < RowsReg; ++ii) {

            simd Aii = simd{1} / A.load(ii, k + ii);

            UNROLL_FOR (index_t jj = 0; jj < ColsReg; ++jj) {

                simd &Xij = B_reg[ii][jj];

                UNROLL_FOR (index_t ll = 0; ll < ii; ++ll) {

                    simd Ail  = A.load(ii, k + ll);

                    simd &Xlj = B_reg[ll][jj];

                    Xij -= Ail * Xlj;

                }

                Xij *= Aii; // Diagonal already inverted

            }

        }

    } else {

        UNROLL_FOR (index_t ii = RowsReg; ii-- > 0;) {

            simd Aii = simd{1} / A.load(ii, ii);

            UNROLL_FOR (index_t jj = 0; jj < ColsReg; ++jj) {

                simd &Xij = B_reg[ii][jj];

                UNROLL_FOR (index_t ll = ii + 1; ll < RowsReg; ++ll) {

                    simd Ail  = A.load(ii, ll);

                    simd &Xlj = B_reg[ll][jj];

                    Xij -= Ail * Xlj;

                }

                Xij *= Aii; // Diagonal already inverted

            }

        }

    }

    // Store accumulator to memory again

    UNROLL_FOR (index_t ii = 0; ii < RowsReg; ++ii)

        UNROLL_FOR (index_t jj = 0; jj < ColsReg; ++jj)

            D_cached.store(B_reg[ii][jj], lower ? k + ii : ii, jj);

}


/// Triangular solve D = (A⁽ᵀ⁾)⁻¹ B⁽ᵀ⁾ using register blocking, where A is lower or upper
/// triangular as selected by `Conf.struc_A`.
/// Note: D = A⁻¹ B  <=>  Dᵀ = Bᵀ A⁻ᵀ
///
/// A may be trapezoidal (more columns than rows). With I = A.rows() and K = A.cols() >= I:
/// - Lower: the triangular part of A is its last I columns. The first K−I rows of D are only
///   read (as an already known part of the solution); the last I rows of D are written.
/// - Upper: the triangular part of A is its first I columns. The last K−I rows of D are only
///   read; the first I rows of D are written.
///
/// The work is split into blocks of at most RowsReg<T, Abi> × ColsReg<T, Abi> that are handed to
/// @ref trsm_copy_microkernel through @ref trsm_copy_lut.
///
/// @param  A I×K lower or upper trapezoidal matrix (K >= I > 0).
/// @param  B I×J right-hand side (J > 0).
/// @param  D K×J matrix receiving the solution (see above for which rows are read or written).
template <class T, class Abi, KernelConfig Conf, StorageOrder OA, StorageOrder OB, StorageOrder OD>
void trsm_copy_register(const view<const T, Abi, OA> A, const view<const T, Abi, OB> B,
                        const view<T, Abi, OD> D) noexcept {
    using enum MatrixStructure;
    static_assert(Conf.struc_A == LowerTriangular || Conf.struc_A == UpperTriangular);
    constexpr auto Rows = RowsReg<T, Abi>, Cols = ColsReg<T, Abi>;
    // Check dimensions
    const index_t I = A.rows(), K = A.cols(), J = B.cols();
    BATMAT_ASSUME(K >= I);
    BATMAT_ASSUME(B.rows() == I);
    BATMAT_ASSUME(D.rows() == K);
    BATMAT_ASSUME(D.cols() == J);
    BATMAT_ASSUME(I > 0);
    BATMAT_ASSUME(J > 0);
    BATMAT_ASSUME(K > 0);
    static const auto microkernel = trsm_copy_lut<T, Abi, Conf, OA, OB, OD>;
    // Sizeless views to partition and pass to the micro-kernels
    const uview<const T, Abi, OA> A_ = A;
    const uview<const T, Abi, OB> B_ = B;
    const uview<T, Abi, OD> D_       = D;

    // Optimization for very small matrices: the whole problem fits in one micro-kernel call.
    // The micro-kernel's k is the number of non-triangular columns of A, i.e. K - I. This is
    // what blk below computes for the single block (i = 0, ni = I) in both the lower and the
    // upper case, and it is zero when A is square.
    if (I <= Rows && J <= Cols)
        return microkernel[I - 1][J - 1](A_, B_, D_, K - I);

    // Function to compute a single block X(i,j)
    auto blk = [&] [[gnu::always_inline]] (index_t i, index_t ni, index_t j, index_t nj) {
        // The loops below visit the diagonal blocks of A with LoopDir::Backward, because we want
        // to process the remainder block first, as processing it last would have poor
        // matrix-matrix performance in the microkernel.
        if constexpr (Conf.struc_A == LowerTriangular) {
            // Forward substitution needs increasing row offsets, so mirror the backward index.
            i        = I - i - ni;        // iterate forward, smallest chunk first
            auto Ai0 = A_.middle_rows(i); // subdiagonal block row
            auto Bij = B_.block(i, j);    // rhs block to solve now
            auto X0j = D_.middle_cols(j); // solution up to i and solution block to fill in
            // k = columns of A to the left of this diagonal block (diagonal of row i is at
            // column i + K - I).
            microkernel[ni - 1][nj - 1](Ai0, Bij, X0j, i + K - I);
        } else {
            auto Ai0 = A_.block(i, i); // superdiagonal block row
            auto Bij = B_.block(i, j); // rhs block to solve now
            auto X0j = D_.block(i, j); // solution block to fill in and solution below it
            // k = columns of A to the right of this diagonal block.
            microkernel[ni - 1][nj - 1](Ai0, Bij, X0j, K - i - ni);
        }
    };
    // Pick the loop nesting based on the storage order of D: for column-major D the block
    // columns form the outer loop, otherwise the diagonal blocks of A do.
    if constexpr (OD == StorageOrder::ColMajor)
        foreach_chunked_merged( // Loop over block columns of B and D
            0, J, Cols,
            [&](index_t j, auto nj) {
                foreach_chunked_merged( // Loop over the diagonal blocks of A
                    0, I, Rows, [&](index_t i, auto ni) { blk(i, ni, j, nj); }, LoopDir::Backward);
            },
            LoopDir::Forward);
    else
        foreach_chunked_merged( // Loop over the diagonal blocks of A
            0, I, Rows,
            [&](index_t i, auto ni) {
                foreach_chunked_merged( // Loop over block columns of B and D
                    0, J, Cols, [&](index_t j, auto nj) { blk(i, ni, j, nj); }, LoopDir::Forward);
            },
            LoopDir::Backward);
}


} // namespace batmat::linalg::micro_kernels::trsm

assume.hpp

BATMAT_ASSUME
#define BATMAT_ASSUME(x)
Invokes undefined behavior if the expression x does not evaluate to true.
Definition assume.hpp:17

UNROLL_FOR
#define UNROLL_FOR(...)
Definition gemm-diag.tpp:10

guanaqo::StorageOrder
StorageOrder

batmat::linalg::MatrixStructure
MatrixStructure
Definition structure.hpp:8

batmat::linalg::MatrixStructure::LowerTriangular
@ LowerTriangular
Definition structure.hpp:8

batmat::linalg::MatrixStructure::UpperTriangular
@ UpperTriangular
Definition structure.hpp:8

batmat::foreach_chunked_merged
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chu...
Definition loop.hpp:43

batmat::make_2d_lut
consteval auto make_2d_lut(F f)
Returns a 2D array of the form:
Definition lut.hpp:25

batmat::LoopDir::Forward
@ Forward
Definition loop.hpp:13

batmat::LoopDir::Backward
@ Backward
Definition loop.hpp:14

loop.hpp

lut.hpp

trsm.hpp

batmat::datapar::simd
stdx::simd< Tp, Abi > simd
Definition simd.hpp:148

batmat::linalg::micro_kernels::trsm
Definition trsm.hpp:8

batmat::linalg::micro_kernels::trsm::trsm_copy_microkernel
void trsm_copy_microkernel(uview< const T, Abi, OA > A, uview< const T, Abi, OB > B, uview< T, Abi, OD > D, index_t k) noexcept
Definition trsm.tpp:28

batmat::linalg::micro_kernels::trsm::RowsReg
constexpr index_t RowsReg
Register block size of the matrix-matrix multiplication micro-kernels.
Definition avx-512.hpp:13

batmat::linalg::micro_kernels::trsm::trsm_copy_register
void trsm_copy_register(view< const T, Abi, OA > A, view< const T, Abi, OB > B, view< T, Abi, OD > D) noexcept
Triangular solve D = (A⁽ᵀ⁾)⁻¹ B⁽ᵀ⁾ where A⁽ᵀ⁾ is lower triangular.
Definition trsm.tpp:92

batmat::linalg::micro_kernels::trsm::ColsReg
constexpr index_t ColsReg
Definition trsm.hpp:27

batmat::linalg::micro_kernels::trsm::trsm_copy_lut
const constinit auto trsm_copy_lut
Definition trsm.tpp:16

batmat::linalg::with_cached_access
cached_uview< Order==StorageOrder::ColMajor ? Cols :Rows, T, Abi, Order > with_cached_access(const uview< T, Abi, Order > &o) noexcept
Definition uview.hpp:228

batmat::linalg::view
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
Definition uview.hpp:70

batmat::ops
Definition cneg.hpp:11

batmat::index_constant
std::integral_constant< index_t, I > index_constant
Definition lut.hpp:10

batmat::index_t
int index_t
Definition config.hpp:13

rotate.hpp

batmat::linalg::uview
Definition uview.hpp:80

batmat::linalg::uview::block
Self block(this const Self &self, index_t r, index_t c) noexcept
Definition uview.hpp:110

batmat::linalg::uview::middle_rows
Self middle_rows(this const Self &self, index_t r) noexcept
Definition uview.hpp:114

batmat::linalg::uview::middle_cols
Self middle_cols(this const Self &self, index_t c) noexcept
Definition uview.hpp:118

structure.hpp

uview.hpp