12#include <guanaqo/trace.hpp>
18template <
class T,
class Abi, micro_kernels::hyhound::KernelConfig Conf,
StorageOrder OL,
21 const index_t k = A.cols();
25 [[maybe_unused]]
const index_t flop_count = total(
flops::hyh(L.rows(), L.cols(), k));
27 if (k == 0) [[unlikely]]
36 const index_t k = A.cols();
40 BATMAT_ASSERT(std::make_pair(W.rows(), W.cols()) == (hyhound_W_size<T, Abi>)(L));
41 [[maybe_unused]]
const index_t flop_count = total(
flops::hyh(L.rows(), L.cols(), k));
43 if (k == 0) [[unlikely]]
45 return hyhound_diag_register<T, Abi, Conf>(L, A, D, W);
52 index_t kA_in_offset = 0) {
54 const index_t k = Aout.cols();
62 BATMAT_ASSERT(std::make_pair(W.rows(), W.cols()) == (hyhound_W_size<T, Abi>)(L));
65 [[maybe_unused]]
const index_t flop_count = total(
flops::hyh_apply(L.rows(), L.cols(), k));
67 if (k == 0) [[unlikely]]
69 return hyhound_diag_apply_register<T, Abi, Conf>(L, Ain, Aout, B, D, W, kA_in_offset);
76 const index_t k = A1.cols(), m = L11.rows() + L21.rows();
82 [[maybe_unused]]
const index_t flop_count = total(
flops::hyh(m, L11.cols(), k));
84 if (k == 0) [[unlikely]]
96 const index_t k = A1.cols(), m = L11.rows() + L21.rows() + L31.rows();
107 [[maybe_unused]]
const index_t flop_count = total(
flops::hyh(m, L11.cols(), k));
109 if (k == 0) [[unlikely]]
112 L11, A1, L21, A22, A2_out, L31, A31, A3_out, D);
120 const index_t k = A1.cols(), m = L11.rows() + L21.rows() + Lu1.rows();
133 [[maybe_unused]] index_t flop_count = total(
flops::hyh(m, L11.cols(), k));
135 if (k == 0) [[unlikely]]
138 L11, A1, L21, A2, A2_out, Lu1, Au_out, D, shift_A_out);
150template <MatrixStructure SL, simdifiable VL, simdifiable VA, simdifiable Vd>
159template <MatrixStructure SL, simdifiable VL, simdifiable VA, simdifiable Vd, simdifiable VW>
169template <MatrixStructure SL, simdifiable VL>
185template <simdifiable VL, simdifiable VA, simdifiable VD, simdifiable VB, simdifiable Vd,
191 simdify(W).as_const(), kA_in_offset);
200template <simdifiable VL, simdifiable VA, simdifiable VB, simdifiable Vd, simdifiable VW>
210template <MatrixStructure SL, simdifiable VL, simdifiable VA, simdifiable Vd>
223template <
MatrixStructure SL, simdifiable VL1, simdifiable VA1, simdifiable VL2, simdifiable VA2,
237template <
MatrixStructure SL, simdifiable VL11, simdifiable VA1, simdifiable VL21, simdifiable VA2,
238 simdifiable VA2o, simdifiable VU, simdifiable VA3, simdifiable VA3o, simdifiable Vd>
241 VU &&L31, VA3 &&A31, VA3o &&A3_out, Vd &&d) {
255template <
MatrixStructure SL, simdifiable VL11, simdifiable VA1, simdifiable VL21, simdifiable VA2,
256 simdifiable VA2o, simdifiable VLu1, simdifiable VAuo, simdifiable Vd>
259 VLu1 &&Lu1, VAuo &&Au_out, Vd &&d,
bool shift_A_out =
false) {
constexpr FlopCount hyh(index_t nr, index_t nc, index_t k)
Hyperbolic Householder factorization update with L nr×nc and A nr×k.
constexpr FlopCount hyh_apply(index_t nr, index_t nc, index_t k)
Hyperbolic Householder factorization application to L2 nr×nc and A2 nr×k.
void hyhound_diag_apply(VL &&L, VA &&A, VD &&D, VB &&B, Vd &&d, VW &&W, index_t kA_in_offset=0)
Apply Householder transformation generated by hyhound_diag, computing (L̃, D) = (L,...
void hyhound_diag_riccati(Structured< VL11, SL > L11, VA1 &&A1, VL21 &&L21, VA2 &&A2, VA2o &&A2_out, VLu1 &&Lu1, VAuo &&Au_out, Vd &&d, bool shift_A_out=false)
Update structured Cholesky factor L using structured low-rank term A diag(d) Aᵀ.
void hyhound_sign(Structured< VL, SL > L, VA &&A, Vd &&d)
Update Cholesky factor L using low-rank term A diag(copysign(1, d)) Aᵀ, where d contains only ±0 values.
auto hyhound_size_W(Structured< VL, SL > L)
Get the size of the storage for the matrix W returned by hyhound_diag(Structured<VL,...
void hyhound_diag_2(Structured< VL1, SL > L1, VA1 &&A1, VL2 &&L2, VA2 &&A2, Vd &&d)
Update Cholesky factor L using low-rank term A diag(d) Aᵀ, where L and A are stored as two separate block rows.
void hyhound_diag(Structured< VL, SL > L, VA &&A, Vd &&d)
Update Cholesky factor L using low-rank term A diag(d) Aᵀ.
void hyhound_diag_cyclic(Structured< VL11, SL > L11, VA1 &&A1, VL21 &&L21, VA2 &&A22, VA2o &&A2_out, VU &&L31, VA3 &&A31, VA3o &&A3_out, Vd &&d)
Update structured Cholesky factor L using structured low-rank term A diag(d) Aᵀ.
#define GUANAQO_TRACE_LINALG(name, gflops)
void hyhound_diag_apply(view< T, Abi, OL > L, view< const T, Abi, OA > Ain, view< T, Abi, OA > Aout, view< const T, Abi, OA > B, view< const T, Abi > D, view< const T, Abi > W, index_t kA_in_offset=0)
void hyhound_diag_riccati(view< T, Abi, OL > L11, view< T, Abi, OA > A1, view< T, Abi, OL > L21, view< const T, Abi, OA > A2, view< T, Abi, OA > A2_out, view< T, Abi, OLu > Lu1, view< T, Abi, OAu > Au_out, view< const T, Abi > D, bool shift_A_out)
void hyhound_diag(view< T, Abi, OL > L, view< T, Abi, OA > A, view< const T, Abi > D)
void hyhound_diag_cyclic(view< T, Abi, OL > L11, view< T, Abi, OW > A1, view< T, Abi, OY > L21, view< const T, Abi, OW > A22, view< T, Abi, OW > A2_out, view< T, Abi, OU > L31, view< const T, Abi, OW > A31, view< T, Abi, OW > A3_out, view< const T, Abi > D)
void hyhound_diag_2(view< T, Abi, OL1 > L11, view< T, Abi, OA1 > A1, view< T, Abi, OL2 > L21, view< T, Abi, OA2 > A2, view< const T, Abi > D)
void hyhound_diag_cyclic_register(view< T, Abi, OL > L11, view< T, Abi, OW > A1, view< T, Abi, OY > L21, view< const T, Abi, OW > A22, view< T, Abi, OW > A2_out, view< T, Abi, OU > L31, view< const T, Abi, OW > A31, view< T, Abi, OW > A3_out, view< const T, Abi > D) noexcept
Performs a factorization update of the following matrix:
void hyhound_diag_register(view< T, Abi, OL > L, view< T, Abi, OA > A, view< const T, Abi > D) noexcept
Block hyperbolic Householder factorization update using register blocking.
void hyhound_diag_riccati_register(view< T, Abi, OL > L11, view< T, Abi, OA > A1, view< T, Abi, OL > L21, view< const T, Abi, OA > A2, view< T, Abi, OA > A2_out, view< T, Abi, OLu > Lu1, view< T, Abi, OAu > Au_out, view< const T, Abi > D, bool shift_A_out) noexcept
Performs a factorization update of the following matrix:
constexpr std::pair< index_t, index_t > hyhound_W_size(view< T, Abi, OL > L)
void hyhound_diag_2_register(view< T, Abi, OL1 > L11, view< T, Abi, OA1 > A1, view< T, Abi, OL2 > L21, view< T, Abi, OA2 > A2, view< const T, Abi > D) noexcept
Same as hyhound_diag_register but for two block rows at once.
typename detail::simdified_abi< V >::type simdified_abi_t
constexpr bool simdify_compatible
constexpr auto simdify(simdifiable auto &&a) -> simdified_view_t< decltype(a)>
simd_view_types< std::remove_const_t< T >, Abi >::template view< T, Order > view
Aligned allocation for matrix storage.
Light-weight wrapper class used for overload resolution of triangular and symmetric matrices.