80 Axj[i] = D.load(i, j);
86 bb[i] += D.load(q, i) * Axj[q];
88 bb[q] += D.load(q, p) * Axj[p];
91 for (
index_t q = max(R + 1, j + 2); q < k; ++q) {
94 Aqx[i] = D.load(q, i);
96 bb[i] += Aqx[i] * Aqx[j];
99 Yl += Aqx[p] * Axj[p];
102 const simd a21 = Axj[j + 1];
114 const simd c̃j = copysign(sqrt(bb[j]), a21), β = a21 + c̃j;
115 const simd inv_τ = β / c̃j, inv_β = simd{1} / β;
123 W.store(bb[i] * inv_β + D.load(j + 1, i), i, j);
124 W.store(inv_τ, j, j);
127 for (
index_t i = max(R + 1, j + 2); i < k; ++i) {
128 simd yi = Y.load(i, j);
129 const simd xi = D.load(i, j);
130 yi += D.load(i, i) * xi;
131 for (
index_t l = max(R + 1, j + 2); l < i; ++l) {
132 simd yl = Y.load(l, j);
133 const simd xl = D.load(l, j), ail = D.load(i, l);
145 Axj1[i] = D.load(i, j + 1);
147 Y.store(bb[i] = inv_τ * (inv_β * bb[i] + Axj1[i]), i, j);
148 for (
index_t i = max(R + 1, j + 2); i < k; ++i) {
149 simd yi = Y.load(i, j);
150 Y.store(inv_τ * (inv_β * yi + D.load(i, j + 1)), i, j);
154 const simd a2 = Axj1[j + 1];
155 const simd a31ᵀa32 = bb[j + 1];
156 const simd ω = inv_τ * (inv_β * a31ᵀa32 + a2);
161 b[l] = inv_β * Axj[l];
163 wᵀb_ω += b[l] * bb[l];
165 for (
index_t l = max(R + 1, j + 2); l < k; ++l) {
166 simd bl = inv_β * D.load(l, j);
168 wᵀb_ω += bl * Y.load(l, j);
170 const simd γ = inv_τ * wᵀb_ω;
171 const simd d2 = a2 - 2 * ω + γ;
172 D.store(-c̃j, j + 1, j);
173 D.store(d2, j + 1, j + 1);
176 const simd γ_ω = γ - ω;
178 simd ã32 = Axj1[l] + γ_ω * b[l] - bb[l];
179 D.store(ã32, l, j + 1);
180 simd yl = bb[l] - simd{0.5} * γ * b[l];
183 for (
index_t l = max(R + 1, j + 2); l < k; ++l) {
184 simd bl = D.load(l, j);
185 simd ã32 = D.load(l, j + 1) + γ_ω * bl - Y.load(l, j);
186 D.store(ã32, l, j + 1);
187 simd yl = Y.load(l, j) - simd{0.5} * γ * bl;
193 for (
index_t i = j + 2; i < k; ++i)
194 for (
index_t l = i; l < k; ++l) {
195 simd Ail = D.load(l, i);
196 Ail -= Y.load(i, j) * D.load(l, j) + Y.load(l, j) * D.load(i, j);
210 (W.cols() == 1 && W.rows() == std::max<index_t>(D.cols(), 1) - 1) ||
215 alignas(W_t::alignment()) T W_sto[W_t::size()];
225 auto Wj = store_full_W ? W_t{W_.
middle_cols(j / R).data} : W_t{W_sto};
226 auto Djj = D_.
block(j, j);
228 if (!store_full_W && W.rows() > 0) [[unlikely]]
229 for (
index_t l = 0; l < rem_j; ++l)
230 W_.
store(Wj.load(l, l), j + l, 0);
void foreach_chunked_merged(index_t i_begin, index_t i_end, auto chunk_size, auto func_chunk, LoopDir dir=LoopDir::Forward)
Iterate over the range [i_begin, i_end) in chunks of size chunk_size, calling func_chunk for each chu...
void sytrd_diag_microkernel(index_t k, triangular_accessor< T, Abi, SizeR< T, Abi > > W, uview< T, Abi, OD > D, uview< T, Abi, StorageOrder::ColMajor > Y) noexcept