batmat 0.0.15
Batched linear algebra routines
Loading...
Searching...
No Matches
rsqrt.hpp
Go to the documentation of this file.
1#pragma once
2
3#include <batmat/simd.hpp>
4
5namespace batmat::ops {
6
7/// @addtogroup topic-low-level-ops
8/// @{
9
10/// @name Inverse square root
11/// @{
12
13/// Inverse square root.
14template <std::floating_point T>
15T rsqrt(T x) {
16 using std::sqrt;
17 return 1 / sqrt(x);
18}
19
20/// Inverse square root. May be implemented using an `rsqrt` instruction followed by Newton
21/// iterations for better performance, depending on the SIMD ABI. This allows it to be performed in
22/// parallel with a normal square root instruction, enabling better performance of the Cholesky
23/// micro-kernels.
24template <class T, class Abi>
28
29/// @}
30
31/// @}
32
33namespace detail {
34
35#if __AVX512F__
36/// Approximation of inverse square root up to a relative error of 2⁻¹⁴.
37inline auto rsqrt_0(datapar::deduced_simd<double, 8> x) {
38 return decltype(x){_mm512_rsqrt14_pd(static_cast<__m512d>(x))};
39}
40inline auto rsqrt_0(datapar::deduced_simd<float, 16> x) {
41 return decltype(x){_mm512_rsqrt14_ps(static_cast<__m512>(x))};
42}
43inline auto rsqrt_0(datapar::deduced_simd<double, 4> x) {
44 return decltype(x){_mm256_rsqrt14_pd(static_cast<__m256d>(x))};
45}
46inline auto rsqrt_0(datapar::deduced_simd<float, 8> x) {
47 return decltype(x){_mm256_rsqrt14_ps(static_cast<__m256>(x))};
48}
49inline auto rsqrt_0(datapar::deduced_simd<double, 2> x) {
50 return decltype(x){_mm_rsqrt14_pd(static_cast<__m128d>(x))};
51}
52inline auto rsqrt_0(datapar::deduced_simd<float, 4> x) {
53 return decltype(x){_mm_rsqrt14_ps(static_cast<__m128>(x))};
54}
55#elif __AVX2__
56/// Approximation of inverse square root up to a relative error of 1.5×2⁻¹².
57inline auto rsqrt_0(datapar::deduced_simd<float, 8> x) {
58 return decltype(x){_mm256_rsqrt_ps(static_cast<__m256>(x))};
59}
60#endif
61
62/// rsqrt_0 with a single Newton iteration of refinement.
63template <class T, class Abi>
65 auto y = rsqrt_0(x);
66 const datapar::simd<T, Abi> half{T(0.5)}, three_halves{T(1.5)};
67 return y * (three_halves - (half * x * y * y));
68}
69
70/// rsqrt_0 with two Newton iterations of refinement.
71template <class T, class Abi>
73 auto y = rsqrt_1(x);
74 const datapar::simd<T, Abi> half{T(0.5)}, three_halves{T(1.5)};
75 return y * (three_halves - (half * x * y * y));
76}
77} // namespace detail
78
79#if __AVX512F__
80template <>
82 return detail::rsqrt_2(x);
83}
84template <>
86 return detail::rsqrt_2(x);
87}
88template <>
90 return detail::rsqrt_2(x);
91}
92template <>
94 return detail::rsqrt_1(x);
95}
96template <>
98 return detail::rsqrt_1(x);
99}
100template <>
102 return detail::rsqrt_1(x);
103}
104#elif __AVX2__
105template <>
107 return detail::rsqrt_1(x); // TODO: one or two Newton iterations?
108}
109#endif
110
111#if BATMAT_SCALAR_APPROX_INV_SQRT // Not worth it
112namespace detail {
113
114inline float rsqrt_0(float x) {
115 __m128 input = _mm_set_ss(x);
116 __m128 result = _mm_rsqrt14_ss(input, input);
117 return _mm_cvtss_f32(result);
118}
119
120inline double rsqrt_0(double x) {
121 __m128d input = _mm_set_sd(x);
122 __m128d result = _mm_rsqrt14_sd(input, input);
123 return _mm_cvtsd_f64(result);
124}
125
126/// rsqrt_0 with a single Newton iteration of refinement.
127template <std::floating_point T>
128T rsqrt_1(T x) {
129 auto y = rsqrt_0(x);
130 return y * (T(1.5) - (T(0.5) * x * y * y));
131}
132
133/// rsqrt_0 with two Newton iterations of refinement.
134template <std::floating_point T>
135T rsqrt_2(T x) {
136 auto y = rsqrt_1(x);
137 return y * (T(1.5) - (T(0.5) * x * y * y));
138}
139
140} // namespace detail
141
142inline double rsqrt(double x) { return detail::rsqrt_2(x); }
143inline float rsqrt(float x) { return detail::rsqrt_1(x); }
144
145#endif
146
147} // namespace batmat::ops
T rsqrt(T x)
Inverse square root.
Definition rsqrt.hpp:15
simd< Tp, deduced_abi< Tp, Np > > deduced_simd
Definition simd.hpp:103
stdx::simd< Tp, Abi > simd
Definition simd.hpp:99
datapar::simd< T, Abi > rsqrt_1(datapar::simd< T, Abi > x)
rsqrt_0 with a single Newton iteration of refinement.
Definition rsqrt.hpp:64
datapar::simd< T, Abi > rsqrt_2(datapar::simd< T, Abi > x)
rsqrt_0 with two Newton iterations of refinement.
Definition rsqrt.hpp:72