ARM NEON Compositor  master
Fast SIMD alpha overlay and blending for ARM
All Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Modules Pages
rescale.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 
5 #include "SIMD.h"
6 
9 
10 #if ENABLE_NEON
11 
16 inline uint8x8_t div256_floor(uint16x8_t x) { return vshrn_n_u16(x, 8); }
17 
22 inline uint8x8_t div256_round(uint16x8_t x) {
23  return vrshrn_n_u16(x, 8);
24 }
25 
26 /*
27  * There are some significant differences in how the vector registers are laid
28  * out in ARMv8 AArch64. There are also some new NEON instructions that aren't
29  * available in AArch32 mode.
30  */
31 #ifdef __ARM_ARCH_ISA_A64 // ARMv8 A64
32 
37 inline uint8x8_t div255_floor(uint16x8_t x) {
38  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
39  // 256×256×128/0x8081 ≃ 255
40  uint32x4_t h = vmull_high_n_u16(x, 0x8081);
41  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
42  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
43  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
44  // Divide by 0x80 and narrow from 16 bits to 8 bits
45  return vshrn_n_u16(x, 7);
46 }
47 
57 inline uint8x8_t div255_round_approx(uint16x8_t x) {
58  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
59  // 256×256×128/0x8081 ≃ 255
60  uint32x4_t h = vmull_high_n_u16(x, 0x8081);
61  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
62  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
63  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
64  // Divide by 0x80 and narrow from 16 bits to 8 bits (with rounding)
65  return vrshrn_n_u16(x, 7);
66 }
67 
76 inline uint8x8_t div255_round(uint16x8_t x) {
77  // Add the rounding constant
78  x = vaddq_u16(x, vdupq_n_u16(1 << 7));
79  // Multiply by 0x8080 as 32-bit integers (high and low elements separately)
80  uint32x4_t h = vmull_high_n_u16(x, 0x8080);
81  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
82  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
83  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
84  // Divide by 0x80 and narrow from 16 bits to 8 bits
85  return vshrn_n_u16(x, 7);
86 }
87 
88 #else // ARMv8 A32 or ARMv7 NEON
89 
94 inline uint8x8_t div255_floor(uint16x8_t x) {
95  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
96  // 256×256×128/0x8081 ≃ 255
97  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
98  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
99  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
100  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
101  // Divide by 0x80 and narrow from 16 bits to 8 bits
102  return vshrn_n_u16(x, 7);
103 }
104 
114 inline uint8x8_t div255_round_approx(uint16x8_t x) {
115  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
116  // 256×256×128/0x8081 ≃ 255
117  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
118  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
119  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
120  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
121  // Divide by 0x80 and narrow from 16 bits to 8 bits
122  return vrshrn_n_u16(x, 7);
123 }
124 
133 inline uint8x8_t div255_round(uint16x8_t x) {
134  // Add the rounding constant
135  x = vaddq_u16(x, vdupq_n_u16(1 << 7));
136  // Multiply by 0x8080 as 32-bit integers (high and low elements separately)
137  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8080);
138  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
139  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
140  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
141  // Divide by 0x80 and narrow from 16 bits to 8 bits
142  return vshrn_n_u16(x, 7);
143 }
144 
145 #endif
146 
147 #endif // NEON
148 
/// Scalar flooring division by 256, used as a cheap stand-in for a division
/// by 255.
///
/// @param  x   Unsigned 16-bit value.
/// @return The upper byte of @p x.
inline uint8_t div256_floor(uint16_t x) {
    // floor(x / 256) is simply the high byte of the 16-bit input.
    const unsigned high_byte = static_cast<unsigned>(x) >> 8;
    return static_cast<uint8_t>(high_byte);
}
154 
/// Scalar rounding division by 256, used as a cheap stand-in for a division
/// by 255.
///
/// The sum is formed in `int`, so the bias cannot overflow; only the final
/// result is truncated to 8 bits (inputs of 65408 and above therefore yield 0).
///
/// @param  x   Unsigned 16-bit value.
/// @return (x + 128) >> 8, truncated to 8 bits.
inline uint8_t div256_round(uint16_t x) {
    // Add half of the divisor before shifting to round to nearest.
    const int biased = static_cast<int>(x) + 128;
    return static_cast<uint8_t>(biased >> 8);
}
160 
/// Scalar flooring division by 255.
///
/// Computed as (x * 0x8081) >> 23 in 32-bit arithmetic; the quotient is then
/// truncated to 8 bits.
///
/// @param  x   Unsigned 16-bit value.
/// @return floor(x / 255), truncated to 8 bits.
inline uint8_t div255_floor(uint16_t x) {
    // 2^23 / 0x8081 is approximately 255, so this multiply-and-shift replaces
    // the division.
    const uint32_t product = static_cast<uint32_t>(x) * 0x8081u;
    return static_cast<uint8_t>(product >> 23);
}
169 
/// Scalar rounding division by 255.
///
/// Computed as ((x + 128) * 0x101) >> 16. The bias is added in 16-bit
/// arithmetic, so inputs above 65407 wrap around before the multiplication.
///
/// @param  x   Unsigned 16-bit value.
/// @return The rounded quotient, truncated to 8 bits.
inline uint8_t div255_round(uint16_t x) {
    // Add the rounding constant; the result is deliberately kept in 16 bits.
    const uint16_t biased = static_cast<uint16_t>(x + (1 << 7));
    // Multiplying by 0x101 and taking the bits above bit 15 divides by 255.
    const uint32_t scaled = static_cast<uint32_t>(biased) * 0x101u;
    return static_cast<uint8_t>(scaled >> 16);
}
180 
184 inline uint8_t div255_round_approx(uint16_t x) {
185  return div255_round(x);
186 }
187 
188 // -------------------------------------------------------------------------- //
189 
190 #if ENABLE_NEON
191 
199 template <RescaleType rescale_type = RescaleType::Div255_Round>
200 inline uint8x8_t rescale(uint16x8_t x) {
201  switch (rescale_type) {
202  case RescaleType::Div256_Floor: return div256_floor(x);
203  case RescaleType::Div256_Round: return div256_round(x);
204  case RescaleType::Div255_Floor: return div255_floor(x);
205  case RescaleType::Div255_Round: return div255_round(x);
206  case RescaleType::Div255_Round_Approx: return div255_round_approx(x);
207  default: return vdup_n_u8(0x00);
208  }
209 }
210 #endif
211 
213 template <RescaleType rescale_type = RescaleType::Div255_Round>
214 inline uint8_t rescale(uint16_t x) {
215  switch (rescale_type) {
216  case RescaleType::Div256_Floor: return div256_floor(x);
217  case RescaleType::Div256_Round: return div256_round(x);
218  case RescaleType::Div255_Floor: return div255_floor(x);
219  case RescaleType::Div255_Round: return div255_round(x);
220  case RescaleType::Div255_Round_Approx: return div255_round_approx(x);
221  default: return 0x00;
222  }
223 }
224 
225 /// @}
div255_floor
uint8x8_t div255_floor(uint16x8_t x)
This is an exact flooring division by 255, this is the correct divisor, but requires a little bit mor...
Definition: rescale.hpp:37
div256_round
uint8x8_t div256_round(uint16x8_t x)
This is a rounding division by 256, which is close enough to 255, but the result may be one bit too s...
Definition: rescale.hpp:22
rescale
uint8x8_t rescale(uint16x8_t x)
Rescale the 16-bit color product by dividing by 255 or an approximation thereof.
Definition: rescale.hpp:200
div256_floor
uint8x8_t div256_floor(uint16x8_t x)
This is a flooring division by 256, which is close enough to 255, but the result may be one bit too s...
Definition: rescale.hpp:16
RescaleType::Div255_Round
@ Div255_Round
Exact rounding division by 255.
div255_round
uint8x8_t div255_round(uint16x8_t x)
This is an exact rounding division by 255, this is the correct divisor, and the result is rounded cor...
Definition: rescale.hpp:76
rescale_type.hpp
SIMD.h
div255_round_approx
uint8x8_t div255_round_approx(uint16x8_t x)
This is an approximation of a rounding division by 255, this is the correct divisor,...
Definition: rescale.hpp:57