ARM NEON Compositor  master
Fast SIMD alpha overlay and blending for ARM
rescale.hpp
Go to the documentation of this file.
1 #pragma once
2 
4 
5 #include "SIMD.h"
6 
9 
10 #if ENABLE_NEON
11 
16 inline uint8x8_t div256_floor(uint16x8_t x) { return vshrn_n_u16(x, 8); }
17 
22 inline uint8x8_t div256_round(uint16x8_t x) {
23  return vrshrn_n_u16(x, 8);
24 }
25 
26 /*
27  * There are some significant differences in how the vector registers are laid
28  * out in ARMv8 AArch64. There are also some new NEON instructions that aren't
29  * available in AArch32 mode.
30  */
31 #ifdef __ARM_ARCH_ISA_A64 // ARMv8 A64
32 
37 inline uint8x8_t div255_floor(uint16x8_t x) {
38  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
39  // 256×256×128/0x8081 ≃ 255
40  uint32x4_t h = vmull_high_n_u16(x, 0x8081);
41  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
42  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
43  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
44  // Divide by 0x80 and narrow from 16 bits to 8 bits
45  return vshrn_n_u16(x, 7);
46 }
47 
57 inline uint8x8_t div255_round_approx(uint16x8_t x) {
58  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
59  // 256×256×128/0x8081 ≃ 255
60  uint32x4_t h = vmull_high_n_u16(x, 0x8081);
61  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
62  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
63  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
64  // Divide by 0x80 and narrow from 16 bits to 8 bits (with rounding)
65  return vrshrn_n_u16(x, 7);
66 }
67 
76 inline uint8x8_t div255_round(uint16x8_t x) {
77  // Add the rounding constant
78  x = vaddq_u16(x, vdupq_n_u16(1 << 7));
79  // Multiply by 0x8080 as 32-bit integers (high and low elements separately)
80  uint32x4_t h = vmull_high_n_u16(x, 0x8080);
81  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
82  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
83  x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
84  // Divide by 0x80 and narrow from 16 bits to 8 bits
85  return vshrn_n_u16(x, 7);
86 }
87 
88 #else // ARMv8 A32 or ARMv7 NEON
89 
94 inline uint8x8_t div255_floor(uint16x8_t x) {
95  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
96  // 256×256×128/0x8081 ≃ 255
97  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
98  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
99  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
100  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
101  // Divide by 0x80 and narrow from 16 bits to 8 bits
102  return vshrn_n_u16(x, 7);
103 }
104 
114 inline uint8x8_t div255_round_approx(uint16x8_t x) {
115  // Multiply by 0x8081 as 32-bit integers (high and low elements separately)
116  // 256×256×128/0x8081 ≃ 255
117  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
118  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
119  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
120  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
121  // Divide by 0x80 and narrow from 16 bits to 8 bits
122  return vrshrn_n_u16(x, 7);
123 }
124 
133 inline uint8x8_t div255_round(uint16x8_t x) {
134  // Add the rounding constant
135  x = vaddq_u16(x, vdupq_n_u16(1 << 7));
136  // Multiply by 0x8080 as 32-bit integers (high and low elements separately)
137  uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8080);
138  uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
139  // Extract the 16 high bits of all 32-bit products (division by 0x10000)
140  x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
141  // Divide by 0x80 and narrow from 16 bits to 8 bits
142  return vshrn_n_u16(x, 7);
143 }
144 
145 #endif
146 
147 #endif // NEON
148 
/// Flooring division by 256 of a single 16-bit value (scalar overload).
///
/// Keeps only the high byte of `x`. Usable in constant expressions.
///
/// @param x  The 16-bit value to divide.
/// @return   `x >> 8`, which always fits in 8 bits.
constexpr uint8_t div256_floor(uint16_t x) noexcept {
    // The shift is evaluated as int; the cast makes the narrowing explicit.
    return static_cast<uint8_t>(x >> 8);
}
154 
/// Rounding division by 256 of a single 16-bit value (scalar overload).
///
/// Adds the rounding constant 1 << 7 and then shifts right by 8 bits. The sum
/// is evaluated as int, so it cannot overflow, but for inputs of 0xFF80 and
/// above the quotient is 256, which does not fit in 8 bits and wraps to 0 in
/// the returned value. Usable in constant expressions.
///
/// @param x  The 16-bit value to divide.
/// @return   The low 8 bits of `(x + 128) >> 8`.
constexpr uint8_t div256_round(uint16_t x) noexcept {
    // The cast makes the narrowing from int to 8 bits explicit.
    return static_cast<uint8_t>((x + (1 << 7)) >> 8);
}
160 
/// Exact flooring division by 255 of a single 16-bit value (scalar overload).
///
/// Multiplies `x` by 0x8081 as a 32-bit integer and shifts the product right
/// by 23 bits, which replaces the division by a multiplication and a shift.
/// The quotient is narrowed to 8 bits, so inputs above 255 * 255 + 254 (whose
/// quotient exceeds 255) are truncated. Usable in constant expressions.
///
/// @param x  The 16-bit value to divide.
/// @return   The low 8 bits of `x / 255`.
constexpr uint8_t div255_floor(uint16_t x) noexcept {
    // Widen before multiplying: the product needs up to 32 bits.
    return static_cast<uint8_t>((static_cast<uint32_t>(x) * 0x8081u) >> 23);
}
169 
/// Exact rounding division by 255 of a single 16-bit value (scalar overload).
///
/// Adds the rounding constant 1 << 7, multiplies by 0x101 as a 32-bit integer
/// and keeps the bits above bit 16 of the product. The rounding constant is
/// added in 16 bits, so inputs above 0xFF7F wrap around before the
/// multiplication; products of two 8-bit values (at most 255 * 255) are well
/// inside the non-wrapping range. Usable in constant expressions.
///
/// @param x  The 16-bit value to divide.
/// @return   `x / 255` rounded to the nearest integer, for inputs in the
///           non-wrapping range.
constexpr uint8_t div255_round(uint16_t x) noexcept {
    // The inner cast reproduces the 16-bit wrap of the biased value; the
    // outer casts widen for the multiplication and make the narrowing of the
    // result explicit.
    return static_cast<uint8_t>(
        (static_cast<uint32_t>(static_cast<uint16_t>(x + (1 << 7))) * 0x101u) >>
        16);
}
180 
184 inline uint8_t div255_round_approx(uint16_t x) {
185  return div255_round(x);
186 }
187 
188 // -------------------------------------------------------------------------- //
189 
190 #if ENABLE_NEON
191 
199 template <RescaleType rescale_type = RescaleType::Div255_Round>
200 inline uint8x8_t rescale(uint16x8_t x) {
201  switch (rescale_type) {
202  case RescaleType::Div256_Floor: return div256_floor(x);
203  case RescaleType::Div256_Round: return div256_round(x);
204  case RescaleType::Div255_Floor: return div255_floor(x);
205  case RescaleType::Div255_Round: return div255_round(x);
206  case RescaleType::Div255_Round_Approx: return div255_round_approx(x);
207  default: return vdup_n_u8(0x00);
208  }
209 }
210 #endif
211 
213 template <RescaleType rescale_type = RescaleType::Div255_Round>
214 inline uint8_t rescale(uint16_t x) {
215  switch (rescale_type) {
216  case RescaleType::Div256_Floor: return div256_floor(x);
217  case RescaleType::Div256_Round: return div256_round(x);
218  case RescaleType::Div255_Floor: return div255_floor(x);
219  case RescaleType::Div255_Round: return div255_round(x);
220  case RescaleType::Div255_Round_Approx: return div255_round_approx(x);
221  default: return 0x00;
222  }
223 }
224 
225 /// @}
div255_floor
uint8x8_t div255_floor(uint16x8_t x)
This is an exact flooring division by 255, this is the correct divisor, but requires a little bit mor...
Definition: rescale.hpp:37
div256_round
uint8x8_t div256_round(uint16x8_t x)
This is a rounding division by 256, which is close enough to 255, but the result may be one bit too s...
Definition: rescale.hpp:22
rescale
uint8x8_t rescale(uint16x8_t x)
Rescale the 16-bit color product by dividing by 255 or an approximation thereof.
Definition: rescale.hpp:200
div256_floor
uint8x8_t div256_floor(uint16x8_t x)
This is a flooring division by 256, which is close enough to 255, but the result may be one bit too s...
Definition: rescale.hpp:16
RescaleType::Div255_Round
@ Div255_Round
Exact rounding division by 255.
div255_round
uint8x8_t div255_round(uint16x8_t x)
This is an exact rounding division by 255, this is the correct divisor, and the result is rounded cor...
Definition: rescale.hpp:76
rescale_type.hpp
SIMD.h
div255_round_approx
uint8x8_t div255_round_approx(uint16x8_t x)
This is an approximation of a rounding division by 255, this is the correct divisor,...
Definition: rescale.hpp:57