// Floor-divide each u16 lane by 256 and narrow the result to u8.
// Written as an explicit shift followed by a narrow: after the 8-bit
// right shift every lane value fits in 8 bits, so the narrowing
// truncation is lossless — identical to the fused vshrn_n_u16(x, 8).
inline uint8x8_t
div256_floor(uint16x8_t x) {
    const uint16x8_t quotient = vshrq_n_u16(x, 8);  // per-lane x / 256
    return vmovn_u16(quotient);                     // narrow u16 -> u8
}
23 return vrshrn_n_u16(x, 8);
31 #ifdef __ARM_ARCH_ISA_A64 // ARMv8 A64
40 uint32x4_t h = vmull_high_n_u16(x, 0x8081);
41 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
43 x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
45 return vshrn_n_u16(x, 7);
60 uint32x4_t h = vmull_high_n_u16(x, 0x8081);
61 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
63 x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
65 return vrshrn_n_u16(x, 7);
78 x = vaddq_u16(x, vdupq_n_u16(1 << 7));
80 uint32x4_t h = vmull_high_n_u16(x, 0x8080);
81 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
83 x = vuzp2q_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h));
85 return vshrn_n_u16(x, 7);
88 #else // ARMv8 A32 or ARMv7 NEON
97 uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
98 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
100 x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
102 return vshrn_n_u16(x, 7);
117 uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8081);
118 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8081);
120 x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
122 return vrshrn_n_u16(x, 7);
135 x = vaddq_u16(x, vdupq_n_u16(1 << 7));
137 uint32x4_t h = vmull_n_u16(vget_high_u16(x), 0x8080);
138 uint32x4_t l = vmull_n_u16(vget_low_u16(x), 0x8080);
140 x = vuzpq_u16(vreinterpretq_u16_u32(l), vreinterpretq_u16_u32(h)).val[1];
142 return vshrn_n_u16(x, 7);
// Divide x by 256, rounding to nearest (ties round up): add half the
// divisor before shifting right by 8. The scalar twin of the vector
// vrshrn_n_u16(x, 8) path; the return narrows to u8 by truncation,
// matching the non-saturating vector narrow.
inline uint8_t
div256_round(uint16_t x) {
    constexpr unsigned kHalf = 1u << 7;  // 0.5 in 8.8 fixed point
    return static_cast<uint8_t>((x + kHalf) >> 8);
}
166 uint32_t h = uint32_t(x) * 0x8081;
177 uint32_t h = uint32_t(x) * 0x101;
199 template <RescaleType rescale_type = RescaleType::Div255_Round>
201 switch (rescale_type) {
207 default:
return vdup_n_u8(0x00);
213 template <RescaleType rescale_type = RescaleType::Div255_Round>
215 switch (rescale_type) {
221 default:
return 0x00;