ARM NEON Compositor  master
Fast SIMD alpha overlay and blending for ARM
src/alpha-lib/src/overlay_alpha.cpp
Go to the documentation of this file.
3 
4 #include "SIMD.h"
5 #include "rescale.hpp"
6 
7 #include <cassert>
8 #include <cstddef>
9 #include <cstring>
10 
19 template <RescaleType rescale_type>
20 static inline void overlay_alpha_1(const uint8_t *bg_img, const uint8_t *fg_img,
21  uint8_t *out_img) {
22  // Byte order: Blue, Green, Red, Alpha
23  // Alpha [0, 255]
24  uint16_t alpha = fg_img[3];
25  // Complement of Alpha [0, 255]
26  uint16_t alpha_c = 255 - alpha;
27  // 255 * Red out =
28  // Red foreground * Alpha foreground
29  // + Red background * (255 - Alpha foreground)
30  uint16_t r = fg_img[2] * alpha + bg_img[2] * alpha_c;
31  out_img[2] = rescale<rescale_type>(r);
32  uint16_t g = fg_img[1] * alpha + bg_img[1] * alpha_c;
33  out_img[1] = rescale<rescale_type>(g);
34  uint16_t b = fg_img[0] * alpha + bg_img[0] * alpha_c;
35  out_img[0] = rescale<rescale_type>(b);
36  // Alpha channel is not blended, Alpha background is simply copied to output
37  out_img[3] = bg_img[3];
38 }
39 
// Forward declaration of the kernel that blends 8 consecutive BGRA pixels
// (32 bytes) of fg_img over bg_img and writes the result to out_img.
// Two definitions follow in this file; the ENABLE_NEON preprocessor switch
// selects between the NEON version and the scalar fallback.
44 template <RescaleType rescale_type>
45 static void overlay_alpha_8(const uint8_t *bg_img, const uint8_t *fg_img,
46  uint8_t *out_img);
47 
48 #if ENABLE_NEON // Fast vectorized version
49 
50 template <RescaleType rescale_type>
51 static void overlay_alpha_8(const uint8_t *bg_img, const uint8_t *fg_img,
52  uint8_t *out_img) {
53  // Load the four channels of 8 pixels of the foreground image into four
54  // uint8x8 vector registers
55  uint8x8x4_t fg = vld4_u8(fg_img);
56  // Same for the background image
57  uint8x8x4_t bg = vld4_u8(bg_img);
58 
59  // Byte order: Blue, Green, Red, Alpha
60  uint8x8_t alpha = fg.val[3];
61  uint8x8_t alpha_c = vmvn_u8(alpha); // 255 - alpha
62 
63  // r = bg.r * (255 - alpha) + fg.r * alpha
64  uint16x8_t r = vaddq_u16(vmull_u8(bg.val[2], alpha_c), //
65  vmull_u8(fg.val[2], alpha));
66 
67  uint16x8_t g = vaddq_u16(vmull_u8(bg.val[1], alpha_c), //
68  vmull_u8(fg.val[1], alpha));
69 
70  uint16x8_t b = vaddq_u16(vmull_u8(bg.val[0], alpha_c), //
71  vmull_u8(fg.val[0], alpha));
72 
73  // Divide the 16-bit colors by 255 so the result is in [0, 255] again
74  bg.val[2] = rescale<rescale_type>(r);
75  bg.val[1] = rescale<rescale_type>(g);
76  bg.val[0] = rescale<rescale_type>(b);
77 
78  // Store the four channels of 8 pixels to the output image
79  vst4_u8(out_img, bg);
80 }
81 
82 #else // Fallback without NEON
83 
84 template <RescaleType rescale_type>
85 static void overlay_alpha_8(const uint8_t *bg_img, const uint8_t *fg_img,
86  uint8_t *out_img) {
87 
88  // Unroll the loop, and inform the compiler that there are no dependencies
89  // between loop iterations, so it's free to use SIMD instructions as it sees
90  // fit.
91 #pragma GCC unroll 8
92 #pragma GCC ivdep
93  for (uint8_t i = 0; i < 8; ++i)
94  overlay_alpha_1<rescale_type>(&bg_img[4 * i], &fg_img[4 * i],
95  &out_img[4 * i]);
96 }
97 
98 #endif
99 
100 template <RescaleType rescale_type>
101 void overlay_alpha_fast(const uint8_t *bg_img, const uint8_t *fg_img,
102  uint8_t *out_img, size_t n) {
103  // This fast version assumes that the number of pixels is a multiple of 8,
104  // and that the size of the foreground and background images are the same.
105  assert(n % 8 == 0);
106 #pragma omp parallel for
107  for (size_t i = 0; i < n * 4; i += 4 * 8)
108  overlay_alpha_8<rescale_type>(&bg_img[i], &fg_img[i], &out_img[i]);
109 }
110 
// overlay_alpha_fast() is defined in this source file, so the five
// RescaleType variants below are instantiated explicitly here.
// NOTE(review): presumably this is what lets other translation units link
// against them -- confirm against the corresponding header.
111 // Explicit template instantiations
112 template void overlay_alpha_fast<RescaleType::Div255_Round>(
113  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n);
114 template void overlay_alpha_fast<RescaleType::Div255_Round_Approx>(
115  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n);
116 template void overlay_alpha_fast<RescaleType::Div255_Floor>(
117  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n);
118 template void overlay_alpha_fast<RescaleType::Div256_Round>(
119  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n);
120 template void overlay_alpha_fast<RescaleType::Div256_Floor>(
121  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n);
122 
123 template <RescaleType rescale_type>
124 void overlay_alpha_stride(const uint8_t *bg_img, const uint8_t *fg_img,
125  uint8_t *out_img, size_t bg_full_cols, size_t fg_rows,
126  size_t fg_cols, size_t fg_full_cols) {
127  // In this case, the number of pixels doesn't need to be a multiple of 8,
128  // and by using the right strides, the foreground and background images can
129  // have different sizes.
130  const size_t fg_rem_cols = fg_cols % 8;
131 #pragma omp parallel for
132  for (size_t r = 0; r < fg_rows; ++r) {
133  if (fg_cols >= 8) {
134  // Main loop to handle multiples of 8 pixels
135  for (size_t c = 0; c < fg_cols - 7; c += 8) {
136  size_t bg_offset = 4 * (r * bg_full_cols + c);
137  size_t fg_offset = 4 * (r * fg_full_cols + c);
138  overlay_alpha_8<rescale_type>(&bg_img[bg_offset],
139  &fg_img[fg_offset],
140  &out_img[bg_offset]);
141  }
142  }
143  // Handle the remaining columns (< 8)
144  if (fg_rem_cols != 0) {
145  size_t bg_offset = 4 * (r * bg_full_cols + fg_cols - fg_rem_cols);
146  size_t fg_offset = 4 * (r * fg_full_cols + fg_cols - fg_rem_cols);
147  uint8_t tmp_bg[4 * 8];
148  uint8_t tmp_fg[4 * 8];
149  uint8_t tmp_out[4 * 8];
150  memcpy(tmp_bg, &bg_img[bg_offset], 4 * fg_rem_cols);
151  memcpy(tmp_fg, &fg_img[fg_offset], 4 * fg_rem_cols);
152  overlay_alpha_8<rescale_type>(tmp_bg, tmp_fg, tmp_out);
153  memcpy(&out_img[bg_offset], tmp_out, 4 * fg_rem_cols);
154  }
155  }
156 }
157 
// overlay_alpha_stride() is defined in this source file, so the five
// RescaleType variants below are instantiated explicitly here.
// NOTE(review): presumably this is what lets other translation units link
// against them -- confirm against the corresponding header.
158 // Explicit template instantiations
159 template void overlay_alpha_stride<RescaleType::Div255_Round>(
160  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
161  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols);
162 template void overlay_alpha_stride<RescaleType::Div255_Round_Approx>(
163  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
164  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols);
165 template void overlay_alpha_stride<RescaleType::Div255_Floor>(
166  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
167  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols);
168 template void overlay_alpha_stride<RescaleType::Div256_Round>(
169  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
170  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols);
171 template void overlay_alpha_stride<RescaleType::Div256_Floor>(
172  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
173  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols);
174 
175 // C wrappers
176 extern "C" {
178  const uint8_t *fg_img, uint8_t *out_img,
179  size_t bg_full_cols, size_t fg_rows,
180  size_t fg_cols, size_t fg_full_cols) {
181  overlay_alpha_stride<RescaleType::Div255_Round>(
182  bg_img, fg_img, out_img, bg_full_cols, fg_rows, fg_cols, fg_full_cols);
183 }
185  const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img,
186  size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols) {
187  overlay_alpha_stride<RescaleType::Div255_Round_Approx>(
188  bg_img, fg_img, out_img, bg_full_cols, fg_rows, fg_cols, fg_full_cols);
189 }
191  const uint8_t *fg_img, uint8_t *out_img,
192  size_t bg_full_cols, size_t fg_rows,
193  size_t fg_cols, size_t fg_full_cols) {
194  overlay_alpha_stride<RescaleType::Div255_Floor>(
195  bg_img, fg_img, out_img, bg_full_cols, fg_rows, fg_cols, fg_full_cols);
196 }
198  const uint8_t *fg_img, uint8_t *out_img,
199  size_t bg_full_cols, size_t fg_rows,
200  size_t fg_cols, size_t fg_full_cols) {
201  overlay_alpha_stride<RescaleType::Div256_Round>(
202  bg_img, fg_img, out_img, bg_full_cols, fg_rows, fg_cols, fg_full_cols);
203 }
205  const uint8_t *fg_img, uint8_t *out_img,
206  size_t bg_full_cols, size_t fg_rows,
207  size_t fg_cols, size_t fg_full_cols) {
208  overlay_alpha_stride<RescaleType::Div256_Floor>(
209  bg_img, fg_img, out_img, bg_full_cols, fg_rows, fg_cols, fg_full_cols);
210 }
211 }
overlay_alpha_1
static void overlay_alpha_1(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img)
Overlay a single pixel of a foreground image with an alpha channel over one pixel of a background image.
Definition: src/alpha-lib/src/overlay_alpha.cpp:20
overlay_alpha.hpp
rescale.hpp
overlay_alpha_stride_div256_round
void overlay_alpha_stride_div256_round(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
C wrapper for overlay_alpha_stride().
Definition: src/alpha-lib/src/overlay_alpha.cpp:197
overlay_alpha_stride_div255_round
void overlay_alpha_stride_div255_round(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
C wrapper for overlay_alpha_stride().
Definition: src/alpha-lib/src/overlay_alpha.cpp:177
perf_test.fg_img
fg_img
Definition: perf_test.py:113
overlay_alpha_stride_div255_floor
void overlay_alpha_stride_div255_floor(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
C wrapper for overlay_alpha_stride().
Definition: src/alpha-lib/src/overlay_alpha.cpp:190
overlay_alpha_stride_div256_floor
void overlay_alpha_stride_div256_floor(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
C wrapper for overlay_alpha_stride().
Definition: src/alpha-lib/src/overlay_alpha.cpp:204
overlay_alpha_8
static void overlay_alpha_8(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img)
Overlay 8 pixels of a foreground image with an alpha channel over 8 pixels of a background image.
Definition: src/alpha-lib/src/overlay_alpha.cpp:51
overlay_alpha_stride
void overlay_alpha_stride(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
Overlay a smaller image with an alpha channel over a larger background image.
Definition: src/alpha-lib/src/overlay_alpha.cpp:124
perf_test.out_img
out_img
Definition: perf_test.py:114
overlay_alpha_stride_div255_round_approx
void overlay_alpha_stride_div255_round_approx(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t bg_full_cols, size_t fg_rows, size_t fg_cols, size_t fg_full_cols)
C wrapper for overlay_alpha_stride().
Definition: src/alpha-lib/src/overlay_alpha.cpp:184
SIMD.h
perf_test.bg_img
bg_img
Definition: perf_test.py:112
overlay_alpha.h
overlay_alpha_fast
void overlay_alpha_fast(const uint8_t *bg_img, const uint8_t *fg_img, uint8_t *out_img, size_t n)
Fast function to overlay two images of the same size, where the number of pixels is a multiple of 8.
Definition: src/alpha-lib/src/overlay_alpha.cpp:101