// Scalar alpha blend of a single pixel, parameterised on the rescale variant.
// Fragment: the signature and the computation of r, g and b are not visible
// in this excerpt. Presumably this is overlay_alpha_1, which the non-NEON
// overlay_alpha_8 calls once per pixel -- confirm against the full file.
19 template <RescaleType rescale_type>
// The blend weight is read from element 3 of fg_img.
24 uint16_t alpha =
fg_img[3];
// Complementary weight: alpha + alpha_c == 255 (assuming fg_img holds 8-bit
// values, so alpha never exceeds 255 -- confirm).
26 uint16_t alpha_c = 255 - alpha;
// Each accumulated channel value is passed through rescale<rescale_type>
// before being stored; elements 2, 1 and 0 of out_img are written, in that
// order. No write to out_img[3] is visible in this excerpt.
31 out_img[2] = rescale<rescale_type>(r);
33 out_img[1] = rescale<rescale_type>(g);
35 out_img[0] = rescale<rescale_type>(b);
44 template <RescaleType rescale_type>
48 #if ENABLE_NEON // Fast vectorized version
// NEON path: blends 8 four-channel pixels in one pass.
// Fragment: the signature and the final store are not visible in this
// excerpt. Presumably this is overlay_alpha_8 -- confirm against the full file.
50 template <RescaleType rescale_type>
// vld4_u8 loads 32 bytes and de-interleaves them with a stride of 4, so
// val[k] holds element k of each of the 8 pixels.
55 uint8x8x4_t fg = vld4_u8(
fg_img);
57 uint8x8x4_t bg = vld4_u8(
bg_img);
// Element 3 of each foreground pixel is the blend weight. vmvn_u8 is a
// bitwise NOT, which for an 8-bit lane equals 255 - alpha.
60 uint8x8_t alpha = fg.val[3];
61 uint8x8_t alpha_c = vmvn_u8(alpha);
// Per channel: bg * (255 - alpha) + fg * alpha. vmull_u8 widens each product
// to 16 bits; the sum is at most 255 * 255 = 65025, so the 16-bit add cannot
// overflow.
64 uint16x8_t r = vaddq_u16(vmull_u8(bg.val[2], alpha_c),
65 vmull_u8(fg.val[2], alpha));
67 uint16x8_t g = vaddq_u16(vmull_u8(bg.val[1], alpha_c),
68 vmull_u8(fg.val[1], alpha));
70 uint16x8_t b = vaddq_u16(vmull_u8(bg.val[0], alpha_c),
71 vmull_u8(fg.val[0], alpha));
// The 16-bit accumulators go through rescale<rescale_type> and overwrite the
// matching channel of bg, reusing it as the result holder. bg.val[3] is not
// modified in the visible lines.
74 bg.val[2] = rescale<rescale_type>(r);
75 bg.val[1] = rescale<rescale_type>(g);
76 bg.val[0] = rescale<rescale_type>(b);
82 #else // Fallback without NEON
// Fallback used when ENABLE_NEON is not set: handles 8 consecutive pixels by
// calling overlay_alpha_1 on each one.
// Fragment: the signature and the end of the call are not visible in this
// excerpt. Presumably this is the non-NEON overlay_alpha_8 -- confirm.
84 template <RescaleType rescale_type>
// Pixel i starts at byte offset 4 * i in both bg_img and fg_img.
93 for (uint8_t i = 0; i < 8; ++i)
94 overlay_alpha_1<rescale_type>(&
bg_img[4 * i], &
fg_img[4 * i],
// Loop over a buffer of n * 4 bytes in steps of 4 * 8 = 32 bytes, with the
// iterations distributed across threads by OpenMP.
// Fragment: the signature and loop body are not visible in this excerpt; the
// name overlay_alpha_fast is taken from the explicit instantiations that
// follow -- confirm.
// NOTE(review): the 32-byte step suggests n is expected to be a multiple of
// 8. No remainder handling is visible here, so a final partial step may touch
// bytes past n * 4 -- confirm against the full file.
100 template <RescaleType rescale_type>
106 #pragma omp parallel for
107 for (
size_t i = 0; i < n * 4; i += 4 * 8)
// Explicit instantiations of overlay_alpha_fast, one per RescaleType value
// listed below. The parameter lists are not visible in this excerpt.
112 template void overlay_alpha_fast<RescaleType::Div255_Round>(
114 template void overlay_alpha_fast<RescaleType::Div255_Round_Approx>(
116 template void overlay_alpha_fast<RescaleType::Div255_Floor>(
118 template void overlay_alpha_fast<RescaleType::Div256_Round>(
120 template void overlay_alpha_fast<RescaleType::Div256_Floor>(
// Blends an fg_rows x fg_cols region, 8 pixels at a time, where the images
// may have different row strides.
// Fragment: the first signature line and several body lines are not visible
// in this excerpt; the name overlay_alpha_stride is taken from the explicit
// instantiations that follow -- confirm.
//
// Offsets are computed as 4 * (row * full_cols + col), i.e. 4 bytes per pixel:
//   bg_full_cols  row stride, in pixels, used for bg_img and out_img
//   fg_full_cols  row stride, in pixels, used for fg_img
//   fg_rows       number of rows processed
//   fg_cols       number of pixels processed in each row
123 template <RescaleType rescale_type>
125 uint8_t *
out_img,
size_t bg_full_cols,
size_t fg_rows,
126 size_t fg_cols,
size_t fg_full_cols) {
// Pixels left over in each row after the groups of 8.
130 const size_t fg_rem_cols = fg_cols % 8;
// Rows are distributed across threads by OpenMP.
131 #pragma omp parallel for
132 for (
size_t r = 0; r < fg_rows; ++r) {
// Full groups of 8 pixels.
// NOTE(review): fg_cols is size_t, so fg_cols - 7 wraps around to a huge
// value when fg_cols < 7. The condition then stays true and the loop indexes
// far past the end of the row. A condition such as `c + 8 <= fg_cols` avoids
// the unsigned underflow.
135 for (
size_t c = 0; c < fg_cols - 7; c += 8) {
136 size_t bg_offset = 4 * (r * bg_full_cols + c);
137 size_t fg_offset = 4 * (r * fg_full_cols + c);
138 overlay_alpha_8<rescale_type>(&
bg_img[bg_offset],
// Row tail: fewer than 8 pixels remain, so they are staged through 8-pixel
// temporaries, which keeps overlay_alpha_8 from touching bytes past the row.
144 if (fg_rem_cols != 0) {
// Both offsets point at the first leftover pixel of row r.
145 size_t bg_offset = 4 * (r * bg_full_cols + fg_cols - fg_rem_cols);
146 size_t fg_offset = 4 * (r * fg_full_cols + fg_cols - fg_rem_cols);
// NOTE(review): only the first 4 * fg_rem_cols bytes of tmp_bg and tmp_fg
// are filled below. overlay_alpha_8 still processes all 32 bytes, so it reads
// uninitialised data for the unused pixels. Those results are never copied
// out, but zero-initialising the buffers would keep sanitizers quiet.
147 uint8_t tmp_bg[4 * 8];
148 uint8_t tmp_fg[4 * 8];
149 uint8_t tmp_out[4 * 8];
150 memcpy(tmp_bg, &
bg_img[bg_offset], 4 * fg_rem_cols);
151 memcpy(tmp_fg, &
fg_img[fg_offset], 4 * fg_rem_cols);
152 overlay_alpha_8<rescale_type>(tmp_bg, tmp_fg, tmp_out);
// Only the valid leftover pixels are written back; out_img is indexed with
// the background offset.
153 memcpy(&
out_img[bg_offset], tmp_out, 4 * fg_rem_cols);
// Explicit instantiations of overlay_alpha_stride, one per RescaleType value
// listed below. The leading parameters of each declaration are not visible in
// this excerpt.
159 template void overlay_alpha_stride<RescaleType::Div255_Round>(
161 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols);
162 template void overlay_alpha_stride<RescaleType::Div255_Round_Approx>(
164 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols);
165 template void overlay_alpha_stride<RescaleType::Div255_Floor>(
167 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols);
168 template void overlay_alpha_stride<RescaleType::Div256_Round>(
170 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols);
171 template void overlay_alpha_stride<RescaleType::Div256_Floor>(
173 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols);
// Five function fragments, each forwarding to one overlay_alpha_stride
// instantiation. The function names, their leading parameters and the call
// arguments are not visible in this excerpt.

// Forwards to the Div255_Round variant.
179 size_t bg_full_cols,
size_t fg_rows,
180 size_t fg_cols,
size_t fg_full_cols) {
181 overlay_alpha_stride<RescaleType::Div255_Round>(
// Forwards to the Div255_Round_Approx variant.
186 size_t bg_full_cols,
size_t fg_rows,
size_t fg_cols,
size_t fg_full_cols) {
187 overlay_alpha_stride<RescaleType::Div255_Round_Approx>(
// Forwards to the Div255_Floor variant.
192 size_t bg_full_cols,
size_t fg_rows,
193 size_t fg_cols,
size_t fg_full_cols) {
194 overlay_alpha_stride<RescaleType::Div255_Floor>(
// Forwards to the Div256_Round variant.
199 size_t bg_full_cols,
size_t fg_rows,
200 size_t fg_cols,
size_t fg_full_cols) {
201 overlay_alpha_stride<RescaleType::Div256_Round>(
// Forwards to the Div256_Floor variant.
206 size_t bg_full_cols,
size_t fg_rows,
207 size_t fg_cols,
size_t fg_full_cols) {
208 overlay_alpha_stride<RescaleType::Div256_Floor>(