46 #ifdef OJPH_COMPILER_MSVC
49 #include <x86intrin.h>
59 int step_num,
ui32 repeat)
// Excerpt of a vertical lifting step: dst[k] += factor * (src1[k] + src2[k]).
// `factor` is defined on a line not visible in this excerpt (presumably derived
// from step_num -- confirm against the full file).
61 float *dst = line_dst->
f32;
62 const float *src1 = line_src1->
f32, *src2 = line_src2->
f32;
// Four floats are handled per iteration and the iteration count rounds `repeat`
// up to a multiple of 4, so up to 3 elements past `repeat` are read.
// NOTE(review): this assumes the line buffers are padded accordingly -- confirm.
65 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
// _mm_load_ps is the aligned load: all three pointers must be 16-byte aligned.
67 __m128 s1 = _mm_load_ps(src1);
68 __m128 s2 = _mm_load_ps(src2);
69 __m128 d = _mm_load_ps(dst);
// d = d + factor * (s1 + s2); the store back to dst is not visible here.
70 d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
77 bool L_analysis_or_H_synthesis,
ui32 repeat)
// Excerpt of a scaling pass: dst[k] = f * src[k] for a single scalar f.
// The computation of `f` is not visible in this excerpt (presumably selected
// by L_analysis_or_H_synthesis -- confirm against the full file).
79 float *dst = line_dst->
f32;
80 const float *src = line_src->
f32;
// Broadcast the scalar gain into all four lanes.
84 __m128 factor = _mm_set1_ps(f);
// Four floats per iteration; the count rounds `repeat` up to a multiple of 4,
// so up to 3 elements past `repeat` are read and written.
85 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
// Aligned load/store: src and dst must be 16-byte aligned.
87 __m128 s = _mm_load_ps(src);
88 _mm_store_ps(dst, _mm_mul_ps(factor, s));
// Excerpt of the forward horizontal transform: splits `src` into a low-pass
// line (ldst) and a high-pass line (hdst) through a sequence of lifting passes.
// The declarations of `factor`, `dph`, `dpl` and `dp`, and the conditions
// guarding some statements, are on lines not visible in this excerpt.
99 float *src = line_src->
f32;
100 float *ldst = line_ldst->
f32, *hdst = line_hdst->
f32;
// L_width + H_width == width. For odd width the extra sample goes to the
// low-pass line when `even` is true, otherwise to the high-pass line.
102 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
103 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
// Mirror the sample at width-2 into position width (reflection about the last
// sample). Writes one element past index width-1, so the buffer needs room.
107 src[width] = src[width-2];
// sp starts on the first sample that feeds the high-pass line.
109 const float* sp = src + (even ? 1 : 0);
// First pass: produce four high-pass values per iteration into dph.
112 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
// d1 = sp[k] + factor * (sp[k-1] + sp[k+1]) for four consecutive k.
115 __m128 s1 = _mm_loadu_ps(sp - 1);
116 __m128 s2 = _mm_loadu_ps(sp + 1);
117 __m128 d = _mm_loadu_ps(sp);
118 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
119 __m128 d1 = _mm_add_ps(d, s1);
// NOTE(review): the same computation is repeated for d2. No advance of sp is
// visible between the two groups; presumably it happens on an omitted line --
// confirm, otherwise d2 would equal d1.
121 s1 = _mm_loadu_ps(sp - 1);
122 s2 = _mm_loadu_ps(sp + 1);
123 d = _mm_loadu_ps(sp);
124 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
125 __m128 d2 = _mm_add_ps(d, s1);
// Keep lanes 0 and 2 of d1 followed by lanes 0 and 2 of d2, i.e. every other
// computed value; the rest of the work is discarded.
127 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
// Aligned store: dph must be 16-byte aligned.
128 _mm_store_ps(dph, d);
// Replicate the last high-pass value one position past the end so the next
// pass can read hdst[H_width].
133 hdst[H_width] = hdst[H_width-1];
136 sp = src + (even ? 0 : 1);
137 const float* sph = hdst + (even ? 0 : 1);
// Second pass: dpl[k] = src[2k + offset] + factor * (sph[k-1] + sph[k]).
// NOTE(review): when `even` is true the first load below starts at hdst[-1];
// presumably that slot is filled on an omitted line -- confirm.
139 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
141 __m128 s1 = _mm_loadu_ps(sph - 1);
142 __m128 s2 = _mm_loadu_ps(sph);
143 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
// Load eight source samples and keep every other one (lanes 0 and 2 of each).
144 __m128 d1 = _mm_loadu_ps(sp);
145 __m128 d2 = _mm_loadu_ps(sp + 4);
146 __m128 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
147 d = _mm_add_ps(d, s1);
148 _mm_store_ps(dpl, d);
// Replicate the last low-pass value one position past the end.
153 ldst[L_width] = ldst[L_width-1];
156 const float* spl = ldst + (even ? 1 : 0);
// Third pass, in place on the high-pass line:
// dph[k] += factor * (spl[k-1] + spl[k]).
// NOTE(review): when `even` is false the first load starts at ldst[-1];
// presumably that slot is filled on an omitted line -- confirm.
158 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
160 __m128 s1 = _mm_loadu_ps(spl - 1);
161 __m128 s2 = _mm_loadu_ps(spl);
162 __m128 d = _mm_loadu_ps(dph);
163 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
164 d = _mm_add_ps(d, s1);
165 _mm_store_ps(dph, d);
// Refresh the one-past-the-end copy after the high-pass line was modified.
170 hdst[H_width] = hdst[H_width-1];
173 sph = hdst + (even ? 0 : 1);
// Fourth pass, in place on the low-pass line:
// dpl[k] += factor * (sph[k-1] + sph[k]).
175 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
177 __m128 s1 = _mm_loadu_ps(sph - 1);
178 __m128 s2 = _mm_loadu_ps(sph);
179 __m128 d = _mm_loadu_ps(dpl);
180 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
181 d = _mm_add_ps(d, s1);
182 _mm_store_ps(dpl, d);
// Scale L_width values in place through dp (aligned load/store). The
// initialization of dp and factor for this loop is not visible here.
188 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
190 __m128 d = _mm_load_ps(dp);
191 _mm_store_ps(dp, _mm_mul_ps(factor, d));
// Scale H_width values in place through dp.
// NOTE(review): this counter is a signed int initialized from an unsigned
// expression, while the sibling loops use ui32.
195 for (
int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
197 __m128 d = _mm_load_ps(dp);
198 _mm_store_ps(dp, _mm_mul_ps(factor, d));
// Single-sample assignments; the conditions selecting them are not visible
// (presumably the width == 1 case, chosen by `even` -- confirm).
// The low-pass output is a plain copy of the sample.
204 line_ldst->
f32[0] = line_src->
f32[0];
// The high-pass output is the sample doubled.
206 line_hdst->
f32[0] = line_src->
f32[0] + line_src->
f32[0];
// Excerpt of the inverse horizontal transform: rebuilds `dst` from a low-pass
// line (lsrc) and a high-pass line (hsrc). Both input lines are modified in
// place. The declarations of `factor`, `dp`, `dpl` and `dph`, and the
// conditions guarding some statements, are on lines not visible here.
217 float *lsrc = line_lsrc->
f32, *hsrc = line_hsrc->
f32;
218 float *dst = line_dst->
f32;
// L_width + H_width == width. For odd width the extra sample belongs to the
// low-pass line when `even` is true, otherwise to the high-pass line.
220 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
221 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
// Scale L_width values in place through dp (aligned load/store, four floats
// per iteration, count rounded up to a multiple of 4).
226 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
228 __m128 d = _mm_load_ps(dp);
229 _mm_store_ps(dp, _mm_mul_ps(factor, d));
// Scale H_width values in place through dp.
233 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
235 __m128 d = _mm_load_ps(dp);
236 _mm_store_ps(dp, _mm_mul_ps(factor, d));
// Replicate the last high-pass value one position past the end so the next
// pass can read hsrc[H_width].
241 hsrc[H_width] = hsrc[H_width-1];
244 const float *sph = hsrc + (even ? 0 : 1);
// Lifting pass on the low-pass line: dpl[k] += factor * (sph[k-1] + sph[k]).
// NOTE(review): when `even` is true the first load starts at hsrc[-1];
// presumably that slot is filled on an omitted line -- confirm.
246 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
248 __m128 s1 = _mm_loadu_ps(sph - 1);
249 __m128 s2 = _mm_loadu_ps(sph);
250 __m128 d = _mm_loadu_ps(dpl);
251 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
252 d = _mm_add_ps(d, s1);
253 _mm_store_ps(dpl, d);
// Replicate the last low-pass value one position past the end.
258 lsrc[L_width] = lsrc[L_width-1];
// NOTE(review): when `even` is false spl starts at lsrc[-1]; presumably that
// slot is filled on an omitted line -- confirm.
261 const float *spl = lsrc + (even ? 0 : -1);
// Lifting pass on the high-pass line: dph[k] += factor * (spl[k] + spl[k+1]).
263 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
265 __m128 s1 = _mm_loadu_ps(spl);
266 __m128 s2 = _mm_loadu_ps(spl + 1);
267 __m128 d = _mm_loadu_ps(dph);
268 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
269 d = _mm_add_ps(d, s1);
270 _mm_store_ps(dph, d);
// Refresh the one-past-the-end copy after the high-pass line was modified.
275 hsrc[H_width] = hsrc[H_width-1];
278 sph = hsrc + (even ? 0 : 1);
// Second lifting pass on the low-pass line, same form as the first one.
280 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
282 __m128 s1 = _mm_loadu_ps(sph - 1);
283 __m128 s2 = _mm_loadu_ps(sph);
284 __m128 d = _mm_loadu_ps(dpl);
285 s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
286 d = _mm_add_ps(d, s1);
287 _mm_store_ps(dpl, d);
// Refresh the one-past-the-end copy after the low-pass line was modified.
292 lsrc[L_width] = lsrc[L_width-1];
// Final pass: apply the last high-pass update and interleave into dst.
// When `even` is false both dp and spl start one element before their
// buffers, so the first store touches dst[-1].
295 dp = dst + (even ? 0 : -1);
296 spl = lsrc + (even ? 0 : -1);
// NOTE(review): this local reuses the identifier `width`, which the
// H_width/L_width computations above also read; the enclosing scope is not
// visible in this excerpt -- confirm it is a nested block.
298 ui32 width = L_width + (even ? 0 : 1);
299 for (
ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
301 __m128 s1 = _mm_loadu_ps(spl);
302 __m128 s2 = _mm_loadu_ps(spl + 1);
// Aligned load: sph must be 16-byte aligned at this point.
// NOTE(review): the last visible assignment to sph may be offset by one
// float; presumably sph is reset on an omitted line -- confirm.
303 __m128 d = _mm_load_ps(sph);
// d = sph[k] + factor * (spl[k] + spl[k+1])
304 s2 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
305 d = _mm_add_ps(d, s2);
// Interleave: unpacklo yields s1[0], d[0], s1[1], d[1]; unpackhi yields
// s1[2], d[2], s1[3], d[3]. Eight output samples are written per iteration.
306 _mm_storeu_ps(dp, _mm_unpacklo_ps(s1, d));
307 _mm_storeu_ps(dp + 4, _mm_unpackhi_ps(s1, d));
// Single-sample assignments; the conditions selecting them are not visible
// (presumably the width == 1 case, chosen by `even` -- confirm).
// A lone low-pass sample is copied straight to the output.
313 line_dst->
f32[0] = line_lsrc->
f32[0];
// A lone high-pass sample is halved before it is written out.
315 line_dst->
f32[0] = line_hsrc->
f32[0] * 0.5f;
void sse_irrev_horz_wvlt_bwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void sse_irrev_vert_wvlt_step(const line_buf *src1, const line_buf *src2, line_buf *dst, int step_num, ui32 repeat)
static const float steps[8]