#ifdef OJPH_COMPILER_MSVC
#include <intrin.h>
#else
#include <x86intrin.h>
#endif
61 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
63 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
65 __m128i s1 = _mm_load_si128((__m128i*)src1);
66 __m128i s2 = _mm_load_si128((__m128i*)src2);
67 __m128i d = _mm_load_si128((__m128i*)dst);
68 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
69 d = _mm_sub_epi32(d, s1);
70 _mm_store_si128((__m128i*)dst, d);
80 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
82 __m128i offset = _mm_set1_epi32(2);
83 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
85 __m128i s1 = _mm_load_si128((__m128i*)src1);
86 s1 = _mm_add_epi32(s1, offset);
87 __m128i s2 = _mm_load_si128((__m128i*)src2);
88 s2 = _mm_add_epi32(s2, s1);
89 __m128i d = _mm_load_si128((__m128i*)dst);
90 d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
91 _mm_store_si128((__m128i*)dst, d);
102 si32 *ldst = line_ldst->
i32, *hdst = line_hdst->
i32;
104 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
105 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
109 src[width] = src[width-2];
111 const si32* sp = src + (even ? 1 : 0);
113 for (
ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
116 __m128i s1 = _mm_loadu_si128((__m128i*)(sp-1));
117 __m128i s2 = _mm_loadu_si128((__m128i*)(sp+1));
118 __m128i d = _mm_loadu_si128((__m128i*)sp);
119 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
120 __m128i d1 = _mm_sub_epi32(d, s1);
122 s1 = _mm_loadu_si128((__m128i*)(sp-1));
123 s2 = _mm_loadu_si128((__m128i*)(sp+1));
124 d = _mm_loadu_si128((__m128i*)sp);
125 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
126 __m128i d2 = _mm_sub_epi32(d, s1);
128 d = _mm_castps_si128(_mm_shuffle_ps(
129 _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
130 _mm_store_si128((__m128i*)dph, d);
135 hdst[H_width] = hdst[H_width-1];
137 sp = src + (even ? 0 : 1);
138 const si32* sph = hdst + (even ? 0 : 1);
140 __m128i offset = _mm_set1_epi32(2);
141 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
143 __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
144 s1 = _mm_add_epi32(s1, offset);
145 __m128i s2 = _mm_loadu_si128((__m128i*)sph);
146 s2 = _mm_add_epi32(s2, s1);
147 __m128i d1 = _mm_loadu_si128((__m128i*)sp);
148 __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
149 __m128i d = _mm_castps_si128(_mm_shuffle_ps(
150 _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
151 d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
152 _mm_store_si128((__m128i*)dpl, d);
158 line_ldst->
i32[0] = line_src->
i32[0];
160 line_hdst->
i32[0] = line_src->
i32[0] << 1;
170 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
172 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
174 __m128i s1 = _mm_load_si128((__m128i*)src1);
175 __m128i s2 = _mm_load_si128((__m128i*)src2);
176 __m128i d = _mm_load_si128((__m128i*)dst);
177 s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
178 d = _mm_add_epi32(d, s1);
179 _mm_store_si128((__m128i*)dst, d);
189 const si32 *src1 = line_src1->
i32, *src2 = line_src2->
i32;
191 __m128i offset = _mm_set1_epi32(2);
192 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
194 __m128i s1 = _mm_load_si128((__m128i*)src1);
195 s1 = _mm_add_epi32(s1, offset);
196 __m128i s2 = _mm_load_si128((__m128i*)src2);
197 s2 = _mm_add_epi32(s2, s1);
198 __m128i d = _mm_load_si128((__m128i*)dst);
199 d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
200 _mm_store_si128((__m128i*)dst, d);
210 si32 *lsrc = line_lsrc->
i32, *hsrc = line_hsrc->
i32;
213 const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
214 const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
218 hsrc[H_width] = hsrc[H_width-1];
220 const si32 *sph = hsrc + (even ? 0 : 1);
222 __m128i offset = _mm_set1_epi32(2);
223 for (
ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
225 __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
226 s1 = _mm_add_epi32(s1, offset);
227 __m128i s2 = _mm_loadu_si128((__m128i*)sph);
228 s2 = _mm_add_epi32(s2, s1);
229 __m128i d = _mm_load_si128((__m128i*)spl);
230 d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
231 _mm_store_si128((__m128i*)spl, d);
236 lsrc[L_width] = lsrc[L_width - 1];
238 si32 *dp = dst + (even ? 0 : -1);
239 spl = lsrc + (even ? 0 : -1);
241 ui32 width = L_width + (even ? 0 : 1);
242 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
244 __m128i s1 = _mm_loadu_si128((__m128i*)spl);
245 __m128i s2 = _mm_loadu_si128((__m128i*)(spl+1));
246 __m128i d = _mm_load_si128((__m128i*)sph);
247 s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
248 d = _mm_add_epi32(d, s2);
249 _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
250 _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
256 line_dst->
i32[0] = line_lsrc->
i32[0];
258 line_dst->
i32[0] = line_hsrc->
i32[0] >> 1;
void sse2_rev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse2_rev_vert_wvlt_fwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_horz_wvlt_bwd_tx(line_buf *dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_wvlt_bwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_fwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_bwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)