45 #ifdef OJPH_COMPILER_MSVC
48 #include <x86intrin.h>
58 __m128 shift = _mm_set1_ps(0.5f);
59 __m128 m = _mm_set1_ps(mul);
60 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
62 __m128i t = _mm_castps_si128(_mm_loadu_ps((
float*)sp));
63 __m128 s = _mm_cvtepi32_ps(t);
65 s = _mm_sub_ps(s, shift);
74 __m128 m = _mm_set1_ps(mul);
75 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4, dp+=4)
77 __m128i t = _mm_castps_si128(_mm_loadu_ps((
float*)sp));
78 __m128 s = _mm_cvtepi32_ps(t);
88 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
89 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
90 __m128 shift = _mm_set1_ps(0.5f);
91 __m128 m = _mm_set1_ps(mul);
92 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
94 __m128 t = _mm_load_ps(sp);
95 __m128 s = _mm_add_ps(t, shift);
100 *dp++ = _mm_cvtss_si32(t);
101 t = _mm_shuffle_ps(s, s, 1);
102 *dp++ = _mm_cvtss_si32(t);
103 t = _mm_shuffle_ps(s, s, 2);
104 *dp++ = _mm_cvtss_si32(t);
105 t = _mm_shuffle_ps(s, s, 3);
106 *dp++ = _mm_cvtss_si32(t);
108 _MM_SET_ROUNDING_MODE(rounding_mode);
115 uint32_t rounding_mode = _MM_GET_ROUNDING_MODE();
116 _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
117 __m128 m = _mm_set1_ps(mul);
118 for (
ui32 i = (width + 3) >> 2; i > 0; --i, sp+=4)
120 __m128 t = _mm_load_ps(sp);
121 __m128 s = _mm_mul_ps(t, m);
125 *dp++ = _mm_cvtss_si32(t);
126 t = _mm_shuffle_ps(s, s, 1);
127 *dp++ = _mm_cvtss_si32(t);
128 t = _mm_shuffle_ps(s, s, 2);
129 *dp++ = _mm_cvtss_si32(t);
130 t = _mm_shuffle_ps(s, s, 3);
131 *dp++ = _mm_cvtss_si32(t);
133 _MM_SET_ROUNDING_MODE(rounding_mode);
138 float *y,
float *cb,
float *cr,
ui32 repeat)
145 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
147 __m128 mr = _mm_load_ps(r);
148 __m128 mb = _mm_load_ps(b);
149 __m128 my = _mm_mul_ps(alpha_rf, mr);
150 my = _mm_add_ps(my, _mm_mul_ps(alpha_gf, _mm_load_ps(g)));
151 my = _mm_add_ps(my, _mm_mul_ps(alpha_bf, mb));
153 _mm_store_ps(cb, _mm_mul_ps(beta_cbf, _mm_sub_ps(mb, my)));
154 _mm_store_ps(cr, _mm_mul_ps(beta_crf, _mm_sub_ps(mr, my)));
156 r += 4; g += 4; b += 4;
157 y += 4; cb += 4; cr += 4;
163 float *r,
float *g,
float *b,
ui32 repeat)
169 for (
ui32 i = (repeat + 3) >> 2; i > 0; --i)
171 __m128 my = _mm_load_ps(y);
172 __m128 mcr = _mm_load_ps(cr);
173 __m128 mcb = _mm_load_ps(cb);
174 __m128 mg = _mm_sub_ps(my, _mm_mul_ps(gamma_cr2g, mcr));
175 _mm_store_ps(g, _mm_sub_ps(mg, _mm_mul_ps(gamma_cb2g, mcb)));
176 _mm_store_ps(r, _mm_add_ps(my, _mm_mul_ps(gamma_cr2r, mcr)));
177 _mm_store_ps(b, _mm_add_ps(my, _mm_mul_ps(gamma_cb2b, mcb)));
179 y += 4; cb += 4; cr += 4;
180 r += 4; g += 4; b += 4;
void sse_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
void sse_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
void sse_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
void sse_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
void sse_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
void sse_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF