45 #ifdef OJPH_COMPILER_MSVC
48 #include <x86intrin.h>
// Fragment of a conversion loop; the enclosing function header and braces are
// not visible here. The operations match the avx_cnvrt_si32_to_float_shftd
// signature listed later in this file -- TODO confirm.
// Per element this computes: dp[k] = (float)sp[k] * mul - 0.5f.
// Broadcast the 0.5 offset and the scale factor to all 8 lanes.
58 __m256 shift = _mm256_set1_ps(0.5f);
59 __m256 m = _mm256_set1_ps(mul);
// One iteration per group of 8 elements. The trip count is rounded up, so
// when width is not a multiple of 8 the last iteration reads and writes up to
// 7 elements past width (presumably the buffers are padded -- verify).
60 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
// Unaligned load of eight 32-bit integers, converted to floats.
62 __m256i t = _mm256_loadu_si256((__m256i*)sp);
63 __m256 s = _mm256_cvtepi32_ps(t);
// Scale first, then remove the 0.5 offset.
64 s = _mm256_mul_ps(s, m);
65 s = _mm256_sub_ps(s, shift);
// Aligned store: dp must be 32-byte aligned on every iteration.
66 _mm256_store_ps(dp, s);
// Fragment of a conversion loop; the enclosing function header and braces are
// not visible here. The operations match the avx_cnvrt_si32_to_float
// signature listed later in this file -- TODO confirm.
// Per element this computes: dp[k] = (float)sp[k] * mul.
// Broadcast the scale factor to all 8 lanes.
74 __m256 m = _mm256_set1_ps(mul);
// One iteration per group of 8 elements. The trip count is rounded up, so
// when width is not a multiple of 8 the last iteration reads and writes up to
// 7 elements past width (presumably the buffers are padded -- verify).
75 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
// Unaligned load of eight 32-bit integers, converted to floats.
77 __m256i t = _mm256_loadu_si256((__m256i*)sp);
78 __m256 s = _mm256_cvtepi32_ps(t);
79 s = _mm256_mul_ps(s, m);
// Aligned store: dp must be 32-byte aligned on every iteration.
80 _mm256_store_ps(dp, s);
// Fragment of a conversion loop; the enclosing function header and braces are
// not visible here. The operations match the avx_cnvrt_float_to_si32_shftd
// signature listed later in this file -- TODO confirm.
// Per element this computes: dp[k] = round_to_nearest((sp[k] + 0.5f) * mul).
// Broadcast the 0.5 offset and the scale factor to all 8 lanes.
88 __m256 shift = _mm256_set1_ps(0.5f);
89 __m256 m = _mm256_set1_ps(mul);
// One iteration per group of 8 elements. The trip count is rounded up, so
// when width is not a multiple of 8 the last iteration reads and writes up to
// 7 elements past width (presumably the buffers are padded -- verify).
90 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
// Aligned load: sp must be 32-byte aligned on every iteration.
92 __m256 t = _mm256_load_ps(sp);
// Add the 0.5 offset before scaling (the reverse order of the
// integer-to-float direction, which scales and then subtracts).
93 __m256 s = _mm256_add_ps(t, shift);
94 s = _mm256_mul_ps(s, m);
// Round to the nearest integer with exceptions suppressed, so the conversion
// below receives values that are already integral.
95 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
// Convert to 32-bit integers and store without an alignment requirement.
96 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
// Fragment of a conversion loop; the enclosing function header and braces are
// not visible here. The operations match the avx_cnvrt_float_to_si32
// signature listed later in this file -- TODO confirm.
// Per element this computes: dp[k] = round_to_nearest(sp[k] * mul).
// Broadcast the scale factor to all 8 lanes.
104 __m256 m = _mm256_set1_ps(mul);
// One iteration per group of 8 elements. The trip count is rounded up, so
// when width is not a multiple of 8 the last iteration reads and writes up to
// 7 elements past width (presumably the buffers are padded -- verify).
105 for (
int i = (width + 7) >> 3; i > 0; --i, sp+=8, dp+=8)
// Aligned load: sp must be 32-byte aligned on every iteration.
107 __m256 t = _mm256_load_ps(sp);
108 __m256 s = _mm256_mul_ps(t, m);
// Round to the nearest integer with exceptions suppressed, so the conversion
// below receives values that are already integral.
109 s = _mm256_round_ps(s, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
// Convert to 32-bit integers and store without an alignment requirement.
110 _mm256_storeu_si256((__m256i*)dp, _mm256_cvtps_epi32(s));
// Tail of a parameter list followed by a loop fragment; the start of the
// function header and the braces are not visible here. The parameters match
// the avx_ict_forward signature listed later in this file.
// y, cb and cr are the three output planes; repeat is the element count.
116 float *y,
float *cb,
float *cr,
ui32 repeat)
// One iteration per group of 8 elements. The trip count is rounded up, so
// when repeat is not a multiple of 8 the last iteration reads and writes up
// to 7 elements past repeat (presumably the buffers are padded -- verify).
123 for (
int i = (repeat + 7) >> 3; i > 0; --i)
// All loads and stores below are the aligned variants: every pointer must be
// 32-byte aligned on every iteration.
125 __m256 mr = _mm256_load_ps(r);
126 __m256 mb = _mm256_load_ps(b);
// y = alpha_rf * r + alpha_gf * g + alpha_bf * b
// (alpha_* and beta_* vectors are defined outside the visible lines;
// presumably broadcasts of the ALPHA_*/BETA_* constants -- TODO confirm).
127 __m256 my = _mm256_mul_ps(alpha_rf, mr);
128 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_gf, _mm256_load_ps(g)));
129 my = _mm256_add_ps(my, _mm256_mul_ps(alpha_bf, mb));
130 _mm256_store_ps(y, my);
// cb = beta_cbf * (b - y);  cr = beta_crf * (r - y)
131 _mm256_store_ps(cb, _mm256_mul_ps(beta_cbf, _mm256_sub_ps(mb, my)));
132 _mm256_store_ps(cr, _mm256_mul_ps(beta_crf, _mm256_sub_ps(mr, my)));
// Advance all six pointers to the next group of 8 floats.
134 r += 8; g += 8; b += 8;
135 y += 8; cb += 8; cr += 8;
// Tail of a parameter list followed by a loop fragment; the start of the
// function header and the braces are not visible here. The parameters match
// the avx_ict_backward signature listed later in this file.
// r, g and b are the three output planes; repeat is the element count.
141 float *r,
float *g,
float *b,
ui32 repeat)
// One iteration per group of 8 elements. The trip count is rounded up, so
// when repeat is not a multiple of 8 the last iteration reads and writes up
// to 7 elements past repeat (presumably the buffers are padded -- verify).
147 for (
int i = (repeat + 7) >> 3; i > 0; --i)
// All loads and stores below are the aligned variants: every pointer must be
// 32-byte aligned on every iteration.
149 __m256 my = _mm256_load_ps(y);
150 __m256 mcr = _mm256_load_ps(cr);
151 __m256 mcb = _mm256_load_ps(cb);
// g = y - gamma_cr2g * cr - gamma_cb2g * cb
// (gamma_* vectors are defined outside the visible lines; presumably
// broadcasts of the GAMMA_* constants -- TODO confirm).
152 __m256 mg = _mm256_sub_ps(my, _mm256_mul_ps(gamma_cr2g, mcr));
153 _mm256_store_ps(g, _mm256_sub_ps(mg, _mm256_mul_ps(gamma_cb2g, mcb)));
// r = y + gamma_cr2r * cr;  b = y + gamma_cb2b * cb
154 _mm256_store_ps(r, _mm256_add_ps(my, _mm256_mul_ps(gamma_cr2r, mcr)));
155 _mm256_store_ps(b, _mm256_add_ps(my, _mm256_mul_ps(gamma_cb2b, mcb)));
// Advance all six pointers to the next group of 8 floats.
157 y += 8; cb += 8; cr += 8;
158 r += 8; g += 8; b += 8;
// Signature index: the lines below list function signatures only, with no
// bodies and no terminating semicolons. Which loop fragment earlier in this
// file belongs to which signature is inferred from the operations and
// parameter lists -- TODO confirm against the full file.
// Float to 32-bit integer conversion with scaling by mul.
void avx_cnvrt_float_to_si32(const float *sp, si32 *dp, float mul, ui32 width)
// Three input planes (r, g, b) to three output planes (y, cb, cr).
void avx_ict_forward(const float *r, const float *g, const float *b, float *y, float *cb, float *cr, ui32 repeat)
// 32-bit integer to float conversion with scaling by mul; the "shftd" variant
// presumably corresponds to the fragment that also applies a 0.5 offset.
void avx_cnvrt_si32_to_float_shftd(const si32 *sp, float *dp, float mul, ui32 width)
// 32-bit integer to float conversion with scaling by mul.
void avx_cnvrt_si32_to_float(const si32 *sp, float *dp, float mul, ui32 width)
// Float to 32-bit integer conversion with scaling by mul; the "shftd" variant
// presumably corresponds to the fragment that also applies a 0.5 offset.
void avx_cnvrt_float_to_si32_shftd(const float *sp, si32 *dp, float mul, ui32 width)
// Three input planes (y, cb, cr) to three output planes (r, g, b).
void avx_ict_backward(const float *y, const float *cb, const float *cr, float *r, float *g, float *b, ui32 repeat)
// Constant index: names of float coefficients only; their initializers and
// terminating semicolons are not visible here.
// The loop fragments earlier in this file use lowercase vectors with matching
// names (alpha_rf, beta_cbf, gamma_cr2r, ...); presumably those are 8-lane
// broadcasts of these constants -- TODO confirm.
static const float GAMMA_CR2R
static const float BETA_CbF
static const float GAMMA_CB2B
static const float ALPHA_RF
static const float GAMMA_CB2G
static const float GAMMA_CR2G
static const float ALPHA_BF
static const float BETA_CrF
static const float ALPHA_GF