OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_avx.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_avx.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #ifdef OJPH_COMPILER_MSVC
47 #include <intrin.h>
48 #else
49 #include <x86intrin.h>
50 #endif
51 
52 namespace ojph {
53  namespace local {
54 
56  void avx_irrev_vert_wvlt_step(const line_buf* line_src1,
57  const line_buf* line_src2,
58  line_buf *line_dst, int step_num,
59  ui32 repeat)
60  {
61  float *dst = line_dst->f32;
62  const float *src1 = line_src1->f32, *src2 = line_src2->f32;
63 
64  __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[step_num]);
65  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
66  {
67  __m256 s1 = _mm256_load_ps(src1);
68  __m256 s2 = _mm256_load_ps(src2);
69  __m256 d = _mm256_load_ps(dst);
70  d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
71  _mm256_store_ps(dst, d);
72  }
73  }
74 
76  void avx_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
77  bool L_analysis_or_H_synthesis, ui32 repeat)
78  {
79  float *dst = line_dst->f32;
80  const float *src = line_src->f32;
81 
82  float f = LIFTING_FACTORS::K_inv;
83  f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
84  __m256 factor = _mm256_set1_ps(f);
85  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
86  {
87  __m256 s = _mm256_load_ps(src);
88  _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
89  }
90  }
91 
92 
    //////////////////////////////////////////////////////////////////////////
    // Forward (analysis) horizontal irreversible (9/7) wavelet transform.
    // Splits one line of samples in line_src into a low-pass subband
    // (line_ldst, L_width samples) and a high-pass subband (line_hdst,
    // H_width samples) using the four 9/7 lifting steps followed by the
    // K_inv/K gain multipliers.  `even` selects whether the first sample of
    // the line is even-indexed (i.e. belongs to the low-pass subband).
    // The code reads/writes one sample beyond each end of the buffers
    // (e.g. src[-1], src[width]) for symmetric boundary extension, and the
    // SIMD loops run in whole groups of 4 or 8 -- so all three line buffers
    // are assumed to have sufficient padding on both sides; presumably
    // guaranteed by line_buf allocation elsewhere (confirm there).
    void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
                                    line_buf *line_hdst, ui32 width,
                                    bool even)
    {
      if (width > 1)
      {
        float *src = line_src->f32;
        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;

        // subband lengths: L gets the extra sample when width is odd and
        // the line starts on an even index, H gets it otherwise
        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;

        //extension (symmetric: mirror one sample at each end)
        src[-1] = src[1];
        src[width] = src[width-2];
        // predict (lifting step 0): h[n] = x[2n+1] + steps[0]*(x[2n]+x[2n+2])
        const float* sp = src + (even ? 1 : 0);
        float *dph = hdst;
        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[0]);
        for (ui32 i = (H_width + 3) >> 2; i > 0; --i)
        { //this is doing twice the work it needs to do
          //it can be definitely written better
          // Computes the lifting update for all 8 loaded samples, then keeps
          // only the 4 odd-position results (elements 0,2,4,6 of d1) via the
          // extract + shuffle below -- hence "twice the work".
          __m256 s1 = _mm256_loadu_ps(sp - 1);
          __m256 s2 = _mm256_loadu_ps(sp + 1);
          __m256 d = _mm256_loadu_ps(sp);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          __m256 d1 = _mm256_add_ps(d, s1);
          sp += 8;
          // deinterleave: take even-indexed lanes of d1 into one __m128
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
          _mm_store_ps(dph, t);
          dph += 4;
        }

        // extension of the high-pass band before the update step
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // update (lifting step 1): l[n] = x[2n] + steps[1]*(h[n-1]+h[n])
        __m128 factor128 = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
        sp = src + (even ? 0 : 1);
        const float* sph = hdst + (even ? 0 : 1);
        float *dpl = ldst;
        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
        {
          // load 8 interleaved source samples, keep the 4 even-position ones
          __m256 d1 = _mm256_loadu_ps(sp); //is there an advantage here?
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));

          __m128 s1 = _mm_loadu_ps(sph - 1);
          __m128 s2 = _mm_loadu_ps(sph);
          s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
          d = _mm_add_ps(d, s1);
          _mm_store_ps(dpl, d);
        }

        //extension of the low-pass band
        ldst[-1] = ldst[0];
        ldst[L_width] = ldst[L_width-1];
        //predict (lifting step 2) -- now both subbands are deinterleaved,
        //so this and the remaining steps run 8 outputs per iteration
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[2]);
        const float* spl = ldst + (even ? 1 : 0);
        dph = hdst;
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
        {
          __m256 s1 = _mm256_loadu_ps(spl - 1);
          __m256 s2 = _mm256_loadu_ps(spl);
          __m256 d = _mm256_loadu_ps(dph);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dph, d);
        }

        // extension of the high-pass band again
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // update (lifting step 3)
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[3]);
        sph = hdst + (even ? 0 : 1);
        dpl = ldst;
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
        {
          __m256 s1 = _mm256_loadu_ps(sph - 1);
          __m256 s2 = _mm256_loadu_ps(sph);
          __m256 d = _mm256_loadu_ps(dpl);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dpl, d);
        }

        //multipliers: L band scaled by 1/K, H band scaled by K
        float *dp = ldst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
        dp = hdst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K);
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
      }
      else
      {
        // single-sample line: copy to L if even-aligned; otherwise the lone
        // sample is a high-pass sample, which is doubled (x + x)
        if (even)
          line_ldst->f32[0] = line_src->f32[0];
        else
          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
      }
    }
209 
211  void avx_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
212  line_buf *line_hsrc, ui32 width,
213  bool even)
214  {
215  if (width > 1)
216  {
217  float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
218  float *dst = line_dst->f32;
219 
220  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
221  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
222 
223  //multipliers
224  float *dp = lsrc;
225  __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::K);
226  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
227  {
228  __m256 d = _mm256_load_ps(dp);
229  _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
230  }
231  dp = hsrc;
232  factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
233  for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
234  {
235  __m256 d = _mm256_load_ps(dp);
236  _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
237  }
238 
239  //extension
240  hsrc[-1] = hsrc[0];
241  hsrc[H_width] = hsrc[H_width-1];
242  //inverse update
243  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[7]);
244  const float *sph = hsrc + (even ? 0 : 1);
245  float *dpl = lsrc;
246  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
247  {
248  __m256 s1 = _mm256_loadu_ps(sph - 1);
249  __m256 s2 = _mm256_loadu_ps(sph);
250  __m256 d = _mm256_loadu_ps(dpl);
251  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
252  d = _mm256_add_ps(d, s1);
253  _mm256_store_ps(dpl, d);
254  }
255 
256  //extension
257  lsrc[-1] = lsrc[0];
258  lsrc[L_width] = lsrc[L_width-1];
259  //inverse perdict
260  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[6]);
261  const float *spl = lsrc + (even ? 0 : -1);
262  float *dph = hsrc;
263  for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
264  {
265  __m256 s1 = _mm256_loadu_ps(spl);
266  __m256 s2 = _mm256_loadu_ps(spl + 1);
267  __m256 d = _mm256_loadu_ps(dph);
268  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
269  d = _mm256_add_ps(d, s1);
270  _mm256_store_ps(dph, d);
271  }
272 
273  //extension
274  hsrc[-1] = hsrc[0];
275  hsrc[H_width] = hsrc[H_width-1];
276  //inverse update
277  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[5]);
278  sph = hsrc + (even ? 0 : 1);
279  dpl = lsrc;
280  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
281  {
282  __m256 s1 = _mm256_loadu_ps(sph - 1);
283  __m256 s2 = _mm256_loadu_ps(sph);
284  __m256 d = _mm256_loadu_ps(dpl);
285  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
286  d = _mm256_add_ps(d, s1);
287  _mm256_store_ps(dpl, d);
288  }
289 
290  //extension
291  lsrc[-1] = lsrc[0];
292  lsrc[L_width] = lsrc[L_width-1];
293  //inverse perdict and combine
294  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[4]);
295  dp = dst + (even ? 0 : -1);
296  spl = lsrc + (even ? 0 : -1);
297  sph = hsrc;
298  ui32 width = L_width + (even ? 0 : 1);
299  for (ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
300  {
301  __m256 s1 = _mm256_loadu_ps(spl);
302  __m256 s2 = _mm256_loadu_ps(spl + 1);
303  __m256 d = _mm256_load_ps(sph);
304  s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
305  d = _mm256_add_ps(d, s2);
306 
307  __m128 a0 = _mm256_extractf128_ps(s1, 0);
308  __m128 a1 = _mm256_extractf128_ps(s1, 1);
309  __m128 a2 = _mm256_extractf128_ps(d, 0);
310  __m128 a3 = _mm256_extractf128_ps(d, 1);
311  _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
312  _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
313  _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
314  _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
315 
316 // s2 = _mm256_unpackhi_ps(s1, d);
317 // s1 = _mm256_unpacklo_ps(s1, d);
318 // d = _mm256_permute2f128_ps(s1, s2, (2 << 4) | 0);
319 // _mm256_storeu_ps(dp, d);
320 // d = _mm256_permute2f128_ps(s1, s2, (3 << 4) | 1);
321 // _mm256_storeu_ps(dp + 1, d);
322  }
323  }
324  else
325  {
326  if (even)
327  line_dst->f32[0] = line_lsrc->f32[0];
328  else
329  line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
330  }
331  }
332  }
333 }
void avx_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void avx_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void avx_irrev_vert_wvlt_step(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, int step_num, ui32 repeat)
uint32_t ui32
Definition: ojph_defs.h:54
float * f32
Definition: ojph_mem.h:156