// OpenJPH — open-source implementation of JPEG 2000 Part 15 (HTJ2K).
// File: ojph_transform_avx.cpp (AVX implementations of the irreversible
// 9/7 wavelet lifting transforms).
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_avx.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #include <immintrin.h>
47 
48 namespace ojph {
49  namespace local {
50 
52  void avx_irrev_vert_wvlt_step(const line_buf* line_src1,
53  const line_buf* line_src2,
54  line_buf *line_dst, int step_num,
55  ui32 repeat)
56  {
57  float *dst = line_dst->f32;
58  const float *src1 = line_src1->f32, *src2 = line_src2->f32;
59 
60  __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[step_num]);
61  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
62  {
63  __m256 s1 = _mm256_load_ps(src1);
64  __m256 s2 = _mm256_load_ps(src2);
65  __m256 d = _mm256_load_ps(dst);
66  d = _mm256_add_ps(d, _mm256_mul_ps(factor, _mm256_add_ps(s1, s2)));
67  _mm256_store_ps(dst, d);
68  }
69  }
70 
72  void avx_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
73  bool L_analysis_or_H_synthesis, ui32 repeat)
74  {
75  float *dst = line_dst->f32;
76  const float *src = line_src->f32;
77 
78  float f = LIFTING_FACTORS::K_inv;
79  f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
80  __m256 factor = _mm256_set1_ps(f);
81  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src+=8)
82  {
83  __m256 s = _mm256_load_ps(src);
84  _mm256_store_ps(dst, _mm256_mul_ps(factor, s));
85  }
86  }
87 
88 
    ///////////////////////////////////////////////////////////////////////////
    // Forward horizontal irreversible (9/7) wavelet transform of one line.
    // Splits interleaved samples in `line_src` (width `width`) into a
    // low-pass band `line_ldst` and a high-pass band `line_hdst`, applying
    // the four lifting steps followed by the K/K_inv scaling.
    // `even` selects the parity of the first sample: when true the line
    // starts on a low-pass (even-indexed) sample.
    // NOTE(review): buffers are assumed to have at least one float of
    // padding on each side (src[-1], src[width], hdst[-1], ... are written)
    // and to be padded to whole SIMD groups — confirm against line_buf
    // allocation in the caller.
    void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
                                    line_buf *line_hdst, ui32 width,
                                    bool even)
    {
      if (width > 1)
      {
        float *src = line_src->f32;
        float *ldst = line_ldst->f32, *hdst = line_hdst->f32;

        // band widths: L gets the extra sample when the line starts even
        const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
        const ui32 H_width = (width + (even ? 0 : 1)) >> 1;

        // symmetric extension of the source by one sample on each side
        src[-1] = src[1];
        src[width] = src[width-2];
        // lifting step 0 (predict): computed on the interleaved source,
        // then the odd-position results are compacted into hdst
        const float* sp = src + (even ? 1 : 0);
        float *dph = hdst;
        __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::steps[0]);
        for (ui32 i = (H_width + 3) >> 2; i > 0; --i)
        { // this is doing twice the work it needs to do: all 8 lanes are
          // lifted but only the 4 even lanes are kept by the shuffle below;
          // it can definitely be written better
          __m256 s1 = _mm256_loadu_ps(sp - 1);
          __m256 s2 = _mm256_loadu_ps(sp + 1);
          __m256 d = _mm256_loadu_ps(sp);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          __m256 d1 = _mm256_add_ps(d, s1);
          sp += 8;
          // keep lanes 0,2 of each 128-bit half -> 4 contiguous H samples
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 t = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));
          _mm_store_ps(dph, t);
          dph += 4;
        }

        // symmetric extension of the high band
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // lifting step 1 (update): deinterleave 4 even source samples and
        // add the weighted H neighbors, writing compacted L samples
        __m128 factor128 = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
        sp = src + (even ? 0 : 1);
        const float* sph = hdst + (even ? 0 : 1);
        float *dpl = ldst;
        for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
        {
          __m256 d1 = _mm256_loadu_ps(sp); // is there an advantage here?
          __m128 t1 = _mm256_extractf128_ps(d1, 0);
          __m128 t2 = _mm256_extractf128_ps(d1, 1);
          __m128 d = _mm_shuffle_ps(t1, t2, _MM_SHUFFLE(2, 0, 2, 0));

          __m128 s1 = _mm_loadu_ps(sph - 1);
          __m128 s2 = _mm_loadu_ps(sph);
          s1 = _mm_mul_ps(factor128, _mm_add_ps(s1, s2));
          d = _mm_add_ps(d, s1);
          _mm_store_ps(dpl, d);
        }

        // symmetric extension of the low band
        ldst[-1] = ldst[0];
        ldst[L_width] = ldst[L_width-1];
        // lifting step 2 (predict): H += steps[2] * (L neighbors);
        // both bands are now compact, so full 8-wide iterations apply
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[2]);
        const float* spl = ldst + (even ? 1 : 0);
        dph = hdst;
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, spl+=8, dph+=8)
        {
          __m256 s1 = _mm256_loadu_ps(spl - 1);
          __m256 s2 = _mm256_loadu_ps(spl);
          __m256 d = _mm256_loadu_ps(dph);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dph, d);
        }

        // symmetric extension of the high band again
        hdst[-1] = hdst[0];
        hdst[H_width] = hdst[H_width-1];
        // lifting step 3 (update): L += steps[3] * (H neighbors)
        factor = _mm256_set1_ps(LIFTING_FACTORS::steps[3]);
        sph = hdst + (even ? 0 : 1);
        dpl = ldst;
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
        {
          __m256 s1 = _mm256_loadu_ps(sph - 1);
          __m256 s2 = _mm256_loadu_ps(sph);
          __m256 d = _mm256_loadu_ps(dpl);
          s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
          d = _mm256_add_ps(d, s1);
          _mm256_store_ps(dpl, d);
        }

        // final scaling: L *= K_inv, H *= K
        float *dp = ldst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
        for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
        dp = hdst;
        factor = _mm256_set1_ps(LIFTING_FACTORS::K);
        for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
        {
          __m256 d = _mm256_load_ps(dp);
          _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
        }
      }
      else
      {
        // single-sample line: pass through to L if even, else doubled to H
        if (even)
          line_ldst->f32[0] = line_src->f32[0];
        else
          line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
      }
    }
205 
207  void avx_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
208  line_buf *line_hsrc, ui32 width,
209  bool even)
210  {
211  if (width > 1)
212  {
213  float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
214  float *dst = line_dst->f32;
215 
216  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
217  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
218 
219  //multipliers
220  float *dp = lsrc;
221  __m256 factor = _mm256_set1_ps(LIFTING_FACTORS::K);
222  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dp+=8)
223  {
224  __m256 d = _mm256_load_ps(dp);
225  _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
226  }
227  dp = hsrc;
228  factor = _mm256_set1_ps(LIFTING_FACTORS::K_inv);
229  for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dp+=8)
230  {
231  __m256 d = _mm256_load_ps(dp);
232  _mm256_store_ps(dp, _mm256_mul_ps(factor, d));
233  }
234 
235  //extension
236  hsrc[-1] = hsrc[0];
237  hsrc[H_width] = hsrc[H_width-1];
238  //inverse update
239  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[7]);
240  const float *sph = hsrc + (even ? 0 : 1);
241  float *dpl = lsrc;
242  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, dpl+=8)
243  {
244  __m256 s1 = _mm256_loadu_ps(sph - 1);
245  __m256 s2 = _mm256_loadu_ps(sph);
246  __m256 d = _mm256_loadu_ps(dpl);
247  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
248  d = _mm256_add_ps(d, s1);
249  _mm256_store_ps(dpl, d);
250  }
251 
252  //extension
253  lsrc[-1] = lsrc[0];
254  lsrc[L_width] = lsrc[L_width-1];
255  //inverse perdict
256  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[6]);
257  const float *spl = lsrc + (even ? 0 : -1);
258  float *dph = hsrc;
259  for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8, spl+=8)
260  {
261  __m256 s1 = _mm256_loadu_ps(spl);
262  __m256 s2 = _mm256_loadu_ps(spl + 1);
263  __m256 d = _mm256_loadu_ps(dph);
264  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
265  d = _mm256_add_ps(d, s1);
266  _mm256_store_ps(dph, d);
267  }
268 
269  //extension
270  hsrc[-1] = hsrc[0];
271  hsrc[H_width] = hsrc[H_width-1];
272  //inverse update
273  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[5]);
274  sph = hsrc + (even ? 0 : 1);
275  dpl = lsrc;
276  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, dpl+=8, sph+=8)
277  {
278  __m256 s1 = _mm256_loadu_ps(sph - 1);
279  __m256 s2 = _mm256_loadu_ps(sph);
280  __m256 d = _mm256_loadu_ps(dpl);
281  s1 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
282  d = _mm256_add_ps(d, s1);
283  _mm256_store_ps(dpl, d);
284  }
285 
286  //extension
287  lsrc[-1] = lsrc[0];
288  lsrc[L_width] = lsrc[L_width-1];
289  //inverse perdict and combine
290  factor = _mm256_set1_ps(LIFTING_FACTORS::steps[4]);
291  dp = dst + (even ? 0 : -1);
292  spl = lsrc + (even ? 0 : -1);
293  sph = hsrc;
294  ui32 width = L_width + (even ? 0 : 1);
295  for (ui32 i = (width + 7) >> 3; i > 0; --i, spl+=8, sph+=8)
296  {
297  __m256 s1 = _mm256_loadu_ps(spl);
298  __m256 s2 = _mm256_loadu_ps(spl + 1);
299  __m256 d = _mm256_load_ps(sph);
300  s2 = _mm256_mul_ps(factor, _mm256_add_ps(s1, s2));
301  d = _mm256_add_ps(d, s2);
302 
303  __m128 a0 = _mm256_extractf128_ps(s1, 0);
304  __m128 a1 = _mm256_extractf128_ps(s1, 1);
305  __m128 a2 = _mm256_extractf128_ps(d, 0);
306  __m128 a3 = _mm256_extractf128_ps(d, 1);
307  _mm_storeu_ps(dp, _mm_unpacklo_ps(a0, a2)); dp += 4;
308  _mm_storeu_ps(dp, _mm_unpackhi_ps(a0, a2)); dp += 4;
309  _mm_storeu_ps(dp, _mm_unpacklo_ps(a1, a3)); dp += 4;
310  _mm_storeu_ps(dp, _mm_unpackhi_ps(a1, a3)); dp += 4;
311 
312 // s2 = _mm256_unpackhi_ps(s1, d);
313 // s1 = _mm256_unpacklo_ps(s1, d);
314 // d = _mm256_permute2f128_ps(s1, s2, (2 << 4) | 0);
315 // _mm256_storeu_ps(dp, d);
316 // d = _mm256_permute2f128_ps(s1, s2, (3 << 4) | 1);
317 // _mm256_storeu_ps(dp + 1, d);
318  }
319  }
320  else
321  {
322  if (even)
323  line_dst->f32[0] = line_lsrc->f32[0];
324  else
325  line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
326  }
327  }
328  }
329 }
// Documentation index (symbols defined in this file):
//   void avx_irrev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
//   void avx_irrev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
//   void avx_irrev_vert_wvlt_K(const line_buf *line_src, line_buf *line_dst, bool L_analysis_or_H_synthesis, ui32 repeat)
//   void avx_irrev_vert_wvlt_step(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, int step_num, ui32 repeat)
// Related types: ui32 is uint32_t (ojph_defs.h:54); f32 is the float* member
// of line_buf (ojph_mem.h:156).