OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_avx2.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_avx2.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #ifdef OJPH_COMPILER_MSVC
47 #include <intrin.h>
48 #else
49 #include <x86intrin.h>
50 #endif
51 
52 namespace ojph {
53  namespace local {
54 
57  const line_buf* line_src2,
58  line_buf *line_dst, ui32 repeat)
59  {
60  si32 *dst = line_dst->i32;
61  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
62 
63  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
64  {
65  __m256i s1 = _mm256_load_si256((__m256i*)src1);
66  __m256i s2 = _mm256_load_si256((__m256i*)src2);
67  __m256i d = _mm256_load_si256((__m256i*)dst);
68  s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
69  d = _mm256_sub_epi32(d, s1);
70  _mm256_store_si256((__m256i*)dst, d);
71  }
72  }
73 
75  void avx2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
76  const line_buf* line_src2,
77  line_buf *line_dst, ui32 repeat)
78  {
79  si32 *dst = line_dst->i32;
80  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
81 
82  __m256i offset = _mm256_set1_epi32(2);
83  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
84  {
85  __m256i s1 = _mm256_load_si256((__m256i*)src1);
86  s1 = _mm256_add_epi32(s1, offset);
87  __m256i s2 = _mm256_load_si256((__m256i*)src2);
88  s2 = _mm256_add_epi32(s2, s1);
89  __m256i d = _mm256_load_si256((__m256i*)dst);
90  d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
91  _mm256_store_si256((__m256i*)dst, d);
92  }
93  }
94 
96  void avx2_rev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
97  line_buf *line_hdst,ui32 width, bool even)
98  {
99  if (width > 1)
100  {
101  si32 *src = line_src->i32;
102  si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
103 
104  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
105  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
106 
107  // extension
108  src[-1] = src[1];
109  src[width] = src[width-2];
110  // predict
111  const si32* sp = src + (even ? 1 : 0);
112  si32 *dph = hdst;
113  const __m256i mask = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
114  for (ui32 i = (H_width + 7) >> 3; i > 0; --i, dph+=8)
115  { //this is doing twice the work it needs to do
116  //it can be definitely written better
117  __m256i s1 = _mm256_loadu_si256((__m256i*)(sp-1));
118  __m256i s2 = _mm256_loadu_si256((__m256i*)(sp+1));
119  __m256i d = _mm256_loadu_si256((__m256i*)sp);
120  s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
121  __m256i d1 = _mm256_sub_epi32(d, s1);
122  sp += 8;
123  s1 = _mm256_loadu_si256((__m256i*)(sp-1));
124  s2 = _mm256_loadu_si256((__m256i*)(sp+1));
125  d = _mm256_loadu_si256((__m256i*)sp);
126  s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
127  __m256i d2 = _mm256_sub_epi32(d, s1);
128  sp += 8;
129  d1 = _mm256_permutevar8x32_epi32(d1, mask);
130  d2 = _mm256_permutevar8x32_epi32(d2, mask);
131  d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
132  _mm256_store_si256((__m256i*)dph, d);
133  }
134 
135  // extension
136  hdst[-1] = hdst[0];
137  hdst[H_width] = hdst[H_width-1];
138  // update
139  sp = src + (even ? 0 : 1);
140  const si32* sph = hdst + (even ? 0 : 1);
141  si32 *dpl = ldst;
142  __m256i offset = _mm256_set1_epi32(2);
143  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sp+=16, sph+=8, dpl+=8)
144  {
145  __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
146  s1 = _mm256_add_epi32(s1, offset);
147  __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
148  s2 = _mm256_add_epi32(s2, s1);
149  __m256i d1 = _mm256_loadu_si256((__m256i*)sp);
150  __m256i d2 = _mm256_loadu_si256((__m256i*)sp + 1);
151  d1 = _mm256_permutevar8x32_epi32(d1, mask);
152  d2 = _mm256_permutevar8x32_epi32(d2, mask);
153  __m256i d = _mm256_permute2x128_si256(d1, d2, (2 << 4) | 0);
154  d = _mm256_add_epi32(d, _mm256_srai_epi32(s2, 2));
155  _mm256_store_si256((__m256i*)dpl, d);
156  }
157  }
158  else
159  {
160  if (even)
161  line_ldst->i32[0] = line_src->i32[0];
162  else
163  line_hdst->i32[0] = line_src->i32[0] << 1;
164  }
165  }
166 
169  const line_buf* line_src2,
170  line_buf *line_dst, ui32 repeat)
171  {
172  si32 *dst = line_dst->i32;
173  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
174 
175  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
176  {
177  __m256i s1 = _mm256_load_si256((__m256i*)src1);
178  __m256i s2 = _mm256_load_si256((__m256i*)src2);
179  __m256i d = _mm256_load_si256((__m256i*)dst);
180  s1 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
181  d = _mm256_add_epi32(d, s1);
182  _mm256_store_si256((__m256i*)dst, d);
183  }
184  }
185 
188  const line_buf* line_src2,
189  line_buf *line_dst, ui32 repeat)
190  {
191  si32 *dst = line_dst->i32;
192  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
193 
194  __m256i offset = _mm256_set1_epi32(2);
195  for (ui32 i = (repeat + 7) >> 3; i > 0; --i, dst+=8, src1+=8, src2+=8)
196  {
197  __m256i s1 = _mm256_load_si256((__m256i*)src1);
198  s1 = _mm256_add_epi32(s1, offset);
199  __m256i s2 = _mm256_load_si256((__m256i*)src2);
200  s2 = _mm256_add_epi32(s2, s1);
201  __m256i d = _mm256_load_si256((__m256i*)dst);
202  d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
203  _mm256_store_si256((__m256i*)dst, d);
204  }
205  }
206 
208  void avx2_rev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
209  line_buf *line_hsrc, ui32 width, bool even)
210  {
211  if (width > 1)
212  {
213  si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
214  si32 *dst = line_dst->i32;
215 
216  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
217  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
218 
219  // extension
220  hsrc[-1] = hsrc[0];
221  hsrc[H_width] = hsrc[H_width-1];
222  //inverse update
223  const si32 *sph = hsrc + (even ? 0 : 1);
224  si32 *spl = lsrc;
225  __m256i offset = _mm256_set1_epi32(2);
226  for (ui32 i = (L_width + 7) >> 3; i > 0; --i, sph+=8, spl+=8)
227  {
228  __m256i s1 = _mm256_loadu_si256((__m256i*)(sph-1));
229  s1 = _mm256_add_epi32(s1, offset);
230  __m256i s2 = _mm256_loadu_si256((__m256i*)sph);
231  s2 = _mm256_add_epi32(s2, s1);
232  __m256i d = _mm256_load_si256((__m256i*)spl);
233  d = _mm256_sub_epi32(d, _mm256_srai_epi32(s2, 2));
234  _mm256_store_si256((__m256i*)spl, d);
235  }
236 
237  // extension
238  lsrc[-1] = lsrc[0];
239  lsrc[L_width] = lsrc[L_width - 1];
240  // inverse predict and combine
241  si32 *dp = dst + (even ? 0 : -1);
242  spl = lsrc + (even ? 0 : -1);
243  sph = hsrc;
244  ui32 width = L_width + (even ? 0 : 1);
245  for (ui32 i = (width + 7) >> 3; i > 0; --i, sph+=8, spl+=8, dp+=16)
246  {
247  __m256i s1 = _mm256_loadu_si256((__m256i*)spl);
248  __m256i s2 = _mm256_loadu_si256((__m256i*)(spl+1));
249  __m256i d = _mm256_load_si256((__m256i*)sph);
250  s2 = _mm256_srai_epi32(_mm256_add_epi32(s1, s2), 1);
251  d = _mm256_add_epi32(d, s2);
252  s2 = _mm256_unpackhi_epi32(s1, d);
253  s1 = _mm256_unpacklo_epi32(s1, d);
254  d = _mm256_permute2x128_si256(s1, s2, (2 << 4) | 0);
255  _mm256_storeu_si256((__m256i*)dp, d);
256  d = _mm256_permute2x128_si256(s1, s2, (3 << 4) | 1);
257  _mm256_storeu_si256((__m256i*)dp + 1, d);
258  }
259  }
260  else
261  {
262  if (even)
263  line_dst->i32[0] = line_lsrc->i32[0];
264  else
265  line_dst->i32[0] = line_hsrc->i32[0] >> 1;
266  }
267  }
268  }
269 }
void avx2_rev_vert_wvlt_fwd_update(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void avx2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst, line_buf *line_hdst, ui32 width, bool even)
void avx2_rev_vert_wvlt_fwd_predict(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void avx2_rev_vert_wvlt_bwd_predict(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
void avx2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc, line_buf *line_hsrc, ui32 width, bool even)
void avx2_rev_vert_wvlt_bwd_update(const line_buf *line_src1, const line_buf *line_src2, line_buf *line_dst, ui32 repeat)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
si32 * i32
Definition: ojph_mem.h:155