OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_sse.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #include <immintrin.h>
47 
48 namespace ojph {
49  namespace local {
50 
52  void sse_irrev_vert_wvlt_step(const line_buf* line_src1,
53  const line_buf* line_src2,
54  line_buf *line_dst,
55  int step_num, ui32 repeat)
56  {
57  float *dst = line_dst->f32;
58  const float *src1 = line_src1->f32, *src2 = line_src2->f32;
59 
60  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[step_num]);
61  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
62  {
63  __m128 s1 = _mm_load_ps(src1);
64  __m128 s2 = _mm_load_ps(src2);
65  __m128 d = _mm_load_ps(dst);
66  d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
67  _mm_store_ps(dst, d);
68  }
69  }
70 
72  void sse_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
73  bool L_analysis_or_H_synthesis, ui32 repeat)
74  {
75  float *dst = line_dst->f32;
76  const float *src = line_src->f32;
77 
78  float f = LIFTING_FACTORS::K_inv;
79  f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
80  __m128 factor = _mm_set1_ps(f);
81  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
82  {
83  __m128 s = _mm_load_ps(src);
84  _mm_store_ps(dst, _mm_mul_ps(factor, s));
85  }
86  }
87 
89  void sse_irrev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
90  line_buf *line_hdst, ui32 width,
91  bool even)
92  {
93  if (width > 1)
94  {
95  float *src = line_src->f32;
96  float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
97 
98  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
99  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
100 
101  //extension
102  src[-1] = src[1];
103  src[width] = src[width-2];
104  // predict
105  const float* sp = src + (even ? 1 : 0);
106  float *dph = hdst;
107  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[0]);
108  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
109  { //this is doing twice the work it needs to do
110  //it can be definitely written better
111  __m128 s1 = _mm_loadu_ps(sp - 1);
112  __m128 s2 = _mm_loadu_ps(sp + 1);
113  __m128 d = _mm_loadu_ps(sp);
114  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
115  __m128 d1 = _mm_add_ps(d, s1);
116  sp += 4;
117  s1 = _mm_loadu_ps(sp - 1);
118  s2 = _mm_loadu_ps(sp + 1);
119  d = _mm_loadu_ps(sp);
120  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
121  __m128 d2 = _mm_add_ps(d, s1);
122  sp += 4;
123  d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
124  _mm_store_ps(dph, d);
125  }
126 
127  // extension
128  hdst[-1] = hdst[0];
129  hdst[H_width] = hdst[H_width-1];
130  // update
131  factor = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
132  sp = src + (even ? 0 : 1);
133  const float* sph = hdst + (even ? 0 : 1);
134  float *dpl = ldst;
135  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
136  {
137  __m128 s1 = _mm_loadu_ps(sph - 1);
138  __m128 s2 = _mm_loadu_ps(sph);
139  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
140  __m128 d1 = _mm_loadu_ps(sp);
141  __m128 d2 = _mm_loadu_ps(sp + 4);
142  __m128 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
143  d = _mm_add_ps(d, s1);
144  _mm_store_ps(dpl, d);
145  }
146 
147  //extension
148  ldst[-1] = ldst[0];
149  ldst[L_width] = ldst[L_width-1];
150  //predict
151  factor = _mm_set1_ps(LIFTING_FACTORS::steps[2]);
152  const float* spl = ldst + (even ? 1 : 0);
153  dph = hdst;
154  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
155  {
156  __m128 s1 = _mm_loadu_ps(spl - 1);
157  __m128 s2 = _mm_loadu_ps(spl);
158  __m128 d = _mm_loadu_ps(dph);
159  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
160  d = _mm_add_ps(d, s1);
161  _mm_store_ps(dph, d);
162  }
163 
164  // extension
165  hdst[-1] = hdst[0];
166  hdst[H_width] = hdst[H_width-1];
167  // update
168  factor = _mm_set1_ps(LIFTING_FACTORS::steps[3]);
169  sph = hdst + (even ? 0 : 1);
170  dpl = ldst;
171  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
172  {
173  __m128 s1 = _mm_loadu_ps(sph - 1);
174  __m128 s2 = _mm_loadu_ps(sph);
175  __m128 d = _mm_loadu_ps(dpl);
176  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
177  d = _mm_add_ps(d, s1);
178  _mm_store_ps(dpl, d);
179  }
180 
181  //multipliers
182  float *dp = ldst;
183  factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
184  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
185  {
186  __m128 d = _mm_load_ps(dp);
187  _mm_store_ps(dp, _mm_mul_ps(factor, d));
188  }
189  dp = hdst;
190  factor = _mm_set1_ps(LIFTING_FACTORS::K);
191  for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
192  {
193  __m128 d = _mm_load_ps(dp);
194  _mm_store_ps(dp, _mm_mul_ps(factor, d));
195  }
196  }
197  else
198  {
199  if (even)
200  line_ldst->f32[0] = line_src->f32[0];
201  else
202  line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
203  }
204  }
205 
207  void sse_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
208  line_buf *line_hsrc, ui32 width,
209  bool even)
210  {
211  if (width > 1)
212  {
213  float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
214  float *dst = line_dst->f32;
215 
216  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
217  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
218 
219  //multipliers
220  float *dp = lsrc;
221  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::K);
222  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
223  {
224  __m128 d = _mm_load_ps(dp);
225  _mm_store_ps(dp, _mm_mul_ps(factor, d));
226  }
227  dp = hsrc;
228  factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
229  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
230  {
231  __m128 d = _mm_load_ps(dp);
232  _mm_store_ps(dp, _mm_mul_ps(factor, d));
233  }
234 
235  //extension
236  hsrc[-1] = hsrc[0];
237  hsrc[H_width] = hsrc[H_width-1];
238  //inverse update
239  factor = _mm_set1_ps(LIFTING_FACTORS::steps[7]);
240  const float *sph = hsrc + (even ? 0 : 1);
241  float *dpl = lsrc;
242  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
243  {
244  __m128 s1 = _mm_loadu_ps(sph - 1);
245  __m128 s2 = _mm_loadu_ps(sph);
246  __m128 d = _mm_loadu_ps(dpl);
247  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
248  d = _mm_add_ps(d, s1);
249  _mm_store_ps(dpl, d);
250  }
251 
252  //extension
253  lsrc[-1] = lsrc[0];
254  lsrc[L_width] = lsrc[L_width-1];
255  //inverse perdict
256  factor = _mm_set1_ps(LIFTING_FACTORS::steps[6]);
257  const float *spl = lsrc + (even ? 0 : -1);
258  float *dph = hsrc;
259  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
260  {
261  __m128 s1 = _mm_loadu_ps(spl);
262  __m128 s2 = _mm_loadu_ps(spl + 1);
263  __m128 d = _mm_loadu_ps(dph);
264  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
265  d = _mm_add_ps(d, s1);
266  _mm_store_ps(dph, d);
267  }
268 
269  //extension
270  hsrc[-1] = hsrc[0];
271  hsrc[H_width] = hsrc[H_width-1];
272  //inverse update
273  factor = _mm_set1_ps(LIFTING_FACTORS::steps[5]);
274  sph = hsrc + (even ? 0 : 1);
275  dpl = lsrc;
276  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
277  {
278  __m128 s1 = _mm_loadu_ps(sph - 1);
279  __m128 s2 = _mm_loadu_ps(sph);
280  __m128 d = _mm_loadu_ps(dpl);
281  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
282  d = _mm_add_ps(d, s1);
283  _mm_store_ps(dpl, d);
284  }
285 
286  //extension
287  lsrc[-1] = lsrc[0];
288  lsrc[L_width] = lsrc[L_width-1];
289  //inverse perdict and combine
290  factor = _mm_set1_ps(LIFTING_FACTORS::steps[4]);
291  dp = dst + (even ? 0 : -1);
292  spl = lsrc + (even ? 0 : -1);
293  sph = hsrc;
294  ui32 width = L_width + (even ? 0 : 1);
295  for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
296  {
297  __m128 s1 = _mm_loadu_ps(spl);
298  __m128 s2 = _mm_loadu_ps(spl + 1);
299  __m128 d = _mm_load_ps(sph);
300  s2 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
301  d = _mm_add_ps(d, s2);
302  _mm_storeu_ps(dp, _mm_unpacklo_ps(s1, d));
303  _mm_storeu_ps(dp + 4, _mm_unpackhi_ps(s1, d));
304  }
305  }
306  else
307  {
308  if (even)
309  line_dst->f32[0] = line_lsrc->f32[0];
310  else
311  line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
312  }
313  }
314  }
315 }
void sse_irrev_horz_wvlt_bwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void sse_irrev_vert_wvlt_step(const line_buf *src1, const line_buf *src2, line_buf *dst, int step_num, ui32 repeat)
uint32_t ui32
Definition: ojph_defs.h:54
float * f32
Definition: ojph_mem.h:156