OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_sse.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #ifdef OJPH_COMPILER_MSVC
47 #include <intrin.h>
48 #else
49 #include <x86intrin.h>
50 #endif
51 
52 namespace ojph {
53  namespace local {
54 
56  void sse_irrev_vert_wvlt_step(const line_buf* line_src1,
57  const line_buf* line_src2,
58  line_buf *line_dst,
59  int step_num, ui32 repeat)
60  {
61  float *dst = line_dst->f32;
62  const float *src1 = line_src1->f32, *src2 = line_src2->f32;
63 
64  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[step_num]);
65  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
66  {
67  __m128 s1 = _mm_load_ps(src1);
68  __m128 s2 = _mm_load_ps(src2);
69  __m128 d = _mm_load_ps(dst);
70  d = _mm_add_ps(d, _mm_mul_ps(factor, _mm_add_ps(s1, s2)));
71  _mm_store_ps(dst, d);
72  }
73  }
74 
76  void sse_irrev_vert_wvlt_K(const line_buf* line_src, line_buf* line_dst,
77  bool L_analysis_or_H_synthesis, ui32 repeat)
78  {
79  float *dst = line_dst->f32;
80  const float *src = line_src->f32;
81 
82  float f = LIFTING_FACTORS::K_inv;
83  f = L_analysis_or_H_synthesis ? f : LIFTING_FACTORS::K;
84  __m128 factor = _mm_set1_ps(f);
85  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src+=4)
86  {
87  __m128 s = _mm_load_ps(src);
88  _mm_store_ps(dst, _mm_mul_ps(factor, s));
89  }
90  }
91 
93  void sse_irrev_horz_wvlt_fwd_tx(line_buf* line_src, line_buf *line_ldst,
94  line_buf *line_hdst, ui32 width,
95  bool even)
96  {
97  if (width > 1)
98  {
99  float *src = line_src->f32;
100  float *ldst = line_ldst->f32, *hdst = line_hdst->f32;
101 
102  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
103  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
104 
105  //extension
106  src[-1] = src[1];
107  src[width] = src[width-2];
108  // predict
109  const float* sp = src + (even ? 1 : 0);
110  float *dph = hdst;
111  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::steps[0]);
112  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
113  { //this is doing twice the work it needs to do
114  //it can be definitely written better
115  __m128 s1 = _mm_loadu_ps(sp - 1);
116  __m128 s2 = _mm_loadu_ps(sp + 1);
117  __m128 d = _mm_loadu_ps(sp);
118  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
119  __m128 d1 = _mm_add_ps(d, s1);
120  sp += 4;
121  s1 = _mm_loadu_ps(sp - 1);
122  s2 = _mm_loadu_ps(sp + 1);
123  d = _mm_loadu_ps(sp);
124  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
125  __m128 d2 = _mm_add_ps(d, s1);
126  sp += 4;
127  d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
128  _mm_store_ps(dph, d);
129  }
130 
131  // extension
132  hdst[-1] = hdst[0];
133  hdst[H_width] = hdst[H_width-1];
134  // update
135  factor = _mm_set1_ps(LIFTING_FACTORS::steps[1]);
136  sp = src + (even ? 0 : 1);
137  const float* sph = hdst + (even ? 0 : 1);
138  float *dpl = ldst;
139  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
140  {
141  __m128 s1 = _mm_loadu_ps(sph - 1);
142  __m128 s2 = _mm_loadu_ps(sph);
143  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
144  __m128 d1 = _mm_loadu_ps(sp);
145  __m128 d2 = _mm_loadu_ps(sp + 4);
146  __m128 d = _mm_shuffle_ps(d1, d2, _MM_SHUFFLE(2, 0, 2, 0));
147  d = _mm_add_ps(d, s1);
148  _mm_store_ps(dpl, d);
149  }
150 
151  //extension
152  ldst[-1] = ldst[0];
153  ldst[L_width] = ldst[L_width-1];
154  //predict
155  factor = _mm_set1_ps(LIFTING_FACTORS::steps[2]);
156  const float* spl = ldst + (even ? 1 : 0);
157  dph = hdst;
158  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, spl+=4, dph+=4)
159  {
160  __m128 s1 = _mm_loadu_ps(spl - 1);
161  __m128 s2 = _mm_loadu_ps(spl);
162  __m128 d = _mm_loadu_ps(dph);
163  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
164  d = _mm_add_ps(d, s1);
165  _mm_store_ps(dph, d);
166  }
167 
168  // extension
169  hdst[-1] = hdst[0];
170  hdst[H_width] = hdst[H_width-1];
171  // update
172  factor = _mm_set1_ps(LIFTING_FACTORS::steps[3]);
173  sph = hdst + (even ? 0 : 1);
174  dpl = ldst;
175  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, dpl+=4)
176  {
177  __m128 s1 = _mm_loadu_ps(sph - 1);
178  __m128 s2 = _mm_loadu_ps(sph);
179  __m128 d = _mm_loadu_ps(dpl);
180  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
181  d = _mm_add_ps(d, s1);
182  _mm_store_ps(dpl, d);
183  }
184 
185  //multipliers
186  float *dp = ldst;
187  factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
188  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
189  {
190  __m128 d = _mm_load_ps(dp);
191  _mm_store_ps(dp, _mm_mul_ps(factor, d));
192  }
193  dp = hdst;
194  factor = _mm_set1_ps(LIFTING_FACTORS::K);
195  for (int i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
196  {
197  __m128 d = _mm_load_ps(dp);
198  _mm_store_ps(dp, _mm_mul_ps(factor, d));
199  }
200  }
201  else
202  {
203  if (even)
204  line_ldst->f32[0] = line_src->f32[0];
205  else
206  line_hdst->f32[0] = line_src->f32[0] + line_src->f32[0];
207  }
208  }
209 
211  void sse_irrev_horz_wvlt_bwd_tx(line_buf* line_dst, line_buf *line_lsrc,
212  line_buf *line_hsrc, ui32 width,
213  bool even)
214  {
215  if (width > 1)
216  {
217  float *lsrc = line_lsrc->f32, *hsrc = line_hsrc->f32;
218  float *dst = line_dst->f32;
219 
220  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
221  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
222 
223  //multipliers
224  float *dp = lsrc;
225  __m128 factor = _mm_set1_ps(LIFTING_FACTORS::K);
226  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dp+=4)
227  {
228  __m128 d = _mm_load_ps(dp);
229  _mm_store_ps(dp, _mm_mul_ps(factor, d));
230  }
231  dp = hsrc;
232  factor = _mm_set1_ps(LIFTING_FACTORS::K_inv);
233  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dp+=4)
234  {
235  __m128 d = _mm_load_ps(dp);
236  _mm_store_ps(dp, _mm_mul_ps(factor, d));
237  }
238 
239  //extension
240  hsrc[-1] = hsrc[0];
241  hsrc[H_width] = hsrc[H_width-1];
242  //inverse update
243  factor = _mm_set1_ps(LIFTING_FACTORS::steps[7]);
244  const float *sph = hsrc + (even ? 0 : 1);
245  float *dpl = lsrc;
246  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
247  {
248  __m128 s1 = _mm_loadu_ps(sph - 1);
249  __m128 s2 = _mm_loadu_ps(sph);
250  __m128 d = _mm_loadu_ps(dpl);
251  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
252  d = _mm_add_ps(d, s1);
253  _mm_store_ps(dpl, d);
254  }
255 
256  //extension
257  lsrc[-1] = lsrc[0];
258  lsrc[L_width] = lsrc[L_width-1];
259  //inverse perdict
260  factor = _mm_set1_ps(LIFTING_FACTORS::steps[6]);
261  const float *spl = lsrc + (even ? 0 : -1);
262  float *dph = hsrc;
263  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4, spl+=4)
264  {
265  __m128 s1 = _mm_loadu_ps(spl);
266  __m128 s2 = _mm_loadu_ps(spl + 1);
267  __m128 d = _mm_loadu_ps(dph);
268  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
269  d = _mm_add_ps(d, s1);
270  _mm_store_ps(dph, d);
271  }
272 
273  //extension
274  hsrc[-1] = hsrc[0];
275  hsrc[H_width] = hsrc[H_width-1];
276  //inverse update
277  factor = _mm_set1_ps(LIFTING_FACTORS::steps[5]);
278  sph = hsrc + (even ? 0 : 1);
279  dpl = lsrc;
280  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, dpl+=4, sph+=4)
281  {
282  __m128 s1 = _mm_loadu_ps(sph - 1);
283  __m128 s2 = _mm_loadu_ps(sph);
284  __m128 d = _mm_loadu_ps(dpl);
285  s1 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
286  d = _mm_add_ps(d, s1);
287  _mm_store_ps(dpl, d);
288  }
289 
290  //extension
291  lsrc[-1] = lsrc[0];
292  lsrc[L_width] = lsrc[L_width-1];
293  //inverse perdict and combine
294  factor = _mm_set1_ps(LIFTING_FACTORS::steps[4]);
295  dp = dst + (even ? 0 : -1);
296  spl = lsrc + (even ? 0 : -1);
297  sph = hsrc;
298  ui32 width = L_width + (even ? 0 : 1);
299  for (ui32 i = (width + 3) >> 2; i > 0; --i, spl+=4, sph+=4, dp+=8)
300  {
301  __m128 s1 = _mm_loadu_ps(spl);
302  __m128 s2 = _mm_loadu_ps(spl + 1);
303  __m128 d = _mm_load_ps(sph);
304  s2 = _mm_mul_ps(factor, _mm_add_ps(s1, s2));
305  d = _mm_add_ps(d, s2);
306  _mm_storeu_ps(dp, _mm_unpacklo_ps(s1, d));
307  _mm_storeu_ps(dp + 4, _mm_unpackhi_ps(s1, d));
308  }
309  }
310  else
311  {
312  if (even)
313  line_dst->f32[0] = line_lsrc->f32[0];
314  else
315  line_dst->f32[0] = line_hsrc->f32[0] * 0.5f;
316  }
317  }
318  }
319 }
void sse_irrev_horz_wvlt_bwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse_irrev_vert_wvlt_K(const line_buf *src, line_buf *dst, bool L_analysis_or_H_synthesis, ui32 repeat)
void sse_irrev_vert_wvlt_step(const line_buf *src1, const line_buf *src2, line_buf *dst, int step_num, ui32 repeat)
uint32_t ui32
Definition: ojph_defs.h:54
float * f32
Definition: ojph_mem.h:156