OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_transform_sse2.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_transform_sse2.cpp
34 // Author: Aous Naman
35 // Date: 28 August 2019
36 //***************************************************************************/
37 
38 #include <cstdio>
39 
40 #include "ojph_defs.h"
41 #include "ojph_arch.h"
42 #include "ojph_mem.h"
43 #include "ojph_transform.h"
44 #include "ojph_transform_local.h"
45 
46 #include <immintrin.h>
47 
48 namespace ojph {
49  namespace local {
50 
53  const line_buf* line_src2,
54  line_buf *line_dst, ui32 repeat)
55  {
56  si32 *dst = line_dst->i32;
57  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
58 
59  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
60  {
61  __m128i s1 = _mm_load_si128((__m128i*)src1);
62  __m128i s2 = _mm_load_si128((__m128i*)src2);
63  __m128i d = _mm_load_si128((__m128i*)dst);
64  s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
65  d = _mm_sub_epi32(d, s1);
66  _mm_store_si128((__m128i*)dst, d);
67  }
68  }
69 
71  void sse2_rev_vert_wvlt_fwd_update(const line_buf* line_src1,
72  const line_buf* line_src2,
73  line_buf *line_dst, ui32 repeat)
74  {
75  si32 *dst = line_dst->i32;
76  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
77 
78  __m128i offset = _mm_set1_epi32(2);
79  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
80  {
81  __m128i s1 = _mm_load_si128((__m128i*)src1);
82  s1 = _mm_add_epi32(s1, offset);
83  __m128i s2 = _mm_load_si128((__m128i*)src2);
84  s2 = _mm_add_epi32(s2, s1);
85  __m128i d = _mm_load_si128((__m128i*)dst);
86  d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
87  _mm_store_si128((__m128i*)dst, d);
88  }
89  }
90 
92  void sse2_rev_horz_wvlt_fwd_tx(line_buf *line_src, line_buf *line_ldst,
93  line_buf *line_hdst, ui32 width, bool even)
94  {
95  if (width > 1)
96  {
97  si32 *src = line_src->i32;
98  si32 *ldst = line_ldst->i32, *hdst = line_hdst->i32;
99 
100  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
101  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
102 
103  // extension
104  src[-1] = src[1];
105  src[width] = src[width-2];
106  // predict
107  const si32* sp = src + (even ? 1 : 0);
108  si32 *dph = hdst;
109  for (ui32 i = (H_width + 3) >> 2; i > 0; --i, dph+=4)
110  { //this is doing twice the work it needs to do
111  //it can be definitely written better
112  __m128i s1 = _mm_loadu_si128((__m128i*)(sp-1));
113  __m128i s2 = _mm_loadu_si128((__m128i*)(sp+1));
114  __m128i d = _mm_loadu_si128((__m128i*)sp);
115  s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
116  __m128i d1 = _mm_sub_epi32(d, s1);
117  sp += 4;
118  s1 = _mm_loadu_si128((__m128i*)(sp-1));
119  s2 = _mm_loadu_si128((__m128i*)(sp+1));
120  d = _mm_loadu_si128((__m128i*)sp);
121  s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
122  __m128i d2 = _mm_sub_epi32(d, s1);
123  sp += 4;
124  d = _mm_castps_si128(_mm_shuffle_ps(
125  _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
126  _mm_store_si128((__m128i*)dph, d);
127  }
128 
129  // extension
130  hdst[-1] = hdst[0];
131  hdst[H_width] = hdst[H_width-1];
132  // update
133  sp = src + (even ? 0 : 1);
134  const si32* sph = hdst + (even ? 0 : 1);
135  si32 *dpl = ldst;
136  __m128i offset = _mm_set1_epi32(2);
137  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sp+=8, sph+=4, dpl+=4)
138  {
139  __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
140  s1 = _mm_add_epi32(s1, offset);
141  __m128i s2 = _mm_loadu_si128((__m128i*)sph);
142  s2 = _mm_add_epi32(s2, s1);
143  __m128i d1 = _mm_loadu_si128((__m128i*)sp);
144  __m128i d2 = _mm_loadu_si128((__m128i*)sp + 1);
145  __m128i d = _mm_castps_si128(_mm_shuffle_ps(
146  _mm_castsi128_ps(d1), _mm_castsi128_ps(d2), 0x88));
147  d = _mm_add_epi32(d, _mm_srai_epi32(s2, 2));
148  _mm_store_si128((__m128i*)dpl, d);
149  }
150  }
151  else
152  {
153  if (even)
154  line_ldst->i32[0] = line_src->i32[0];
155  else
156  line_hdst->i32[0] = line_src->i32[0] << 1;
157  }
158  }
159 
162  const line_buf* line_src2,
163  line_buf *line_dst, ui32 repeat)
164  {
165  si32 *dst = line_dst->i32;
166  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
167 
168  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
169  {
170  __m128i s1 = _mm_load_si128((__m128i*)src1);
171  __m128i s2 = _mm_load_si128((__m128i*)src2);
172  __m128i d = _mm_load_si128((__m128i*)dst);
173  s1 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
174  d = _mm_add_epi32(d, s1);
175  _mm_store_si128((__m128i*)dst, d);
176  }
177  }
178 
181  const line_buf* line_src2,
182  line_buf *line_dst, ui32 repeat)
183  {
184  si32 *dst = line_dst->i32;
185  const si32 *src1 = line_src1->i32, *src2 = line_src2->i32;
186 
187  __m128i offset = _mm_set1_epi32(2);
188  for (ui32 i = (repeat + 3) >> 2; i > 0; --i, dst+=4, src1+=4, src2+=4)
189  {
190  __m128i s1 = _mm_load_si128((__m128i*)src1);
191  s1 = _mm_add_epi32(s1, offset);
192  __m128i s2 = _mm_load_si128((__m128i*)src2);
193  s2 = _mm_add_epi32(s2, s1);
194  __m128i d = _mm_load_si128((__m128i*)dst);
195  d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
196  _mm_store_si128((__m128i*)dst, d);
197  }
198  }
199 
201  void sse2_rev_horz_wvlt_bwd_tx(line_buf *line_dst, line_buf *line_lsrc,
202  line_buf *line_hsrc, ui32 width, bool even)
203  {
204  if (width > 1)
205  {
206  si32 *lsrc = line_lsrc->i32, *hsrc = line_hsrc->i32;
207  si32 *dst = line_dst->i32;
208 
209  const ui32 L_width = (width + (even ? 1 : 0)) >> 1;
210  const ui32 H_width = (width + (even ? 0 : 1)) >> 1;
211 
212  // extension
213  hsrc[-1] = hsrc[0];
214  hsrc[H_width] = hsrc[H_width-1];
215  //inverse update
216  const si32 *sph = hsrc + (even ? 0 : 1);
217  si32 *spl = lsrc;
218  __m128i offset = _mm_set1_epi32(2);
219  for (ui32 i = (L_width + 3) >> 2; i > 0; --i, sph+=4, spl+=4)
220  {
221  __m128i s1 = _mm_loadu_si128((__m128i*)(sph-1));
222  s1 = _mm_add_epi32(s1, offset);
223  __m128i s2 = _mm_loadu_si128((__m128i*)sph);
224  s2 = _mm_add_epi32(s2, s1);
225  __m128i d = _mm_load_si128((__m128i*)spl);
226  d = _mm_sub_epi32(d, _mm_srai_epi32(s2, 2));
227  _mm_store_si128((__m128i*)spl, d);
228  }
229 
230  // extension
231  lsrc[-1] = lsrc[0];
232  lsrc[L_width] = lsrc[L_width - 1];
233  // inverse predict and combine
234  si32 *dp = dst + (even ? 0 : -1);
235  spl = lsrc + (even ? 0 : -1);
236  sph = hsrc;
237  ui32 width = L_width + (even ? 0 : 1);
238  for (ui32 i = (width + 3) >> 2; i > 0; --i, sph+=4, spl+=4, dp+=8)
239  {
240  __m128i s1 = _mm_loadu_si128((__m128i*)spl);
241  __m128i s2 = _mm_loadu_si128((__m128i*)(spl+1));
242  __m128i d = _mm_load_si128((__m128i*)sph);
243  s2 = _mm_srai_epi32(_mm_add_epi32(s1, s2), 1);
244  d = _mm_add_epi32(d, s2);
245  _mm_storeu_si128((__m128i*)dp, _mm_unpacklo_epi32(s1, d));
246  _mm_storeu_si128((__m128i*)dp + 1, _mm_unpackhi_epi32(s1, d));
247  }
248  }
249  else
250  {
251  if (even)
252  line_dst->i32[0] = line_lsrc->i32[0];
253  else
254  line_dst->i32[0] = line_hsrc->i32[0] >> 1;
255  }
256  }
257  }
258 }
void sse2_rev_horz_wvlt_fwd_tx(line_buf *src, line_buf *ldst, line_buf *hdst, ui32 width, bool even)
void sse2_rev_vert_wvlt_fwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_horz_wvlt_bwd_tx(line_buf *dst, line_buf *lsrc, line_buf *hsrc, ui32 width, bool even)
void sse2_rev_vert_wvlt_bwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_fwd_predict(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
void sse2_rev_vert_wvlt_bwd_update(const line_buf *src1, const line_buf *src2, line_buf *dst, ui32 repeat)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
si32 * i32
Definition: ojph_mem.h:155