OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_img_io_avx2.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_img_io_avx2.cpp
34 // Author: Aous Naman
35 // Date: 23 May 2022
36 //***************************************************************************/
37 
38 
39 #include <cstdlib>
40 #include <cstring>
41 #include <immintrin.h>
42 
43 #include "ojph_file.h"
44 #include "ojph_img_io.h"
45 #include "ojph_mem.h"
46 #include "ojph_message.h"
47 
48 namespace ojph {
49 
51  static
52  ui16 be2le(const ui16 v)
53  {
54  return (ui16)((v<<8) | (v>>8));
55  }
56 
58  void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
59  const line_buf *ln2, void *dp,
60  int bit_depth, int count)
61  {
62  ojph_unused(ln1);
63  ojph_unused(ln2);
64 
65  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
66  __m256i zero = _mm256_setzero_si256();
67  __m256i mask = _mm256_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400,
68  0x0F0B07030E0A0602, 0x0D0905010C080400);
69  const si32 *sp = ln0->i32;
70  ui8* p = (ui8 *)dp;
71 
72  // 32 bytes or entries in each loop
73  for ( ; count >= 32; count -= 32, sp += 32, p += 32)
74  {
75  __m256i a, t, u, v0, v1;
76  a = _mm256_load_si256((__m256i*)sp);
77  a = _mm256_max_epi32(a, zero);
78  t = _mm256_min_epi32(a, max_val_vec);
79 
80  a = _mm256_load_si256((__m256i*)sp + 1);
81  a = _mm256_max_epi32(a, zero);
82  a = _mm256_min_epi32(a, max_val_vec);
83  a = _mm256_slli_epi32(a, 16);
84  t = _mm256_or_si256(t, a);
85 
86  a = _mm256_load_si256((__m256i*)sp + 2);
87  a = _mm256_max_epi32(a, zero);
88  u = _mm256_min_epi32(a, max_val_vec);
89 
90  a = _mm256_load_si256((__m256i*)sp + 3);
91  a = _mm256_max_epi32(a, zero);
92  a = _mm256_min_epi32(a, max_val_vec);
93  a = _mm256_slli_epi32(a, 16);
94  u = _mm256_or_si256(u, a);
95 
96  v0 = _mm256_permute2x128_si256(t, u, 0x20);
97  v1 = _mm256_permute2x128_si256(t, u, 0x31);
98  v1 = _mm256_slli_epi32(v1, 8);
99  v0 = _mm256_or_si256(v0, v1);
100 
101  v0 = _mm256_shuffle_epi8(v0, mask);
102  _mm256_storeu_si256((__m256i*)p, v0);
103  }
104 
105  int max_val = (1 << bit_depth) - 1;
106  for ( ; count > 0; --count)
107  {
108  int val = *sp++;
109  val = val >= 0 ? val : 0;
110  val = val <= max_val ? val : max_val;
111  *p++ = (ui8)val;
112  }
113  }
114 
116  void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
117  const line_buf *ln2, void *dp,
118  int bit_depth, int count)
119  {
120  const si32 *sp0 = ln0->i32;
121  const si32 *sp1 = ln1->i32;
122  const si32 *sp2 = ln2->i32;
123  ui8* p = (ui8 *)dp;
124 
125  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
126  __m256i zero = _mm256_setzero_si256();
127  __m256i m0 = _mm256_set_epi64x(0xFFFFFFFF0E0D0C0A, 0x0908060504020100,
128  0xFFFFFFFF0E0D0C0A, 0x0908060504020100);
129 
130  // 32 entries or entries in each loop
131  for ( ; count >= 32; count -= 32, sp0 += 32, sp1 += 32, sp2 += 32, p += 96)
132  {
133  __m256i a, t, u, v, w;
134  a = _mm256_load_si256((__m256i*)sp0);
135  a = _mm256_max_epi32(a, zero);
136  t = _mm256_min_epi32(a, max_val_vec);
137 
138  a = _mm256_load_si256((__m256i*)sp1);
139  a = _mm256_max_epi32(a, zero);
140  a = _mm256_min_epi32(a, max_val_vec);
141  a = _mm256_slli_epi32(a, 8);
142  t = _mm256_or_si256(t, a);
143 
144  a = _mm256_load_si256((__m256i*)sp2);
145  a = _mm256_max_epi32(a, zero);
146  a = _mm256_min_epi32(a, max_val_vec);
147  a = _mm256_slli_epi32(a, 16);
148  t = _mm256_or_si256(t, a);
149  t = _mm256_shuffle_epi8(t, m0);
150 
151 
152  a = _mm256_load_si256((__m256i*)sp0 + 1);
153  a = _mm256_max_epi32(a, zero);
154  u = _mm256_min_epi32(a, max_val_vec);
155 
156  a = _mm256_load_si256((__m256i*)sp1 + 1);
157  a = _mm256_max_epi32(a, zero);
158  a = _mm256_min_epi32(a, max_val_vec);
159  a = _mm256_slli_epi32(a, 8);
160  u = _mm256_or_si256(u, a);
161 
162  a = _mm256_load_si256((__m256i*)sp2 + 1);
163  a = _mm256_max_epi32(a, zero);
164  a = _mm256_min_epi32(a, max_val_vec);
165  a = _mm256_slli_epi32(a, 16);
166  u = _mm256_or_si256(u, a);
167  u = _mm256_shuffle_epi8(u, m0);
168 
169 
170  a = _mm256_load_si256((__m256i*)sp0 + 2);
171  a = _mm256_max_epi32(a, zero);
172  v = _mm256_min_epi32(a, max_val_vec);
173 
174  a = _mm256_load_si256((__m256i*)sp1 + 2);
175  a = _mm256_max_epi32(a, zero);
176  a = _mm256_min_epi32(a, max_val_vec);
177  a = _mm256_slli_epi32(a, 8);
178  v = _mm256_or_si256(v, a);
179 
180  a = _mm256_load_si256((__m256i*)sp2 + 2);
181  a = _mm256_max_epi32(a, zero);
182  a = _mm256_min_epi32(a, max_val_vec);
183  a = _mm256_slli_epi32(a, 16);
184  v = _mm256_or_si256(v, a);
185  v = _mm256_shuffle_epi8(v, m0);
186 
187 
188  a = _mm256_load_si256((__m256i*)sp0 + 3);
189  a = _mm256_max_epi32(a, zero);
190  w = _mm256_min_epi32(a, max_val_vec);
191 
192  a = _mm256_load_si256((__m256i*)sp1 + 3);
193  a = _mm256_max_epi32(a, zero);
194  a = _mm256_min_epi32(a, max_val_vec);
195  a = _mm256_slli_epi32(a, 8);
196  w = _mm256_or_si256(w, a);
197 
198  a = _mm256_load_si256((__m256i*)sp2 + 3);
199  a = _mm256_max_epi32(a, zero);
200  a = _mm256_min_epi32(a, max_val_vec);
201  a = _mm256_slli_epi32(a, 16);
202  w = _mm256_or_si256(w, a);
203  w = _mm256_shuffle_epi8(w, m0);
204 
205  _mm_storeu_si128((__m128i*)(p ), _mm256_castsi256_si128(t));
206  _mm_storeu_si128((__m128i*)(p + 12), _mm256_extracti128_si256(t, 1));
207  _mm_storeu_si128((__m128i*)(p + 24), _mm256_castsi256_si128(u));
208  _mm_storeu_si128((__m128i*)(p + 36), _mm256_extracti128_si256(u, 1));
209  _mm_storeu_si128((__m128i*)(p + 48), _mm256_castsi256_si128(v));
210  _mm_storeu_si128((__m128i*)(p + 60), _mm256_extracti128_si256(v, 1));
211  _mm_storeu_si128((__m128i*)(p + 72), _mm256_castsi256_si128(w));
212  _mm_storeu_si128((__m128i*)(p + 84), _mm256_extracti128_si256(w, 1));
213  }
214 
215  int max_val = (1<<bit_depth) - 1;
216  for ( ; count > 0; --count)
217  {
218  int val;
219  val = *sp0++;
220  val = val >= 0 ? val : 0;
221  val = val <= max_val ? val : max_val;
222  *p++ = (ui8) val;
223  val = *sp1++;
224  val = val >= 0 ? val : 0;
225  val = val <= max_val ? val : max_val;
226  *p++ = (ui8) val;
227  val = *sp2++;
228  val = val >= 0 ? val : 0;
229  val = val <= max_val ? val : max_val;
230  *p++ = (ui8) val;
231  }
232  }
233 
235  void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
236  const line_buf *ln2, void *dp,
237  int bit_depth, int count)
238  {
239  ojph_unused(ln1);
240  ojph_unused(ln2);
241 
242  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
243  __m256i zero = _mm256_setzero_si256();
244  __m256i mask = _mm256_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100,
245  0x0F0E0B0A07060302, 0x0D0C090805040100);
246  const si32 *sp = ln0->i32;
247  ui16* p = (ui16 *)dp;
248 
249  // 16 entries in each loop
250  for ( ; count >= 16; count -= 16, sp += 16, p += 16)
251  {
252  __m256i a, t;
253  a = _mm256_load_si256((__m256i*)sp);
254  a = _mm256_max_epi32(a, zero);
255  t = _mm256_min_epi32(a, max_val_vec);
256 
257  a = _mm256_load_si256((__m256i*)sp + 1);
258  a = _mm256_max_epi32(a, zero);
259  a = _mm256_min_epi32(a, max_val_vec);
260  a = _mm256_slli_epi32(a, 16);
261  t = _mm256_or_si256(t, a);
262 
263  t = _mm256_shuffle_epi8(t, mask);
264  t = _mm256_permute4x64_epi64(t, 0xD8);
265  _mm256_storeu_si256((__m256i*)p, t);
266  }
267 
268  int max_val = (1<<bit_depth) - 1;
269  for ( ; count > 0; --count)
270  {
271  int val = *sp++;
272  val = val >= 0 ? val : 0;
273  val = val <= max_val ? val : max_val;
274  *p++ = (ui16) val;
275  }
276  }
277 
279  void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
280  const line_buf *ln2, void *dp,
281  int bit_depth, int count)
282  {
283  ojph_unused(ln1);
284  ojph_unused(ln2);
285 
286  __m256i max_val_vec = _mm256_set1_epi32((1 << bit_depth) - 1);
287  __m256i zero = _mm256_setzero_si256();
288  __m256i mask = _mm256_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001,
289  0x0E0F0A0B06070203, 0x0C0D080904050001);
290  const si32 *sp = ln0->i32;
291  ui16* p = (ui16 *)dp;
292 
293  // 16 entries in each loop
294  for ( ; count >= 16; count -= 16, sp += 16, p += 16)
295  {
296  __m256i a, t;
297  a = _mm256_load_si256((__m256i*)sp);
298  a = _mm256_max_epi32(a, zero);
299  t = _mm256_min_epi32(a, max_val_vec);
300 
301  a = _mm256_load_si256((__m256i*)sp + 1);
302  a = _mm256_max_epi32(a, zero);
303  a = _mm256_min_epi32(a, max_val_vec);
304  a = _mm256_slli_epi32(a, 16);
305  t = _mm256_or_si256(t, a);
306 
307  t = _mm256_shuffle_epi8(t, mask);
308  t = _mm256_permute4x64_epi64(t, 0xD8);
309  _mm256_storeu_si256((__m256i*)p, t);
310  }
311 
312  int max_val = (1<<bit_depth) - 1;
313  for ( ; count > 0; --count)
314  {
315  int val = *sp++;
316  val = val >= 0 ? val : 0;
317  val = val <= max_val ? val : max_val;
318  *p++ = be2le((ui16) val);
319  }
320  }
321 }
void avx2_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void avx2_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint16_t ui16
Definition: ojph_defs.h:52
static ui16 be2le(const ui16 v)
Definition: ojph_img_io.cpp:55
int32_t si32
Definition: ojph_defs.h:55
void avx2_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint8_t ui8
Definition: ojph_defs.h:50
void avx2_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
#define ojph_unused(x)
Definition: ojph_defs.h:78
si32 * i32
Definition: ojph_mem.h:155