OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_img_io_sse41.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2019, Aous Naman
6 // Copyright (c) 2019, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2019, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_img_io_sse41.cpp
34 // Author: Aous Naman
35 // Date: 23 May 2022
36 //***************************************************************************/
37 
38 
39 #include <cstdlib>
40 #include <cstring>
41 #include <immintrin.h>
42 
43 #include "ojph_file.h"
44 #include "ojph_img_io.h"
45 #include "ojph_mem.h"
46 #include "ojph_message.h"
47 
48 namespace ojph {
49 
51  static
52  ui16 be2le(const ui16 v)
53  {
54  return (ui16)((v<<8) | (v>>8));
55  }
56 
58  void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1,
59  const line_buf *ln2, void *dp,
60  int bit_depth, int count)
61  {
62  ojph_unused(ln1);
63  ojph_unused(ln2);
64 
65  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
66  __m128i zero = _mm_setzero_si128();
67  __m128i mask = _mm_set_epi64x(0x0F0B07030E0A0602, 0x0D0905010C080400);
68  const si32 *sp = ln0->i32;
69  ui8* p = (ui8 *)dp;
70 
71  // 16 bytes or entries in each loop
72  for ( ; count >= 16; count -= 16, sp += 16, p += 16)
73  {
74  __m128i a, t;
75  a = _mm_load_si128((__m128i*)sp);
76  a = _mm_max_epi32(a, zero);
77  t = _mm_min_epi32(a, max_val_vec);
78 
79  a = _mm_load_si128((__m128i*)sp + 1);
80  a = _mm_max_epi32(a, zero);
81  a = _mm_min_epi32(a, max_val_vec);
82  a = _mm_slli_epi32(a, 8);
83  t = _mm_or_si128(t, a);
84 
85  a = _mm_load_si128((__m128i*)sp + 2);
86  a = _mm_max_epi32(a, zero);
87  a = _mm_min_epi32(a, max_val_vec);
88  a = _mm_slli_epi32(a, 16);
89  t = _mm_or_si128(t, a);
90 
91  a = _mm_load_si128((__m128i*)sp + 3);
92  a = _mm_max_epi32(a, zero);
93  a = _mm_min_epi32(a, max_val_vec);
94  a = _mm_slli_epi32(a, 24);
95  t = _mm_or_si128(t, a);
96 
97  t = _mm_shuffle_epi8(t, mask);
98  _mm_storeu_si128((__m128i*)p, t);
99  }
100 
101  int max_val = (1 << bit_depth) - 1;
102  for ( ; count > 0; --count)
103  {
104  int val = *sp++;
105  val = val >= 0 ? val : 0;
106  val = val <= max_val ? val : max_val;
107  *p++ = (ui8)val;
108  }
109  }
110 
112  void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1,
113  const line_buf *ln2, void *dp,
114  int bit_depth, int count)
115  {
116  const si32 *sp0 = ln0->i32;
117  const si32 *sp1 = ln1->i32;
118  const si32 *sp2 = ln2->i32;
119  ui8* p = (ui8 *)dp;
120 
121  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
122  __m128i zero = _mm_setzero_si128();
123  __m128i m0 = _mm_set_epi64x(0xFFFFFFFF0E0D0C0A, 0x0908060504020100);
124 
125  // 16 entries in each loop
126  for ( ; count >= 16; count -= 16, sp0 += 16, sp1 += 16, sp2 += 16, p += 48)
127  {
128  __m128i a, t, u, v, w;
129  a = _mm_load_si128((__m128i*)sp0);
130  a = _mm_max_epi32(a, zero);
131  t = _mm_min_epi32(a, max_val_vec);
132 
133  a = _mm_load_si128((__m128i*)sp1);
134  a = _mm_max_epi32(a, zero);
135  a = _mm_min_epi32(a, max_val_vec);
136  a = _mm_slli_epi32(a, 8);
137  t = _mm_or_si128(t, a);
138 
139  a = _mm_load_si128((__m128i*)sp2);
140  a = _mm_max_epi32(a, zero);
141  a = _mm_min_epi32(a, max_val_vec);
142  a = _mm_slli_epi32(a, 16);
143  t = _mm_or_si128(t, a);
144  t = _mm_shuffle_epi8(t, m0);
145 
146  a = _mm_load_si128((__m128i*)sp0 + 1);
147  a = _mm_max_epi32(a, zero);
148  u = _mm_min_epi32(a, max_val_vec);
149 
150  a = _mm_load_si128((__m128i*)sp1 + 1);
151  a = _mm_max_epi32(a, zero);
152  a = _mm_min_epi32(a, max_val_vec);
153  a = _mm_slli_epi32(a, 8);
154  u = _mm_or_si128(u, a);
155 
156  a = _mm_load_si128((__m128i*)sp2 + 1);
157  a = _mm_max_epi32(a, zero);
158  a = _mm_min_epi32(a, max_val_vec);
159  a = _mm_slli_epi32(a, 16);
160  u = _mm_or_si128(u, a);
161  u = _mm_shuffle_epi8(u, m0);
162 
163  a = _mm_load_si128((__m128i*)sp0 + 2);
164  a = _mm_max_epi32(a, zero);
165  v = _mm_min_epi32(a, max_val_vec);
166 
167  a = _mm_load_si128((__m128i*)sp1 + 2);
168  a = _mm_max_epi32(a, zero);
169  a = _mm_min_epi32(a, max_val_vec);
170  a = _mm_slli_epi32(a, 8);
171  v = _mm_or_si128(v, a);
172 
173  a = _mm_load_si128((__m128i*)sp2 + 2);
174  a = _mm_max_epi32(a, zero);
175  a = _mm_min_epi32(a, max_val_vec);
176  a = _mm_slli_epi32(a, 16);
177  v = _mm_or_si128(v, a);
178  v = _mm_shuffle_epi8(v, m0);
179 
180  a = _mm_load_si128((__m128i*)sp0 + 3);
181  a = _mm_max_epi32(a, zero);
182  w = _mm_min_epi32(a, max_val_vec);
183 
184  a = _mm_load_si128((__m128i*)sp1 + 3);
185  a = _mm_max_epi32(a, zero);
186  a = _mm_min_epi32(a, max_val_vec);
187  a = _mm_slli_epi32(a, 8);
188  w = _mm_or_si128(w, a);
189 
190  a = _mm_load_si128((__m128i*)sp2 + 3);
191  a = _mm_max_epi32(a, zero);
192  a = _mm_min_epi32(a, max_val_vec);
193  a = _mm_slli_epi32(a, 16);
194  w = _mm_or_si128(w, a);
195  w = _mm_shuffle_epi8(w, m0);
196 
197  t = _mm_or_si128(t, _mm_bslli_si128(u, 12));
198  u = _mm_or_si128(_mm_bsrli_si128(u, 4), _mm_bslli_si128(v, 8));
199  v = _mm_or_si128(_mm_bsrli_si128(v, 8), _mm_bslli_si128(w, 4));
200 
201  _mm_storeu_si128((__m128i*)p + 0, t);
202  _mm_storeu_si128((__m128i*)p + 1, u);
203  _mm_storeu_si128((__m128i*)p + 2, v);
204  }
205 
206  int max_val = (1<<bit_depth) - 1;
207  for ( ; count > 0; --count)
208  {
209  int val;
210  val = *sp0++;
211  val = val >= 0 ? val : 0;
212  val = val <= max_val ? val : max_val;
213  *p++ = (ui8) val;
214  val = *sp1++;
215  val = val >= 0 ? val : 0;
216  val = val <= max_val ? val : max_val;
217  *p++ = (ui8) val;
218  val = *sp2++;
219  val = val >= 0 ? val : 0;
220  val = val <= max_val ? val : max_val;
221  *p++ = (ui8) val;
222  }
223  }
224 
226  void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1,
227  const line_buf *ln2, void *dp,
228  int bit_depth, int count)
229  {
230  ojph_unused(ln1);
231  ojph_unused(ln2);
232 
233  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
234  __m128i zero = _mm_setzero_si128();
235  __m128i mask = _mm_set_epi64x(0x0F0E0B0A07060302, 0x0D0C090805040100);
236  const si32 *sp = ln0->i32;
237  ui16* p = (ui16 *)dp;
238 
239  // 8 entries in each loop
240  for ( ; count >= 8; count -= 8, sp += 8, p += 8)
241  {
242  __m128i a, t;
243  a = _mm_load_si128((__m128i*)sp);
244  a = _mm_max_epi32(a, zero);
245  t = _mm_min_epi32(a, max_val_vec);
246 
247  a = _mm_load_si128((__m128i*)sp + 1);
248  a = _mm_max_epi32(a, zero);
249  a = _mm_min_epi32(a, max_val_vec);
250  a = _mm_slli_epi32(a, 16);
251  t = _mm_or_si128(t, a);
252 
253  t = _mm_shuffle_epi8(t, mask);
254  _mm_storeu_si128((__m128i*)p, t);
255  }
256 
257  int max_val = (1<<bit_depth) - 1;
258  for ( ; count > 0; --count)
259  {
260  int val = *sp++;
261  val = val >= 0 ? val : 0;
262  val = val <= max_val ? val : max_val;
263  *p++ = (ui16) val;
264  }
265  }
266 
268  void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1,
269  const line_buf *ln2, void *dp,
270  int bit_depth, int count)
271  {
272  const si32 *sp0 = ln0->i32;
273  const si32 *sp1 = ln1->i32;
274  const si32 *sp2 = ln2->i32;
275  ui16* p = (ui16*)dp;
276 
277  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
278  __m128i zero = _mm_setzero_si128();
279 
280  __m128i m0 = _mm_set_epi64x(0x0B0A0908FFFF0706, 0x0504FFFF03020100);
281  __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0504FFFF, 0xFFFF0100FFFFFFFF);
282  __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0F0E0D0CFFFF);
283  __m128i m3 = _mm_set_epi64x(0x0706FFFFFFFF0302, 0x0D0CFFFFFFFF0908);
284  __m128i m4 = _mm_set_epi64x(0xFFFF03020100FFFF, 0xFFFFFFFFFFFFFFFF);
285  __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0F0EFFFF, 0xFFFF0B0AFFFFFFFF);
286  __m128i m6 = _mm_set_epi64x(0x0F0E0D0CFFFF0B0A, 0x0908FFFF07060504);
287 
288  // 24 entries in each loop
289  for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
290  {
291  __m128i a, b, t, u, v;
292  a = _mm_load_si128((__m128i*)sp0);
293  a = _mm_max_epi32(a, zero);
294  t = _mm_min_epi32(a, max_val_vec);
295 
296  a = _mm_load_si128((__m128i*)sp1);
297  a = _mm_max_epi32(a, zero);
298  a = _mm_min_epi32(a, max_val_vec);
299  a = _mm_slli_epi32(a, 16);
300  t = _mm_or_si128(t, a);
301 
302  a = _mm_load_si128((__m128i*)sp2);
303  a = _mm_max_epi32(a, zero);
304  u = _mm_min_epi32(a, max_val_vec);
305 
306  a = _mm_load_si128((__m128i*)sp0 + 1);
307  a = _mm_max_epi32(a, zero);
308  a = _mm_min_epi32(a, max_val_vec);
309  a = _mm_slli_epi32(a, 16);
310  u = _mm_or_si128(u, a);
311 
312  a = _mm_load_si128((__m128i*)sp1 + 1);
313  a = _mm_max_epi32(a, zero);
314  v = _mm_min_epi32(a, max_val_vec);
315 
316  a = _mm_load_si128((__m128i*)sp2 + 1);
317  a = _mm_max_epi32(a, zero);
318  a = _mm_min_epi32(a, max_val_vec);
319  a = _mm_slli_epi32(a, 16);
320  v = _mm_or_si128(v, a);
321 
322  a = _mm_shuffle_epi8(t, m0);
323  b = _mm_shuffle_epi8(u, m1);
324  a = _mm_or_si128(a, b);
325  _mm_storeu_si128((__m128i*)p, a);
326 
327  a = _mm_shuffle_epi8(t, m2);
328  b = _mm_shuffle_epi8(u, m3);
329  a = _mm_or_si128(a, b);
330  b = _mm_shuffle_epi8(v, m4);
331  a = _mm_or_si128(a, b);
332  _mm_storeu_si128((__m128i*)p + 1, a);
333 
334  a = _mm_shuffle_epi8(u, m5);
335  b = _mm_shuffle_epi8(v, m6);
336  a = _mm_or_si128(a, b);
337  _mm_storeu_si128((__m128i*)p + 2, a);
338  }
339 
340  int max_val = (1<<bit_depth) - 1;
341  for ( ; count > 0; --count)
342  {
343  int val;
344  val = *sp0++;
345  val = val >= 0 ? val : 0;
346  val = val <= max_val ? val : max_val;
347  *p++ = be2le((ui16) val);
348  val = *sp1++;
349  val = val >= 0 ? val : 0;
350  val = val <= max_val ? val : max_val;
351  *p++ = be2le((ui16) val);
352  val = *sp2++;
353  val = val >= 0 ? val : 0;
354  val = val <= max_val ? val : max_val;
355  *p++ = (ui16) val;
356  }
357  }
358 
360  void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1,
361  const line_buf *ln2, void *dp,
362  int bit_depth, int count)
363  {
364  ojph_unused(ln1);
365  ojph_unused(ln2);
366 
367  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
368  __m128i zero = _mm_setzero_si128();
369  __m128i mask = _mm_set_epi64x(0x0E0F0A0B06070203, 0x0C0D080904050001);
370  const si32 *sp = ln0->i32;
371  ui16* p = (ui16 *)dp;
372 
373  // 8 entries in each loop
374  for ( ; count >= 8; count -= 8, sp += 8, p += 8)
375  {
376  __m128i a, t;
377  a = _mm_load_si128((__m128i*)sp);
378  a = _mm_max_epi32(a, zero);
379  t = _mm_min_epi32(a, max_val_vec);
380 
381  a = _mm_load_si128((__m128i*)sp + 1);
382  a = _mm_max_epi32(a, zero);
383  a = _mm_min_epi32(a, max_val_vec);
384  a = _mm_slli_epi32(a, 16);
385  t = _mm_or_si128(t, a);
386 
387  t = _mm_shuffle_epi8(t, mask);
388  _mm_storeu_si128((__m128i*)p, t);
389  }
390 
391  int max_val = (1<<bit_depth) - 1;
392  for ( ; count > 0; --count)
393  {
394  int val = *sp++;
395  val = val >= 0 ? val : 0;
396  val = val <= max_val ? val : max_val;
397  *p++ = be2le((ui16) val);
398  }
399  }
400 
402  void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1,
403  const line_buf *ln2, void *dp,
404  int bit_depth, int count)
405  {
406  const si32 *sp0 = ln0->i32;
407  const si32 *sp1 = ln1->i32;
408  const si32 *sp2 = ln2->i32;
409  ui16* p = (ui16*)dp;
410 
411  __m128i max_val_vec = _mm_set1_epi32((1 << bit_depth) - 1);
412  __m128i zero = _mm_setzero_si128();
413 
414  __m128i m0 = _mm_set_epi64x(0x0A0B0809FFFF0607, 0x0405FFFF02030001);
415  __m128i m1 = _mm_set_epi64x(0xFFFFFFFF0405FFFF, 0xFFFF0001FFFFFFFF);
416  __m128i m2 = _mm_set_epi64x(0xFFFFFFFFFFFFFFFF, 0xFFFF0E0F0C0DFFFF);
417  __m128i m3 = _mm_set_epi64x(0x0607FFFFFFFF0203, 0x0C0DFFFFFFFF0809);
418  __m128i m4 = _mm_set_epi64x(0xFFFF02030001FFFF, 0xFFFFFFFFFFFFFFFF);
419  __m128i m5 = _mm_set_epi64x(0xFFFFFFFF0E0FFFFF, 0xFFFF0A0BFFFFFFFF);
420  __m128i m6 = _mm_set_epi64x(0x0E0F0C0DFFFF0A0B, 0x0809FFFF06070405);
421 
422  // 24 entries in each loop
423  for ( ; count >= 8; count -= 8, sp0 += 8, sp1 += 8, sp2 += 8, p += 24)
424  {
425  __m128i a, b, t, u, v;
426  a = _mm_load_si128((__m128i*)sp0);
427  a = _mm_max_epi32(a, zero);
428  t = _mm_min_epi32(a, max_val_vec);
429 
430  a = _mm_load_si128((__m128i*)sp1);
431  a = _mm_max_epi32(a, zero);
432  a = _mm_min_epi32(a, max_val_vec);
433  a = _mm_slli_epi32(a, 16);
434  t = _mm_or_si128(t, a);
435 
436  a = _mm_load_si128((__m128i*)sp2);
437  a = _mm_max_epi32(a, zero);
438  u = _mm_min_epi32(a, max_val_vec);
439 
440  a = _mm_load_si128((__m128i*)sp0 + 1);
441  a = _mm_max_epi32(a, zero);
442  a = _mm_min_epi32(a, max_val_vec);
443  a = _mm_slli_epi32(a, 16);
444  u = _mm_or_si128(u, a);
445 
446  a = _mm_load_si128((__m128i*)sp1 + 1);
447  a = _mm_max_epi32(a, zero);
448  v = _mm_min_epi32(a, max_val_vec);
449 
450  a = _mm_load_si128((__m128i*)sp2 + 1);
451  a = _mm_max_epi32(a, zero);
452  a = _mm_min_epi32(a, max_val_vec);
453  a = _mm_slli_epi32(a, 16);
454  v = _mm_or_si128(v, a);
455 
456  a = _mm_shuffle_epi8(t, m0);
457  b = _mm_shuffle_epi8(u, m1);
458  a = _mm_or_si128(a, b);
459  _mm_storeu_si128((__m128i*)p, a);
460 
461  a = _mm_shuffle_epi8(t, m2);
462  b = _mm_shuffle_epi8(u, m3);
463  a = _mm_or_si128(a, b);
464  b = _mm_shuffle_epi8(v, m4);
465  a = _mm_or_si128(a, b);
466  _mm_storeu_si128((__m128i*)p + 1, a);
467 
468  a = _mm_shuffle_epi8(u, m5);
469  b = _mm_shuffle_epi8(v, m6);
470  a = _mm_or_si128(a, b);
471  _mm_storeu_si128((__m128i*)p + 2, a);
472  }
473 
474  int max_val = (1<<bit_depth) - 1;
475  for ( ; count > 0; --count)
476  {
477  int val;
478  val = *sp0++;
479  val = val >= 0 ? val : 0;
480  val = val <= max_val ? val : max_val;
481  *p++ = be2le((ui16) val);
482  val = *sp1++;
483  val = val >= 0 ? val : 0;
484  val = val <= max_val ? val : max_val;
485  *p++ = be2le((ui16) val);
486  val = *sp2++;
487  val = val >= 0 ? val : 0;
488  val = val <= max_val ? val : max_val;
489  *p++ = be2le((ui16) val);
490  }
491  }
492 }
void sse41_cvrt_32b3c_to_16ub3c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b1c_to_16ub1c_be(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint16_t ui16
Definition: ojph_defs.h:52
void sse41_cvrt_32b1c_to_8ub1c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
static ui16 be2le(const ui16 v)
Definition: ojph_img_io.cpp:55
void sse41_cvrt_32b1c_to_16ub1c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
void sse41_cvrt_32b3c_to_16ub3c_le(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
int32_t si32
Definition: ojph_defs.h:55
void sse41_cvrt_32b3c_to_8ub3c(const line_buf *ln0, const line_buf *ln1, const line_buf *ln2, void *dp, int bit_depth, int count)
uint8_t ui8
Definition: ojph_defs.h:50
#define ojph_unused(x)
Definition: ojph_defs.h:78
si32 * i32
Definition: ojph_mem.h:155