OpenJPH
Open-source implementation of JPEG2000 Part-15
ojph_codestream_sse2.cpp
Go to the documentation of this file.
1 //***************************************************************************/
2 // This software is released under the 2-Clause BSD license, included
3 // below.
4 //
5 // Copyright (c) 2022, Aous Naman
6 // Copyright (c) 2022, Kakadu Software Pty Ltd, Australia
7 // Copyright (c) 2022, The University of New South Wales, Australia
8 //
9 // Redistribution and use in source and binary forms, with or without
10 // modification, are permitted provided that the following conditions are
11 // met:
12 //
13 // 1. Redistributions of source code must retain the above copyright
14 // notice, this list of conditions and the following disclaimer.
15 //
16 // 2. Redistributions in binary form must reproduce the above copyright
17 // notice, this list of conditions and the following disclaimer in the
18 // documentation and/or other materials provided with the distribution.
19 //
20 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS
21 // IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
22 // TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
23 // PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
24 // HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
25 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
26 // TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
27 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
28 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
29 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
30 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 //***************************************************************************/
32 // This file is part of the OpenJPH software implementation.
33 // File: ojph_codestream_sse2.cpp
34 // Author: Aous Naman
35 // Date: 15 May 2022
36 //***************************************************************************/
37 
38 #include <immintrin.h>
39 #include "ojph_defs.h"
40 
41 namespace ojph {
42  namespace local {
43 
46  {
47  __m128i x1, x0 = _mm_loadu_si128((__m128i*)address);
48  x1 = _mm_shuffle_epi32(x0, 0xEE); // x1 = x0[2,3,2,3]
49  x0 = _mm_or_si128(x0, x1);
50  x1 = _mm_shuffle_epi32(x0, 0x55); // x1 = x0[1,1,1,1]
51  x0 = _mm_or_si128(x0, x1);
52  _mm_storeu_si128((__m128i*)address, x0);
53  return *address;
54  // A single movd t, xmm0 can do the trick, but it is not available
55  // in SSE2 intrinsics. extract_epi32 is available in sse4.1
56  // ui32 t = (ui32)_mm_extract_epi16(x0, 0);
57  // t |= (ui32)_mm_extract_epi16(x0, 1) << 16;
58  // return t;
59  }
60 
62  void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
63  float delta_inv, ui32 count, ui32* max_val)
64  {
65  ojph_unused(delta_inv);
66 
67  // convert to sign and magnitude and keep max_val
68  ui32 shift = 31 - K_max;
69  __m128i m0 = _mm_set1_epi32((int)0x80000000);
70  __m128i zero = _mm_setzero_si128();
71  __m128i one = _mm_set1_epi32(1);
72  __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
73  __m128i *p = (__m128i*)sp;
74  for (ui32 i = 0; i < count; i += 4, p += 1, dp += 4)
75  {
76  __m128i v = _mm_loadu_si128(p);
77  __m128i sign = _mm_cmplt_epi32(v, zero);
78  __m128i val = _mm_xor_si128(v, sign); // negate 1's complement
79  __m128i ones = _mm_and_si128(sign, one);
80  val = _mm_add_epi32(val, ones); // 2's complement
81  sign = _mm_and_si128(sign, m0);
82  val = _mm_slli_epi32(val, (int)shift);
83  tmax = _mm_or_si128(tmax, val);
84  val = _mm_or_si128(val, sign);
85  _mm_storeu_si128((__m128i*)dp, val);
86  }
87  _mm_storeu_si128((__m128i*)max_val, tmax);
88  }
89 
91  void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max,
92  float delta_inv, ui32 count, ui32* max_val)
93  {
94  ojph_unused(K_max);
95 
96  //quantize and convert to sign and magnitude and keep max_val
97 
98  __m128 d = _mm_set1_ps(delta_inv);
99  __m128i zero = _mm_setzero_si128();
100  __m128i one = _mm_set1_epi32(1);
101  __m128i tmax = _mm_loadu_si128((__m128i*)max_val);
102  float *p = (float*)sp;
103  for (ui32 i = 0; i < count; i += 4, p += 4, dp += 4)
104  {
105  __m128 vf = _mm_loadu_ps(p);
106  vf = _mm_mul_ps(vf, d); // multiply
107  __m128i val = _mm_cvtps_epi32(vf); // convert to int
108  __m128i sign = _mm_cmplt_epi32(val, zero); // get sign
109  val = _mm_xor_si128(val, sign); // negate 1's complement
110  __m128i ones = _mm_and_si128(sign, one);
111  val = _mm_add_epi32(val, ones); // 2's complement
112  tmax = _mm_or_si128(tmax, val);
113  sign = _mm_slli_epi32(sign, 31);
114  val = _mm_or_si128(val, sign);
115  _mm_storeu_si128((__m128i*)dp, val);
116  }
117  _mm_storeu_si128((__m128i*)max_val, tmax);
118  }
119 
121  void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
122  float delta, ui32 count)
123  {
124  ojph_unused(delta);
125  ui32 shift = 31 - K_max;
126  __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
127  __m128i zero = _mm_setzero_si128();
128  __m128i one = _mm_set1_epi32(1);
129  si32 *p = (si32*)dp;
130  for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
131  {
132  __m128i v = _mm_load_si128((__m128i*)sp);
133  __m128i val = _mm_and_si128(v, m1);
134  val = _mm_srli_epi32(val, (int)shift);
135  __m128i sign = _mm_cmplt_epi32(v, zero);
136  val = _mm_xor_si128(val, sign); // negate 1's complement
137  __m128i ones = _mm_and_si128(sign, one);
138  val = _mm_add_epi32(val, ones); // 2's complement
139  _mm_storeu_si128((__m128i*)p, val);
140  }
141  }
142 
144  void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max,
145  float delta, ui32 count)
146  {
147  ojph_unused(K_max);
148  __m128i m1 = _mm_set1_epi32(0x7FFFFFFF);
149  __m128 d = _mm_set1_ps(delta);
150  float *p = (float*)dp;
151  for (ui32 i = 0; i < count; i += 4, sp += 4, p += 4)
152  {
153  __m128i v = _mm_load_si128((__m128i*)sp);
154  __m128i vali = _mm_and_si128(v, m1);
155  __m128 valf = _mm_cvtepi32_ps(vali);
156  valf = _mm_mul_ps(valf, d);
157  __m128i sign = _mm_andnot_si128(m1, v);
158  valf = _mm_or_ps(valf, _mm_castsi128_ps(sign));
159  _mm_storeu_ps(p, valf);
160  }
161  }
162  }
163 }
void sse2_irv_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
void sse2_irv_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_from_cb(const ui32 *sp, void *dp, ui32 K_max, float delta, ui32 count)
void sse2_rev_tx_to_cb(const void *sp, ui32 *dp, ui32 K_max, float delta_inv, ui32 count, ui32 *max_val)
ui32 sse2_find_max_val(ui32 *address)
int32_t si32
Definition: ojph_defs.h:55
uint32_t ui32
Definition: ojph_defs.h:54
#define ojph_unused(x)
Definition: ojph_defs.h:78