TUT HEVC Encoder
reg_sad_pow2_widths-avx2.h
/*****************************************************************************
 * This file is part of Kvazaar HEVC encoder.
 *
 * Copyright (c) 2021, Tampere University, ITU/ISO/IEC, project contributors
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification,
 * are permitted provided that the following conditions are met:
 *
 * * Redistributions of source code must retain the above copyright notice, this
 *   list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright notice, this
 *   list of conditions and the following disclaimer in the documentation and/or
 *   other materials provided with the distribution.
 *
 * * Neither the name of the Tampere University or ITU/ISO/IEC nor the names of its
 *   contributors may be used to endorse or promote products derived from
 *   this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
 * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
 * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 ****************************************************************************/

#ifndef REG_SAD_POW2_WIDTHS_AVX2_H_
#define REG_SAD_POW2_WIDTHS_AVX2_H_

#include "kvazaar.h"

#if KVZ_BIT_DEPTH == 8

#include <immintrin.h>  // AVX2 intrinsics (_mm256_* / _mm_*)

// SAD of a 32-pixel-wide block: process four rows per iteration, then any
// leftover rows.
static INLINE uint32_t reg_sad_w32(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  // Running 64-bit accumulators, one per 8-byte column group (VPSADBW layout).
  __m256i avx_inc = _mm256_setzero_si256();
  int32_t y;

  const int32_t height_fourline_groups = height & ~3;
  const int32_t height_residual_lines  = height &  3;

  for (y = 0; y < height_fourline_groups; y += 4) {
    __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
    __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
    __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1));
    __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2));
    __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 2) * stride1));
    __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 2) * stride2));
    __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 3) * stride1));
    __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 3) * stride2));

    // Absolute byte differences, summed into four 64-bit lanes per row pair.
    __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
    __m256i curr_sads_cd = _mm256_sad_epu8(c, d);
    __m256i curr_sads_ef = _mm256_sad_epu8(e, f);
    __m256i curr_sads_gh = _mm256_sad_epu8(g, h);

    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));

      __m256i curr_sads = _mm256_sad_epu8(a, b);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads);
    }
  }

  // Fold the four 64-bit partial sums into one scalar.
  __m128i inc_lo = _mm256_castsi256_si128  (avx_inc);
  __m128i inc_hi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i sum_1  = _mm_add_epi64(inc_lo, inc_hi);
  __m128i sum_2  = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad    = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
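Not part of the header: a minimal scalar sketch of what reg_sad_w32 computes, i.e. the sum of absolute differences over a 32-pixel-wide block of the given height, with separate strides for the two buffers. The name reg_sad_w32_scalar is invented for the illustration; a plain-C model like this is handy as a reference when testing the vectorized kernel.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of the 32-wide SAD: no intrinsics, just byte-wise differences. */
static uint32_t reg_sad_w32_scalar(const uint8_t *data1, const uint8_t *data2,
                                   int32_t height, uint32_t stride1, uint32_t stride2)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (int32_t x = 0; x < 32; x++) {
      sad += (uint32_t)abs((int)data1[y * stride1 + x] - (int)data2[y * stride2 + x]);
    }
  }
  return sad;
}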

// SAD of a 64-pixel-wide block: each row needs two 32-byte loads per buffer,
// so process two rows per iteration, then any leftover row.
static INLINE uint32_t reg_sad_w64(const uint8_t * const data1, const uint8_t * const data2,
                                   const int32_t height, const uint32_t stride1,
                                   const uint32_t stride2)
{
  __m256i avx_inc = _mm256_setzero_si256();
  int32_t y;

  const int32_t height_twoline_groups = height & ~1;
  const int32_t height_residual_lines = height &  1;

  for (y = 0; y < height_twoline_groups; y += 2) {
    __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
    __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
    __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32));
    __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32));

    __m256i e = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1));
    __m256i f = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2));
    __m256i g = _mm256_loadu_si256((const __m256i *)(data1 + (y + 1) * stride1 + 32));
    __m256i h = _mm256_loadu_si256((const __m256i *)(data2 + (y + 1) * stride2 + 32));

    // Absolute byte differences, summed into four 64-bit lanes per 32-byte pair.
    __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
    __m256i curr_sads_cd = _mm256_sad_epu8(c, d);
    __m256i curr_sads_ef = _mm256_sad_epu8(e, f);
    __m256i curr_sads_gh = _mm256_sad_epu8(g, h);

    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ef);
    avx_inc = _mm256_add_epi64(avx_inc, curr_sads_gh);
  }
  if (height_residual_lines) {
    for (; y < height; y++) {
      __m256i a = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1));
      __m256i b = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2));
      __m256i c = _mm256_loadu_si256((const __m256i *)(data1 + (y + 0) * stride1 + 32));
      __m256i d = _mm256_loadu_si256((const __m256i *)(data2 + (y + 0) * stride2 + 32));

      __m256i curr_sads_ab = _mm256_sad_epu8(a, b);
      __m256i curr_sads_cd = _mm256_sad_epu8(c, d);

      avx_inc = _mm256_add_epi64(avx_inc, curr_sads_ab);
      avx_inc = _mm256_add_epi64(avx_inc, curr_sads_cd);
    }
  }

  // Fold the four 64-bit partial sums into one scalar.
  __m128i inc_lo = _mm256_castsi256_si128  (avx_inc);
  __m128i inc_hi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i sum_1  = _mm_add_epi64(inc_lo, inc_hi);
  __m128i sum_2  = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad    = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
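Both kernels above rest on one instruction, VPSADBW (_mm256_sad_epu8), which sums the absolute byte differences of each 8-byte group into a 64-bit lane. The standalone program below is an illustration, not Kvazaar code: it makes the lane layout visible, showing that one 32-byte row pair yields four partial sums which still have to be added together, which is what the tail of each kernel does. Build with an AVX2-enabled compiler, e.g. gcc -mavx2.

#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
  // Two 32-byte rows that differ by exactly 1 in every byte: total SAD is 32.
  uint8_t row1[32], row2[32];
  for (int i = 0; i < 32; i++) {
    row1[i] = (uint8_t)i;
    row2[i] = (uint8_t)(i + 1);
  }

  __m256i a    = _mm256_loadu_si256((const __m256i *)row1);
  __m256i b    = _mm256_loadu_si256((const __m256i *)row2);
  __m256i sads = _mm256_sad_epu8(a, b);  // four 64-bit sums, one per 8-byte group

  uint64_t part[4];
  _mm256_storeu_si256((__m256i *)part, sads);
  printf("partials: %llu %llu %llu %llu, total: %llu\n",
         (unsigned long long)part[0], (unsigned long long)part[1],
         (unsigned long long)part[2], (unsigned long long)part[3],
         (unsigned long long)(part[0] + part[1] + part[2] + part[3]));
  return 0;
}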

// SAD of a 32-pixel-wide block against a reference block that sticks out over
// the frame's left or right border; out-of-frame columns are filled by
// replicating the nearest in-frame column.
// (The parameter list before `left`/`right` is not visible in this listing;
// the names below follow the convention of the other kernels.)
static INLINE uint32_t hor_sad_avx2_w32(const uint8_t *pic_data, const uint8_t *ref_data,
                                        int32_t height, uint32_t pic_stride,
                                        uint32_t ref_stride,
                                        const uint32_t left, const uint32_t right)
{
  __m256i avx_inc = _mm256_setzero_si256();

  const size_t block_width      = 32;
  const size_t block_width_log2 = 5;
  const size_t lane_width       = 16;

  // Clamp the overlap counts so that at least one column stays inside the frame.
  const int32_t left_eq_wid   = left  >> block_width_log2;
  const int32_t left_clamped  = left  - left_eq_wid;
  const int32_t right_eq_wid  = right >> block_width_log2;
  const int32_t right_clamped = right - right_eq_wid;

  // Column indexes 0..31 of one row, the basis for the border-replication masks.
  const __m256i ns = _mm256_setr_epi8(0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15,
                                      16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31);

  // (Construction of the shuffle and blend masks that replicate the border
  // column across the out-of-frame columns is not shown in this listing.)

  // For left != 0, use low lane of mlo2hi_mask_l as blend mask for high lane.
  // For right != 0, use low lane of mlo2hi_mask_r as blend mask for low lane.

  // If left != 0 (ie. right == 0), the xchg should only affect high lane,
  // if right != 0 (ie. left == 0), the low lane. Set bits on the lane that
  // the xchg should affect. left == right == 0 should never happen, this'll
  // break if it does.

  // If we're straddling the left border, start from the left border instead,
  // and if right border, end on the border
  const int32_t ld_offset = left - right;

  int32_t y;
  for (y = 0; y < height; y++) {
    __m256i a = _mm256_loadu_si256((__m256i *)(pic_data + (y + 0) * pic_stride + 0));
    // (Not shown: the reference row is loaded at ld_offset, its border columns
    // are replicated with the masks above, and the per-row SAD is accumulated
    // into avx_inc with _mm256_sad_epu8 / _mm256_add_epi64.)
  }

  // Fold the four 64-bit partial sums into one scalar.
  __m128i inc_lo = _mm256_castsi256_si128  (avx_inc);
  __m128i inc_hi = _mm256_extracti128_si256(avx_inc, 1);
  __m128i sum_1  = _mm_add_epi64(inc_lo, inc_hi);
  __m128i sum_2  = _mm_shuffle_epi32(sum_1, _MM_SHUFFLE(1, 0, 3, 2));
  __m128i sad    = _mm_add_epi64(sum_1, sum_2);

  return _mm_cvtsi128_si32(sad);
}
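The masked kernel above exists for the case where the reference block straddles the frame's left or right border, so the missing columns have to be filled by replicating the nearest column that is still inside the frame. Below is a hedged scalar sketch of that behaviour, assuming the convention that left and right count the out-of-frame columns on each side of the 32-wide row and that ref_data points at the block's nominal top-left sample; the name hor_sad_w32_scalar is invented for the illustration.

#include <stdint.h>
#include <stdlib.h>

/* Scalar model of a 32-wide horizontal-border SAD: out-of-frame reference
 * columns are replaced by the nearest in-frame column (edge replication).
 * Exactly one of left/right is expected to be nonzero. */
static uint32_t hor_sad_w32_scalar(const uint8_t *pic_data, const uint8_t *ref_data,
                                   int32_t height, uint32_t pic_stride,
                                   uint32_t ref_stride, uint32_t left, uint32_t right)
{
  uint32_t sad = 0;
  for (int32_t y = 0; y < height; y++) {
    for (uint32_t x = 0; x < 32; x++) {
      uint32_t ref_x = x;
      if (ref_x < left)       ref_x = left;       /* replicate leftmost valid column  */
      if (ref_x > 31 - right) ref_x = 31 - right; /* replicate rightmost valid column */
      sad += (uint32_t)abs((int)pic_data[y * pic_stride + x] -
                           (int)ref_data[y * ref_stride + ref_x]);
    }
  }
  return sad;
}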

#endif // KVZ_BIT_DEPTH == 8

#endif // REG_SAD_POW2_WIDTHS_AVX2_H_
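The fixed power-of-two widths in this header are meant to be selected by block width at a higher level. The sketch below is a hypothetical dispatcher in that spirit, assuming the header above has been included; reg_sad_dispatch, reg_sad_generic and the exact set of handled widths are inventions for the illustration, not Kvazaar's actual strategy code.

#include <stdint.h>

/* reg_sad_generic stands in for an arbitrary-width fallback routine. */
uint32_t reg_sad_generic(const uint8_t *data1, const uint8_t *data2, int32_t width,
                         int32_t height, uint32_t stride1, uint32_t stride2);

/* Hypothetical width dispatch over the fixed-width kernels defined above. */
static uint32_t reg_sad_dispatch(const uint8_t *data1, const uint8_t *data2,
                                 int32_t width, int32_t height,
                                 uint32_t stride1, uint32_t stride2)
{
  switch (width) {
    case 32: return reg_sad_w32(data1, data2, height, stride1, stride2);
    case 64: return reg_sad_w64(data1, data2, height, stride1, stride2);
    default: return reg_sad_generic(data1, data2, width, height, stride1, stride2);
  }
}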