simd_algorithm_avx512.hpp (SeqAn3 3.2.0-rc.1)

// -----------------------------------------------------------------------------------------------------
// Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
// Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
// This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
// shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
// -----------------------------------------------------------------------------------------------------

#pragma once

#include <array>

#include <seqan3/utility/simd/concept.hpp>
#include <seqan3/utility/simd/detail/builtin_simd.hpp>
#include <seqan3/utility/simd/detail/builtin_simd_intrinsics.hpp>
#include <seqan3/utility/simd/simd_traits.hpp>

//-----------------------------------------------------------------------------
// forward declare avx512 simd algorithms that use avx512 intrinsics
//-----------------------------------------------------------------------------

namespace seqan3::detail
{

template <simd::simd_concept simd_t>
constexpr simd_t load_avx512(void const * mem_addr);

template <simd::simd_concept simd_t>
constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec);

template <simd::simd_concept simd_t>
inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix);

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src);

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src);

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx512(simd_t const & src);

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx512(simd_t const & src);

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx512(simd_t const & src);

} // namespace seqan3::detail

//-----------------------------------------------------------------------------
// implementation
//-----------------------------------------------------------------------------

#ifdef __AVX512F__

namespace seqan3::detail
{

template <simd::simd_concept simd_t>
constexpr simd_t load_avx512(void const * mem_addr)
{
    return reinterpret_cast<simd_t>(_mm512_loadu_si512(mem_addr));
}

template <simd::simd_concept simd_t>
constexpr void store_avx512(void * mem_addr, simd_t const & simd_vec)
{
    _mm512_storeu_si512(mem_addr, reinterpret_cast<__m512i const &>(simd_vec));
}

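// Usage sketch, assuming simd_t is a 64x int8_t builtin simd type
// (e.g. seqan3::simd::simd_type_t<int8_t, 64>); both intrinsics perform unaligned
// accesses, so plain arrays work as source and target memory:
//
//     std::array<int8_t, 64> in{}, out{};
//     simd_t vec = seqan3::detail::load_avx512<simd_t>(in.data());
//     seqan3::detail::store_avx512(out.data(), vec); // out now equals in
//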
# if defined(__AVX512BW__)
template <simd::simd_concept simd_t>
inline void transpose_matrix_avx512(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
{
    // Transposing a 64x64 byte matrix in 6x64 instructions using AVX512 intrinsics.
    // Step 1: Unpack 8-bit operands.

    // Note: _mm512_unpack* operates on 128-bit lanes, i.e. the following pattern applies:
    // * for lower half ('lo')
    // d[127:0] = interleave(a[63:0], b[63:0])
    // d[255:128] = interleave(a[191:128], b[191:128])
    // d[383:256] = interleave(a[319:256], b[319:256])
    // d[511:384] = interleave(a[447:384], b[447:384])
    // * for higher half ('hi')
    // d[127:0] = interleave(a[127:64], b[127:64])
    // d[255:128] = interleave(a[255:192], b[255:192])
    // d[383:256] = interleave(a[383:320], b[383:320])
    // d[511:384] = interleave(a[511:448], b[511:448])
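    // For example, in the lowest 128-bit lane of _mm512_unpacklo_epi8, given the byte
    // vectors a = [a0, a1, ..., a63] and b = [b0, b1, ..., b63]:
    // d[127:0] = [a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7]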
    __m512i tmp1[64];
    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
                                       reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
        tmp1[i + 32] = _mm512_unpackhi_epi8(reinterpret_cast<__m512i const &>(matrix[2 * i]),
                                            reinterpret_cast<__m512i const &>(matrix[2 * i + 1]));
    }

    // Step 2: Unpack 16-bit operands.
    __m512i tmp2[64];
    for (int i = 0; i < 32; ++i)
    {
        tmp2[i] = _mm512_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 32] = _mm512_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
    }
    // Step 3: Unpack 32-bit operands.
    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 32] = _mm512_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
    }
    // Step 4: Unpack 64-bit operands.
    for (int i = 0; i < 32; ++i)
    {
        tmp2[i] = _mm512_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
        tmp2[i + 32] = _mm512_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
    }

    // Step 5: Unpack 128-bit operands.
    // Helper function to emulate unpack of 128-bit across lanes using _mm512_permutex2var_epi64.
    auto _mm512_unpacklo_epi128 = [](__m512i const & a, __m512i const & b)
    {
        constexpr std::array<uint64_t, 8> lo_mask{0, 1, 8, 9, 2, 3, 10, 11};
        return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*lo_mask.data()), b);
    };

    auto _mm512_unpackhi_epi128 = [](__m512i const & a, __m512i const & b)
    {
        constexpr std::array<uint64_t, 8> hi_mask{4, 5, 12, 13, 6, 7, 14, 15};
        return _mm512_permutex2var_epi64(a, reinterpret_cast<__m512i const &>(*hi_mask.data()), b);
    };
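
    // Note: _mm512_permutex2var_epi64 selects 64-bit elements from the concatenation of
    // its two vector operands: mask values 0..7 address the qwords of `a`, values 8..15
    // the qwords of `b`. E.g. lo_mask{0, 1, 8, 9, 2, 3, 10, 11} produces
    // [a0, a1, b0, b1, a2, a3, b2, b3], i.e. it interleaves the 128-bit lanes of the
    // lower 256-bit halves of `a` and `b`.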

    for (int i = 0; i < 32; ++i)
    {
        tmp1[i] = _mm512_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
        tmp1[i + 32] = _mm512_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]);
    }
    // Step 6: Unpack 256-bit operands.
    // Helper function to emulate unpack of 256-bit across lanes using _mm512_shuffle_i64x2.
    auto _mm512_unpacklo_epi256 = [](__m512i const & a, __m512i const & b)
    {
        return _mm512_shuffle_i64x2(a, b, 0b0100'0100);
    };

    auto _mm512_unpackhi_epi256 = [](__m512i const & a, __m512i const & b)
    {
        return _mm512_shuffle_i64x2(a, b, 0b1110'1110);
    };
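
    // Note: _mm512_shuffle_i64x2 composes the result from four 128-bit lanes chosen by
    // 2-bit fields of the immediate; the two low fields index into `a`, the two high
    // fields into `b`. Thus 0b0100'0100 yields [a.lane0, a.lane1, b.lane0, b.lane1]
    // (the lower 256 bits of each operand), and 0b1110'1110 yields
    // [a.lane2, a.lane3, b.lane2, b.lane3] (the upper 256 bits of each operand).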

    // A look-up table to place the final transposed vector to the correct position in the
    // original matrix.
    // clang-format off
    constexpr std::array<uint32_t, 64> reverse_idx_mask{
    // 00  01  02  03  04  05  06  07  08  09  10  11  12  13  14  15
        0, 16,  8, 24,  4, 20, 12, 28,  2, 18, 10, 26,  6, 22, 14, 30,
    // 16  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31
        1, 17,  9, 25,  5, 21, 13, 29,  3, 19, 11, 27,  7, 23, 15, 31,
    // 32  33  34  35  36  37  38  39  40  41  42  43  44  45  46  47
       32, 48, 40, 56, 36, 52, 44, 60, 34, 50, 42, 58, 38, 54, 46, 62,
    // 48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63
       33, 49, 41, 57, 37, 53, 45, 61, 35, 51, 43, 59, 39, 55, 47, 63};
    // clang-format on

    for (int i = 0; i < 32; ++i)
    {
        int const idx = i * 2;
        matrix[reverse_idx_mask[idx]] = reinterpret_cast<simd_t>(_mm512_unpacklo_epi256(tmp1[idx], tmp1[idx + 1]));
        matrix[reverse_idx_mask[idx + 1]] = reinterpret_cast<simd_t>(_mm512_unpackhi_epi256(tmp1[idx], tmp1[idx + 1]));
    }
}
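
// Usage sketch, assuming simd_t is a 64x uint8_t builtin simd type:
//
//     std::array<simd_t, 64> matrix{}; // fill such that matrix[row][col] holds byte (row, col)
//     seqan3::detail::transpose_matrix_avx512(matrix);
//     // now matrix[col][row] holds the former matrix[row][col]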
# endif // defined(__AVX512BW__)

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_signed_avx512(source_simd_t const & src)
{
    __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
    if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi16(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi32(_mm512_castsi512_si128(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi8_epi64(_mm512_castsi512_si128(tmp)));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi32(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
            return reinterpret_cast<target_simd_t>(_mm512_cvtepi16_epi64(_mm512_castsi512_si128(tmp)));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm512_cvtepi32_epi64(_mm512_castsi512_si256(tmp)));
    }
}
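
// Example: upcasting 64x int8_t to 32x int16_t uses only the lower 256 bits of the
// source, i.e. the first 32 elements, and sign-extends each of them
// (widened[i] == src[i] for i in [0, 32)). To widen the remaining elements, a caller
// would first move them down with extract_half_avx512 (see below) and upcast again.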

template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
constexpr target_simd_t upcast_unsigned_avx512(source_simd_t const & src)
{
    __m512i const & tmp = reinterpret_cast<__m512i const &>(src);
    if constexpr (simd_traits<source_simd_t>::length == 64) // cast from epi8 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 32) // to epi16
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi16(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu8_epi64(_mm512_castsi512_si128(tmp)));
    }
    else if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi16 ...
    {
        if constexpr (simd_traits<target_simd_t>::length == 16) // to epi32
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(tmp)));
        if constexpr (simd_traits<target_simd_t>::length == 8) // to epi64
            return reinterpret_cast<target_simd_t>(_mm512_cvtepu16_epi64(_mm512_castsi512_si128(tmp)));
    }
    else // cast from epi32 to epi64
    {
        static_assert(simd_traits<source_simd_t>::length == 16, "Expected 32 bit scalar type.");
        return reinterpret_cast<target_simd_t>(_mm512_cvtepu32_epi64(_mm512_castsi512_si256(tmp)));
    }
}
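
// Note the difference to upcast_signed_avx512: for a source byte 0xff, sign extension
// yields int16_t(-1) (0xffff), whereas zero extension yields int16_t(255) (0x00ff).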

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_half_avx512(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm512_castsi256_si512(_mm512_extracti64x4_epi64(reinterpret_cast<__m512i const &>(src), index)));
}
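
// Example: extract_half_avx512<1>(src) moves the upper 256 bits of src into the lower
// 256 bits of the result, i.e. for a 64x int8_t vector, result[i] == src[32 + i] for
// i in [0, 32); the upper half of the result is left undefined by the cast.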

# if defined(__AVX512DQ__)
template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_quarter_avx512(simd_t const & src)
{
    return reinterpret_cast<simd_t>(
        _mm512_castsi128_si512(_mm512_extracti64x2_epi64(reinterpret_cast<__m512i const &>(src), index)));
}

template <uint8_t index, simd::simd_concept simd_t>
constexpr simd_t extract_eighth_avx512(simd_t const & src)
{
    __m512i tmp = reinterpret_cast<__m512i const &>(src);

    // For an odd index, swap the higher 64 bits with the lower 64 bits within each 128-bit lane.
    if constexpr (index % 2 == 1)
        tmp = _mm512_shuffle_epi32(tmp, 0b0100'1110); // := [1, 0, 3, 2].

    return reinterpret_cast<simd_t>(_mm512_castsi128_si512(_mm512_extracti64x2_epi64(tmp, index / 2)));
}
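
// Example: extract_eighth_avx512<3>(src) swaps the 64-bit halves within each 128-bit
// lane (3 is odd) and then extracts 128-bit lane 1, so for a 64x int8_t vector the
// lower 64 bits of the result hold src[24], ..., src[31].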
# endif // defined(__AVX512DQ__)

} // namespace seqan3::detail

#endif // __AVX512F__