SeqAn3  3.2.0-rc.1
The Modern C++ library for sequence analysis.
simd_algorithm_avx2.hpp
Go to the documentation of this file.
1 // -----------------------------------------------------------------------------------------------------
2 // Copyright (c) 2006-2022, Knut Reinert & Freie Universität Berlin
3 // Copyright (c) 2016-2022, Knut Reinert & MPI für molekulare Genetik
4 // This file may be used, modified and/or redistributed under the terms of the 3-clause BSD-License
5 // shipped with this file and also available at: https://github.com/seqan/seqan3/blob/master/LICENSE.md
6 // -----------------------------------------------------------------------------------------------------
7 
13 #pragma once
14 
15 #include <array>
16 
21 
22 //-----------------------------------------------------------------------------
23 // forward declare avx2 simd algorithms that use avx2 intrinsics
24 //-----------------------------------------------------------------------------
25 
26 namespace seqan3::detail
27 {
//!\brief Loads 256 bits from `mem_addr` and returns them as `simd_t`; the definition is only provided if `__AVX2__` is
//!       defined.
31 template <simd::simd_concept simd_t>
32 constexpr simd_t load_avx2(void const * mem_addr);
33 
//!\brief Stores the 256 bits of `simd_vec` to `mem_addr`; the definition is only provided if `__AVX2__` is defined.
37 template <simd::simd_concept simd_t>
38 constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec);
39 
//!\brief Transposes, in place, the byte matrix whose rows are the simd vectors in `matrix`; the definition treats it
//!       as a 32x32 byte matrix.
43 template <simd::simd_concept simd_t>
44 inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix);
45 
//!\brief Widens the leading elements of `src` to the element width of `target_simd_t` via the signed
//!       `_mm256_cvtepi*` intrinsics.
49 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
50 constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src);
51 
//!\brief Widens the leading elements of `src` to the element width of `target_simd_t` via the unsigned
//!       `_mm256_cvtepu*` intrinsics.
55 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
56 constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src);
57 
//!\brief Extracts the 128 bit part of `src` selected by `index` into the low bits of the result.
61 template <uint8_t index, simd::simd_concept simd_t>
62 constexpr simd_t extract_half_avx2(simd_t const & src);
63 
//!\brief Extracts the 64 bit part of `src` selected by `index` into the low bits of the result.
67 template <uint8_t index, simd::simd_concept simd_t>
68 constexpr simd_t extract_quarter_avx2(simd_t const & src);
69 
//!\brief Extracts the 32 bit part of `src` selected by `index` into the low bits of the result.
73 template <uint8_t index, simd::simd_concept simd_t>
74 constexpr simd_t extract_eighth_avx2(simd_t const & src);
75 
76 } // namespace seqan3::detail
77 
78 //-----------------------------------------------------------------------------
79 // implementation
80 //-----------------------------------------------------------------------------
81 
82 #ifdef __AVX2__
83 
84 namespace seqan3::detail
85 {
86 
87 template <simd::simd_concept simd_t>
88 constexpr simd_t load_avx2(void const * mem_addr)
89 {
90  return reinterpret_cast<simd_t>(_mm256_loadu_si256(reinterpret_cast<__m256i const *>(mem_addr)));
91 }
92 
93 template <simd::simd_concept simd_t>
94 constexpr void store_avx2(void * mem_addr, simd_t const & simd_vec)
95 {
96  _mm256_storeu_si256(reinterpret_cast<__m256i *>(mem_addr), reinterpret_cast<__m256i const &>(simd_vec));
97 }
98 
99 template <simd::simd_concept simd_t>
100 inline void transpose_matrix_avx2(std::array<simd_t, simd_traits<simd_t>::length> & matrix)
101 {
102  // emulate missing _mm256_unpacklo_epi128/_mm256_unpackhi_epi128 instructions
103  auto _mm256_unpacklo_epi128 = [](__m256i const & a, __m256i const & b)
104  {
105  return _mm256_permute2x128_si256(a, b, 0x20);
106  };
107 
108  auto _mm256_unpackhi_epi128 = [](__m256i const & a, __m256i const & b)
109  {
110  return _mm256_permute2x128_si256(a, b, 0x31);
111  };
112 
113  // A look-up table to reverse the lowest 4 bits in order to permute the transposed rows.
114  static const uint8_t bit_rev[] = {0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15,
115  16, 24, 20, 28, 18, 26, 22, 30, 17, 25, 21, 29, 19, 27, 23, 31};
116 
117  // transpose a 32x32 byte matrix
118  __m256i tmp1[32];
119  for (int i = 0; i < 16; ++i)
120  {
121  tmp1[i] = _mm256_unpacklo_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
122  reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
123  tmp1[i + 16] = _mm256_unpackhi_epi8(reinterpret_cast<__m256i const &>(matrix[2 * i]),
124  reinterpret_cast<__m256i const &>(matrix[2 * i + 1]));
125  }
126  __m256i tmp2[32];
127  for (int i = 0; i < 16; ++i)
128  {
129  tmp2[i] = _mm256_unpacklo_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
130  tmp2[i + 16] = _mm256_unpackhi_epi16(tmp1[2 * i], tmp1[2 * i + 1]);
131  }
132  for (int i = 0; i < 16; ++i)
133  {
134  tmp1[i] = _mm256_unpacklo_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
135  tmp1[i + 16] = _mm256_unpackhi_epi32(tmp2[2 * i], tmp2[2 * i + 1]);
136  }
137  for (int i = 0; i < 16; ++i)
138  {
139  tmp2[i] = _mm256_unpacklo_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
140  tmp2[i + 16] = _mm256_unpackhi_epi64(tmp1[2 * i], tmp1[2 * i + 1]);
141  }
142  for (int i = 0; i < 16; ++i)
143  {
144  matrix[bit_rev[i]] = reinterpret_cast<simd_t>(_mm256_unpacklo_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
145  matrix[bit_rev[i + 16]] = reinterpret_cast<simd_t>(_mm256_unpackhi_epi128(tmp2[2 * i], tmp2[2 * i + 1]));
146  }
147 }
148 
149 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
150 constexpr target_simd_t upcast_signed_avx2(source_simd_t const & src)
151 {
152  __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
153  if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
154  {
155  if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
156  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi16(tmp));
157  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
158  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi32(tmp));
159  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
160  return reinterpret_cast<target_simd_t>(_mm256_cvtepi8_epi64(tmp));
161  }
162  else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
163  {
164  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
165  return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi32(tmp));
166  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
167  return reinterpret_cast<target_simd_t>(_mm256_cvtepi16_epi64(tmp));
168  }
169  else // cast from epi32 to epi64
170  {
171  static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
172  return reinterpret_cast<target_simd_t>(_mm256_cvtepi32_epi64(tmp));
173  }
174 }
175 
176 template <simd::simd_concept target_simd_t, simd::simd_concept source_simd_t>
177 constexpr target_simd_t upcast_unsigned_avx2(source_simd_t const & src)
178 {
179  __m128i const & tmp = _mm256_castsi256_si128(reinterpret_cast<__m256i const &>(src));
180  if constexpr (simd_traits<source_simd_t>::length == 32) // cast from epi8 ...
181  {
182  if constexpr (simd_traits<target_simd_t>::length == 16) // to epi16
183  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi16(tmp));
184  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
185  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi32(tmp));
186  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
187  return reinterpret_cast<target_simd_t>(_mm256_cvtepu8_epi64(tmp));
188  }
189  else if constexpr (simd_traits<source_simd_t>::length == 16) // cast from epi16 ...
190  {
191  if constexpr (simd_traits<target_simd_t>::length == 8) // to epi32
192  return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi32(tmp));
193  if constexpr (simd_traits<target_simd_t>::length == 4) // to epi64
194  return reinterpret_cast<target_simd_t>(_mm256_cvtepu16_epi64(tmp));
195  }
196  else // cast from epi32 to epi64
197  {
198  static_assert(simd_traits<source_simd_t>::length == 8, "Expected 32 bit scalar type.");
199  return reinterpret_cast<target_simd_t>(_mm256_cvtepu32_epi64(tmp));
200  }
201 }
202 
203 template <uint8_t index, simd::simd_concept simd_t>
204 constexpr simd_t extract_half_avx2(simd_t const & src)
205 {
206  return reinterpret_cast<simd_t>(
207  _mm256_castsi128_si256(_mm256_extracti128_si256(reinterpret_cast<__m256i const &>(src), index)));
208 }
209 
210 template <uint8_t index, simd::simd_concept simd_t>
211 constexpr simd_t extract_quarter_avx2(simd_t const & src)
212 {
213  return reinterpret_cast<simd_t>(_mm256_castsi128_si256(
214  _mm_cvtsi64x_si128(_mm256_extract_epi64(reinterpret_cast<__m256i const &>(src), index))));
215 }
216 
217 template <uint8_t index, simd::simd_concept simd_t>
218 constexpr simd_t extract_eighth_avx2(simd_t const & src)
219 {
220  return reinterpret_cast<simd_t>(
221  _mm256_castsi128_si256(_mm_cvtsi32_si128(_mm256_extract_epi32(reinterpret_cast<__m256i const &>(src), index))));
222 }
223 
224 } // namespace seqan3::detail
225 
226 #endif // __AVX2__
Provides seqan3::detail::builtin_simd, seqan3::detail::is_builtin_simd and seqan3::simd::simd_traits<...
Provides intrinsics include for builtin simd.
Provides seqan3::simd::simd_traits.
Provides seqan3::simd::simd_concept.