#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP

#include "mdds/global.hpp"
#include "../types.hpp"

#if defined(__SSE2__)
#include <emmintrin.h> // SSE2 intrinsics used by the sse2_x64* specializations below.
#endif
#if defined(__AVX2__)
#include <immintrin.h> // AVX2 intrinsics used by the avx2_x64* specializations below.
#endif

namespace mdds { namespace mtv { namespace soa { namespace detail {
// Primary template: only the specializations below are valid; any other factor fails to compile.
template<typename Blks, lu_factor_t F>
struct adjust_block_positions
{
    void operator()(Blks& /*block_store*/, int64_t /*start_block_index*/, int64_t /*delta*/) const
    {
        static_assert(
            mdds::detail::invalid_static_int<F>,
            "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::none>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < n; ++i)
            block_store.positions[i] += delta;
    }
};
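// The lu_factor_t::none specialization above is the scalar baseline: one add
// per position value, parallelized across threads when OpenMP is enabled.  The
// specializations below differ only in how many positions each loop iteration
// advances (manual unrolling and/or SIMD), plus a scalar tail loop for whatever
// the wider step leaves over.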
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the length of the unrolled section is a multiple of 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // len % 4
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
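// Index arithmetic of the lu4 specialization, with illustrative numbers: for
// n = 23 and start_block_index = 2, len starts at 21 and rem = 21 & 3 = 1.
// After len -= rem and len += start_block_index, the unrolled loop runs
// i = 2, 6, 10, 14, 18 and covers indices 2..21; the tail loop then updates
// the one remaining index, 22.  The lu8/lu16/lu32 variants below follow the
// same pattern with masks 7, 15 and 31.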
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the length of the unrolled section is a multiple of 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // len % 8
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the length of the unrolled section is a multiple of 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // len % 16
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure the length of the unrolled section is a multiple of 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // len % 32
        len -= rem;
        len += start_block_index;

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
            block_store.positions[i + 16] += delta;
            block_store.positions[i + 17] += delta;
            block_store.positions[i + 18] += delta;
            block_store.positions[i + 19] += delta;
            block_store.positions[i + 20] += delta;
            block_store.positions[i + 21] += delta;
            block_store.positions[i + 22] += delta;
            block_store.positions[i + 23] += delta;
            block_store.positions[i + 24] += delta;
            block_store.positions[i + 25] += delta;
            block_store.positions[i + 26] += delta;
            block_store.positions[i + 27] += delta;
            block_store.positions[i + 28] += delta;
            block_store.positions[i + 29] += delta;
            block_store.positions[i + 30] += delta;
            block_store.positions[i + 31] += delta;
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
#if defined(__SSE2__)
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");
        int64_t n = block_store.positions.size();
        if (start_block_index >= n)
            return;

        // Each 128-bit store updates two 64-bit positions; peel off one odd element.
        int64_t len = n - start_block_index;
        bool odd = len & 1;
        if (odd)
            --len;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 2)
        {
            __m128i* dst = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
        }

        if (odd)
            block_store.positions[len] += delta;
    }
};
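// _mm_add_epi64 adds two packed 64-bit integers per 128-bit register, so the
// loop above advances two positions per iteration and a single odd element, if
// any, is finished with a scalar add.  _mm_loadu_si128/_mm_storeu_si128 are the
// unaligned load/store forms, since the underlying position array is not
// guaranteed to be 16-byte aligned.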
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Four 128-bit operations per iteration update 8 positions at a time.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // len % 8
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
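// In the sse2_x64_lu4 specialization above, each iteration issues four 128-bit
// adds and therefore moves 4 x 2 = 8 positions, which is why the remainder mask
// is 7.  The lu8 and lu16 variants below extend this to 16 and 32 positions per
// iteration (masks 15 and 31).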
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Eight 128-bit operations per iteration update 16 positions at a time.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // len % 16
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Sixteen 128-bit operations per iteration update 32 positions at a time.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // len % 32
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
#endif // __SSE2__
#if defined(__AVX2__)
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");
        int64_t n = block_store.positions.size();
        if (start_block_index >= n)
            return;

        // Each 256-bit store updates four 64-bit positions.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // len % 4
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
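// A 256-bit AVX2 register holds four 64-bit positions, so the avx2_x64 loop
// above advances four positions per iteration; _mm256_set1_epi64x broadcasts
// delta into all four lanes, and the unaligned load/store intrinsics are used
// for the same reason as in the SSE2 code.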
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Four 256-bit operations per iteration update 16 positions at a time.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // len % 16
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
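// avx2_x64_lu4 above unrolls the AVX2 loop four times, i.e. 4 x 4 = 16
// positions per iteration (mask 15); avx2_x64_lu8 below doubles that to 32
// positions per iteration (mask 31).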
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Eight 256-bit operations per iteration update 32 positions at a time.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // len % 32
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#pragma omp parallel for
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        // Handle the remaining elements with scalar adds.
        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};
#endif // __AVX2__

}}}} // namespace mdds::mtv::soa::detail

#endif