/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/*************************************************************************
 *
 * Copyright (c) 2021 Kohei Yoshida
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use,
 * copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following
 * conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
 * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
 * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
 * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 *
 ************************************************************************/

#ifndef INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP
#define INCLUDED_MDDS_MULTI_TYPE_VECTOR_DIR_SOA_BLOCK_UTIL_HPP

#include "mdds/global.hpp"
#include "../types.hpp"

#if defined(__SSE2__)
#include <emmintrin.h>
#endif
#if defined(__AVX2__)
#include <immintrin.h>
#endif

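// The adjust_block_positions functors below shift the stored logical position
// of every block at or after a given block index by a fixed delta.  The
// lu_factor_t template argument selects the loop-unrolling / SIMD strategy;
// the unspecialized primary template rejects unsupported factors at compile
// time.  When MDDS_USE_OPENMP is defined, the main loops are additionally
// parallelized with OpenMP.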
namespace mdds { namespace mtv { namespace soa { namespace detail {

template<typename Blks, lu_factor_t F>
struct adjust_block_positions
{
    void operator()(Blks& /*block_store*/, int64_t /*start_block_index*/, int64_t /*delta*/) const
    {
        static_assert(
            mdds::detail::invalid_static_int<F>, "The loop-unrolling factor must be one of 0, 4, 8, 16, or 32.");
    }
};
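
// Each specialization below implements one unrolling factor.  A minimal usage
// sketch (`my_store` is a hypothetical block-store type exposing a `positions`
// vector of 64-bit values, as the SIMD variants require):
//
//     my_store store;
//     // ... populate store.positions ...
//     adjust_block_positions<my_store, lu_factor_t::lu8>{}(store, 2, 10);
//     // adds 10 to every element of store.positions from index 2 onward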

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::none>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < n; ++i)
            block_store.positions[i] += delta;
    }
};
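
// The lu4/lu8/lu16/lu32 specializations below all follow the same pattern:
// split [start_block_index, n) into a leading chunk whose length is a multiple
// of the unrolling factor, update that chunk with a manually unrolled loop,
// then handle the remaining elements in a scalar tail loop.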

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // % 4
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::lu32>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;
#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            block_store.positions[i + 0] += delta;
            block_store.positions[i + 1] += delta;
            block_store.positions[i + 2] += delta;
            block_store.positions[i + 3] += delta;
            block_store.positions[i + 4] += delta;
            block_store.positions[i + 5] += delta;
            block_store.positions[i + 6] += delta;
            block_store.positions[i + 7] += delta;
            block_store.positions[i + 8] += delta;
            block_store.positions[i + 9] += delta;
            block_store.positions[i + 10] += delta;
            block_store.positions[i + 11] += delta;
            block_store.positions[i + 12] += delta;
            block_store.positions[i + 13] += delta;
            block_store.positions[i + 14] += delta;
            block_store.positions[i + 15] += delta;
            block_store.positions[i + 16] += delta;
            block_store.positions[i + 17] += delta;
            block_store.positions[i + 18] += delta;
            block_store.positions[i + 19] += delta;
            block_store.positions[i + 20] += delta;
            block_store.positions[i + 21] += delta;
            block_store.positions[i + 22] += delta;
            block_store.positions[i + 23] += delta;
            block_store.positions[i + 24] += delta;
            block_store.positions[i + 25] += delta;
            block_store.positions[i + 26] += delta;
            block_store.positions[i + 27] += delta;
            block_store.positions[i + 28] += delta;
            block_store.positions[i + 29] += delta;
            block_store.positions[i + 30] += delta;
            block_store.positions[i + 31] += delta;
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#if defined(__SSE2__)

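// The SSE2 variants pack two 64-bit position values into one __m128i register
// and update both with a single _mm_add_epi64.  Unaligned loads and stores
// (_mm_loadu_si128 / _mm_storeu_si128) are used, so the positions array does
// not need any particular alignment.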
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 2.
        int64_t len = n - start_block_index;
        bool odd = len & 1;
        if (odd)
            len -= 1;

        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 2)
        {
            __m128i* dst = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst, _mm_add_epi64(_mm_loadu_si128(dst), right));
        }

        if (odd)
            block_store.positions[len] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 8.
        int64_t len = n - start_block_index;
        int64_t rem = len & 7; // % 8
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 8)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::sse2_x64_lu16>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;

        __m128i right = _mm_set_epi64x(delta, delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m128i* dst0 = (__m128i*)&block_store.positions[i];
            _mm_storeu_si128(dst0, _mm_add_epi64(_mm_loadu_si128(dst0), right));

            __m128i* dst2 = (__m128i*)&block_store.positions[i + 2];
            _mm_storeu_si128(dst2, _mm_add_epi64(_mm_loadu_si128(dst2), right));

            __m128i* dst4 = (__m128i*)&block_store.positions[i + 4];
            _mm_storeu_si128(dst4, _mm_add_epi64(_mm_loadu_si128(dst4), right));

            __m128i* dst6 = (__m128i*)&block_store.positions[i + 6];
            _mm_storeu_si128(dst6, _mm_add_epi64(_mm_loadu_si128(dst6), right));

            __m128i* dst8 = (__m128i*)&block_store.positions[i + 8];
            _mm_storeu_si128(dst8, _mm_add_epi64(_mm_loadu_si128(dst8), right));

            __m128i* dst10 = (__m128i*)&block_store.positions[i + 10];
            _mm_storeu_si128(dst10, _mm_add_epi64(_mm_loadu_si128(dst10), right));

            __m128i* dst12 = (__m128i*)&block_store.positions[i + 12];
            _mm_storeu_si128(dst12, _mm_add_epi64(_mm_loadu_si128(dst12), right));

            __m128i* dst14 = (__m128i*)&block_store.positions[i + 14];
            _mm_storeu_si128(dst14, _mm_add_epi64(_mm_loadu_si128(dst14), right));

            __m128i* dst16 = (__m128i*)&block_store.positions[i + 16];
            _mm_storeu_si128(dst16, _mm_add_epi64(_mm_loadu_si128(dst16), right));

            __m128i* dst18 = (__m128i*)&block_store.positions[i + 18];
            _mm_storeu_si128(dst18, _mm_add_epi64(_mm_loadu_si128(dst18), right));

            __m128i* dst20 = (__m128i*)&block_store.positions[i + 20];
            _mm_storeu_si128(dst20, _mm_add_epi64(_mm_loadu_si128(dst20), right));

            __m128i* dst22 = (__m128i*)&block_store.positions[i + 22];
            _mm_storeu_si128(dst22, _mm_add_epi64(_mm_loadu_si128(dst22), right));

            __m128i* dst24 = (__m128i*)&block_store.positions[i + 24];
            _mm_storeu_si128(dst24, _mm_add_epi64(_mm_loadu_si128(dst24), right));

            __m128i* dst26 = (__m128i*)&block_store.positions[i + 26];
            _mm_storeu_si128(dst26, _mm_add_epi64(_mm_loadu_si128(dst26), right));

            __m128i* dst28 = (__m128i*)&block_store.positions[i + 28];
            _mm_storeu_si128(dst28, _mm_add_epi64(_mm_loadu_si128(dst28), right));

            __m128i* dst30 = (__m128i*)&block_store.positions[i + 30];
            _mm_storeu_si128(dst30, _mm_add_epi64(_mm_loadu_si128(dst30), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __SSE2__

#if defined(__AVX2__)

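// The AVX2 variants widen the same scheme to four 64-bit position values per
// __m256i register, using _mm256_loadu_si256 / _mm256_add_epi64 /
// _mm256_storeu_si256.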
template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 4.
        int64_t len = n - start_block_index;
        int64_t rem = len & 3; // % 4
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 4)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu4>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 16.
        int64_t len = n - start_block_index;
        int64_t rem = len & 15; // % 16
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 16)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

template<typename Blks>
struct adjust_block_positions<Blks, lu_factor_t::avx2_x64_lu8>
{
    void operator()(Blks& block_store, int64_t start_block_index, int64_t delta) const
    {
        static_assert(
            sizeof(typename decltype(block_store.positions)::value_type) == 8,
            "This code works only when the position values are 64-bit wide.");

        int64_t n = block_store.positions.size();

        if (start_block_index >= n)
            return;

        // Ensure that the section length is divisible by 32.
        int64_t len = n - start_block_index;
        int64_t rem = len & 31; // % 32
        len -= rem;
        len += start_block_index;

        __m256i right = _mm256_set1_epi64x(delta);

#if MDDS_USE_OPENMP
#pragma omp parallel for
#endif
        for (int64_t i = start_block_index; i < len; i += 32)
        {
            __m256i* dst = (__m256i*)&block_store.positions[i];
            _mm256_storeu_si256(dst, _mm256_add_epi64(_mm256_loadu_si256(dst), right));

            __m256i* dst4 = (__m256i*)&block_store.positions[i + 4];
            _mm256_storeu_si256(dst4, _mm256_add_epi64(_mm256_loadu_si256(dst4), right));

            __m256i* dst8 = (__m256i*)&block_store.positions[i + 8];
            _mm256_storeu_si256(dst8, _mm256_add_epi64(_mm256_loadu_si256(dst8), right));

            __m256i* dst12 = (__m256i*)&block_store.positions[i + 12];
            _mm256_storeu_si256(dst12, _mm256_add_epi64(_mm256_loadu_si256(dst12), right));

            __m256i* dst16 = (__m256i*)&block_store.positions[i + 16];
            _mm256_storeu_si256(dst16, _mm256_add_epi64(_mm256_loadu_si256(dst16), right));

            __m256i* dst20 = (__m256i*)&block_store.positions[i + 20];
            _mm256_storeu_si256(dst20, _mm256_add_epi64(_mm256_loadu_si256(dst20), right));

            __m256i* dst24 = (__m256i*)&block_store.positions[i + 24];
            _mm256_storeu_si256(dst24, _mm256_add_epi64(_mm256_loadu_si256(dst24), right));

            __m256i* dst28 = (__m256i*)&block_store.positions[i + 28];
            _mm256_storeu_si256(dst28, _mm256_add_epi64(_mm256_loadu_si256(dst28), right));
        }

        rem += len;
        for (int64_t i = len; i < rem; ++i)
            block_store.positions[i] += delta;
    }
};

#endif // __AVX2__

}}}} // namespace mdds::mtv::soa::detail

#endif

/* vim:set shiftwidth=4 softtabstop=4 expandtab: */