Grok  9.7.5
copy-inl.h
Go to the documentation of this file.
1 // Copyright 2022 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Per-target include guard
17 #if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
18  defined(HWY_TARGET_TOGGLE)
19 #ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
20 #undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
21 #else
22 #define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
23 #endif
24 
25 #include <string.h> // memcpy
26 
27 #include "hwy/highway.h"
28 
30 namespace hwy {
31 namespace HWY_NAMESPACE {
32 
33 // These functions avoid having to write a loop plus remainder handling in the
34 // (unfortunately still common) case where arrays are not aligned/padded. If the
35 // inputs are known to be aligned/padded, it is more efficient to write a single
36 // loop using Load(). We do not provide a CopyAlignedPadded because it
37 // would be more verbose than such a loop.
38 //
39 // If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking.
40 
41 // Copies `from`[0, `count`) to `to`, which must not overlap `from`.
42 template <class D, typename T = TFromD<D>>
43 void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
44  const size_t N = Lanes(d);
45 
46  size_t idx = 0;
47  for (; idx + N <= count; idx += N) {
48  const Vec<D> v = LoadU(d, from + idx);
49  StoreU(v, d, to + idx);
50  }
51 
52  // `count` was a multiple of the vector length `N`: already done.
53  if (HWY_UNLIKELY(idx == count)) return;
54 
55 #if HWY_MEM_OPS_MIGHT_FAULT
56  memcpy(to, from, count * sizeof(T));
57 #else
58  const size_t remaining = count - idx;
59  HWY_DASSERT(0 != remaining && remaining < N);
60  const Mask<D> mask = FirstN(d, remaining);
61 
62  const Vec<D> v = MaskedLoad(mask, d, from + idx);
63  BlendedStore(v, mask, d, to + idx); // Avoid overwriting past the end
64 #endif // HWY_MEM_OPS_MIGHT_FAULT
65 }
66 
67 // For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
68 // corresponding mask element of `func(d, v)` is true. Returns the STL-style end
69 // of the newly written elements in `to`.
70 //
71 // `func` is either a functor with a templated operator()(d, v) returning a
72 // mask, or a generic lambda if using C++14. Due to apparent limitations of
73 // Clang on Windows, it is currently necessary to add HWY_ATTR before the
74 // opening { of the lambda to avoid errors about "function .. requires target".
75 //
76 // NOTE: this is only supported for 16-, 32- or 64-bit types.
77 // NOTE: Func may be called a second time for elements it has already seen, but
78 // these elements will not be written to `to` again.
79 template <class D, class Func, typename T = TFromD<D>>
80 T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
81  const Func& func) {
82  const size_t N = Lanes(d);
83 
84  size_t idx = 0;
85  for (; idx + N <= count; idx += N) {
86  const Vec<D> v = LoadU(d, from + idx);
87  to += CompressBlendedStore(v, func(d, v), d, to);
88  }
89 
90  // `count` was a multiple of the vector length `N`: already done.
91  if (HWY_UNLIKELY(idx == count)) return to;
92 
93 #if HWY_MEM_OPS_MIGHT_FAULT
94  // Proceed one by one.
95  const CappedTag<T, 1> d1;
96  for (; idx < count; ++idx) {
97  using V1 = Vec<decltype(d1)>;
98  const V1 v = LoadU(d1, from + idx);
99  // Avoid storing to `to` unless we know it should be kept - otherwise, we
100  // might overrun the end if it was allocated for the exact count.
101  if (CountTrue(d1, func(d1, v)) == 0) continue;
102  StoreU(v, d1, to);
103  to += 1;
104  }
105 #else
106  // Start index of the last unaligned whole vector, ending at the array end.
107  const size_t last = count - N;
108  // Number of elements before `from` or already written.
109  const size_t invalid = idx - last;
110  HWY_DASSERT(0 != invalid && invalid < N);
111  const Mask<D> mask = Not(FirstN(d, invalid));
112  const Vec<D> v = MaskedLoad(mask, d, from + last);
113  to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
114 #endif
115  return to;
116 }
117 
118 // NOLINTNEXTLINE(google-readability-namespace-comments)
119 } // namespace HWY_NAMESPACE
120 } // namespace hwy
122 
123 #endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_DASSERT(condition)
Definition: base.h:193
#define HWY_UNLIKELY(expr)
Definition: base.h:69
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
d
Definition: rvv-inl.h:1656
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:173
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API size_t CountTrue(Full128< T >, const Mask128< T > mask)
Definition: arm_neon-inl.h:4742
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API size_t CompressBlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:5061
void Copy(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to)
Definition: copy-inl.h:43
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition: arm_neon-inl.h:1422
decltype(MaskFromVec(Zero(D()))) Mask
Definition: generic_ops-inl.h:38
T * CopyIf(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to, const Func &func)
Definition: copy-inl.h:80
N
Definition: rvv-inl.h:1656
const vfloat64m1_t v
Definition: rvv-inl.h:1656
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
FuncOutput(*)(const void *, FuncInput) Func
Definition: nanobenchmark.h:105
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80