Grok  9.7.5
transform-inl.h
Go to the documentation of this file.
1 // Copyright 2022 Google LLC
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 // Per-target include guard
17 #if defined(HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_) == \
18  defined(HWY_TARGET_TOGGLE)
19 #ifdef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
20 #undef HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
21 #else
22 #define HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
23 #endif
24 
25 #include "hwy/highway.h"
26 
28 namespace hwy {
29 namespace HWY_NAMESPACE {
30 
31 // These functions avoid having to write a loop plus remainder handling in the
32 // (unfortunately still common) case where arrays are not aligned/padded. If the
33 // inputs are known to be aligned/padded, it is more efficient to write a single
34 // loop using Load(). We do not provide a TransformAlignedPadded because it
35 // would be more verbose than such a loop.
36 //
37 // Func is either a functor with a templated operator()(d, v[, v1[, v2]]), or a
38 // generic lambda if using C++14. Due to apparent limitations of Clang on
39 // Windows, it is currently necessary to add HWY_ATTR before the opening { of
40 // the lambda to avoid errors about "always_inline function .. requires target".
41 //
42 // If HWY_MEM_OPS_MIGHT_FAULT, we use scalar code instead of masking. Otherwise,
43 // we use `MaskedLoad` and `BlendedStore` to read/write the final partial
44 // vector.
45 
46 // Replaces `inout[idx]` with `func(d, inout[idx])`. Example usage: multiplying
47 // array elements by a constant.
48 template <class D, class Func, typename T = TFromD<D>>
49 void Transform(D d, T* HWY_RESTRICT inout, size_t count, const Func& func) {
50  const size_t N = Lanes(d);
51 
52  size_t idx = 0;
53  for (; idx + N <= count; idx += N) {
54  const Vec<D> v = LoadU(d, inout + idx);
55  StoreU(func(d, v), d, inout + idx);
56  }
57 
58  // `count` was a multiple of the vector length `N`: already done.
59  if (HWY_UNLIKELY(idx == count)) return;
60 
61 #if HWY_MEM_OPS_MIGHT_FAULT
62  // Proceed one by one.
63  const CappedTag<T, 1> d1;
64  for (; idx < count; ++idx) {
65  using V1 = Vec<decltype(d1)>;
66  const V1 v = LoadU(d1, inout + idx);
67  StoreU(func(d1, v), d1, inout + idx);
68  }
69 #else
70  const size_t remaining = count - idx;
71  HWY_DASSERT(0 != remaining && remaining < N);
72  const Mask<D> mask = FirstN(d, remaining);
73  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
74  BlendedStore(func(d, v), mask, d, inout + idx);
75 #endif
76 }
77 
78 // Replaces `inout[idx]` with `func(d, inout[idx], in1[idx])`. Example usage:
79 // multiplying array elements by those of another array.
80 template <class D, class Func, typename T = TFromD<D>>
81 void Transform1(D d, T* HWY_RESTRICT inout, size_t count,
82  const T* HWY_RESTRICT in1, const Func& func) {
83  const size_t N = Lanes(d);
84 
85  size_t idx = 0;
86  for (; idx + N <= count; idx += N) {
87  const Vec<D> v = LoadU(d, inout + idx);
88  const Vec<D> v1 = LoadU(d, in1 + idx);
89  StoreU(func(d, v, v1), d, inout + idx);
90  }
91 
92  // `count` was a multiple of the vector length `N`: already done.
93  if (HWY_UNLIKELY(idx == count)) return;
94 
95 #if HWY_MEM_OPS_MIGHT_FAULT
96  // Proceed one by one.
97  const CappedTag<T, 1> d1;
98  for (; idx < count; ++idx) {
99  using V1 = Vec<decltype(d1)>;
100  const V1 v = LoadU(d1, inout + idx);
101  const V1 v1 = LoadU(d1, in1 + idx);
102  StoreU(func(d1, v, v1), d1, inout + idx);
103  }
104 #else
105  const size_t remaining = count - idx;
106  HWY_DASSERT(0 != remaining && remaining < N);
107  const Mask<D> mask = FirstN(d, remaining);
108  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
109  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
110  BlendedStore(func(d, v, v1), mask, d, inout + idx);
111 #endif
112 }
113 
114 // Replaces `inout[idx]` with `func(d, inout[idx], in1[idx], in2[idx])`. Example
115 // usage: FMA of elements from three arrays, stored into the first array.
116 template <class D, class Func, typename T = TFromD<D>>
117 void Transform2(D d, T* HWY_RESTRICT inout, size_t count,
118  const T* HWY_RESTRICT in1, const T* HWY_RESTRICT in2,
119  const Func& func) {
120  const size_t N = Lanes(d);
121 
122  size_t idx = 0;
123  for (; idx + N <= count; idx += N) {
124  const Vec<D> v = LoadU(d, inout + idx);
125  const Vec<D> v1 = LoadU(d, in1 + idx);
126  const Vec<D> v2 = LoadU(d, in2 + idx);
127  StoreU(func(d, v, v1, v2), d, inout + idx);
128  }
129 
130  // `count` was a multiple of the vector length `N`: already done.
131  if (HWY_UNLIKELY(idx == count)) return;
132 
133 #if HWY_MEM_OPS_MIGHT_FAULT
134  // Proceed one by one.
135  const CappedTag<T, 1> d1;
136  for (; idx < count; ++idx) {
137  using V1 = Vec<decltype(d1)>;
138  const V1 v = LoadU(d1, inout + idx);
139  const V1 v1 = LoadU(d1, in1 + idx);
140  const V1 v2 = LoadU(d1, in2 + idx);
141  StoreU(func(d1, v, v1, v2), d1, inout + idx);
142  }
143 #else
144  const size_t remaining = count - idx;
145  HWY_DASSERT(0 != remaining && remaining < N);
146  const Mask<D> mask = FirstN(d, remaining);
147  const Vec<D> v = MaskedLoad(mask, d, inout + idx);
148  const Vec<D> v1 = MaskedLoad(mask, d, in1 + idx);
149  const Vec<D> v2 = MaskedLoad(mask, d, in2 + idx);
150  BlendedStore(func(d, v, v1, v2), mask, d, inout + idx);
151 #endif
152 }
153 
154 // NOLINTNEXTLINE(google-readability-namespace-comments)
155 } // namespace HWY_NAMESPACE
156 } // namespace hwy
158 
159 #endif // HIGHWAY_HWY_CONTRIB_ALGO_TRANSFORM_INL_H_
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_DASSERT(condition)
Definition: base.h:193
#define HWY_UNLIKELY(expr)
Definition: base.h:69
d
Definition: rvv-inl.h:1656
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
typename detail::CappedTagChecker< T, kLimit >::type CappedTag
Definition: ops/shared-inl.h:173
void Transform2(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const T *HWY_RESTRICT in2, const Func &func)
Definition: transform-inl.h:117
HWY_API size_t Lanes(Simd< T, N, kPow2 > d)
Definition: arm_sve-inl.h:218
HWY_API Vec128< T, N > MaskedLoad(Mask128< T, N > m, Simd< T, N, 0 > d, const T *HWY_RESTRICT aligned)
Definition: arm_neon-inl.h:2210
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API void StoreU(const Vec128< uint8_t > v, Full128< uint8_t >, uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2224
void Transform(D d, T *HWY_RESTRICT inout, size_t count, const Func &func)
Definition: transform-inl.h:49
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
decltype(MaskFromVec(Zero(D()))) Mask
Definition: generic_ops-inl.h:38
N
Definition: rvv-inl.h:1656
void Transform1(D d, T *HWY_RESTRICT inout, size_t count, const T *HWY_RESTRICT in1, const Func &func)
Definition: transform-inl.h:81
const vfloat64m1_t v
Definition: rvv-inl.h:1656
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
FuncOutput(*)(const void *, FuncInput) Func
Definition: nanobenchmark.h:105
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()