48 #if HWY_TARGET != HWY_SCALAR && HWY_TARGET != HWY_RVV
50 template <
size_t kLanes,
class D,
class V = VFromD<D>>
52 constexpr
size_t kBytes = kLanes *
sizeof(LaneType<V>);
53 static_assert(kBytes < 16,
"Shift count is per-block");
54 return CombineShiftRightBytes<kBytes>(
d, hi, lo);
63 const Unsigned bit = Unsigned(1) << (
sizeof(Unsigned) * 8 - 1);
78 template <
class D,
typename T = TFromD<D>>
81 #if HWY_MEM_OPS_MIGHT_FAULT
83 for (
size_t i = 0; i < num; ++i) {
94 #if HWY_TARGET != HWY_SCALAR
110 const auto mask =
Set(du, 0xF);
114 alignas(16)
static constexpr uint8_t basisL[16] = {
115 0x00, 0x70, 0x2A, 0x5A, 0x98, 0xE8, 0xB2, 0xC2,
116 0x08, 0x78, 0x22, 0x52, 0x90, 0xE0, 0xBA, 0xCA};
117 alignas(16)
static constexpr uint8_t basisU[16] = {
118 0x00, 0x4D, 0x7C, 0x31, 0x7D, 0x30, 0x01, 0x4C,
119 0x81, 0xCC, 0xFD, 0xB0, 0xFC, 0xB1, 0x80, 0xCD};
120 const auto sL =
And(state, mask);
121 const auto sU = ShiftRight<4>(state);
124 state =
Xor(gf4L, gf4U);
129 alignas(16)
static constexpr uint8_t kZetaInv[16] = {
130 0x80, 7, 11, 15, 6, 10, 4, 1, 9, 8, 5, 2, 12, 14, 13, 3};
131 alignas(16)
static constexpr uint8_t kInv[16] = {
132 0x80, 1, 8, 13, 15, 6, 5, 14, 2, 12, 11, 10, 9, 3, 7, 4};
134 const auto sL =
And(state, mask);
135 const auto sU = ShiftRight<4>(state);
136 const auto sX =
Xor(sU, sL);
145 alignas(16)
static constexpr uint8_t kAffineL[16] = {
146 0x00, 0xC7, 0xBD, 0x6F, 0x17, 0x6D, 0xD2, 0xD0,
147 0x78, 0xA8, 0x02, 0xC5, 0x7A, 0xBF, 0xAA, 0x15};
148 alignas(16)
static constexpr uint8_t kAffineU[16] = {
149 0x00, 0x6A, 0xBB, 0x5F, 0xA5, 0x74, 0xE4, 0xCF,
150 0xFA, 0x35, 0x2B, 0x41, 0xD1, 0x90, 0x1E, 0x8E};
153 return Xor(
Xor(affL, affU),
Set(du, 0x63));
161 #if (defined(HWY_NATIVE_AES) == defined(HWY_TARGET_TOGGLE))
162 #ifdef HWY_NATIVE_AES
163 #undef HWY_NATIVE_AES
165 #define HWY_NATIVE_AES
169 #if HWY_TARGET != HWY_SCALAR
174 HWY_API V ShiftRows(
const V state) {
176 alignas(16)
static constexpr uint8_t kShiftRow[16] = {
181 const auto shift_row =
LoadDup128(du, kShiftRow);
186 HWY_API V MixColumns(
const V state) {
193 alignas(16)
static constexpr uint8_t k2301[16] = {
194 2, 3, 0, 1, 6, 7, 4, 5, 10, 11, 8, 9, 14, 15, 12, 13};
195 alignas(16)
static constexpr uint8_t k1230[16] = {
196 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12};
200 const auto d =
Xor(
Add(state, state), overflow);
202 const auto d_s2301 =
Xor(
d, s2301);
203 const auto t_s2301 =
Xor(state, d_s2301);
205 return Xor(d_s2301, t1230_s3012);
214 state = detail::SubBytes(state);
215 state = detail::ShiftRows(state);
216 state = detail::MixColumns(state);
217 state =
Xor(state, round_key);
224 state = detail::SubBytes(state);
225 state = detail::ShiftRows(state);
226 state =
Xor(state, round_key);
236 static_assert(
IsSame<
TFromD<decltype(
d)>, uint64_t>(),
"V must be u64");
237 const auto k1 =
Set(
d, 0x1111111111111111ULL);
238 const auto k2 =
Set(
d, 0x2222222222222222ULL);
239 const auto k4 =
Set(
d, 0x4444444444444444ULL);
240 const auto k8 =
Set(
d, 0x8888888888888888ULL);
241 const auto a0 =
And(a, k1);
242 const auto a1 =
And(a, k2);
243 const auto a2 =
And(a, k4);
244 const auto a3 =
And(a, k8);
245 const auto b0 =
And(b, k1);
246 const auto b1 =
And(b, k2);
247 const auto b2 =
And(b, k4);
248 const auto b3 =
And(b, k8);
264 static_assert(
IsSame<
TFromD<decltype(
d)>, uint64_t>(),
"V must be u64");
265 const auto k1 =
Set(
d, 0x1111111111111111ULL);
266 const auto k2 =
Set(
d, 0x2222222222222222ULL);
267 const auto k4 =
Set(
d, 0x4444444444444444ULL);
268 const auto k8 =
Set(
d, 0x8888888888888888ULL);
269 const auto a0 =
And(a, k1);
270 const auto a1 =
And(a, k2);
271 const auto a2 =
And(a, k4);
272 const auto a3 =
And(a, k8);
273 const auto b0 =
And(b, k1);
274 const auto b1 =
And(b, k2);
275 const auto b2 =
And(b, k4);
276 const auto b3 =
And(b, k8);
293 #if (defined(HWY_NATIVE_POPCNT) == defined(HWY_TARGET_TOGGLE))
294 #ifdef HWY_NATIVE_POPCNT
295 #undef HWY_NATIVE_POPCNT
297 #define HWY_NATIVE_POPCNT
300 #if HWY_TARGET == HWY_RVV
301 #define HWY_MIN_POW2_FOR_128 1
305 #define HWY_MIN_POW2_FOR_128 0
310 template <
typename V, HWY_IF_LANES_ARE(u
int8_t, V), HWY_IF_GE128_D(DFromV<V>),
311 HWY_IF_POW2_GE(DFromV<V>, HWY_MIN_POW2_FOR_128)>
314 HWY_ALIGN constexpr uint8_t kLookup[16] = {
315 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
317 const auto lo =
And(
v,
Set(
d, 0xF));
318 const auto hi = ShiftRight<4>(
v);
324 #if HWY_TARGET != HWY_RVV
326 template <
typename V, HWY_IF_LANES_ARE(u
int8_t, V), HWY_IF_LT128_D(DFromV<V>)>
336 template <
typename V, HWY_IF_LANES_ARE(u
int16_t, V)>
341 return Add(ShiftRight<8>(vals),
And(vals,
Set(
d, 0xFF)));
344 template <
typename V, HWY_IF_LANES_ARE(u
int32_t, V)>
349 return Add(ShiftRight<16>(vals),
And(vals,
Set(
d, 0xFF)));
352 #if HWY_HAVE_INTEGER64
353 template <
typename V, HWY_IF_LANES_ARE(u
int64_t, V)>
358 return Add(ShiftRight<32>(vals),
And(vals,
Set(
d, 0xFF)));
#define HWY_RESTRICT
Definition: base.h:63
#define HWY_API
Definition: base.h:122
#define HWY_INLINE
Definition: base.h:64
HWY_INLINE Mask128< T, N > Xor(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:936
HWY_INLINE Vec128< T, N > IfThenElseZero(hwy::SizeTag< 1 >, Mask128< T, N > mask, Vec128< T, N > yes)
Definition: x86_128-inl.h:728
HWY_INLINE Mask128< T, N > And(hwy::SizeTag< 1 >, const Mask128< T, N > a, const Mask128< T, N > b)
Definition: x86_128-inl.h:825
d
Definition: rvv-inl.h:1656
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:61
HWY_API uint8_t GetLane(const Vec128< uint8_t, 16 > v)
Definition: arm_neon-inl.h:767
HWY_API Vec128< T, N > PopulationCount(Vec128< T, N > v)
Definition: arm_neon-inl.h:1595
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5252
HWY_API Vec< D > NaN(D d)
Definition: generic_ops-inl.h:69
HWY_API Mask128< T, N > FirstN(const Simd< T, N, 0 > d, size_t num)
Definition: arm_neon-inl.h:1896
HWY_API Vec128< uint64_t, N > Min(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1957
HWY_API Vec256< uint64_t > CLMulLower(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3972
HWY_API Vec128< uint64_t, N > Max(const Vec128< uint64_t, N > a, const Vec128< uint64_t, N > b)
Definition: arm_neon-inl.h:1995
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition: arm_neon-inl.h:1675
HWY_INLINE Vec128< uint64_t > MulOdd(Vec128< uint64_t > a, Vec128< uint64_t > b)
Definition: arm_neon-inl.h:4159
HWY_API Vec128< T, N > Zero(Simd< T, N, 0 > d)
Definition: arm_neon-inl.h:733
HWY_API Vec256< uint8_t > AESRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3947
HWY_API Vec128< int64_t > MulEven(Vec128< int32_t > a, Vec128< int32_t > b)
Definition: arm_neon-inl.h:4119
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition: ops/shared-inl.h:199
HWY_API void BlendedStore(Vec128< T, N > v, Mask128< T, N > m, Simd< T, N, 0 > d, T *HWY_RESTRICT p)
Definition: arm_neon-inl.h:2402
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5217
svuint16_t Set(Simd< bfloat16_t, N, kPow2 > d, bfloat16_t arg)
Definition: arm_sve-inl.h:282
HWY_API Vec256< uint64_t > CLMulUpper(Vec256< uint64_t > a, Vec256< uint64_t > b)
Definition: x86_256-inl.h:3983
sseg3 sseg3 sseg4 sseg4 m2
Definition: rvv-inl.h:1436
HWY_API Vec256< uint8_t > AESLastRound(Vec256< uint8_t > state, Vec256< uint8_t > round_key)
Definition: x86_256-inl.h:3959
HWY_API Vec128< uint8_t > LoadU(Full128< uint8_t >, const uint8_t *HWY_RESTRICT unaligned)
Definition: arm_neon-inl.h:2031
decltype(GetLane(V())) LaneType
Definition: generic_ops-inl.h:25
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1440
HWY_API Vec128< T, N > BitCast(Simd< T, N, 0 > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:710
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5221
typename D::template Rebind< T > Rebind
Definition: ops/shared-inl.h:196
sseg3 m1
Definition: rvv-inl.h:1409
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1489
HWY_API V Clamp(const V v, const V lo, const V hi)
Definition: generic_ops-inl.h:42
decltype(detail::DeduceD()(V())) DFromV
Definition: arm_neon-inl.h:555
HWY_API Vec128< T, N > LoadDup128(Simd< T, N, 0 > d, const T *const HWY_RESTRICT p)
Definition: arm_neon-inl.h:2217
HWY_API void SafeCopyN(const size_t num, D d, const T *HWY_RESTRICT from, T *HWY_RESTRICT to)
Definition: generic_ops-inl.h:79
typename D::template Repartition< T > Repartition
Definition: ops/shared-inl.h:207
decltype(MaskFromVec(Zero(D()))) Mask
Definition: generic_ops-inl.h:38
HWY_API Vec128< TI > TableLookupBytes(const Vec128< T > bytes, const Vec128< TI > from)
Definition: arm_neon-inl.h:4169
const vfloat64m1_t v
Definition: rvv-inl.h:1656
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition: rvv-inl.h:2108
typename D::T TFromD
Definition: ops/shared-inl.h:192
HWY_API VI TableLookupBytesOr0(const V bytes, const VI from)
Definition: arm_neon-inl.h:4224
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1477
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:32
Definition: aligned_allocator.h:27
constexpr HWY_API T LimitsMax()
Definition: base.h:497
constexpr HWY_API bool IsSame()
Definition: base.h:286
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition: base.h:452
#define HWY_ALIGN
Definition: set_macros-inl.h:81
#define HWY_NAMESPACE
Definition: set_macros-inl.h:80