Grok  9.5.0
math-inl.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 // Include guard (still compiled once per target)
16 #if defined(HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_) == \
17  defined(HWY_TARGET_TOGGLE)
18 #ifdef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
19 #undef HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
20 #else
21 #define HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
22 #endif
23 
24 #include "hwy/highway.h"
25 
27 namespace hwy {
28 namespace HWY_NAMESPACE {
29 
38 template <class D, class V>
39 HWY_INLINE V Acos(const D d, V x);
40 template <class D, class V>
41 HWY_NOINLINE V CallAcos(const D d, VecArg<V> x) {
42  return Acos(d, x);
43 }
44 
53 template <class D, class V>
54 HWY_INLINE V Acosh(const D d, V x);
55 template <class D, class V>
56 HWY_NOINLINE V CallAcosh(const D d, VecArg<V> x) {
57  return Acosh(d, x);
58 }
59 
68 template <class D, class V>
69 HWY_INLINE V Asin(const D d, V x);
70 template <class D, class V>
71 HWY_NOINLINE V CallAsin(const D d, VecArg<V> x) {
72  return Asin(d, x);
73 }
74 
83 template <class D, class V>
84 HWY_INLINE V Asinh(const D d, V x);
85 template <class D, class V>
86 HWY_NOINLINE V CallAsinh(const D d, VecArg<V> x) {
87  return Asinh(d, x);
88 }
89 
98 template <class D, class V>
99 HWY_INLINE V Atan(const D d, V x);
100 template <class D, class V>
101 HWY_NOINLINE V CallAtan(const D d, VecArg<V> x) {
102  return Atan(d, x);
103 }
104 
113 template <class D, class V>
114 HWY_INLINE V Atanh(const D d, V x);
115 template <class D, class V>
117  return Atanh(d, x);
118 }
119 
128 template <class D, class V>
129 HWY_INLINE V Cos(const D d, V x);
130 template <class D, class V>
131 HWY_NOINLINE V CallCos(const D d, VecArg<V> x) {
132  return Cos(d, x);
133 }
134 
143 template <class D, class V>
144 HWY_INLINE V Exp(const D d, V x);
145 template <class D, class V>
146 HWY_NOINLINE V CallExp(const D d, VecArg<V> x) {
147  return Exp(d, x);
148 }
149 
158 template <class D, class V>
159 HWY_INLINE V Expm1(const D d, V x);
160 template <class D, class V>
162  return Expm1(d, x);
163 }
164 
173 template <class D, class V>
174 HWY_INLINE V Log(const D d, V x);
175 template <class D, class V>
176 HWY_NOINLINE V CallLog(const D d, VecArg<V> x) {
177  return Log(d, x);
178 }
179 
188 template <class D, class V>
189 HWY_INLINE V Log10(const D d, V x);
190 template <class D, class V>
192  return Log10(d, x);
193 }
194 
203 template <class D, class V>
204 HWY_INLINE V Log1p(const D d, V x);
205 template <class D, class V>
207  return Log1p(d, x);
208 }
209 
218 template <class D, class V>
219 HWY_INLINE V Log2(const D d, V x);
220 template <class D, class V>
221 HWY_NOINLINE V CallLog2(const D d, VecArg<V> x) {
222  return Log2(d, x);
223 }
224 
233 template <class D, class V>
234 HWY_INLINE V Sin(const D d, V x);
235 template <class D, class V>
236 HWY_NOINLINE V CallSin(const D d, VecArg<V> x) {
237  return Sin(d, x);
238 }
239 
248 template <class D, class V>
249 HWY_INLINE V Sinh(const D d, V x);
250 template <class D, class V>
251 HWY_NOINLINE V CallSinh(const D d, VecArg<V> x) {
252  return Sinh(d, x);
253 }
254 
263 template <class D, class V>
264 HWY_INLINE V Tanh(const D d, V x);
265 template <class D, class V>
266 HWY_NOINLINE V CallTanh(const D d, VecArg<V> x) {
267  return Tanh(d, x);
268 }
269 
271 // Implementation
273 namespace impl {
274 
275 // Estrin's Scheme is a faster method for evaluating large polynomials on
276 // super scalar architectures. It works by factoring the Horner's Method
277 // polynomial into power of two sub-trees that can be evaluated in parallel.
278 // Wikipedia Link: https://en.wikipedia.org/wiki/Estrin%27s_scheme
279 template <class T>
280 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1) {
281  return MulAdd(c1, x, c0);
282 }
283 template <class T>
284 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2) {
285  T x2 = Mul(x, x);
286  return MulAdd(x2, c2, MulAdd(c1, x, c0));
287 }
288 template <class T>
289 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3) {
290  T x2 = Mul(x, x);
291  return MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0));
292 }
293 template <class T>
294 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4) {
295  T x2 = Mul(x, x);
296  T x4 = Mul(x2, x2);
297  return MulAdd(x4, c4, MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
298 }
299 template <class T>
300 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5) {
301  T x2 = Mul(x, x);
302  T x4 = Mul(x2, x2);
303  return MulAdd(x4, MulAdd(c5, x, c4),
304  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
305 }
306 template <class T>
307 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
308  T c6) {
309  T x2 = Mul(x, x);
310  T x4 = Mul(x2, x2);
311  return MulAdd(x4, MulAdd(x2, c6, MulAdd(c5, x, c4)),
312  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
313 }
314 template <class T>
315 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
316  T c6, T c7) {
317  T x2 = Mul(x, x);
318  T x4 = Mul(x2, x2);
319  return MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
320  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)));
321 }
322 template <class T>
323 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
324  T c6, T c7, T c8) {
325  T x2 = Mul(x, x);
326  T x4 = Mul(x2, x2);
327  T x8 = Mul(x4, x4);
328  return MulAdd(x8, c8,
329  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
330  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
331 }
332 template <class T>
333 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
334  T c6, T c7, T c8, T c9) {
335  T x2 = Mul(x, x);
336  T x4 = Mul(x2, x2);
337  T x8 = Mul(x4, x4);
338  return MulAdd(x8, MulAdd(c9, x, c8),
339  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
340  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
341 }
342 template <class T>
343 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
344  T c6, T c7, T c8, T c9, T c10) {
345  T x2 = Mul(x, x);
346  T x4 = Mul(x2, x2);
347  T x8 = Mul(x4, x4);
348  return MulAdd(x8, MulAdd(x2, c10, MulAdd(c9, x, c8)),
349  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
350  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
351 }
352 template <class T>
353 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
354  T c6, T c7, T c8, T c9, T c10, T c11) {
355  T x2 = Mul(x, x);
356  T x4 = Mul(x2, x2);
357  T x8 = Mul(x4, x4);
358  return MulAdd(x8, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8)),
359  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
360  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
361 }
362 template <class T>
363 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
364  T c6, T c7, T c8, T c9, T c10, T c11,
365  T c12) {
366  T x2 = Mul(x, x);
367  T x4 = Mul(x2, x2);
368  T x8 = Mul(x4, x4);
369  return MulAdd(
370  x8, MulAdd(x4, c12, MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
371  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
372  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
373 }
374 template <class T>
375 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
376  T c6, T c7, T c8, T c9, T c10, T c11,
377  T c12, T c13) {
378  T x2 = Mul(x, x);
379  T x4 = Mul(x2, x2);
380  T x8 = Mul(x4, x4);
381  return MulAdd(x8,
382  MulAdd(x4, MulAdd(c13, x, c12),
383  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
384  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
385  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
386 }
387 template <class T>
388 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
389  T c6, T c7, T c8, T c9, T c10, T c11,
390  T c12, T c13, T c14) {
391  T x2 = Mul(x, x);
392  T x4 = Mul(x2, x2);
393  T x8 = Mul(x4, x4);
394  return MulAdd(x8,
395  MulAdd(x4, MulAdd(x2, c14, MulAdd(c13, x, c12)),
396  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
397  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
398  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
399 }
400 template <class T>
401 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
402  T c6, T c7, T c8, T c9, T c10, T c11,
403  T c12, T c13, T c14, T c15) {
404  T x2 = Mul(x, x);
405  T x4 = Mul(x2, x2);
406  T x8 = Mul(x4, x4);
407  return MulAdd(x8,
408  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
409  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
410  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
411  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0))));
412 }
413 template <class T>
414 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
415  T c6, T c7, T c8, T c9, T c10, T c11,
416  T c12, T c13, T c14, T c15, T c16) {
417  T x2 = Mul(x, x);
418  T x4 = Mul(x2, x2);
419  T x8 = Mul(x4, x4);
420  T x16 = Mul(x8, x8);
421  return MulAdd(
422  x16, c16,
423  MulAdd(x8,
424  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
425  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
426  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
427  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
428 }
429 template <class T>
430 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
431  T c6, T c7, T c8, T c9, T c10, T c11,
432  T c12, T c13, T c14, T c15, T c16, T c17) {
433  T x2 = Mul(x, x);
434  T x4 = Mul(x2, x2);
435  T x8 = Mul(x4, x4);
436  T x16 = Mul(x8, x8);
437  return MulAdd(
438  x16, MulAdd(c17, x, c16),
439  MulAdd(x8,
440  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
441  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
442  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
443  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
444 }
445 template <class T>
446 HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1, T c2, T c3, T c4, T c5,
447  T c6, T c7, T c8, T c9, T c10, T c11,
448  T c12, T c13, T c14, T c15, T c16, T c17,
449  T c18) {
450  T x2 = Mul(x, x);
451  T x4 = Mul(x2, x2);
452  T x8 = Mul(x4, x4);
453  T x16 = Mul(x8, x8);
454  return MulAdd(
455  x16, MulAdd(x2, c18, MulAdd(c17, x, c16)),
456  MulAdd(x8,
457  MulAdd(x4, MulAdd(x2, MulAdd(c15, x, c14), MulAdd(c13, x, c12)),
458  MulAdd(x2, MulAdd(c11, x, c10), MulAdd(c9, x, c8))),
459  MulAdd(x4, MulAdd(x2, MulAdd(c7, x, c6), MulAdd(c5, x, c4)),
460  MulAdd(x2, MulAdd(c3, x, c2), MulAdd(c1, x, c0)))));
461 }
462 
463 template <class FloatOrDouble>
464 struct AsinImpl {};
465 template <class FloatOrDouble>
466 struct AtanImpl {};
467 template <class FloatOrDouble>
468 struct CosSinImpl {};
469 template <class FloatOrDouble>
470 struct ExpImpl {};
471 template <class FloatOrDouble>
472 struct LogImpl {};
473 
474 template <>
475 struct AsinImpl<float> {
476  // Polynomial approximation for asin(x) over the range [0, 0.5).
477  template <class D, class V>
478  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
479  const auto k0 = Set(d, +0.1666677296f);
480  const auto k1 = Set(d, +0.07495029271f);
481  const auto k2 = Set(d, +0.04547423869f);
482  const auto k3 = Set(d, +0.02424046025f);
483  const auto k4 = Set(d, +0.04197454825f);
484 
485  return Estrin(x2, k0, k1, k2, k3, k4);
486  }
487 };
488 
489 #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
490 
491 template <>
492 struct AsinImpl<double> {
493  // Polynomial approximation for asin(x) over the range [0, 0.5).
494  template <class D, class V>
495  HWY_INLINE V AsinPoly(D d, V x2, V /*x*/) {
496  const auto k0 = Set(d, +0.1666666666666497543);
497  const auto k1 = Set(d, +0.07500000000378581611);
498  const auto k2 = Set(d, +0.04464285681377102438);
499  const auto k3 = Set(d, +0.03038195928038132237);
500  const auto k4 = Set(d, +0.02237176181932048341);
501  const auto k5 = Set(d, +0.01735956991223614604);
502  const auto k6 = Set(d, +0.01388715184501609218);
503  const auto k7 = Set(d, +0.01215360525577377331);
504  const auto k8 = Set(d, +0.006606077476277170610);
505  const auto k9 = Set(d, +0.01929045477267910674);
506  const auto k10 = Set(d, -0.01581918243329996643);
507  const auto k11 = Set(d, +0.03161587650653934628);
508 
509  return Estrin(x2, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11);
510  }
511 };
512 
513 #endif
514 
515 template <>
516 struct AtanImpl<float> {
517  // Polynomial approximation for atan(x) over the range [0, 1.0).
518  template <class D, class V>
519  HWY_INLINE V AtanPoly(D d, V x) {
520  const auto k0 = Set(d, -0.333331018686294555664062f);
521  const auto k1 = Set(d, +0.199926957488059997558594f);
522  const auto k2 = Set(d, -0.142027363181114196777344f);
523  const auto k3 = Set(d, +0.106347933411598205566406f);
524  const auto k4 = Set(d, -0.0748900920152664184570312f);
525  const auto k5 = Set(d, +0.0425049886107444763183594f);
526  const auto k6 = Set(d, -0.0159569028764963150024414f);
527  const auto k7 = Set(d, +0.00282363896258175373077393f);
528 
529  const auto y = Mul(x, x);
530  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7), Mul(y, x), x);
531  }
532 };
533 
534 #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
535 
536 template <>
537 struct AtanImpl<double> {
538  // Polynomial approximation for atan(x) over the range [0, 1.0).
539  template <class D, class V>
540  HWY_INLINE V AtanPoly(D d, V x) {
541  const auto k0 = Set(d, -0.333333333333311110369124);
542  const auto k1 = Set(d, +0.199999999996591265594148);
543  const auto k2 = Set(d, -0.14285714266771329383765);
544  const auto k3 = Set(d, +0.111111105648261418443745);
545  const auto k4 = Set(d, -0.090908995008245008229153);
546  const auto k5 = Set(d, +0.0769219538311769618355029);
547  const auto k6 = Set(d, -0.0666573579361080525984562);
548  const auto k7 = Set(d, +0.0587666392926673580854313);
549  const auto k8 = Set(d, -0.0523674852303482457616113);
550  const auto k9 = Set(d, +0.0466667150077840625632675);
551  const auto k10 = Set(d, -0.0407629191276836500001934);
552  const auto k11 = Set(d, +0.0337852580001353069993897);
553  const auto k12 = Set(d, -0.0254517624932312641616861);
554  const auto k13 = Set(d, +0.016599329773529201970117);
555  const auto k14 = Set(d, -0.00889896195887655491740809);
556  const auto k15 = Set(d, +0.00370026744188713119232403);
557  const auto k16 = Set(d, -0.00110611831486672482563471);
558  const auto k17 = Set(d, +0.000209850076645816976906797);
559  const auto k18 = Set(d, -1.88796008463073496563746e-5);
560 
561  const auto y = Mul(x, x);
562  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10, k11,
563  k12, k13, k14, k15, k16, k17, k18),
564  Mul(y, x), x);
565  }
566 };
567 
568 #endif
569 
570 template <>
571 struct CosSinImpl<float> {
572  // Rounds float toward zero and returns as int32_t.
573  template <class D, class V>
575  return ConvertTo(Rebind<int32_t, D>(), x);
576  }
577 
578  template <class D, class V>
579  HWY_INLINE V Poly(D d, V x) {
580  const auto k0 = Set(d, -1.66666597127914428710938e-1f);
581  const auto k1 = Set(d, +8.33307858556509017944336e-3f);
582  const auto k2 = Set(d, -1.981069071916863322258e-4f);
583  const auto k3 = Set(d, +2.6083159809786593541503e-6f);
584 
585  const auto y = Mul(x, x);
586  return MulAdd(Estrin(y, k0, k1, k2, k3), Mul(y, x), x);
587  }
588 
589  template <class D, class V, class VI32>
590  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
591  // kHalfPiPart0f + kHalfPiPart1f + kHalfPiPart2f + kHalfPiPart3f ~= -pi/2
592  const V kHalfPiPart0f = Set(d, -0.5f * 3.140625f);
593  const V kHalfPiPart1f = Set(d, -0.5f * 0.0009670257568359375f);
594  const V kHalfPiPart2f = Set(d, -0.5f * 6.2771141529083251953e-7f);
595  const V kHalfPiPart3f = Set(d, -0.5f * 1.2154201256553420762e-10f);
596 
597  // Extended precision modular arithmetic.
598  const V qf = ConvertTo(d, q);
599  x = MulAdd(qf, kHalfPiPart0f, x);
600  x = MulAdd(qf, kHalfPiPart1f, x);
601  x = MulAdd(qf, kHalfPiPart2f, x);
602  x = MulAdd(qf, kHalfPiPart3f, x);
603  return x;
604  }
605 
606  template <class D, class V, class VI32>
607  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
608  // kPiPart0f + kPiPart1f + kPiPart2f + kPiPart3f ~= -pi
609  const V kPiPart0f = Set(d, -3.140625f);
610  const V kPiPart1f = Set(d, -0.0009670257568359375f);
611  const V kPiPart2f = Set(d, -6.2771141529083251953e-7f);
612  const V kPiPart3f = Set(d, -1.2154201256553420762e-10f);
613 
614  // Extended precision modular arithmetic.
615  const V qf = ConvertTo(d, q);
616  x = MulAdd(qf, kPiPart0f, x);
617  x = MulAdd(qf, kPiPart1f, x);
618  x = MulAdd(qf, kPiPart2f, x);
619  x = MulAdd(qf, kPiPart3f, x);
620  return x;
621  }
622 
623  // (q & 2) == 0 ? -0.0 : +0.0
624  template <class D, class VI32>
626  const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
627  return BitCast(d, ShiftLeft<30>(AndNot(q, kTwo)));
628  }
629 
630  // ((q & 1) ? -0.0 : +0.0)
631  template <class D, class VI32>
633  const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
634  return BitCast(d, ShiftLeft<31>(And(q, kOne)));
635  }
636 };
637 
638 #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
639 
640 template <>
641 struct CosSinImpl<double> {
642  // Rounds double toward zero and returns as int32_t.
643  template <class D, class V>
644  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
645  return DemoteTo(Rebind<int32_t, D>(), x);
646  }
647 
648  template <class D, class V>
649  HWY_INLINE V Poly(D d, V x) {
650  const auto k0 = Set(d, -0.166666666666666657414808);
651  const auto k1 = Set(d, +0.00833333333333332974823815);
652  const auto k2 = Set(d, -0.000198412698412696162806809);
653  const auto k3 = Set(d, +2.75573192239198747630416e-6);
654  const auto k4 = Set(d, -2.50521083763502045810755e-8);
655  const auto k5 = Set(d, +1.60590430605664501629054e-10);
656  const auto k6 = Set(d, -7.64712219118158833288484e-13);
657  const auto k7 = Set(d, +2.81009972710863200091251e-15);
658  const auto k8 = Set(d, -7.97255955009037868891952e-18);
659 
660  const auto y = Mul(x, x);
661  return MulAdd(Estrin(y, k0, k1, k2, k3, k4, k5, k6, k7, k8), Mul(y, x), x);
662  }
663 
664  template <class D, class V, class VI32>
665  HWY_INLINE V CosReduce(D d, V x, VI32 q) {
666  // kHalfPiPart0d + kHalfPiPart1d + kHalfPiPart2d + kHalfPiPart3d ~= -pi/2
667  const V kHalfPiPart0d = Set(d, -0.5 * 3.1415926218032836914);
668  const V kHalfPiPart1d = Set(d, -0.5 * 3.1786509424591713469e-8);
669  const V kHalfPiPart2d = Set(d, -0.5 * 1.2246467864107188502e-16);
670  const V kHalfPiPart3d = Set(d, -0.5 * 1.2736634327021899816e-24);
671 
672  // Extended precision modular arithmetic.
673  const V qf = PromoteTo(d, q);
674  x = MulAdd(qf, kHalfPiPart0d, x);
675  x = MulAdd(qf, kHalfPiPart1d, x);
676  x = MulAdd(qf, kHalfPiPart2d, x);
677  x = MulAdd(qf, kHalfPiPart3d, x);
678  return x;
679  }
680 
681  template <class D, class V, class VI32>
682  HWY_INLINE V SinReduce(D d, V x, VI32 q) {
683  // kPiPart0d + kPiPart1d + kPiPart2d + kPiPart3d ~= -pi
684  const V kPiPart0d = Set(d, -3.1415926218032836914);
685  const V kPiPart1d = Set(d, -3.1786509424591713469e-8);
686  const V kPiPart2d = Set(d, -1.2246467864107188502e-16);
687  const V kPiPart3d = Set(d, -1.2736634327021899816e-24);
688 
689  // Extended precision modular arithmetic.
690  const V qf = PromoteTo(d, q);
691  x = MulAdd(qf, kPiPart0d, x);
692  x = MulAdd(qf, kPiPart1d, x);
693  x = MulAdd(qf, kPiPart2d, x);
694  x = MulAdd(qf, kPiPart3d, x);
695  return x;
696  }
697 
698  // (q & 2) == 0 ? -0.0 : +0.0
699  template <class D, class VI32>
700  HWY_INLINE Vec<Rebind<double, D>> CosSignFromQuadrant(D d, VI32 q) {
701  const VI32 kTwo = Set(Rebind<int32_t, D>(), 2);
702  return BitCast(
703  d, ShiftLeft<62>(PromoteTo(Rebind<int64_t, D>(), AndNot(q, kTwo))));
704  }
705 
706  // ((q & 1) ? -0.0 : +0.0)
707  template <class D, class VI32>
708  HWY_INLINE Vec<Rebind<double, D>> SinSignFromQuadrant(D d, VI32 q) {
709  const VI32 kOne = Set(Rebind<int32_t, D>(), 1);
710  return BitCast(
711  d, ShiftLeft<63>(PromoteTo(Rebind<int64_t, D>(), And(q, kOne))));
712  }
713 };
714 
715 #endif
716 
717 template <>
718 struct ExpImpl<float> {
719  // Rounds float toward zero and returns as int32_t.
720  template <class D, class V>
722  return ConvertTo(Rebind<int32_t, D>(), x);
723  }
724 
725  template <class D, class V>
726  HWY_INLINE V ExpPoly(D d, V x) {
727  const auto k0 = Set(d, +0.5f);
728  const auto k1 = Set(d, +0.166666671633720397949219f);
729  const auto k2 = Set(d, +0.0416664853692054748535156f);
730  const auto k3 = Set(d, +0.00833336077630519866943359f);
731  const auto k4 = Set(d, +0.00139304355252534151077271f);
732  const auto k5 = Set(d, +0.000198527617612853646278381f);
733 
734  return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5), Mul(x, x), x);
735  }
736 
737  // Computes 2^x, where x is an integer.
738  template <class D, class VI32>
739  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
740  const Rebind<int32_t, D> di32;
741  const VI32 kOffset = Set(di32, 0x7F);
742  return BitCast(d, ShiftLeft<23>(Add(x, kOffset)));
743  }
744 
745  // Sets the exponent of 'x' to 2^e.
746  template <class D, class V, class VI32>
747  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
748  const VI32 y = ShiftRight<1>(e);
749  return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
750  }
751 
752  template <class D, class V, class VI32>
753  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
754  // kLn2Part0f + kLn2Part1f ~= -ln(2)
755  const V kLn2Part0f = Set(d, -0.693145751953125f);
756  const V kLn2Part1f = Set(d, -1.428606765330187045e-6f);
757 
758  // Extended precision modular arithmetic.
759  const V qf = ConvertTo(d, q);
760  x = MulAdd(qf, kLn2Part0f, x);
761  x = MulAdd(qf, kLn2Part1f, x);
762  return x;
763  }
764 };
765 
766 template <>
767 struct LogImpl<float> {
768  template <class D, class V>
770  const Rebind<int32_t, D> di32;
771  const Rebind<uint32_t, D> du32;
772  const auto kBias = Set(di32, 0x7F);
773  return Sub(BitCast(di32, ShiftRight<23>(BitCast(du32, x))), kBias);
774  }
775 
776  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
777  template <class D, class V>
778  HWY_INLINE V LogPoly(D d, V x) {
779  const V k0 = Set(d, 0.66666662693f);
780  const V k1 = Set(d, 0.40000972152f);
781  const V k2 = Set(d, 0.28498786688f);
782  const V k3 = Set(d, 0.24279078841f);
783 
784  const V x2 = Mul(x, x);
785  const V x4 = Mul(x2, x2);
786  return MulAdd(MulAdd(k2, x4, k0), x2, Mul(MulAdd(k3, x4, k1), x4));
787  }
788 };
789 
790 #if HWY_CAP_FLOAT64 && HWY_CAP_INTEGER64
791 template <>
792 struct ExpImpl<double> {
793  // Rounds double toward zero and returns as int32_t.
794  template <class D, class V>
795  HWY_INLINE Vec<Rebind<int32_t, D>> ToInt32(D /*unused*/, V x) {
796  return DemoteTo(Rebind<int32_t, D>(), x);
797  }
798 
799  template <class D, class V>
800  HWY_INLINE V ExpPoly(D d, V x) {
801  const auto k0 = Set(d, +0.5);
802  const auto k1 = Set(d, +0.166666666666666851703837);
803  const auto k2 = Set(d, +0.0416666666666665047591422);
804  const auto k3 = Set(d, +0.00833333333331652721664984);
805  const auto k4 = Set(d, +0.00138888888889774492207962);
806  const auto k5 = Set(d, +0.000198412698960509205564975);
807  const auto k6 = Set(d, +2.4801587159235472998791e-5);
808  const auto k7 = Set(d, +2.75572362911928827629423e-6);
809  const auto k8 = Set(d, +2.75573911234900471893338e-7);
810  const auto k9 = Set(d, +2.51112930892876518610661e-8);
811  const auto k10 = Set(d, +2.08860621107283687536341e-9);
812 
813  return MulAdd(Estrin(x, k0, k1, k2, k3, k4, k5, k6, k7, k8, k9, k10),
814  Mul(x, x), x);
815  }
816 
817  // Computes 2^x, where x is an integer.
818  template <class D, class VI32>
819  HWY_INLINE Vec<D> Pow2I(D d, VI32 x) {
820  const Rebind<int32_t, D> di32;
821  const Rebind<int64_t, D> di64;
822  const VI32 kOffset = Set(di32, 0x3FF);
823  return BitCast(d, ShiftLeft<52>(PromoteTo(di64, Add(x, kOffset))));
824  }
825 
826  // Sets the exponent of 'x' to 2^e.
827  template <class D, class V, class VI32>
828  HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e) {
829  const VI32 y = ShiftRight<1>(e);
830  return Mul(Mul(x, Pow2I(d, y)), Pow2I(d, Sub(e, y)));
831  }
832 
833  template <class D, class V, class VI32>
834  HWY_INLINE V ExpReduce(D d, V x, VI32 q) {
835  // kLn2Part0d + kLn2Part1d ~= -ln(2)
836  const V kLn2Part0d = Set(d, -0.6931471805596629565116018);
837  const V kLn2Part1d = Set(d, -0.28235290563031577122588448175e-12);
838 
839  // Extended precision modular arithmetic.
840  const V qf = PromoteTo(d, q);
841  x = MulAdd(qf, kLn2Part0d, x);
842  x = MulAdd(qf, kLn2Part1d, x);
843  return x;
844  }
845 };
846 
847 template <>
848 struct LogImpl<double> {
849  template <class D, class V>
850  HWY_INLINE Vec<Rebind<int64_t, D>> Log2p1NoSubnormal(D /*d*/, V x) {
851  const Rebind<int64_t, D> di64;
852  const Rebind<uint64_t, D> du64;
853  return Sub(BitCast(di64, ShiftRight<52>(BitCast(du64, x))),
854  Set(di64, 0x3FF));
855  }
856 
857  // Approximates Log(x) over the range [sqrt(2) / 2, sqrt(2)].
858  template <class D, class V>
859  HWY_INLINE V LogPoly(D d, V x) {
860  const V k0 = Set(d, 0.6666666666666735130);
861  const V k1 = Set(d, 0.3999999999940941908);
862  const V k2 = Set(d, 0.2857142874366239149);
863  const V k3 = Set(d, 0.2222219843214978396);
864  const V k4 = Set(d, 0.1818357216161805012);
865  const V k5 = Set(d, 0.1531383769920937332);
866  const V k6 = Set(d, 0.1479819860511658591);
867 
868  const V x2 = Mul(x, x);
869  const V x4 = Mul(x2, x2);
870  return MulAdd(MulAdd(MulAdd(MulAdd(k6, x4, k4), x4, k2), x4, k0), x2,
871  (Mul(MulAdd(MulAdd(k5, x4, k3), x4, k1), x4)));
872  }
873 };
874 
875 #endif
876 
877 template <class D, class V, bool kAllowSubnormals = true>
878 HWY_INLINE V Log(const D d, V x) {
879  // http://git.musl-libc.org/cgit/musl/tree/src/math/log.c for more info.
880  using LaneType = LaneType<V>;
882 
883  // clang-format off
884  constexpr bool kIsF32 = (sizeof(LaneType) == 4);
885 
886  // Float Constants
887  const V kLn2Hi = Set(d, (kIsF32 ? 0.69313812256f :
888  0.693147180369123816490 ));
889  const V kLn2Lo = Set(d, (kIsF32 ? 9.0580006145e-6f :
890  1.90821492927058770002e-10));
891  const V kOne = Set(d, +1.0);
892  const V kMinNormal = Set(d, (kIsF32 ? 1.175494351e-38f :
893  2.2250738585072014e-308 ));
894  const V kScale = Set(d, (kIsF32 ? 3.355443200e+7f :
895  1.8014398509481984e+16 ));
896 
897  // Integer Constants
898  const Rebind<MakeSigned<LaneType>, D> di;
899  using VI = decltype(Zero(di));
900  const VI kLowerBits = Set(di, (kIsF32 ? 0x00000000L : 0xFFFFFFFFLL));
901  const VI kMagic = Set(di, (kIsF32 ? 0x3F3504F3L : 0x3FE6A09E00000000LL));
902  const VI kExpMask = Set(di, (kIsF32 ? 0x3F800000L : 0x3FF0000000000000LL));
903  const VI kExpScale = Set(di, (kIsF32 ? -25 : -54));
904  const VI kManMask = Set(di, (kIsF32 ? 0x7FFFFFL : 0xFFFFF00000000LL));
905  // clang-format on
906 
907  // Scale up 'x' so that it is no longer denormalized.
908  VI exp_bits;
909  V exp;
910  if (kAllowSubnormals == true) {
911  const auto is_denormal = Lt(x, kMinNormal);
912  x = IfThenElse(is_denormal, Mul(x, kScale), x);
913 
914  // Compute the new exponent.
915  exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
916  const VI exp_scale =
917  BitCast(di, IfThenElseZero(is_denormal, BitCast(d, kExpScale)));
918  exp = ConvertTo(
919  d, Add(exp_scale, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits))));
920  } else {
921  // Compute the new exponent.
922  exp_bits = Add(BitCast(di, x), Sub(kExpMask, kMagic));
923  exp = ConvertTo(d, impl.Log2p1NoSubnormal(d, BitCast(d, exp_bits)));
924  }
925 
926  // Renormalize.
927  const V y = Or(And(x, BitCast(d, kLowerBits)),
928  BitCast(d, Add(And(exp_bits, kManMask), kMagic)));
929 
930  // Approximate and reconstruct.
931  const V ym1 = Sub(y, kOne);
932  const V z = Div(ym1, Add(y, kOne));
933 
934  return MulSub(
935  exp, kLn2Hi,
936  Sub(MulSub(z, Sub(ym1, impl.LogPoly(d, z)), Mul(exp, kLn2Lo)), ym1));
937 }
938 
939 } // namespace impl
940 
941 template <class D, class V>
942 HWY_INLINE V Acos(const D d, V x) {
943  using LaneType = LaneType<V>;
944 
945  const V kZero = Zero(d);
946  const V kHalf = Set(d, +0.5);
947  const V kPi = Set(d, +3.14159265358979323846264);
948  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
949 
950  const V sign_x = And(SignBit(d), x);
951  const V abs_x = Xor(x, sign_x);
952  const auto mask = Lt(abs_x, kHalf);
953  const V yy =
954  IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
955  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
956 
958  const V t = Mul(impl.AsinPoly(d, yy, y), Mul(y, yy));
959 
960  const V t_plus_y = Add(t, y);
961  const V z =
962  IfThenElse(mask, Sub(kPiOverTwo, Add(Xor(y, sign_x), Xor(t, sign_x))),
963  Add(t_plus_y, t_plus_y));
964  return IfThenElse(Or(mask, Ge(x, kZero)), z, Sub(kPi, z));
965 }
966 
967 template <class D, class V>
968 HWY_INLINE V Acosh(const D d, V x) {
969  const V kLarge = Set(d, 268435456.0);
970  const V kLog2 = Set(d, 0.693147180559945286227);
971  const V kOne = Set(d, +1.0);
972  const V kTwo = Set(d, +2.0);
973 
974  const auto is_x_large = Gt(x, kLarge);
975  const auto is_x_gt_2 = Gt(x, kTwo);
976 
977  const V x_minus_1 = Sub(x, kOne);
978  const V y0 = MulSub(kTwo, x, Div(kOne, Add(Sqrt(MulSub(x, x, kOne)), x)));
979  const V y1 =
980  Add(Sqrt(MulAdd(x_minus_1, kTwo, Mul(x_minus_1, x_minus_1))), x_minus_1);
981  const V y2 =
982  IfThenElse(is_x_gt_2, IfThenElse(is_x_large, x, y0), Add(y1, kOne));
983  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
984 
985  const auto is_pole = Eq(y2, kOne);
986  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
987  return Add(IfThenElse(is_x_gt_2, z,
988  IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor))),
989  IfThenElseZero(is_x_large, kLog2));
990 }
991 
992 template <class D, class V>
993 HWY_INLINE V Asin(const D d, V x) {
994  using LaneType = LaneType<V>;
995 
996  const V kHalf = Set(d, +0.5);
997  const V kTwo = Set(d, +2.0);
998  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
999 
1000  const V sign_x = And(SignBit(d), x);
1001  const V abs_x = Xor(x, sign_x);
1002  const auto mask = Lt(abs_x, kHalf);
1003  const V yy =
1004  IfThenElse(mask, Mul(abs_x, abs_x), NegMulAdd(abs_x, kHalf, kHalf));
1005  const V y = IfThenElse(mask, abs_x, Sqrt(yy));
1006 
1008  const V z0 = MulAdd(impl.AsinPoly(d, yy, y), Mul(yy, y), y);
1009  const V z1 = NegMulAdd(z0, kTwo, kPiOverTwo);
1010  return Or(IfThenElse(mask, z0, z1), sign_x);
1011 }
1012 
1013 template <class D, class V>
1014 HWY_INLINE V Asinh(const D d, V x) {
1015  const V kSmall = Set(d, 1.0 / 268435456.0);
1016  const V kLarge = Set(d, 268435456.0);
1017  const V kLog2 = Set(d, 0.693147180559945286227);
1018  const V kOne = Set(d, +1.0);
1019  const V kTwo = Set(d, +2.0);
1020 
1021  const V sign_x = And(SignBit(d), x); // Extract the sign bit
1022  const V abs_x = Xor(x, sign_x);
1023 
1024  const auto is_x_large = Gt(abs_x, kLarge);
1025  const auto is_x_lt_2 = Lt(abs_x, kTwo);
1026 
1027  const V x2 = Mul(x, x);
1028  const V sqrt_x2_plus_1 = Sqrt(Add(x2, kOne));
1029 
1030  const V y0 = MulAdd(abs_x, kTwo, Div(kOne, Add(sqrt_x2_plus_1, abs_x)));
1031  const V y1 = Add(Div(x2, Add(sqrt_x2_plus_1, kOne)), abs_x);
1032  const V y2 =
1033  IfThenElse(is_x_lt_2, Add(y1, kOne), IfThenElse(is_x_large, abs_x, y0));
1034  const V z = impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y2);
1035 
1036  const auto is_pole = Eq(y2, kOne);
1037  const auto divisor = Sub(IfThenZeroElse(is_pole, y2), kOne);
1038  const auto large = IfThenElse(is_pole, y1, Div(Mul(z, y1), divisor));
1039  const V y = IfThenElse(Lt(abs_x, kSmall), x, large);
1040  return Or(Add(IfThenElse(is_x_lt_2, y, z), IfThenElseZero(is_x_large, kLog2)),
1041  sign_x);
1042 }
1043 
1044 template <class D, class V>
1045 HWY_INLINE V Atan(const D d, V x) {
1046  using LaneType = LaneType<V>;
1047 
1048  const V kOne = Set(d, +1.0);
1049  const V kPiOverTwo = Set(d, +1.57079632679489661923132169);
1050 
1051  const V sign = And(SignBit(d), x);
1052  const V abs_x = Xor(x, sign);
1053  const auto mask = Gt(abs_x, kOne);
1054 
1056  const auto divisor = IfThenElse(mask, abs_x, kOne);
1057  const V y = impl.AtanPoly(d, IfThenElse(mask, Div(kOne, divisor), abs_x));
1058  return Or(IfThenElse(mask, Sub(kPiOverTwo, y), y), sign);
1059 }
1060 
1061 template <class D, class V>
1062 HWY_INLINE V Atanh(const D d, V x) {
1063  const V kHalf = Set(d, +0.5);
1064  const V kOne = Set(d, +1.0);
1065 
1066  const V sign = And(SignBit(d), x); // Extract the sign bit
1067  const V abs_x = Xor(x, sign);
1068  return Mul(Log1p(d, Div(Add(abs_x, abs_x), Sub(kOne, abs_x))),
1069  Xor(kHalf, sign));
1070 }
1071 
1072 template <class D, class V>
1073 HWY_INLINE V Cos(const D d, V x) {
1074  using LaneType = LaneType<V>;
1076 
1077  // Float Constants
1078  const V kOneOverPi = Set(d, 0.31830988618379067153);
1079 
1080  // Integer Constants
1081  const Rebind<int32_t, D> di32;
1082  using VI32 = decltype(Zero(di32));
1083  const VI32 kOne = Set(di32, 1);
1084 
1085  const V y = Abs(x); // cos(x) == cos(|x|)
1086 
1087  // Compute the quadrant, q = int(|x| / pi) * 2 + 1
1088  const VI32 q = Add(ShiftLeft<1>(impl.ToInt32(d, Mul(y, kOneOverPi))), kOne);
1089 
1090  // Reduce range, apply sign, and approximate.
1091  return impl.Poly(
1092  d, Xor(impl.CosReduce(d, y, q), impl.CosSignFromQuadrant(d, q)));
1093 }
1094 
1095 template <class D, class V>
1096 HWY_INLINE V Exp(const D d, V x) {
1097  using LaneType = LaneType<V>;
1098 
1099  // clang-format off
1100  const V kHalf = Set(d, +0.5);
1101  const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
1102  const V kNegZero = Set(d, -0.0);
1103  const V kOne = Set(d, +1.0);
1104  const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
1105  // clang-format on
1106 
1108 
1109  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1110  const auto q =
1111  impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1112 
1113  // Reduce, approximate, and then reconstruct.
1114  const V y = impl.LoadExpShortRange(
1115  d, Add(impl.ExpPoly(d, impl.ExpReduce(d, x, q)), kOne), q);
1116  return IfThenElseZero(Ge(x, kLowerBound), y);
1117 }
1118 
1119 template <class D, class V>
1120 HWY_INLINE V Expm1(const D d, V x) {
1121  using LaneType = LaneType<V>;
1122 
1123  // clang-format off
1124  const V kHalf = Set(d, +0.5);
1125  const V kLowerBound = Set(d, (sizeof(LaneType) == 4 ? -104.0 : -1000.0));
1126  const V kLn2Over2 = Set(d, +0.346573590279972654708616);
1127  const V kNegOne = Set(d, -1.0);
1128  const V kNegZero = Set(d, -0.0);
1129  const V kOne = Set(d, +1.0);
1130  const V kOneOverLog2 = Set(d, +1.442695040888963407359924681);
1131  // clang-format on
1132 
1134 
1135  // q = static_cast<int32>((x / log(2)) + ((x < 0) ? -0.5 : +0.5))
1136  const auto q =
1137  impl.ToInt32(d, MulAdd(x, kOneOverLog2, Or(kHalf, And(x, kNegZero))));
1138 
1139  // Reduce, approximate, and then reconstruct.
1140  const V y = impl.ExpPoly(d, impl.ExpReduce(d, x, q));
1141  const V z = IfThenElse(Lt(Abs(x), kLn2Over2), y,
1142  Sub(impl.LoadExpShortRange(d, Add(y, kOne), q), kOne));
1143  return IfThenElse(Lt(x, kLowerBound), kNegOne, z);
1144 }
1145 
1146 template <class D, class V>
1147 HWY_INLINE V Log(const D d, V x) {
1148  return impl::Log<D, V, /*kAllowSubnormals=*/true>(d, x);
1149 }
1150 
1151 template <class D, class V>
1152 HWY_INLINE V Log10(const D d, V x) {
1153  return Mul(Log(d, x), Set(d, 0.4342944819032518276511));
1154 }
1155 
1156 template <class D, class V>
1157 HWY_INLINE V Log1p(const D d, V x) {
1158  const V kOne = Set(d, +1.0);
1159 
1160  const V y = Add(x, kOne);
1161  const auto is_pole = Eq(y, kOne);
1162  const auto divisor = Sub(IfThenZeroElse(is_pole, y), kOne);
1163  const auto non_pole =
1164  Mul(impl::Log<D, V, /*kAllowSubnormals=*/false>(d, y), Div(x, divisor));
1165  return IfThenElse(is_pole, x, non_pole);
1166 }
1167 
1168 template <class D, class V>
1169 HWY_INLINE V Log2(const D d, V x) {
1170  return Mul(Log(d, x), Set(d, 1.44269504088896340735992));
1171 }
1172 
1173 template <class D, class V>
1174 HWY_INLINE V Sin(const D d, V x) {
1175  using LaneType = LaneType<V>;
1177 
1178  // Float Constants
1179  const V kOneOverPi = Set(d, 0.31830988618379067153);
1180  const V kHalf = Set(d, 0.5);
1181 
1182  // Integer Constants
1183  const Rebind<int32_t, D> di32;
1184  using VI32 = decltype(Zero(di32));
1185 
1186  const V abs_x = Abs(x);
1187  const V sign_x = Xor(abs_x, x);
1188 
1189  // Compute the quadrant, q = int((|x| / pi) + 0.5)
1190  const VI32 q = impl.ToInt32(d, MulAdd(abs_x, kOneOverPi, kHalf));
1191 
1192  // Reduce range, apply sign, and approximate.
1193  return impl.Poly(d, Xor(impl.SinReduce(d, abs_x, q),
1194  Xor(impl.SinSignFromQuadrant(d, q), sign_x)));
1195 }
1196 
1197 template <class D, class V>
1198 HWY_INLINE V Sinh(const D d, V x) {
1199  const V kHalf = Set(d, +0.5);
1200  const V kOne = Set(d, +1.0);
1201  const V kTwo = Set(d, +2.0);
1202 
1203  const V sign = And(SignBit(d), x); // Extract the sign bit
1204  const V abs_x = Xor(x, sign);
1205  const V y = Expm1(d, abs_x);
1206  const V z = Mul(Div(Add(y, kTwo), Add(y, kOne)), Mul(y, kHalf));
1207  return Xor(z, sign); // Reapply the sign bit
1208 }
1209 
1210 template <class D, class V>
1211 HWY_INLINE V Tanh(const D d, V x) {
1212  const V kLimit = Set(d, 18.714973875);
1213  const V kOne = Set(d, +1.0);
1214  const V kTwo = Set(d, +2.0);
1215 
1216  const V sign = And(SignBit(d), x); // Extract the sign bit
1217  const V abs_x = Xor(x, sign);
1218  const V y = Expm1(d, Mul(abs_x, kTwo));
1219  const V z = IfThenElse(Gt(abs_x, kLimit), kOne, Div(y, Add(y, kTwo)));
1220  return Xor(z, sign); // Reapply the sign bit
1221 }
1222 
1223 // NOLINTNEXTLINE(google-readability-namespace-comments)
1224 } // namespace HWY_NAMESPACE
1225 } // namespace hwy
1227 
1228 #endif // HIGHWAY_HWY_CONTRIB_MATH_MATH_INL_H_
#define HWY_NOINLINE
Definition: base.h:60
#define HWY_INLINE
Definition: base.h:59
#define HWY_MAYBE_UNUSED
Definition: base.h:70
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
HWY_INLINE HWY_MAYBE_UNUSED T Estrin(T x, T c0, T c1)
Definition: math-inl.h:280
HWY_INLINE V Log(const D d, V x)
Definition: math-inl.h:878
HWY_API Vec< D > SignBit(D d)
Definition: generic_ops-inl.h:66
HWY_NOINLINE V CallSin(const D d, VecArg< V > x)
Definition: math-inl.h:236
V VecArg
Definition: shared-inl.h:226
svuint16_t Set(Simd< bfloat16_t, N > d, bfloat16_t arg)
Definition: arm_sve-inl.h:299
HWY_NOINLINE V CallAsin(const D d, VecArg< V > x)
Definition: math-inl.h:71
HWY_INLINE V Atan(const D d, V x)
Highway SIMD version of std::atan(x).
Definition: math-inl.h:1045
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5035
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5027
HWY_INLINE V Cos(const D d, V x)
Highway SIMD version of std::cos(x).
Definition: math-inl.h:1073
HWY_NOINLINE V CallAcos(const D d, VecArg< V > x)
Definition: math-inl.h:41
HWY_API auto Gt(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5040
HWY_API Vec128< float, N > MulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1232
HWY_INLINE V Sin(const D d, V x)
Highway SIMD version of std::sin(x).
Definition: math-inl.h:1174
HWY_API Vec128< int8_t > Abs(const Vec128< int8_t > v)
Definition: arm_neon-inl.h:1529
HWY_INLINE V Exp(const D d, V x)
Highway SIMD version of std::exp(x).
Definition: math-inl.h:1096
HWY_API auto Ge(V a, V b) -> decltype(a==b)
Definition: arm_neon-inl.h:5044
HWY_INLINE V Log10(const D d, V x)
Highway SIMD version of std::log10(x).
Definition: math-inl.h:1152
HWY_INLINE V Log1p(const D d, V x)
Highway SIMD version of std::log1p(x).
Definition: math-inl.h:1157
HWY_NOINLINE V CallExpm1(const D d, VecArg< V > x)
Definition: math-inl.h:161
HWY_API Vec128< uint16_t, 4 > DemoteTo(Simd< uint16_t, 4 >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2546
HWY_NOINLINE V CallLog1p(const D d, VecArg< V > x)
Definition: math-inl.h:206
HWY_INLINE V Atanh(const D d, V x)
Highway SIMD version of std::atanh(x).
Definition: math-inl.h:1062
HWY_API Vec128< float > ConvertTo(Full128< float >, const Vec128< int32_t > v)
Definition: arm_neon-inl.h:2739
HWY_NOINLINE V CallLog10(const D d, VecArg< V > x)
Definition: math-inl.h:191
HWY_API Vec128< T, N > IfThenElseZero(const Mask128< T, N > mask, const Vec128< T, N > yes)
Definition: arm_neon-inl.h:1642
HWY_API V Add(V a, V b)
Definition: arm_neon-inl.h:5000
HWY_NOINLINE V CallLog2(const D d, VecArg< V > x)
Definition: math-inl.h:221
HWY_NOINLINE V CallExp(const D d, VecArg< V > x)
Definition: math-inl.h:146
HWY_NOINLINE V CallAtanh(const D d, VecArg< V > x)
Definition: math-inl.h:116
HWY_API Vec128< float, N > MulSub(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > sub)
Definition: arm_neon-inl.h:1288
HWY_INLINE V Log2(const D d, V x)
Highway SIMD version of std::log2(x).
Definition: math-inl.h:1169
HWY_INLINE V Acos(const D d, V x)
Highway SIMD version of std::acos(x).
Definition: math-inl.h:942
HWY_NOINLINE V CallAtan(const D d, VecArg< V > x)
Definition: math-inl.h:101
HWY_INLINE V Acosh(const D d, V x)
Highway SIMD version of std::acosh(x).
Definition: math-inl.h:968
HWY_NOINLINE V CallLog(const D d, VecArg< V > x)
Definition: math-inl.h:176
HWY_INLINE V Tanh(const D d, V x)
Highway SIMD version of std::tanh(x).
Definition: math-inl.h:1211
decltype(GetLane(V())) LaneType
Definition: generic_ops-inl.h:24
HWY_INLINE V Log(const D d, V x)
Highway SIMD version of std::log(x).
Definition: math-inl.h:1147
HWY_API Vec128< uint16_t > PromoteTo(Full128< uint16_t >, const Vec128< uint8_t, 8 > v)
Definition: arm_neon-inl.h:2362
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1384
HWY_API Vec128< T, N > BitCast(Simd< T, N > d, Vec128< FromT, N *sizeof(T)/sizeof(FromT)> v)
Definition: arm_neon-inl.h:687
HWY_INLINE V Asin(const D d, V x)
Highway SIMD version of std::asin(x).
Definition: math-inl.h:993
HWY_INLINE V Asinh(const D d, V x)
Highway SIMD version of std::asinh(x).
Definition: math-inl.h:1014
HWY_NOINLINE V CallAsinh(const D d, VecArg< V > x)
Definition: math-inl.h:86
HWY_API V Sub(V a, V b)
Definition: arm_neon-inl.h:5004
typename D::template Rebind< T > Rebind
Definition: shared-inl.h:144
HWY_INLINE V Expm1(const D d, V x)
Highway SIMD version of std::expm1(x).
Definition: math-inl.h:1120
HWY_API Vec128< T, N > IfThenZeroElse(const Mask128< T, N > mask, const Vec128< T, N > no)
Definition: arm_neon-inl.h:1649
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1430
HWY_API Vec128< float, N > NegMulAdd(const Vec128< float, N > mul, const Vec128< float, N > x, const Vec128< float, N > add)
Definition: arm_neon-inl.h:1266
HWY_NOINLINE V CallCos(const D d, VecArg< V > x)
Definition: math-inl.h:131
HWY_API Vec128< float, N > Sqrt(const Vec128< float, N > v)
Definition: arm_neon-inl.h:1348
HWY_NOINLINE V CallSinh(const D d, VecArg< V > x)
Definition: math-inl.h:251
HWY_INLINE V Sinh(const D d, V x)
Highway SIMD version of std::sinh(x).
Definition: math-inl.h:1198
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition: arm_neon-inl.h:1398
HWY_API V Div(V a, V b)
Definition: arm_neon-inl.h:5013
HWY_API V Mul(V a, V b)
Definition: arm_neon-inl.h:5009
HWY_NOINLINE V CallTanh(const D d, VecArg< V > x)
Definition: math-inl.h:266
HWY_API Vec128< T, N > Zero(Simd< T, N > d)
Definition: arm_neon-inl.h:710
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition: arm_neon-inl.h:1419
decltype(Zero(D())) Vec
Definition: generic_ops-inl.h:31
HWY_API Vec1< T > IfThenElse(const Mask1< T > mask, const Vec1< T > yes, const Vec1< T > no)
Definition: scalar-inl.h:263
HWY_NOINLINE V CallAcosh(const D d, VecArg< V > x)
Definition: math-inl.h:56
Definition: aligned_allocator.h:23
#define HWY_NAMESPACE
Definition: set_macros-inl.h:77
HWY_INLINE V AsinPoly(D d, V x2, V)
Definition: math-inl.h:478
Definition: math-inl.h:464
HWY_INLINE V AtanPoly(D d, V x)
Definition: math-inl.h:519
Definition: math-inl.h:466
HWY_INLINE Vec< Rebind< float, D > > CosSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:625
HWY_INLINE V SinReduce(D d, V x, VI32 q)
Definition: math-inl.h:607
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:574
HWY_INLINE V Poly(D d, V x)
Definition: math-inl.h:579
HWY_INLINE Vec< Rebind< float, D > > SinSignFromQuadrant(D d, VI32 q)
Definition: math-inl.h:632
HWY_INLINE V CosReduce(D d, V x, VI32 q)
Definition: math-inl.h:590
Definition: math-inl.h:468
HWY_INLINE V ExpReduce(D d, V x, VI32 q)
Definition: math-inl.h:753
HWY_INLINE Vec< Rebind< int32_t, D > > ToInt32(D, V x)
Definition: math-inl.h:721
HWY_INLINE V ExpPoly(D d, V x)
Definition: math-inl.h:726
HWY_INLINE V LoadExpShortRange(D d, V x, VI32 e)
Definition: math-inl.h:747
HWY_INLINE Vec< D > Pow2I(D d, VI32 x)
Definition: math-inl.h:739
Definition: math-inl.h:470
HWY_INLINE V LogPoly(D d, V x)
Definition: math-inl.h:778
HWY_INLINE Vec< Rebind< int32_t, D > > Log2p1NoSubnormal(D, V x)
Definition: math-inl.h:769
Definition: math-inl.h:472