OgreSIMDHelper.h
/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2013 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#ifndef __SIMDHelper_H__
#define __SIMDHelper_H__

#include "OgrePrerequisites.h"

// Stack-alignment hackery.
//
// If the macro __OGRE_SIMD_ALIGN_STACK is defined, special code is needed
// to ensure the stack is aligned to a 16-byte boundary.
//
// Note:
// This macro can only guarantee that the callee's stack pointer (esp) is
// aligned to a 16-byte boundary, not the frame pointer (ebp). Because most
// compilers access stack variables through the frame pointer, functions
// that require alignment must be reached through an extra function call
// (see the illustrative sketch after these macro definitions).
//
#if defined(__INTEL_COMPILER)
// For Intel's compiler, simply calling alloca seems to do the right
// thing. The size of the allocated block seems to be irrelevant.
#define __OGRE_SIMD_ALIGN_STACK()   _alloca(16)
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64)
// Mark functions with a GCC attribute that forces the stack to be aligned
// to 16 bytes on entry.
#define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer))

#elif defined(_MSC_VER)
// Fortunately, MSVC will align the stack automatically
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#else
#define __OGRE_SIMD_ALIGN_ATTRIBUTE

#endif
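// Illustrative sketch (not part of the original header; all names below are
// hypothetical): one plausible way to combine the two macros, following the
// note above. The worker that actually needs 16-byte aligned SSE locals is
// marked with __OGRE_SIMD_ALIGN_ATTRIBUTE, and callers reach it through an
// extra wrapper call that re-aligns the stack pointer on compilers that
// define __OGRE_SIMD_ALIGN_STACK.
//
//     static void _blendVerticesImpl(float* dst, const float* src, size_t count)
//         __OGRE_SIMD_ALIGN_ATTRIBUTE;
//
//     static void _blendVertices(float* dst, const float* src, size_t count)
//     {
//  #if defined(__OGRE_SIMD_ALIGN_STACK)
//         __OGRE_SIMD_ALIGN_STACK();
//  #endif
//         _blendVerticesImpl(dst, src, count);
//     }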
// Additional platform-dependent header files and declarations.
//
// NOTE: Should be kept in sync with the __OGRE_HAVE_SSE macro.
//

#if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86

// GCC 4.0 and later provide reliable official SSE support, so we no longer
// need to define the SSE macros ourselves. GCC 3.x is not supported anymore
// anyway; its SSE support was somewhat flaky.
#include <xmmintrin.h>


#endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86
//---------------------------------------------------------------------
// SIMD macros and helpers
//---------------------------------------------------------------------


namespace Ogre {

#if __OGRE_HAVE_SSE

#if 1
#define __MM_RSQRT_PS(x)    _mm_rsqrt_ps(x)
#else
#define __MM_RSQRT_PS(x)    __mm_rsqrt_nr_ps(x)     // Implemented below
#endif

#define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3)                        \
    {                                                               \
        __m128 tmp3, tmp2, tmp1, tmp0;                              \
                                                                    \
                                        /* r00 r01 r02 r03 */       \
                                        /* r10 r11 r12 r13 */       \
                                        /* r20 r21 r22 r23 */       \
                                        /* r30 r31 r32 r33 */       \
                                                                    \
        tmp0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */       \
        tmp2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */       \
        tmp1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */       \
        tmp3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */       \
                                                                    \
        r0 = _mm_movelh_ps(tmp0, tmp1); /* r00 r10 r20 r30 */       \
        r1 = _mm_movehl_ps(tmp1, tmp0); /* r01 r11 r21 r31 */       \
        r2 = _mm_movelh_ps(tmp2, tmp3); /* r02 r12 r22 r32 */       \
        r3 = _mm_movehl_ps(tmp3, tmp2); /* r03 r13 r23 r33 */       \
    }
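    // Illustrative usage sketch (not part of the original header; the function
    // name is hypothetical): transpose a row-major 4x4 float matrix in place.
    // Unaligned loads/stores are used so the pointer need not be 16-byte
    // aligned.
    static FORCEINLINE void _transpose4x4Example(float* m)
    {
        __m128 r0 = _mm_loadu_ps(m + 0);
        __m128 r1 = _mm_loadu_ps(m + 4);
        __m128 r2 = _mm_loadu_ps(m + 8);
        __m128 r3 = _mm_loadu_ps(m + 12);

        __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3);   // rows become columns

        _mm_storeu_ps(m + 0,  r0);
        _mm_storeu_ps(m + 4,  r1);
        _mm_storeu_ps(m + 8,  r2);
        _mm_storeu_ps(m + 12, r3);
    }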
#define __MM_TRANSPOSE4x3_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 tmp0, tmp1, tmp2;                                                    \
                                                                                    \
                                                        /* r00 r01 r02 r10 */       \
                                                        /* r11 r12 r20 r21 */       \
                                                        /* r22 r30 r31 r32 */       \
                                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0));   /* r00 r10 r22 r32 */ \
        tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1));   /* r01 r02 r11 r12 */ \
        tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2));   /* r20 r21 r30 r31 */ \
                                                                                    \
        v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); /* r00 r10 r20 r30 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r01 r11 r21 r31 */ \
        v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); /* r02 r12 r22 r32 */ \
    }

#define __MM_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    {                                                                               \
        __m128 tmp0, tmp1, tmp2;                                                    \
                                                                                    \
                                                        /* r00 r10 r20 r30 */       \
                                                        /* r01 r11 r21 r31 */       \
                                                        /* r02 r12 r22 r32 */       \
                                                                                    \
        tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1));   /* r10 r30 r02 r22 */ \
        tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1));   /* r11 r31 r12 r32 */ \
        tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0));   /* r00 r20 r01 r21 */ \
                                                                                    \
        v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); /* r00 r01 r02 r10 */ \
        v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r11 r12 r20 r21 */ \
        v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); /* r22 r30 r31 r32 */ \
    }
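    // Illustrative usage sketch (not part of the original header; the function
    // name is hypothetical): convert four packed (x,y,z) positions (12
    // consecutive floats) to structure-of-arrays form with __MM_TRANSPOSE4x3_PS,
    // then back to packed form with __MM_TRANSPOSE3x4_PS.
    static FORCEINLINE void _packedToSoAAndBackExample(float* xyz)
    {
        __m128 v0 = _mm_loadu_ps(xyz + 0);      // x0 y0 z0 x1
        __m128 v1 = _mm_loadu_ps(xyz + 4);      // y1 z1 x2 y2
        __m128 v2 = _mm_loadu_ps(xyz + 8);      // z2 x3 y3 z3

        __MM_TRANSPOSE4x3_PS(v0, v1, v2);       // v0 = x0..x3, v1 = y0..y3, v2 = z0..z3

        // ... operate on the x, y and z lanes here ...

        __MM_TRANSPOSE3x4_PS(v0, v1, v2);       // back to packed (x,y,z) triples

        _mm_storeu_ps(xyz + 0, v0);
        _mm_storeu_ps(xyz + 4, v1);
        _mm_storeu_ps(xyz + 8, v2);
    }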
#define __MM_SELECT(v, fp)                                          \
    _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp)))

#define __MM_ACCUM4_PS(a, b, c, d)                                  \
    _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d))

#define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3)              \
    __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3))

#define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                  \
    __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3)

#define __MM_ACCUM3_PS(a, b, c)                                     \
    _mm_add_ps(_mm_add_ps(a, b), c)

#define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                      \
    __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2))
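    // Illustrative usage sketch (not part of the original header; all names
    // are hypothetical): apply an affine 3x4 transform to four 3D points at
    // once. The points are held in structure-of-arrays form (px/py/pz carry
    // the four x/y/z components), and each matrix element is broadcast across
    // a register with __MM_SELECT before feeding __MM_DOT4x3_PS.
    static FORCEINLINE void _transformFourPointsExample(
        const __m128& px, const __m128& py, const __m128& pz,
        const __m128 mat[3],    // mat[i] holds row i: (m_i0, m_i1, m_i2, m_i3)
        __m128& outX, __m128& outY, __m128& outZ)
    {
        outX = __MM_DOT4x3_PS(
            __MM_SELECT(mat[0], 0), __MM_SELECT(mat[0], 1),
            __MM_SELECT(mat[0], 2), __MM_SELECT(mat[0], 3),
            px, py, pz);
        outY = __MM_DOT4x3_PS(
            __MM_SELECT(mat[1], 0), __MM_SELECT(mat[1], 1),
            __MM_SELECT(mat[1], 2), __MM_SELECT(mat[1], 3),
            px, py, pz);
        outZ = __MM_DOT4x3_PS(
            __MM_SELECT(mat[2], 0), __MM_SELECT(mat[2], 1),
            __MM_SELECT(mat[2], 2), __MM_SELECT(mat[2], 3),
            px, py, pz);
    }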
#define __MM_MADD_PS(a, b, c)                                       \
    _mm_add_ps(_mm_mul_ps(a, b), c)

#define __MM_LERP_PS(t, a, b)                                       \
    __MM_MADD_PS(_mm_sub_ps(b, a), t, a)

#define __MM_MADD_SS(a, b, c)                                       \
    _mm_add_ss(_mm_mul_ss(a, b), c)

#define __MM_LERP_SS(t, a, b)                                       \
    __MM_MADD_SS(_mm_sub_ss(b, a), t, a)
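    // Illustrative usage sketch (not part of the original header; all names
    // are hypothetical): linearly interpolate four packed floats between two
    // keyframes. The scalar weight is broadcast to all four lanes first.
    static FORCEINLINE __m128 _lerpFourFloatsExample(float weight, const __m128& key0, const __m128& key1)
    {
        __m128 t = _mm_set_ps1(weight);         // (w, w, w, w)
        return __MM_LERP_PS(t, key0, key1);     // key0 + (key1 - key0) * w
    }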
#define __MM_LOAD_PS(p)                                             \
    (*(const __m128*)(p))

#define __MM_STORE_PS(p, v)                                         \
    (*(__m128*)(p) = (v))


    template <bool aligned = false>
    struct SSEMemoryAccessor
    {
        static FORCEINLINE __m128 load(const float *p)
        {
            return _mm_loadu_ps(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            _mm_storeu_ps(p, v);
        }
    };
    // Special aligned accessor
    template <>
    struct SSEMemoryAccessor<true>
    {
        static FORCEINLINE const __m128& load(const float *p)
        {
            return __MM_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const __m128& v)
        {
            __MM_STORE_PS(p, v);
        }
    };
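    // Illustrative usage sketch (not part of the original header; all names
    // are hypothetical): a routine templated on source/destination alignment,
    // so callers can pick the aligned accessors at compile time (for example
    // after testing the pointers with _isAlignedForSSE below).
    template <bool srcAligned, bool destAligned>
    static FORCEINLINE void _scaleFourFloatsExample(float *dest, const float *src, float scale)
    {
        __m128 v = SSEMemoryAccessor<srcAligned>::load(src);
        v = _mm_mul_ps(v, _mm_set_ps1(scale));
        SSEMemoryAccessor<destAligned>::store(dest, v);
    }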
    static FORCEINLINE bool _isAlignedForSSE(const void *p)
    {
        return (((size_t)p) & 15) == 0;
    }

    // Refine the _mm_rsqrt_ps estimate with one Newton-Raphson step:
    //   y1 = 0.5 * y0 * (3 - x * y0 * y0)
    // which roughly doubles the precision of the hardware approximation.
    static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x)
    {
        static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f };
        static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f };
        __m128 t = _mm_rsqrt_ps(x);
        return _mm_mul_ps(_mm_mul_ps(v0pt5, t),
            _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t)));
    }
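    // Illustrative usage sketch (not part of the original header; the function
    // name is hypothetical): normalise four 3D vectors at once, held in
    // structure-of-arrays form (x, y and z lanes), using the approximate
    // reciprocal square root selected by __MM_RSQRT_PS above. Zero-length
    // vectors are not handled here.
    static FORCEINLINE void _normaliseFourVectorsExample(__m128& x, __m128& y, __m128& z)
    {
        __m128 sqLength = __MM_ACCUM3_PS(
            _mm_mul_ps(x, x), _mm_mul_ps(y, y), _mm_mul_ps(z, z));
        __m128 invLength = __MM_RSQRT_PS(sqLength);
        x = _mm_mul_ps(x, invLength);
        y = _mm_mul_ps(y, invLength);
        z = _mm_mul_ps(z, invLength);
    }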
// Macro to check that the stack is aligned for SSE
#if OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()        \
    {                                               \
        __m128 test;                                \
        assert(_isAlignedForSSE(&test));            \
    }

#else   // !OGRE_DEBUG_MODE
#define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE()

#endif  // OGRE_DEBUG_MODE


#endif  // __OGRE_HAVE_SSE

}

#endif  // __SIMDHelper_H__