00001 /* 00002 ----------------------------------------------------------------------------- 00003 This source file is part of OGRE 00004 (Object-oriented Graphics Rendering Engine) 00005 For the latest info, see http://www.ogre3d.org/ 00006 00007 Copyright (c) 2000-2012 Torus Knot Software Ltd 00008 00009 Permission is hereby granted, free of charge, to any person obtaining a copy 00010 of this software and associated documentation files (the "Software"), to deal 00011 in the Software without restriction, including without limitation the rights 00012 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 00013 copies of the Software, and to permit persons to whom the Software is 00014 furnished to do so, subject to the following conditions: 00015 00016 The above copyright notice and this permission notice shall be included in 00017 all copies or substantial portions of the Software. 00018 00019 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 00020 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 00021 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 00022 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 00023 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 00024 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 00025 THE SOFTWARE. 00026 ----------------------------------------------------------------------------- 00027 */ 00028 #ifndef __SIMDHelper_H__ 00029 #define __SIMDHelper_H__ 00030 00031 #include "OgrePrerequisites.h" 00032 #include "OgrePlatformInformation.h" 00033 00034 // Stack-alignment hackery. 00035 // 00036 // If macro __OGRE_SIMD_ALIGN_STACK defined, means there requests 00037 // special code to ensure stack align to a 16-bytes boundary. 00038 // 00039 // Note: 00040 // This macro can only guarantee callee stack pointer (esp) align 00041 // to a 16-bytes boundary, but not that for frame pointer (ebp). 00042 // Because most compiler might use frame pointer to access to stack 00043 // variables, so you need to wrap those alignment required functions 00044 // with extra function call. 00045 // 00046 #if defined(__INTEL_COMPILER) 00047 // For intel's compiler, simply calling alloca seems to do the right 00048 // thing. The size of the allocated block seems to be irrelevant. 00049 #define __OGRE_SIMD_ALIGN_STACK() _alloca(16) 00050 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 00051 00052 #elif OGRE_CPU == OGRE_CPU_X86 && (OGRE_COMPILER == OGRE_COMPILER_GNUC || OGRE_COMPILER == OGRE_COMPILER_CLANG) && (OGRE_ARCH_TYPE != OGRE_ARCHITECTURE_64) 00053 // mark functions with GCC attribute to force stack alignment to 16 bytes 00054 #define __OGRE_SIMD_ALIGN_ATTRIBUTE __attribute__((force_align_arg_pointer)) 00055 00056 #elif defined(_MSC_VER) 00057 // Fortunately, MSVC will align the stack automatically 00058 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 00059 00060 #else 00061 #define __OGRE_SIMD_ALIGN_ATTRIBUTE 00062 00063 #endif 00064 00065 00066 // Additional platform-dependent header files and declares. 00067 // 00068 // NOTE: Should be sync with __OGRE_HAVE_SSE macro. 00069 // 00070 00071 #if OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86 00072 00073 // GCC version 4.0 upwards should be reliable for official SSE now, 00074 // so no longer define SSE macros ourselves 00075 // We don't support gcc 3.x anymore anyway, although that had SSE it was a bit flaky? 00076 #include <xmmintrin.h> 00077 00078 00079 #endif // OGRE_DOUBLE_PRECISION == 0 && OGRE_CPU == OGRE_CPU_X86 00080 00081 00082 00083 //--------------------------------------------------------------------- 00084 // SIMD macros and helpers 00085 //--------------------------------------------------------------------- 00086 00087 00088 namespace Ogre { 00096 #if __OGRE_HAVE_SSE 00097 00108 #if 1 00109 #define __MM_RSQRT_PS(x) _mm_rsqrt_ps(x) 00110 #else 00111 #define __MM_RSQRT_PS(x) __mm_rsqrt_nr_ps(x) // Implemented below 00112 #endif 00113 00122 #define __MM_TRANSPOSE4x4_PS(r0, r1, r2, r3) \ 00123 { \ 00124 __m128 tmp3, tmp2, tmp1, tmp0; \ 00125 \ 00126 /* r00 r01 r02 r03 */ \ 00127 /* r10 r11 r12 r13 */ \ 00128 /* r20 r21 r22 r23 */ \ 00129 /* r30 r31 r32 r33 */ \ 00130 \ 00131 tmp0 = _mm_unpacklo_ps(r0, r1); /* r00 r10 r01 r11 */ \ 00132 tmp2 = _mm_unpackhi_ps(r0, r1); /* r02 r12 r03 r13 */ \ 00133 tmp1 = _mm_unpacklo_ps(r2, r3); /* r20 r30 r21 r31 */ \ 00134 tmp3 = _mm_unpackhi_ps(r2, r3); /* r22 r32 r23 r33 */ \ 00135 \ 00136 r0 = _mm_movelh_ps(tmp0, tmp1); /* r00 r10 r20 r30 */ \ 00137 r1 = _mm_movehl_ps(tmp1, tmp0); /* r01 r11 r21 r31 */ \ 00138 r2 = _mm_movelh_ps(tmp2, tmp3); /* r02 r12 r22 r32 */ \ 00139 r3 = _mm_movehl_ps(tmp3, tmp2); /* r03 r13 r23 r33 */ \ 00140 } 00141 00150 #define __MM_TRANSPOSE4x3_PS(v0, v1, v2) \ 00151 { \ 00152 __m128 tmp0, tmp1, tmp2; \ 00153 \ 00154 /* r00 r01 r02 r10 */ \ 00155 /* r11 r12 r20 r21 */ \ 00156 /* r22 r30 r31 r32 */ \ 00157 \ 00158 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(3,0,3,0)); /* r00 r10 r22 r32 */ \ 00159 tmp1 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(1,0,2,1)); /* r01 r02 r11 r12 */ \ 00160 tmp2 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(2,1,3,2)); /* r20 r21 r30 r31 */ \ 00161 \ 00162 v0 = _mm_shuffle_ps(tmp0, tmp2, _MM_SHUFFLE(2,0,1,0)); /* r00 r10 r20 r30 */ \ 00163 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r01 r11 r21 r31 */ \ 00164 v2 = _mm_shuffle_ps(tmp1, tmp0, _MM_SHUFFLE(3,2,3,1)); /* r02 r12 r22 r32 */ \ 00165 } 00166 00174 #define __MM_TRANSPOSE3x4_PS(v0, v1, v2) \ 00175 { \ 00176 __m128 tmp0, tmp1, tmp2; \ 00177 \ 00178 /* r00 r10 r20 r30 */ \ 00179 /* r01 r11 r21 r31 */ \ 00180 /* r02 r12 r22 r32 */ \ 00181 \ 00182 tmp0 = _mm_shuffle_ps(v0, v2, _MM_SHUFFLE(2,0,3,1)); /* r10 r30 r02 r22 */ \ 00183 tmp1 = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(3,1,3,1)); /* r11 r31 r12 r32 */ \ 00184 tmp2 = _mm_shuffle_ps(v0, v1, _MM_SHUFFLE(2,0,2,0)); /* r00 r20 r01 r21 */ \ 00185 \ 00186 v0 = _mm_shuffle_ps(tmp2, tmp0, _MM_SHUFFLE(0,2,2,0)); /* r00 r01 r02 r10 */ \ 00187 v1 = _mm_shuffle_ps(tmp1, tmp2, _MM_SHUFFLE(3,1,2,0)); /* r11 r12 r20 r21 */ \ 00188 v2 = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3,1,1,3)); /* r22 r30 r31 r32 */ \ 00189 } 00190 00194 #define __MM_SELECT(v, fp) \ 00195 _mm_shuffle_ps((v), (v), _MM_SHUFFLE((fp),(fp),(fp),(fp))) 00196 00198 #define __MM_ACCUM4_PS(a, b, c, d) \ 00199 _mm_add_ps(_mm_add_ps(a, b), _mm_add_ps(c, d)) 00200 00204 #define __MM_DOT4x4_PS(a0, a1, a2, a3, b0, b1, b2, b3) \ 00205 __MM_ACCUM4_PS(_mm_mul_ps(a0, b0), _mm_mul_ps(a1, b1), _mm_mul_ps(a2, b2), _mm_mul_ps(a3, b3)) 00206 00210 #define __MM_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \ 00211 __MM_ACCUM4_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2), r3) 00212 00214 #define __MM_ACCUM3_PS(a, b, c) \ 00215 _mm_add_ps(_mm_add_ps(a, b), c) 00216 00220 #define __MM_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \ 00221 __MM_ACCUM3_PS(_mm_mul_ps(r0, v0), _mm_mul_ps(r1, v1), _mm_mul_ps(r2, v2)) 00222 00224 #define __MM_MADD_PS(a, b, c) \ 00225 _mm_add_ps(_mm_mul_ps(a, b), c) 00226 00228 #define __MM_LERP_PS(t, a, b) \ 00229 __MM_MADD_PS(_mm_sub_ps(b, a), t, a) 00230 00232 #define __MM_MADD_SS(a, b, c) \ 00233 _mm_add_ss(_mm_mul_ss(a, b), c) 00234 00236 #define __MM_LERP_SS(t, a, b) \ 00237 __MM_MADD_SS(_mm_sub_ss(b, a), t, a) 00238 00240 #define __MM_LOAD_PS(p) \ 00241 (*(__m128*)(p)) 00242 00244 #define __MM_STORE_PS(p, v) \ 00245 (*(__m128*)(p) = (v)) 00246 00247 00250 template <bool aligned = false> 00251 struct SSEMemoryAccessor 00252 { 00253 static FORCEINLINE __m128 load(const float *p) 00254 { 00255 return _mm_loadu_ps(p); 00256 } 00257 static FORCEINLINE void store(float *p, const __m128& v) 00258 { 00259 _mm_storeu_ps(p, v); 00260 } 00261 }; 00262 // Special aligned accessor 00263 template <> 00264 struct SSEMemoryAccessor<true> 00265 { 00266 static FORCEINLINE const __m128& load(const float *p) 00267 { 00268 return __MM_LOAD_PS(p); 00269 } 00270 static FORCEINLINE void store(float *p, const __m128& v) 00271 { 00272 __MM_STORE_PS(p, v); 00273 } 00274 }; 00275 00278 static FORCEINLINE bool _isAlignedForSSE(const void *p) 00279 { 00280 return (((size_t)p) & 15) == 0; 00281 } 00282 00286 static FORCEINLINE __m128 __mm_rsqrt_nr_ps(const __m128& x) 00287 { 00288 static const __m128 v0pt5 = { 0.5f, 0.5f, 0.5f, 0.5f }; 00289 static const __m128 v3pt0 = { 3.0f, 3.0f, 3.0f, 3.0f }; 00290 __m128 t = _mm_rsqrt_ps(x); 00291 return _mm_mul_ps(_mm_mul_ps(v0pt5, t), 00292 _mm_sub_ps(v3pt0, _mm_mul_ps(_mm_mul_ps(x, t), t))); 00293 } 00294 00295 // Macro to check the stack aligned for SSE 00296 #if OGRE_DEBUG_MODE 00297 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() \ 00298 { \ 00299 __m128 test; \ 00300 assert(_isAlignedForSSE(&test)); \ 00301 } 00302 00303 #else // !OGRE_DEBUG_MODE 00304 #define __OGRE_CHECK_STACK_ALIGNED_FOR_SSE() 00305 00306 #endif // OGRE_DEBUG_MODE 00307 00308 00309 #endif // __OGRE_HAVE_SSE 00310 00313 } 00314 00315 #endif // __SIMDHelper_H__
Copyright © 2012 Torus Knot Software Ltd
This work is licensed under a Creative Commons Attribution-ShareAlike 3.0 Unported License.
Last modified Sun Sep 2 2012 07:27:24