From 0bababf1f5866dff5f73c4650a87648cb427b5a1 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 14:53:07 -0800 Subject: [PATCH 1/7] Safe replacement of glm_mat4_mul() for unaligned arguments instead of __m128 --- libraries/shared/src/GLMHelpers.h | 49 +++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/libraries/shared/src/GLMHelpers.h b/libraries/shared/src/GLMHelpers.h index 4aac913768..609c3ab08b 100644 --- a/libraries/shared/src/GLMHelpers.h +++ b/libraries/shared/src/GLMHelpers.h @@ -245,4 +245,53 @@ inline bool isNaN(const glm::quat& value) { return isNaN(value.w) || isNaN(value glm::mat4 orthoInverse(const glm::mat4& m);
+//
+// Replacement for glm_mat4_mul() that takes plain glm::mat4 arguments, which may be
+// unaligned, instead of __m128 data.
+//
+// Stores m1 * m2 in r. On the SSE2 path each result vector is computed as
+//     r[c] = m2[c][0]*m1[0] + m2[c][1]*m1[1] + m2[c][2]*m1[2] + m2[c][3]*m1[3]
+//
+//   m1, m2  operands; only read, through unaligned loads on the SSE2 path
+//   r       receives the product; on the SSE2 path it is written, with unaligned stores,
+//           only after every load has been issued
+//
+inline void glm_mat4u_mul(const glm::mat4& m1, const glm::mat4& m2, glm::mat4& r) {
+
+#if GLM_ARCH & GLM_ARCH_SSE2_BIT
+    // &m[i][0] already has type (const) float*, so the loads and stores need no cast:
+    // constness is kept, and a non-float element type would fail to compile instead of
+    // being silently reinterpreted.
+    const __m128 lhs0 = _mm_loadu_ps(&m1[0][0]);
+    const __m128 lhs1 = _mm_loadu_ps(&m1[1][0]);
+    const __m128 lhs2 = _mm_loadu_ps(&m1[2][0]);
+    const __m128 lhs3 = _mm_loadu_ps(&m1[3][0]);
+
+    // Keep all four results in locals so that no store happens before the last load.
+    __m128 product[4];
+    for (int c = 0; c < 4; ++c) {
+        const __m128 rhs = _mm_loadu_ps(&m2[c][0]);
+
+        // Broadcast element k of m2[c] to all four lanes and scale m1[k] by it.
+        const __m128 t0 = _mm_mul_ps(_mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(0, 0, 0, 0)), lhs0);
+        const __m128 t1 = _mm_mul_ps(_mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(1, 1, 1, 1)), lhs1);
+        const __m128 t2 = _mm_mul_ps(_mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(2, 2, 2, 2)), lhs2);
+        const __m128 t3 = _mm_mul_ps(_mm_shuffle_ps(rhs, rhs, _MM_SHUFFLE(3, 3, 3, 3)), lhs3);
+
+        // Sum pairwise as (t0 + t1) + (t2 + t3); the grouping fixes the rounding order.
+        product[c] = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
+    }
+
+    // Everything has been read by now; write the result out.
+    for (int c = 0; c < 4; ++c) {
+        _mm_storeu_ps(&r[c][0], product[c]);
+    }
+
+#else
+    // GLM did not report SSE2: use its operator* instead.
+    r = m1 * m2;
+
+#endif
+}
+
 #endif // hifi_GLMHelpers_h From 117bba8b6a79473190220e15b77617f32eb7171b Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 15:17:09 -0800 Subject: [PATCH 2/7] redo unsafe optimization --- interface/src/avatar/CauterizedModel.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/src/avatar/CauterizedModel.cpp b/interface/src/avatar/CauterizedModel.cpp index 0c3d863649..7faf89dec5 100644 --- a/interface/src/avatar/CauterizedModel.cpp +++ b/interface/src/avatar/CauterizedModel.cpp @@ -110,13 +110,7 @@ void CauterizedModel::updateClusterMatrices() { for (int j = 0; j < mesh.clusters.size(); j++) { const FBXCluster& cluster = mesh.clusters.at(j); auto jointMatrix = _rig->getJointTransform(cluster.jointIndex); -#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC) - glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix; - glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out); - state.clusterMatrices[j] = out; -#else - state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix; -#endif + glm_mat4u_mul(jointMatrix,
cluster.inverseBindMatrix, state.clusterMatrices[j]); } // Once computed the cluster matrices, update the buffer(s) From 46c5f961130a42e0434927f33d75470ce7b8323e Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 15:24:03 -0800 Subject: [PATCH 3/7] redo unsafe optimization --- interface/src/avatar/CauterizedModel.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/src/avatar/CauterizedModel.cpp b/interface/src/avatar/CauterizedModel.cpp index 7faf89dec5..1ca87a498a 100644 --- a/interface/src/avatar/CauterizedModel.cpp +++ b/interface/src/avatar/CauterizedModel.cpp @@ -143,13 +143,7 @@ void CauterizedModel::updateClusterMatrices() { if (_cauterizeBoneSet.find(cluster.jointIndex) != _cauterizeBoneSet.end()) { jointMatrix = cauterizeMatrix; } -#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC) - glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix; - glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out); - state.clusterMatrices[j] = out; -#else - state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix; -#endif + glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]); } if (!_cauterizeBoneSet.empty() && (state.clusterMatrices.size() > 1)) { From 50f92cb934034767416cc5946e886b1cdb4048e8 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 15:29:23 -0800 Subject: [PATCH 4/7] redo unsafe optimization --- interface/src/avatar/SoftAttachmentModel.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/interface/src/avatar/SoftAttachmentModel.cpp b/interface/src/avatar/SoftAttachmentModel.cpp index 6ed54afb27..0521f7a893 100644 --- a/interface/src/avatar/SoftAttachmentModel.cpp +++ b/interface/src/avatar/SoftAttachmentModel.cpp @@ -60,13 +60,7 @@ void SoftAttachmentModel::updateClusterMatrices() { } else { jointMatrix = _rig->getJointTransform(cluster.jointIndex); } -#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC) 
- glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix; - glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out); - state.clusterMatrices[j] = out; -#else - state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix; -#endif + glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]); } // Once computed the cluster matrices, update the buffer(s) From 44c1f8500dfafc8de6ee0338baaeee40790f1451 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 15:55:53 -0800 Subject: [PATCH 5/7] redo unsafe optimization --- libraries/animation/src/AnimPose.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libraries/animation/src/AnimPose.cpp b/libraries/animation/src/AnimPose.cpp index 5638cacabc..e1c8528e0b 100644 --- a/libraries/animation/src/AnimPose.cpp +++ b/libraries/animation/src/AnimPose.cpp @@ -50,15 +50,9 @@ glm::vec3 AnimPose::xformVector(const glm::vec3& rhs) const { } AnimPose AnimPose::operator*(const AnimPose& rhs) const { -#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC) glm::mat4 result; - glm::mat4 lhsMat = *this; - glm::mat4 rhsMat = rhs; - glm_mat4_mul((glm_vec4*)&lhsMat, (glm_vec4*)&rhsMat, (glm_vec4*)&result); + glm_mat4u_mul(*this, rhs, result); return AnimPose(result); -#else - return AnimPose(static_cast(*this) * static_cast(rhs)); -#endif } AnimPose AnimPose::inverse() const { From a5571bd49dbd4e71f236e95d8bf637aa9574f67c Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 16:02:39 -0800 Subject: [PATCH 6/7] redo unsafe optimization --- libraries/render-utils/src/Model.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp index adfffe2614..d4de05c84d 100644 --- a/libraries/render-utils/src/Model.cpp +++ b/libraries/render-utils/src/Model.cpp @@ -1178,13 +1178,7 @@ void Model::updateClusterMatrices() { for (int j = 0; j < 
mesh.clusters.size(); j++) { const FBXCluster& cluster = mesh.clusters.at(j); auto jointMatrix = _rig->getJointTransform(cluster.jointIndex); -#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC) - glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix; - glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out); - state.clusterMatrices[j] = out; -#else - state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix; -#endif + glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]); } // Once computed the cluster matrices, update the buffer(s) From 818425707b182343ecbd0739fcc2d9df5b5cd689 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sat, 4 Mar 2017 16:14:31 -0800 Subject: [PATCH 7/7] update unit tests --- tests/shared/src/GLMHelpersTests.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/shared/src/GLMHelpersTests.cpp b/tests/shared/src/GLMHelpersTests.cpp index 8d26d35c69..b4af4729a3 100644 --- a/tests/shared/src/GLMHelpersTests.cpp +++ b/tests/shared/src/GLMHelpersTests.cpp @@ -115,8 +115,8 @@ void GLMHelpersTests::testSimd() { a1 = a * b; b1 = b * a; - glm_mat4_mul((glm_vec4*)&a, (glm_vec4*)&b, (glm_vec4*)&a2); - glm_mat4_mul((glm_vec4*)&b, (glm_vec4*)&a, (glm_vec4*)&b2); + glm_mat4u_mul(a, b, a2); + glm_mat4u_mul(b, a, b2); { @@ -133,8 +133,8 @@ void GLMHelpersTests::testSimd() { QElapsedTimer timer; timer.start(); for (size_t i = 0; i < LOOPS; ++i) { - glm_mat4_mul((glm_vec4*)&a, (glm_vec4*)&b, (glm_vec4*)&a2); - glm_mat4_mul((glm_vec4*)&b, (glm_vec4*)&a, (glm_vec4*)&b2); + glm_mat4u_mul(a, b, a2); + glm_mat4u_mul(b, a, b2); } qDebug() << "SIMD " << timer.elapsed(); }