mirror of
https://github.com/JulianGro/overte.git
synced 2025-04-25 16:55:07 +02:00
Merge pull request #9810 from kencooke/glm-simd-bugfix
Fix unsafe SIMD optimizations
This commit is contained in:
commit
b730342283
6 changed files with 58 additions and 39 deletions
|
@ -110,13 +110,7 @@ void CauterizedModel::updateClusterMatrices() {
|
|||
for (int j = 0; j < mesh.clusters.size(); j++) {
|
||||
const FBXCluster& cluster = mesh.clusters.at(j);
|
||||
auto jointMatrix = _rig->getJointTransform(cluster.jointIndex);
|
||||
#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC)
|
||||
glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix;
|
||||
glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out);
|
||||
state.clusterMatrices[j] = out;
|
||||
#else
|
||||
state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix;
|
||||
#endif
|
||||
glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]);
|
||||
}
|
||||
|
||||
// Once computed the cluster matrices, update the buffer(s)
|
||||
|
@ -149,13 +143,7 @@ void CauterizedModel::updateClusterMatrices() {
|
|||
if (_cauterizeBoneSet.find(cluster.jointIndex) != _cauterizeBoneSet.end()) {
|
||||
jointMatrix = cauterizeMatrix;
|
||||
}
|
||||
#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC)
|
||||
glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix;
|
||||
glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out);
|
||||
state.clusterMatrices[j] = out;
|
||||
#else
|
||||
state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix;
|
||||
#endif
|
||||
glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]);
|
||||
}
|
||||
|
||||
if (!_cauterizeBoneSet.empty() && (state.clusterMatrices.size() > 1)) {
|
||||
|
|
|
@ -60,13 +60,7 @@ void SoftAttachmentModel::updateClusterMatrices() {
|
|||
} else {
|
||||
jointMatrix = _rig->getJointTransform(cluster.jointIndex);
|
||||
}
|
||||
#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC)
|
||||
glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix;
|
||||
glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out);
|
||||
state.clusterMatrices[j] = out;
|
||||
#else
|
||||
state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix;
|
||||
#endif
|
||||
glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]);
|
||||
}
|
||||
|
||||
// Once computed the cluster matrices, update the buffer(s)
|
||||
|
|
|
@ -50,15 +50,9 @@ glm::vec3 AnimPose::xformVector(const glm::vec3& rhs) const {
|
|||
}
|
||||
|
||||
AnimPose AnimPose::operator*(const AnimPose& rhs) const {
|
||||
#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC)
|
||||
glm::mat4 result;
|
||||
glm::mat4 lhsMat = *this;
|
||||
glm::mat4 rhsMat = rhs;
|
||||
glm_mat4_mul((glm_vec4*)&lhsMat, (glm_vec4*)&rhsMat, (glm_vec4*)&result);
|
||||
glm_mat4u_mul(*this, rhs, result);
|
||||
return AnimPose(result);
|
||||
#else
|
||||
return AnimPose(static_cast<glm::mat4>(*this) * static_cast<glm::mat4>(rhs));
|
||||
#endif
|
||||
}
|
||||
|
||||
AnimPose AnimPose::inverse() const {
|
||||
|
|
|
@ -1178,13 +1178,7 @@ void Model::updateClusterMatrices() {
|
|||
for (int j = 0; j < mesh.clusters.size(); j++) {
|
||||
const FBXCluster& cluster = mesh.clusters.at(j);
|
||||
auto jointMatrix = _rig->getJointTransform(cluster.jointIndex);
|
||||
#if (GLM_ARCH & GLM_ARCH_SSE2) && !(defined Q_OS_MAC)
|
||||
glm::mat4 out, inverseBindMatrix = cluster.inverseBindMatrix;
|
||||
glm_mat4_mul((glm_vec4*)&jointMatrix, (glm_vec4*)&inverseBindMatrix, (glm_vec4*)&out);
|
||||
state.clusterMatrices[j] = out;
|
||||
#else
|
||||
state.clusterMatrices[j] = jointMatrix * cluster.inverseBindMatrix;
|
||||
#endif
|
||||
glm_mat4u_mul(jointMatrix, cluster.inverseBindMatrix, state.clusterMatrices[j]);
|
||||
}
|
||||
|
||||
// Once computed the cluster matrices, update the buffer(s)
|
||||
|
|
|
@ -245,4 +245,53 @@ inline bool isNaN(const glm::quat& value) { return isNaN(value.w) || isNaN(value
|
|||
|
||||
glm::mat4 orthoInverse(const glm::mat4& m);
|
||||
|
||||
//
|
||||
// Safe replacement of glm_mat4_mul() for unaligned arguments instead of __m128
|
||||
//
|
||||
inline void glm_mat4u_mul(const glm::mat4& m1, const glm::mat4& m2, glm::mat4& r) {
|
||||
|
||||
#if GLM_ARCH & GLM_ARCH_SSE2_BIT
|
||||
__m128 u0 = _mm_loadu_ps((float*)&m1[0][0]);
|
||||
__m128 u1 = _mm_loadu_ps((float*)&m1[1][0]);
|
||||
__m128 u2 = _mm_loadu_ps((float*)&m1[2][0]);
|
||||
__m128 u3 = _mm_loadu_ps((float*)&m1[3][0]);
|
||||
|
||||
__m128 v0 = _mm_loadu_ps((float*)&m2[0][0]);
|
||||
__m128 v1 = _mm_loadu_ps((float*)&m2[1][0]);
|
||||
__m128 v2 = _mm_loadu_ps((float*)&m2[2][0]);
|
||||
__m128 v3 = _mm_loadu_ps((float*)&m2[3][0]);
|
||||
|
||||
__m128 t0 = _mm_mul_ps(_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,0,0,0)), u0);
|
||||
__m128 t1 = _mm_mul_ps(_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(1,1,1,1)), u1);
|
||||
__m128 t2 = _mm_mul_ps(_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(2,2,2,2)), u2);
|
||||
__m128 t3 = _mm_mul_ps(_mm_shuffle_ps(v0, v0, _MM_SHUFFLE(3,3,3,3)), u3);
|
||||
v0 = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
|
||||
|
||||
t0 = _mm_mul_ps(_mm_shuffle_ps(v1, v1, _MM_SHUFFLE(0,0,0,0)), u0);
|
||||
t1 = _mm_mul_ps(_mm_shuffle_ps(v1, v1, _MM_SHUFFLE(1,1,1,1)), u1);
|
||||
t2 = _mm_mul_ps(_mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2,2,2,2)), u2);
|
||||
t3 = _mm_mul_ps(_mm_shuffle_ps(v1, v1, _MM_SHUFFLE(3,3,3,3)), u3);
|
||||
v1 = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
|
||||
|
||||
t0 = _mm_mul_ps(_mm_shuffle_ps(v2, v2, _MM_SHUFFLE(0,0,0,0)), u0);
|
||||
t1 = _mm_mul_ps(_mm_shuffle_ps(v2, v2, _MM_SHUFFLE(1,1,1,1)), u1);
|
||||
t2 = _mm_mul_ps(_mm_shuffle_ps(v2, v2, _MM_SHUFFLE(2,2,2,2)), u2);
|
||||
t3 = _mm_mul_ps(_mm_shuffle_ps(v2, v2, _MM_SHUFFLE(3,3,3,3)), u3);
|
||||
v2 = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
|
||||
|
||||
t0 = _mm_mul_ps(_mm_shuffle_ps(v3, v3, _MM_SHUFFLE(0,0,0,0)), u0);
|
||||
t1 = _mm_mul_ps(_mm_shuffle_ps(v3, v3, _MM_SHUFFLE(1,1,1,1)), u1);
|
||||
t2 = _mm_mul_ps(_mm_shuffle_ps(v3, v3, _MM_SHUFFLE(2,2,2,2)), u2);
|
||||
t3 = _mm_mul_ps(_mm_shuffle_ps(v3, v3, _MM_SHUFFLE(3,3,3,3)), u3);
|
||||
v3 = _mm_add_ps(_mm_add_ps(t0, t1), _mm_add_ps(t2, t3));
|
||||
|
||||
_mm_storeu_ps((float*)&r[0][0], v0);
|
||||
_mm_storeu_ps((float*)&r[1][0], v1);
|
||||
_mm_storeu_ps((float*)&r[2][0], v2);
|
||||
_mm_storeu_ps((float*)&r[3][0], v3);
|
||||
#else
|
||||
r = m1 * m2;
|
||||
#endif
|
||||
}
|
||||
|
||||
#endif // hifi_GLMHelpers_h
|
||||
|
|
|
@ -115,8 +115,8 @@ void GLMHelpersTests::testSimd() {
|
|||
|
||||
a1 = a * b;
|
||||
b1 = b * a;
|
||||
glm_mat4_mul((glm_vec4*)&a, (glm_vec4*)&b, (glm_vec4*)&a2);
|
||||
glm_mat4_mul((glm_vec4*)&b, (glm_vec4*)&a, (glm_vec4*)&b2);
|
||||
glm_mat4u_mul(a, b, a2);
|
||||
glm_mat4u_mul(b, a, b2);
|
||||
|
||||
|
||||
{
|
||||
|
@ -133,8 +133,8 @@ void GLMHelpersTests::testSimd() {
|
|||
QElapsedTimer timer;
|
||||
timer.start();
|
||||
for (size_t i = 0; i < LOOPS; ++i) {
|
||||
glm_mat4_mul((glm_vec4*)&a, (glm_vec4*)&b, (glm_vec4*)&a2);
|
||||
glm_mat4_mul((glm_vec4*)&b, (glm_vec4*)&a, (glm_vec4*)&b2);
|
||||
glm_mat4u_mul(a, b, a2);
|
||||
glm_mat4u_mul(b, a, b2);
|
||||
}
|
||||
qDebug() << "SIMD " << timer.elapsed();
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue