From 9e309b095da40440ac5b5c6a052b1bdc9a5b8e68 Mon Sep 17 00:00:00 2001
From: Ken Cooke
Date: Sun, 23 Jun 2019 08:00:13 -0700
Subject: [PATCH 1/8] Fix bug that was packing zero offsets that were never used

---
 libraries/render-utils/src/Model.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp
index 64a46f3c1e..eccc7287b2 100644
--- a/libraries/render-utils/src/Model.cpp
+++ b/libraries/render-utils/src/Model.cpp
@@ -1738,6 +1738,9 @@ void Blender::run() {
     int numMeshes = 0; // number of meshes in this model.
     for (auto meshIter = _hfmModel->meshes.cbegin(); meshIter != _hfmModel->meshes.cend(); ++meshIter) {
         numMeshes++;
+        if (meshIter->blendshapes.isEmpty()) {
+            continue;
+        }
         int numVertsInMesh = meshIter->vertices.size();
         numBlendshapeOffsets += numVertsInMesh;
     }

From 87c680382f5c11bb8556e901fcff383fdd2b1285 Mon Sep 17 00:00:00 2001
From: Ken Cooke
Date: Sun, 23 Jun 2019 08:42:38 -0700
Subject: [PATCH 2/8] Strip-mining optimization to improve cache utilization.

For each mesh: init, accumulate, and pack using a recycled offset buffer.
---
 libraries/render-utils/src/Model.cpp | 32 +++++++++++++++++++-------------
 1 file changed, 19 insertions(+), 13 deletions(-)

diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp
index eccc7287b2..6e969fe51f 100644
--- a/libraries/render-utils/src/Model.cpp
+++ b/libraries/render-utils/src/Model.cpp
@@ -1735,6 +1735,7 @@ Blender::Blender(ModelPointer model, HFMModel::ConstPointer hfmModel, int blendN
 void Blender::run() {
     DETAILED_PROFILE_RANGE_EX(simulation_animation, __FUNCTION__, 0xFFFF0000, 0, { { "url", _model->getURL().toString() } });
     int numBlendshapeOffsets = 0; // number of offsets required for all meshes.
+    int maxBlendshapeOffsets = 0; // number of offsets in the largest mesh.
     int numMeshes = 0; // number of meshes in this model.
     for (auto meshIter = _hfmModel->meshes.cbegin(); meshIter != _hfmModel->meshes.cend(); ++meshIter) {
         numMeshes++;
@@ -1743,16 +1744,19 @@ void Blender::run() {
         }
         int numVertsInMesh = meshIter->vertices.size();
         numBlendshapeOffsets += numVertsInMesh;
+        maxBlendshapeOffsets = std::max(maxBlendshapeOffsets, numVertsInMesh);
     }

-    // all elements are default constructed to zero offsets.
-    QVector<BlendshapeOffsetPacked> packedBlendshapeOffsets(numBlendshapeOffsets);
-    QVector<BlendshapeOffsetUnpacked> unpackedBlendshapeOffsets(numBlendshapeOffsets);
-
-    // allocate the required size
+    // allocate the required sizes
     QVector<int> blendedMeshSizes;
     blendedMeshSizes.reserve(numMeshes);

+    QVector<BlendshapeOffsetPacked> packedBlendshapeOffsets;
+    packedBlendshapeOffsets.resize(numBlendshapeOffsets);
+
+    QVector<BlendshapeOffsetUnpacked> unpackedBlendshapeOffsets;
+    unpackedBlendshapeOffsets.resize(maxBlendshapeOffsets); // reuse for all meshes
+
     int offset = 0;
     for (auto meshIter = _hfmModel->meshes.cbegin(); meshIter != _hfmModel->meshes.cend(); ++meshIter) {
         if (meshIter->blendshapes.isEmpty()) {
@@ -1762,6 +1766,9 @@ void Blender::run() {
         int numVertsInMesh = meshIter->vertices.size();
         blendedMeshSizes.push_back(numVertsInMesh);

+        // initialize offsets to zero
+        memset(unpackedBlendshapeOffsets.data(), 0, numVertsInMesh * sizeof(BlendshapeOffsetUnpacked));
+
         // for each blendshape in this mesh, accumulate the offsets into unpackedBlendshapeOffsets.
         const float NORMAL_COEFFICIENT_SCALE = 0.01f;
         for (int i = 0, n = qMin(_blendshapeCoefficients.size(), meshIter->blendshapes.size()); i < n; i++) {
@@ -1776,7 +1783,7 @@ void Blender::run() {

             for (int j = 0; j < blendshape.indices.size(); ++j) {
                 int index = blendshape.indices.at(j);

-                auto& currentBlendshapeOffset = unpackedBlendshapeOffsets[offset + index];
+                auto& currentBlendshapeOffset = unpackedBlendshapeOffsets[index];
                 currentBlendshapeOffset.positionOffset += blendshape.vertices.at(j) * vertexCoefficient;
                 currentBlendshapeOffset.normalOffset += blendshape.normals.at(j) * normalCoefficient;
                 if (j < blendshape.tangents.size()) {
@@ -1784,20 +1791,19 @@ void Blender::run() {
                 }
             }
         }
-        offset += numVertsInMesh;
-    }

-    // convert unpackedBlendshapeOffsets into packedBlendshapeOffsets for the gpu.
-    // FIXME it feels like we could be more effectively using SIMD here
-    {
+        // convert unpackedBlendshapeOffsets into packedBlendshapeOffsets for the gpu.
         auto unpacked = unpackedBlendshapeOffsets.data();
-        auto packed = packedBlendshapeOffsets.data();
-        for (int i = 0; i < unpackedBlendshapeOffsets.size(); ++i) {
+        auto packed = packedBlendshapeOffsets.data() + offset;
+        for (int i = 0; i < numVertsInMesh; ++i) {
             packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
             ++unpacked;
             ++packed;
         }
+
+        offset += numVertsInMesh;
     }
+    Q_ASSERT(offset == numBlendshapeOffsets);

     // post the result to the ModelBlender, which will dispatch to the model if still alive
     QMetaObject::invokeMethod(DependencyManager::get<ModelBlender>().data(), "setBlendedVertices",

From cceff21cd0ea1d583e39c6dea5f333df909bed10 Mon Sep 17 00:00:00 2001
From: Ken Cooke
Date: Sun, 23 Jun 2019 08:57:56 -0700
Subject: [PATCH 3/8] Pull packBlendshapeOffsets() into separate function

---
 libraries/render-utils/src/Model.cpp | 19 ++++++++++---------
 1 file changed, 10 insertions(+), 9 deletions(-)

diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp
index 6e969fe51f..6a06047bef 100644
--- a/libraries/render-utils/src/Model.cpp
+++ b/libraries/render-utils/src/Model.cpp
@@ -1691,10 +1691,7 @@ public:
     }
 };

-
-using packBlendshapeOffsetTo = void(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked);
-
-void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) {
+static void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) {
     float len = glm::compMax(glm::abs(unpacked.positionOffset));
     glm::vec3 normalizedPos(unpacked.positionOffset);
     if (len > 0.0f) {
@@ -1711,6 +1708,14 @@ void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& pac
     );
 }

+static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
+    for (int i = 0; i < size; ++i) {
+        packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
+        ++unpacked;
+        ++packed;
+    }
+}
+
 class Blender : public QRunnable {
 public:

@@ -1795,11 +1800,7 @@ void Blender::run() {
         // convert unpackedBlendshapeOffsets into packedBlendshapeOffsets for the gpu.
         auto unpacked = unpackedBlendshapeOffsets.data();
         auto packed = packedBlendshapeOffsets.data() + offset;
-        for (int i = 0; i < numVertsInMesh; ++i) {
-            packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
-            ++unpacked;
-            ++packed;
-        }
+        packBlendshapeOffsets(unpacked, packed, numVertsInMesh);

         offset += numVertsInMesh;
     }

From 8653118b6cb09810b7d1688ddb62975d52b96f05 Mon Sep 17 00:00:00 2001
From: Ken Cooke
Date: Sun, 23 Jun 2019 10:32:57 -0700
Subject: [PATCH 4/8] Full SIMD implementation of packBlendshapeOffsets() using AVX2.

6x speedup over the existing (partial SIMD) version.
60x speedup over the original (pure GLM) version.
---
 libraries/render-utils/src/Model.cpp          |  10 +-
 .../src/avx2/BlendshapePacking_avx2.cpp       | 285 ++++++++++++++++++
 2 files changed, 294 insertions(+), 1 deletion(-)
 create mode 100644 libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp

diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp
index 6a06047bef..67f64395b2 100644
--- a/libraries/render-utils/src/Model.cpp
+++ b/libraries/render-utils/src/Model.cpp
@@ -1708,7 +1708,7 @@ static void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uve
     );
 }

-static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
+static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
     for (int i = 0; i < size; ++i) {
         packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
         ++unpacked;
         ++packed;
     }
 }

+void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size);
+
+static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
+    static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match.");
+    static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't match.");
+    packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size);
+}
+
 class Blender : public QRunnable {
 public:

diff --git a/libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp b/libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp
new file mode 100644
index 0000000000..5524c355dc
--- /dev/null
+++ b/libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp
@@ -0,0 +1,285 @@
+//
+// BlendshapePacking_avx2.cpp
+//
+// Created by Ken Cooke on 6/22/19.
+// Copyright 2019 High Fidelity, Inc.
+//
+// Distributed under the Apache License, Version 2.0.
+// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html +// + +#ifdef __AVX2__ + +#include +#include + +void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size) { + + int i = 0; + for (; i < size - 7; i += 8) { // blocks of 8 + + // + // deinterleave (8x9 to 9x8 matrix transpose) + // + __m256 s0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+0][0])), _mm_load_ps(&unpacked[i+4][0]), 1); + __m256 s1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+1][0])), _mm_load_ps(&unpacked[i+5][0]), 1); + __m256 s2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+2][0])), _mm_load_ps(&unpacked[i+6][0]), 1); + __m256 s3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+3][0])), _mm_load_ps(&unpacked[i+7][0]), 1); + __m256 s4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+0][4])), _mm_load_ps(&unpacked[i+4][4]), 1); + __m256 s5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+1][4])), _mm_load_ps(&unpacked[i+5][4]), 1); + __m256 s6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+2][4])), _mm_load_ps(&unpacked[i+6][4]), 1); + __m256 s7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+3][4])), _mm_load_ps(&unpacked[i+7][4]), 1); + + __m256 t0 = _mm256_unpacklo_ps(s0, s1); + __m256 t1 = _mm256_unpackhi_ps(s0, s1); + __m256 t2 = _mm256_unpacklo_ps(s2, s3); + __m256 t3 = _mm256_unpackhi_ps(s2, s3); + __m256 t4 = _mm256_unpacklo_ps(s4, s5); + __m256 t5 = _mm256_unpackhi_ps(s4, s5); + __m256 t6 = _mm256_unpacklo_ps(s6, s7); + __m256 t7 = _mm256_unpackhi_ps(s6, s7); + + __m256 px = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0)); + __m256 py = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2)); + __m256 pz = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0)); + __m256 nx = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2)); + __m256 ny = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0)); + __m256 nz = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2)); + __m256 tx = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0)); + __m256 ty = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2)); + + __m256 tz = _mm256_i32gather_ps(unpacked[i+0], _mm256_setr_epi32(8,17,26,35,44,53,62,71), sizeof(float)); + + // abs(pos) + __m256 apx = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), px); + __m256 apy = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), py); + __m256 apz = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), pz); + + // len = compMax(abs(pos)) + __m256 len = _mm256_max_ps(_mm256_max_ps(apx, apy), apz); + + // detect zeros + __m256 mask = _mm256_cmp_ps(len, _mm256_setzero_ps(), _CMP_EQ_OQ); + + // rcp = 1.0f / len + __m256 rcp = _mm256_div_ps(_mm256_set1_ps(1.0f), len); + + // replace +inf with 1.0f + rcp = _mm256_blendv_ps(rcp, _mm256_set1_ps(1.0f), mask); + len = _mm256_blendv_ps(len, _mm256_set1_ps(1.0f), mask); + + // pos *= 1.0f / len + px = _mm256_mul_ps(px, rcp); + py = _mm256_mul_ps(py, rcp); + pz = _mm256_mul_ps(pz, rcp); + + // clamp(vec, -1.0f, 1.0f) + px = _mm256_min_ps(_mm256_max_ps(px, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + py = _mm256_min_ps(_mm256_max_ps(py, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + pz = _mm256_min_ps(_mm256_max_ps(pz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + nx = _mm256_min_ps(_mm256_max_ps(nx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + ny = _mm256_min_ps(_mm256_max_ps(ny, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + nz = _mm256_min_ps(_mm256_max_ps(nz, 
_mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + tx = _mm256_min_ps(_mm256_max_ps(tx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + ty = _mm256_min_ps(_mm256_max_ps(ty, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + tz = _mm256_min_ps(_mm256_max_ps(tz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + + // vec *= 511.0f + px = _mm256_mul_ps(px, _mm256_set1_ps(511.0f)); + py = _mm256_mul_ps(py, _mm256_set1_ps(511.0f)); + pz = _mm256_mul_ps(pz, _mm256_set1_ps(511.0f)); + nx = _mm256_mul_ps(nx, _mm256_set1_ps(511.0f)); + ny = _mm256_mul_ps(ny, _mm256_set1_ps(511.0f)); + nz = _mm256_mul_ps(nz, _mm256_set1_ps(511.0f)); + tx = _mm256_mul_ps(tx, _mm256_set1_ps(511.0f)); + ty = _mm256_mul_ps(ty, _mm256_set1_ps(511.0f)); + tz = _mm256_mul_ps(tz, _mm256_set1_ps(511.0f)); + + // veci = lrint(vec) & 03ff + __m256i pxi = _mm256_and_si256(_mm256_cvtps_epi32(px), _mm256_set1_epi32(0x3ff)); + __m256i pyi = _mm256_and_si256(_mm256_cvtps_epi32(py), _mm256_set1_epi32(0x3ff)); + __m256i pzi = _mm256_and_si256(_mm256_cvtps_epi32(pz), _mm256_set1_epi32(0x3ff)); + __m256i nxi = _mm256_and_si256(_mm256_cvtps_epi32(nx), _mm256_set1_epi32(0x3ff)); + __m256i nyi = _mm256_and_si256(_mm256_cvtps_epi32(ny), _mm256_set1_epi32(0x3ff)); + __m256i nzi = _mm256_and_si256(_mm256_cvtps_epi32(nz), _mm256_set1_epi32(0x3ff)); + __m256i txi = _mm256_and_si256(_mm256_cvtps_epi32(tx), _mm256_set1_epi32(0x3ff)); + __m256i tyi = _mm256_and_si256(_mm256_cvtps_epi32(ty), _mm256_set1_epi32(0x3ff)); + __m256i tzi = _mm256_and_si256(_mm256_cvtps_epi32(tz), _mm256_set1_epi32(0x3ff)); + + // pack = (xi << 0) | (yi << 10) | (zi << 20); + __m256i li = _mm256_castps_si256(len); // length + __m256i pi = _mm256_or_si256(_mm256_or_si256(pxi, _mm256_slli_epi32(pyi, 10)), _mm256_slli_epi32(pzi, 20)); // position + __m256i ni = _mm256_or_si256(_mm256_or_si256(nxi, _mm256_slli_epi32(nyi, 10)), _mm256_slli_epi32(nzi, 20)); // normal + __m256i ti = _mm256_or_si256(_mm256_or_si256(txi, _mm256_slli_epi32(tyi, 10)), _mm256_slli_epi32(tzi, 20)); // tangent + + // + // interleave (4x4 matrix transpose) + // + __m256i u0 = _mm256_unpacklo_epi32(li, pi); + __m256i u1 = _mm256_unpackhi_epi32(li, pi); + __m256i u2 = _mm256_unpacklo_epi32(ni, ti); + __m256i u3 = _mm256_unpackhi_epi32(ni, ti); + + __m256i v0 = _mm256_unpacklo_epi64(u0, u2); + __m256i v1 = _mm256_unpackhi_epi64(u0, u2); + __m256i v2 = _mm256_unpacklo_epi64(u1, u3); + __m256i v3 = _mm256_unpackhi_epi64(u1, u3); + + __m256i w0 = _mm256_permute2f128_si256(v0, v1, 0x20); + __m256i w1 = _mm256_permute2f128_si256(v2, v3, 0x20); + __m256i w2 = _mm256_permute2f128_si256(v0, v1, 0x31); + __m256i w3 = _mm256_permute2f128_si256(v2, v3, 0x31); + + // store pack x 8 + _mm256_storeu_si256((__m256i*)packed[i+0], w0); + _mm256_storeu_si256((__m256i*)packed[i+2], w1); + _mm256_storeu_si256((__m256i*)packed[i+4], w2); + _mm256_storeu_si256((__m256i*)packed[i+6], w3); + } + + if (i < size) { // remainder + int rem = size - i; + + // + // deinterleave (8x9 to 9x8 matrix transpose) + // + __m256 s0 = _mm256_setzero_ps(); + __m256 s1 = _mm256_setzero_ps(); + __m256 s2 = _mm256_setzero_ps(); + __m256 s3 = _mm256_setzero_ps(); + __m256 s4 = _mm256_setzero_ps(); + __m256 s5 = _mm256_setzero_ps(); + __m256 s6 = _mm256_setzero_ps(); + __m256 s7 = _mm256_setzero_ps(); + + switch (rem) { + case 7: s6 = _mm256_loadu_ps(unpacked[i+6]); + case 6: s5 = _mm256_loadu_ps(unpacked[i+5]); + case 5: s4 = _mm256_loadu_ps(unpacked[i+4]); + case 4: s3 = _mm256_loadu_ps(unpacked[i+3]); + case 3: s2 = _mm256_loadu_ps(unpacked[i+2]); + 
case 2: s1 = _mm256_loadu_ps(unpacked[i+1]); + case 1: s0 = _mm256_loadu_ps(unpacked[i+0]); + } + + __m256 t0 = _mm256_unpacklo_ps(s0, s1); + __m256 t1 = _mm256_unpackhi_ps(s0, s1); + __m256 t2 = _mm256_unpacklo_ps(s2, s3); + __m256 t3 = _mm256_unpackhi_ps(s2, s3); + __m256 t4 = _mm256_unpacklo_ps(s4, s5); + __m256 t5 = _mm256_unpackhi_ps(s4, s5); + __m256 t6 = _mm256_unpacklo_ps(s6, s7); + __m256 t7 = _mm256_unpackhi_ps(s6, s7); + + s0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0)); + s1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2)); + s2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0)); + s3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2)); + s4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0)); + s5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2)); + s6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0)); + s7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2)); + + __m256 px = _mm256_permute2f128_ps(s0, s4, 0x20); + __m256 py = _mm256_permute2f128_ps(s1, s5, 0x20); + __m256 pz = _mm256_permute2f128_ps(s2, s6, 0x20); + __m256 nx = _mm256_permute2f128_ps(s3, s7, 0x20); + __m256 ny = _mm256_permute2f128_ps(s0, s4, 0x31); + __m256 nz = _mm256_permute2f128_ps(s1, s5, 0x31); + __m256 tx = _mm256_permute2f128_ps(s2, s6, 0x31); + __m256 ty = _mm256_permute2f128_ps(s3, s7, 0x31); + + __m256i loadmask = _mm256_cvtepi8_epi32(_mm_cvtsi64_si128(0xffffffffffffffffULL >> (64 - 8 * rem))); + __m256 tz = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), unpacked[i+0], _mm256_setr_epi32(8,17,26,35,44,53,62,71), + _mm256_castsi256_ps(loadmask), sizeof(float)); + // abs(pos) + __m256 apx = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), px); + __m256 apy = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), py); + __m256 apz = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), pz); + + // len = compMax(abs(pos)) + __m256 len = _mm256_max_ps(_mm256_max_ps(apx, apy), apz); + + // detect zeros + __m256 mask = _mm256_cmp_ps(len, _mm256_setzero_ps(), _CMP_EQ_OQ); + + // rcp = 1.0f / len + __m256 rcp = _mm256_div_ps(_mm256_set1_ps(1.0f), len); + + // replace +inf with 1.0f + rcp = _mm256_blendv_ps(rcp, _mm256_set1_ps(1.0f), mask); + len = _mm256_blendv_ps(len, _mm256_set1_ps(1.0f), mask); + + // pos *= 1.0f / len + px = _mm256_mul_ps(px, rcp); + py = _mm256_mul_ps(py, rcp); + pz = _mm256_mul_ps(pz, rcp); + + // clamp(vec, -1.0f, 1.0f) + px = _mm256_min_ps(_mm256_max_ps(px, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + py = _mm256_min_ps(_mm256_max_ps(py, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + pz = _mm256_min_ps(_mm256_max_ps(pz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + nx = _mm256_min_ps(_mm256_max_ps(nx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + ny = _mm256_min_ps(_mm256_max_ps(ny, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + nz = _mm256_min_ps(_mm256_max_ps(nz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + tx = _mm256_min_ps(_mm256_max_ps(tx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + ty = _mm256_min_ps(_mm256_max_ps(ty, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + tz = _mm256_min_ps(_mm256_max_ps(tz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f)); + + // vec *= 511.0f + px = _mm256_mul_ps(px, _mm256_set1_ps(511.0f)); + py = _mm256_mul_ps(py, _mm256_set1_ps(511.0f)); + pz = _mm256_mul_ps(pz, _mm256_set1_ps(511.0f)); + nx = _mm256_mul_ps(nx, _mm256_set1_ps(511.0f)); + ny = _mm256_mul_ps(ny, _mm256_set1_ps(511.0f)); + nz = _mm256_mul_ps(nz, _mm256_set1_ps(511.0f)); + tx = _mm256_mul_ps(tx, _mm256_set1_ps(511.0f)); + ty = _mm256_mul_ps(ty, _mm256_set1_ps(511.0f)); + tz = _mm256_mul_ps(tz, 
_mm256_set1_ps(511.0f)); + + // veci = lrint(vec) & 03ff + __m256i pxi = _mm256_and_si256(_mm256_cvtps_epi32(px), _mm256_set1_epi32(0x3ff)); + __m256i pyi = _mm256_and_si256(_mm256_cvtps_epi32(py), _mm256_set1_epi32(0x3ff)); + __m256i pzi = _mm256_and_si256(_mm256_cvtps_epi32(pz), _mm256_set1_epi32(0x3ff)); + __m256i nxi = _mm256_and_si256(_mm256_cvtps_epi32(nx), _mm256_set1_epi32(0x3ff)); + __m256i nyi = _mm256_and_si256(_mm256_cvtps_epi32(ny), _mm256_set1_epi32(0x3ff)); + __m256i nzi = _mm256_and_si256(_mm256_cvtps_epi32(nz), _mm256_set1_epi32(0x3ff)); + __m256i txi = _mm256_and_si256(_mm256_cvtps_epi32(tx), _mm256_set1_epi32(0x3ff)); + __m256i tyi = _mm256_and_si256(_mm256_cvtps_epi32(ty), _mm256_set1_epi32(0x3ff)); + __m256i tzi = _mm256_and_si256(_mm256_cvtps_epi32(tz), _mm256_set1_epi32(0x3ff)); + + // pack = (xi << 0) | (yi << 10) | (zi << 20); + __m256i li = _mm256_castps_si256(len); // length + __m256i pi = _mm256_or_si256(_mm256_or_si256(pxi, _mm256_slli_epi32(pyi, 10)), _mm256_slli_epi32(pzi, 20)); // position + __m256i ni = _mm256_or_si256(_mm256_or_si256(nxi, _mm256_slli_epi32(nyi, 10)), _mm256_slli_epi32(nzi, 20)); // normal + __m256i ti = _mm256_or_si256(_mm256_or_si256(txi, _mm256_slli_epi32(tyi, 10)), _mm256_slli_epi32(tzi, 20)); // tangent + + // + // interleave (4x4 matrix transpose) + // + __m256i u0 = _mm256_unpacklo_epi32(li, pi); + __m256i u1 = _mm256_unpackhi_epi32(li, pi); + __m256i u2 = _mm256_unpacklo_epi32(ni, ti); + __m256i u3 = _mm256_unpackhi_epi32(ni, ti); + + __m256i v0 = _mm256_unpacklo_epi64(u0, u2); + __m256i v1 = _mm256_unpackhi_epi64(u0, u2); + __m256i v2 = _mm256_unpacklo_epi64(u1, u3); + __m256i v3 = _mm256_unpackhi_epi64(u1, u3); + + // store pack x 8 + switch (rem) { + case 7: _mm_storeu_si128((__m128i*)packed[i+6], _mm256_extractf128_si256(v2, 1)); + case 6: _mm_storeu_si128((__m128i*)packed[i+5], _mm256_extractf128_si256(v1, 1)); + case 5: _mm_storeu_si128((__m128i*)packed[i+4], _mm256_extractf128_si256(v0, 1)); + case 4: _mm_storeu_si128((__m128i*)packed[i+3], _mm256_castsi256_si128(v3)); + case 3: _mm_storeu_si128((__m128i*)packed[i+2], _mm256_castsi256_si128(v2)); + case 2: _mm_storeu_si128((__m128i*)packed[i+1], _mm256_castsi256_si128(v1)); + case 1: _mm_storeu_si128((__m128i*)packed[i+0], _mm256_castsi256_si128(v0)); + } + } + + _mm256_zeroupper(); +} + +#endif From 87e0f5b2bba804f892741217a893e345f6283b53 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Sun, 23 Jun 2019 10:40:15 -0700 Subject: [PATCH 5/8] Runtime dispatch for AVX2 --- libraries/render-utils/src/Model.cpp | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp index 67f64395b2..164090709a 100644 --- a/libraries/render-utils/src/Model.cpp +++ b/libraries/render-utils/src/Model.cpp @@ -1716,14 +1716,29 @@ static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, Blends } } +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) +// +// Runtime CPU dispatch +// +#include "CPUDetect.h" + void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size); static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) { - static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match."); - static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't 
match."); - packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size); + static bool _cpuSupportsAVX2 = cpuSupportsAVX2(); + if (_cpuSupportsAVX2) { + static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match."); + static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't match."); + packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size); + } else { + packBlendshapeOffsets_ref(unpacked, packed, size); + } } +#else // portable reference code +static auto& packBlendshapeOffsets = packBlendshapeOffsets_ref; +#endif + class Blender : public QRunnable { public: From 75aea88f200b0b3cf716aba5aa1871739701097d Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Mon, 24 Jun 2019 11:51:03 -0700 Subject: [PATCH 6/8] CR feedback --- libraries/render-utils/src/Model.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libraries/render-utils/src/Model.cpp b/libraries/render-utils/src/Model.cpp index 164090709a..11c1e42fd4 100644 --- a/libraries/render-utils/src/Model.cpp +++ b/libraries/render-utils/src/Model.cpp @@ -1720,7 +1720,7 @@ static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, Blends // // Runtime CPU dispatch // -#include "CPUDetect.h" +#include void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size); From 541d45012a0677f65757e1e84add0654ba3b2872 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Mon, 24 Jun 2019 12:54:58 -0700 Subject: [PATCH 7/8] Move BlendshapePacking_avx2.cpp to shared --- .../{render-utils => shared}/src/avx2/BlendshapePacking_avx2.cpp | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename libraries/{render-utils => shared}/src/avx2/BlendshapePacking_avx2.cpp (100%) diff --git a/libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp b/libraries/shared/src/avx2/BlendshapePacking_avx2.cpp similarity index 100% rename from libraries/render-utils/src/avx2/BlendshapePacking_avx2.cpp rename to libraries/shared/src/avx2/BlendshapePacking_avx2.cpp From 2322df5a5930f436fcf43389a906f97f638f71b0 Mon Sep 17 00:00:00 2001 From: Ken Cooke Date: Mon, 24 Jun 2019 12:57:42 -0700 Subject: [PATCH 8/8] Add unit-test to validate AVX2 --- tests/shared/src/BlendshapePackingTests.cpp | 148 ++++++++++++++++++++ tests/shared/src/BlendshapePackingTests.h | 23 +++ 2 files changed, 171 insertions(+) create mode 100644 tests/shared/src/BlendshapePackingTests.cpp create mode 100644 tests/shared/src/BlendshapePackingTests.h diff --git a/tests/shared/src/BlendshapePackingTests.cpp b/tests/shared/src/BlendshapePackingTests.cpp new file mode 100644 index 0000000000..a751a5ca02 --- /dev/null +++ b/tests/shared/src/BlendshapePackingTests.cpp @@ -0,0 +1,148 @@ +// +// BlendshapePackingTests.cpp +// tests/shared/src +// +// Created by Ken Cooke on 6/24/19. +// Copyright 2019 High Fidelity, Inc. +// +// Distributed under the Apache License, Version 2.0. 
+// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html +// + +#include "BlendshapePackingTests.h" + +#include + +#include + +#include +#include + +struct BlendshapeOffsetUnpacked { + glm::vec3 positionOffset; + glm::vec3 normalOffset; + glm::vec3 tangentOffset; +}; + +struct BlendshapeOffsetPacked { + glm::uvec4 packedPosNorTan; +}; + +QTEST_MAIN(BlendshapePackingTests) + +static void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) { + float len = glm::compMax(glm::abs(unpacked.positionOffset)); + glm::vec3 normalizedPos(unpacked.positionOffset); + if (len > 0.0f) { + normalizedPos /= len; + } else { + len = 1.0f; + } + + packed = glm::uvec4( + glm::floatBitsToUint(len), + glm_packSnorm3x10_1x2(glm::vec4(normalizedPos, 0.0f)), + glm_packSnorm3x10_1x2(glm::vec4(unpacked.normalOffset, 0.0f)), + glm_packSnorm3x10_1x2(glm::vec4(unpacked.tangentOffset, 0.0f)) + ); +} + +static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) { + for (int i = 0; i < size; ++i) { + packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked)); + ++unpacked; + ++packed; + } +} + +#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__) +// +// Runtime CPU dispatch +// +#include + +void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size); + +static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) { + static bool _cpuSupportsAVX2 = cpuSupportsAVX2(); + if (_cpuSupportsAVX2) { + static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match."); + static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't match."); + packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size); + } else { + packBlendshapeOffsets_ref(unpacked, packed, size); + } +} + +#else // portable reference code +static auto& packBlendshapeOffsets = packBlendshapeOffsets_ref; +#endif + +void comparePacked(BlendshapeOffsetPacked& ref, BlendshapeOffsetPacked& tst) { + union i10i10i10i2 { + struct { + int x : 10; + int y : 10; + int z : 10; + int w : 2; + } data; + uint32_t pack; + } Ref[4], Tst[4]; + + for (int i = 0; i < 4; i++) { + Ref[i].pack = ref.packedPosNorTan[i]; + Tst[i].pack = tst.packedPosNorTan[i]; + } + + // allow 1 ULP due to rounding differences + QCOMPARE_WITH_ABS_ERROR(Tst[0].pack, Ref[0].pack, 1); + + QCOMPARE_WITH_ABS_ERROR(Tst[1].data.x, Ref[1].data.x, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[1].data.y, Ref[1].data.y, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[1].data.z, Ref[1].data.z, 1); + + QCOMPARE_WITH_ABS_ERROR(Tst[2].data.x, Ref[2].data.x, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[2].data.y, Ref[2].data.y, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[2].data.z, Ref[2].data.z, 1); + + QCOMPARE_WITH_ABS_ERROR(Tst[3].data.x, Ref[3].data.x, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[3].data.y, Ref[3].data.y, 1); + QCOMPARE_WITH_ABS_ERROR(Tst[3].data.z, Ref[3].data.z, 1); +} + +void BlendshapePackingTests::testAVX2() { + + for (int numBlendshapeOffsets = 0; numBlendshapeOffsets < 4096; ++numBlendshapeOffsets) { + + std::vector unpackedBlendshapeOffsets(numBlendshapeOffsets); + std::vector packedBlendshapeOffsets1(numBlendshapeOffsets); + std::vector packedBlendshapeOffsets2(numBlendshapeOffsets); + + // init test data + if 
(numBlendshapeOffsets > 0) { + unpackedBlendshapeOffsets[0] = { + glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 0.0f, 0.0f), + }; + } + for (int i = 1; i < numBlendshapeOffsets; ++i) { + unpackedBlendshapeOffsets[i] = { + glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)), + glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)), + glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)), + }; + } + + // ref version + packBlendshapeOffsets_ref(unpackedBlendshapeOffsets.data(), packedBlendshapeOffsets1.data(), numBlendshapeOffsets); + + // AVX2 version, if supported by CPU + packBlendshapeOffsets(unpackedBlendshapeOffsets.data(), packedBlendshapeOffsets2.data(), numBlendshapeOffsets); + + // verify + for (int i = 0; i < numBlendshapeOffsets; ++i) { + auto ref = packedBlendshapeOffsets1.at(i); + auto tst = packedBlendshapeOffsets2.at(i); + comparePacked(ref, tst); + } + } +} diff --git a/tests/shared/src/BlendshapePackingTests.h b/tests/shared/src/BlendshapePackingTests.h new file mode 100644 index 0000000000..d6d0ef8b10 --- /dev/null +++ b/tests/shared/src/BlendshapePackingTests.h @@ -0,0 +1,23 @@ +// +// BlendshapePackingTests.h +// tests/shared/src +// +// Created by Ken Cooke on 6/24/19. +// Copyright 2019 High Fidelity, Inc. +// +// Distributed under the Apache License, Version 2.0. +// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html +// + +#ifndef hifi_BlendshapePackingTests_h +#define hifi_BlendshapePackingTests_h + +#include + +class BlendshapePackingTests : public QObject { + Q_OBJECT +private slots: + void testAVX2(); +}; + +#endif // hifi_BlendshapePackingTests_h
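
Note on the packed format used throughout this series: packBlendshapeOffsets_ref() and the AVX2 kernel both emit one uvec4 per vertex, storing the raw float bits of the position-offset length in the first word and three signed-normalized 10-bit (SN10) vectors, position, normal, and tangent, in the remaining words; keeping the length as a full float lets the 10-bit position fields stay normalized regardless of offset magnitude. The standalone scalar sketch below shows the same layout with plain bit operations instead of glm_packSnorm3x10_1x2; it is illustrative only (packSnorm10 and packOffset are hypothetical names, not hifi code):

// Illustrative scalar sketch of the Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10 layout.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Clamp to [-1, 1], scale by 511, round to nearest, keep the low 10 bits (two's complement).
static uint32_t packSnorm10(float v) {
    v = std::min(std::max(v, -1.0f), 1.0f);
    int32_t i = static_cast<int32_t>(std::lrint(v * 511.0f));
    return static_cast<uint32_t>(i) & 0x3ff;
}

// out[0] = float bits of the position length, out[1..3] = x | y<<10 | z<<20 for pos/normal/tangent.
static void packOffset(const float pos[3], const float nor[3], const float tang[3], uint32_t out[4]) {
    float len = std::max({ std::fabs(pos[0]), std::fabs(pos[1]), std::fabs(pos[2]) });
    float rcp = (len > 0.0f) ? (1.0f / len) : 1.0f;   // zero offsets keep len = 1, pos = 0
    if (len == 0.0f) { len = 1.0f; }

    std::memcpy(&out[0], &len, sizeof(float));        // same as glm::floatBitsToUint(len)
    out[1] = packSnorm10(pos[0] * rcp) | (packSnorm10(pos[1] * rcp) << 10) | (packSnorm10(pos[2] * rcp) << 20);
    out[2] = packSnorm10(nor[0]) | (packSnorm10(nor[1]) << 10) | (packSnorm10(nor[2]) << 20);
    out[3] = packSnorm10(tang[0]) | (packSnorm10(tang[1]) << 10) | (packSnorm10(tang[2]) << 20);
}

int main() {
    float pos[3] = { 0.5f, -1.0f, 0.25f };
    float nor[3] = { 0.0f, 0.01f, 0.0f };
    float tang[3] = { 0.0f, 0.0f, 0.0f };
    uint32_t packed[4];
    packOffset(pos, nor, tang, packed);
    std::printf("%08x %08x %08x %08x\n", packed[0], packed[1], packed[2], packed[3]);
    return 0;
}

Packing this way shrinks each offset from 36 bytes (9 floats) to the 16 bytes the GPU consumes, which is exactly what the two static_asserts in the dispatcher encode.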
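
The dispatcher added in patch 5 (and repeated in the unit test) calls cpuSupportsAVX2() from CPUDetect.h, whose implementation is not part of this series. As a rough, assumed sketch of what such a runtime check typically involves (this is not hifi's actual CPUDetect.h, and sketchCpuSupportsAVX2 is a hypothetical name), it must confirm both the CPU feature bit and that the OS preserves the wider register state:

// Assumed sketch of a cpuSupportsAVX2()-style runtime check; not hifi's CPUDetect.h.
#include <cstdint>

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>

static bool sketchCpuSupportsAVX2() {
    int info[4];
    __cpuid(info, 0);
    if (info[0] < 7) {
        return false;                               // CPUID leaf 7 not available
    }
    __cpuid(info, 1);
    bool osxsave = (info[2] & (1 << 27)) != 0;      // OS has enabled XSAVE
    bool avx = (info[2] & (1 << 28)) != 0;          // CPU supports AVX
    if (!osxsave || !avx) {
        return false;
    }
    uint64_t xcr0 = _xgetbv(0);                     // XCR0: OS-enabled register state
    if ((xcr0 & 0x6) != 0x6) {                      // bits 1,2: XMM and YMM state saved
        return false;
    }
    __cpuidex(info, 7, 0);
    return (info[1] & (1 << 5)) != 0;               // AVX2 feature bit (leaf 7, EBX bit 5)
}

#elif (defined(__GNUC__) || defined(__clang__)) && (defined(__i386__) || defined(__x86_64__))

static bool sketchCpuSupportsAVX2() {
    // gcc/clang builtin checks the CPU feature bit; production code should also
    // verify OS support for AVX state, as the MSVC branch above does explicitly.
    return __builtin_cpu_supports("avx2") != 0;
}

#else

static bool sketchCpuSupportsAVX2() {
    return false;                                   // non-x86 targets: always use the reference path
}

#endif

With a check like this cached in a static bool, as the patch does with _cpuSupportsAVX2, selecting the AVX2 path costs a single branch per call.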