Merge pull request #15825 from kencooke/render-blendshape-optim-avx2

Blendshape CPU optimization
This commit is contained in:
Bradley Austin Davis 2019-06-25 09:41:13 -07:00 committed by GitHub
commit ca797e30e4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 511 additions and 22 deletions

View file

@ -1691,10 +1691,7 @@ public:
}
};
using packBlendshapeOffsetTo = void(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked);
void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) {
static void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) {
float len = glm::compMax(glm::abs(unpacked.positionOffset));
glm::vec3 normalizedPos(unpacked.positionOffset);
if (len > 0.0f) {
@ -1711,6 +1708,37 @@ void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& pac
);
}
static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
for (int i = 0; i < size; ++i) {
packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
++unpacked;
++packed;
}
}
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
//
// Runtime CPU dispatch
//
#include <CPUDetect.h>
void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size);
static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
static bool _cpuSupportsAVX2 = cpuSupportsAVX2();
if (_cpuSupportsAVX2) {
static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match.");
static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't match.");
packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size);
} else {
packBlendshapeOffsets_ref(unpacked, packed, size);
}
}
#else // portable reference code
static auto& packBlendshapeOffsets = packBlendshapeOffsets_ref;
#endif
class Blender : public QRunnable {
public:
@ -1735,21 +1763,28 @@ Blender::Blender(ModelPointer model, HFMModel::ConstPointer hfmModel, int blendN
void Blender::run() {
DETAILED_PROFILE_RANGE_EX(simulation_animation, __FUNCTION__, 0xFFFF0000, 0, { { "url", _model->getURL().toString() } });
int numBlendshapeOffsets = 0; // number of offsets required for all meshes.
int maxBlendshapeOffsets = 0; // number of offsets in the largest mesh.
int numMeshes = 0; // number of meshes in this model.
for (auto meshIter = _hfmModel->meshes.cbegin(); meshIter != _hfmModel->meshes.cend(); ++meshIter) {
numMeshes++;
if (meshIter->blendshapes.isEmpty()) {
continue;
}
int numVertsInMesh = meshIter->vertices.size();
numBlendshapeOffsets += numVertsInMesh;
maxBlendshapeOffsets = std::max(maxBlendshapeOffsets, numVertsInMesh);
}
// all elements are default constructed to zero offsets.
QVector<BlendshapeOffset> packedBlendshapeOffsets(numBlendshapeOffsets);
QVector<BlendshapeOffsetUnpacked> unpackedBlendshapeOffsets(numBlendshapeOffsets);
// allocate the required size
// allocate the required sizes
QVector<int> blendedMeshSizes;
blendedMeshSizes.reserve(numMeshes);
QVector<BlendshapeOffset> packedBlendshapeOffsets;
packedBlendshapeOffsets.reserve(numBlendshapeOffsets);
QVector<BlendshapeOffsetUnpacked> unpackedBlendshapeOffsets;
unpackedBlendshapeOffsets.reserve(maxBlendshapeOffsets); // reuse for all meshes
int offset = 0;
for (auto meshIter = _hfmModel->meshes.cbegin(); meshIter != _hfmModel->meshes.cend(); ++meshIter) {
if (meshIter->blendshapes.isEmpty()) {
@ -1759,6 +1794,9 @@ void Blender::run() {
int numVertsInMesh = meshIter->vertices.size();
blendedMeshSizes.push_back(numVertsInMesh);
// initialize offsets to zero
memset(unpackedBlendshapeOffsets.data(), 0, numVertsInMesh * sizeof(BlendshapeOffsetUnpacked));
// for each blendshape in this mesh, accumulate the offsets into unpackedBlendshapeOffsets.
const float NORMAL_COEFFICIENT_SCALE = 0.01f;
for (int i = 0, n = qMin(_blendshapeCoefficients.size(), meshIter->blendshapes.size()); i < n; i++) {
@ -1773,7 +1811,7 @@ void Blender::run() {
for (int j = 0; j < blendshape.indices.size(); ++j) {
int index = blendshape.indices.at(j);
auto& currentBlendshapeOffset = unpackedBlendshapeOffsets[offset + index];
auto& currentBlendshapeOffset = unpackedBlendshapeOffsets[index];
currentBlendshapeOffset.positionOffset += blendshape.vertices.at(j) * vertexCoefficient;
currentBlendshapeOffset.normalOffset += blendshape.normals.at(j) * normalCoefficient;
if (j < blendshape.tangents.size()) {
@ -1781,20 +1819,15 @@ void Blender::run() {
}
}
}
// convert unpackedBlendshapeOffsets into packedBlendshapeOffsets for the gpu.
auto unpacked = unpackedBlendshapeOffsets.data();
auto packed = packedBlendshapeOffsets.data() + offset;
packBlendshapeOffsets(unpacked, packed, numVertsInMesh);
offset += numVertsInMesh;
}
// convert unpackedBlendshapeOffsets into packedBlendshapeOffsets for the gpu.
// FIXME it feels like we could be more effectively using SIMD here
{
auto unpacked = unpackedBlendshapeOffsets.data();
auto packed = packedBlendshapeOffsets.data();
for (int i = 0; i < unpackedBlendshapeOffsets.size(); ++i) {
packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
++unpacked;
++packed;
}
}
Q_ASSERT(offset == numBlendshapeOffsets);
// post the result to the ModelBlender, which will dispatch to the model if still alive
QMetaObject::invokeMethod(DependencyManager::get<ModelBlender>().data(), "setBlendedVertices",

View file

@ -0,0 +1,285 @@
//
// BlendshapePacking_avx2.cpp
//
// Created by Ken Cooke on 6/22/19.
// Copyright 2019 High Fidelity, Inc.
//
// Distributed under the Apache License, Version 2.0.
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#ifdef __AVX2__
#include <stdint.h>
#include <immintrin.h>
void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size) {
int i = 0;
for (; i < size - 7; i += 8) { // blocks of 8
//
// deinterleave (8x9 to 9x8 matrix transpose)
//
__m256 s0 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+0][0])), _mm_load_ps(&unpacked[i+4][0]), 1);
__m256 s1 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+1][0])), _mm_load_ps(&unpacked[i+5][0]), 1);
__m256 s2 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+2][0])), _mm_load_ps(&unpacked[i+6][0]), 1);
__m256 s3 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+3][0])), _mm_load_ps(&unpacked[i+7][0]), 1);
__m256 s4 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+0][4])), _mm_load_ps(&unpacked[i+4][4]), 1);
__m256 s5 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+1][4])), _mm_load_ps(&unpacked[i+5][4]), 1);
__m256 s6 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+2][4])), _mm_load_ps(&unpacked[i+6][4]), 1);
__m256 s7 = _mm256_insertf128_ps(_mm256_castps128_ps256(_mm_load_ps(&unpacked[i+3][4])), _mm_load_ps(&unpacked[i+7][4]), 1);
__m256 t0 = _mm256_unpacklo_ps(s0, s1);
__m256 t1 = _mm256_unpackhi_ps(s0, s1);
__m256 t2 = _mm256_unpacklo_ps(s2, s3);
__m256 t3 = _mm256_unpackhi_ps(s2, s3);
__m256 t4 = _mm256_unpacklo_ps(s4, s5);
__m256 t5 = _mm256_unpackhi_ps(s4, s5);
__m256 t6 = _mm256_unpacklo_ps(s6, s7);
__m256 t7 = _mm256_unpackhi_ps(s6, s7);
__m256 px = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
__m256 py = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
__m256 pz = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
__m256 nx = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));
__m256 ny = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0));
__m256 nz = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2));
__m256 tx = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0));
__m256 ty = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2));
__m256 tz = _mm256_i32gather_ps(unpacked[i+0], _mm256_setr_epi32(8,17,26,35,44,53,62,71), sizeof(float));
// abs(pos)
__m256 apx = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), px);
__m256 apy = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), py);
__m256 apz = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), pz);
// len = compMax(abs(pos))
__m256 len = _mm256_max_ps(_mm256_max_ps(apx, apy), apz);
// detect zeros
__m256 mask = _mm256_cmp_ps(len, _mm256_setzero_ps(), _CMP_EQ_OQ);
// rcp = 1.0f / len
__m256 rcp = _mm256_div_ps(_mm256_set1_ps(1.0f), len);
// replace +inf with 1.0f
rcp = _mm256_blendv_ps(rcp, _mm256_set1_ps(1.0f), mask);
len = _mm256_blendv_ps(len, _mm256_set1_ps(1.0f), mask);
// pos *= 1.0f / len
px = _mm256_mul_ps(px, rcp);
py = _mm256_mul_ps(py, rcp);
pz = _mm256_mul_ps(pz, rcp);
// clamp(vec, -1.0f, 1.0f)
px = _mm256_min_ps(_mm256_max_ps(px, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
py = _mm256_min_ps(_mm256_max_ps(py, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
pz = _mm256_min_ps(_mm256_max_ps(pz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
nx = _mm256_min_ps(_mm256_max_ps(nx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
ny = _mm256_min_ps(_mm256_max_ps(ny, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
nz = _mm256_min_ps(_mm256_max_ps(nz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
tx = _mm256_min_ps(_mm256_max_ps(tx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
ty = _mm256_min_ps(_mm256_max_ps(ty, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
tz = _mm256_min_ps(_mm256_max_ps(tz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
// vec *= 511.0f
px = _mm256_mul_ps(px, _mm256_set1_ps(511.0f));
py = _mm256_mul_ps(py, _mm256_set1_ps(511.0f));
pz = _mm256_mul_ps(pz, _mm256_set1_ps(511.0f));
nx = _mm256_mul_ps(nx, _mm256_set1_ps(511.0f));
ny = _mm256_mul_ps(ny, _mm256_set1_ps(511.0f));
nz = _mm256_mul_ps(nz, _mm256_set1_ps(511.0f));
tx = _mm256_mul_ps(tx, _mm256_set1_ps(511.0f));
ty = _mm256_mul_ps(ty, _mm256_set1_ps(511.0f));
tz = _mm256_mul_ps(tz, _mm256_set1_ps(511.0f));
// veci = lrint(vec) & 03ff
__m256i pxi = _mm256_and_si256(_mm256_cvtps_epi32(px), _mm256_set1_epi32(0x3ff));
__m256i pyi = _mm256_and_si256(_mm256_cvtps_epi32(py), _mm256_set1_epi32(0x3ff));
__m256i pzi = _mm256_and_si256(_mm256_cvtps_epi32(pz), _mm256_set1_epi32(0x3ff));
__m256i nxi = _mm256_and_si256(_mm256_cvtps_epi32(nx), _mm256_set1_epi32(0x3ff));
__m256i nyi = _mm256_and_si256(_mm256_cvtps_epi32(ny), _mm256_set1_epi32(0x3ff));
__m256i nzi = _mm256_and_si256(_mm256_cvtps_epi32(nz), _mm256_set1_epi32(0x3ff));
__m256i txi = _mm256_and_si256(_mm256_cvtps_epi32(tx), _mm256_set1_epi32(0x3ff));
__m256i tyi = _mm256_and_si256(_mm256_cvtps_epi32(ty), _mm256_set1_epi32(0x3ff));
__m256i tzi = _mm256_and_si256(_mm256_cvtps_epi32(tz), _mm256_set1_epi32(0x3ff));
// pack = (xi << 0) | (yi << 10) | (zi << 20);
__m256i li = _mm256_castps_si256(len); // length
__m256i pi = _mm256_or_si256(_mm256_or_si256(pxi, _mm256_slli_epi32(pyi, 10)), _mm256_slli_epi32(pzi, 20)); // position
__m256i ni = _mm256_or_si256(_mm256_or_si256(nxi, _mm256_slli_epi32(nyi, 10)), _mm256_slli_epi32(nzi, 20)); // normal
__m256i ti = _mm256_or_si256(_mm256_or_si256(txi, _mm256_slli_epi32(tyi, 10)), _mm256_slli_epi32(tzi, 20)); // tangent
//
// interleave (4x4 matrix transpose)
//
__m256i u0 = _mm256_unpacklo_epi32(li, pi);
__m256i u1 = _mm256_unpackhi_epi32(li, pi);
__m256i u2 = _mm256_unpacklo_epi32(ni, ti);
__m256i u3 = _mm256_unpackhi_epi32(ni, ti);
__m256i v0 = _mm256_unpacklo_epi64(u0, u2);
__m256i v1 = _mm256_unpackhi_epi64(u0, u2);
__m256i v2 = _mm256_unpacklo_epi64(u1, u3);
__m256i v3 = _mm256_unpackhi_epi64(u1, u3);
__m256i w0 = _mm256_permute2f128_si256(v0, v1, 0x20);
__m256i w1 = _mm256_permute2f128_si256(v2, v3, 0x20);
__m256i w2 = _mm256_permute2f128_si256(v0, v1, 0x31);
__m256i w3 = _mm256_permute2f128_si256(v2, v3, 0x31);
// store pack x 8
_mm256_storeu_si256((__m256i*)packed[i+0], w0);
_mm256_storeu_si256((__m256i*)packed[i+2], w1);
_mm256_storeu_si256((__m256i*)packed[i+4], w2);
_mm256_storeu_si256((__m256i*)packed[i+6], w3);
}
if (i < size) { // remainder
int rem = size - i;
//
// deinterleave (8x9 to 9x8 matrix transpose)
//
__m256 s0 = _mm256_setzero_ps();
__m256 s1 = _mm256_setzero_ps();
__m256 s2 = _mm256_setzero_ps();
__m256 s3 = _mm256_setzero_ps();
__m256 s4 = _mm256_setzero_ps();
__m256 s5 = _mm256_setzero_ps();
__m256 s6 = _mm256_setzero_ps();
__m256 s7 = _mm256_setzero_ps();
switch (rem) {
case 7: s6 = _mm256_loadu_ps(unpacked[i+6]);
case 6: s5 = _mm256_loadu_ps(unpacked[i+5]);
case 5: s4 = _mm256_loadu_ps(unpacked[i+4]);
case 4: s3 = _mm256_loadu_ps(unpacked[i+3]);
case 3: s2 = _mm256_loadu_ps(unpacked[i+2]);
case 2: s1 = _mm256_loadu_ps(unpacked[i+1]);
case 1: s0 = _mm256_loadu_ps(unpacked[i+0]);
}
__m256 t0 = _mm256_unpacklo_ps(s0, s1);
__m256 t1 = _mm256_unpackhi_ps(s0, s1);
__m256 t2 = _mm256_unpacklo_ps(s2, s3);
__m256 t3 = _mm256_unpackhi_ps(s2, s3);
__m256 t4 = _mm256_unpacklo_ps(s4, s5);
__m256 t5 = _mm256_unpackhi_ps(s4, s5);
__m256 t6 = _mm256_unpacklo_ps(s6, s7);
__m256 t7 = _mm256_unpackhi_ps(s6, s7);
s0 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(1,0,1,0));
s1 = _mm256_shuffle_ps(t0, t2, _MM_SHUFFLE(3,2,3,2));
s2 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(1,0,1,0));
s3 = _mm256_shuffle_ps(t1, t3, _MM_SHUFFLE(3,2,3,2));
s4 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(1,0,1,0));
s5 = _mm256_shuffle_ps(t4, t6, _MM_SHUFFLE(3,2,3,2));
s6 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(1,0,1,0));
s7 = _mm256_shuffle_ps(t5, t7, _MM_SHUFFLE(3,2,3,2));
__m256 px = _mm256_permute2f128_ps(s0, s4, 0x20);
__m256 py = _mm256_permute2f128_ps(s1, s5, 0x20);
__m256 pz = _mm256_permute2f128_ps(s2, s6, 0x20);
__m256 nx = _mm256_permute2f128_ps(s3, s7, 0x20);
__m256 ny = _mm256_permute2f128_ps(s0, s4, 0x31);
__m256 nz = _mm256_permute2f128_ps(s1, s5, 0x31);
__m256 tx = _mm256_permute2f128_ps(s2, s6, 0x31);
__m256 ty = _mm256_permute2f128_ps(s3, s7, 0x31);
__m256i loadmask = _mm256_cvtepi8_epi32(_mm_cvtsi64_si128(0xffffffffffffffffULL >> (64 - 8 * rem)));
__m256 tz = _mm256_mask_i32gather_ps(_mm256_setzero_ps(), unpacked[i+0], _mm256_setr_epi32(8,17,26,35,44,53,62,71),
_mm256_castsi256_ps(loadmask), sizeof(float));
// abs(pos)
__m256 apx = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), px);
__m256 apy = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), py);
__m256 apz = _mm256_andnot_ps(_mm256_set1_ps(-0.0f), pz);
// len = compMax(abs(pos))
__m256 len = _mm256_max_ps(_mm256_max_ps(apx, apy), apz);
// detect zeros
__m256 mask = _mm256_cmp_ps(len, _mm256_setzero_ps(), _CMP_EQ_OQ);
// rcp = 1.0f / len
__m256 rcp = _mm256_div_ps(_mm256_set1_ps(1.0f), len);
// replace +inf with 1.0f
rcp = _mm256_blendv_ps(rcp, _mm256_set1_ps(1.0f), mask);
len = _mm256_blendv_ps(len, _mm256_set1_ps(1.0f), mask);
// pos *= 1.0f / len
px = _mm256_mul_ps(px, rcp);
py = _mm256_mul_ps(py, rcp);
pz = _mm256_mul_ps(pz, rcp);
// clamp(vec, -1.0f, 1.0f)
px = _mm256_min_ps(_mm256_max_ps(px, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
py = _mm256_min_ps(_mm256_max_ps(py, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
pz = _mm256_min_ps(_mm256_max_ps(pz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
nx = _mm256_min_ps(_mm256_max_ps(nx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
ny = _mm256_min_ps(_mm256_max_ps(ny, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
nz = _mm256_min_ps(_mm256_max_ps(nz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
tx = _mm256_min_ps(_mm256_max_ps(tx, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
ty = _mm256_min_ps(_mm256_max_ps(ty, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
tz = _mm256_min_ps(_mm256_max_ps(tz, _mm256_set1_ps(-1.0f)), _mm256_set1_ps(1.0f));
// vec *= 511.0f
px = _mm256_mul_ps(px, _mm256_set1_ps(511.0f));
py = _mm256_mul_ps(py, _mm256_set1_ps(511.0f));
pz = _mm256_mul_ps(pz, _mm256_set1_ps(511.0f));
nx = _mm256_mul_ps(nx, _mm256_set1_ps(511.0f));
ny = _mm256_mul_ps(ny, _mm256_set1_ps(511.0f));
nz = _mm256_mul_ps(nz, _mm256_set1_ps(511.0f));
tx = _mm256_mul_ps(tx, _mm256_set1_ps(511.0f));
ty = _mm256_mul_ps(ty, _mm256_set1_ps(511.0f));
tz = _mm256_mul_ps(tz, _mm256_set1_ps(511.0f));
// veci = lrint(vec) & 03ff
__m256i pxi = _mm256_and_si256(_mm256_cvtps_epi32(px), _mm256_set1_epi32(0x3ff));
__m256i pyi = _mm256_and_si256(_mm256_cvtps_epi32(py), _mm256_set1_epi32(0x3ff));
__m256i pzi = _mm256_and_si256(_mm256_cvtps_epi32(pz), _mm256_set1_epi32(0x3ff));
__m256i nxi = _mm256_and_si256(_mm256_cvtps_epi32(nx), _mm256_set1_epi32(0x3ff));
__m256i nyi = _mm256_and_si256(_mm256_cvtps_epi32(ny), _mm256_set1_epi32(0x3ff));
__m256i nzi = _mm256_and_si256(_mm256_cvtps_epi32(nz), _mm256_set1_epi32(0x3ff));
__m256i txi = _mm256_and_si256(_mm256_cvtps_epi32(tx), _mm256_set1_epi32(0x3ff));
__m256i tyi = _mm256_and_si256(_mm256_cvtps_epi32(ty), _mm256_set1_epi32(0x3ff));
__m256i tzi = _mm256_and_si256(_mm256_cvtps_epi32(tz), _mm256_set1_epi32(0x3ff));
// pack = (xi << 0) | (yi << 10) | (zi << 20);
__m256i li = _mm256_castps_si256(len); // length
__m256i pi = _mm256_or_si256(_mm256_or_si256(pxi, _mm256_slli_epi32(pyi, 10)), _mm256_slli_epi32(pzi, 20)); // position
__m256i ni = _mm256_or_si256(_mm256_or_si256(nxi, _mm256_slli_epi32(nyi, 10)), _mm256_slli_epi32(nzi, 20)); // normal
__m256i ti = _mm256_or_si256(_mm256_or_si256(txi, _mm256_slli_epi32(tyi, 10)), _mm256_slli_epi32(tzi, 20)); // tangent
//
// interleave (4x4 matrix transpose)
//
__m256i u0 = _mm256_unpacklo_epi32(li, pi);
__m256i u1 = _mm256_unpackhi_epi32(li, pi);
__m256i u2 = _mm256_unpacklo_epi32(ni, ti);
__m256i u3 = _mm256_unpackhi_epi32(ni, ti);
__m256i v0 = _mm256_unpacklo_epi64(u0, u2);
__m256i v1 = _mm256_unpackhi_epi64(u0, u2);
__m256i v2 = _mm256_unpacklo_epi64(u1, u3);
__m256i v3 = _mm256_unpackhi_epi64(u1, u3);
// store pack x 8
switch (rem) {
case 7: _mm_storeu_si128((__m128i*)packed[i+6], _mm256_extractf128_si256(v2, 1));
case 6: _mm_storeu_si128((__m128i*)packed[i+5], _mm256_extractf128_si256(v1, 1));
case 5: _mm_storeu_si128((__m128i*)packed[i+4], _mm256_extractf128_si256(v0, 1));
case 4: _mm_storeu_si128((__m128i*)packed[i+3], _mm256_castsi256_si128(v3));
case 3: _mm_storeu_si128((__m128i*)packed[i+2], _mm256_castsi256_si128(v2));
case 2: _mm_storeu_si128((__m128i*)packed[i+1], _mm256_castsi256_si128(v1));
case 1: _mm_storeu_si128((__m128i*)packed[i+0], _mm256_castsi256_si128(v0));
}
}
_mm256_zeroupper();
}
#endif

View file

@ -0,0 +1,148 @@
//
// BlendshapePackingTests.cpp
// tests/shared/src
//
// Created by Ken Cooke on 6/24/19.
// Copyright 2019 High Fidelity, Inc.
//
// Distributed under the Apache License, Version 2.0.
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#include "BlendshapePackingTests.h"
#include <vector>
#include <test-utils/QTestExtensions.h>
#include <GLMHelpers.h>
#include <glm/gtc/random.hpp>
struct BlendshapeOffsetUnpacked {
glm::vec3 positionOffset;
glm::vec3 normalOffset;
glm::vec3 tangentOffset;
};
struct BlendshapeOffsetPacked {
glm::uvec4 packedPosNorTan;
};
QTEST_MAIN(BlendshapePackingTests)
static void packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10(glm::uvec4& packed, const BlendshapeOffsetUnpacked& unpacked) {
float len = glm::compMax(glm::abs(unpacked.positionOffset));
glm::vec3 normalizedPos(unpacked.positionOffset);
if (len > 0.0f) {
normalizedPos /= len;
} else {
len = 1.0f;
}
packed = glm::uvec4(
glm::floatBitsToUint(len),
glm_packSnorm3x10_1x2(glm::vec4(normalizedPos, 0.0f)),
glm_packSnorm3x10_1x2(glm::vec4(unpacked.normalOffset, 0.0f)),
glm_packSnorm3x10_1x2(glm::vec4(unpacked.tangentOffset, 0.0f))
);
}
static void packBlendshapeOffsets_ref(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
for (int i = 0; i < size; ++i) {
packBlendshapeOffsetTo_Pos_F32_3xSN10_Nor_3xSN10_Tan_3xSN10((*packed).packedPosNorTan, (*unpacked));
++unpacked;
++packed;
}
}
#if defined(_M_IX86) || defined(_M_X64) || defined(__i386__) || defined(__x86_64__)
//
// Runtime CPU dispatch
//
#include <CPUDetect.h>
void packBlendshapeOffsets_AVX2(float (*unpacked)[9], uint32_t (*packed)[4], int size);
static void packBlendshapeOffsets(BlendshapeOffsetUnpacked* unpacked, BlendshapeOffsetPacked* packed, int size) {
static bool _cpuSupportsAVX2 = cpuSupportsAVX2();
if (_cpuSupportsAVX2) {
static_assert(sizeof(BlendshapeOffsetUnpacked) == 9 * sizeof(float), "struct BlendshapeOffsetUnpacked size doesn't match.");
static_assert(sizeof(BlendshapeOffsetPacked) == 4 * sizeof(uint32_t), "struct BlendshapeOffsetPacked size doesn't match.");
packBlendshapeOffsets_AVX2((float(*)[9])unpacked, (uint32_t(*)[4])packed, size);
} else {
packBlendshapeOffsets_ref(unpacked, packed, size);
}
}
#else // portable reference code
static auto& packBlendshapeOffsets = packBlendshapeOffsets_ref;
#endif
void comparePacked(BlendshapeOffsetPacked& ref, BlendshapeOffsetPacked& tst) {
union i10i10i10i2 {
struct {
int x : 10;
int y : 10;
int z : 10;
int w : 2;
} data;
uint32_t pack;
} Ref[4], Tst[4];
for (int i = 0; i < 4; i++) {
Ref[i].pack = ref.packedPosNorTan[i];
Tst[i].pack = tst.packedPosNorTan[i];
}
// allow 1 ULP due to rounding differences
QCOMPARE_WITH_ABS_ERROR(Tst[0].pack, Ref[0].pack, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[1].data.x, Ref[1].data.x, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[1].data.y, Ref[1].data.y, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[1].data.z, Ref[1].data.z, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[2].data.x, Ref[2].data.x, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[2].data.y, Ref[2].data.y, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[2].data.z, Ref[2].data.z, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[3].data.x, Ref[3].data.x, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[3].data.y, Ref[3].data.y, 1);
QCOMPARE_WITH_ABS_ERROR(Tst[3].data.z, Ref[3].data.z, 1);
}
void BlendshapePackingTests::testAVX2() {
for (int numBlendshapeOffsets = 0; numBlendshapeOffsets < 4096; ++numBlendshapeOffsets) {
std::vector<BlendshapeOffsetUnpacked> unpackedBlendshapeOffsets(numBlendshapeOffsets);
std::vector<BlendshapeOffsetPacked> packedBlendshapeOffsets1(numBlendshapeOffsets);
std::vector<BlendshapeOffsetPacked> packedBlendshapeOffsets2(numBlendshapeOffsets);
// init test data
if (numBlendshapeOffsets > 0) {
unpackedBlendshapeOffsets[0] = {
glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 0.0f, 0.0f), glm::vec3(0.0f, 0.0f, 0.0f),
};
}
for (int i = 1; i < numBlendshapeOffsets; ++i) {
unpackedBlendshapeOffsets[i] = {
glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)),
glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)),
glm::linearRand(glm::vec3(-2.0f, -2.0f, -2.0f), glm::vec3(2.0f, 2.0f, 2.0f)),
};
}
// ref version
packBlendshapeOffsets_ref(unpackedBlendshapeOffsets.data(), packedBlendshapeOffsets1.data(), numBlendshapeOffsets);
// AVX2 version, if supported by CPU
packBlendshapeOffsets(unpackedBlendshapeOffsets.data(), packedBlendshapeOffsets2.data(), numBlendshapeOffsets);
// verify
for (int i = 0; i < numBlendshapeOffsets; ++i) {
auto ref = packedBlendshapeOffsets1.at(i);
auto tst = packedBlendshapeOffsets2.at(i);
comparePacked(ref, tst);
}
}
}

View file

@ -0,0 +1,23 @@
//
// BlendshapePackingTests.h
// tests/shared/src
//
// Created by Ken Cooke on 6/24/19.
// Copyright 2019 High Fidelity, Inc.
//
// Distributed under the Apache License, Version 2.0.
// See the accompanying file LICENSE or http://www.apache.org/licenses/LICENSE-2.0.html
//
#ifndef hifi_BlendshapePackingTests_h
#define hifi_BlendshapePackingTests_h
#include <QtTest/QtTest>
class BlendshapePackingTests : public QObject {
Q_OBJECT
private slots:
void testAVX2();
};
#endif // hifi_BlendshapePackingTests_h