initial optimizations to AudioMixer with mmx additions

2025-04-23 12:14:00 +02:00 · 2014-03-14 17:50:57 -07:00 · 2014-03-14 17:50:57 -07:00 · 72449fdb6a
commit 72449fdb6a
parent 1303c19498
4 changed files with 64 additions and 30 deletions
--- a/assignment-client/src/audio/AudioMixer.cpp
+++ b/assignment-client/src/audio/AudioMixer.cpp
@ -6,6 +6,7 @@
 //  Copyright (c) 2013 HighFidelity, Inc. All rights reserved.
 //

+#include <mmintrin.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <fstream>
@ -157,27 +158,59 @@ void AudioMixer::addBufferToMixForListeningNodeWithBuffer(PositionalAudioRingBuf
    // if the bearing relative angle to source is > 0 then the delayed channel is the right one
    int delayedChannelOffset = (bearingRelativeAngleToSource > 0.0f) ? 1 : 0;
    int goodChannelOffset = delayedChannelOffset == 0 ? 1 : 0;
+    
+    const int16_t* nextOutputStart = bufferToAdd->getNextOutput();
+    const int16_t* delayNextOutputStart = nextOutputStart - numSamplesDelay;
+    const int16_t* bufferStart = bufferToAdd->getBuffer();
+    int ringBufferSampleCapacity = bufferToAdd->getSampleCapacity();
+    
+    if (delayNextOutputStart < bufferStart) {
+        delayNextOutputStart = bufferStart + ringBufferSampleCapacity - numSamplesDelay;
+    }

-    for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 2) {
-        if ((s / 2) < numSamplesDelay) {
-            // pull the earlier sample for the delayed channel
-            int earlierSample = (*bufferToAdd)[(s / 2) - numSamplesDelay] * attenuationCoefficient * weakChannelAmplitudeRatio;
-            _clientSamples[s + delayedChannelOffset] = glm::clamp(_clientSamples[s + delayedChannelOffset] + earlierSample,
-                                                                    MIN_SAMPLE_VALUE, MAX_SAMPLE_VALUE);
-        }
-
-        // pull the current sample for the good channel
-        int16_t currentSample = (*bufferToAdd)[s / 2] * attenuationCoefficient;
-        _clientSamples[s + goodChannelOffset] = glm::clamp(_clientSamples[s + goodChannelOffset] + currentSample,
-                                                           MIN_SAMPLE_VALUE, MAX_SAMPLE_VALUE);
-
-        if ((s / 2) + numSamplesDelay < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
-            // place the current sample at the right spot in the delayed channel
-            int16_t clampedSample = glm::clamp((int) (_clientSamples[s + (numSamplesDelay * 2) + delayedChannelOffset]
-                                               + (currentSample * weakChannelAmplitudeRatio)),
-                                               MIN_SAMPLE_VALUE, MAX_SAMPLE_VALUE);
-            _clientSamples[s + (numSamplesDelay * 2) + delayedChannelOffset] = clampedSample;
+    int16_t correctMixSample[2], correctBufferSample[2], delayMixSample[2], delayBufferSample[2];
+    int delayedChannelIndex[2];
+    
+    float attenuationAndWeakChannelRatio = attenuationCoefficient * weakChannelAmplitudeRatio;
+    
+    for (int s = 0; s < NETWORK_BUFFER_LENGTH_SAMPLES_STEREO; s += 4) {
+        
+        // setup the int16_t variables for the two sample sets
+        correctBufferSample[0] = nextOutputStart[s / 2] * attenuationCoefficient;
+        correctBufferSample[1] = nextOutputStart[(s / 2) + 1] * attenuationCoefficient;
+        correctMixSample[0] = _clientSamples[s + goodChannelOffset];
+        correctMixSample[1] = _clientSamples[s + goodChannelOffset + 2];
+        
+        for (int i = 0; i < 2; ++i) {
+            if ((s / 2) + numSamplesDelay + i < NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL) {
+                // for the delayed channel we fill the range (n + numSamplesDelay) to NETWORK_BUFFER_LENGTH_SAMPLES_STEREO first
+                delayedChannelIndex[i] = s + (numSamplesDelay * 2) + (i * 2) + delayedChannelOffset;
+                delayBufferSample[i] = correctBufferSample[i] * weakChannelAmplitudeRatio;
+            } else {
+                // now that the right-most range has been filled, go back and fill in the first numSamplesDelay samples at the beginning
+                int samplesBack = (s / 2) - NETWORK_BUFFER_LENGTH_SAMPLES_PER_CHANNEL + i;
+                
+                delayBufferSample[i] = delayNextOutputStart[numSamplesDelay + samplesBack] * attenuationAndWeakChannelRatio;
+                delayedChannelIndex[i] = (numSamplesDelay + samplesBack) * 2 + delayedChannelOffset;
+            }
        }
+        
+        delayMixSample[0] = _clientSamples[delayedChannelIndex[0]];
+        delayMixSample[1] = _clientSamples[delayedChannelIndex[1]];
+        
+        __m64 bufferSamples = _mm_set_pi16(correctMixSample[0], correctMixSample[1], delayMixSample[0], delayMixSample[1]);
+        __m64 addedSamples = _mm_set_pi16(correctBufferSample[0], correctBufferSample[1],
+                                         delayBufferSample[0], delayBufferSample[1]);
+        
+        // perform the MMX add (with saturation) of two correct and delayed samples
+        __m64 mmxResult = _mm_adds_pi16(bufferSamples, addedSamples);
+        int16_t* shortResults = reinterpret_cast<int16_t*>(&mmxResult);
+        
+        // write the results of the MMX arithmetic back into _clientSamples
+        _clientSamples[s + goodChannelOffset] = shortResults[3];
+        _clientSamples[s + goodChannelOffset + 2] = shortResults[2];
+        _clientSamples[delayedChannelIndex[0]] = shortResults[1];
+        _clientSamples[delayedChannelIndex[1]] = shortResults[0];
    }
 }

--- a/interface/interface_en.ts
+++ b/interface/interface_en.ts
@ -4,22 +4,22 @@
 <context>
    <name>Application</name>
    <message>
-        <location filename="src/Application.cpp" line="1353"/>
+        <location filename="src/Application.cpp" line="1354"/>
        <source>Export Voxels</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="src/Application.cpp" line="1354"/>
+        <location filename="src/Application.cpp" line="1355"/>
        <source>Sparse Voxel Octree Files (*.svo)</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="src/Application.cpp" line="3557"/>
+        <location filename="src/Application.cpp" line="3561"/>
        <source>Open Script</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="src/Application.cpp" line="3558"/>
+        <location filename="src/Application.cpp" line="3562"/>
        <source>JavaScript Files (*.js)</source>
        <translation type="unfinished"></translation>
    </message>
@ -113,18 +113,18 @@
 <context>
    <name>Menu</name>
    <message>
-        <location filename="src/Menu.cpp" line="422"/>
+        <location filename="src/Menu.cpp" line="424"/>
        <source>Open .ini config file</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="src/Menu.cpp" line="424"/>
-        <location filename="src/Menu.cpp" line="436"/>
+        <location filename="src/Menu.cpp" line="426"/>
+        <location filename="src/Menu.cpp" line="438"/>
        <source>Text files (*.ini)</source>
        <translation type="unfinished"></translation>
    </message>
    <message>
-        <location filename="src/Menu.cpp" line="434"/>
+        <location filename="src/Menu.cpp" line="436"/>
        <source>Save .ini config file</source>
        <translation type="unfinished"></translation>
    </message>
--- a/libraries/audio/src/AudioRingBuffer.cpp
+++ b/libraries/audio/src/AudioRingBuffer.cpp
@ -121,9 +121,6 @@ qint64 AudioRingBuffer::writeData(const char* data, qint64 maxSize) {
 }

 int16_t& AudioRingBuffer::operator[](const int index) {
-    // make sure this is a valid index
-    assert(index > -_sampleCapacity && index < _sampleCapacity);
-
    return *shiftedPositionAccomodatingWrap(_nextOutput, index);
 }

--- a/libraries/audio/src/AudioRingBuffer.h
+++ b/libraries/audio/src/AudioRingBuffer.h
@ -45,6 +45,10 @@ public:
    int getSampleCapacity() const { return _sampleCapacity; }
    
    int parseData(const QByteArray& packet);
+    
+    // assume callers using this will never wrap around the end
+    const int16_t* getNextOutput() { return _nextOutput; }
+    const int16_t* getBuffer() { return _buffer; }

    qint64 readSamples(int16_t* destination, qint64 maxSamples);
    qint64 writeSamples(const int16_t* source, qint64 maxSamples);