// // Audio.cpp // interface // // Created by Stephen Birarda on 1/22/13. // Copyright (c) 2013 High Fidelity, Inc. All rights reserved. // #ifndef _WIN32 #include #include #include #include #include #include #include #include #include #include #include #include #include "Application.h" #include "Audio.h" #include "Util.h" #include "Log.h" #define VISUALIZE_ECHO_CANCELLATION static const int NUM_AUDIO_CHANNELS = 2; static const int PACKET_LENGTH_BYTES = 1024; static const int PACKET_LENGTH_BYTES_PER_CHANNEL = PACKET_LENGTH_BYTES / 2; static const int PACKET_LENGTH_SAMPLES = PACKET_LENGTH_BYTES / sizeof(int16_t); static const int PACKET_LENGTH_SAMPLES_PER_CHANNEL = PACKET_LENGTH_SAMPLES / 2; static const int PHASE_DELAY_AT_90 = 20; static const float AMPLITUDE_RATIO_AT_90 = 0.5; static const int MIN_FLANGE_EFFECT_THRESHOLD = 600; static const int MAX_FLANGE_EFFECT_THRESHOLD = 1500; static const float FLANGE_BASE_RATE = 4; static const float MAX_FLANGE_SAMPLE_WEIGHT = 0.50; static const float MIN_FLANGE_INTENSITY = 0.25; static const float JITTER_BUFFER_LENGTH_MSECS = 12; static const short JITTER_BUFFER_SAMPLES = JITTER_BUFFER_LENGTH_MSECS * NUM_AUDIO_CHANNELS * (SAMPLE_RATE / 1000.0); static const float AUDIO_CALLBACK_MSECS = (float)BUFFER_LENGTH_SAMPLES_PER_CHANNEL / (float)SAMPLE_RATE * 1000.0; static const int AGENT_LOOPBACK_MODIFIER = 307; // Speex preprocessor and echo canceller adaption static const int AEC_N_CHANNELS_MIC = 1; // Number of microphone channels static const int AEC_N_CHANNELS_PLAY = 2; // Number of speaker channels static const int AEC_FILTER_LENGTH = BUFFER_LENGTH_SAMPLES_PER_CHANNEL * 20; // Width of the filter static const int AEC_BUFFERED_FRAMES = 6; // Maximum number of frames to buffer static const int AEC_BUFFERED_SAMPLES_PER_CHANNEL = BUFFER_LENGTH_SAMPLES_PER_CHANNEL * AEC_BUFFERED_FRAMES; static const int AEC_BUFFERED_SAMPLES = AEC_BUFFERED_SAMPLES_PER_CHANNEL * AEC_N_CHANNELS_PLAY; static const int AEC_TMP_BUFFER_SIZE = (AEC_N_CHANNELS_MIC + // Temporary space for processing a AEC_N_CHANNELS_PLAY) * BUFFER_LENGTH_SAMPLES_PER_CHANNEL; // single frame // Speex preprocessor and echo canceller configuration static const int AEC_NOISE_REDUCTION = -80; // Noise reduction (important) static const int AEC_RESIDUAL_ECHO_REDUCTION = -60; // Residual echo reduction static const int AEC_RESIDUAL_ECHO_REDUCTION_ACTIVE = -45; // ~on active side static const bool AEC_USE_AGC = true; // Automatic gain control static const int AEC_AGC_MAX_GAIN = -30; // Gain in db static const int AEC_AGC_TARGET_LEVEL = 9000; // Target reference level static const int AEC_AGC_MAX_INC = 6; // Max increase in db/s static const int AEC_AGC_MAX_DEC = 200; // Max decrease in db/s static const bool AEC_USE_VAD = false; // Voice activity determination // Ping test configuration static const float PING_PITCH = 16.f; // Ping wavelength, # samples / radian static const float PING_VOLUME = 32000.f; // Ping peak amplitude static const int PING_MIN_AMPLI = 225; // Minimum amplitude static const int PING_MAX_PERIOD_DIFFERENCE = 15; // Maximum # samples from expected period static const int PING_PERIOD = int(Radians::twicePi() * PING_PITCH); // Sine period based on the given pitch static const int PING_HALF_PERIOD = int(Radians::pi() * PING_PITCH); // Distance between extrema static const int PING_FRAMES_TO_RECORD = AEC_BUFFERED_FRAMES; // Frames to record for analysis static const int PING_SAMPLES_TO_ANALYZE = AEC_BUFFERED_SAMPLES_PER_CHANNEL; // Samples to analyze (reusing AEC buffer) static const int PING_BUFFER_OFFSET = BUFFER_LENGTH_SAMPLES_PER_CHANNEL - PING_PERIOD * 2.0f; // Signal start inline void Audio::performIO(int16_t* inputLeft, int16_t* outputLeft, int16_t* outputRight) { AgentList* agentList = AgentList::getInstance(); Application* interface = Application::getInstance(); Avatar* interfaceAvatar = interface->getAvatar(); eventuallyCancelEcho(inputLeft); // Add Procedural effects to input samples addProceduralSounds(inputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); if (agentList && inputLeft) { // Measure the loudness of the signal from the microphone and store in audio object float loudness = 0; for (int i = 0; i < BUFFER_LENGTH_SAMPLES_PER_CHANNEL; i++) { loudness += abs(inputLeft[i]); } loudness /= BUFFER_LENGTH_SAMPLES_PER_CHANNEL; _lastInputLoudness = loudness; // add input (@microphone) data to the scope #ifdef VISUALIZE_ECHO_CANCELLATION if (! isCancellingEcho()) { #endif _scope->addSamples(0, inputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); #ifdef VISUALIZE_ECHO_CANCELLATION } #endif Agent* audioMixer = agentList->soloAgentOfType(AGENT_TYPE_AUDIO_MIXER); if (audioMixer) { glm::vec3 headPosition = interfaceAvatar->getHeadJointPosition(); glm::quat headOrientation = interfaceAvatar->getHead().getOrientation(); int leadingBytes = sizeof(PACKET_HEADER_MICROPHONE_AUDIO_NO_ECHO) + sizeof(headPosition) + sizeof(headOrientation); // we need the amount of bytes in the buffer + 1 for type // + 12 for 3 floats for position + float for bearing + 1 attenuation byte unsigned char dataPacket[BUFFER_LENGTH_BYTES_PER_CHANNEL + leadingBytes]; dataPacket[0] = (Application::getInstance()->shouldEchoAudio()) ? PACKET_HEADER_MICROPHONE_AUDIO_WITH_ECHO : PACKET_HEADER_MICROPHONE_AUDIO_NO_ECHO; unsigned char *currentPacketPtr = dataPacket + 1; // memcpy the three float positions memcpy(currentPacketPtr, &headPosition, sizeof(headPosition)); currentPacketPtr += (sizeof(headPosition)); // memcpy our orientation memcpy(currentPacketPtr, &headOrientation, sizeof(headOrientation)); currentPacketPtr += sizeof(headOrientation); // copy the audio data to the last BUFFER_LENGTH_BYTES bytes of the data packet memcpy(currentPacketPtr, inputLeft, BUFFER_LENGTH_BYTES_PER_CHANNEL); agentList->getAgentSocket()->send(audioMixer->getActiveSocket(), dataPacket, BUFFER_LENGTH_BYTES_PER_CHANNEL + leadingBytes); interface->getBandwidthMeter()->outputStream(0) .updateValue(BUFFER_LENGTH_BYTES_PER_CHANNEL + leadingBytes); } } memset(outputLeft, 0, PACKET_LENGTH_BYTES_PER_CHANNEL); memset(outputRight, 0, PACKET_LENGTH_BYTES_PER_CHANNEL); AudioRingBuffer* ringBuffer = &_ringBuffer; // if we've been reset, and there isn't any new packets yet // just play some silence if (ringBuffer->getEndOfLastWrite()) { if (!ringBuffer->isStarted() && ringBuffer->diffLastWriteNextOutput() < PACKET_LENGTH_SAMPLES + JITTER_BUFFER_SAMPLES) { // printLog("Held back, buffer has %d of %d samples required.\n", // ringBuffer->diffLastWriteNextOutput(), PACKET_LENGTH_SAMPLES + JITTER_BUFFER_SAMPLES); } else if (ringBuffer->diffLastWriteNextOutput() < PACKET_LENGTH_SAMPLES) { ringBuffer->setStarted(false); _numStarves++; _packetsReceivedThisPlayback = 0; // printLog("Starved #%d\n", starve_counter); _wasStarved = 10; // Frames to render the indication that the system was starved. } else { if (!ringBuffer->isStarted()) { ringBuffer->setStarted(true); // printLog("starting playback %3.1f msecs delayed \n", (usecTimestampNow() - usecTimestamp(&firstPlaybackTimer))/1000.0); } else { // printLog("pushing buffer\n"); } // play whatever we have in the audio buffer // if we haven't fired off the flange effect, check if we should // TODO: lastMeasuredHeadYaw is now relative to body - check if this still works. int lastYawMeasured = fabsf(interfaceAvatar->getHeadYawRate()); if (!_samplesLeftForFlange && lastYawMeasured > MIN_FLANGE_EFFECT_THRESHOLD) { // we should flange for one second if ((_lastYawMeasuredMaximum = std::max(_lastYawMeasuredMaximum, lastYawMeasured)) != lastYawMeasured) { _lastYawMeasuredMaximum = std::min(_lastYawMeasuredMaximum, MIN_FLANGE_EFFECT_THRESHOLD); _samplesLeftForFlange = SAMPLE_RATE; _flangeIntensity = MIN_FLANGE_INTENSITY + ((_lastYawMeasuredMaximum - MIN_FLANGE_EFFECT_THRESHOLD) / (float)(MAX_FLANGE_EFFECT_THRESHOLD - MIN_FLANGE_EFFECT_THRESHOLD)) * (1 - MIN_FLANGE_INTENSITY); _flangeRate = FLANGE_BASE_RATE * _flangeIntensity; _flangeWeight = MAX_FLANGE_SAMPLE_WEIGHT * _flangeIntensity; } } for (int s = 0; s < PACKET_LENGTH_SAMPLES_PER_CHANNEL; s++) { int leftSample = ringBuffer->getNextOutput()[s]; int rightSample = ringBuffer->getNextOutput()[s + PACKET_LENGTH_SAMPLES_PER_CHANNEL]; if (_samplesLeftForFlange > 0) { float exponent = (SAMPLE_RATE - _samplesLeftForFlange - (SAMPLE_RATE / _flangeRate)) / (SAMPLE_RATE / _flangeRate); int sampleFlangeDelay = (SAMPLE_RATE / (1000 * _flangeIntensity)) * powf(2, exponent); if (_samplesLeftForFlange != SAMPLE_RATE || s >= (SAMPLE_RATE / 2000)) { // we have a delayed sample to add to this sample int16_t *flangeFrame = ringBuffer->getNextOutput(); int flangeIndex = s - sampleFlangeDelay; if (flangeIndex < 0) { // we need to grab the flange sample from earlier in the buffer flangeFrame = ringBuffer->getNextOutput() != ringBuffer->getBuffer() ? ringBuffer->getNextOutput() - PACKET_LENGTH_SAMPLES : ringBuffer->getNextOutput() + RING_BUFFER_LENGTH_SAMPLES - PACKET_LENGTH_SAMPLES; flangeIndex = PACKET_LENGTH_SAMPLES_PER_CHANNEL + (s - sampleFlangeDelay); } int16_t leftFlangeSample = flangeFrame[flangeIndex]; int16_t rightFlangeSample = flangeFrame[flangeIndex + PACKET_LENGTH_SAMPLES_PER_CHANNEL]; leftSample = (1 - _flangeWeight) * leftSample + (_flangeWeight * leftFlangeSample); rightSample = (1 - _flangeWeight) * rightSample + (_flangeWeight * rightFlangeSample); _samplesLeftForFlange--; if (_samplesLeftForFlange == 0) { _lastYawMeasuredMaximum = 0; } } } outputLeft[s] = leftSample; outputRight[s] = rightSample; } ringBuffer->setNextOutput(ringBuffer->getNextOutput() + PACKET_LENGTH_SAMPLES); if (ringBuffer->getNextOutput() == ringBuffer->getBuffer() + RING_BUFFER_LENGTH_SAMPLES) { ringBuffer->setNextOutput(ringBuffer->getBuffer()); } } } eventuallySendRecvPing(inputLeft, outputLeft, outputRight); eventuallyRecordEcho(outputLeft, outputRight); // add output (@speakers) data just written to the scope #ifdef VISUALIZE_ECHO_CANCELLATION if (! isCancellingEcho()) { _scope->setColor(2, 0x00ffff); #endif _scope->addSamples(1, outputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); _scope->addSamples(2, outputRight, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); #ifdef VISUALIZE_ECHO_CANCELLATION } #endif gettimeofday(&_lastCallbackTime, NULL); } // inputBuffer A pointer to an internal portaudio data buffer containing data read by portaudio. // outputBuffer A pointer to an internal portaudio data buffer to be read by the configured output device. // frames Number of frames that portaudio requests to be read/written. // timeInfo Portaudio time info. Currently unused. // statusFlags Portaudio status flags. Currently unused. // userData Pointer to supplied user data (in this case, a pointer to the parent Audio object int Audio::audioCallback (const void* inputBuffer, void* outputBuffer, unsigned long frames, const PaStreamCallbackTimeInfo *timeInfo, PaStreamCallbackFlags statusFlags, void* userData) { int16_t* inputLeft = static_cast(inputBuffer)[0]; int16_t* outputLeft = static_cast(outputBuffer)[0]; int16_t* outputRight = static_cast(outputBuffer)[1]; static_cast(userData)->performIO(inputLeft, outputLeft, outputRight); return paContinue; } static void outputPortAudioError(PaError error) { if (error != paNoError) { printLog("-- portaudio termination error --\n"); printLog("PortAudio error (%d): %s\n", error, Pa_GetErrorText(error)); } } Audio::Audio(Oscilloscope* scope) : _stream(NULL), _ringBuffer(true), _scope(scope), _averagedLatency(0.0), _measuredJitter(0), // _jitterBufferLengthMsecs(12.0), // _jitterBufferSamples(_jitterBufferLengthMsecs * // NUM_AUDIO_CHANNELS * (SAMPLE_RATE / 1000.0)), _wasStarved(0), _numStarves(0), _lastInputLoudness(0), _lastVelocity(0), _lastAcceleration(0), _totalPacketsReceived(0), _firstPlaybackTime(), _packetsReceivedThisPlayback(0), _isCancellingEcho(false), _echoDelay(0), _echoSamplesLeft(0l), _speexEchoState(NULL), _speexPreprocessState(NULL), _isSendingEchoPing(false), _pingAnalysisPending(false), _pingFramesToRecord(0), _samplesLeftForFlange(0), _lastYawMeasuredMaximum(0), _flangeIntensity(0.0f), _flangeRate(0.0f), _flangeWeight(0.0f) { outputPortAudioError(Pa_Initialize()); outputPortAudioError(Pa_OpenDefaultStream(&_stream, 2, 2, (paInt16 | paNonInterleaved), SAMPLE_RATE, BUFFER_LENGTH_SAMPLES_PER_CHANNEL, audioCallback, (void*) this)); if (! _stream) { return; } _echoSamplesLeft = new int16_t[AEC_BUFFERED_SAMPLES + AEC_TMP_BUFFER_SIZE]; if (! _echoSamplesLeft) { return; } memset(_echoSamplesLeft, 0, AEC_BUFFERED_SAMPLES * sizeof(int16_t)); _echoSamplesRight = _echoSamplesLeft + AEC_BUFFERED_SAMPLES_PER_CHANNEL; _speexTmpBuf = _echoSamplesRight + AEC_BUFFERED_SAMPLES_PER_CHANNEL; _speexPreprocessState = speex_preprocess_state_init(BUFFER_LENGTH_SAMPLES_PER_CHANNEL, SAMPLE_RATE); if (_speexPreprocessState) { _speexEchoState = speex_echo_state_init_mc(BUFFER_LENGTH_SAMPLES_PER_CHANNEL, AEC_FILTER_LENGTH, AEC_N_CHANNELS_MIC, AEC_N_CHANNELS_PLAY); if (_speexEchoState) { speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_ECHO_STATE, _speexEchoState); int tmp; speex_echo_ctl(_speexEchoState, SPEEX_ECHO_SET_SAMPLING_RATE, &(tmp = SAMPLE_RATE)); tmp = AEC_NOISE_REDUCTION; speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_NOISE_SUPPRESS, &tmp); tmp = AEC_RESIDUAL_ECHO_REDUCTION; speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_ECHO_SUPPRESS, &tmp); tmp = AEC_RESIDUAL_ECHO_REDUCTION_ACTIVE; speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE, &tmp); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_AGC, &(tmp = int(AEC_USE_AGC))); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_AGC_MAX_GAIN, &(tmp = AEC_AGC_MAX_GAIN)); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_AGC_TARGET, &(tmp = AEC_AGC_TARGET_LEVEL)); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_AGC_INCREMENT, &(tmp = AEC_AGC_MAX_INC)); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_AGC_DECREMENT, &(tmp = AEC_AGC_MAX_DEC)); speex_preprocess_ctl(_speexPreprocessState, SPEEX_PREPROCESS_SET_VAD, &(tmp = int(AEC_USE_VAD))); } else { speex_preprocess_state_destroy(_speexPreprocessState); _speexPreprocessState = NULL; } } // start the stream now that sources are good to go outputPortAudioError(Pa_StartStream(_stream)); gettimeofday(&_lastReceiveTime, NULL); } Audio::~Audio() { if (_stream) { outputPortAudioError(Pa_CloseStream(_stream)); outputPortAudioError(Pa_Terminate()); } if (_speexEchoState) { speex_preprocess_state_destroy(_speexPreprocessState); speex_echo_state_destroy(_speexEchoState); } delete[] _echoSamplesLeft; } void Audio::addReceivedAudioToBuffer(unsigned char* receivedData, int receivedBytes) { const int NUM_INITIAL_PACKETS_DISCARD = 3; timeval currentReceiveTime; gettimeofday(¤tReceiveTime, NULL); _totalPacketsReceived++; double timeDiff = diffclock(&_lastReceiveTime, ¤tReceiveTime); // Discard first few received packets for computing jitter (often they pile up on start) if (_totalPacketsReceived > NUM_INITIAL_PACKETS_DISCARD) { _stdev.addValue(timeDiff); } if (_stdev.getSamples() > 500) { _measuredJitter = _stdev.getStDev(); _stdev.reset(); } if (!_ringBuffer.isStarted()) { _packetsReceivedThisPlayback++; } if (_packetsReceivedThisPlayback == 1) { gettimeofday(&_firstPlaybackTime, NULL); } _ringBuffer.parseData((unsigned char*) receivedData, PACKET_LENGTH_BYTES + sizeof(PACKET_HEADER)); Application::getInstance()->getBandwidthMeter()->inputStream(0) .updateValue(PACKET_LENGTH_BYTES + sizeof(PACKET_HEADER)); _lastReceiveTime = currentReceiveTime; } void Audio::render(int screenWidth, int screenHeight) { if (_stream) { glLineWidth(2.0); glBegin(GL_LINES); glColor3f(1,1,1); int startX = 20.0; int currentX = startX; int topY = screenHeight - 40; int bottomY = screenHeight - 20; float frameWidth = 20.0; float halfY = topY + ((bottomY - topY) / 2.0); // draw the lines for the base of the ring buffer glVertex2f(currentX, topY); glVertex2f(currentX, bottomY); for (int i = 0; i < RING_BUFFER_LENGTH_FRAMES; i++) { glVertex2f(currentX, halfY); glVertex2f(currentX + frameWidth, halfY); currentX += frameWidth; glVertex2f(currentX, topY); glVertex2f(currentX, bottomY); } glEnd(); // Show a bar with the amount of audio remaining in ring buffer beyond current playback float remainingBuffer = 0; timeval currentTime; gettimeofday(¤tTime, NULL); float timeLeftInCurrentBuffer = 0; if (_lastCallbackTime.tv_usec > 0) { timeLeftInCurrentBuffer = AUDIO_CALLBACK_MSECS - diffclock(&_lastCallbackTime, ¤tTime); } if (_ringBuffer.getEndOfLastWrite() != NULL) remainingBuffer = _ringBuffer.diffLastWriteNextOutput() / PACKET_LENGTH_SAMPLES * AUDIO_CALLBACK_MSECS; if (_wasStarved == 0) { glColor3f(0, 1, 0); } else { glColor3f(0.5 + (_wasStarved / 20.0f), 0, 0); _wasStarved--; } glBegin(GL_QUADS); glVertex2f(startX, topY + 2); glVertex2f(startX + (remainingBuffer + timeLeftInCurrentBuffer)/AUDIO_CALLBACK_MSECS*frameWidth, topY + 2); glVertex2f(startX + (remainingBuffer + timeLeftInCurrentBuffer)/AUDIO_CALLBACK_MSECS*frameWidth, bottomY - 2); glVertex2f(startX, bottomY - 2); glEnd(); if (_averagedLatency == 0.0) { _averagedLatency = remainingBuffer + timeLeftInCurrentBuffer; } else { _averagedLatency = 0.99f * _averagedLatency + 0.01f * (remainingBuffer + timeLeftInCurrentBuffer); } // Show a yellow bar with the averaged msecs latency you are hearing (from time of packet receipt) glColor3f(1,1,0); glBegin(GL_QUADS); glVertex2f(startX + _averagedLatency / AUDIO_CALLBACK_MSECS * frameWidth - 2, topY - 2); glVertex2f(startX + _averagedLatency / AUDIO_CALLBACK_MSECS * frameWidth + 2, topY - 2); glVertex2f(startX + _averagedLatency / AUDIO_CALLBACK_MSECS * frameWidth + 2, bottomY + 2); glVertex2f(startX + _averagedLatency / AUDIO_CALLBACK_MSECS * frameWidth - 2, bottomY + 2); glEnd(); char out[40]; sprintf(out, "%3.0f\n", _averagedLatency); drawtext(startX + _averagedLatency / AUDIO_CALLBACK_MSECS * frameWidth - 10, topY - 10, 0.10, 0, 1, 0, out, 1,1,0); //drawtext(startX + 0, topY-10, 0.08, 0, 1, 0, out, 1,1,0); // Show a Cyan bar with the most recently measured jitter stdev int jitterPels = _measuredJitter / ((1000.0f * PACKET_LENGTH_SAMPLES / SAMPLE_RATE)) * frameWidth; glColor3f(0,1,1); glBegin(GL_QUADS); glVertex2f(startX + jitterPels - 2, topY - 2); glVertex2f(startX + jitterPels + 2, topY - 2); glVertex2f(startX + jitterPels + 2, bottomY + 2); glVertex2f(startX + jitterPels - 2, bottomY + 2); glEnd(); sprintf(out,"%3.1f\n", _measuredJitter); drawtext(startX + jitterPels - 5, topY-10, 0.10, 0, 1, 0, out, 0,1,1); sprintf(out, "%3.1fms\n", JITTER_BUFFER_LENGTH_MSECS); drawtext(startX - 10, bottomY + 15, 0.1, 0, 1, 0, out, 1, 0, 0); } } // Take a pointer to the acquired microphone input samples and add procedural sounds void Audio::addProceduralSounds(int16_t* inputBuffer, int numSamples) { const float MAX_AUDIBLE_VELOCITY = 6.0; const float MIN_AUDIBLE_VELOCITY = 0.1; const int VOLUME_BASELINE = 400; const float SOUND_PITCH = 8.f; float speed = glm::length(_lastVelocity); float volume = VOLUME_BASELINE * (1.f - speed / MAX_AUDIBLE_VELOCITY); // Add a noise-modulated sinewave with volume that tapers off with speed increasing if ((speed > MIN_AUDIBLE_VELOCITY) && (speed < MAX_AUDIBLE_VELOCITY)) { for (int i = 0; i < numSamples; i++) { inputBuffer[i] += (int16_t)((sinf((float) i / SOUND_PITCH * speed) * randFloat()) * volume * speed); } } } // ----------------------------- // Speex-based echo cancellation // ----------------------------- bool Audio::isCancellingEcho() const { return _isCancellingEcho && ! (_pingFramesToRecord != 0 || _pingAnalysisPending || ! _speexPreprocessState); } void Audio::setIsCancellingEcho(bool enable) { if (enable && _speexPreprocessState) { speex_echo_state_reset(_speexEchoState); _echoWritePos = 0; memset(_echoSamplesLeft, 0, AEC_BUFFERED_SAMPLES * sizeof(int16_t)); } _isCancellingEcho = enable; } inline void Audio::eventuallyCancelEcho(int16_t* inputLeft) { if (! isCancellingEcho()) { return; } // Construct an artificial frame from the captured playback // that contains the appropriately delayed output to cancel unsigned n = BUFFER_LENGTH_SAMPLES_PER_CHANNEL, n2 = 0; unsigned readPos = (_echoWritePos + AEC_BUFFERED_SAMPLES_PER_CHANNEL - _echoDelay) % AEC_BUFFERED_SAMPLES_PER_CHANNEL; unsigned readEnd = readPos + n; if (readEnd >= AEC_BUFFERED_SAMPLES_PER_CHANNEL) { n2 = (readEnd -= AEC_BUFFERED_SAMPLES_PER_CHANNEL); n -= n2; } // Use two subsequent buffers for the two stereo channels int16_t* playBufferLeft = _speexTmpBuf + BUFFER_LENGTH_SAMPLES_PER_CHANNEL; memcpy(playBufferLeft, _echoSamplesLeft + readPos, n * sizeof(int16_t)); memcpy(playBufferLeft + n, _echoSamplesLeft, n2 * sizeof(int16_t)); int16_t* playBufferRight = playBufferLeft + BUFFER_LENGTH_SAMPLES_PER_CHANNEL; memcpy(playBufferRight, _echoSamplesRight + readPos, n * sizeof(int16_t)); memcpy(playBufferRight + n, _echoSamplesLeft, n2 * sizeof(int16_t)); #ifdef VISUALIZE_ECHO_CANCELLATION // Visualize the input _scope->addSamples(0, inputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); _scope->addSamples(1, playBufferLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); #endif // Have Speex perform echo cancellation speex_echo_cancellation(_speexEchoState, inputLeft, playBufferLeft, _speexTmpBuf); memcpy(inputLeft, _speexTmpBuf, BUFFER_LENGTH_BYTES_PER_CHANNEL); speex_preprocess_run(_speexPreprocessState, inputLeft); #ifdef VISUALIZE_ECHO_CANCELLATION // Visualize the result _scope->setColor(2, 0x00ff00); _scope->addSamples(2, inputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL); #endif } inline void Audio::eventuallyRecordEcho(int16_t* outputLeft, int16_t* outputRight) { if (! isCancellingEcho()) { return; } // Copy playback data to circular buffers unsigned n = BUFFER_LENGTH_SAMPLES_PER_CHANNEL, n2 = 0; unsigned writeEnd = _echoWritePos + n; if (writeEnd >= AEC_BUFFERED_SAMPLES_PER_CHANNEL) { n2 = (writeEnd -= AEC_BUFFERED_SAMPLES_PER_CHANNEL); n -= n2; } memcpy(_echoSamplesLeft + _echoWritePos, outputLeft, n * sizeof(int16_t)); memcpy(_echoSamplesLeft, outputLeft + n, n2 * sizeof(int16_t)); memcpy(_echoSamplesRight + _echoWritePos, outputRight, n * sizeof(int16_t)); memcpy(_echoSamplesRight, outputRight + n, n2 * sizeof(int16_t)); _echoWritePos = writeEnd; } // ----------------------------------------------------------- // Accoustic ping (audio system round trip time determination) // ----------------------------------------------------------- void Audio::ping() { _pingFramesToRecord = PING_FRAMES_TO_RECORD; _isSendingEchoPing = true; _scope->setDownsampleRatio(8); _scope->inputPaused = false; } inline void Audio::eventuallySendRecvPing(int16_t* inputLeft, int16_t* outputLeft, int16_t* outputRight) { if (_isSendingEchoPing) { // Overwrite output with ping signal. // // Using a signed variant of sinc because it's speaker-reproducible // with a unique, characteristic point in time (its center), aligned // to the right of the output buffer. // // | // | | // ...--- t --------+-+-+-+-+-------> // | | : // | : // buffer :<- start of next buffer // : : : // :---: sine period // :-: half sine period // memset(outputLeft, 0, PING_BUFFER_OFFSET * sizeof(int16_t)); outputLeft += PING_BUFFER_OFFSET; memset(outputRight, 0, PING_BUFFER_OFFSET * sizeof(int16_t)); outputRight += PING_BUFFER_OFFSET; for (int s = -PING_PERIOD; s < PING_PERIOD; ++s) { float t = float(s) / PING_PITCH; *outputLeft++ = *outputRight++ = int16_t(PING_VOLUME * sinf(t) / fmaxf(1.0f, pow((abs(t)-1.5f) / 1.5f, 1.2f))); } // As of the next frame, we'll be recoding PING_FRAMES_TO_RECORD from // the mic (pointless to start now as we can't record unsent audio). _isSendingEchoPing = false; printLog("Send audio ping\n"); } else if (_pingFramesToRecord > 0) { // Store input samples int offset = BUFFER_LENGTH_SAMPLES_PER_CHANNEL * ( PING_FRAMES_TO_RECORD - _pingFramesToRecord); memcpy(_echoSamplesLeft + offset, inputLeft, BUFFER_LENGTH_SAMPLES_PER_CHANNEL * sizeof(int16_t)); --_pingFramesToRecord; if (_pingFramesToRecord == 0) { _pingAnalysisPending = true; printLog("Received ping echo\n"); } } } static int findExtremum(int16_t const* samples, int length, int sign) { int x0 = -1; int y0 = -PING_VOLUME; for (int x = 0; x < length; ++samples, ++x) { int y = *samples * sign; if (y > y0) { x0 = x; y0 = y; } } return x0; } inline void Audio::analyzePing() { // Determine extrema int botAt = findExtremum(_echoSamplesLeft, PING_SAMPLES_TO_ANALYZE, -1); if (botAt == -1) { printLog("Audio Ping: Minimum not found.\n"); return; } int topAt = findExtremum(_echoSamplesLeft, PING_SAMPLES_TO_ANALYZE, 1); if (topAt == -1) { printLog("Audio Ping: Maximum not found.\n"); return; } // Determine peak amplitude - warn if low int ampli = (_echoSamplesLeft[topAt] - _echoSamplesLeft[botAt]) / 2; if (ampli < PING_MIN_AMPLI) { printLog("Audio Ping unreliable - low amplitude %d.\n", ampli); } // Determine period - warn if doesn't look like our signal int halfPeriod = abs(topAt - botAt); if (abs(halfPeriod-PING_HALF_PERIOD) > PING_MAX_PERIOD_DIFFERENCE) { printLog("Audio Ping unreliable - peak distance %d vs. %d\n", halfPeriod, PING_HALF_PERIOD); } // Ping is sent: // // ---[ record ]--[ play ]--- audio in space/time ---> // : : : // : : ping: ->X<- // : : : // : : |+| (buffer end - signal center = t1-t0) // : |<----------+ // : : : : // : ->X<- (corresponding input buffer position t0) // : : : : // : : : : // : : : : // Next frame (we're recording from now on): // : : : // : - - --[ record ]--[ play ]------------------> // : : : : // : : |<-- (start of recording t1) // : : : // : : : // At some frame, the signal is picked up: // : : : : // : : : : // : : : V // : : : - - --[ record ]--[ play ]----------> // : V : : // : |<--------->| // |+|<------->| period + measured samples // // If we could pick up the signal at t0 we'd have zero round trip // time - in this case we had recorded the output buffer instantly // in its entirety (we can't - but there's the proper reference // point). We know the number of samples from t1 and, knowing that // data is streaming continuously, we know that t1-t0 is the distance // of the characterisic point from the end of the buffer. int delay = (botAt + topAt) / 2 + PING_PERIOD; printLog("\n| Audio Ping results:\n+----- ---- --- - - - - -\n\n" "Delay = %d samples (%d ms)\nPeak amplitude = %d\n\n", delay, (delay * 1000) / int(SAMPLE_RATE), ampli); } bool Audio::eventuallyAnalyzePing() { if (! _pingAnalysisPending) { return false; } _scope->inputPaused = true; analyzePing(); setIsCancellingEcho(_isCancellingEcho); _pingAnalysisPending = false; return true; } #endif