mirror of
https://github.com/overte-org/overte.git
synced 2025-08-07 16:50:43 +02:00
Some experimentation yields promising results...
This commit is contained in:
parent
79f30eb05d
commit
26e388b139
5 changed files with 844 additions and 811 deletions
|
@ -1182,6 +1182,7 @@ Application::Application(int& argc, char** argv, QElapsedTimer& startupTimer, bo
|
|||
|
||||
auto TTS = DependencyManager::get<TTSScriptingInterface>().data();
|
||||
connect(TTS, &TTSScriptingInterface::ttsSampleCreated, audioIO, &AudioClient::handleTTSAudioInput);
|
||||
connect(TTS, &TTSScriptingInterface::clearTTSBuffer, audioIO, &AudioClient::clearTTSBuffer);
|
||||
|
||||
connect(audioIO, &AudioClient::inputReceived, [](const QByteArray& audio) {
|
||||
static auto recorder = DependencyManager::get<recording::Recorder>();
|
||||
|
|
|
@ -65,7 +65,7 @@ void TTSScriptingInterface::testTone(const bool& alsoInject) {
|
|||
int16_t temp = (glm::sin(glm::radians((float)a))) * 32768;
|
||||
samples[a] = temp;
|
||||
}
|
||||
emit ttsSampleCreated(_lastSoundByteArray);
|
||||
emit ttsSampleCreated(_lastSoundByteArray, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50, 96);
|
||||
|
||||
if (alsoInject) {
|
||||
AudioInjectorOptions options;
|
||||
|
@ -75,11 +75,16 @@ void TTSScriptingInterface::testTone(const bool& alsoInject) {
|
|||
}
|
||||
}
|
||||
|
||||
void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& alsoInject) {
|
||||
void TTSScriptingInterface::speakText(const QString& textToSpeak,
|
||||
const int& newChunkSize,
|
||||
const int& timerInterval,
|
||||
const int& sampleRate,
|
||||
const int& bitsPerSample,
|
||||
const bool& alsoInject) {
|
||||
WAVEFORMATEX fmt;
|
||||
fmt.wFormatTag = WAVE_FORMAT_PCM;
|
||||
fmt.nSamplesPerSec = 24000;
|
||||
fmt.wBitsPerSample = 16;
|
||||
fmt.nSamplesPerSec = sampleRate;
|
||||
fmt.wBitsPerSample = bitsPerSample;
|
||||
fmt.nChannels = 1;
|
||||
fmt.nBlockAlign = fmt.nChannels * fmt.wBitsPerSample / 8;
|
||||
fmt.nAvgBytesPerSec = fmt.nSamplesPerSec * fmt.nBlockAlign;
|
||||
|
@ -146,7 +151,7 @@ void TTSScriptingInterface::speakText(const QString& textToSpeak, const bool& al
|
|||
_lastSoundByteArray.resize(0);
|
||||
_lastSoundByteArray.append(buf1, dwSize);
|
||||
|
||||
emit ttsSampleCreated(_lastSoundByteArray);
|
||||
emit ttsSampleCreated(_lastSoundByteArray, newChunkSize, timerInterval);
|
||||
|
||||
if (alsoInject) {
|
||||
AudioInjectorOptions options;
|
||||
|
@ -160,4 +165,6 @@ void TTSScriptingInterface::stopLastSpeech() {
|
|||
if (_lastSoundAudioInjector) {
|
||||
_lastSoundAudioInjector->stop();
|
||||
}
|
||||
|
||||
emit clearTTSBuffer();
|
||||
}
|
||||
|
|
|
@ -19,6 +19,7 @@
|
|||
#include <sapi.h> // SAPI
|
||||
#include <sphelper.h> // SAPI Helper
|
||||
#include <AudioInjector.h>
|
||||
#include <AudioConstants.h>
|
||||
|
||||
class TTSScriptingInterface : public QObject, public Dependency {
|
||||
Q_OBJECT
|
||||
|
@ -28,11 +29,17 @@ public:
|
|||
~TTSScriptingInterface();
|
||||
|
||||
Q_INVOKABLE void testTone(const bool& alsoInject = false);
|
||||
Q_INVOKABLE void speakText(const QString& textToSpeak, const bool& alsoInject = false);
|
||||
Q_INVOKABLE void speakText(const QString& textToSpeak,
|
||||
const int& newChunkSize = (AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50),
|
||||
const int& timerInterval = 96,
|
||||
const int& sampleRate = 24000,
|
||||
const int& bitsPerSample = 16,
|
||||
const bool& alsoInject = false);
|
||||
Q_INVOKABLE void stopLastSpeech();
|
||||
|
||||
signals:
|
||||
void ttsSampleCreated(QByteArray outputArray);
|
||||
void ttsSampleCreated(QByteArray outputArray, const int& newChunkSize, const int& timerInterval);
|
||||
void clearTTSBuffer();
|
||||
|
||||
private:
|
||||
class CComAutoInit {
|
||||
|
|
|
@ -186,7 +186,7 @@ AudioClient::AudioClient() :
|
|||
_networkToOutputResampler(NULL), _localToOutputResampler(NULL),
|
||||
_audioLimiter(AudioConstants::SAMPLE_RATE, OUTPUT_CHANNEL_COUNT), _outgoingAvatarAudioSequenceNumber(0),
|
||||
_audioOutputIODevice(_localInjectorsStream, _receivedAudioStream, this), _stats(&_receivedAudioStream),
|
||||
_positionGetter(DEFAULT_POSITION_GETTER),
|
||||
_positionGetter(DEFAULT_POSITION_GETTER), _TTSTimer(this),
|
||||
#if defined(Q_OS_ANDROID)
|
||||
_checkInputTimer(this), _isHeadsetPluggedIn(false),
|
||||
#endif
|
||||
|
@ -245,6 +245,8 @@ AudioClient::AudioClient() :
|
|||
packetReceiver.registerListener(PacketType::NoisyMute, this, "handleNoisyMutePacket");
|
||||
packetReceiver.registerListener(PacketType::MuteEnvironment, this, "handleMuteEnvironmentPacket");
|
||||
packetReceiver.registerListener(PacketType::SelectedAudioFormat, this, "handleSelectedAudioFormat");
|
||||
|
||||
connect(&_TTSTimer, &QTimer::timeout, this, &AudioClient::processTTSBuffer);
|
||||
}
|
||||
|
||||
AudioClient::~AudioClient() {
|
||||
|
@ -939,7 +941,7 @@ void AudioClient::setReverbOptions(const AudioEffectOptions* options) {
|
|||
}
|
||||
}
|
||||
|
||||
void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
|
||||
void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray, const int& sampleRate, const int& channelCount) {
|
||||
// If there is server echo, reverb will be applied to the received audio stream so no need to have it here.
|
||||
bool hasReverb = _reverb || _receivedAudioStream.hasReverb();
|
||||
if (_muted || !_audioOutput || (!_shouldEchoLocally && !hasReverb)) {
|
||||
|
@ -949,7 +951,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
|
|||
// NOTE: we assume the inputFormat and the outputFormat are the same, since on any modern
|
||||
// multimedia OS they should be. If there is a device that this is not true for, we can
|
||||
// add back support to do resampling.
|
||||
if (_inputFormat.sampleRate() != _outputFormat.sampleRate()) {
|
||||
if (sampleRate != _outputFormat.sampleRate()) {
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -972,7 +974,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
|
|||
static QByteArray loopBackByteArray;
|
||||
|
||||
int numInputSamples = inputByteArray.size() / AudioConstants::SAMPLE_SIZE;
|
||||
int numLoopbackSamples = (numInputSamples * OUTPUT_CHANNEL_COUNT) / _inputFormat.channelCount();
|
||||
int numLoopbackSamples = (numInputSamples * OUTPUT_CHANNEL_COUNT) / channelCount;
|
||||
|
||||
loopBackByteArray.resize(numLoopbackSamples * AudioConstants::SAMPLE_SIZE);
|
||||
|
||||
|
@ -980,7 +982,7 @@ void AudioClient::handleLocalEchoAndReverb(QByteArray& inputByteArray) {
|
|||
int16_t* loopbackSamples = reinterpret_cast<int16_t*>(loopBackByteArray.data());
|
||||
|
||||
// upmix mono to stereo
|
||||
if (!sampleChannelConversion(inputSamples, loopbackSamples, numInputSamples, _inputFormat.channelCount(),
|
||||
if (!sampleChannelConversion(inputSamples, loopbackSamples, numInputSamples, channelCount,
|
||||
OUTPUT_CHANNEL_COUNT)) {
|
||||
// no conversion, just copy the samples
|
||||
memcpy(loopbackSamples, inputSamples, numInputSamples * AudioConstants::SAMPLE_SIZE);
|
||||
|
@ -1093,23 +1095,29 @@ void AudioClient::handleAudioInput(QByteArray& audioBuffer) {
|
|||
}
|
||||
}
|
||||
|
||||
void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
|
||||
const uchar& channelCount,
|
||||
const qint32& bytesForDuration,
|
||||
QByteArray& rollingBuffer) {
|
||||
void AudioClient::handleMicAudioInput() {
|
||||
if (!_inputDevice || _isPlayingBackRecording) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(Q_OS_ANDROID)
|
||||
_inputReadsSinceLastCheck++;
|
||||
#endif
|
||||
|
||||
// input samples required to produce exactly NETWORK_FRAME_SAMPLES of output
|
||||
const int inputSamplesRequired =
|
||||
(_inputToNetworkResampler ? _inputToNetworkResampler->getMinInput(AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL)
|
||||
: AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL) *
|
||||
channelCount;
|
||||
_inputFormat.channelCount();
|
||||
|
||||
const auto inputAudioSamples = std::unique_ptr<int16_t[]>(new int16_t[inputSamplesRequired]);
|
||||
QByteArray inputByteArray = _inputDevice->readAll();
|
||||
|
||||
handleLocalEchoAndReverb(inputByteArray);
|
||||
handleLocalEchoAndReverb(inputByteArray, _inputFormat.sampleRate(), _inputFormat.channelCount());
|
||||
|
||||
_inputRingBuffer.writeData(inputByteArray.data(), inputByteArray.size());
|
||||
|
||||
float audioInputMsecsRead = inputByteArray.size() / (float)(bytesForDuration);
|
||||
float audioInputMsecsRead = inputByteArray.size() / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
|
||||
_stats.updateInputMsRead(audioInputMsecsRead);
|
||||
|
||||
const int numNetworkBytes =
|
||||
|
@ -1125,33 +1133,17 @@ void AudioClient::processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
|
|||
} else {
|
||||
_inputRingBuffer.readSamples(inputAudioSamples.get(), inputSamplesRequired);
|
||||
possibleResampling(_inputToNetworkResampler, inputAudioSamples.get(), networkAudioSamples, inputSamplesRequired,
|
||||
numNetworkSamples, channelCount, _desiredInputFormat.channelCount());
|
||||
numNetworkSamples, _inputFormat.channelCount(), _desiredInputFormat.channelCount());
|
||||
}
|
||||
int bytesInInputRingBuffer = _inputRingBuffer.samplesAvailable() * AudioConstants::SAMPLE_SIZE;
|
||||
float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(bytesForDuration);
|
||||
float msecsInInputRingBuffer = bytesInInputRingBuffer / (float)(_inputFormat.bytesForDuration(USECS_PER_MSEC));
|
||||
_stats.updateInputMsUnplayed(msecsInInputRingBuffer);
|
||||
|
||||
QByteArray audioBuffer(reinterpret_cast<char*>(networkAudioSamples), numNetworkBytes);
|
||||
rollingBuffer.append(audioBuffer);
|
||||
handleAudioInput(audioBuffer);
|
||||
}
|
||||
}
|
||||
|
||||
void AudioClient::handleMicAudioInput() {
|
||||
if (!_inputDevice || _isPlayingBackRecording) {
|
||||
return;
|
||||
}
|
||||
|
||||
#if defined(Q_OS_ANDROID)
|
||||
_inputReadsSinceLastCheck++;
|
||||
#endif
|
||||
|
||||
QByteArray temp;
|
||||
|
||||
processAudioAndAddToRingBuffer(_inputDevice->readAll(), _inputFormat.channelCount(),
|
||||
_inputFormat.bytesForDuration(USECS_PER_MSEC), temp);
|
||||
}
|
||||
|
||||
void AudioClient::handleDummyAudioInput() {
|
||||
const int numNetworkBytes =
|
||||
_isStereoInput ? AudioConstants::NETWORK_FRAME_BYTES_STEREO : AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
|
||||
|
@ -1210,25 +1202,43 @@ int rawToWav(const char* rawData, const int& rawLength, const char* wavfn, long
|
|||
return 0;
|
||||
}
|
||||
|
||||
void AudioClient::handleTTSAudioInput(const QByteArray& audio) {
|
||||
QByteArray audioBuffer(audio);
|
||||
|
||||
QString filename = QString::number(usecTimestampNow());
|
||||
QString path = PathUtils::getAppDataPath() + "Audio/" + filename + "-before.wav";
|
||||
rawToWav(audioBuffer.data(), audioBuffer.size(), path.toLocal8Bit(), 24000, 1);
|
||||
|
||||
QByteArray temp;
|
||||
|
||||
while (audioBuffer.size() > 0) {
|
||||
void AudioClient::processTTSBuffer() {
|
||||
Lock lock(_TTSMutex);
|
||||
if (_TTSAudioBuffer.size() > 0) {
|
||||
QByteArray part;
|
||||
part.append(audioBuffer.data(), AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
|
||||
audioBuffer.remove(0, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
|
||||
processAudioAndAddToRingBuffer(part, 1, 48, temp);
|
||||
part.append(_TTSAudioBuffer.data(), _TTSChunkSize);
|
||||
_TTSAudioBuffer.remove(0, _TTSChunkSize);
|
||||
handleAudioInput(part);
|
||||
} else {
|
||||
_isProcessingTTS = false;
|
||||
_TTSTimer.stop();
|
||||
}
|
||||
}
|
||||
|
||||
filename = QString::number(usecTimestampNow());
|
||||
path = PathUtils::getAppDataPath() + "Audio/" + filename + "-after.wav";
|
||||
rawToWav(temp.data(), temp.size(), path.toLocal8Bit(), 12000, 1);
|
||||
// Slot: queues newly synthesized TTS audio and (re)starts the timer that drains the queue.
//   audio         - raw audio bytes appended to the pending TTS queue
//   newChunkSize  - number of bytes processTTSBuffer() takes from the queue per timer tick
//   timerInterval - interval handed to _TTSTimer.start(), in milliseconds
void AudioClient::handleTTSAudioInput(const QByteArray& audio, const int& newChunkSize, const int& timerInterval) {
    _TTSChunkSize = newChunkSize;
    _TTSAudioBuffer += audio;

    // NOTE(review): the whole pending queue is passed here, not just `audio`, so bytes still
    // queued from an earlier call are handed to the echo/reverb path again - confirm intended.
    // NOTE(review): the rate is hard-coded to 48000 while speakText() defaults to 24000 - confirm.
    handleLocalEchoAndReverb(_TTSAudioBuffer, 48000, 1);

    _isProcessingTTS = true;
    _TTSTimer.start(timerInterval);
}
|
||||
|
||||
// Slot: halts the TTS timer, clears the processing flag and discards any queued TTS audio.
// NOTE(review): unlike processTTSBuffer(), this does not take _TTSMutex - confirm that both
// always run on the same thread.
void AudioClient::clearTTSBuffer() {
    _TTSTimer.stop();
    _isProcessingTTS = false;
    _TTSAudioBuffer.resize(0);
}
|
||||
|
||||
void AudioClient::prepareLocalAudioInjectors(std::unique_ptr<Lock> localAudioLock) {
|
||||
|
@ -1313,7 +1323,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
|
|||
if (injectorBuffer) {
|
||||
static const int HRTF_DATASET_INDEX = 1;
|
||||
|
||||
int numChannels = injector->isAmbisonic() ? AudioConstants::AMBISONIC
|
||||
int numChannels = injector->isAmbisonic()
|
||||
? AudioConstants::AMBISONIC
|
||||
: (injector->isStereo() ? AudioConstants::STEREO : AudioConstants::MONO);
|
||||
size_t bytesToRead = numChannels * AudioConstants::NETWORK_FRAME_BYTES_PER_CHANNEL;
|
||||
|
||||
|
@ -1355,8 +1366,8 @@ bool AudioClient::mixLocalAudioInjectors(float* mixBuffer) {
|
|||
float azimuth = azimuthForSource(relativePosition);
|
||||
|
||||
// mono gets spatialized into mixBuffer
|
||||
injector->getLocalHRTF().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX, azimuth, distance, gain,
|
||||
AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
|
||||
injector->getLocalHRTF().render(_localScratchBuffer, mixBuffer, HRTF_DATASET_INDEX, azimuth, distance,
|
||||
gain, AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL);
|
||||
}
|
||||
|
||||
} else {
|
||||
|
@ -1509,7 +1520,8 @@ bool AudioClient::outputLocalInjector(const AudioInjectorPointer& injector) {
|
|||
}
|
||||
|
||||
// Recomputes _outputFrameSize for the current output format: the network frame size (all output
// channels) scaled by the ratio of the output sample rate to the desired output sample rate.
// The received audio stream is then told about the new rate and channel count.
void AudioClient::outputFormatChanged() {
    const auto outputSampleRate = _outputFormat.sampleRate();
    const auto networkFrameSamples = AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * OUTPUT_CHANNEL_COUNT;

    _outputFrameSize = (networkFrameSamples * outputSampleRate) / _desiredOutputFormat.sampleRate();
    _receivedAudioStream.outputFormatChanged(outputSampleRate, OUTPUT_CHANNEL_COUNT);
}
|
||||
|
@ -1725,8 +1737,8 @@ void AudioClient::outputNotify() {
|
|||
int newOutputBufferSizeFrames = setOutputBufferSize(oldOutputBufferSizeFrames + 1, false);
|
||||
|
||||
if (newOutputBufferSizeFrames > oldOutputBufferSizeFrames) {
|
||||
qCDebug(audioclient, "Starve threshold surpassed (%d starves in %d ms)", _outputStarveDetectionCount,
|
||||
dt);
|
||||
qCDebug(audioclient, "Starve threshold surpassed (%d starves in %d ms)",
|
||||
_outputStarveDetectionCount, dt);
|
||||
}
|
||||
|
||||
_outputStarveDetectionStartTimeMsec = now;
|
||||
|
@ -2054,7 +2066,8 @@ qint64 AudioClient::AudioOutputIODevice::readData(char* data, qint64 maxSize) {
|
|||
}
|
||||
|
||||
int bytesAudioOutputUnplayed = _audio->_audioOutput->bufferSize() - _audio->_audioOutput->bytesFree();
|
||||
float msecsAudioOutputUnplayed = bytesAudioOutputUnplayed / (float)_audio->_outputFormat.bytesForDuration(USECS_PER_MSEC);
|
||||
float msecsAudioOutputUnplayed =
|
||||
bytesAudioOutputUnplayed / (float)_audio->_outputFormat.bytesForDuration(USECS_PER_MSEC);
|
||||
_audio->_stats.updateOutputMsUnplayed(msecsAudioOutputUnplayed);
|
||||
|
||||
if (bytesAudioOutputUnplayed == 0) {
|
||||
|
|
|
@ -197,7 +197,11 @@ public slots:
|
|||
void checkInputTimeout();
|
||||
void handleDummyAudioInput();
|
||||
void handleRecordedAudioInput(const QByteArray& audio);
|
||||
void handleTTSAudioInput(const QByteArray& audio);
|
||||
void handleTTSAudioInput(const QByteArray& audio,
|
||||
const int& newChunkSize,
|
||||
const int& timerInterval);
|
||||
void clearTTSBuffer();
|
||||
void processTTSBuffer();
|
||||
void reset();
|
||||
void audioMixerKilled();
|
||||
|
||||
|
@ -290,10 +294,11 @@ private:
|
|||
float azimuthForSource(const glm::vec3& relativePosition);
|
||||
float gainForSource(float distance, float volume);
|
||||
|
||||
void processAudioAndAddToRingBuffer(QByteArray& inputByteArray,
|
||||
const uchar& channelCount,
|
||||
const qint32& bytesForDuration,
|
||||
QByteArray& rollingBuffer);
|
||||
Mutex _TTSMutex;
|
||||
QTimer _TTSTimer;
|
||||
bool _isProcessingTTS {false};
|
||||
QByteArray _TTSAudioBuffer;
|
||||
int _TTSChunkSize = AudioConstants::NETWORK_FRAME_SAMPLES_PER_CHANNEL * 50;
|
||||
|
||||
#ifdef Q_OS_ANDROID
|
||||
QTimer _checkInputTimer;
|
||||
|
@ -401,7 +406,7 @@ private:
|
|||
void configureReverb();
|
||||
void updateReverbOptions();
|
||||
|
||||
void handleLocalEchoAndReverb(QByteArray& inputByteArray);
|
||||
void handleLocalEchoAndReverb(QByteArray& inputByteArray, const int& sampleRate, const int& channelCount);
|
||||
|
||||
bool switchInputToAudioDevice(const QAudioDeviceInfo inputDeviceInfo, bool isShutdownRequest = false);
|
||||
bool switchOutputToAudioDevice(const QAudioDeviceInfo outputDeviceInfo, bool isShutdownRequest = false);
|
||||
|
|
Loading…
Reference in a new issue