//
//  AudioGate.cpp
//  libraries/audio/src
//
//  Created by Ken Cooke on 5/5/17.
//  Copyright 2017 High Fidelity, Inc.
//

#include "AudioGate.h"

#include <string.h>
#include <stdlib.h>
#include <assert.h>

#include "AudioDynamics.h"

// log2 domain headroom bits above 0dB (int32_t)
static const int LOG2_HEADROOM_Q30 = 1;

// convert Q30 to Q15 with saturation
static inline int32_t saturateQ30(int32_t x) {

    x = (x + (1 << 14)) >> 15;
    x = MIN(MAX(x, -32768), 32767);

    return x;
}

//
// First-order DC-blocking filter, with zero at 1.0 and pole at 0.9999
//
// -3dB @ 1.0 Hz (48KHz)
// -3dB @ 0.5 Hz (24KHz)
//
// input in Q15, output in Q30
//
class MonoDCBlock {

    int32_t _dcOffset = {};     // Q30, cannot overflow

public:
    void process(int32_t& x) {

        x *= (1 << 15);         // scale to Q30
        x -= _dcOffset;         // remove DC
        _dcOffset += x >> 13;   // pole = (1.0 - 2^-13) = 0.9999
    }
};

class StereoDCBlock {

    int32_t _dcOffset[2] = {};

public:
    void process(int32_t& x0, int32_t& x1) {

        x0 *= (1 << 15);
        x1 *= (1 << 15);

        x0 -= _dcOffset[0];
        x1 -= _dcOffset[1];

        _dcOffset[0] += x0 >> 14;
        _dcOffset[1] += x1 >> 14;
    }
};

class QuadDCBlock {

    int32_t _dcOffset[4] = {};

public:
    void process(int32_t& x0, int32_t& x1, int32_t& x2, int32_t& x3) {

        x0 *= (1 << 15);
        x1 *= (1 << 15);
        x2 *= (1 << 15);
        x3 *= (1 << 15);

        x0 -= _dcOffset[0];
        x1 -= _dcOffset[1];
        x2 -= _dcOffset[2];
        x3 -= _dcOffset[3];

        _dcOffset[0] += x0 >> 14;
        _dcOffset[1] += x1 >> 14;
        _dcOffset[2] += x2 >> 14;
        _dcOffset[3] += x3 >> 14;
    }
};

//
// Gate (common)
//
class GateImpl {
protected:

    // histogram
    static const int NHIST = 256;
    int _update[NHIST] = {};
    int _histogram[NHIST] = {};

    // peakhold
    uint32_t _holdMin = 0x7fffffff;
    uint32_t _holdInc = 0x7fffffff;
    uint32_t _holdMax = 0x7fffffff;
    uint32_t _holdRel = 0x7fffffff;
    int32_t _holdPeak = 0x7fffffff;

    // hysteresis
    int32_t _hysteresis = 0;
    int32_t _hystOffset = 0;
    int32_t _hystPeak = 0x7fffffff;

    int32_t _release = 0x7fffffff;

    int32_t _threshFixed = 0;
    int32_t _threshAdapt = 0;
    int32_t _attn = 0x7fffffff;

    int _sampleRate;

public:
    GateImpl(int sampleRate);
    virtual ~GateImpl() {}

    void setThreshold(float threshold);
    void setHold(float hold);
    void setHysteresis(float hysteresis);
    void setRelease(float release);

    void clearHistogram() { memset(_update, 0, sizeof(_update)); }
    void updateHistogram(int32_t value, int count);
    int partitionHistogram();
    void processHistogram(int numFrames);

    int32_t peakhold(int32_t peak);
    int32_t hysteresis(int32_t peak);
    int32_t envelope(int32_t attn);

    virtual bool process(int16_t* input, int16_t* output, int numFrames) = 0;
    virtual bool removeDC(int16_t* input, int16_t* output, int numFrames) = 0;
};

GateImpl::GateImpl(int sampleRate) {

    sampleRate = MAX(sampleRate, 8000);
    sampleRate = MIN(sampleRate, 96000);
    _sampleRate = sampleRate;

    // defaults
    setThreshold(-30.0f);
    setHold(20.0f);
    setHysteresis(6.0f);
    setRelease(1000.0f);
}

//
// Set the gate threshold (dB)
// This is a base value that is modulated by the adaptive threshold algorithm.
//
void GateImpl::setThreshold(float threshold) {

    // gate threshold = -96dB to 0dB
    threshold = MAX(threshold, -96.0f);
    threshold = MIN(threshold, 0.0f);

    // gate threshold in log2 domain
    _threshFixed = (int32_t)(-(double)threshold * DB_TO_LOG2 * (1 << LOG2_FRACBITS));
    _threshFixed += LOG2_HEADROOM_Q30 << LOG2_FRACBITS;

    _threshAdapt = _threshFixed;
}

//
// Set the detector hold time (milliseconds)
//
void GateImpl::setHold(float hold) {

    const double RELEASE = 100.0;   // release = 100ms
    const double PROGHOLD = 100.0;  // progressive hold = 100ms

    // pure hold = 1 to 1000ms
    hold = MAX(hold, 1.0f);
    hold = MIN(hold, 1000.0f);

    // compute final tc
    _holdMin = msToTc(RELEASE, _sampleRate);

    // compute tc increment, to progress from 0x7fffffff to _holdMin in PROGHOLD ms
    double progSamples = PROGHOLD/1000.0 * _sampleRate;
    _holdInc = (uint32_t)((0x7fffffff - _holdMin) / progSamples);
    _holdInc = MAX(_holdInc, 1);    // prevent 0 on long releases
    
    // compute initial tc, to progress from _holdMax to 0x7fffffff in hold ms
    double holdSamples = (double)hold/1000.0 * _sampleRate;
    _holdMax = 0x7fffffff + (uint32_t)(_holdInc * holdSamples);
}

//
// Set the detector hysteresis (dB)
//
void GateImpl::setHysteresis(float hysteresis) {

    // gate hysteresis in log2 domain
    _hysteresis = (int32_t)((double)hysteresis * DB_TO_LOG2 * (1 << LOG2_FRACBITS));
}

//
// Set the gate release time (milliseconds)
//
void GateImpl::setRelease(float release) {

    // gate release = 50 to 5000ms
    release = MAX(release, 50.0f);
    release = MIN(release, 5000.0f);

    _release = msToTc((double)release, _sampleRate);
}

//
// Update the histogram count of the bin which contains value
//
void GateImpl::updateHistogram(int32_t value, int count = 1) {

    // quantize to LOG2 + 3 fraction bits (0.75dB steps)
    int index = (NHIST-1) - (value >> (LOG2_FRACBITS - 3));

    assert(index >= 0);
    assert(index < NHIST);

    _update[index] += count << 16;  // Q16 for filtering

    assert(_update[index] >= 0);
}

//
// Partition the histogram
//
// The idea behind the adaptive threshold:
//
// When processing a gaussian mixture of signal and noise, separated by minimal SNR, 
// a bimodal distribution emerges in the histogram of preprocessed peak levels.
// In this case, the threshold adapts toward the level that optimally partitions the distributions.
// Partitioning is computed using Otsu's method.
//
// When only a single distribution is present, the threshold becomes level-dependent:
// At levels below the fixed threshold, the threshold adapts toward the upper edge
// of the distribution, presumed to be noise.
// At levels above the fixed threshold, the threshold adapts toward the lower edge
// of the distribution, presumed to be signal.
// This is implemented by adding a hidden (bias) distribution at the fixed threshold.
// 
int GateImpl::partitionHistogram() {

    // initialize
    int total = 0;
    float sum = 0.0f;
    for (int i = 0; i < NHIST; i++) {
        total += _histogram[i];
        sum += (float)i * _histogram[i];
    }

    int w0 = 0;
    float sum0 = 0.0f;
    float max = 0.0f;
    int index = 0;

    // find the index that maximizes the between-class variance
    for (int i = 0 ; i < NHIST; i++) {

        // update weights
        w0 += _histogram[i];
        int w1 = total - w0;

        if (w0 == 0) {
            continue;   // skip leading zeros
        }
        if (w1 == 0) {
            break;      // skip trailing zeros
        }

        // update means
        sum0 += (float)i * _histogram[i];
        float sum1 = sum - sum0;

        float m0 = sum0 / (float)w0;
        float m1 = sum1 / (float)w1;

        // between-class variance
        float variance = (float)w0 * (float)w1 * (m0 - m1) * (m0 - m1);

        // update threshold
        if (variance > max) {
            max = variance;
            index = i;
        }
    }
    return index;
}

//
// Process the histogram to update the adaptive threshold
//
void GateImpl::processHistogram(int numFrames) {

    const int32_t LOG2E_Q26 = (int32_t)(log2(exp(1.0)) * (1 << LOG2_FRACBITS) + 0.5);

    // compute time constants, for sampleRate downsampled by numFrames
    int32_t tcHistogram = fixexp2(MULDIV64(numFrames, LOG2E_Q26, _sampleRate * 10));    // 10 seconds
    int32_t tcThreshold = fixexp2(MULDIV64(numFrames, LOG2E_Q26, _sampleRate * 1));     // 1 second
    
    // add bias at the fixed threshold
    updateHistogram(_threshFixed, (numFrames+7)/8);

    // leaky integrate into long-term histogram
    for (int i = 0; i < NHIST; i++) {
        _histogram[i] = _update[i] + MULQ31((_histogram[i] - _update[i]), tcHistogram);
    }

    // compute new threshold
    int index = partitionHistogram();
    int32_t threshold = ((NHIST-1) - index) << (LOG2_FRACBITS - 3);

    // smooth threshold update
    _threshAdapt = threshold + MULQ31((_threshAdapt - threshold), tcThreshold);
}

//
// Gate detector peakhold
//
int32_t GateImpl::peakhold(int32_t peak) {

    if (peak > _holdPeak) {

        // RELEASE
        // 3-stage progressive hold
        //
        // (_holdRel > 0x7fffffff) pure hold
        // (_holdRel > _holdMin) progressive hold
        // (_holdRel = _holdMin) release

        _holdRel -= _holdInc;                                   // update progressive hold
        int32_t tc = MIN(MAX(_holdRel, _holdMin), 0x7fffffff);  // saturate to [_holdMin, 0x7fffffff]
        peak += MULQ31((_holdPeak - peak), tc);                 // apply release

    } else {

        // ATTACK
        _holdRel = _holdMax;    // reset release
    }
    _holdPeak = peak;

    return peak;
}

//
// Gate hysteresis
// Implemented as detector hysteresis instead of high/low thresholds, to simplify adaptive threshold.
//
int32_t GateImpl::hysteresis(int32_t peak) {

    // by construction, cannot overflow/underflow
    assert((double)_hystOffset + (peak - _hystPeak) <= +(double)0x7fffffff);
    assert((double)_hystOffset + (peak - _hystPeak) >= -(double)0x80000000);

    // accumulate the offset, with saturation
    _hystOffset += peak - _hystPeak;
    _hystOffset = MIN(MAX(_hystOffset, 0), _hysteresis);

    _hystPeak = peak;
    peak -= _hystOffset;    // apply hysteresis

    assert(peak >= 0);
    return peak;
}

//
// Gate envelope processing
// zero attack, fixed release
//
int32_t GateImpl::envelope(int32_t attn) {

    if (attn > _attn) {
        attn += MULQ31((_attn - attn), _release);   // apply release
    }
    _attn = attn;

    return attn;
}

//
// Gate (mono)
//
template<int N>
class GateMono : public GateImpl {

    MonoDCBlock _dc;
    MaxFilter<N> _filter;
    MonoDelay<N, int32_t> _delay;

public:
    GateMono(int sampleRate) : GateImpl(sampleRate) {}

    // mono input/output (in-place is allowed)
    bool process(int16_t* input, int16_t* output, int numFrames) override;
    bool removeDC(int16_t* input, int16_t* output, int numFrames) override;
};

template<int N>
bool GateMono<N>::process(int16_t* input, int16_t* output, int numFrames) {

    clearHistogram();
    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x = input[n];

        // remove DC
        _dc.process(x);

        // peak detect
        int32_t peak = abs(x);

        // convert to log2 domain
        peak = fixlog2(peak);

        // apply peak hold
        peak = peakhold(peak);

        // count peak level
        updateHistogram(peak);

        // apply hysteresis
        peak = hysteresis(peak);

        // compute gate attenuation
        int32_t attn = (peak > _threshAdapt) ? 0x7fffffff : 0;    // hard-knee, 1:inf ratio

        // apply envelope
        attn = envelope(attn);

        // convert from log2 domain
        attn = fixexp2(attn);

        // lowpass filter
        attn = _filter.process(attn);

        // delay audio
        _delay.process(x);

        // apply gain
        x = MULQ31(x, attn);

        // store 16-bit output
        x = saturateQ30(x);
        output[n] = (int16_t)x;

        mask |= x;
    }

    // update adaptive threshold
    processHistogram(numFrames);
    return mask != 0;
}

template<int N>
bool GateMono<N>::removeDC(int16_t* input, int16_t* output, int numFrames) {

    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x = input[n];

        // remove DC
        _dc.process(x);

        // store 16-bit output
        x = saturateQ30(x);
        output[n] = (int16_t)x;

        mask |= x;
    }

    return mask != 0;
}

//
// Gate (stereo)
//
template<int N>
class GateStereo : public GateImpl {

    StereoDCBlock _dc;
    MaxFilter<N> _filter;
    StereoDelay<N, int32_t> _delay;

public:
    GateStereo(int sampleRate) : GateImpl(sampleRate) {}

    // interleaved stereo input/output (in-place is allowed)
    bool process(int16_t* input, int16_t* output, int numFrames) override;
    bool removeDC(int16_t* input, int16_t* output, int numFrames) override;
};

template<int N>
bool GateStereo<N>::process(int16_t* input, int16_t* output, int numFrames) {

    clearHistogram();
    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x0 = input[2*n+0];
        int32_t x1 = input[2*n+1];

        // remove DC
        _dc.process(x0, x1);

        // peak detect
        int32_t peak = MAX(abs(x0), abs(x1));

        // convert to log2 domain
        peak = fixlog2(peak);

        // apply peak hold
        peak = peakhold(peak);

        // count peak level
        updateHistogram(peak);

        // apply hysteresis
        peak = hysteresis(peak);

        // compute gate attenuation
        int32_t attn = (peak > _threshAdapt) ? 0x7fffffff : 0;    // hard-knee, 1:inf ratio

        // apply envelope
        attn = envelope(attn);

        // convert from log2 domain
        attn = fixexp2(attn);

        // lowpass filter
        attn = _filter.process(attn);

        // delay audio
        _delay.process(x0, x1);

        // apply gain
        x0 = MULQ31(x0, attn);
        x1 = MULQ31(x1, attn);

        // store 16-bit output
        x0 = saturateQ30(x0);
        x1 = saturateQ30(x1);
        output[2*n+0] = (int16_t)x0;
        output[2*n+1] = (int16_t)x1;

        mask |= (x0 | x1);
    }

    // update adaptive threshold
    processHistogram(numFrames);
    return mask != 0;
}

template<int N>
bool GateStereo<N>::removeDC(int16_t* input, int16_t* output, int numFrames) {

    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x0 = input[2*n+0];
        int32_t x1 = input[2*n+1];

        // remove DC
        _dc.process(x0, x1);

        // store 16-bit output
        x0 = saturateQ30(x0);
        x1 = saturateQ30(x1);
        output[2*n+0] = (int16_t)x0;
        output[2*n+1] = (int16_t)x1;

        mask |= (x0 | x1);
    }

    return mask != 0;
}

//
// Gate (quad)
//
template<int N>
class GateQuad : public GateImpl {

    QuadDCBlock _dc;
    MaxFilter<N> _filter;
    QuadDelay<N, int32_t> _delay;

public:
    GateQuad(int sampleRate) : GateImpl(sampleRate) {}

    // interleaved quad input/output (in-place is allowed)
    bool process(int16_t* input, int16_t* output, int numFrames) override;
    bool removeDC(int16_t* input, int16_t* output, int numFrames) override;
};

template<int N>
bool GateQuad<N>::process(int16_t* input, int16_t* output, int numFrames) {

    clearHistogram();
    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x0 = input[4*n+0];
        int32_t x1 = input[4*n+1];
        int32_t x2 = input[4*n+2];
        int32_t x3 = input[4*n+3];

        // remove DC
        _dc.process(x0, x1, x2, x3);

        // peak detect
        int32_t peak = MAX(MAX(abs(x0), abs(x1)), MAX(abs(x2), abs(x3)));

        // convert to log2 domain
        peak = fixlog2(peak);

        // apply peak hold
        peak = peakhold(peak);

        // count peak level
        updateHistogram(peak);

        // apply hysteresis
        peak = hysteresis(peak);

        // compute gate attenuation
        int32_t attn = (peak > _threshAdapt) ? 0x7fffffff : 0;    // hard-knee, 1:inf ratio

        // apply envelope
        attn = envelope(attn);

        // convert from log2 domain
        attn = fixexp2(attn);

        // lowpass filter
        attn = _filter.process(attn);

        // delay audio
        _delay.process(x0, x1, x2, x3);

        // apply gain
        x0 = MULQ31(x0, attn);
        x1 = MULQ31(x1, attn);
        x2 = MULQ31(x2, attn);
        x3 = MULQ31(x3, attn);

        // store 16-bit output
        x0 = saturateQ30(x0);
        x1 = saturateQ30(x1);
        x2 = saturateQ30(x2);
        x3 = saturateQ30(x3);
        output[4*n+0] = (int16_t)x0;
        output[4*n+1] = (int16_t)x1;
        output[4*n+2] = (int16_t)x2;
        output[4*n+3] = (int16_t)x3;

        mask |= (x0 | x1 | x2 | x3);
    }

    // update adaptive threshold
    processHistogram(numFrames);
    return mask != 0;
}

template<int N>
bool GateQuad<N>::removeDC(int16_t* input, int16_t* output, int numFrames) {

    int32_t mask = 0;

    for (int n = 0; n < numFrames; n++) {

        int32_t x0 = input[4*n+0];
        int32_t x1 = input[4*n+1];
        int32_t x2 = input[4*n+2];
        int32_t x3 = input[4*n+3];

        // remove DC
        _dc.process(x0, x1, x2, x3);

        // store 16-bit output
        x0 = saturateQ30(x0);
        x1 = saturateQ30(x1);
        x2 = saturateQ30(x2);
        x3 = saturateQ30(x3);
        output[4*n+0] = (int16_t)x0;
        output[4*n+1] = (int16_t)x1;
        output[4*n+2] = (int16_t)x2;
        output[4*n+3] = (int16_t)x3;

        mask |= (x0 | x1 | x2 | x3);
    }

    return mask != 0;
}

//
// Public API
//

AudioGate::AudioGate(int sampleRate, int numChannels) {

    if (numChannels == 1) {

        // ~3ms lookahead for all rates
        if (sampleRate < 16000) {
            _impl = new GateMono<32>(sampleRate);
        } else if (sampleRate < 32000) {
            _impl = new GateMono<64>(sampleRate);
        } else if (sampleRate < 64000) {
            _impl = new GateMono<128>(sampleRate);
        } else {
            _impl = new GateMono<256>(sampleRate);
        }

    } else if (numChannels == 2) {

        // ~3ms lookahead for all rates
        if (sampleRate < 16000) {
            _impl = new GateStereo<32>(sampleRate);
        } else if (sampleRate < 32000) {
            _impl = new GateStereo<64>(sampleRate);
        } else if (sampleRate < 64000) {
            _impl = new GateStereo<128>(sampleRate);
        } else {
            _impl = new GateStereo<256>(sampleRate);
        }

    } else if (numChannels == 4) {

        // ~3ms lookahead for all rates
        if (sampleRate < 16000) {
            _impl = new GateQuad<32>(sampleRate);
        } else if (sampleRate < 32000) {
            _impl = new GateQuad<64>(sampleRate);
        } else if (sampleRate < 64000) {
            _impl = new GateQuad<128>(sampleRate);
        } else {
            _impl = new GateQuad<256>(sampleRate);
        }

    } else {
        assert(0); // unsupported
    }
}

AudioGate::~AudioGate() {
    delete _impl;
}

bool AudioGate::render(int16_t* input, int16_t* output, int numFrames) {
    return _impl->process(input, output, numFrames);
}

bool AudioGate::removeDC(int16_t* input, int16_t* output, int numFrames) {
    return _impl->removeDC(input, output, numFrames);
}

void AudioGate::setThreshold(float threshold) {
    _impl->setThreshold(threshold);
}

void AudioGate::setRelease(float release) {
    _impl->setRelease(release);
}