diff --git a/interface/src/Application.cpp b/interface/src/Application.cpp index b9effa444b..381816e81f 100644 --- a/interface/src/Application.cpp +++ b/interface/src/Application.cpp @@ -240,25 +240,26 @@ class DeadlockWatchdogThread : public QThread { public: static const unsigned long HEARTBEAT_CHECK_INTERVAL_SECS = 1; static const unsigned long HEARTBEAT_UPDATE_INTERVAL_SECS = 1; - static const unsigned long MAX_HEARTBEAT_AGE_USECS = 15 * USECS_PER_SECOND; - + static const unsigned long HEARTBEAT_REPORT_INTERVAL_USECS = 5 * USECS_PER_SECOND; + static const unsigned long MAX_HEARTBEAT_AGE_USECS = 30 * USECS_PER_SECOND; + static const int WARNING_ELAPSED_HEARTBEAT = 500 * USECS_PER_MSEC; // warn if elapsed heartbeat average is large + static const int HEARTBEAT_SAMPLES = 100000; // ~5 seconds worth of samples + // Set the heartbeat on launch DeadlockWatchdogThread() { setObjectName("Deadlock Watchdog"); - QTimer* heartbeatTimer = new QTimer(); // Give the heartbeat an initial value - updateHeartbeat(); - connect(heartbeatTimer, &QTimer::timeout, [this] { - updateHeartbeat(); - }); - heartbeatTimer->start(HEARTBEAT_UPDATE_INTERVAL_SECS * MSECS_PER_SECOND); + _heartbeat = usecTimestampNow(); connect(qApp, &QCoreApplication::aboutToQuit, [this] { _quit = true; }); } void updateHeartbeat() { - _heartbeat = usecTimestampNow(); + auto now = usecTimestampNow(); + auto elapsed = now - _heartbeat; + _movingAverage.addSample(elapsed); + _heartbeat = now; } void deadlockDetectionCrash() { @@ -269,10 +270,52 @@ public: void run() override { while (!_quit) { QThread::sleep(HEARTBEAT_UPDATE_INTERVAL_SECS); -#ifdef NDEBUG auto now = usecTimestampNow(); - auto lastHeartbeatAge = now - _heartbeat; + + // in the unlikely event that now is less than _heartbeat, don't rollover and confuse ourselves + auto lastHeartbeatAge = (now > _heartbeat) ? now - _heartbeat : 0; + auto sinceLastReport = (now > _lastReport) ? now - _lastReport : 0; + auto elapsedMovingAverage = _movingAverage.getAverage(); + + if (elapsedMovingAverage > _maxElapsedAverage) { + qDebug() << "DEADLOCK WATCHDOG NEW maxElapsedAverage:" + << "lastHeartbeatAge:" << lastHeartbeatAge + << "elapsedMovingAverage:" << elapsedMovingAverage + << "maxElapsed:" << _maxElapsed + << "PREVIOUS maxElapsedAverage:" << _maxElapsedAverage + << "NEW maxElapsedAverage:" << elapsedMovingAverage + << "samples:" << _movingAverage.getSamples(); + _maxElapsedAverage = elapsedMovingAverage; + } + if (lastHeartbeatAge > _maxElapsed) { + qDebug() << "DEADLOCK WATCHDOG NEW maxElapsed:" + << "lastHeartbeatAge:" << lastHeartbeatAge + << "elapsedMovingAverage:" << elapsedMovingAverage + << "PREVIOUS maxElapsed:" << _maxElapsed + << "NEW maxElapsed:" << lastHeartbeatAge + << "maxElapsedAverage:" << _maxElapsedAverage + << "samples:" << _movingAverage.getSamples(); + _maxElapsed = lastHeartbeatAge; + } + if ((sinceLastReport > HEARTBEAT_REPORT_INTERVAL_USECS) || (elapsedMovingAverage > WARNING_ELAPSED_HEARTBEAT)) { + qDebug() << "DEADLOCK WATCHDOG STATUS -- lastHeartbeatAge:" << lastHeartbeatAge + << "elapsedMovingAverage:" << elapsedMovingAverage + << "maxElapsed:" << _maxElapsed + << "maxElapsedAverage:" << _maxElapsedAverage + << "samples:" << _movingAverage.getSamples(); + _lastReport = now; + } + +#ifdef NDEBUG if (lastHeartbeatAge > MAX_HEARTBEAT_AGE_USECS) { + qDebug() << "DEADLOCK DETECTED -- " + << "lastHeartbeatAge:" << lastHeartbeatAge + << "[ _heartbeat:" << _heartbeat + << "now:" << now << " ]" + << "elapsedMovingAverage:" << elapsedMovingAverage + << "maxElapsed:" << _maxElapsed + << "maxElapsedAverage:" << _maxElapsedAverage + << "samples:" << _movingAverage.getSamples(); deadlockDetectionCrash(); } #endif @@ -280,10 +323,19 @@ public: } static std::atomic _heartbeat; + static std::atomic _lastReport; + static std::atomic _maxElapsed; + static std::atomic _maxElapsedAverage; + static ThreadSafeMovingAverage _movingAverage; + bool _quit { false }; }; std::atomic DeadlockWatchdogThread::_heartbeat; +std::atomic DeadlockWatchdogThread::_lastReport; +std::atomic DeadlockWatchdogThread::_maxElapsed; +std::atomic DeadlockWatchdogThread::_maxElapsedAverage; +ThreadSafeMovingAverage DeadlockWatchdogThread::_movingAverage; #ifdef Q_OS_WIN class MyNativeEventFilter : public QAbstractNativeEventFilter { @@ -1381,6 +1433,8 @@ void Application::initializeUi() { void Application::paintGL() { + updateHeartbeat(); + // Some plugins process message events, potentially leading to // re-entering a paint event. don't allow further processing if this // happens @@ -2502,6 +2556,8 @@ static uint32_t _renderedFrameIndex { INVALID_FRAME }; void Application::idle(uint64_t now) { + updateHeartbeat(); + if (_aboutToQuit || _inPaint) { return; // bail early, nothing to do here. } diff --git a/libraries/shared/src/SimpleMovingAverage.h b/libraries/shared/src/SimpleMovingAverage.h index 53754ae241..296911ae3e 100644 --- a/libraries/shared/src/SimpleMovingAverage.h +++ b/libraries/shared/src/SimpleMovingAverage.h @@ -14,6 +14,7 @@ #ifndef hifi_SimpleMovingAverage_h #define hifi_SimpleMovingAverage_h +#include #include class SimpleMovingAverage { @@ -64,4 +65,45 @@ public: } }; +template class ThreadSafeMovingAverage { +public: + void clear() { + std::unique_lock lock(_lock); + _samples = 0; + } + + bool isAverageValid() const { + std::unique_lock lock(_lock); + return (_samples > 0); + } + + void addSample(T sample) { + std::unique_lock lock(_lock); + if (_samples > 0) { + _average = (sample * WEIGHTING) + (_average * ONE_MINUS_WEIGHTING); + } else { + _average = sample; + } + _samples++; + } + + T getAverage() const { + std::unique_lock lock(_lock); + return _average; + } + + size_t getSamples() const { + std::unique_lock lock(_lock); + return _samples; + } + +private: + const float WEIGHTING = 1.0f / (float)MAX_NUM_SAMPLES; + const float ONE_MINUS_WEIGHTING = 1.0f - WEIGHTING; + size_t _samples { 0 }; + T _average; + mutable std::mutex _lock; +}; + + #endif // hifi_SimpleMovingAverage_h