mirror of
https://github.com/JulianGro/overte.git
synced 2025-04-25 17:14:59 +02:00
Merge pull request #7457 from ZappoMan/reduceFalsePositiveDeadlocks
make the deadlock watchdog more generous
This commit is contained in:
commit
44c6f81c0f
2 changed files with 109 additions and 11 deletions
|
@ -240,25 +240,26 @@ class DeadlockWatchdogThread : public QThread {
|
|||
public:
|
||||
static const unsigned long HEARTBEAT_CHECK_INTERVAL_SECS = 1;
|
||||
static const unsigned long HEARTBEAT_UPDATE_INTERVAL_SECS = 1;
|
||||
static const unsigned long MAX_HEARTBEAT_AGE_USECS = 15 * USECS_PER_SECOND;
|
||||
|
||||
static const unsigned long HEARTBEAT_REPORT_INTERVAL_USECS = 5 * USECS_PER_SECOND;
|
||||
static const unsigned long MAX_HEARTBEAT_AGE_USECS = 30 * USECS_PER_SECOND;
|
||||
static const int WARNING_ELAPSED_HEARTBEAT = 500 * USECS_PER_MSEC; // warn if elapsed heartbeat average is large
|
||||
static const int HEARTBEAT_SAMPLES = 100000; // ~5 seconds worth of samples
|
||||
|
||||
// Set the heartbeat on launch
|
||||
DeadlockWatchdogThread() {
|
||||
setObjectName("Deadlock Watchdog");
|
||||
QTimer* heartbeatTimer = new QTimer();
|
||||
// Give the heartbeat an initial value
|
||||
updateHeartbeat();
|
||||
connect(heartbeatTimer, &QTimer::timeout, [this] {
|
||||
updateHeartbeat();
|
||||
});
|
||||
heartbeatTimer->start(HEARTBEAT_UPDATE_INTERVAL_SECS * MSECS_PER_SECOND);
|
||||
_heartbeat = usecTimestampNow();
|
||||
connect(qApp, &QCoreApplication::aboutToQuit, [this] {
|
||||
_quit = true;
|
||||
});
|
||||
}
|
||||
|
||||
void updateHeartbeat() {
|
||||
_heartbeat = usecTimestampNow();
|
||||
auto now = usecTimestampNow();
|
||||
auto elapsed = now - _heartbeat;
|
||||
_movingAverage.addSample(elapsed);
|
||||
_heartbeat = now;
|
||||
}
|
||||
|
||||
void deadlockDetectionCrash() {
|
||||
|
@ -269,10 +270,52 @@ public:
|
|||
void run() override {
|
||||
while (!_quit) {
|
||||
QThread::sleep(HEARTBEAT_UPDATE_INTERVAL_SECS);
|
||||
#ifdef NDEBUG
|
||||
auto now = usecTimestampNow();
|
||||
auto lastHeartbeatAge = now - _heartbeat;
|
||||
|
||||
// in the unlikely event that now is less than _heartbeat, don't rollover and confuse ourselves
|
||||
auto lastHeartbeatAge = (now > _heartbeat) ? now - _heartbeat : 0;
|
||||
auto sinceLastReport = (now > _lastReport) ? now - _lastReport : 0;
|
||||
auto elapsedMovingAverage = _movingAverage.getAverage();
|
||||
|
||||
if (elapsedMovingAverage > _maxElapsedAverage) {
|
||||
qDebug() << "DEADLOCK WATCHDOG NEW maxElapsedAverage:"
|
||||
<< "lastHeartbeatAge:" << lastHeartbeatAge
|
||||
<< "elapsedMovingAverage:" << elapsedMovingAverage
|
||||
<< "maxElapsed:" << _maxElapsed
|
||||
<< "PREVIOUS maxElapsedAverage:" << _maxElapsedAverage
|
||||
<< "NEW maxElapsedAverage:" << elapsedMovingAverage
|
||||
<< "samples:" << _movingAverage.getSamples();
|
||||
_maxElapsedAverage = elapsedMovingAverage;
|
||||
}
|
||||
if (lastHeartbeatAge > _maxElapsed) {
|
||||
qDebug() << "DEADLOCK WATCHDOG NEW maxElapsed:"
|
||||
<< "lastHeartbeatAge:" << lastHeartbeatAge
|
||||
<< "elapsedMovingAverage:" << elapsedMovingAverage
|
||||
<< "PREVIOUS maxElapsed:" << _maxElapsed
|
||||
<< "NEW maxElapsed:" << lastHeartbeatAge
|
||||
<< "maxElapsedAverage:" << _maxElapsedAverage
|
||||
<< "samples:" << _movingAverage.getSamples();
|
||||
_maxElapsed = lastHeartbeatAge;
|
||||
}
|
||||
if ((sinceLastReport > HEARTBEAT_REPORT_INTERVAL_USECS) || (elapsedMovingAverage > WARNING_ELAPSED_HEARTBEAT)) {
|
||||
qDebug() << "DEADLOCK WATCHDOG STATUS -- lastHeartbeatAge:" << lastHeartbeatAge
|
||||
<< "elapsedMovingAverage:" << elapsedMovingAverage
|
||||
<< "maxElapsed:" << _maxElapsed
|
||||
<< "maxElapsedAverage:" << _maxElapsedAverage
|
||||
<< "samples:" << _movingAverage.getSamples();
|
||||
_lastReport = now;
|
||||
}
|
||||
|
||||
#ifdef NDEBUG
|
||||
if (lastHeartbeatAge > MAX_HEARTBEAT_AGE_USECS) {
|
||||
qDebug() << "DEADLOCK DETECTED -- "
|
||||
<< "lastHeartbeatAge:" << lastHeartbeatAge
|
||||
<< "[ _heartbeat:" << _heartbeat
|
||||
<< "now:" << now << " ]"
|
||||
<< "elapsedMovingAverage:" << elapsedMovingAverage
|
||||
<< "maxElapsed:" << _maxElapsed
|
||||
<< "maxElapsedAverage:" << _maxElapsedAverage
|
||||
<< "samples:" << _movingAverage.getSamples();
|
||||
deadlockDetectionCrash();
|
||||
}
|
||||
#endif
|
||||
|
@ -280,10 +323,19 @@ public:
|
|||
}
|
||||
|
||||
static std::atomic<uint64_t> _heartbeat;
|
||||
static std::atomic<uint64_t> _lastReport;
|
||||
static std::atomic<uint64_t> _maxElapsed;
|
||||
static std::atomic<int> _maxElapsedAverage;
|
||||
static ThreadSafeMovingAverage<int, HEARTBEAT_SAMPLES> _movingAverage;
|
||||
|
||||
bool _quit { false };
|
||||
};
|
||||
|
||||
std::atomic<uint64_t> DeadlockWatchdogThread::_heartbeat;
|
||||
std::atomic<uint64_t> DeadlockWatchdogThread::_lastReport;
|
||||
std::atomic<uint64_t> DeadlockWatchdogThread::_maxElapsed;
|
||||
std::atomic<int> DeadlockWatchdogThread::_maxElapsedAverage;
|
||||
ThreadSafeMovingAverage<int, DeadlockWatchdogThread::HEARTBEAT_SAMPLES> DeadlockWatchdogThread::_movingAverage;
|
||||
|
||||
#ifdef Q_OS_WIN
|
||||
class MyNativeEventFilter : public QAbstractNativeEventFilter {
|
||||
|
@ -1381,6 +1433,8 @@ void Application::initializeUi() {
|
|||
|
||||
void Application::paintGL() {
|
||||
|
||||
updateHeartbeat();
|
||||
|
||||
// Some plugins process message events, potentially leading to
|
||||
// re-entering a paint event. don't allow further processing if this
|
||||
// happens
|
||||
|
@ -2502,6 +2556,8 @@ static uint32_t _renderedFrameIndex { INVALID_FRAME };
|
|||
|
||||
void Application::idle(uint64_t now) {
|
||||
|
||||
updateHeartbeat();
|
||||
|
||||
if (_aboutToQuit || _inPaint) {
|
||||
return; // bail early, nothing to do here.
|
||||
}
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
#ifndef hifi_SimpleMovingAverage_h
|
||||
#define hifi_SimpleMovingAverage_h
|
||||
|
||||
#include <mutex>
|
||||
#include <stdint.h>
|
||||
|
||||
class SimpleMovingAverage {
|
||||
|
@ -64,4 +65,45 @@ public:
|
|||
}
|
||||
};
|
||||
|
||||
template <class T, int MAX_NUM_SAMPLES> class ThreadSafeMovingAverage {
|
||||
public:
|
||||
void clear() {
|
||||
std::unique_lock<std::mutex> lock(_lock);
|
||||
_samples = 0;
|
||||
}
|
||||
|
||||
bool isAverageValid() const {
|
||||
std::unique_lock<std::mutex> lock(_lock);
|
||||
return (_samples > 0);
|
||||
}
|
||||
|
||||
void addSample(T sample) {
|
||||
std::unique_lock<std::mutex> lock(_lock);
|
||||
if (_samples > 0) {
|
||||
_average = (sample * WEIGHTING) + (_average * ONE_MINUS_WEIGHTING);
|
||||
} else {
|
||||
_average = sample;
|
||||
}
|
||||
_samples++;
|
||||
}
|
||||
|
||||
T getAverage() const {
|
||||
std::unique_lock<std::mutex> lock(_lock);
|
||||
return _average;
|
||||
}
|
||||
|
||||
size_t getSamples() const {
|
||||
std::unique_lock<std::mutex> lock(_lock);
|
||||
return _samples;
|
||||
}
|
||||
|
||||
private:
|
||||
const float WEIGHTING = 1.0f / (float)MAX_NUM_SAMPLES;
|
||||
const float ONE_MINUS_WEIGHTING = 1.0f - WEIGHTING;
|
||||
size_t _samples { 0 };
|
||||
T _average;
|
||||
mutable std::mutex _lock;
|
||||
};
|
||||
|
||||
|
||||
#endif // hifi_SimpleMovingAverage_h
|
||||
|
|
Loading…
Reference in a new issue