Merge pull request #7457 from ZappoMan/reduceFalsePositiveDeadlocks

make the deadlock watchdog more generous
This commit is contained in:
Philip Rosedale 2016-03-25 16:10:37 -07:00
commit 44c6f81c0f
2 changed files with 109 additions and 11 deletions

View file

@ -240,25 +240,26 @@ class DeadlockWatchdogThread : public QThread {
public:
static const unsigned long HEARTBEAT_CHECK_INTERVAL_SECS = 1;
static const unsigned long HEARTBEAT_UPDATE_INTERVAL_SECS = 1;
static const unsigned long MAX_HEARTBEAT_AGE_USECS = 15 * USECS_PER_SECOND;
static const unsigned long HEARTBEAT_REPORT_INTERVAL_USECS = 5 * USECS_PER_SECOND;
static const unsigned long MAX_HEARTBEAT_AGE_USECS = 30 * USECS_PER_SECOND;
static const int WARNING_ELAPSED_HEARTBEAT = 500 * USECS_PER_MSEC; // warn if elapsed heartbeat average is large
static const int HEARTBEAT_SAMPLES = 100000; // ~5 seconds worth of samples
// Set the heartbeat on launch
DeadlockWatchdogThread() {
setObjectName("Deadlock Watchdog");
QTimer* heartbeatTimer = new QTimer();
// Give the heartbeat an initial value
updateHeartbeat();
connect(heartbeatTimer, &QTimer::timeout, [this] {
updateHeartbeat();
});
heartbeatTimer->start(HEARTBEAT_UPDATE_INTERVAL_SECS * MSECS_PER_SECOND);
_heartbeat = usecTimestampNow();
connect(qApp, &QCoreApplication::aboutToQuit, [this] {
_quit = true;
});
}
void updateHeartbeat() {
_heartbeat = usecTimestampNow();
auto now = usecTimestampNow();
auto elapsed = now - _heartbeat;
_movingAverage.addSample(elapsed);
_heartbeat = now;
}
void deadlockDetectionCrash() {
@ -269,10 +270,52 @@ public:
void run() override {
while (!_quit) {
QThread::sleep(HEARTBEAT_UPDATE_INTERVAL_SECS);
#ifdef NDEBUG
auto now = usecTimestampNow();
auto lastHeartbeatAge = now - _heartbeat;
// in the unlikely event that now is less than _heartbeat, don't rollover and confuse ourselves
auto lastHeartbeatAge = (now > _heartbeat) ? now - _heartbeat : 0;
auto sinceLastReport = (now > _lastReport) ? now - _lastReport : 0;
auto elapsedMovingAverage = _movingAverage.getAverage();
if (elapsedMovingAverage > _maxElapsedAverage) {
qDebug() << "DEADLOCK WATCHDOG NEW maxElapsedAverage:"
<< "lastHeartbeatAge:" << lastHeartbeatAge
<< "elapsedMovingAverage:" << elapsedMovingAverage
<< "maxElapsed:" << _maxElapsed
<< "PREVIOUS maxElapsedAverage:" << _maxElapsedAverage
<< "NEW maxElapsedAverage:" << elapsedMovingAverage
<< "samples:" << _movingAverage.getSamples();
_maxElapsedAverage = elapsedMovingAverage;
}
if (lastHeartbeatAge > _maxElapsed) {
qDebug() << "DEADLOCK WATCHDOG NEW maxElapsed:"
<< "lastHeartbeatAge:" << lastHeartbeatAge
<< "elapsedMovingAverage:" << elapsedMovingAverage
<< "PREVIOUS maxElapsed:" << _maxElapsed
<< "NEW maxElapsed:" << lastHeartbeatAge
<< "maxElapsedAverage:" << _maxElapsedAverage
<< "samples:" << _movingAverage.getSamples();
_maxElapsed = lastHeartbeatAge;
}
if ((sinceLastReport > HEARTBEAT_REPORT_INTERVAL_USECS) || (elapsedMovingAverage > WARNING_ELAPSED_HEARTBEAT)) {
qDebug() << "DEADLOCK WATCHDOG STATUS -- lastHeartbeatAge:" << lastHeartbeatAge
<< "elapsedMovingAverage:" << elapsedMovingAverage
<< "maxElapsed:" << _maxElapsed
<< "maxElapsedAverage:" << _maxElapsedAverage
<< "samples:" << _movingAverage.getSamples();
_lastReport = now;
}
#ifdef NDEBUG
if (lastHeartbeatAge > MAX_HEARTBEAT_AGE_USECS) {
qDebug() << "DEADLOCK DETECTED -- "
<< "lastHeartbeatAge:" << lastHeartbeatAge
<< "[ _heartbeat:" << _heartbeat
<< "now:" << now << " ]"
<< "elapsedMovingAverage:" << elapsedMovingAverage
<< "maxElapsed:" << _maxElapsed
<< "maxElapsedAverage:" << _maxElapsedAverage
<< "samples:" << _movingAverage.getSamples();
deadlockDetectionCrash();
}
#endif
@ -280,10 +323,19 @@ public:
}
static std::atomic<uint64_t> _heartbeat;
static std::atomic<uint64_t> _lastReport;
static std::atomic<uint64_t> _maxElapsed;
static std::atomic<int> _maxElapsedAverage;
static ThreadSafeMovingAverage<int, HEARTBEAT_SAMPLES> _movingAverage;
bool _quit { false };
};
std::atomic<uint64_t> DeadlockWatchdogThread::_heartbeat;
std::atomic<uint64_t> DeadlockWatchdogThread::_lastReport;
std::atomic<uint64_t> DeadlockWatchdogThread::_maxElapsed;
std::atomic<int> DeadlockWatchdogThread::_maxElapsedAverage;
ThreadSafeMovingAverage<int, DeadlockWatchdogThread::HEARTBEAT_SAMPLES> DeadlockWatchdogThread::_movingAverage;
#ifdef Q_OS_WIN
class MyNativeEventFilter : public QAbstractNativeEventFilter {
@ -1381,6 +1433,8 @@ void Application::initializeUi() {
void Application::paintGL() {
updateHeartbeat();
// Some plugins process message events, potentially leading to
// re-entering a paint event. don't allow further processing if this
// happens
@ -2502,6 +2556,8 @@ static uint32_t _renderedFrameIndex { INVALID_FRAME };
void Application::idle(uint64_t now) {
updateHeartbeat();
if (_aboutToQuit || _inPaint) {
return; // bail early, nothing to do here.
}

View file

@ -14,6 +14,7 @@
#ifndef hifi_SimpleMovingAverage_h
#define hifi_SimpleMovingAverage_h
#include <mutex>
#include <stdint.h>
class SimpleMovingAverage {
@ -64,4 +65,45 @@ public:
}
};
template <class T, int MAX_NUM_SAMPLES> class ThreadSafeMovingAverage {
public:
void clear() {
std::unique_lock<std::mutex> lock(_lock);
_samples = 0;
}
bool isAverageValid() const {
std::unique_lock<std::mutex> lock(_lock);
return (_samples > 0);
}
void addSample(T sample) {
std::unique_lock<std::mutex> lock(_lock);
if (_samples > 0) {
_average = (sample * WEIGHTING) + (_average * ONE_MINUS_WEIGHTING);
} else {
_average = sample;
}
_samples++;
}
T getAverage() const {
std::unique_lock<std::mutex> lock(_lock);
return _average;
}
size_t getSamples() const {
std::unique_lock<std::mutex> lock(_lock);
return _samples;
}
private:
const float WEIGHTING = 1.0f / (float)MAX_NUM_SAMPLES;
const float ONE_MINUS_WEIGHTING = 1.0f - WEIGHTING;
size_t _samples { 0 };
T _average;
mutable std::mutex _lock;
};
#endif // hifi_SimpleMovingAverage_h