The "Holy Bible" for embedded engineers
Watchdog timers are critical safety mechanisms in embedded systems that monitor system health and automatically trigger recovery actions when the system becomes unresponsive or enters an error state. This guide covers watchdog timer implementation, system recovery strategies, and best practices for building robust embedded systems.
Watchdog timers are hardware or software timers that:
Watchdog Operation Cycle:
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ System │───▶│ Watchdog │───▶│ Recovery │
│ Running │ │ Timer │ │ Action │
└─────────────┘ └─────────────┘ └─────────────┘
│ │ │
▼ ▼ ▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Kick │ │ Countdown │ │ System │
│ Watchdog │ │ Timer │ │ Restart │
└─────────────┘ └─────────────┘ └─────────────┘
System Recovery Hierarchy
┌─────────────────────────────────────────────────────────────┐
│ Level 1: Soft Reset │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Restart application tasks │ │
│ │ Reinitialize peripherals │ │
│ │ Preserve critical data │ │
│ └─────────────────────────────────────────────────────────┘ │
├─────────────────────────────────────────────────────────────┤
│ Level 2: Warm Reset │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Restart RTOS and tasks │ │
│ │ Reinitialize hardware │ │
│ │ Load configuration from non-volatile memory │ │
│ └─────────────────────────────────────────────────────────┘ │
├─────────────────────────────────────────────────────────────┤
│ Level 3: Cold Reset │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Complete system restart │ │
│ │ Full hardware initialization │ │
│ │ Load firmware from bootloader │ │
│ └─────────────────────────────────────────────────────────┘ │
├─────────────────────────────────────────────────────────────┤
│ Level 4: Safe Mode │
│ ┌─────────────────────────────────────────────────────────┐ │
│ │ Minimal functionality │ │
│ │ Error reporting and diagnostics │ │
│ │ Manual intervention required │ │
│ └─────────────────────────────────────────────────────────┘ │
└─────────────────────────────────────────────────────────────┘
Windowed Watchdog Timing
┌─────────────────────────────────────────────────────────────┐
│ Timeout Period (e.g., 1000ms) │
│ ┌─────────┬─────────────┬─────────┬─────────────────────┐ │
│ │ 0ms │ 200ms │ 800ms │ 1000ms │ │
│ │ │ Window │ Window │ │ │
│ │ Invalid │ Start │ End │ Invalid │ │
│ │ Kick │ │ │ Kick │ │
│ └─────────┴─────────────┴─────────┴─────────────────────┘ │
│ │ │ │ │
│ ▼ ▼ ▼ │
│ Early Kick Valid Kick Late Kick │
│ (Violation) (Success) (Violation) │
└─────────────────────────────────────────────────────────────┘
Error Detection and Recovery Flow
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Error │───▶│ Error │───▶│ Recovery │
│ Detection │ │ Analysis │ │ Strategy │
└─────────────┘ └─────────────┘ └─────────────┘
│ │ │
│ │ │
▼ ▼ ▼
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
│ Watchdog │ │ Error │ │ Execute │
│ Timeout │ │ Logging │ │ Recovery │
└─────────────┘ └─────────────┘ └─────────────┘
│
▼
┌─────────────┐
│ Verify │
│ Recovery │
└─────────────┘
Hardware watchdogs are implemented in silicon and provide the highest reliability:
// Hardware watchdog configuration
typedef struct {
uint32_t timeout_ms; // Watchdog timeout in milliseconds
uint32_t window_start_ms; // Window start time (for windowed mode)
uint32_t window_end_ms; // Window end time (for windowed mode)
bool window_mode_enabled; // Enable windowed operation
bool debug_mode_enabled; // Enable debug mode (stops during debug)
uint32_t prescaler; // Clock prescaler value
} hw_watchdog_config_t;
// Hardware watchdog registers (example for STM32)
#define IWDG_KR_KEY_RELOAD 0xAAAA
#define IWDG_KR_KEY_ENABLE 0xCCCC
#define IWDG_KR_KEY_WRITE_ACCESS 0x5555
// Hardware watchdog timeout calculation
uint32_t calculate_hw_watchdog_timeout(uint32_t prescaler, uint32_t reload_value) {
// IWDG clock = LSI / 4 = 40kHz / 4 = 10kHz
uint32_t iwdg_clock = 10000; // 10kHz
// Timeout = (reload_value + 1) * prescaler / iwdg_clock
return ((reload_value + 1) * prescaler * 1000) / iwdg_clock;
}
Software watchdogs provide flexibility and can monitor specific application tasks:
// Software watchdog structure
typedef struct {
uint32_t task_id; // Task identifier
uint32_t last_kick_time; // Last kick timestamp
uint32_t timeout_ms; // Task timeout
bool enabled; // Watchdog enabled flag
void (*recovery_callback)(void); // Recovery function pointer
} sw_watchdog_t;
// Software watchdog array
#define MAX_SW_WATCHDOGS 16
sw_watchdog_t sw_watchdogs[MAX_SW_WATCHDOGS];
uint32_t sw_watchdog_count = 0;
// Register software watchdog
int register_sw_watchdog(uint32_t task_id, uint32_t timeout_ms,
void (*recovery_callback)(void)) {
if (sw_watchdog_count >= MAX_SW_WATCHDOGS) {
return -1; // No more slots available
}
sw_watchdogs[sw_watchdog_count].task_id = task_id;
sw_watchdogs[sw_watchdog_count].timeout_ms = timeout_ms;
sw_watchdogs[sw_watchdog_count].recovery_callback = recovery_callback;
sw_watchdogs[sw_watchdog_count].enabled = true;
sw_watchdogs[sw_watchdog_count].last_kick_time = get_system_time_ms();
sw_watchdog_count++;
return 0;
}
Windowed watchdogs prevent early kicks and ensure proper timing:
// Windowed watchdog configuration
typedef struct {
uint32_t window_start_ms; // Start of kick window
uint32_t window_end_ms; // End of kick window
uint32_t total_timeout_ms; // Total watchdog timeout
bool early_kick_detected; // Early kick detection flag
uint32_t early_kick_count; // Count of early kicks
} windowed_watchdog_t;
// Check if kick is within valid window
bool is_kick_within_window(windowed_watchdog_t *wdog, uint32_t current_time) {
uint32_t time_since_start = current_time % wdog->total_timeout_ms;
if (time_since_start < wdog->window_start_ms) {
// Too early - kick before window opens
wdog->early_kick_detected = true;
wdog->early_kick_count++;
return false;
} else if (time_since_start > wdog->window_end_ms) {
// Too late - window has closed
return false;
}
return true;
}
// Initialize hardware watchdog
int init_hardware_watchdog(hw_watchdog_config_t *config) {
// 1. Enable write access to IWDG registers
IWDG->KR = IWDG_KR_KEY_WRITE_ACCESS;
// 2. Configure prescaler
uint32_t prescaler_value;
switch (config->prescaler) {
case 4: prescaler_value = IWDG_PR_PR_0; break; // /4
case 8: prescaler_value = IWDG_PR_PR_1; break; // /8
case 16: prescaler_value = IWDG_PR_PR_0 | IWDG_PR_PR_1; break; // /16
case 32: prescaler_value = IWDG_PR_PR_2; break; // /32
case 64: prescaler_value = IWDG_PR_PR_0 | IWDG_PR_PR_2; break; // /64
case 128: prescaler_value = IWDG_PR_PR_1 | IWDG_PR_PR_2; break; // /128
case 256: prescaler_value = IWDG_PR_PR_0 | IWDG_PR_PR_1 | IWDG_PR_PR_2; break; // /256
default: return -1; // Invalid prescaler
}
IWDG->PR = prescaler_value;
// 3. Calculate reload value for desired timeout
uint32_t reload_value = (config->timeout_ms * 10000) / (config->prescaler * 1000) - 1;
IWDG->RLR = reload_value;
// 4. Enable watchdog
IWDG->KR = IWDG_KR_KEY_ENABLE;
return 0;
}
// Kick hardware watchdog
void kick_hardware_watchdog(void) {
IWDG->KR = IWDG_KR_KEY_RELOAD;
}
// Software watchdog task
void software_watchdog_task(void *pvParameters) {
uint32_t current_time;
uint32_t elapsed_time;
while (1) {
current_time = get_system_time_ms();
// Check all registered watchdogs
for (uint32_t i = 0; i < sw_watchdog_count; i++) {
if (sw_watchdogs[i].enabled) {
elapsed_time = current_time - sw_watchdogs[i].last_kick_time;
if (elapsed_time > sw_watchdogs[i].timeout_ms) {
// Watchdog timeout - trigger recovery
log_error("Software watchdog timeout for task %d",
sw_watchdogs[i].task_id);
if (sw_watchdogs[i].recovery_callback != NULL) {
sw_watchdogs[i].recovery_callback();
}
// Reset kick time
sw_watchdogs[i].last_kick_time = current_time;
}
}
}
// Check every 100ms
vTaskDelay(pdMS_TO_TICKS(100));
}
}
// Kick software watchdog
void kick_software_watchdog(uint32_t task_id) {
for (uint32_t i = 0; i < sw_watchdog_count; i++) {
if (sw_watchdogs[i].task_id == task_id && sw_watchdogs[i].enabled) {
sw_watchdogs[i].last_kick_time = get_system_time_ms();
break;
}
}
}
// Multi-level watchdog system
typedef struct {
hw_watchdog_t hw_watchdog; // Hardware watchdog (last resort)
sw_watchdog_t app_watchdog; // Application-level watchdog
sw_watchdog_t task_watchdog; // Task-level watchdog
sw_watchdog_t comm_watchdog; // Communication watchdog
uint32_t recovery_level; // Current recovery level
uint32_t failure_count; // Count of consecutive failures
} multi_level_watchdog_t;
// Initialize multi-level watchdog system
int init_multi_level_watchdog(multi_level_watchdog_t *mlwd) {
// 1. Initialize hardware watchdog (longest timeout)
hw_watchdog_config_t hw_config = {
.timeout_ms = 5000, // 5 second timeout
.window_mode_enabled = false,
.debug_mode_enabled = true
};
if (init_hardware_watchdog(&hw_config) != 0) {
return -1;
}
// 2. Initialize application watchdog (medium timeout)
if (register_sw_watchdog(APP_TASK_ID, 2000, app_recovery_callback) != 0) {
return -1;
}
// 3. Initialize task watchdog (short timeout)
if (register_sw_watchdog(TASK_MONITOR_ID, 500, task_recovery_callback) != 0) {
return -1;
}
// 4. Initialize communication watchdog
if (register_sw_watchdog(COMM_TASK_ID, 1000, comm_recovery_callback) != 0) {
return -1;
}
mlwd->recovery_level = 0;
mlwd->failure_count = 0;
return 0;
}
// Recovery level definitions
typedef enum {
RECOVERY_LEVEL_NONE = 0,
RECOVERY_LEVEL_SOFT_RESET,
RECOVERY_LEVEL_WARM_RESET,
RECOVERY_LEVEL_COLD_RESET,
RECOVERY_LEVEL_SAFE_MODE,
RECOVERY_LEVEL_FACTORY_RESET
} recovery_level_t;
// Recovery level configuration
typedef struct {
recovery_level_t level; // Recovery level
uint32_t max_attempts; // Maximum attempts at this level
uint32_t attempt_count; // Current attempt count
uint32_t cooldown_period_ms; // Cooldown period between attempts
uint32_t last_attempt_time; // Timestamp of last attempt
bool enabled; // Level enabled flag
} recovery_level_config_t;
// Recovery level manager
typedef struct {
recovery_level_config_t levels[RECOVERY_LEVEL_FACTORY_RESET + 1];
recovery_level_t current_level; // Current active level
uint32_t total_failure_count; // Total failures across all levels
bool recovery_in_progress; // Recovery operation status
} recovery_manager_t;
// Initialize recovery manager
int init_recovery_manager(recovery_manager_t *manager) {
// Configure each recovery level
manager->levels[RECOVERY_LEVEL_SOFT_RESET] = (recovery_level_config_t){
.level = RECOVERY_LEVEL_SOFT_RESET,
.max_attempts = 3,
.cooldown_period_ms = 1000,
.enabled = true
};
manager->levels[RECOVERY_LEVEL_WARM_RESET] = (recovery_level_config_t){
.level = RECOVERY_LEVEL_WARM_RESET,
.max_attempts = 2,
.cooldown_period_ms = 5000,
.enabled = true
};
manager->levels[RECOVERY_LEVEL_COLD_RESET] = (recovery_level_config_t){
.level = RECOVERY_LEVEL_COLD_RESET,
.max_attempts = 1,
.cooldown_period_ms = 30000,
.enabled = true
};
manager->levels[RECOVERY_LEVEL_SAFE_MODE] = (recovery_level_config_t){
.level = RECOVERY_LEVEL_SAFE_MODE,
.max_attempts = 1,
.cooldown_period_ms = 0,
.enabled = true
};
manager->current_level = RECOVERY_LEVEL_NONE;
manager->total_failure_count = 0;
manager->recovery_in_progress = false;
return 0;
}
// Recovery action functions
int perform_soft_reset(void) {
log_info("Performing soft reset");
// 1. Stop all application tasks
vTaskSuspendAll();
// 2. Reinitialize critical peripherals
if (reinit_critical_peripherals() != 0) {
log_error("Critical peripheral reinitialization failed");
return -1;
}
// 3. Restart application tasks
xTaskResumeAll();
// 4. Reset watchdog timers
reset_all_watchdogs();
log_info("Soft reset completed");
return 0;
}
int perform_warm_reset(void) {
log_info("Performing warm reset");
// 1. Save critical data to non-volatile memory
if (save_critical_data() != 0) {
log_error("Failed to save critical data");
return -1;
}
// 2. Stop RTOS
vTaskEndScheduler();
// 3. Reinitialize hardware
if (reinit_hardware() != 0) {
log_error("Hardware reinitialization failed");
return -1;
}
// 4. Restart RTOS
if (restart_rtos() != 0) {
log_error("RTOS restart failed");
return -1;
}
log_info("Warm reset completed");
return 0;
}
int perform_cold_reset(void) {
log_info("Performing cold reset");
// 1. Save diagnostic information
save_diagnostic_info();
// 2. Perform complete system reset
NVIC_SystemReset();
// This function should never return
return 0;
}
// Error severity levels
typedef enum {
ERROR_SEVERITY_LOW = 0,
ERROR_SEVERITY_MEDIUM,
ERROR_SEVERITY_HIGH,
ERROR_SEVERITY_CRITICAL,
ERROR_SEVERITY_FATAL
} error_severity_t;
// Error categories
typedef enum {
ERROR_CATEGORY_HARDWARE = 0,
ERROR_CATEGORY_SOFTWARE,
ERROR_CATEGORY_COMMUNICATION,
ERROR_CATEGORY_POWER,
ERROR_CATEGORY_TIMING,
ERROR_CATEGORY_MEMORY
} error_category_t;
// Error information structure
typedef struct {
uint32_t error_id; // Unique error identifier
error_severity_t severity; // Error severity level
error_category_t category; // Error category
uint32_t timestamp; // Error occurrence time
uint32_t task_id; // Task that detected the error
uint32_t error_code; // Specific error code
char description[64]; // Error description
uint32_t context_data[4]; // Additional context data
} error_info_t;
// Error handler function type
typedef void (*error_handler_t)(error_info_t *error);
// Error handler registration
typedef struct {
error_category_t category; // Category to handle
error_severity_t min_severity; // Minimum severity to handle
error_handler_t handler; // Handler function
} error_handler_registration_t;
#define MAX_ERROR_HANDLERS 16
error_handler_registration_t error_handlers[MAX_ERROR_HANDLERS];
uint32_t error_handler_count = 0;
// Register error handler
int register_error_handler(error_category_t category, error_severity_t min_severity,
error_handler_t handler) {
if (error_handler_count >= MAX_ERROR_HANDLERS) {
return -1;
}
error_handlers[error_handler_count].category = category;
error_handlers[error_handler_count].min_severity = min_severity;
error_handlers[error_handler_count].handler = handler;
error_handler_count++;
return 0;
}
// Memory corruption detection
typedef struct {
uint32_t start_magic; // Start magic number
uint32_t end_magic; // End magic number
uint32_t checksum; // Data checksum
uint32_t size; // Data size
} memory_guard_t;
// Check memory integrity
bool check_memory_integrity(void *data, uint32_t size) {
memory_guard_t *guard = (memory_guard_t*)data;
// Check magic numbers
if (guard->start_magic != MEMORY_GUARD_MAGIC_START ||
guard->end_magic != MEMORY_GUARD_MAGIC_END) {
return false;
}
// Check size
if (guard->size != size) {
return false;
}
// Check checksum
uint32_t calculated_checksum = calculate_checksum(
(uint8_t*)data + sizeof(memory_guard_t),
size - sizeof(memory_guard_t)
);
return (calculated_checksum == guard->checksum);
}
// Stack overflow detection
bool check_stack_overflow(void) {
extern uint32_t _estack; // Stack end address
extern uint32_t __stack_start__; // Stack start address
uint32_t current_sp = __get_MSP(); // Current stack pointer
// Check if stack pointer is within valid range
if (current_sp < (uint32_t)&__stack_start__ ||
current_sp > (uint32_t)&_estack) {
return true; // Stack overflow detected
}
// Check stack usage pattern
uint32_t *stack_ptr = (uint32_t*)current_sp;
for (int i = 0; i < 16; i++) {
if (stack_ptr[i] == STACK_PATTERN) {
return true; // Stack overflow detected
}
}
return false;
}
// Automatic recovery manager
typedef struct {
uint32_t recovery_attempts; // Number of recovery attempts
uint32_t max_recovery_attempts; // Maximum recovery attempts
uint32_t recovery_cooldown_ms; // Cooldown period between attempts
uint32_t last_recovery_time; // Timestamp of last recovery
bool automatic_recovery_enabled; // Automatic recovery flag
} auto_recovery_manager_t;
// Perform automatic recovery
int perform_automatic_recovery(auto_recovery_manager_t *manager,
error_info_t *error) {
uint32_t current_time = get_system_time_ms();
// Check if enough time has passed since last recovery
if (current_time - manager->last_recovery_time < manager->recovery_cooldown_ms) {
log_warning("Recovery cooldown period not expired");
return -1;
}
// Check if maximum attempts exceeded
if (manager->recovery_attempts >= manager->max_recovery_attempts) {
log_error("Maximum recovery attempts exceeded");
return -1;
}
// Perform recovery based on error type
int result = 0;
switch (error->category) {
case ERROR_CATEGORY_SOFTWARE:
result = perform_soft_reset();
break;
case ERROR_CATEGORY_COMMUNICATION:
result = restart_communication_tasks();
break;
case ERROR_CATEGORY_MEMORY:
result = perform_memory_recovery();
break;
default:
result = perform_soft_reset();
break;
}
if (result == 0) {
manager->recovery_attempts++;
manager->last_recovery_time = current_time;
log_info("Automatic recovery successful (attempt %d/%d)",
manager->recovery_attempts, manager->max_recovery_attempts);
}
return result;
}
// Manual recovery commands
typedef enum {
RECOVERY_CMD_SOFT_RESET,
RECOVERY_CMD_WARM_RESET,
RECOVERY_CMD_COLD_RESET,
RECOVERY_CMD_SAFE_MODE,
RECOVERY_CMD_FACTORY_RESET,
RECOVERY_CMD_GET_STATUS,
RECOVERY_CMD_CLEAR_ERRORS
} recovery_command_t;
// Manual recovery handler
int handle_manual_recovery_command(recovery_command_t command) {
switch (command) {
case RECOVERY_CMD_SOFT_RESET:
return perform_soft_reset();
case RECOVERY_CMD_WARM_RESET:
return perform_warm_reset();
case RECOVERY_CMD_COLD_RESET:
return perform_cold_reset();
case RECOVERY_CMD_SAFE_MODE:
return enter_safe_mode();
case RECOVERY_CMD_FACTORY_RESET:
return perform_factory_reset();
case RECOVERY_CMD_GET_STATUS:
return get_recovery_status();
case RECOVERY_CMD_CLEAR_ERRORS:
return clear_error_logs();
default:
return -1;
}
}
// Complete watchdog and recovery system
typedef struct {
multi_level_watchdog_t watchdog; // Multi-level watchdog system
recovery_manager_t recovery; // Recovery level manager
auto_recovery_manager_t auto_recovery; // Automatic recovery manager
error_logger_t error_logger; // Error logging system
bool system_healthy; // System health status
} watchdog_recovery_system_t;
// Initialize complete system
int init_watchdog_recovery_system(watchdog_recovery_system_t *system) {
int result = 0;
// 1. Initialize multi-level watchdog
result = init_multi_level_watchdog(&system->watchdog);
if (result != 0) {
log_error("Failed to initialize multi-level watchdog");
return result;
}
// 2. Initialize recovery manager
result = init_recovery_manager(&system->recovery);
if (result != 0) {
log_error("Failed to initialize recovery manager");
return result;
}
// 3. Initialize automatic recovery
system->auto_recovery.recovery_attempts = 0;
system->auto_recovery.max_recovery_attempts = 5;
system->auto_recovery.recovery_cooldown_ms = 10000;
system->auto_recovery.automatic_recovery_enabled = true;
// 4. Initialize error logger
result = init_error_logger(&system->error_logger);
if (result != 0) {
log_error("Failed to initialize error logger");
return result;
}
system->system_healthy = true;
log_info("Watchdog and recovery system initialized successfully");
return 0;
}
// Main watchdog monitoring task
void watchdog_monitoring_task(void *pvParameters) {
watchdog_recovery_system_t *system = (watchdog_recovery_system_t*)pvParameters;
while (1) {
// 1. Check system health
if (!check_system_health()) {
system->system_healthy = false;
// 2. Log error
error_info_t error = {
.error_id = generate_error_id(),
.severity = ERROR_SEVERITY_HIGH,
.category = ERROR_CATEGORY_SOFTWARE,
.timestamp = get_system_time_ms(),
.task_id = WATCHDOG_TASK_ID,
.error_code = ERROR_CODE_SYSTEM_UNHEALTHY,
.description = "System health check failed"
};
log_error(&system->error_logger, &error);
// 3. Attempt automatic recovery
if (system->auto_recovery.automatic_recovery_enabled) {
perform_automatic_recovery(&system->auto_recovery, &error);
}
} else {
system->system_healthy = true;
}
// 4. Kick hardware watchdog
kick_hardware_watchdog();
// 5. Check every 100ms
vTaskDelay(pdMS_TO_TICKS(100));
}
}
Watchdog timers and system recovery mechanisms are essential for building robust embedded systems. A well-designed watchdog and recovery system provides:
The key to successful watchdog and recovery implementation lies in:
By following these principles and implementing the techniques discussed in this guide, developers can create robust, reliable, and maintainable embedded systems that automatically recover from failures and provide excellent user experience.