devmatrix-scripts/infrastructure/health-monitor.sh

153 lines
4.1 KiB
Bash
Executable File

#!/bin/bash
#
# Health Monitor for Mission Control
#
# Runs every minute via cron.
# Source: https://git.lemonlink.eu/devmatrix/devmatrix-scripts

# Absolute path of the directory that holds this script.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

# --- Probing -----------------------------------------------------------------
HEALTH_URL='http://localhost:3000/api/health'
MAX_RETRIES=3 # maximum probe attempts per health check
RETRY_DELAY=5 # seconds to wait between attempts

# --- Logging and alert throttling --------------------------------------------
LOG_FILE='/var/log/mission-control/health-monitor.log'
LAST_ALERT_FILE='/tmp/mission-control-last-alert'
ALERT_COOLDOWN=300 # 5 minutes between alerts
# Logging
#
# Print a timestamped message to stdout and append it to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in the log file;
#            a warning on stderr when the log file cannot be written
# Returns:   0 always - logging is best-effort and never fails the caller
log() {
  local line log_dir
  line="[$(date '+%Y-%m-%d %H:%M:%S')] ${1-}"

  # Emit to stdout first so the message is not lost if the file is unwritable.
  printf '%s\n' "$line"

  # Appending cannot create missing parent directories, so make sure the
  # directory exists before writing (e.g. on a fresh install).
  log_dir=$(dirname -- "$LOG_FILE")
  [[ -d "$log_dir" ]] || mkdir -p -- "$log_dir" 2>/dev/null

  # The braces put the redirection failure itself under 2>/dev/null, so only
  # our own one-line warning is shown.
  if ! { printf '%s\n' "$line" >> "$LOG_FILE"; } 2>/dev/null; then
    printf 'log: cannot write to %s\n' "$LOG_FILE" >&2
  fi
  return 0
}
# Check if we should send alert (cooldown)
#
# Decide whether enough time has passed since the last recorded alert.
# Globals:   LAST_ALERT_FILE (read), ALERT_COOLDOWN (read)
# Arguments: none
# Returns:   0 if an alert may be sent now, 1 if the cooldown is still active
should_alert() {
  # No readable record of a previous alert: nothing to throttle.
  if [[ ! -f "$LAST_ALERT_FILE" || ! -r "$LAST_ALERT_FILE" ]]; then
    return 0
  fi

  local last_alert=''
  # Only the first line matters; read returns non-zero on an empty file or a
  # missing trailing newline, which is fine here.
  IFS= read -r last_alert < "$LAST_ALERT_FILE" || true

  # The file sits in a shared directory, so treat its contents as untrusted.
  # Bash evaluates variable contents inside $(( )) as expressions, so an empty
  # or crafted value would abort the function or even run commands.
  # Anything that is not a plain non-negative integer counts as "no record".
  if [[ ! "$last_alert" =~ ^[0-9]+$ ]]; then
    return 0
  fi

  local now elapsed
  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_alert ))

  # A negative value means the recorded time is in the future (clock stepped
  # back); do not let that suppress alerts indefinitely.
  if (( elapsed >= 0 && elapsed < ALERT_COOLDOWN )); then
    return 1 # Don't alert yet
  fi
  return 0 # Can alert
}
# Record alert time
#
# Store the current Unix time (seconds) as the moment of the latest alert.
# Globals:   LAST_ALERT_FILE (read; the file it names is overwritten)
# Arguments: none
record_alert() {
  local now
  now=$(date +%s)
  printf '%s\n' "$now" > "$LAST_ALERT_FILE"
}
# Send alert
#
# Deliver an alert through the log, Telegram (if configured) and a desktop
# notification (if notify-send exists), unless the cooldown is active.
# Globals:   HOME (read) - location of .telegram_bot_token / .telegram_chat_id
# Arguments: $1 - alert message
# Outputs:   log lines via log(); Telegram and desktop notifications are
#            started in the background and their results are not checked
# Returns:   status of the last command run (normally 0)
#
# NOTE(review): the cooldown applies to every message, so an alert sent shortly
# after a previous one (for example a recovery notice right after a DOWN
# notice) is logged as skipped rather than delivered.
send_alert() {
  local message="${1-}"

  if ! should_alert; then
    log "Alert cooldown active, skipping notification"
    return
  fi
  record_alert

  # Log to file
  log "ALERT: $message"

  # Telegram notification
  local token_file="$HOME/.telegram_bot_token"
  local chat_file="$HOME/.telegram_chat_id"
  if [[ -f "$token_file" ]]; then
    local bot_token='' chat_id='' text=''
    bot_token=$(cat "$token_file" 2>/dev/null)
    chat_id=$(cat "$chat_file" 2>/dev/null)
    if [[ -n "$bot_token" && -n "$chat_id" ]]; then
      # Build the text with real newlines and let curl URL-encode it, so that
      # characters such as '&', '+' or '%' in the message or the date cannot
      # corrupt the form body.
      printf -v text '🚨 MISSION CONTROL ALERT\n\n%s\n\nTime: %s' \
        "$message" "$(date)"
      # --max-time keeps a stalled connection from leaving a background curl
      # process behind on every run.
      curl -s --max-time 15 -X POST \
        "https://api.telegram.org/bot${bot_token}/sendMessage" \
        --data-urlencode "chat_id=${chat_id}" \
        --data-urlencode "text=${text}" \
        > /dev/null 2>&1 &
    fi
  fi

  # System notification
  if command -v notify-send > /dev/null 2>&1; then
    DISPLAY=:0 notify-send -u critical "Mission Control Alert" "$message" 2>/dev/null &
  fi
}
# Health check
#
# Probe $HEALTH_URL up to $MAX_RETRIES times, pausing $RETRY_DELAY seconds
# between attempts.
# Globals:   HEALTH_URL, MAX_RETRIES, RETRY_DELAY (read)
#            HEALTH_TIMEOUT (read, optional) - per-request limit in seconds,
#            defaults to 10
# Arguments: none
# Outputs:   retry/recovery messages via log(); a recovery alert via
#            send_alert() when the down marker was present
# Returns:   0 as soon as one probe succeeds, 1 if every attempt fails
health_check() {
  local -r down_marker="/tmp/mission-control-down"
  # Without a limit a service that accepts the connection but never answers
  # would block this run (and pile up with later cron runs) indefinitely.
  local -r timeout="${HEALTH_TIMEOUT:-10}"
  local attempt=0

  while (( attempt < MAX_RETRIES )); do
    if curl -sf --max-time "$timeout" "$HEALTH_URL" > /dev/null 2>&1; then
      # Health check passed; a leftover marker means we are coming back from
      # an outage, so clear it and announce the recovery.
      if [[ -f "$down_marker" ]]; then
        rm -f -- "$down_marker"
        log "✅ Service recovered and is now healthy"
        send_alert "✅ Mission Control is back online!"
      fi
      return 0
    fi

    attempt=$(( attempt + 1 ))
    # No log line or delay after the final failed attempt.
    if (( attempt < MAX_RETRIES )); then
      log "Health check failed, retry $attempt/$MAX_RETRIES..."
      sleep "$RETRY_DELAY"
    fi
  done
  return 1
}
# Auto-restart service
#
# Try to bring Mission Control back: first with PM2 (when installed), then with
# systemd. After each attempt the service is given 10 seconds before it is
# probed with health_check.
# Globals:   none
# Arguments: none
# Outputs:   progress, command failures and the final outcome via log();
#            a success alert via send_alert()
# Returns:   0 if the service is healthy after a restart, 1 otherwise
restart_service() {
  log "Attempting to restart Mission Control..."

  # Try PM2 restart first
  if command -v pm2 > /dev/null 2>&1; then
    # A failing command is recorded but the health probe still runs, since
    # only the probe decides whether the service is actually back.
    pm2 reload mission-control \
      || log "⚠️ pm2 reload exited with status $?"
    sleep 10
    if health_check; then
      log "✅ Service restarted successfully via PM2"
      send_alert "✅ Mission Control was auto-restarted and is now healthy"
      return 0
    fi
  fi

  # Fallback to systemd
  systemctl restart mission-control \
    || log "⚠️ systemctl restart exited with status $?"
  sleep 10
  if health_check; then
    log "✅ Service restarted successfully via systemd"
    send_alert "✅ Mission Control was auto-restarted (systemd) and is now healthy"
    return 0
  fi

  # Leave a trace of the failure so the log shows how the attempt ended.
  log "❌ Auto-restart failed: Mission Control is still unhealthy"
  return 1
}
# Main

# First-detection handling shared by both failure paths of main: log the
# problem, create the down marker, alert, and attempt a restart. Does nothing
# when the marker already exists, i.e. the outage was reported by an earlier
# run.
# Arguments: $1 - line to log, $2 - alert text
_report_outage() {
  local log_line="$1" alert_text="$2"
  local marker="/tmp/mission-control-down"

  if [ -f "$marker" ]; then
    return
  fi
  log "$log_line"
  touch "$marker"
  send_alert "$alert_text"
  restart_service
}

# Run one monitoring pass and terminate the script.
# Exit status: 0 when a matching process exists and the health check passes;
#              1 when the process is missing or the health check fails - also
#              when the restart attempted during this run succeeded.
main() {
  if pgrep -f "mission-control" > /dev/null 2>&1; then
    # Process is there; confirm it actually answers.
    if health_check; then
      exit 0
    fi
    _report_outage \
      "❌ Health check failed after $MAX_RETRIES retries" \
      "❌ Mission Control health check FAILED! Status: Unhealthy"
  else
    # No process matching the service name at all.
    _report_outage \
      "⚠️ Mission Control is not running!" \
      "⚠️ Mission Control is DOWN! Attempting auto-restart..."
  fi
  exit 1
}
# Script entry point: run one monitoring pass, passing the command-line
# arguments through (main does not currently read them).
main "$@"