#!/bin/bash
# Health Monitor for Mission Control
# Runs every minute via cron
# Source: https://git.lemonlink.eu/devmatrix/devmatrix-scripts

# Configuration.
# Every setting except SCRIPT_DIR may be overridden from the environment;
# the value after ":-" is the default used when the variable is unset or empty.

# Directory containing this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# File that log() appends to.
LOG_FILE="${LOG_FILE:-/var/log/mission-control/health-monitor.log}"
# Minimum number of seconds between two alerts (300 = 5 minutes).
ALERT_COOLDOWN="${ALERT_COOLDOWN:-300}"
# File holding the epoch timestamp of the most recent alert.
LAST_ALERT_FILE="${LAST_ALERT_FILE:-/tmp/mission-control-last-alert}"
# Endpoint probed by health_check().
HEALTH_URL="${HEALTH_URL:-http://localhost:3000/api/health}"
# Number of probe attempts before the service is considered unhealthy.
MAX_RETRIES="${MAX_RETRIES:-3}"
# Seconds to wait between two probe attempts.
RETRY_DELAY="${RETRY_DELAY:-5}"

# Logging
#
# Write a timestamped message to stdout and append it to $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - message text
# Outputs:   "[YYYY-MM-DD HH:MM:SS] message" on stdout and in $LOG_FILE
# Returns:   0 when the line was appended to the log file, 1 when the log
#            file could not be written (the line is still printed to stdout)
log() {
  local stamp entry log_dir
  stamp="$(date '+%Y-%m-%d %H:%M:%S')"
  entry="[${stamp}] ${1-}"
  log_dir="$(dirname -- "$LOG_FILE")"

  # Create the log directory on first use so tee does not fail on every call.
  if mkdir -p -- "$log_dir" 2>/dev/null && { : >> "$LOG_FILE"; } 2>/dev/null; then
    printf '%s\n' "$entry" | tee -a -- "$LOG_FILE"
    return
  fi

  # Log file is not writable: keep the message visible on stdout anyway.
  printf '%s\n' "$entry"
  printf 'log: cannot write to %s\n' "$LOG_FILE" >&2
  return 1
}

# Check if we should send alert (cooldown)
#
# Globals:   LAST_ALERT_FILE (read), ALERT_COOLDOWN (read)
# Arguments: none
# Returns:   0 when an alert may be sent, 1 when the previous alert was
#            recorded less than ALERT_COOLDOWN seconds ago
should_alert() {
  local last_alert="" now elapsed

  # No previous alert recorded: nothing to throttle.
  [[ -f "$LAST_ALERT_FILE" ]] || return 0

  # Read only the first line; an unreadable file is treated like a missing one.
  { IFS= read -r last_alert < "$LAST_ALERT_FILE"; } 2>/dev/null || true
  last_alert="${last_alert//[[:space:]]/}"

  # The file lives at a predictable path, so never hand its raw content to
  # arithmetic expansion: empty content is a syntax error there and crafted
  # content can run commands. Anything that is not a plain integer is ignored.
  [[ "$last_alert" =~ ^[0-9]+$ ]] || return 0

  now="$(date +%s)"
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$(( now - 10#$last_alert ))

  # A negative difference means the stored timestamp is in the future
  # (e.g. the clock was set back); do not let that block alerts.
  if (( elapsed >= 0 && elapsed < ALERT_COOLDOWN )); then
    return 1  # Don't alert yet
  fi
  return 0  # Can alert
}

# Record alert time
#
# Store the current epoch time in $LAST_ALERT_FILE.
# Globals:   LAST_ALERT_FILE (read; the file it names is overwritten)
# Arguments: none
# Returns:   0 on success, non-zero when the time could not be obtained or
#            the file could not be written
record_alert() {
  local now
  # Obtain the timestamp first so a failing date does not truncate the file.
  now="$(date +%s)" || return 1
  printf '%s\n' "$now" > "$LAST_ALERT_FILE"
}

# Send alert
#
# Deliver an alert unless the cooldown is active: write it to the log, post
# it to Telegram when credentials are present, and raise a desktop
# notification when notify-send is installed.
# Globals:   HOME (read)
# Arguments: $1 - alert message
# Files:     $HOME/.telegram_bot_token - bot token (Telegram is skipped if absent)
#            $HOME/.telegram_chat_id   - destination chat id
# Returns:   status of the last command run; notification delivery happens
#            in the background and is not waited for
send_alert() {
  local message="${1-}"
  local token_file="$HOME/.telegram_bot_token"
  local chat_file="$HOME/.telegram_chat_id"
  local bot_token="" chat_id="" text

  if ! should_alert; then
    log "Alert cooldown active, skipping notification"
    return
  fi

  record_alert

  # Log to file
  log "ALERT: $message"

  # Telegram notification
  if [[ -f "$token_file" ]]; then
    bot_token="$(cat -- "$token_file" 2>/dev/null)" || bot_token=""
    chat_id="$(cat -- "$chat_file" 2>/dev/null)" || chat_id=""
    # Drop stray whitespace/newlines left in the credential files.
    bot_token="${bot_token//[[:space:]]/}"
    chat_id="${chat_id//[[:space:]]/}"

    if [[ -n "$bot_token" && -n "$chat_id" ]]; then
      text=$'🚨 MISSION CONTROL ALERT\n\n'"${message}"$'\n\nTime: '"$(date)"
      # --data-urlencode keeps '&', '%', '+' and non-ASCII text in the message
      # intact; --max-time stops a stuck request from lingering in the
      # background when the script runs again.
      curl -s --max-time 15 -X POST \
        "https://api.telegram.org/bot${bot_token}/sendMessage" \
        --data-urlencode "chat_id=${chat_id}" \
        --data-urlencode "text=${text}" \
        > /dev/null 2>&1 &
    fi
  fi

  # System notification
  if command -v notify-send > /dev/null 2>&1; then
    DISPLAY=:0 notify-send -u critical "Mission Control Alert" "$message" 2>/dev/null &
  fi
}

# Health check
#
# Probe $HEALTH_URL up to $MAX_RETRIES times, pausing $RETRY_DELAY seconds
# between attempts. On the first successful probe, if the marker file
# /tmp/mission-control-down exists it is removed and a recovery message is
# logged and sent.
# Globals:   HEALTH_URL, MAX_RETRIES, RETRY_DELAY (read);
#            HEALTH_TIMEOUT (read, optional, seconds per probe, default 10)
# Arguments: none
# Returns:   0 as soon as one probe succeeds, 1 when every attempt failed
health_check() {
  local attempt=0
  local down_flag="/tmp/mission-control-down"
  local probe_timeout="${HEALTH_TIMEOUT:-10}"

  while (( attempt < MAX_RETRIES )); do
    # -f makes curl fail on HTTP error statuses; --max-time bounds a probe
    # against a server that accepts the connection but never answers.
    if curl -sf --max-time "$probe_timeout" "$HEALTH_URL" > /dev/null 2>&1; then
      # Health check passed
      if [[ -f "$down_flag" ]]; then
        rm -f -- "$down_flag"
        log "✅ Service recovered and is now healthy"
        send_alert "✅ Mission Control is back online!"
      fi
      return 0
    fi

    attempt=$(( attempt + 1 ))
    # No log line or pause after the final attempt.
    if (( attempt < MAX_RETRIES )); then
      log "Health check failed, retry $attempt/$MAX_RETRIES..."
      sleep "$RETRY_DELAY"
    fi
  done

  return 1
}

# Auto-restart service
#
# Try to bring Mission Control back: first with "pm2 reload" when pm2 is
# installed, then with "systemctl restart". After a restart command succeeds
# the function waits 10 seconds and runs health_check to confirm the result.
# A restart command that itself fails is logged and skipped without waiting.
# Arguments: none
# Returns:   0 when the service passes the health check after a restart,
#            1 when no restart method produced a healthy service
restart_service() {
  local settle_seconds=10

  log "Attempting to restart Mission Control..."

  # Try PM2 restart first
  if command -v pm2 > /dev/null 2>&1; then
    if pm2 reload mission-control; then
      sleep "$settle_seconds"

      if health_check; then
        log "✅ Service restarted successfully via PM2"
        send_alert "✅ Mission Control was auto-restarted and is now healthy"
        return 0
      fi
    else
      log "PM2 reload failed, falling back to systemd"
    fi
  fi

  # Fallback to systemd
  if ! command -v systemctl > /dev/null 2>&1; then
    log "systemctl not available, cannot restart via systemd"
    return 1
  fi

  if ! systemctl restart mission-control; then
    log "systemctl restart mission-control failed"
    return 1
  fi
  sleep "$settle_seconds"

  if health_check; then
    log "✅ Service restarted successfully via systemd"
    send_alert "✅ Mission Control was auto-restarted (systemd) and is now healthy"
    return 0
  fi

  return 1
}

# Main
#
# One monitoring pass: verify that a process matching "mission-control"
# exists and that the health endpoint answers. On the first detected failure
# (marker file /tmp/mission-control-down not yet present) the marker is
# created, an alert is sent and an automatic restart is attempted; while the
# marker exists no further alert or restart is triggered from here.
# Globals:   MAX_RETRIES (read, for the log message)
# Arguments: ignored
# Exits:     0 when the service is healthy, 1 otherwise
main() {
  local down_flag="/tmp/mission-control-down"
  local pid
  local running=0

  # Check if service is supposed to be running.
  # pgrep -f matches whole command lines, so this script or the shell that
  # launched it can match the pattern too; those two PIDs are ignored.
  while IFS= read -r pid; do
    if [[ -n "$pid" && "$pid" != "$$" && "$pid" != "$PPID" ]]; then
      running=1
      break
    fi
  done < <(pgrep -f "mission-control" 2>/dev/null)

  if (( ! running )); then
    if [[ ! -f "$down_flag" ]]; then
      log "⚠️ Mission Control is not running!"
      touch "$down_flag"
      send_alert "⚠️ Mission Control is DOWN! Attempting auto-restart..."
      if ! restart_service; then
        log "❌ Automatic restart failed; manual intervention required"
      fi
    fi
    exit 1
  fi

  # Health check
  if ! health_check; then
    if [[ ! -f "$down_flag" ]]; then
      log "❌ Health check failed after $MAX_RETRIES retries"
      touch "$down_flag"
      send_alert "❌ Mission Control health check FAILED! Status: Unhealthy"
      if ! restart_service; then
        log "❌ Automatic restart failed; manual intervention required"
      fi
    fi
    exit 1
  fi

  exit 0
}

# Entry point: run one monitoring pass, forwarding any command-line arguments.
main "$@"