#!/bin/bash
# Health Monitor for Mission Control
# Runs every minute via cron
# Source: https://git.lemonlink.eu/devmatrix/devmatrix-scripts
#
# Flow:
#   1. If no process matching the service name exists, or the HTTP health
#      endpoint keeps failing, create a "down" flag file, send an alert and
#      attempt an automatic restart (PM2 first, then systemd).
#   2. While the flag file exists, later runs do not alert or restart again.
#   3. The first successful health check removes the flag and sends a
#      recovery notice.
#
# Exit status: 0 when the initial checks pass, 1 otherwise (including runs in
# which the automatic restart succeeded).

# shellcheck disable=SC2034  # kept from the original script; not used below
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

readonly SERVICE_NAME="mission-control"
readonly LOG_FILE="/var/log/mission-control/health-monitor.log"
readonly ALERT_COOLDOWN=300 # 5 minutes between alerts
# NOTE(review): both state files use predictable names in /tmp; their content
# is validated before use, but consider moving them to a private directory.
readonly LAST_ALERT_FILE="/tmp/mission-control-last-alert"
readonly DOWN_FLAG_FILE="/tmp/mission-control-down"
readonly HEALTH_URL="http://localhost:3000/api/health"
readonly MAX_RETRIES=3
readonly RETRY_DELAY=5
readonly REQUEST_TIMEOUT=10 # seconds allowed per HTTP request

#######################################
# Write a timestamped message to stdout and append it to LOG_FILE.
# Arguments: $1 - message text
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "$1" | tee -a "$LOG_FILE"
}

#######################################
# Decide whether the alert cooldown has expired.
# Globals:   LAST_ALERT_FILE, ALERT_COOLDOWN (read)
# Returns:   0 if an alert may be sent, 1 if still inside the cooldown window
#######################################
should_alert() {
  [[ -f "$LAST_ALERT_FILE" ]] || return 0

  local last_alert="" now elapsed
  { IFS= read -r last_alert < "$LAST_ALERT_FILE"; } 2> /dev/null || true

  # The file lives in /tmp, so treat its content as untrusted: only a plain
  # decimal number may reach the arithmetic below. Empty or garbled content
  # means "no usable timestamp", so alerting is allowed.
  [[ "$last_alert" =~ ^[0-9]{1,15}$ ]] || return 0

  now=$(date +%s)
  # 10# forces base 10 so a value with leading zeros is not parsed as octal.
  elapsed=$((now - 10#$last_alert))

  # A negative value means the stored time is in the future (clock change or
  # bad data); do not let that silence alerts.
  if ((elapsed >= 0 && elapsed < ALERT_COOLDOWN)); then
    return 1 # Don't alert yet
  fi
  return 0 # Can alert
}

#######################################
# Store the current epoch time as the moment of the last alert.
# Globals:   LAST_ALERT_FILE (written)
#######################################
record_alert() {
  date +%s > "$LAST_ALERT_FILE"
}

#######################################
# Log an alert and push it to Telegram and the desktop notifier if available.
# Arguments: $1 - message text
#            $2 - optional; the literal word "force" bypasses the cooldown
#                 (used for recovery notices, which would otherwise always be
#                 suppressed by the preceding "down" alert)
# Returns:   0
#######################################
send_alert() {
  local message="$1"
  local force="${2:-}"

  if [[ "$force" != "force" ]] && ! should_alert; then
    log "Alert cooldown active, skipping notification"
    return 0
  fi

  record_alert

  # Log to file
  log "ALERT: $message"

  # Telegram notification (only when both credential files are present)
  local token_file="$HOME/.telegram_bot_token"
  local chat_file="$HOME/.telegram_chat_id"
  if [[ -f "$token_file" ]]; then
    local bot_token chat_id text
    bot_token=$(cat "$token_file")
    chat_id=$(cat "$chat_file" 2> /dev/null || echo "")
    if [[ -n "$chat_id" ]]; then
      printf -v text '🚨 MISSION CONTROL ALERT\n\n%s\n\nTime: %s' \
        "$message" "$(date)"
      # --data-urlencode keeps characters such as & % + = in the message from
      # corrupting the form body. Sent in the background so a slow network
      # does not delay the monitor.
      curl -s --max-time "$REQUEST_TIMEOUT" -X POST \
        "https://api.telegram.org/bot${bot_token}/sendMessage" \
        --data-urlencode "chat_id=${chat_id}" \
        --data-urlencode "text=${text}" \
        > /dev/null 2>&1 &
    fi
  fi

  # System notification
  if command -v notify-send > /dev/null 2>&1; then
    DISPLAY=:0 notify-send -u critical "Mission Control Alert" "$message" \
      2> /dev/null &
  fi
  return 0
}

#######################################
# Probe HEALTH_URL up to MAX_RETRIES times, RETRY_DELAY seconds apart.
# On success, clears the down flag (if set) and sends a recovery notice.
# Globals:   HEALTH_URL, MAX_RETRIES, RETRY_DELAY, REQUEST_TIMEOUT (read)
#            DOWN_FLAG_FILE (removed on recovery)
# Returns:   0 if any attempt succeeded, 1 if all attempts failed
#######################################
health_check() {
  local attempt=0

  while ((attempt < MAX_RETRIES)); do
    # --max-time: a service that accepts the connection but never answers
    # must count as a failure instead of blocking this run forever.
    if curl -sf --max-time "$REQUEST_TIMEOUT" "$HEALTH_URL" \
      > /dev/null 2>&1; then
      # Health check passed
      if [[ -f "$DOWN_FLAG_FILE" ]]; then
        rm -f -- "$DOWN_FLAG_FILE"
        log "✅ Service recovered and is now healthy"
        send_alert "✅ Mission Control is back online!" force
      fi
      return 0
    fi

    attempt=$((attempt + 1))
    if ((attempt < MAX_RETRIES)); then
      log "Health check failed, retry $attempt/$MAX_RETRIES..."
      sleep "$RETRY_DELAY"
    fi
  done

  return 1
}

#######################################
# Try to bring the service back: PM2 reload first, systemd restart second.
# After each attempt, waits 10 seconds and re-runs health_check.
# Returns:   0 if the service is healthy after a restart, 1 otherwise
#######################################
restart_service() {
  log "Attempting to restart Mission Control..."

  # Try PM2 restart first
  if command -v pm2 > /dev/null 2>&1; then
    if pm2 reload "$SERVICE_NAME"; then
      sleep 10
      if health_check; then
        log "✅ Service restarted successfully via PM2"
        # Subject to the cooldown: when health_check has just sent the
        # recovery notice, this follow-up is skipped instead of duplicated.
        send_alert "✅ Mission Control was auto-restarted and is now healthy"
        return 0
      fi
    else
      log "PM2 reload failed, falling back to systemd"
    fi
  fi

  # Fallback to systemd
  if ! command -v systemctl > /dev/null 2>&1; then
    log "systemctl not available, cannot restart via systemd"
    return 1
  fi

  if ! systemctl restart "$SERVICE_NAME"; then
    log "systemctl restart $SERVICE_NAME failed"
    return 1
  fi
  sleep 10
  if health_check; then
    log "✅ Service restarted successfully via systemd"
    send_alert "✅ Mission Control was auto-restarted (systemd) and is now healthy"
    return 0
  fi

  return 1
}

#######################################
# Entry point: process check, then HTTP health check, with one automatic
# restart attempt per outage (guarded by DOWN_FLAG_FILE).
# Exits 0 if both checks pass, 1 otherwise.
#######################################
main() {
  # Best effort: make sure the log directory exists so tee can append.
  mkdir -p -- "$(dirname -- "$LOG_FILE")" 2> /dev/null || true

  # Check if service is supposed to be running
  # NOTE(review): pgrep -f matches full command lines; if this script is
  # installed under a path containing the service name, its own process may
  # match — confirm against the deployment layout.
  if ! pgrep -f "$SERVICE_NAME" > /dev/null 2>&1; then
    if [[ ! -f "$DOWN_FLAG_FILE" ]]; then
      log "⚠️ Mission Control is not running!"
      touch "$DOWN_FLAG_FILE"
      send_alert "⚠️ Mission Control is DOWN! Attempting auto-restart..."
      if ! restart_service; then
        log "❌ Auto-restart failed, manual intervention required"
      fi
    fi
    exit 1
  fi

  # Health check
  if ! health_check; then
    if [[ ! -f "$DOWN_FLAG_FILE" ]]; then
      log "❌ Health check failed after $MAX_RETRIES retries"
      touch "$DOWN_FLAG_FILE"
      send_alert "❌ Mission Control health check FAILED! Status: Unhealthy"
      if ! restart_service; then
        log "❌ Auto-restart failed, manual intervention required"
      fi
    fi
    exit 1
  fi

  exit 0
}

main "$@"