"""Fan control logic including curves and automatic control.""" import json import logging import asyncio from typing import List, Dict, Optional, Any from dataclasses import dataclass, asdict from datetime import datetime, timedelta from enum import Enum from sqlalchemy.orm import Session from database import ( Server, FanCurve, SensorData, FanData, SystemLog, get_db, SessionLocal ) from ipmi_client import IPMIClient, TemperatureReading from config import settings logger = logging.getLogger(__name__) class ControlState(Enum): """Fan control state.""" AUTO = "auto" MANUAL = "manual" PANIC = "panic" OFF = "off" @dataclass class FanCurvePoint: """Single point on a fan curve.""" temp: float speed: int class FanCurveManager: """Manages fan curve calculations.""" @staticmethod def parse_curve(curve_data: str) -> List[FanCurvePoint]: """Parse fan curve from JSON string.""" try: data = json.loads(curve_data) return [FanCurvePoint(p["temp"], p["speed"]) for p in data] except (json.JSONDecodeError, KeyError) as e: logger.error(f"Failed to parse fan curve: {e}") # Return default curve return [ FanCurvePoint(30, 10), FanCurvePoint(40, 20), FanCurvePoint(50, 35), FanCurvePoint(60, 50), FanCurvePoint(70, 70), FanCurvePoint(80, 100), ] @staticmethod def serialize_curve(points: List[FanCurvePoint]) -> str: """Serialize fan curve to JSON string.""" return json.dumps([{"temp": p.temp, "speed": p.speed} for p in points]) @staticmethod def calculate_speed(curve: List[FanCurvePoint], temperature: float) -> int: """ Calculate fan speed for a given temperature using linear interpolation. """ if not curve: return 50 # Default to 50% if no curve # Sort by temperature sorted_curve = sorted(curve, key=lambda p: p.temp) # Below minimum temp if temperature <= sorted_curve[0].temp: return sorted_curve[0].speed # Above maximum temp if temperature >= sorted_curve[-1].temp: return sorted_curve[-1].speed # Find surrounding points for i in range(len(sorted_curve) - 1): p1 = sorted_curve[i] p2 = sorted_curve[i + 1] if p1.temp <= temperature <= p2.temp: # Linear interpolation if p2.temp == p1.temp: return p1.speed ratio = (temperature - p1.temp) / (p2.temp - p1.temp) speed = p1.speed + ratio * (p2.speed - p1.speed) return int(round(speed)) return sorted_curve[-1].speed class FanController: """Main fan controller for managing server fans.""" def __init__(self): self.curve_manager = FanCurveManager() self.running = False self._tasks: Dict[int, asyncio.Task] = {} # server_id -> task self._last_sensor_data: Dict[int, datetime] = {} # server_id -> timestamp async def start(self): """Start the fan controller service.""" self.running = True logger.info("Fan controller started") # Load all servers with auto-control enabled db = SessionLocal() try: servers = db.query(Server).filter( Server.auto_control_enabled == True, Server.is_active == True ).all() for server in servers: await self.start_server_control(server.id) finally: db.close() async def stop(self): """Stop all fan control tasks.""" self.running = False for task in self._tasks.values(): task.cancel() self._tasks.clear() logger.info("Fan controller stopped") async def start_server_control(self, server_id: int): """Start automatic control for a server.""" if server_id in self._tasks: self._tasks[server_id].cancel() task = asyncio.create_task(self._control_loop(server_id)) self._tasks[server_id] = task logger.info(f"Started fan control for server {server_id}") async def stop_server_control(self, server_id: int): """Stop automatic control for a server.""" if server_id in self._tasks: self._tasks[server_id].cancel() del self._tasks[server_id] logger.info(f"Stopped fan control for server {server_id}") async def _control_loop(self, server_id: int): """Main control loop for a server.""" while self.running: try: await self._control_iteration(server_id) await asyncio.sleep(5) # 5 second interval except asyncio.CancelledError: break except Exception as e: logger.error(f"Control loop error for server {server_id}: {e}") await asyncio.sleep(10) async def _control_iteration(self, server_id: int): """Single control iteration for a server.""" db = SessionLocal() try: server = db.query(Server).filter(Server.id == server_id).first() if not server or not server.is_active: return # Create IPMI client from auth import decrypt_password client = IPMIClient( host=server.host, username=server.username, password=decrypt_password(server.encrypted_password), port=server.port, vendor=server.vendor ) # Test connection if not client.test_connection(): logger.warning(f"Cannot connect to server {server.name}") await self._handle_connection_loss(db, server) return # Get sensor data temps = client.get_temperatures() fans = client.get_fan_speeds() all_sensors = client.get_all_sensors() # Store sensor data self._store_sensor_data(db, server_id, temps, fans, all_sensors) # Update last sensor data time self._last_sensor_data[server_id] = datetime.utcnow() server.last_seen = datetime.utcnow() # Check panic mode if self._should_panic(db, server_id, server): await self._enter_panic_mode(db, server, client) return # Calculate and set fan speed if auto control is enabled if server.auto_control_enabled: await self._apply_fan_curve(db, server, client, temps) db.commit() finally: db.close() def _store_sensor_data(self, db: Session, server_id: int, temps: List[TemperatureReading], fans: List[Any], all_sensors: List[Any]): """Store sensor data in database.""" now = datetime.utcnow() # Store temperature readings for temp in temps: sensor = SensorData( server_id=server_id, sensor_name=temp.name, sensor_type="temperature", value=temp.value, unit="°C", timestamp=now ) db.add(sensor) # Store fan readings for fan in fans: fan_data = FanData( server_id=server_id, fan_number=fan.fan_number, fan_id=fan.fan_id, speed_rpm=fan.speed_rpm, speed_percent=fan.speed_percent, is_manual=False, timestamp=now ) db.add(fan_data) def _should_panic(self, db: Session, server_id: int, server: Server) -> bool: """Check if we should enter panic mode.""" if not server.panic_mode_enabled: return False last_seen = self._last_sensor_data.get(server_id) if not last_seen: return False timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS elapsed = (datetime.utcnow() - last_seen).total_seconds() if elapsed > timeout: logger.warning(f"Panic mode triggered for server {server.name}: " f"No sensor data for {elapsed:.0f}s") return True return False async def _enter_panic_mode(self, db: Session, server: Server, client: IPMIClient): """Enter panic mode - set fans to 100%.""" logger.critical(f"Entering PANIC MODE for server {server.name}") # Log the event log = SystemLog( server_id=server.id, event_type="panic", message=f"Panic mode activated - No sensor data received", details=f"Setting all fans to {settings.PANIC_FAN_SPEED}%" ) db.add(log) # Enable manual control if not already if not server.manual_control_enabled: client.enable_manual_fan_control() server.manual_control_enabled = True # Set fans to max client.set_all_fans_speed(settings.PANIC_FAN_SPEED) db.commit() async def _apply_fan_curve(self, db: Session, server: Server, client: IPMIClient, temps: List[TemperatureReading]): """Apply fan curve based on temperatures.""" if not temps: return # Get active fan curve curve_data = server.fan_curve_data if not curve_data: # Use default curve curve = [ FanCurvePoint(30, 10), FanCurvePoint(40, 20), FanCurvePoint(50, 35), FanCurvePoint(60, 50), FanCurvePoint(70, 70), FanCurvePoint(80, 100), ] else: curve = self.curve_manager.parse_curve(curve_data) # Find the highest CPU temperature cpu_temps = [t for t in temps if t.location.startswith("cpu")] if cpu_temps: max_temp = max(t.value for t in cpu_temps) else: # Fall back to highest overall temp max_temp = max(t.value for t in temps) # Calculate target speed target_speed = self.curve_manager.calculate_speed(curve, max_temp) # Enable manual control if not already if not server.manual_control_enabled: if client.enable_manual_fan_control(): server.manual_control_enabled = True logger.info(f"Enabled manual fan control for {server.name}") # Set fan speed current_fans = client.get_fan_speeds() avg_current_speed = 0 if current_fans: # Estimate current speed from RPM if possible avg_current_speed = 50 # Default assumption # Only update if speed changed significantly (avoid constant small changes) if abs(target_speed - avg_current_speed) >= 5: if client.set_all_fans_speed(target_speed): logger.info(f"Set {server.name} fans to {target_speed}% (temp: {max_temp}°C)") async def _handle_connection_loss(self, db: Session, server: Server): """Handle connection loss to a server.""" logger.warning(f"Connection lost to server {server.name}") # Check if we should panic server_id = server.id last_seen = self._last_sensor_data.get(server_id) if last_seen: timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS elapsed = (datetime.utcnow() - last_seen).total_seconds() if elapsed > timeout and server.panic_mode_enabled: log = SystemLog( server_id=server.id, event_type="error", message=f"Connection lost to server", details=f"Last seen {elapsed:.0f} seconds ago" ) db.add(log) db.commit() def get_controller_status(self, server_id: int) -> Dict[str, Any]: """Get current controller status for a server.""" is_running = server_id in self._tasks last_seen = self._last_sensor_data.get(server_id) return { "is_running": is_running, "last_sensor_data": last_seen.isoformat() if last_seen else None, "state": ControlState.AUTO.value if is_running else ControlState.OFF.value } # Global controller instance fan_controller = FanController() async def initialize_fan_controller(): """Initialize and start the fan controller.""" await fan_controller.start() async def shutdown_fan_controller(): """Shutdown the fan controller.""" await fan_controller.stop()