ipmi-fan-control/backend/fan_control.py

382 lines
13 KiB
Python

"""Fan control logic including curves and automatic control."""
import json
import logging
import asyncio
from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
from sqlalchemy.orm import Session
from backend.database import (
Server, FanCurve, SensorData, FanData, SystemLog,
get_db, SessionLocal
)
from backend.ipmi_client import IPMIClient, TemperatureReading
from backend.config import settings
# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)
class ControlState(Enum):
    """Named states a server's fan control can be reported in.

    Every member's value is the lowercase string used when the state is
    exposed through ``.value``.
    """

    # Reported by the controller while a control task exists for the server.
    AUTO = "auto"
    # NOTE(review): not referenced in this module; presumably operator-set speed.
    MANUAL = "manual"
    # NOTE(review): not referenced in this module; presumably the fail-safe state.
    PANIC = "panic"
    # Reported by the controller when no control task exists for the server.
    OFF = "off"
@dataclass
class FanCurvePoint:
    """One anchor of a fan curve: a temperature and the fan speed used there."""

    # Temperature at which this point applies (the module logs it in °C).
    temp: float
    # Fan speed at ``temp`` (the module logs it as a percentage).
    speed: int
class FanCurveManager:
    """Parse, serialize and evaluate fan curves.

    A fan curve is a list of ``FanCurvePoint`` objects. The speed for a
    temperature that falls between two points is obtained by linear
    interpolation; outside the curve the nearest end point is used.
    """

    @staticmethod
    def _default_curve() -> List[FanCurvePoint]:
        """Return a fresh copy of the built-in fallback curve."""
        return [
            FanCurvePoint(30, 10),
            FanCurvePoint(40, 20),
            FanCurvePoint(50, 35),
            FanCurvePoint(60, 50),
            FanCurvePoint(70, 70),
            FanCurvePoint(80, 100),
        ]

    @staticmethod
    def _as_percent(value: float) -> int:
        """Round ``value`` to an int and clamp it to the 0-100 range."""
        return max(0, min(100, int(round(value))))

    @staticmethod
    def parse_curve(curve_data: str) -> List[FanCurvePoint]:
        """Parse a fan curve from a JSON string.

        Args:
            curve_data: JSON array of objects with ``"temp"`` and ``"speed"``
                keys.

        Returns:
            The parsed points, in the order given. If the input cannot be
            parsed the error is logged and the built-in default curve is
            returned instead.
        """
        try:
            data = json.loads(curve_data)
            return [FanCurvePoint(p["temp"], p["speed"]) for p in data]
        except (json.JSONDecodeError, KeyError, TypeError) as e:
            # TypeError covers well-formed JSON of the wrong shape (``null``,
            # a number, entries that are not objects) and a non-string
            # ``curve_data``; these used to escape and abort the caller.
            logger.error(f"Failed to parse fan curve: {e}")
            return FanCurveManager._default_curve()

    @staticmethod
    def serialize_curve(points: List[FanCurvePoint]) -> str:
        """Serialize fan curve points to a JSON string.

        Args:
            points: Points to serialize; only ``temp`` and ``speed`` are read.

        Returns:
            A JSON array of ``{"temp": ..., "speed": ...}`` objects.
        """
        return json.dumps([{"temp": p.temp, "speed": p.speed} for p in points])

    @staticmethod
    def calculate_speed(curve: List[FanCurvePoint], temperature: float) -> int:
        """Calculate the fan speed for a temperature by linear interpolation.

        Args:
            curve: Curve points in any order; they are sorted by temperature
                here.
            temperature: Temperature to evaluate the curve at.

        Returns:
            The fan speed as an integer percentage clamped to 0-100. An empty
            curve yields 50. Temperatures at or below the first point, or at
            or above the last point, yield that end point's speed.
        """
        if not curve:
            return 50  # Default to 50% if no curve

        as_percent = FanCurveManager._as_percent
        ordered = sorted(curve, key=lambda p: p.temp)

        # Outside the curve: hold the nearest end point's speed.
        if temperature <= ordered[0].temp:
            return as_percent(ordered[0].speed)
        if temperature >= ordered[-1].temp:
            return as_percent(ordered[-1].speed)

        # Inside the curve: interpolate between the two surrounding points.
        for lower, upper in zip(ordered, ordered[1:]):
            if lower.temp <= temperature <= upper.temp:
                if upper.temp == lower.temp:
                    # Duplicate temperatures: avoid dividing by zero.
                    return as_percent(lower.speed)
                ratio = (temperature - lower.temp) / (upper.temp - lower.temp)
                return as_percent(lower.speed + ratio * (upper.speed - lower.speed))

        # Only reached when no segment matched (e.g. a NaN temperature).
        return as_percent(ordered[-1].speed)
class FanController:
    """Main fan controller for managing server fans.

    One asyncio task per server periodically polls the server through
    ``IPMIClient``, stores the readings, and then either applies the server's
    fan curve or, when temperature data has been missing for longer than the
    panic timeout, forces the configured panic fan speed.
    """

    # Seconds between two control iterations for a server.
    CONTROL_INTERVAL_SECONDS = 5
    # Seconds to wait before retrying after an unexpected iteration error.
    ERROR_BACKOFF_SECONDS = 10
    # Minimum change (percentage points) before a new speed is sent.
    SPEED_CHANGE_THRESHOLD = 5

    def __init__(self):
        self.curve_manager = FanCurveManager()
        self.running = False
        self._tasks: Dict[int, asyncio.Task] = {}  # server_id -> task
        self._last_sensor_data: Dict[int, datetime] = {}  # server_id -> timestamp
        # server_id -> last fan speed (%) successfully sent by the curve logic
        self._last_set_speed: Dict[int, int] = {}

    async def start(self):
        """Start the controller and a control task for every eligible server.

        Eligible servers are those that are active and have automatic control
        enabled.
        """
        self.running = True
        logger.info("Fan controller started")
        db = SessionLocal()
        try:
            # ``== True`` is intentional: it builds the SQL filter expression.
            servers = db.query(Server).filter(
                Server.auto_control_enabled == True,  # noqa: E712
                Server.is_active == True,  # noqa: E712
            ).all()
            for server in servers:
                await self.start_server_control(server.id)
        finally:
            db.close()

    async def stop(self):
        """Stop all fan control tasks and wait for them to finish."""
        self.running = False
        tasks = list(self._tasks.values())
        self._tasks.clear()
        for task in tasks:
            task.cancel()
        if tasks:
            # Wait for the cancelled loops to unwind so their ``finally``
            # blocks (DB session close) have run before shutdown continues.
            await asyncio.gather(*tasks, return_exceptions=True)
        self._last_set_speed.clear()
        logger.info("Fan controller stopped")

    async def start_server_control(self, server_id: int):
        """Start automatic control for a server, replacing any existing task."""
        existing = self._tasks.get(server_id)
        if existing is not None:
            existing.cancel()
        # The BMC may have been changed while we were not in control, so make
        # the first iteration send the fan speed unconditionally.
        self._last_set_speed.pop(server_id, None)
        self._tasks[server_id] = asyncio.create_task(self._control_loop(server_id))
        logger.info(f"Started fan control for server {server_id}")

    async def stop_server_control(self, server_id: int):
        """Stop automatic control for a server (no-op if it is not running)."""
        task = self._tasks.pop(server_id, None)
        if task is not None:
            task.cancel()
            self._last_set_speed.pop(server_id, None)
            logger.info(f"Stopped fan control for server {server_id}")

    async def _control_loop(self, server_id: int):
        """Run control iterations for one server until stopped or cancelled."""
        while self.running:
            try:
                await self._control_iteration(server_id)
                await asyncio.sleep(self.CONTROL_INTERVAL_SECONDS)
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Top-level boundary of the task: log with traceback, back
                # off, and keep the loop alive.
                logger.exception(f"Control loop error for server {server_id}: {e}")
                await asyncio.sleep(self.ERROR_BACKOFF_SECONDS)

    async def _control_iteration(self, server_id: int):
        """Run a single poll / store / control cycle for a server.

        Does nothing when the server does not exist or is inactive. When the
        server cannot be reached, connection-loss handling runs instead.
        """
        db = SessionLocal()
        try:
            server = db.query(Server).filter(Server.id == server_id).first()
            if not server or not server.is_active:
                return

            # Function-scope import kept from the original
            # (presumably avoids a circular import -- TODO confirm).
            from backend.auth import decrypt_password
            client = IPMIClient(
                host=server.host,
                username=server.username,
                password=decrypt_password(server.encrypted_password),
                port=server.port,
                vendor=server.vendor,
            )

            # NOTE(review): the IPMIClient calls below are invoked
            # synchronously, so they block the event loop while they run.
            if not client.test_connection():
                logger.warning(f"Cannot connect to server {server.name}")
                await self._handle_connection_loss(db, server)
                return

            temps = client.get_temperatures()
            fans = client.get_fan_speeds()
            all_sensors = client.get_all_sensors()
            self._store_sensor_data(db, server_id, temps, fans, all_sensors)

            now = datetime.utcnow()
            server.last_seen = now
            # Refresh the sensor timestamp only when temperatures were really
            # returned. Refreshing it unconditionally right before the panic
            # check made the elapsed time always ~0, so panic never fired.
            if temps:
                self._last_sensor_data[server_id] = now

            if self._should_panic(db, server_id, server):
                await self._enter_panic_mode(db, server, client)
                return

            if server.auto_control_enabled:
                await self._apply_fan_curve(db, server, client, temps)
            db.commit()
        finally:
            db.close()

    def _store_sensor_data(self, db: Session, server_id: int,
                           temps: List[TemperatureReading],
                           fans: List[Any],
                           all_sensors: List[Any]):
        """Add temperature and fan readings to the session (no commit).

        Args:
            db: Session the new rows are added to.
            server_id: Server the readings belong to.
            temps: Temperature readings; ``name`` and ``value`` are stored.
            fans: Fan readings; ``fan_number``, ``fan_id``, ``speed_rpm`` and
                ``speed_percent`` are stored.
            all_sensors: Accepted for interface compatibility; not stored.
        """
        now = datetime.utcnow()
        for temp in temps:
            db.add(SensorData(
                server_id=server_id,
                sensor_name=temp.name,
                sensor_type="temperature",
                value=temp.value,
                unit="°C",
                timestamp=now,
            ))
        for fan in fans:
            db.add(FanData(
                server_id=server_id,
                fan_number=fan.fan_number,
                fan_id=fan.fan_id,
                speed_rpm=fan.speed_rpm,
                speed_percent=fan.speed_percent,
                is_manual=False,
                timestamp=now,
            ))

    def _should_panic(self, db: Session, server_id: int, server: Server) -> bool:
        """Return True when sensor data has been missing for too long.

        Always False when panic mode is disabled for the server or when no
        sensor data has ever been recorded for it. The timeout is the
        server's own value, falling back to ``settings.PANIC_TIMEOUT_SECONDS``.
        """
        if not server.panic_mode_enabled:
            return False
        last_seen = self._last_sensor_data.get(server_id)
        if not last_seen:
            return False
        timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
        elapsed = (datetime.utcnow() - last_seen).total_seconds()
        if elapsed > timeout:
            logger.warning(f"Panic mode triggered for server {server.name}: "
                           f"No sensor data for {elapsed:.0f}s")
            return True
        return False

    async def _enter_panic_mode(self, db: Session, server: Server, client: IPMIClient):
        """Enter panic mode: log the event and force the panic fan speed."""
        logger.critical(f"Entering PANIC MODE for server {server.name}")
        db.add(SystemLog(
            server_id=server.id,
            event_type="panic",
            message="Panic mode activated - No sensor data received",
            details=f"Setting all fans to {settings.PANIC_FAN_SPEED}%",
        ))

        # Record manual control as enabled only when the BMC accepted it, so
        # a failed attempt is retried on a later iteration.
        if not server.manual_control_enabled:
            if client.enable_manual_fan_control():
                server.manual_control_enabled = True
            else:
                logger.error(f"Failed to enable manual fan control for {server.name}")

        if not client.set_all_fans_speed(settings.PANIC_FAN_SPEED):
            logger.error(f"Failed to set panic fan speed for {server.name}")

        # Fans are no longer at the curve speed; force the next curve
        # application to send its target.
        self._last_set_speed.pop(server.id, None)
        db.commit()

    @staticmethod
    def _default_curve() -> List[FanCurvePoint]:
        """Return the curve used when a server has no fan curve configured."""
        return [
            FanCurvePoint(30, 10),
            FanCurvePoint(40, 20),
            FanCurvePoint(50, 35),
            FanCurvePoint(60, 50),
            FanCurvePoint(70, 70),
            FanCurvePoint(80, 100),
        ]

    def _needs_speed_update(self, server_id: int, target_speed: int) -> bool:
        """Return True when ``target_speed`` should be sent to the server.

        A speed is sent when none has been sent yet for the server, or when
        it differs from the last sent speed by at least
        ``SPEED_CHANGE_THRESHOLD`` (avoids constant small changes).
        """
        last = self._last_set_speed.get(server_id)
        if last is None:
            return True
        return abs(target_speed - last) >= self.SPEED_CHANGE_THRESHOLD

    async def _apply_fan_curve(self, db: Session, server: Server,
                               client: IPMIClient, temps: List[TemperatureReading]):
        """Apply the server's fan curve to the current temperatures.

        The hottest reading whose ``location`` starts with ``"cpu"`` drives
        the curve; when there is none, the hottest reading overall is used.
        Does nothing when ``temps`` is empty.
        """
        if not temps:
            return

        curve_data = server.fan_curve_data
        if curve_data:
            curve = self.curve_manager.parse_curve(curve_data)
        else:
            curve = self._default_curve()

        cpu_values = [t.value for t in temps if t.location.startswith("cpu")]
        max_temp = max(cpu_values) if cpu_values else max(t.value for t in temps)
        target_speed = self.curve_manager.calculate_speed(curve, max_temp)

        if not server.manual_control_enabled:
            if client.enable_manual_fan_control():
                server.manual_control_enabled = True
                # Manual control just took over; resend the speed regardless
                # of what was sent before.
                self._last_set_speed.pop(server.id, None)
                logger.info(f"Enabled manual fan control for {server.name}")

        # Compare against the last speed actually sent. The previous code
        # compared against a constant 50, so targets of 46-54% were never
        # applied no matter what speed the fans were running at.
        if not self._needs_speed_update(server.id, target_speed):
            return
        if client.set_all_fans_speed(target_speed):
            self._last_set_speed[server.id] = target_speed
            logger.info(f"Set {server.name} fans to {target_speed}% (temp: {max_temp}°C)")

    async def _handle_connection_loss(self, db: Session, server: Server):
        """Handle connection loss to a server.

        Always logs a warning. A ``SystemLog`` error entry is additionally
        written and committed when panic mode is enabled for the server and
        the last sensor data is older than the panic timeout.
        """
        logger.warning(f"Connection lost to server {server.name}")
        last_seen = self._last_sensor_data.get(server.id)
        if not last_seen:
            return
        timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
        elapsed = (datetime.utcnow() - last_seen).total_seconds()
        if elapsed > timeout and server.panic_mode_enabled:
            # The server is unreachable, so fans cannot be changed from here;
            # only the event is recorded.
            db.add(SystemLog(
                server_id=server.id,
                event_type="error",
                message="Connection lost to server",
                details=f"Last seen {elapsed:.0f} seconds ago",
            ))
            db.commit()

    def get_controller_status(self, server_id: int) -> Dict[str, Any]:
        """Get current controller status for a server.

        Returns:
            A dict with ``is_running`` (a control task exists and has not
            finished), ``last_sensor_data`` (ISO timestamp or None) and
            ``state`` (``"auto"`` when running, otherwise ``"off"``).
        """
        task = self._tasks.get(server_id)
        # A task that already finished must not be reported as running.
        is_running = task is not None and not task.done()
        last_seen = self._last_sensor_data.get(server_id)
        return {
            "is_running": is_running,
            "last_sensor_data": last_seen.isoformat() if last_seen else None,
            "state": ControlState.AUTO.value if is_running else ControlState.OFF.value,
        }
# Global controller instance shared by the whole process; it is started and
# stopped through initialize_fan_controller() / shutdown_fan_controller().
fan_controller = FanController()
async def initialize_fan_controller():
    """Start the module-level ``fan_controller``.

    Thin coroutine wrapper that awaits ``fan_controller.start()``.
    """
    await fan_controller.start()
async def shutdown_fan_controller():
    """Stop the module-level ``fan_controller``.

    Thin coroutine wrapper that awaits ``fan_controller.stop()``.
    """
    await fan_controller.stop()