# Listing metadata: 382 lines, 13 KiB, Python
"""Fan control logic including curves and automatic control."""
|
|
import json
|
|
import logging
|
|
import asyncio
|
|
from typing import List, Dict, Optional, Any
|
|
from dataclasses import dataclass, asdict
|
|
from datetime import datetime, timedelta
|
|
from enum import Enum
|
|
|
|
from sqlalchemy.orm import Session
|
|
|
|
from backend.database import (
|
|
Server, FanCurve, SensorData, FanData, SystemLog,
|
|
get_db, SessionLocal
|
|
)
|
|
from backend.ipmi_client import IPMIClient, TemperatureReading
|
|
from backend.config import settings
|
|
|
|
# Module-level logger, named after this module's import path.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class ControlState(Enum):
    """Named fan-control states.

    Each member's value is the lowercase string form of its name, which is
    what gets exposed when a state is reported via ``.value``.
    """

    # Fan speed is driven automatically.
    AUTO = "auto"
    # Fan speed is set by hand.
    MANUAL = "manual"
    # Emergency state.
    PANIC = "panic"
    # No control is active.
    OFF = "off"
|
|
|
|
|
|
@dataclass
class FanCurvePoint:
    """One (temperature, fan speed) pair of a fan curve."""

    # Temperature at which this point applies; compared against sensor
    # readings that are stored with the unit "°C".
    temp: float
    # Fan speed to use at ``temp``; logged elsewhere in this module as a
    # percentage.
    speed: int
|
|
|
|
|
|
class FanCurveManager:
    """Manages fan curve calculations.

    A fan curve is a list of ``FanCurvePoint`` objects. The helpers here
    convert curves to and from their JSON string form and map a temperature
    onto a fan speed by linear interpolation between neighbouring points.
    """

    # (temperature, speed) pairs of the built-in fallback curve.
    _DEFAULT_CURVE_POINTS = (
        (30, 10),
        (40, 20),
        (50, 35),
        (60, 50),
        (70, 70),
        (80, 100),
    )

    # Speed returned when no curve is available at all.
    _NO_CURVE_SPEED = 50

    # Bounds applied to every calculated speed (fan speed is a percentage).
    _MIN_SPEED = 0
    _MAX_SPEED = 100

    @staticmethod
    def default_curve() -> List["FanCurvePoint"]:
        """Return a fresh copy of the built-in fallback curve."""
        return [
            FanCurvePoint(temp, speed)
            for temp, speed in FanCurveManager._DEFAULT_CURVE_POINTS
        ]

    @staticmethod
    def parse_curve(curve_data: str) -> List["FanCurvePoint"]:
        """Parse a fan curve from its JSON string form.

        Args:
            curve_data: JSON array of objects, each with ``"temp"`` and
                ``"speed"`` keys.

        Returns:
            The parsed points in input order. If the input is not valid
            JSON, is not an array, or contains an entry without the
            required keys, the error is logged and the default curve is
            returned instead.
        """
        try:
            data = json.loads(curve_data)
            # Valid JSON that is not an array (null, number, object, ...)
            # must take the same fallback path as malformed JSON.
            if not isinstance(data, list):
                raise TypeError(
                    f"expected a JSON array, got {type(data).__name__}"
                )
            return [FanCurvePoint(p["temp"], p["speed"]) for p in data]
        except (ValueError, KeyError, TypeError) as e:
            # json.JSONDecodeError is a ValueError subclass; TypeError covers
            # a None/non-string input and entries that are not objects.
            logger.error(f"Failed to parse fan curve: {e}")
            return FanCurveManager.default_curve()

    @staticmethod
    def serialize_curve(points: List["FanCurvePoint"]) -> str:
        """Serialize a fan curve to the JSON form ``parse_curve`` reads."""
        return json.dumps([{"temp": p.temp, "speed": p.speed} for p in points])

    @staticmethod
    def _clamp_speed(speed: float) -> int:
        """Round ``speed`` to an int and limit it to the valid range."""
        bounded = min(
            max(speed, FanCurveManager._MIN_SPEED),
            FanCurveManager._MAX_SPEED,
        )
        return int(round(bounded))

    @staticmethod
    def calculate_speed(curve: List["FanCurvePoint"], temperature: float) -> int:
        """Calculate the fan speed for a temperature by linear interpolation.

        Args:
            curve: Curve points in any order; they are sorted by
                temperature before use.
            temperature: Temperature to look up.

        Returns:
            50 if ``curve`` is empty. Otherwise the first point's speed at
            or below the lowest curve temperature, the last point's speed at
            or above the highest, and the interpolated speed in between.
            The result is always an int within 0..100.
        """
        if not curve:
            return FanCurveManager._NO_CURVE_SPEED

        ordered = sorted(curve, key=lambda point: point.temp)
        lowest, highest = ordered[0], ordered[-1]

        if temperature <= lowest.temp:
            return FanCurveManager._clamp_speed(lowest.speed)
        if temperature >= highest.temp:
            return FanCurveManager._clamp_speed(highest.speed)

        # Walk neighbouring pairs until one brackets the temperature.
        for lower, upper in zip(ordered, ordered[1:]):
            if lower.temp <= temperature <= upper.temp:
                span = upper.temp - lower.temp
                if span == 0:
                    # Duplicate temperatures: nothing to interpolate.
                    return FanCurveManager._clamp_speed(lower.speed)
                ratio = (temperature - lower.temp) / span
                return FanCurveManager._clamp_speed(
                    lower.speed + ratio * (upper.speed - lower.speed)
                )

        # Only reached when no comparison above held (e.g. a NaN
        # temperature); fall back to the highest point's speed.
        return FanCurveManager._clamp_speed(highest.speed)
|
|
|
|
|
|
class FanController:
    """Main fan controller for managing server fans.

    One asyncio task is kept per controlled server. Each task repeatedly
    reads the server's sensors through an ``IPMIClient``, stores the
    readings, checks whether panic mode applies and otherwise applies the
    server's fan curve.

    NOTE(review): the ``IPMIClient`` methods are called synchronously (not
    awaited) from coroutines. If they perform network I/O they block the
    event loop for their duration — consider moving them to an executor.
    """

    # Seconds between two control iterations of one server.
    _LOOP_INTERVAL_SECONDS = 5
    # Seconds to wait after an iteration raised before trying again.
    _ERROR_BACKOFF_SECONDS = 10
    # Smallest change (in percentage points) worth sending to the server.
    _MIN_SPEED_CHANGE = 5

    def __init__(self):
        """Create an idle controller with no tasks."""
        self.curve_manager = FanCurveManager()
        self.running = False
        self._tasks: Dict[int, asyncio.Task] = {}  # server_id -> task
        # server_id -> time temperature readings were last received
        self._last_sensor_data: Dict[int, datetime] = {}
        # server_id -> speed most recently applied from the fan curve
        self._last_applied_speed: Dict[int, int] = {}
        # server_ids currently held in panic mode
        self._panic_active = set()

    async def start(self):
        """Start the fan controller service.

        Marks the controller as running and starts a control task for every
        server that is active and has automatic control enabled.
        """
        self.running = True
        logger.info("Fan controller started")

        db = SessionLocal()
        try:
            # "== True" is intentional: it builds a SQL expression.
            servers = db.query(Server).filter(
                Server.auto_control_enabled == True,  # noqa: E712
                Server.is_active == True,  # noqa: E712
            ).all()

            for server in servers:
                await self.start_server_control(server.id)
        finally:
            db.close()

    async def stop(self):
        """Stop all fan control tasks and wait for them to finish."""
        self.running = False
        tasks = list(self._tasks.values())
        for task in tasks:
            task.cancel()
        self._tasks.clear()
        if tasks:
            # Let every loop unwind (and close its DB session) before
            # reporting the controller as stopped.
            await asyncio.gather(*tasks, return_exceptions=True)
        logger.info("Fan controller stopped")

    async def start_server_control(self, server_id: int):
        """Start automatic control for a server.

        Any task already registered for ``server_id`` is cancelled and
        replaced.
        """
        if server_id in self._tasks:
            self._tasks[server_id].cancel()

        # Forget the previously applied speed so the first iteration of the
        # new task always pushes the curve's target to the server.
        self._last_applied_speed.pop(server_id, None)

        task = asyncio.create_task(self._control_loop(server_id))
        self._tasks[server_id] = task
        logger.info(f"Started fan control for server {server_id}")

    async def stop_server_control(self, server_id: int):
        """Stop automatic control for a server (no-op if none is running)."""
        if server_id in self._tasks:
            self._tasks[server_id].cancel()
            del self._tasks[server_id]
            self._last_applied_speed.pop(server_id, None)
            self._panic_active.discard(server_id)
            logger.info(f"Stopped fan control for server {server_id}")

    async def _control_loop(self, server_id: int):
        """Run control iterations for one server until stopped or cancelled.

        Exceptions from an iteration are logged and followed by a longer
        pause; they never terminate the loop.
        """
        while self.running:
            try:
                await self._control_iteration(server_id)
                await asyncio.sleep(self._LOOP_INTERVAL_SECONDS)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Control loop error for server {server_id}: {e}")
                await asyncio.sleep(self._ERROR_BACKOFF_SECONDS)

    async def _control_iteration(self, server_id: int):
        """Run a single control iteration for a server.

        Reads and stores sensor data, then either enters panic mode or
        applies the fan curve. Does nothing if the server does not exist or
        is inactive.
        """
        db = SessionLocal()
        try:
            server = db.query(Server).filter(Server.id == server_id).first()
            if not server or not server.is_active:
                return

            # Imported here rather than at module level — presumably to
            # avoid a circular import; verify before moving it.
            from backend.auth import decrypt_password

            client = IPMIClient(
                host=server.ipmi_host,
                username=server.ipmi_username,
                password=decrypt_password(server.ipmi_encrypted_password),
                port=server.ipmi_port,
                vendor=server.vendor,
            )

            if not client.test_connection():
                logger.warning(f"Cannot connect to server {server.name}")
                await self._handle_connection_loss(db, server)
                return

            temps = client.get_temperatures()
            fans = client.get_fan_speeds()
            all_sensors = client.get_all_sensors()

            self._store_sensor_data(db, server_id, temps, fans, all_sensors)

            now = datetime.utcnow()
            # Only count the iteration as "sensor data received" when
            # temperature readings actually came back. Refreshing the
            # timestamp unconditionally would make the panic check below
            # see an elapsed time of ~0 every time and never trigger.
            if temps:
                self._last_sensor_data[server_id] = now
            server.last_seen = now

            if self._should_panic(db, server_id, server):
                await self._enter_panic_mode(db, server, client)
                return
            self._panic_active.discard(server_id)

            if server.auto_control_enabled:
                await self._apply_fan_curve(db, server, client, temps)

            db.commit()
        finally:
            db.close()

    def _store_sensor_data(self, db: Session, server_id: int,
                           temps: List[TemperatureReading],
                           fans: List[Any],
                           all_sensors: List[Any]):
        """Add temperature and fan readings to the session (no commit).

        Args:
            db: Session the new rows are added to.
            server_id: Server the readings belong to.
            temps: Temperature readings; ``name`` and ``value`` are stored.
            fans: Fan readings; ``fan_number``, ``fan_id``, ``speed_rpm``
                and ``speed_percent`` are stored.
            all_sensors: Accepted but currently not stored.
        """
        now = datetime.utcnow()

        for temp in temps:
            db.add(SensorData(
                server_id=server_id,
                sensor_name=temp.name,
                sensor_type="temperature",
                value=temp.value,
                unit="°C",
                timestamp=now,
            ))

        for fan in fans:
            db.add(FanData(
                server_id=server_id,
                fan_number=fan.fan_number,
                fan_id=fan.fan_id,
                speed_rpm=fan.speed_rpm,
                speed_percent=fan.speed_percent,
                is_manual=False,
                timestamp=now,
            ))

    def _should_panic(self, db: Session, server_id: int, server: Server) -> bool:
        """Return True if the server has gone too long without sensor data.

        Panic requires panic mode to be enabled on the server and at least
        one earlier successful reading; the timeout is the server's own
        ``panic_timeout_seconds`` or, if unset, the global setting.
        """
        if not server.panic_mode_enabled:
            return False

        last_seen = self._last_sensor_data.get(server_id)
        if not last_seen:
            return False

        timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
        elapsed = (datetime.utcnow() - last_seen).total_seconds()

        if elapsed > timeout:
            logger.warning(f"Panic mode triggered for server {server.name}: "
                           f"No sensor data for {elapsed:.0f}s")
            return True

        return False

    async def _enter_panic_mode(self, db: Session, server: Server, client: IPMIClient):
        """Enter (or stay in) panic mode: force fans to the panic speed.

        The critical log line and the ``SystemLog`` row are written only
        when the server was not already in panic mode; the fan speed is
        re-asserted on every call. Commits the session.
        """
        first_entry = server.id not in self._panic_active

        if first_entry:
            logger.critical(f"Entering PANIC MODE for server {server.name}")
            db.add(SystemLog(
                server_id=server.id,
                event_type="panic",
                message="Panic mode activated - No sensor data received",
                details=f"Setting all fans to {settings.PANIC_FAN_SPEED}%",
            ))

        # Record manual control as enabled only if the server accepted it,
        # so a failed attempt is retried on the next iteration.
        if not server.manual_control_enabled:
            if client.enable_manual_fan_control():
                server.manual_control_enabled = True

        client.set_all_fans_speed(settings.PANIC_FAN_SPEED)

        # The curve's last applied speed is no longer what the fans run at.
        self._last_applied_speed.pop(server.id, None)

        db.commit()
        self._panic_active.add(server.id)

    @staticmethod
    def _default_curve() -> List[FanCurvePoint]:
        """Return the curve used when a server has no curve configured."""
        return [
            FanCurvePoint(30, 10),
            FanCurvePoint(40, 20),
            FanCurvePoint(50, 35),
            FanCurvePoint(60, 50),
            FanCurvePoint(70, 70),
            FanCurvePoint(80, 100),
        ]

    async def _apply_fan_curve(self, db: Session, server: Server,
                               client: IPMIClient, temps: List[TemperatureReading]):
        """Apply the server's fan curve to the current temperatures.

        Uses the highest reading whose ``location`` starts with ``"cpu"``,
        or the highest reading overall if there is none. The resulting
        speed is sent only when it differs from the last speed this
        controller applied by at least ``_MIN_SPEED_CHANGE`` points, or
        when nothing has been applied yet.
        """
        if not temps:
            return

        curve_data = server.fan_curve_data
        if curve_data:
            curve = self.curve_manager.parse_curve(curve_data)
        else:
            curve = self._default_curve()

        cpu_temps = [t for t in temps if (t.location or "").startswith("cpu")]
        max_temp = max(t.value for t in (cpu_temps or temps))

        target_speed = self.curve_manager.calculate_speed(curve, max_temp)

        if not server.manual_control_enabled:
            if client.enable_manual_fan_control():
                server.manual_control_enabled = True
                logger.info(f"Enabled manual fan control for {server.name}")

        # Compare against what was actually applied last time. Comparing
        # against a fixed assumed speed would silently drop every target
        # that happens to lie close to that assumption.
        last_applied: Optional[int] = self._last_applied_speed.get(server.id)
        if (last_applied is not None
                and abs(target_speed - last_applied) < self._MIN_SPEED_CHANGE):
            return

        if client.set_all_fans_speed(target_speed):
            self._last_applied_speed[server.id] = target_speed
            logger.info(f"Set {server.name} fans to {target_speed}% (temp: {max_temp}°C)")

    async def _handle_connection_loss(self, db: Session, server: Server):
        """Handle a failed connection test for a server.

        Logs a warning. If panic mode is enabled and the last sensor data
        is older than the panic timeout, an error ``SystemLog`` row is
        written and committed. No fan command is sent here.
        """
        logger.warning(f"Connection lost to server {server.name}")

        server_id = server.id

        # The server's state is unknown after an outage; make sure the
        # curve speed is pushed again once it is reachable.
        self._last_applied_speed.pop(server_id, None)

        last_seen = self._last_sensor_data.get(server_id)
        if not last_seen:
            return

        timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
        elapsed = (datetime.utcnow() - last_seen).total_seconds()

        if elapsed > timeout and server.panic_mode_enabled:
            db.add(SystemLog(
                server_id=server.id,
                event_type="error",
                message="Connection lost to server",
                details=f"Last seen {elapsed:.0f} seconds ago",
            ))
            db.commit()

    def get_controller_status(self, server_id: int) -> Dict[str, Any]:
        """Get current controller status for a server.

        Returns:
            A dict with ``is_running`` (a control task exists and has not
            finished), ``last_sensor_data`` (ISO timestamp or None) and
            ``state`` (``"off"`` when not running, ``"panic"`` while panic
            mode is active, otherwise ``"auto"``).
        """
        task = self._tasks.get(server_id)
        # A task that already finished is still in the dict but is not
        # controlling anything.
        is_running = task is not None and not task.done()
        last_seen = self._last_sensor_data.get(server_id)

        if not is_running:
            state = ControlState.OFF
        elif server_id in self._panic_active:
            state = ControlState.PANIC
        else:
            state = ControlState.AUTO

        return {
            "is_running": is_running,
            "last_sensor_data": last_seen.isoformat() if last_seen else None,
            "state": state.value,
        }
|
|
|
|
|
|
# Global controller instance, created when this module is imported.
fan_controller = FanController()
|
|
|
|
|
|
async def initialize_fan_controller():
    """Start the module-level ``fan_controller``.

    Awaits ``fan_controller.start()`` and returns once it has completed.
    """
    await fan_controller.start()
|
|
|
|
|
|
async def shutdown_fan_controller():
    """Stop the module-level ``fan_controller``.

    Awaits ``fan_controller.stop()`` and returns once it has completed.
    """
    await fan_controller.stop()
|