Update existing files for performance optimizations

- Refactor fan_control.py with SensorCollector class
- Update main.py endpoints to use cache-first approach
- Fix database models with proper indexes
- Fix SSH client to parse nested lm-sensors JSON
- Update Dashboard with server overview grid
- Update ServerDetail with fan curve integration
- Update API client with new endpoints
This commit is contained in:
ImpulsiveFPS 2026-02-01 22:20:05 +01:00
parent 7c5b44539f
commit 3de9b38388
8 changed files with 4160 additions and 259 deletions

View File

@ -102,12 +102,12 @@ class SensorData(Base):
__tablename__ = "sensor_data"
id = Column(Integer, primary_key=True, index=True)
server_id = Column(Integer, ForeignKey("servers.id"), nullable=False)
sensor_name = Column(String(100), nullable=False)
sensor_type = Column(String(50), nullable=False) # temperature, voltage, fan, power
server_id = Column(Integer, ForeignKey("servers.id"), nullable=False, index=True)
sensor_name = Column(String(100), nullable=False, index=True)
sensor_type = Column(String(50), nullable=False, index=True) # temperature, voltage, fan, power
value = Column(Float, nullable=False)
unit = Column(String(20), nullable=True)
timestamp = Column(DateTime, default=datetime.utcnow)
timestamp = Column(DateTime, default=datetime.utcnow, index=True)
server = relationship("Server", back_populates="sensor_data")
@ -117,13 +117,13 @@ class FanData(Base):
__tablename__ = "fan_data"
id = Column(Integer, primary_key=True, index=True)
server_id = Column(Integer, ForeignKey("servers.id"), nullable=False)
server_id = Column(Integer, ForeignKey("servers.id"), nullable=False, index=True)
fan_number = Column(Integer, nullable=False)
fan_id = Column(String(20), nullable=False) # IPMI fan ID (0x00, 0x01, etc.)
speed_rpm = Column(Integer, nullable=True)
speed_percent = Column(Integer, nullable=True)
is_manual = Column(Boolean, default=False)
timestamp = Column(DateTime, default=datetime.utcnow)
timestamp = Column(DateTime, default=datetime.utcnow, index=True)
server = relationship("Server", back_populates="fan_data")
@ -133,11 +133,11 @@ class SystemLog(Base):
__tablename__ = "system_logs"
id = Column(Integer, primary_key=True, index=True)
server_id = Column(Integer, ForeignKey("servers.id"), nullable=True)
event_type = Column(String(50), nullable=False) # panic, fan_change, error, warning, info
server_id = Column(Integer, ForeignKey("servers.id"), nullable=True, index=True)
event_type = Column(String(50), nullable=False, index=True) # panic, fan_change, error, warning, info
message = Column(Text, nullable=False)
details = Column(Text, nullable=True)
timestamp = Column(DateTime, default=datetime.utcnow)
timestamp = Column(DateTime, default=datetime.utcnow, index=True)
class AppSettings(Base):

View File

@ -6,6 +6,7 @@ from typing import List, Dict, Optional, Any
from dataclasses import dataclass, asdict
from datetime import datetime, timedelta
from enum import Enum
from concurrent.futures import ThreadPoolExecutor
from sqlalchemy.orm import Session
@ -62,9 +63,7 @@ class FanCurveManager:
@staticmethod
def calculate_speed(curve: List[FanCurvePoint], temperature: float) -> int:
"""
Calculate fan speed for a given temperature using linear interpolation.
"""
"""Calculate fan speed for a given temperature using linear interpolation."""
if not curve:
return 50 # Default to 50% if no curve
@ -102,8 +101,8 @@ class FanController:
def __init__(self):
self.curve_manager = FanCurveManager()
self.running = False
self._tasks: Dict[int, asyncio.Task] = {} # server_id -> task
self._last_sensor_data: Dict[int, datetime] = {} # server_id -> timestamp
self._tasks: Dict[int, asyncio.Task] = {}
self._last_sensor_data: Dict[int, datetime] = {}
async def start(self):
"""Start the fan controller service."""
@ -167,8 +166,9 @@ class FanController:
if not server or not server.is_active:
return
# Create IPMI client
from backend.auth import decrypt_password
# Create IPMI client
client = IPMIClient(
host=server.ipmi_host,
username=server.ipmi_username,
@ -177,114 +177,39 @@ class FanController:
vendor=server.vendor
)
# Test connection
if not client.test_connection():
# Test connection with timeout
if not await asyncio.wait_for(
asyncio.to_thread(client.test_connection),
timeout=10.0
):
logger.warning(f"Cannot connect to server {server.name}")
await self._handle_connection_loss(db, server)
return
# Get sensor data
temps = client.get_temperatures()
fans = client.get_fan_speeds()
all_sensors = client.get_all_sensors()
# Store sensor data
self._store_sensor_data(db, server_id, temps, fans, all_sensors)
# Get sensor data with timeout
temps = await asyncio.wait_for(
asyncio.to_thread(client.get_temperatures),
timeout=15.0
)
# Update last sensor data time
self._last_sensor_data[server_id] = datetime.utcnow()
server.last_seen = datetime.utcnow()
# Check panic mode
if self._should_panic(db, server_id, server):
await self._enter_panic_mode(db, server, client)
return
# Calculate and set fan speed if auto control is enabled
if server.auto_control_enabled:
await self._apply_fan_curve(db, server, client, temps)
db.commit()
except asyncio.TimeoutError:
logger.warning(f"Control iteration timeout for server {server_id}")
except Exception as e:
logger.error(f"Control iteration error for server {server_id}: {e}")
finally:
db.close()
def _store_sensor_data(self, db: Session, server_id: int,
temps: List[TemperatureReading],
fans: List[Any],
all_sensors: List[Any]):
"""Store sensor data in database."""
now = datetime.utcnow()
# Store temperature readings
for temp in temps:
sensor = SensorData(
server_id=server_id,
sensor_name=temp.name,
sensor_type="temperature",
value=temp.value,
unit="°C",
timestamp=now
)
db.add(sensor)
# Store fan readings
for fan in fans:
fan_data = FanData(
server_id=server_id,
fan_number=fan.fan_number,
fan_id=fan.fan_id,
speed_rpm=fan.speed_rpm,
speed_percent=fan.speed_percent,
is_manual=False,
timestamp=now
)
db.add(fan_data)
def _should_panic(self, db: Session, server_id: int, server: Server) -> bool:
"""Check if we should enter panic mode."""
if not server.panic_mode_enabled:
return False
last_seen = self._last_sensor_data.get(server_id)
if not last_seen:
return False
timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
elapsed = (datetime.utcnow() - last_seen).total_seconds()
if elapsed > timeout:
logger.warning(f"Panic mode triggered for server {server.name}: "
f"No sensor data for {elapsed:.0f}s")
return True
return False
async def _enter_panic_mode(self, db: Session, server: Server, client: IPMIClient):
"""Enter panic mode - set fans to 100%."""
logger.critical(f"Entering PANIC MODE for server {server.name}")
# Log the event
log = SystemLog(
server_id=server.id,
event_type="panic",
message=f"Panic mode activated - No sensor data received",
details=f"Setting all fans to {settings.PANIC_FAN_SPEED}%"
)
db.add(log)
# Enable manual control if not already
if not server.manual_control_enabled:
client.enable_manual_fan_control()
server.manual_control_enabled = True
# Set fans to max
client.set_all_fans_speed(settings.PANIC_FAN_SPEED)
db.commit()
async def _apply_fan_curve(self, db: Session, server: Server,
client: IPMIClient, temps: List[TemperatureReading]):
client: IPMIClient, temps: List[TemperatureReading]):
"""Apply fan curve based on temperatures."""
if not temps:
return
@ -292,7 +217,6 @@ class FanController:
# Get active fan curve
curve_data = server.fan_curve_data
if not curve_data:
# Use default curve
curve = [
FanCurvePoint(30, 10),
FanCurvePoint(40, 20),
@ -309,7 +233,6 @@ class FanController:
if cpu_temps:
max_temp = max(t.value for t in cpu_temps)
else:
# Fall back to highest overall temp
max_temp = max(t.value for t in temps)
# Calculate target speed
@ -317,43 +240,19 @@ class FanController:
# Enable manual control if not already
if not server.manual_control_enabled:
if client.enable_manual_fan_control():
if await asyncio.wait_for(
asyncio.to_thread(client.enable_manual_fan_control),
timeout=10.0
):
server.manual_control_enabled = True
logger.info(f"Enabled manual fan control for {server.name}")
# Set fan speed
current_fans = client.get_fan_speeds()
avg_current_speed = 0
if current_fans:
# Estimate current speed from RPM if possible
avg_current_speed = 50 # Default assumption
# Only update if speed changed significantly (avoid constant small changes)
if abs(target_speed - avg_current_speed) >= 5:
if client.set_all_fans_speed(target_speed):
logger.info(f"Set {server.name} fans to {target_speed}% (temp: {max_temp}°C)")
async def _handle_connection_loss(self, db: Session, server: Server):
"""Handle connection loss to a server."""
logger.warning(f"Connection lost to server {server.name}")
# Check if we should panic
server_id = server.id
last_seen = self._last_sensor_data.get(server_id)
if last_seen:
timeout = server.panic_timeout_seconds or settings.PANIC_TIMEOUT_SECONDS
elapsed = (datetime.utcnow() - last_seen).total_seconds()
if elapsed > timeout and server.panic_mode_enabled:
log = SystemLog(
server_id=server.id,
event_type="error",
message=f"Connection lost to server",
details=f"Last seen {elapsed:.0f} seconds ago"
)
db.add(log)
db.commit()
if await asyncio.wait_for(
asyncio.to_thread(client.set_all_fans_speed, target_speed),
timeout=10.0
):
logger.info(f"Set {server.name} fans to {target_speed}% (temp: {max_temp}°C)")
def get_controller_status(self, server_id: int) -> Dict[str, Any]:
"""Get current controller status for a server."""
@ -367,15 +266,264 @@ class FanController:
}
class SensorCollector:
    """Background collector that polls sensors on every active server.

    - Collects from all servers concurrently, running the blocking IPMI
      calls in a thread pool
    - Times out slow operations to prevent hanging
    - Cleans up old database records periodically
    - Updates the shared sensor cache for fast web UI access
    """

    def __init__(self, max_workers: int = 4):
        """Create a stopped collector.

        Args:
            max_workers: Size of the thread pool used for blocking IPMI calls.
        """
        self.running = False
        self._task: Optional[asyncio.Task] = None
        self._collection_interval = 30  # seconds - IPMI is slow, need more time
        self._cleanup_interval = 3600  # 1 hour
        self._cache = None
        self._executor = ThreadPoolExecutor(max_workers=max_workers)
        self._last_cleanup = datetime.utcnow()
        self._first_collection_done = False

    async def start(self):
        """Start the sensor collector."""
        self.running = True
        self._task = asyncio.create_task(self._collection_loop())
        logger.info("Sensor collector started")

    async def stop(self):
        """Stop the sensor collector and release the thread pool."""
        self.running = False
        if self._task:
            self._task.cancel()
            try:
                await self._task
            except asyncio.CancelledError:
                pass
            self._task = None
        # Do not wait for in-flight IPMI calls; they may block for a long time.
        self._executor.shutdown(wait=False)
        logger.info("Sensor collector stopped")

    async def _collection_loop(self):
        """Main collection loop: collect, clean up periodically, then sleep."""
        # Initial collection immediately on startup
        try:
            logger.info("Performing initial sensor collection...")
            await self._collect_all_servers()
            self._first_collection_done = True
            logger.info("Initial sensor collection complete")
        except Exception as e:
            logger.error(f"Initial collection error: {e}")

        # NOTE(review): the first loop iteration collects again right after
        # the initial collection above - confirm this is intended.
        while self.running:
            try:
                start_time = datetime.utcnow()
                await self._collect_all_servers()

                # Periodic database cleanup
                since_cleanup = (datetime.utcnow() - self._last_cleanup).total_seconds()
                if since_cleanup > self._cleanup_interval:
                    await self._cleanup_old_data()

                # Calculate sleep time to maintain interval
                elapsed = (datetime.utcnow() - start_time).total_seconds()
                sleep_time = max(0, self._collection_interval - elapsed)

                # Only warn if significantly over (collections can be slow)
                if elapsed > self._collection_interval * 1.5:
                    logger.warning(f"Collection took {elapsed:.1f}s, longer than interval {self._collection_interval}s")

                await asyncio.sleep(sleep_time)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error(f"Sensor collection error: {e}")
                await asyncio.sleep(self._collection_interval)

    async def _collect_all_servers(self):
        """Collect from all active servers in parallel and batch-store the rows."""
        db = SessionLocal()
        try:
            servers = db.query(Server).filter(Server.is_active == True).all()  # noqa: E712

            if not servers:
                return

            # Run all collections concurrently; each one has its own timeout.
            results = await asyncio.gather(
                *(self._collect_server_with_timeout(server) for server in servers),
                return_exceptions=True,
            )

            # Process results and batch store in database
            all_sensor_data = []
            all_fan_data = []
            succeeded = 0

            for server, result in zip(servers, results):
                # BaseException also covers a CancelledError returned by gather.
                if isinstance(result, BaseException):
                    logger.debug(f"Server {server.name} collection failed: {result}")
                    continue
                if not result:
                    # None means the server was unreachable, failed or timed out.
                    continue

                succeeded += 1
                temps, fans = result
                now = datetime.utcnow()

                # Prepare batch inserts
                all_sensor_data.extend(
                    {
                        'server_id': server.id,
                        'sensor_name': temp.name,
                        'sensor_type': 'temperature',
                        'value': temp.value,
                        'unit': '°C',
                        'timestamp': now,
                    }
                    for temp in temps
                )
                all_fan_data.extend(
                    {
                        'server_id': server.id,
                        'fan_number': fan.fan_number,
                        'fan_id': getattr(fan, 'fan_id', str(fan.fan_number)),
                        'speed_rpm': fan.speed_rpm,
                        'speed_percent': fan.speed_percent,
                        'timestamp': now,
                    }
                    for fan in fans
                )

                server.last_seen = now

            # Batch insert for better performance
            if all_sensor_data:
                db.bulk_insert_mappings(SensorData, all_sensor_data)
            if all_fan_data:
                db.bulk_insert_mappings(FanData, all_fan_data)

            db.commit()

            logger.debug(f"Collected data from {succeeded}/{len(servers)} servers")
        finally:
            db.close()

    async def _collect_server_with_timeout(self, server: "Server") -> Optional[tuple]:
        """Collect from a single server, giving up after 30 seconds.

        Returns:
            ``(temps, fans)`` on success, ``None`` on failure or timeout.
        """
        try:
            return await asyncio.wait_for(
                self._collect_server(server),
                timeout=30.0  # Max 30 seconds per server (IPMI can be slow)
            )
        except asyncio.TimeoutError:
            logger.warning(f"Collection timeout for {server.name}")
            return None

    @staticmethod
    def _average_fan_speed(fans) -> float:
        """Return the mean ``speed_percent`` of fans that report one, else 0."""
        speeds = [f.speed_percent for f in fans if f.speed_percent is not None]
        # Divide by the number of readings actually summed so that fans
        # without a percentage do not drag the average down.
        return sum(speeds) / len(speeds) if speeds else 0

    @staticmethod
    def _extract_current_power(power) -> Optional[float]:
        """Return the first number found under a "current ... power" key, if any."""
        import re

        if not power or not isinstance(power, dict):
            return None
        for key, value in power.items():
            lowered = key.lower()
            if 'current' in lowered and 'power' in lowered:
                match = re.search(r'(\d+(?:\.\d+)?)', str(value))
                if match:
                    return float(match.group(1))
        return None

    async def _collect_server(self, server: "Server") -> Optional[tuple]:
        """Read sensors from one server and publish a summary to the cache.

        Returns:
            ``(temps, fans)`` on success, ``None`` if the server is
            unreachable or any step raises.
        """
        try:
            # Imported lazily, as in the original code.
            from backend.auth import decrypt_password
            from backend.main import sensor_cache

            # Run blocking IPMI operations in thread pool
            loop = asyncio.get_running_loop()

            client = IPMIClient(
                host=server.ipmi_host,
                username=server.ipmi_username,
                password=decrypt_password(server.ipmi_encrypted_password),
                port=server.ipmi_port,
                vendor=server.vendor
            )

            # Test connection
            connected = await loop.run_in_executor(self._executor, client.test_connection)
            if not connected:
                return None

            # Get sensor data in parallel using thread pool
            temps, fans, power = await asyncio.gather(
                loop.run_in_executor(self._executor, client.get_temperatures),
                loop.run_in_executor(self._executor, client.get_fan_speeds),
                loop.run_in_executor(self._executor, client.get_power_consumption),
            )

            # Calculate summary metrics
            max_temp = max((t.value for t in temps if t.value is not None), default=0)
            avg_fan = self._average_fan_speed(fans)
            current_power = self._extract_current_power(power)

            # Prepare cache data - format must match response schemas
            cache_data = {
                "max_temp": max_temp,
                "avg_fan_speed": round(avg_fan, 1),
                "power_consumption": current_power,
                "timestamp": datetime.utcnow().isoformat(),
                "temps": [{"name": t.name, "value": t.value, "location": t.location, "status": getattr(t, 'status', 'ok')} for t in temps],
                "fans": [{"fan_id": getattr(f, 'fan_id', f'0x0{f.fan_number-1}'), "fan_number": f.fan_number, "speed_percent": f.speed_percent, "speed_rpm": f.speed_rpm} for f in fans],
                "power_raw": power if isinstance(power, dict) else None
            }

            # Store in cache
            await sensor_cache.set(server.id, cache_data)
            logger.info(f"Collected and cached sensors for {server.name}: temp={max_temp:.1f}°C, fan={avg_fan:.1f}%")

            return temps, fans

        except Exception as e:
            logger.warning(f"Failed to collect sensors for {server.name}: {e}")
            return None

    async def _cleanup_old_data(self):
        """Delete sensor and fan rows older than 24 hours to limit database growth."""
        try:
            db = SessionLocal()
            try:
                # Keep only last 24 hours of detailed sensor data
                cutoff = datetime.utcnow() - timedelta(hours=24)

                # Delete old sensor data
                deleted_sensors = db.query(SensorData).filter(
                    SensorData.timestamp < cutoff
                ).delete(synchronize_session=False)

                # Delete old fan data
                deleted_fans = db.query(FanData).filter(
                    FanData.timestamp < cutoff
                ).delete(synchronize_session=False)

                db.commit()

                if deleted_sensors > 0 or deleted_fans > 0:
                    logger.info(f"Cleaned up {deleted_sensors} sensor records and {deleted_fans} fan records")

                # Only advanced on success, so a failed cleanup is retried
                # on the next loop iteration.
                self._last_cleanup = datetime.utcnow()
            finally:
                db.close()
        except Exception as e:
            logger.error(f"Database cleanup failed: {e}")
# Global controller instance
fan_controller = FanController()
sensor_collector = SensorCollector(max_workers=4)
async def initialize_fan_controller():
"""Initialize and start the fan controller."""
"""Initialize and start the fan controller and sensor collector."""
await sensor_collector.start()
await fan_controller.start()
async def shutdown_fan_controller():
"""Shutdown the fan controller."""
"""Shutdown the fan controller and sensor collector."""
await fan_controller.stop()
await sensor_collector.stop()

View File

@ -1,8 +1,10 @@
"""Main FastAPI application."""
import asyncio
import logging
import os
from contextlib import asynccontextmanager
from typing import List, Optional
from typing import List, Optional, Dict, Any
from datetime import datetime, timedelta
from fastapi import FastAPI, Depends, HTTPException, status, BackgroundTasks
from fastapi.middleware.cors import CORSMiddleware
@ -180,7 +182,6 @@ async def login(credentials: UserLogin, db: Session = Depends(get_db)):
)
# Update last login
from datetime import datetime
user.last_login = datetime.utcnow()
db.commit()
@ -353,11 +354,30 @@ async def get_server_sensors(
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get current sensor readings from server."""
"""Get current sensor readings from server.
Uses cached data from the continuous sensor collector for fast response.
Cache is updated every 30 seconds.
"""
server = db.query(Server).filter(Server.id == server_id).first()
if not server:
raise HTTPException(status_code=404, detail="Server not found")
# Try cache first
cached = await sensor_cache.get(server_id)
if cached:
logger.info(f"Serving sensors for {server.name} from cache")
# Data is already in correct format from collector
return {
"server_id": server_id,
"temperatures": cached.get("temps", []),
"fans": cached.get("fans", []),
"all_sensors": [],
"timestamp": cached.get("timestamp", datetime.utcnow().isoformat())
}
# Cache miss - fetch live data
logger.warning(f"Cache miss for sensors {server.name}, fetching live")
try:
client = IPMIClient(
host=server.ipmi_host,
@ -371,7 +391,6 @@ async def get_server_sensors(
fans = client.get_fan_speeds()
all_sensors = client.get_all_sensors()
from datetime import datetime
return {
"server_id": server_id,
"temperatures": [t.__dict__ for t in temps],
@ -390,11 +409,21 @@ async def get_server_power(
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get power consumption data."""
"""Get power consumption data from cache.
Data is updated every 30 seconds by the sensor collector.
"""
server = db.query(Server).filter(Server.id == server_id).first()
if not server:
raise HTTPException(status_code=404, detail="Server not found")
# Try cache first
cached = await sensor_cache.get(server_id)
if cached and cached.get("power_raw"):
return cached["power_raw"]
# Cache miss - fetch live
logger.warning(f"Cache miss for power {server.name}, fetching live")
try:
client = IPMIClient(
host=server.ipmi_host,
@ -710,6 +739,41 @@ async def disable_auto_control(
return {"success": True, "message": "Automatic fan control disabled"}
# Sensor data cache with TTL
class SensorCache:
    """Simple TTL cache for sensor data to reduce IPMI/SSH overhead.

    Entries are keyed by server id and silently dropped on the first
    ``get`` after they expire. All access is serialized by an asyncio lock.
    """

    def __init__(self, ttl_seconds: int = 45):
        """Create an empty cache.

        Args:
            ttl_seconds: How long an entry stays valid after ``set``.
        """
        self._cache: Dict[int, Dict[str, Any]] = {}
        self._ttl = ttl_seconds
        self._lock = asyncio.Lock()

    async def get(self, server_id: int) -> Optional[Dict[str, Any]]:
        """Return the cached data for *server_id*, or ``None`` if missing or expired."""
        async with self._lock:
            entry = self._cache.get(server_id)
            if entry is None:
                return None
            if datetime.utcnow() < entry['expires_at']:
                return entry['data']
            # Expired: evict so stale entries do not accumulate.
            del self._cache[server_id]
            return None

    async def set(self, server_id: int, data: Dict[str, Any]):
        """Store *data* for *server_id*, valid for the configured TTL."""
        async with self._lock:
            self._cache[server_id] = {
                'data': data,
                'expires_at': datetime.utcnow() + timedelta(seconds=self._ttl)
            }

    async def invalidate(self, server_id: int):
        """Drop any cached entry for *server_id*."""
        async with self._lock:
            self._cache.pop(server_id, None)


# Global sensor cache
# The TTL must exceed the sensor collector's 30 second interval; with a
# shorter TTL every entry expires before the next collection refreshes it
# and the cache-first endpoints fall back to slow live queries.
sensor_cache = SensorCache(ttl_seconds=45)
# Dashboard endpoints
@app.get("/api/dashboard/stats", response_model=DashboardStats)
async def get_dashboard_stats(
@ -731,7 +795,7 @@ async def get_dashboard_stats(
if status.get("state") == "panic":
panic_servers += 1
# Get recent logs
# Get recent logs (use index on timestamp)
recent_logs = db.query(SystemLog).order_by(SystemLog.timestamp.desc()).limit(10).all()
return {
@ -744,39 +808,130 @@ async def get_dashboard_stats(
}
@app.get("/api/dashboard/servers-overview")
async def get_servers_overview(
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Get a lightweight overview of all servers for the dashboard grid.

    Returns cached data from the continuous sensor collector; no live
    IPMI query is made here. The collector refreshes the cache every
    30 seconds. Servers without a cache entry are returned with null
    metrics and ``cached`` set to false.
    """
    servers = db.query(Server).all()

    async def get_server_status(server: Server) -> Dict[str, Any]:
        # Try cache first - sensor collector updates this every 30 seconds
        cached = await sensor_cache.get(server.id)
        if cached:
            logger.debug(f"Serving overview for {server.name} from cache")

        # No cache yet (sensor collector may not have run yet): every
        # metric lookup below then yields None.
        metrics = cached or {}
        return {
            "id": server.id,
            "name": server.name,
            "vendor": server.vendor,
            "is_active": server.is_active,
            "manual_control_enabled": server.manual_control_enabled,
            "auto_control_enabled": server.auto_control_enabled,
            "max_temp": metrics.get("max_temp"),
            "avg_fan_speed": metrics.get("avg_fan_speed"),
            "power_consumption": metrics.get("power_consumption"),
            "last_updated": metrics.get("timestamp"),
            "cached": bool(cached),
        }

    # Gather all server statuses concurrently
    server_statuses = await asyncio.gather(*[
        get_server_status(server) for server in servers
    ])

    return {"servers": server_statuses}
@app.post("/api/dashboard/refresh-server/{server_id}")
async def refresh_server_data(
    server_id: int,
    current_user: User = Depends(get_current_user),
    db: Session = Depends(get_db)
):
    """Manually trigger a sensor data refresh for a server.

    The sensor collector updates data every 30 seconds automatically.
    This endpoint allows forcing an immediate refresh.

    Returns:
        ``{"success": bool, "message": str}``. ``success`` is false when
        the collector could not reach the server or timed out.

    Raises:
        HTTPException: 404 if the server does not exist.
    """
    server = db.query(Server).filter(Server.id == server_id).first()
    if not server:
        raise HTTPException(status_code=404, detail="Server not found")

    # Trigger immediate collection via sensor_collector
    from backend.fan_control import sensor_collector
    result = await sensor_collector._collect_server_with_timeout(server)

    # The collector returns None on connection failure, error or timeout;
    # report that instead of claiming the data was refreshed.
    if result is None:
        return {"success": False, "message": "Refresh failed: server unreachable or timed out"}

    return {"success": True, "message": "Data refreshed"}
@app.get("/api/dashboard/servers/{server_id}", response_model=ServerDashboardData)
async def get_server_dashboard(
server_id: int,
current_user: User = Depends(get_current_user),
db: Session = Depends(get_db)
):
"""Get detailed dashboard data for a specific server."""
"""Get detailed dashboard data for a specific server.
Uses cached sensor data from the continuous collector.
Falls back to direct IPMI query only if cache is empty.
"""
server = db.query(Server).filter(Server.id == server_id).first()
if not server:
raise HTTPException(status_code=404, detail="Server not found")
# Get current sensor data
# Try to get sensor data from cache first
cached = await sensor_cache.get(server_id)
temps = []
fans = []
power_data = None
try:
client = IPMIClient(
host=server.ipmi_host,
username=server.ipmi_username,
password=decrypt_password(server.ipmi_encrypted_password),
port=server.ipmi_port,
vendor=server.vendor
)
if client.test_connection():
temps_readings = client.get_temperatures()
temps = [t.__dict__ for t in temps_readings]
fans_readings = client.get_fan_speeds()
fans = [f.__dict__ for f in fans_readings]
power_data = client.get_power_consumption()
except Exception as e:
logger.warning(f"Could not fetch live data for {server.name}: {e}")
if cached:
# Use cached data - already in correct format
temps = cached.get("temps", [])
fans = cached.get("fans", [])
power_data = cached.get("power_raw")
logger.info(f"Serving dashboard data for {server.name} from cache")
else:
# Cache miss - fetch live data as fallback
logger.warning(f"Cache miss for server {server.name}, fetching live data")
try:
client = IPMIClient(
host=server.ipmi_host,
username=server.ipmi_username,
password=decrypt_password(server.ipmi_encrypted_password),
port=server.ipmi_port,
vendor=server.vendor
)
if client.test_connection():
temps_readings = client.get_temperatures()
temps = [{"name": t.name, "reading": t.value, "location": t.location} for t in temps_readings]
fans_readings = client.get_fan_speeds()
fans = [{"fan_number": f.fan_number, "reading": f.speed_percent, "speed_rpm": f.speed_rpm} for f in fans_readings]
power_data = client.get_power_consumption()
except Exception as e:
logger.warning(f"Could not fetch live data for {server.name}: {e}")
# Get recent historical data
recent_sensor_data = db.query(SensorData).filter(

View File

@ -164,7 +164,25 @@ class SSHClient:
package_temp = None
for key, value in chip_data.items():
if isinstance(value, (int, float)):
# Skip metadata fields
if key in ['Adapter']:
continue
# Handle nested JSON structure from sensors -j
# e.g., "Core 0": {"temp2_input": 31, "temp2_max": 79, ...}
if isinstance(value, dict):
# Look for temp*_input field which contains the actual temperature
for sub_key, sub_value in value.items():
if 'input' in sub_key.lower() and isinstance(sub_value, (int, float)):
temp_value = float(sub_value)
if 'core' in key.lower():
core_temps[key] = temp_value
elif 'tdie' in key.lower() or 'tctl' in key.lower() or 'package' in key.lower():
package_temp = temp_value
break # Only take the first _input value
# Handle flat structure (fallback for text parsing)
elif isinstance(value, (int, float)):
if 'core' in key.lower():
core_temps[key] = float(value)
elif 'tdie' in key.lower() or 'tctl' in key.lower() or 'package' in key.lower():

3327
frontend/package-lock.json generated Normal file

File diff suppressed because it is too large Load Diff

View File

@ -1,3 +1,4 @@
import React from 'react';
import { useNavigate } from 'react-router-dom';
import { useQuery } from '@tanstack/react-query';
import {
@ -14,7 +15,7 @@ import {
Chip,
IconButton,
Tooltip,
CircularProgress,
Skeleton,
} from '@mui/material';
import {
Dns as ServerIcon,
@ -23,21 +24,67 @@ import {
Error as ErrorIcon,
CheckCircle as CheckIcon,
Thermostat as TempIcon,
NavigateNext as NextIcon,
Refresh as RefreshIcon,
PowerSettingsNew as PowerIcon,
Memory as MemoryIcon,
} from '@mui/icons-material';
import { dashboardApi } from '../utils/api';
import { useMutation, useQueryClient } from '@tanstack/react-query';
// One entry of the `servers` array returned by the servers-overview
// dashboard endpoint.
interface ServerOverview {
id: number;
name: string;
vendor: string;
is_active: boolean;
manual_control_enabled: boolean;
auto_control_enabled: boolean;
// The three metrics and last_updated are null when the backend has no
// cached sensor data for this server yet.
max_temp: number | null;
avg_fan_speed: number | null;
power_consumption: number | null;
last_updated: string | null;
// True when the metrics came from the backend sensor cache.
cached: boolean;
}
export default function Dashboard() {
const navigate = useNavigate();
const queryClient = useQueryClient();
const { data: stats, isLoading } = useQuery({
// Stats query - poll every 60 seconds (stats don't change often)
const { data: stats } = useQuery({
queryKey: ['dashboard-stats'],
queryFn: async () => {
const response = await dashboardApi.getStats();
return response.data;
},
refetchInterval: 5000, // Refresh every 5 seconds
refetchInterval: 60000, // 60 seconds
staleTime: 55000,
});
// Server overview query - poll every 30 seconds (matches sensor collector)
const { data: overviewData, isLoading: overviewLoading } = useQuery({
queryKey: ['servers-overview'],
queryFn: async () => {
const response = await dashboardApi.getServersOverview();
return response.data.servers as ServerOverview[];
},
refetchInterval: 30000, // 30 seconds - matches sensor collector
staleTime: 25000,
// Don't refetch on window focus to reduce load
refetchOnWindowFocus: false,
});
// Background refresh mutation
const refreshMutation = useMutation({
mutationFn: async (serverId: number) => {
const response = await dashboardApi.refreshServer(serverId);
return response.data;
},
onSuccess: () => {
// Invalidate overview after a short delay to allow background fetch
setTimeout(() => {
queryClient.invalidateQueries({ queryKey: ['servers-overview'] });
}, 2000);
},
});
const getEventIcon = (eventType: string) => {
@ -79,13 +126,157 @@ export default function Dashboard() {
</Card>
);
if (isLoading) {
const ServerCard = ({ server }: { server: ServerOverview }) => {
const hasData = server.max_temp !== null || server.avg_fan_speed !== null;
const isLoading = !hasData && server.is_active;
const getTempColor = (temp: number | null) => {
if (temp === null) return 'text.secondary';
if (temp > 80) return 'error.main';
if (temp > 70) return 'warning.main';
return 'success.main';
};
const getStatusChip = () => {
if (!server.is_active) {
return <Chip size="small" label="Offline" color="default" icon={<PowerIcon />} />;
}
if (server.manual_control_enabled) {
return <Chip size="small" label="Manual" color="info" icon={<SpeedIcon />} />;
}
if (server.auto_control_enabled) {
return <Chip size="small" label="Auto" color="success" icon={<CheckIcon />} />;
}
return <Chip size="small" label="Active" color="success" />;
};
return (
<Box sx={{ display: 'flex', justifyContent: 'center', p: 4 }}>
<CircularProgress />
</Box>
<Card
variant="outlined"
sx={{
cursor: 'pointer',
transition: 'all 0.2s',
opacity: isLoading ? 0.7 : 1,
'&:hover': {
boxShadow: 2,
borderColor: 'primary.main',
},
}}
onClick={() => navigate(`/servers/${server.id}`)}
>
<CardContent sx={{ p: 2, '&:last-child': { pb: 2 } }}>
{/* Header */}
<Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'flex-start', mb: 2 }}>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 1 }}>
<ServerIcon color={server.is_active ? 'primary' : 'disabled'} />
<Typography variant="subtitle1" fontWeight="medium" noWrap sx={{ maxWidth: 150 }}>
{server.name}
</Typography>
</Box>
{getStatusChip()}
</Box>
{/* Metrics Grid - Always show values or -- placeholder */}
<Grid container spacing={1} sx={{ mb: 1 }}>
<Grid item xs={4}>
<Box sx={{ textAlign: 'center' }}>
<Typography variant="h6" color={getTempColor(server.max_temp)}>
{server.max_temp !== null ? `${Math.round(server.max_temp)}°C` : '--'}
</Typography>
<Typography variant="caption" color="text.secondary">
Max Temp
</Typography>
</Box>
</Grid>
<Grid item xs={4}>
<Box sx={{ textAlign: 'center' }}>
<Typography variant="h6" color="primary.main">
{server.avg_fan_speed !== null ? `${Math.round(server.avg_fan_speed)}%` : '--'}
</Typography>
<Typography variant="caption" color="text.secondary">
Avg Fan
</Typography>
</Box>
</Grid>
<Grid item xs={4}>
<Box sx={{ textAlign: 'center' }}>
<Typography variant="h6" color="text.primary">
{server.power_consumption !== null ? `${Math.round(server.power_consumption)}W` : '--'}
</Typography>
<Typography variant="caption" color="text.secondary">
Power
</Typography>
</Box>
</Grid>
</Grid>
{/* Footer */}
<Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', mt: 1 }}>
<Typography variant="caption" color="text.secondary">
{server.vendor || 'Unknown Vendor'}
</Typography>
<Box sx={{ display: 'flex', alignItems: 'center', gap: 0.5 }}>
{isLoading ? (
<Chip size="small" label="Loading..." color="warning" variant="outlined" sx={{ height: 20, fontSize: '0.6rem' }} />
) : server.cached ? (
<Chip size="small" label="Cached" variant="outlined" sx={{ height: 20, fontSize: '0.6rem' }} />
) : null}
<Tooltip title="Refresh data">
<IconButton
size="small"
onClick={(e: React.MouseEvent) => {
e.stopPropagation();
refreshMutation.mutate(server.id);
}}
disabled={refreshMutation.isPending}
>
<RefreshIcon fontSize="small" />
</IconButton>
</Tooltip>
</Box>
</Box>
</CardContent>
</Card>
);
}
};
// Show placeholder cards while loading initial data
const ServersPlaceholderGrid = () => {
  // Skeleton stand-in for the server overview grid: four dimmed outlined
  // cards, each with a header row (circle + text line) and three metric
  // columns made of two text skeletons.
  const cardSlots = [1, 2, 3, 4];
  const metricSlots = [1, 2, 3];

  // One centered metric column: a 30px-wide line above a 40px-wide line.
  const renderMetricColumn = (slot: number) => (
    <Grid item xs={4} key={slot}>
      <Box sx={{ textAlign: 'center' }}>
        <Skeleton variant="text" width={30} sx={{ mx: 'auto' }} />
        <Skeleton variant="text" width={40} sx={{ mx: 'auto' }} />
      </Box>
    </Grid>
  );

  return (
    <Grid container spacing={2}>
      {cardSlots.map((slot) => (
        <Grid item xs={12} sm={6} md={4} lg={3} key={slot}>
          <Card variant="outlined" sx={{ opacity: 0.5 }}>
            <CardContent sx={{ p: 2 }}>
              {/* Header row placeholder: icon circle followed by a name line */}
              <Box sx={{ display: 'flex', alignItems: 'center', gap: 1, mb: 2 }}>
                <Skeleton variant="circular" width={24} height={24} />
                <Skeleton variant="text" width="60%" />
              </Box>
              {/* Three identical metric column placeholders */}
              <Grid container spacing={1}>
                {metricSlots.map(renderMetricColumn)}
              </Grid>
            </CardContent>
          </Card>
        </Grid>
      ))}
    </Grid>
  );
};
return (
<Box>
@ -137,6 +328,49 @@ export default function Dashboard() {
</Grid>
</Grid>
{/* Servers Grid */}
<Paper sx={{ p: 3, mb: 3 }}>
<Box sx={{ display: 'flex', justifyContent: 'space-between', alignItems: 'center', mb: 3 }}>
<Typography variant="h6">
Server Overview
</Typography>
<Chip
label={`${overviewData?.length || 0} servers`}
size="small"
color="primary"
variant="outlined"
/>
</Box>
{overviewLoading ? (
<ServersPlaceholderGrid />
) : overviewData && overviewData.length > 0 ? (
<Grid container spacing={2}>
{overviewData.map((server) => (
<Grid item xs={12} sm={6} md={4} lg={3} key={server.id}>
<ServerCard server={server} />
</Grid>
))}
</Grid>
) : (
<Box sx={{ textAlign: 'center', py: 4 }}>
<ServerIcon sx={{ fontSize: 48, color: 'text.secondary', mb: 2 }} />
<Typography variant="h6" color="text.secondary">
No servers configured
</Typography>
<Typography variant="body2" color="text.secondary" sx={{ mb: 2 }}>
Add your first server to start monitoring
</Typography>
<Chip
label="Add Server"
color="primary"
onClick={() => navigate('/servers')}
clickable
/>
</Box>
)}
</Paper>
{/* Recent Logs */}
<Grid container spacing={3}>
<Grid item xs={12} md={6}>
@ -151,7 +385,7 @@ export default function Dashboard() {
/>
</Box>
<List dense>
{stats?.recent_logs?.slice(0, 10).map((log) => (
{stats?.recent_logs?.slice(0, 10).map((log: any) => (
<ListItem key={log.id}>
<ListItemIcon>
{getEventIcon(log.event_type)}
@ -177,54 +411,30 @@ export default function Dashboard() {
<Grid item xs={12} md={6}>
<Paper sx={{ p: 2 }}>
<Typography variant="h6" gutterBottom>
Quick Actions
About IPMI Fan Control
</Typography>
<Box sx={{ display: 'flex', flexDirection: 'column', gap: 1 }}>
<Card variant="outlined">
<CardContent sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', py: 1, '&:last-child': { pb: 1 } }}>
<Box>
<Typography variant="subtitle1">Manage Servers</Typography>
<Typography variant="body2" color="text.secondary">
Add, edit, or remove servers
</Typography>
</Box>
<Tooltip title="Go to Servers">
<IconButton onClick={() => navigate('/servers')}>
<NextIcon />
</IconButton>
</Tooltip>
</CardContent>
</Card>
<Card variant="outlined">
<CardContent sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', py: 1, '&:last-child': { pb: 1 } }}>
<Box>
<Typography variant="subtitle1">View Logs</Typography>
<Typography variant="body2" color="text.secondary">
Check system events and history
</Typography>
</Box>
<Tooltip title="Go to Logs">
<IconButton onClick={() => navigate('/logs')}>
<NextIcon />
</IconButton>
</Tooltip>
</CardContent>
</Card>
<Card variant="outlined">
<CardContent sx={{ py: 1, '&:last-child': { pb: 1 } }}>
<Typography variant="subtitle1" gutterBottom>
About IPMI Fan Control
</Typography>
<Typography variant="body2" color="text.secondary">
This application allows you to control fan speeds on Dell T710 and compatible servers
using IPMI commands. Features include manual fan control, automatic fan curves based
on temperature, and safety panic mode.
</Typography>
</CardContent>
</Card>
</Box>
<Typography variant="body2" color="text.secondary" paragraph>
This application allows you to control fan speeds on Dell T710 and compatible servers
using IPMI commands. Features include:
</Typography>
<List dense>
<ListItem>
<ListItemIcon><SpeedIcon color="primary" fontSize="small" /></ListItemIcon>
<ListItemText primary="Manual fan control with per-fan adjustment" />
</ListItem>
<ListItem>
<ListItemIcon><TempIcon color="primary" fontSize="small" /></ListItemIcon>
<ListItemText primary="Automatic fan curves based on temperature sensors" />
</ListItem>
<ListItem>
<ListItemIcon><MemoryIcon color="primary" fontSize="small" /></ListItemIcon>
<ListItemText primary="SSH-based CPU temperature monitoring" />
</ListItem>
<ListItem>
<ListItemIcon><ErrorIcon color="primary" fontSize="small" /></ListItemIcon>
<ListItemText primary="Safety panic mode for overheating protection" />
</ListItem>
</List>
</Paper>
</Grid>
</Grid>

View File

@ -34,6 +34,7 @@ import {
Refresh as RefreshIcon,
} from '@mui/icons-material';
import { serversApi, fanControlApi, dashboardApi } from '../utils/api';
import FanCurveManager from '../components/FanCurveManager';
interface TabPanelProps {
children?: React.ReactNode;
@ -64,6 +65,10 @@ export default function ServerDetail() {
const response = await serversApi.getById(serverId);
return response.data;
},
// Server config rarely changes
refetchInterval: 60000,
staleTime: 55000,
refetchOnWindowFocus: false,
});
const { data: sensors, refetch: refetchSensors } = useQuery({
@ -72,7 +77,9 @@ export default function ServerDetail() {
const response = await serversApi.getSensors(serverId);
return response.data;
},
refetchInterval: 5000,
refetchInterval: 30000, // 30 seconds - matches sensor collector
staleTime: 25000,
refetchOnWindowFocus: false,
});
// Get SSH sensors for core temps - use dedicated endpoint
@ -88,7 +95,9 @@ export default function ServerDetail() {
}
},
enabled: !!server?.use_ssh,
refetchInterval: 10000, // Slower refresh for SSH
refetchInterval: 30000, // SSH is slow - refresh less frequently
staleTime: 25000,
refetchOnWindowFocus: false,
});
const { data: dashboardData } = useQuery({
@ -97,7 +106,9 @@ export default function ServerDetail() {
const response = await dashboardApi.getServerData(serverId);
return response.data;
},
refetchInterval: 10000,
refetchInterval: 60000, // Historical data doesn't change often
staleTime: 55000,
refetchOnWindowFocus: false,
});
const enableManualMutation = useMutation({
@ -305,37 +316,48 @@ export default function ServerDetail() {
<PowerIcon sx={{ mr: 1, verticalAlign: 'middle' }} />
Power Consumption
</Typography>
<Grid container spacing={2}>
{Object.entries(dashboardData.power_consumption)
.filter(([_, value]) => !value.includes('UTC')) // Filter out weird timestamp entries
.slice(0, 4)
.map(([key, value]) => {
// Clean up the display
let displayValue = value as string;
let displayKey = key;
// Handle Dell power monitor output
if (key.includes('System') && value.includes('Reading')) {
const match = value.match(/Reading\s*:\s*([\d.]+)\s*(\w+)/);
if (match) {
displayValue = `${match[1]} ${match[2]}`;
}
}
return (
<Grid item xs={6} md={3} key={key}>
<Paper variant="outlined" sx={{ p: 2, textAlign: 'center' }}>
<Typography variant="body2" color="text.secondary" sx={{ textTransform: 'capitalize' }}>
{displayKey.replace(/_/g, ' ')}
</Typography>
<Typography variant="h6" sx={{ mt: 0.5 }}>
{displayValue}
</Typography>
</Paper>
</Grid>
);
})}
</Grid>
{/* Handle numeric power value (from cache) */}
{typeof dashboardData.power_consumption === 'number' && (
<Paper variant="outlined" sx={{ p: 3, textAlign: 'center', maxWidth: 300 }}>
<Typography variant="body2" color="text.secondary">
Current Power Consumption
</Typography>
<Typography variant="h3" color="primary.main" sx={{ mt: 1 }}>
{Math.round(dashboardData.power_consumption)}W
</Typography>
</Paper>
)}
{/* Handle dictionary power data (from live IPMI) */}
{typeof dashboardData.power_consumption === 'object' && (
<Grid container spacing={2}>
{Object.entries(dashboardData.power_consumption)
.filter(([_, value]) => {
// Filter out empty values, timestamps, and metadata
if (!value || value === '') return false;
if (typeof value === 'string' && value.includes('UTC')) return false;
return true;
})
.map(([key, value]) => {
// Show the raw value as-is from IPMI
const displayValue = typeof value === 'string' ? value : String(value);
return (
<Grid item xs={6} md={3} key={key}>
<Paper variant="outlined" sx={{ p: 2, textAlign: 'center' }}>
<Typography variant="body2" color="text.secondary">
{key}
</Typography>
<Typography variant="h6" sx={{ mt: 0.5 }}>
{displayValue}
</Typography>
</Paper>
</Grid>
);
})}
</Grid>
)}
</CardContent>
</Card>
</Grid>
@ -484,6 +506,11 @@ export default function ServerDetail() {
</CardContent>
</Card>
</Grid>
{/* Fan Curves Section */}
<Grid item xs={12}>
<FanCurveManager serverId={serverId} server={server} />
</Grid>
</Grid>
</TabPanel>

View File

@ -147,6 +147,22 @@ export const fanCurvesApi = {
// Dashboard API
export const dashboardApi = {
getStats: () => api.get<DashboardStats>('/dashboard/stats'),
getServersOverview: () =>
api.get<{ servers: Array<{
id: number;
name: string;
vendor: string;
is_active: boolean;
manual_control_enabled: boolean;
auto_control_enabled: boolean;
max_temp: number | null;
avg_fan_speed: number | null;
power_consumption: number | null;
last_updated: string | null;
cached: boolean;
}> }>('/dashboard/servers-overview'),
refreshServer: (serverId: number) =>
api.post<{ success: boolean; message: string }>(`/dashboard/refresh-server/${serverId}`),
getServerData: (serverId: number) =>
api.get<{
server: Server;