feat(api): admin system metrics endpoint with disk/mem/cpu alerts (0.40.0)
Adds GET /api/v1/admin/system returning disk (scoped to storage_dir via statvfs), memory, CPU, and a server-side alerts array that fires at ≥90% disk or memory usage. Disk uses nix::sys::statvfs directly rather than sysinfo's Disks API to avoid mountpoint-matching gymnastics for the storage_dir. A new `Storage::local_root() -> Option<&Path>` trait method exposes the root; the default returns None so a future S3Storage gets `disk: null` in the response instead of fabricated numbers. CPU is sampled inline (refresh → 250ms sleep → refresh → read) so the endpoint adds 250ms of latency per call. No background cache yet — admin traffic is low-volume and the moving parts aren't worth it until polling shows up. Alerts are evaluated server-side so the frontend can render them without re-implementing the thresholds.
This commit is contained in:
@@ -5,6 +5,7 @@
|
||||
//! `crate::auth::extractor::RequireAdmin`).
|
||||
|
||||
pub mod mangas;
|
||||
pub mod system;
|
||||
pub mod users;
|
||||
|
||||
use axum::Router;
|
||||
@@ -12,5 +13,8 @@ use axum::Router;
|
||||
use crate::app::AppState;
|
||||
|
||||
pub fn routes() -> Router<AppState> {
|
||||
Router::new().merge(users::routes()).merge(mangas::routes())
|
||||
Router::new()
|
||||
.merge(users::routes())
|
||||
.merge(mangas::routes())
|
||||
.merge(system::routes())
|
||||
}
|
||||
|
||||
163
backend/src/api/admin/system.rs
Normal file
163
backend/src/api/admin/system.rs
Normal file
@@ -0,0 +1,163 @@
|
||||
//! System metrics for the admin dashboard.
|
||||
//!
|
||||
//! Disk is `statvfs(storage_dir)` so the number reflects the volume the
|
||||
//! app actually writes to (not the root filesystem of the host). When the
|
||||
//! storage backend doesn't expose a local path (e.g. a future S3 impl)
|
||||
//! the disk fields are `null` rather than fabricated.
|
||||
//!
|
||||
//! Memory and CPU come from `sysinfo`. CPU requires two refreshes with
|
||||
//! at least 200ms between them to compute a meaningful delta; the
|
||||
//! handler eats the 250ms wall-clock cost on each request. Admin
|
||||
//! traffic is low-volume so a background cache isn't worth the moving
|
||||
//! parts yet — revisit if polling becomes frequent.
|
||||
|
||||
use std::path::Path;
|
||||
use std::time::Duration;
|
||||
|
||||
use axum::extract::State;
|
||||
use axum::routing::get;
|
||||
use axum::{Json, Router};
|
||||
use serde::Serialize;
|
||||
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
|
||||
|
||||
use crate::app::AppState;
|
||||
use crate::auth::extractor::RequireAdmin;
|
||||
use crate::error::AppResult;
|
||||
|
||||
/// Usage percentage at or above which the `system` handler emits a warning
/// alert; the same threshold is applied to both disk and memory.
const ALERT_THRESHOLD_PERCENT: f64 = 90.0;
|
||||
|
||||
pub fn routes() -> Router<AppState> {
|
||||
Router::new().route("/admin/system", get(system))
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SystemStats {
|
||||
pub disk: Option<DiskStats>,
|
||||
pub memory: MemoryStats,
|
||||
pub cpu: CpuStats,
|
||||
pub alerts: Vec<Alert>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct DiskStats {
|
||||
pub total_bytes: u64,
|
||||
pub used_bytes: u64,
|
||||
pub free_bytes: u64,
|
||||
pub percent_used: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct MemoryStats {
|
||||
pub total_bytes: u64,
|
||||
pub used_bytes: u64,
|
||||
pub percent_used: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct CpuStats {
|
||||
pub percent_used: f64,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct Alert {
|
||||
pub level: AlertLevel,
|
||||
pub message: String,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize, Clone, Copy)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum AlertLevel {
|
||||
Warning,
|
||||
}
|
||||
|
||||
async fn system(
|
||||
State(state): State<AppState>,
|
||||
_admin: RequireAdmin,
|
||||
) -> AppResult<Json<SystemStats>> {
|
||||
let disk = state.storage.local_root().and_then(disk_stats_for);
|
||||
let (memory, cpu) = memory_and_cpu().await;
|
||||
let mut alerts = Vec::new();
|
||||
if let Some(d) = &disk {
|
||||
if d.percent_used >= ALERT_THRESHOLD_PERCENT {
|
||||
alerts.push(Alert {
|
||||
level: AlertLevel::Warning,
|
||||
message: format!(
|
||||
"disk near full ({:.0}% used)",
|
||||
d.percent_used
|
||||
),
|
||||
});
|
||||
}
|
||||
}
|
||||
if memory.percent_used >= ALERT_THRESHOLD_PERCENT {
|
||||
alerts.push(Alert {
|
||||
level: AlertLevel::Warning,
|
||||
message: format!(
|
||||
"memory near full ({:.0}% used)",
|
||||
memory.percent_used
|
||||
),
|
||||
});
|
||||
}
|
||||
Ok(Json(SystemStats {
|
||||
disk,
|
||||
memory,
|
||||
cpu,
|
||||
alerts,
|
||||
}))
|
||||
}
|
||||
|
||||
fn disk_stats_for(root: &Path) -> Option<DiskStats> {
|
||||
let s = nix::sys::statvfs::statvfs(root).ok()?;
|
||||
// statvfs reports `f_frsize * f_blocks` for total bytes. `f_bavail`
|
||||
// is "free to non-root callers" which is what an operator actually
|
||||
// cares about — `f_bfree` includes blocks reserved for root.
|
||||
let block = s.fragment_size();
|
||||
let total = block * s.blocks();
|
||||
let avail = block * s.blocks_available();
|
||||
let used = total.saturating_sub(avail);
|
||||
let percent_used = if total > 0 {
|
||||
(used as f64) * 100.0 / (total as f64)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
Some(DiskStats {
|
||||
total_bytes: total,
|
||||
used_bytes: used,
|
||||
free_bytes: avail,
|
||||
percent_used,
|
||||
})
|
||||
}
|
||||
|
||||
async fn memory_and_cpu() -> (MemoryStats, CpuStats) {
|
||||
// sysinfo's CPU sampling needs two refreshes with a delay between
|
||||
// them — the first seeds the delta counters, the second measures.
|
||||
// We do this once per request; admin traffic is low enough that the
|
||||
// 250ms cost is invisible.
|
||||
let mut sys = System::new_with_specifics(
|
||||
RefreshKind::new()
|
||||
.with_cpu(CpuRefreshKind::everything())
|
||||
.with_memory(MemoryRefreshKind::everything()),
|
||||
);
|
||||
sys.refresh_cpu_all();
|
||||
// Yield the runtime instead of blocking it for the gap.
|
||||
tokio::time::sleep(Duration::from_millis(250)).await;
|
||||
sys.refresh_cpu_all();
|
||||
sys.refresh_memory();
|
||||
|
||||
let total = sys.total_memory();
|
||||
let used = sys.used_memory();
|
||||
let mem_pct = if total > 0 {
|
||||
(used as f64) * 100.0 / (total as f64)
|
||||
} else {
|
||||
0.0
|
||||
};
|
||||
let memory = MemoryStats {
|
||||
total_bytes: total,
|
||||
used_bytes: used,
|
||||
percent_used: mem_pct,
|
||||
};
|
||||
|
||||
let cpu = CpuStats {
|
||||
percent_used: sys.global_cpu_usage() as f64,
|
||||
};
|
||||
(memory, cpu)
|
||||
}
|
||||
@@ -86,6 +86,10 @@ impl Storage for LocalStorage {
|
||||
let path: &Path = &self.resolve(key)?;
|
||||
Ok(fs::try_exists(path).await?)
|
||||
}
|
||||
|
||||
fn local_root(&self) -> Option<&Path> {
|
||||
Some(&self.root)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -9,6 +9,8 @@ mod local;
|
||||
use std::io;
|
||||
use std::pin::Pin;
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use async_trait::async_trait;
|
||||
use bytes::Bytes;
|
||||
use futures_core::Stream;
|
||||
@@ -44,4 +46,13 @@ pub trait Storage: Send + Sync {
|
||||
async fn get_stream(&self, key: &str) -> Result<StreamingFile, StorageError>;
|
||||
async fn delete(&self, key: &str) -> Result<(), StorageError>;
|
||||
async fn exists(&self, key: &str) -> Result<bool, StorageError>;
|
||||
|
||||
/// Filesystem path the backend is rooted at, when introspectable.
|
||||
/// Returns `None` for backends that aren't a local filesystem (e.g.
|
||||
/// a future `S3Storage`). The admin system endpoint uses this to
|
||||
/// statvfs the data dir; backends that return `None` get a `disk:
|
||||
/// null` payload instead of fabricated numbers.
|
||||
fn local_root(&self) -> Option<&Path> {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user