feat(crawler): live status surface, runtime session, dead-job repo, auto-restart

Adds the in-process observability + control infrastructure the admin
dashboard consumes:

- status.rs: CrawlerStatus/Phase/WorkerState + StatusHandle. The daemon
  publishes its current phase (idle/walking/fetching-metadata/cover-backfill),
  per-worker activity, and last-pass summary. Wired through the cron,
  run_metadata_pass, and the worker loop.
- session_control.rs: SessionController refreshes PHPSESSID at runtime —
  rewrites the shared reqwest cookie jar, updates the value on_launch reads,
  persists to crawler_state (survives restart), and clears the expired flag.
  on_launch now reads the live session instead of a startup snapshot.
- RealChapterDispatcher auto-triggers a coordinated browser restart after
  CRAWLER_BROWSER_RESTART_THRESHOLD consecutive transient failures.
- repo::crawler: list_dead_jobs, requeue_dead_jobs (all/manga/job, bypassing
  the quarantine, skipping live duplicates), job_state_counts.
- AppState gains CrawlerControl bundling browser_manager + session + status
  + metadata_pass for the admin endpoints.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-06-03 20:38:54 +02:00
parent 3f91bea768
commit cd0a1e13a9
12 changed files with 787 additions and 19 deletions

View File

@@ -17,8 +17,9 @@
//! Each public function is a transaction boundary so a partial failure
//! mid-call leaves the DB in its pre-call state.
use chrono::Utc;
use sqlx::{PgPool, Postgres, Transaction};
use chrono::{DateTime, Utc};
use serde::Serialize;
use sqlx::{FromRow, PgPool, Postgres, Transaction};
use uuid::Uuid;
use crate::crawler::source::{SourceChapterRef, SourceManga};
@@ -618,3 +619,169 @@ pub async fn last_run_completed_cleanly(
.unwrap_or(true))
}
// ---------------------------------------------------------------------------
// Dead-letter jobs: admin observability + requeue.
// ---------------------------------------------------------------------------
/// A `dead` crawler job joined to its chapter/manga context for the admin
/// dead-letter view. Chapter columns are `Option` because the join is
/// best-effort (the chapter may have been removed since the job died, or
/// the job may be a non-chapter kind).
#[derive(Debug, Clone, Serialize, FromRow)]
pub struct DeadJob {
pub id: Uuid,
pub kind: String,
pub chapter_id: Option<Uuid>,
pub manga_id: Option<Uuid>,
pub manga_title: Option<String>,
pub chapter_number: Option<i32>,
pub attempts: i32,
pub max_attempts: i32,
pub last_error: Option<String>,
pub updated_at: DateTime<Utc>,
}
/// Paginated list of `dead` jobs, newest-failed first, joined to chapter +
/// manga context. `search` filters on manga title (case-insensitive
/// substring). Returns the page slice plus the unfiltered-by-page total.
pub async fn list_dead_jobs(
pool: &PgPool,
search: Option<&str>,
limit: i64,
offset: i64,
) -> sqlx::Result<(Vec<DeadJob>, i64)> {
let search_pat = search
.map(|s| format!("%{}%", s.trim()))
.filter(|p| p.len() > 2);
let items: Vec<DeadJob> = sqlx::query_as(
r#"
SELECT
cj.id,
cj.payload->>'kind' AS kind,
(cj.payload->>'chapter_id')::uuid AS chapter_id,
c.manga_id AS manga_id,
m.title AS manga_title,
c.number AS chapter_number,
cj.attempts,
cj.max_attempts,
cj.last_error,
cj.updated_at
FROM crawler_jobs cj
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
LEFT JOIN mangas m ON m.id = c.manga_id
WHERE cj.state = 'dead'
AND ($1::text IS NULL OR m.title ILIKE $1)
ORDER BY cj.updated_at DESC
LIMIT $2 OFFSET $3
"#,
)
.bind(&search_pat)
.bind(limit)
.bind(offset)
.fetch_all(pool)
.await?;
let total: i64 = sqlx::query_scalar(
r#"
SELECT COUNT(*)
FROM crawler_jobs cj
LEFT JOIN chapters c ON c.id = (cj.payload->>'chapter_id')::uuid
LEFT JOIN mangas m ON m.id = c.manga_id
WHERE cj.state = 'dead'
AND ($1::text IS NULL OR m.title ILIKE $1)
"#,
)
.bind(&search_pat)
.fetch_one(pool)
.await?;
Ok((items, total))
}
/// Scope of a dead-job requeue.
#[derive(Debug, Clone)]
pub enum RequeueScope {
/// Every dead job.
All,
/// Dead jobs whose chapter belongs to this manga.
Manga(Uuid),
/// A single dead job by its id.
Job(Uuid),
}
/// Requeue dead jobs back to `pending` with a fresh attempt budget. This is
/// an explicit operator override, so it bypasses the dead-letter quarantine
/// the enqueue helpers honour (we act directly on the row). Skips any dead
/// job whose chapter already has a `pending`/`running` job so the partial
/// dedup index is never violated. Returns the number of rows requeued.
pub async fn requeue_dead_jobs(pool: &PgPool, scope: RequeueScope) -> sqlx::Result<u64> {
// Guard against resurrecting a dead job when a live one already covers
// the same chapter (would otherwise hit the dedup unique index).
const NO_LIVE_DUP: &str = r#"
AND NOT EXISTS (
SELECT 1 FROM crawler_jobs live
WHERE live.payload->>'kind' = 'sync_chapter_content'
AND live.payload->>'chapter_id' = crawler_jobs.payload->>'chapter_id'
AND live.state IN ('pending','running')
)
"#;
const SET: &str = "SET state = 'pending', attempts = 0, leased_until = NULL, \
last_error = NULL, scheduled_at = now(), updated_at = now()";
let affected = match scope {
RequeueScope::All => {
sqlx::query(&format!(
"UPDATE crawler_jobs {SET} WHERE state = 'dead' {NO_LIVE_DUP}"
))
.execute(pool)
.await?
.rows_affected()
}
RequeueScope::Manga(manga_id) => {
sqlx::query(&format!(
"UPDATE crawler_jobs {SET} \
WHERE state = 'dead' \
AND (payload->>'chapter_id')::uuid IN \
(SELECT id FROM chapters WHERE manga_id = $1) \
{NO_LIVE_DUP}"
))
.bind(manga_id)
.execute(pool)
.await?
.rows_affected()
}
RequeueScope::Job(job_id) => {
sqlx::query(&format!(
"UPDATE crawler_jobs {SET} WHERE state = 'dead' AND id = $1 {NO_LIVE_DUP}"
))
.bind(job_id)
.execute(pool)
.await?
.rows_affected()
}
};
Ok(affected)
}
/// Count crawler jobs grouped by state — drives the dashboard queue
/// gauges. Returns `(pending, running, dead)`.
pub async fn job_state_counts(pool: &PgPool) -> sqlx::Result<(i64, i64, i64)> {
let rows: Vec<(String, i64)> =
sqlx::query_as("SELECT state, COUNT(*) FROM crawler_jobs GROUP BY state")
.fetch_all(pool)
.await?;
let mut pending = 0;
let mut running = 0;
let mut dead = 0;
for (state, n) in rows {
match state.as_str() {
"pending" => pending = n,
"running" => running = n,
"dead" => dead = n,
_ => {}
}
}
Ok((pending, running, dead))
}