feat: in-process crawler daemon with cron and worker pool (0.28.0)

The backend now boots an internal crawler daemon that runs a daily
metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded
for multi-replica safety) and drains SyncChapterContent jobs from
crawler_jobs through a worker pool. Chromium launches lazily on first
job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper
  around browser::Handle, with an on_launch hook that re-injects
  PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert
  /cover/sync-chapters loop) and the enqueue_bookmarked_pending helper
  used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams
  (MetadataPass, ChapterDispatcher) so tests can inject stubs without
  standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers
  idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch wrapped in catch_unwind so a panicking handler
  marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the
  last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds
the browser via BrowserManager so the on_launch session injection
flow stays in one place. Inline chapter-content sync semantics are
unchanged — the queue is for the daemon, force-refetches and manual
backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-25 20:32:02 +02:00
parent 93c7fd63fc
commit 9fe0f26d75
14 changed files with 2162 additions and 309 deletions

View File

@@ -0,0 +1,347 @@
//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
//! that fan out chapter-content work. Shared between the daemon (cron tick)
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
use anyhow::Context;
use sqlx::PgPool;
use uuid::Uuid;
use crate::crawler::browser_manager::BrowserManager;
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::source::target::TargetSource;
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
use crate::repo;
use crate::storage::Storage;
/// Coarse counters surfaced for logging at the end of a metadata pass.
#[derive(Debug, Default, Clone, Copy)]
pub struct MetadataStats {
pub discovered: usize,
pub upserted: usize,
pub covers_fetched: usize,
pub mangas_failed: usize,
}
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
/// for the target source. Pure metadata; chapter content is enqueued as
/// separate `SyncChapterContent` jobs by the caller after this returns.
///
/// `limit == 0` means no cap (full backfill). `skip_chapters == true` is
/// the "metadata-only" mode (parser doesn't extract chapters, and
/// `sync_manga_chapters` is skipped — otherwise an empty chapter list
/// would soft-drop existing rows).
#[allow(clippy::too_many_arguments)]
pub async fn run_metadata_pass(
browser_manager: &BrowserManager,
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
start_url: &str,
limit: usize,
skip_chapters: bool,
) -> anyhow::Result<MetadataStats> {
let lease = browser_manager
.acquire()
.await
.context("acquire browser lease for metadata pass")?;
let browser_ref: &chromiumoxide::Browser = &lease;
let source = {
let s = TargetSource::new(start_url.to_string());
if skip_chapters {
s.without_chapter_parsing()
} else {
s
}
};
let ctx = FetchContext {
browser: browser_ref,
rate,
};
let source_id = source.id();
repo::crawler::ensure_source(
db,
source_id,
"Target Site",
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
)
.await
.context("ensure_source")?;
let run_started_at = chrono::Utc::now();
let max_refs = (limit > 0).then_some(limit);
tracing::info!(?max_refs, "discovering manga list");
let refs = source
.discover(&ctx, DiscoverMode::Backfill, max_refs)
.await
.context("discover failed")?;
tracing::info!(count = refs.len(), "discovered manga list");
let mut stats = MetadataStats {
discovered: refs.len(),
..MetadataStats::default()
};
for (i, r) in refs.iter().enumerate() {
tracing::info!(
idx = i + 1,
total = stats.discovered,
key = %r.source_manga_key,
"fetching metadata"
);
let manga = match source.fetch_manga(&ctx, r).await {
Ok(m) => m,
Err(e) => {
tracing::warn!(
key = %r.source_manga_key,
url = %r.url,
error = ?e,
"fetch_manga failed"
);
stats.mangas_failed += 1;
continue;
}
};
let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
.await
{
Ok(u) => u,
Err(e) => {
tracing::error!(
key = %r.source_manga_key,
error = ?e,
"upsert_manga_from_source failed"
);
stats.mangas_failed += 1;
continue;
}
};
stats.upserted += 1;
tracing::info!(
key = %manga.source_manga_key,
manga_id = %upsert.manga_id,
status = ?upsert.status,
title = %manga.title,
"manga upserted"
);
// Cover image: download when missing in storage or when metadata
// signaled an update (cover URL is part of metadata_hash, so
// Updated implies the URL may have moved). Failures are non-fatal.
let needs_cover = upsert.cover_image_path.is_none()
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
if needs_cover {
if let Some(cover_url) = manga.cover_url.as_deref() {
match download_and_store_cover(
db,
storage,
http,
rate,
&r.url,
upsert.manga_id,
cover_url,
)
.await
{
Ok(()) => stats.covers_fetched += 1,
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"cover download failed"
),
}
}
}
if !skip_chapters {
match repo::crawler::sync_manga_chapters(
db,
source_id,
upsert.manga_id,
&manga.chapters,
)
.await
{
Ok(diff) => tracing::info!(
manga_id = %upsert.manga_id,
new = diff.new,
refreshed = diff.refreshed,
dropped = diff.dropped,
"chapters synced"
),
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"chapter sync failed"
),
}
}
}
if limit == 0 {
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
}
} else {
tracing::info!(limit, "partial sync — skipping drop pass");
}
drop(lease);
Ok(stats)
}
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
/// manga that still has `page_count = 0` and a non-dropped source row.
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
r#"
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
FROM chapters c
JOIN bookmarks b ON b.manga_id = c.manga_id
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.page_count = 0
AND cs.dropped_at IS NULL
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
ORDER BY c.manga_id, c.created_at ASC
"#,
)
.fetch_all(pool)
.await
.context("query bookmarked-pending chapters")?;
let mut summary = EnqueueSummary::default();
for (source_id, chapter_id, source_chapter_key) in rows {
let payload = JobPayload::SyncChapterContent {
source_id,
chapter_id,
source_chapter_key,
};
match jobs::enqueue(pool, &payload).await {
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
Err(e) => {
tracing::warn!(
%chapter_id,
error = ?e,
"enqueue chapter content failed"
);
summary.failed += 1;
}
}
}
Ok(summary)
}
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
pub async fn enqueue_pending_for_manga(
pool: &PgPool,
manga_id: Uuid,
) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
r#"
SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
FROM chapters c
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.manga_id = $1
AND c.page_count = 0
AND cs.dropped_at IS NULL
ORDER BY cs.source_id, c.id
"#,
)
.bind(manga_id)
.fetch_all(pool)
.await
.context("query pending chapters for manga")?;
let mut summary = EnqueueSummary::default();
for (source_id, chapter_id, source_chapter_key) in rows {
let payload = JobPayload::SyncChapterContent {
source_id,
chapter_id,
source_chapter_key,
};
match jobs::enqueue(pool, &payload).await {
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
Err(e) => {
tracing::warn!(
%chapter_id,
error = ?e,
"enqueue chapter content failed"
);
summary.failed += 1;
}
}
}
Ok(summary)
}
#[derive(Debug, Default, Clone, Copy)]
pub struct EnqueueSummary {
pub inserted: usize,
pub skipped: usize,
pub failed: usize,
}
/// Download a cover image and persist its storage path. Local to the
/// pipeline because the CLI still calls it from its inline chapter-content
/// loop; once the worker pool fully replaces that path we can fold this
/// into `pipeline` proper.
async fn download_and_store_cover(
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
manga_url: &str,
manga_id: Uuid,
cover_url: &str,
) -> anyhow::Result<()> {
let absolute = reqwest::Url::parse(manga_url)
.context("parse manga URL")?
.join(cover_url)
.context("join cover URL onto manga URL")?;
rate.wait_for(absolute.as_str()).await?;
let resp = http
.get(absolute.clone())
.header(reqwest::header::REFERER, manga_url)
.send()
.await
.with_context(|| format!("GET {absolute}"))?
.error_for_status()
.with_context(|| format!("non-2xx for {absolute}"))?;
let bytes = resp.bytes().await.context("read cover body")?;
let kind = infer::get(&bytes);
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
let key = format!("mangas/{manga_id}/cover.{ext}");
storage
.put(&key, &bytes)
.await
.with_context(|| format!("store cover at {key}"))?;
repo::manga::set_cover_image_path(db, manga_id, &key)
.await
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
tracing::info!(
manga_id = %manga_id,
key = %key,
bytes = bytes.len(),
%absolute,
"cover stored"
);
Ok(())
}
fn origin_of(url: &str) -> Option<String> {
let (scheme, rest) = url.split_once("://")?;
let host = rest.split('/').next()?;
Some(format!("{scheme}://{host}"))
}