feat: in-process crawler daemon with cron and worker pool (0.28.0)

The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity.

Modules:
- crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn.
- crawler::pipeline — run_metadata_pass (the shared discover/upsert/cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick.
- crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source.

Behavior:
- CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests).
- Catch-up tick fires on startup if the last persisted slot was missed.
- A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID.
- Worker dispatch wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker.
- Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark.

Dep additions: chrono-tz (IANA TZ parsing).

CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place.

Inline chapter-content sync semantics are unchanged — the queue is for the daemon; force-refetches and manual backfills still bypass it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-25 20:32:02 +02:00
parent 93c7fd63fc
commit 9fe0f26d75
14 changed files with 2162 additions and 309 deletions
--- a/backend/src/crawler/pipeline.rs
+++ b/backend/src/crawler/pipeline.rs
@@ -0,0 +1,347 @@
+//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
+//! that fan out chapter-content work. Shared between the daemon (cron tick)
+//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
+
+use anyhow::Context;
+use sqlx::PgPool;
+use uuid::Uuid;
+
+use crate::crawler::browser_manager::BrowserManager;
+use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
+use crate::crawler::rate_limit::HostRateLimiters;
+use crate::crawler::source::target::TargetSource;
+use crate::crawler::source::{DiscoverMode, FetchContext, Source};
+use crate::repo;
+use crate::storage::Storage;
+
+/// Coarse counters surfaced for logging at the end of a metadata pass.
+///
+/// Produced by `run_metadata_pass`; every counter starts at zero via
+/// `Default` and is only ever incremented during the pass.
+#[derive(Debug, Default, Clone, Copy)]
+pub struct MetadataStats {
+    /// Number of manga references returned by the discover step.
+    pub discovered: usize,
+    /// Number of manga whose upsert into the database succeeded.
+    pub upserted: usize,
+    /// Number of cover images that were downloaded and stored successfully.
+    pub covers_fetched: usize,
+    /// Number of manga skipped because fetching or upserting them failed.
+    pub mangas_failed: usize,
+}
+
+/// Executes one metadata pass against the target source: discover the manga
+/// list, fetch each manga, upsert it, refresh its cover when needed, and diff
+/// its chapter list. Only metadata is handled here; chapter content is
+/// enqueued as separate `SyncChapterContent` jobs by the caller once this
+/// function has returned.
+///
+/// * `limit` — maximum number of manga to discover; `0` disables the cap
+///   (full backfill) and is the only mode in which the drop pass runs.
+/// * `skip_chapters` — metadata-only mode: the source is built without
+///   chapter parsing and `sync_manga_chapters` is not called, since syncing
+///   an empty chapter list would soft-drop existing rows.
+///
+/// # Errors
+///
+/// Fails when the browser lease cannot be acquired, when `ensure_source`
+/// fails, or when discovery fails. Per-manga failures (fetch, upsert, cover,
+/// chapter sync) are logged and do not abort the pass.
+#[allow(clippy::too_many_arguments)]
+pub async fn run_metadata_pass(
+    browser_manager: &BrowserManager,
+    db: &PgPool,
+    storage: &dyn Storage,
+    http: &reqwest::Client,
+    rate: &HostRateLimiters,
+    start_url: &str,
+    limit: usize,
+    skip_chapters: bool,
+) -> anyhow::Result<MetadataStats> {
+    // One lease is held for the entire pass and released explicitly at the end.
+    let lease = browser_manager
+        .acquire()
+        .await
+        .context("acquire browser lease for metadata pass")?;
+    let browser: &chromiumoxide::Browser = &lease;
+
+    let base_source = TargetSource::new(start_url.to_string());
+    let source = if skip_chapters {
+        base_source.without_chapter_parsing()
+    } else {
+        base_source
+    };
+    let ctx = FetchContext { browser, rate };
+
+    // Register the source row; fall back to the raw start URL when no origin
+    // can be derived from it.
+    let source_id = source.id();
+    let base_url = origin_of(start_url).unwrap_or_else(|| start_url.to_string());
+    repo::crawler::ensure_source(db, source_id, "Target Site", &base_url)
+        .await
+        .context("ensure_source")?;
+
+    // Captured before discovery so the drop pass can compare against it.
+    let run_started_at = chrono::Utc::now();
+    let max_refs = if limit > 0 { Some(limit) } else { None };
+
+    tracing::info!(?max_refs, "discovering manga list");
+    let refs = source
+        .discover(&ctx, DiscoverMode::Backfill, max_refs)
+        .await
+        .context("discover failed")?;
+    tracing::info!(count = refs.len(), "discovered manga list");
+
+    let total = refs.len();
+    let mut stats = MetadataStats {
+        discovered: total,
+        upserted: 0,
+        covers_fetched: 0,
+        mangas_failed: 0,
+    };
+
+    for (position, manga_ref) in refs.iter().enumerate() {
+        tracing::info!(
+            idx = position + 1,
+            total,
+            key = %manga_ref.source_manga_key,
+            "fetching metadata"
+        );
+
+        // Step 1: fetch. A failure counts against the manga and skips it.
+        let manga = match source.fetch_manga(&ctx, manga_ref).await {
+            Err(err) => {
+                tracing::warn!(
+                    key = %manga_ref.source_manga_key,
+                    url = %manga_ref.url,
+                    error = ?err,
+                    "fetch_manga failed"
+                );
+                stats.mangas_failed += 1;
+                continue;
+            }
+            Ok(fetched) => fetched,
+        };
+
+        // Step 2: upsert. Same failure policy as the fetch step.
+        let upsert_result =
+            repo::crawler::upsert_manga_from_source(db, source_id, &manga_ref.url, &manga).await;
+        let upsert = match upsert_result {
+            Err(err) => {
+                tracing::error!(
+                    key = %manga_ref.source_manga_key,
+                    error = ?err,
+                    "upsert_manga_from_source failed"
+                );
+                stats.mangas_failed += 1;
+                continue;
+            }
+            Ok(done) => done,
+        };
+        stats.upserted += 1;
+        tracing::info!(
+            key = %manga.source_manga_key,
+            manga_id = %upsert.manga_id,
+            status = ?upsert.status,
+            title = %manga.title,
+            "manga upserted"
+        );
+
+        // Step 3: cover. Fetched when nothing is stored yet or when the
+        // upsert reported an update (the cover URL is part of metadata_hash,
+        // so an update may mean the URL moved). Failures are non-fatal.
+        let wants_cover = match upsert.status {
+            repo::crawler::UpsertStatus::Updated => true,
+            _ => upsert.cover_image_path.is_none(),
+        };
+        let cover_target = manga.cover_url.as_deref().filter(|_| wants_cover);
+        if let Some(cover_url) = cover_target {
+            let outcome = download_and_store_cover(
+                db,
+                storage,
+                http,
+                rate,
+                &manga_ref.url,
+                upsert.manga_id,
+                cover_url,
+            )
+            .await;
+            if let Err(err) = outcome {
+                tracing::warn!(
+                    manga_id = %upsert.manga_id,
+                    error = ?err,
+                    "cover download failed"
+                );
+            } else {
+                stats.covers_fetched += 1;
+            }
+        }
+
+        // Step 4: chapter-list diff, skipped entirely in metadata-only mode.
+        if skip_chapters {
+            continue;
+        }
+        let synced = repo::crawler::sync_manga_chapters(
+            db,
+            source_id,
+            upsert.manga_id,
+            &manga.chapters,
+        )
+        .await;
+        match synced {
+            Err(err) => tracing::warn!(
+                manga_id = %upsert.manga_id,
+                error = ?err,
+                "chapter sync failed"
+            ),
+            Ok(diff) => tracing::info!(
+                manga_id = %upsert.manga_id,
+                new = diff.new,
+                refreshed = diff.refreshed,
+                dropped = diff.dropped,
+                "chapters synced"
+            ),
+        }
+    }
+
+    // The drop pass only makes sense after an uncapped run: a capped run has
+    // not visited every manga, so "unseen" would be meaningless.
+    // NOTE(review): manga whose fetch or upsert failed above were not
+    // upserted during this run — confirm `mark_dropped_mangas` does not
+    // treat them as unseen after a transient failure.
+    if limit > 0 {
+        tracing::info!(limit, "partial sync — skipping drop pass");
+    } else {
+        match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
+            Err(err) => tracing::warn!(error = ?err, "drop-pass failed"),
+            Ok(count) => tracing::info!(dropped = count, "marked unseen manga as dropped"),
+        }
+    }
+
+    drop(lease);
+    Ok(stats)
+}
+
+/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
+/// manga that still has `page_count = 0` and a non-dropped source row.
+///
+/// Returns an [`EnqueueSummary`] with the inserted / skipped / failed
+/// counts. Repeats are handled by the dedup index and show up as skipped.
+///
+/// # Errors
+///
+/// Fails only when the selection query fails; an individual enqueue error
+/// is logged and counted in `EnqueueSummary::failed` instead.
+pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
+    // The bookmarks join yields one row per bookmark; the GROUP BY collapses
+    // those so each (source, chapter, key) combination is selected once.
+    let rows: Vec<PendingChapterRow> = sqlx::query_as(
+        r#"
+        SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
+          FROM chapters c
+          JOIN bookmarks b ON b.manga_id = c.manga_id
+          JOIN chapter_sources cs ON cs.chapter_id = c.id
+         WHERE c.page_count = 0
+           AND cs.dropped_at IS NULL
+         GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
+         ORDER BY c.manga_id, c.created_at ASC
+        "#,
+    )
+    .fetch_all(pool)
+    .await
+    .context("query bookmarked-pending chapters")?;
+
+    Ok(enqueue_chapter_rows(pool, rows).await)
+}
+
+/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
+/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
+///
+/// # Errors
+///
+/// Fails only when the selection query fails; an individual enqueue error
+/// is logged and counted in `EnqueueSummary::failed` instead.
+pub async fn enqueue_pending_for_manga(
+    pool: &PgPool,
+    manga_id: Uuid,
+) -> anyhow::Result<EnqueueSummary> {
+    let rows: Vec<PendingChapterRow> = sqlx::query_as(
+        r#"
+        SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
+          FROM chapters c
+          JOIN chapter_sources cs ON cs.chapter_id = c.id
+         WHERE c.manga_id = $1
+           AND c.page_count = 0
+           AND cs.dropped_at IS NULL
+         ORDER BY cs.source_id, c.id
+        "#,
+    )
+    .bind(manga_id)
+    .fetch_all(pool)
+    .await
+    .context("query pending chapters for manga")?;
+
+    Ok(enqueue_chapter_rows(pool, rows).await)
+}
+
+/// Tally of what happened to each chapter considered by an enqueue helper.
+#[derive(Debug, Default, Clone, Copy)]
+pub struct EnqueueSummary {
+    /// Jobs for which `jobs::enqueue` reported `EnqueueResult::Inserted`.
+    pub inserted: usize,
+    /// Jobs for which `jobs::enqueue` reported `EnqueueResult::Skipped`.
+    pub skipped: usize,
+    /// Jobs for which `jobs::enqueue` returned an error (logged as a warning).
+    pub failed: usize,
+}
+
+/// Row shape shared by both selection queries:
+/// `(source_id, chapter_id, source_chapter_key)`.
+type PendingChapterRow = (String, Uuid, String);
+
+/// Enqueues one `SyncChapterContent` job per row and tallies the outcomes.
+/// Enqueue errors are logged and counted rather than propagated, so one bad
+/// row does not stop the remaining rows from being enqueued.
+async fn enqueue_chapter_rows(pool: &PgPool, rows: Vec<PendingChapterRow>) -> EnqueueSummary {
+    let mut summary = EnqueueSummary::default();
+    for (source_id, chapter_id, source_chapter_key) in rows {
+        let payload = JobPayload::SyncChapterContent {
+            source_id,
+            chapter_id,
+            source_chapter_key,
+        };
+        match jobs::enqueue(pool, &payload).await {
+            Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
+            Ok(EnqueueResult::Skipped) => summary.skipped += 1,
+            Err(e) => {
+                tracing::warn!(
+                    %chapter_id,
+                    error = ?e,
+                    "enqueue chapter content failed"
+                );
+                summary.failed += 1;
+            }
+        }
+    }
+    summary
+}
+
+/// Fetches a manga's cover image and records where it was stored.
+///
+/// `cover_url` is resolved against `manga_url` (so relative cover URLs
+/// work), the host rate limiter is awaited, and the image is requested with
+/// `manga_url` as the `Referer`. The body is written to storage under
+/// `mangas/{manga_id}/cover.{ext}`, where `ext` comes from content sniffing
+/// and falls back to `bin` when the type is not recognized. Finally the
+/// manga's `cover_image_path` is pointed at that key.
+///
+/// NOTE(review): the previous comment said the CLI still calls this from its
+/// inline chapter-content loop, but the function is private to this module —
+/// confirm whether that note is stale.
+///
+/// # Errors
+///
+/// Fails on an unparsable or unjoinable URL, a rate-limiter error, a
+/// transport error, a non-2xx response, a body read error, a storage write
+/// error, or a database update error.
+async fn download_and_store_cover(
+    db: &PgPool,
+    storage: &dyn Storage,
+    http: &reqwest::Client,
+    rate: &HostRateLimiters,
+    manga_url: &str,
+    manga_id: Uuid,
+    cover_url: &str,
+) -> anyhow::Result<()> {
+    let page_url = reqwest::Url::parse(manga_url).context("parse manga URL")?;
+    let absolute = page_url
+        .join(cover_url)
+        .context("join cover URL onto manga URL")?;
+
+    rate.wait_for(absolute.as_str()).await?;
+
+    let request = http
+        .get(absolute.clone())
+        .header(reqwest::header::REFERER, manga_url);
+    let response = request
+        .send()
+        .await
+        .with_context(|| format!("GET {absolute}"))?;
+    let response = response
+        .error_for_status()
+        .with_context(|| format!("non-2xx for {absolute}"))?;
+    let body = response.bytes().await.context("read cover body")?;
+
+    // Sniff the payload to choose a file extension for the storage key.
+    let ext = match infer::get(&body) {
+        Some(kind) => kind.extension(),
+        None => "bin",
+    };
+    let key = format!("mangas/{manga_id}/cover.{ext}");
+
+    storage
+        .put(&key, &body)
+        .await
+        .with_context(|| format!("store cover at {key}"))?;
+    repo::manga::set_cover_image_path(db, manga_id, &key)
+        .await
+        .with_context(|| format!("update cover_image_path for {manga_id}"))?;
+
+    tracing::info!(
+        manga_id = %manga_id,
+        key = %key,
+        bytes = body.len(),
+        %absolute,
+        "cover stored"
+    );
+    Ok(())
+}
+
+/// Extracts the `scheme://host[:port]` origin from an absolute URL.
+///
+/// The authority ends at the first `/`, `?`, or `#`, so a URL without a path
+/// (`https://host?page=2`) does not leak its query or fragment into the
+/// origin. Any `user:pass@` userinfo is stripped so credentials never end up
+/// in the returned value.
+///
+/// Returns `None` when `url` has no `://` separator or when the scheme or
+/// host is empty.
+fn origin_of(url: &str) -> Option<String> {
+    let (scheme, rest) = url.split_once("://")?;
+    // `split` always yields at least one item, so `next()` cannot be `None`;
+    // the `?` only keeps the expression total.
+    let authority = rest.split(|c| matches!(c, '/' | '?' | '#')).next()?;
+    // Userinfo, when present, is everything up to the last `@`.
+    let host = authority
+        .rsplit_once('@')
+        .map_or(authority, |(_, host)| host);
+    if scheme.is_empty() || host.is_empty() {
+        return None;
+    }
+    Some(format!("{scheme}://{host}"))
+}