feat: cover retry backfill + admin force-resync for manga & chapter (0.50.0)

Adds a per-tick cover-backfill pass to the crawler daemon so mangas whose cover download failed on first attempt get retried — the metadata pass's early-stop optimisation otherwise prevents the walk from revisiting them.

Adds admin-only POST /admin/mangas/:id/resync and POST /admin/chapters/:id/resync. The manga endpoint refetches metadata + cover; the chapter endpoint refetches chapter content with force_refetch. Both fetch from the crawler source synchronously and return the refreshed row. Surfaced in the UI as "Force resync" buttons on the manga detail and reader pages, admin-only via session.user.is_admin.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-06-01 22:00:09 +02:00
parent 5c22dfdb41
commit c134bdbbde
19 changed files with 1505 additions and 17 deletions
--- a/backend/src/crawler/pipeline.rs
+++ b/backend/src/crawler/pipeline.rs
@@ -13,7 +13,7 @@ use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
 use crate::crawler::rate_limit::HostRateLimiters;
 use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
 use crate::crawler::source::target::TargetSource;
-use crate::crawler::source::{FetchContext, Source};
+use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
 use crate::repo;
 use crate::repo::crawler::UpsertStatus;
 use crate::storage::Storage;
@@ -523,12 +523,133 @@ pub struct EnqueueSummary {
    pub failed: usize,
 }

+/// Outcome counters for one [`backfill_missing_covers`] pass.
+///
+/// Every candidate the pass iterates over bumps `considered` and then
+/// exactly one of `fetched` or `failed`.
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
+pub struct CoverBackfillStats {
+    /// Candidate mangas the pass iterated over, whatever the outcome.
+    pub considered: usize,
+    /// Candidates whose cover was downloaded and stored successfully.
+    pub fetched: usize,
+    /// Candidates that failed: the source fetch errored, the source
+    /// returned no cover URL, or the cover download/store errored.
+    pub failed: usize,
+}
+
+/// Default per-tick cap for [`backfill_missing_covers`].
+///
+/// The metadata pass already retries covers when its walk reaches the
+/// affected manga; this backfill exists to catch the residual case where
+/// the early-stop optimisation prevents the walk from reaching mangas
+/// whose cover failed on first attempt.
+///
+/// A small cap is enough because the backlog only grows from sporadic
+/// download failures, not from systematic misses.
+pub const COVER_BACKFILL_DEFAULT_MAX: usize = 10;
+
+/// Re-attempt cover downloads for mangas where `cover_image_path IS NULL`
+/// but a live `manga_sources` row exists. Refetches the source detail
+/// page (which is where the cover URL lives) and downloads the cover.
+///
+/// Bounded by `max_mangas` per call so a steady stream of failing covers
+/// — e.g. a CDN host that's persistently 502 — can't monopolise a cron
+/// tick. Orders by `manga_sources.last_seen_at DESC` so the freshest
+/// missing-cover mangas are addressed first.
+///
+/// Returns zeroed stats without acquiring a browser lease when
+/// `max_mangas` is 0 or when no candidate rows are found.
+///
+/// Failures are logged and counted, not raised: a single bad cover URL
+/// must not stall every other backfill behind it.
+///
+/// # Errors
+///
+/// Only the setup steps propagate: the `list_missing_covers` query and
+/// acquiring the browser lease. Per-manga failures (source fetch, missing
+/// cover URL, download/store) are reported through
+/// [`CoverBackfillStats::failed`] instead.
+#[allow(clippy::too_many_arguments)]
+pub async fn backfill_missing_covers(
+    browser_manager: &BrowserManager,
+    db: &PgPool,
+    storage: &dyn Storage,
+    http: &reqwest::Client,
+    rate: &HostRateLimiters,
+    max_mangas: usize,
+    allowlist: &DownloadAllowlist,
+    max_image_bytes: usize,
+    tor: Option<&crate::crawler::tor::TorController>,
+) -> anyhow::Result<CoverBackfillStats> {
+    let mut stats = CoverBackfillStats::default();
+    if max_mangas == 0 {
+        return Ok(stats);
+    }
+
+    // A plain `as i64` would wrap to a negative limit for caps above
+    // `i64::MAX`; saturate so an oversized cap still means "as many as
+    // the query can return".
+    let limit = i64::try_from(max_mangas).unwrap_or(i64::MAX);
+
+    let entries = repo::crawler::list_missing_covers(db, limit)
+        .await
+        .context("list_missing_covers")?;
+
+    // Nothing to do: bail out before paying for a browser lease.
+    if entries.is_empty() {
+        return Ok(stats);
+    }
+
+    // One lease is shared by every refetch in this pass.
+    let lease = browser_manager
+        .acquire()
+        .await
+        .context("acquire browser lease for cover backfill")?;
+    let browser_ref: &chromiumoxide::Browser = &lease;
+    let ctx = FetchContext { browser: browser_ref, rate, tor };
+
+    for entry in entries {
+        stats.considered += 1;
+        // Metadata-only TargetSource: skip chapter-list parsing so a
+        // missing-cover refetch doesn't soft-drop chapters on a partial
+        // render. Cover URL alone is what we need.
+        let source = TargetSource::new(entry.source_url.clone()).without_chapter_parsing();
+        // The title is left empty: only the key and URL are filled in for
+        // this refetch.
+        let r = SourceMangaRef {
+            source_manga_key: entry.source_manga_key.clone(),
+            title: String::new(),
+            url: entry.source_url.clone(),
+        };
+        let cover_url = match source.fetch_manga(&ctx, &r).await {
+            Ok(manga) => manga.cover_url,
+            Err(e) => {
+                tracing::warn!(
+                    manga_id = %entry.manga_id,
+                    url = %entry.source_url,
+                    error = ?e,
+                    "cover backfill: fetch_manga failed"
+                );
+                stats.failed += 1;
+                continue;
+            }
+        };
+        // A successful fetch without a cover URL still counts as a failure:
+        // there is nothing to download for this manga.
+        let Some(cover_url) = cover_url else {
+            tracing::warn!(
+                manga_id = %entry.manga_id,
+                url = %entry.source_url,
+                "cover backfill: source returned no cover_url"
+            );
+            stats.failed += 1;
+            continue;
+        };
+        match download_and_store_cover(
+            db,
+            storage,
+            http,
+            rate,
+            &entry.source_url,
+            entry.manga_id,
+            &cover_url,
+            allowlist,
+            max_image_bytes,
+        )
+        .await
+        {
+            Ok(()) => stats.fetched += 1,
+            Err(e) => {
+                tracing::warn!(
+                    manga_id = %entry.manga_id,
+                    url = %entry.source_url,
+                    error = ?e,
+                    "cover backfill: download failed"
+                );
+                stats.failed += 1;
+            }
+        }
+    }
+
+    // Release the browser explicitly once every candidate has been handled.
+    drop(lease);
+    Ok(stats)
+}
+
 /// Download a cover image and persist its storage path. Local to the
 /// pipeline because the CLI still calls it from its inline chapter-content
 /// loop; once the worker pool fully replaces that path we can fold this
 /// into `pipeline` proper.
 #[allow(clippy::too_many_arguments)]
-async fn download_and_store_cover(
+pub(crate) async fn download_and_store_cover(
    db: &PgPool,
    storage: &dyn Storage,
    http: &reqwest::Client,