feat: cover retry backfill + admin force-resync for manga & chapter (0.50.0)
Adds a per-tick cover-backfill pass to the crawler daemon so mangas whose cover download failed on first attempt get retried — the metadata pass's early-stop optimisation otherwise prevents the walk from revisiting them. Adds admin-only POST /admin/mangas/:id/resync and POST /admin/chapters/:id/resync that refetch metadata + cover (or chapter content with force_refetch) from the crawler source synchronously and return the refreshed row. Surfaced in the UI as "Force resync" buttons on the manga detail and reader pages, admin-only via session.user.is_admin. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -13,7 +13,7 @@ use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::safety::{fetch_bytes_capped, looks_like_image, DownloadAllowlist};
|
||||
use crate::crawler::source::target::TargetSource;
|
||||
use crate::crawler::source::{FetchContext, Source};
|
||||
use crate::crawler::source::{FetchContext, Source, SourceMangaRef};
|
||||
use crate::repo;
|
||||
use crate::repo::crawler::UpsertStatus;
|
||||
use crate::storage::Storage;
|
||||
@@ -523,12 +523,133 @@ pub struct EnqueueSummary {
|
||||
pub failed: usize,
|
||||
}
|
||||
|
||||
/// Outcome counters for one [`backfill_missing_covers`] call.
///
/// All counters start at zero (`Default`). Every processed candidate bumps
/// `considered` and then exactly one of `fetched` or `failed`.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct CoverBackfillStats {
    /// Number of missing-cover candidates the pass looked at.
    pub considered: usize,
    /// Candidates whose cover was downloaded and stored successfully.
    pub fetched: usize,
    /// Candidates that failed: the source detail fetch errored, the source
    /// returned no cover URL, or the cover download/store errored.
    pub failed: usize,
}
|
||||
|
||||
/// Default per-tick cap for [`backfill_missing_covers`]. The metadata pass
|
||||
/// already retries covers when its walk reaches the affected manga; this
|
||||
/// backfill exists to catch the residual case where the early-stop
|
||||
/// optimisation prevents the walk from reaching mangas whose cover failed
|
||||
/// on first attempt. A small cap is enough because the backlog only grows
|
||||
/// from sporadic download failures, not from systematic misses.
///
/// Note: a `max_mangas` of `0` makes [`backfill_missing_covers`] return
/// immediately with zeroed stats, so `0` can be used to skip the pass.
|
||||
pub const COVER_BACKFILL_DEFAULT_MAX: usize = 10;
|
||||
|
||||
/// Re-attempt cover downloads for mangas where `cover_image_path IS NULL`
|
||||
/// but a live `manga_sources` row exists. Refetches the source detail
|
||||
/// page (which is where the cover URL lives) and downloads the cover.
|
||||
///
|
||||
/// Bounded by `max_mangas` per call so a steady stream of failing covers
|
||||
/// — e.g. a CDN host that's persistently 502 — can't monopolise a cron
|
||||
/// tick. Orders by `manga_sources.last_seen_at DESC` so the freshest
|
||||
/// missing-cover mangas are addressed first.
|
||||
///
|
||||
/// Failures are logged and counted, not raised: a single bad cover URL
|
||||
/// must not stall every other backfill behind it.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn backfill_missing_covers(
|
||||
browser_manager: &BrowserManager,
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
max_mangas: usize,
|
||||
allowlist: &DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
) -> anyhow::Result<CoverBackfillStats> {
|
||||
let mut stats = CoverBackfillStats::default();
|
||||
if max_mangas == 0 {
|
||||
return Ok(stats);
|
||||
}
|
||||
|
||||
let entries = repo::crawler::list_missing_covers(db, max_mangas as i64)
|
||||
.await
|
||||
.context("list_missing_covers")?;
|
||||
|
||||
if entries.is_empty() {
|
||||
return Ok(stats);
|
||||
}
|
||||
|
||||
let lease = browser_manager
|
||||
.acquire()
|
||||
.await
|
||||
.context("acquire browser lease for cover backfill")?;
|
||||
let browser_ref: &chromiumoxide::Browser = &lease;
|
||||
let ctx = FetchContext { browser: browser_ref, rate, tor };
|
||||
|
||||
for entry in entries {
|
||||
stats.considered += 1;
|
||||
// Metadata-only TargetSource: skip chapter-list parsing so a
|
||||
// missing-cover refetch doesn't soft-drop chapters on a partial
|
||||
// render. Cover URL alone is what we need.
|
||||
let source = TargetSource::new(entry.source_url.clone()).without_chapter_parsing();
|
||||
let r = SourceMangaRef {
|
||||
source_manga_key: entry.source_manga_key.clone(),
|
||||
title: String::new(),
|
||||
url: entry.source_url.clone(),
|
||||
};
|
||||
let cover_url = match source.fetch_manga(&ctx, &r).await {
|
||||
Ok(manga) => manga.cover_url,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
manga_id = %entry.manga_id,
|
||||
url = %entry.source_url,
|
||||
error = ?e,
|
||||
"cover backfill: fetch_manga failed"
|
||||
);
|
||||
stats.failed += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
let Some(cover_url) = cover_url else {
|
||||
tracing::warn!(
|
||||
manga_id = %entry.manga_id,
|
||||
url = %entry.source_url,
|
||||
"cover backfill: source returned no cover_url"
|
||||
);
|
||||
stats.failed += 1;
|
||||
continue;
|
||||
};
|
||||
match download_and_store_cover(
|
||||
db,
|
||||
storage,
|
||||
http,
|
||||
rate,
|
||||
&entry.source_url,
|
||||
entry.manga_id,
|
||||
&cover_url,
|
||||
allowlist,
|
||||
max_image_bytes,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => stats.fetched += 1,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
manga_id = %entry.manga_id,
|
||||
url = %entry.source_url,
|
||||
error = ?e,
|
||||
"cover backfill: download failed"
|
||||
);
|
||||
stats.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
drop(lease);
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
/// Download a cover image and persist its storage path. Local to the
|
||||
/// pipeline because the CLI still calls it from its inline chapter-content
|
||||
/// loop; once the worker pool fully replaces that path we can fold this
|
||||
/// into `pipeline` proper.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn download_and_store_cover(
|
||||
pub(crate) async fn download_and_store_cover(
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
|
||||
Reference in New Issue
Block a user