Files
Mangalord/backend/src/crawler/pipeline.rs
MechaCat02 45ce0d8f12 feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first
full walk records `seed_completed:<source>` in `crawler_state`, then
Incremental (newest-first, stops after N consecutive Unchanged
upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects
`auto` since it has no pre-run DB state.

`Source::discover` returns a lazy `DiscoverWalk` so Incremental can
break out mid-walk without prefetching pages. The drop pass and seed
marker are now gated on a true full walk — fixes a latent soft-drop
of the index tail under partial sweeps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 06:41:26 +02:00

468 lines
16 KiB
Rust

//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
//! that fan out chapter-content work. Shared between the daemon (cron tick)
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
use anyhow::Context;
use sqlx::PgPool;
use uuid::Uuid;
use crate::crawler::browser_manager::BrowserManager;
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
use crate::crawler::rate_limit::HostRateLimiters;
use crate::crawler::source::target::TargetSource;
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
use crate::repo;
use crate::storage::Storage;
/// Coarse counters surfaced for logging at the end of a metadata pass.
#[derive(Debug, Default, Clone, Copy)]
pub struct MetadataStats {
pub discovered: usize,
pub upserted: usize,
pub covers_fetched: usize,
pub mangas_failed: usize,
}
/// Decide whether the per-ref loop should stop based on the Incremental
/// streak counter. Pulled out as a pure function so the rule is unit-
/// testable without standing up the walker or DB.
pub(crate) fn should_stop(mode: DiscoverMode, consecutive_unchanged: usize) -> bool {
match mode {
DiscoverMode::Backfill => false,
DiscoverMode::Incremental { stop_after_unchanged } => {
consecutive_unchanged >= stop_after_unchanged
}
}
}
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
/// for the target source. Pure metadata; chapter content is enqueued as
/// separate `SyncChapterContent` jobs by the caller after this returns.
///
/// `limit == 0` means no cap (full sweep up to the source's own bound).
/// `skip_chapters == true` is the "metadata-only" mode (parser doesn't
/// extract chapters, and `sync_manga_chapters` is skipped — otherwise an
/// empty chapter list would soft-drop existing rows).
///
/// `mode` controls the walk:
/// - `Backfill` — oldest-first, no early exit. The only mode that runs
/// the end-of-walk drop pass + writes `seed_completed_at`.
/// - `Incremental { stop_after_unchanged }` — newest-first, breaks out
/// after N consecutive Unchanged upserts. Drop pass is skipped (the
/// tail of the index is never visited, so its `last_seen_at` is
/// stale and using it to soft-drop would be unsafe).
#[allow(clippy::too_many_arguments)]
pub async fn run_metadata_pass(
browser_manager: &BrowserManager,
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
start_url: &str,
limit: usize,
skip_chapters: bool,
mode: DiscoverMode,
) -> anyhow::Result<MetadataStats> {
let lease = browser_manager
.acquire()
.await
.context("acquire browser lease for metadata pass")?;
let browser_ref: &chromiumoxide::Browser = &lease;
let source = {
let s = TargetSource::new(start_url.to_string());
if skip_chapters {
s.without_chapter_parsing()
} else {
s
}
};
let ctx = FetchContext {
browser: browser_ref,
rate,
};
let source_id = source.id();
repo::crawler::ensure_source(
db,
source_id,
"Target Site",
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
)
.await
.context("ensure_source")?;
let run_started_at = chrono::Utc::now();
let max_refs = (limit > 0).then_some(limit);
tracing::info!(?mode, ?max_refs, "starting metadata pass");
let mut walker = source
.discover(&ctx, mode)
.await
.context("discover failed")?;
let mut stats = MetadataStats::default();
let mut consecutive_unchanged: usize = 0;
let mut walked_to_completion = false;
let mut hit_limit = false;
let mut hit_incremental_stop = false;
'outer: loop {
let batch = match walker.next_batch(&ctx).await? {
Some(b) => b,
None => {
walked_to_completion = true;
break;
}
};
for r in batch {
if max_refs.map(|m| stats.discovered >= m).unwrap_or(false) {
hit_limit = true;
tracing::info!(cap = ?max_refs, "max_results reached; halting walk");
break 'outer;
}
stats.discovered += 1;
tracing::info!(
idx = stats.discovered,
key = %r.source_manga_key,
"fetching metadata"
);
let manga = match source.fetch_manga(&ctx, &r).await {
Ok(m) => m,
Err(e) => {
tracing::warn!(
key = %r.source_manga_key,
url = %r.url,
error = ?e,
"fetch_manga failed"
);
stats.mangas_failed += 1;
continue;
}
};
let upsert = match repo::crawler::upsert_manga_from_source(
db, source_id, &r.url, &manga,
)
.await
{
Ok(u) => u,
Err(e) => {
tracing::error!(
key = %r.source_manga_key,
error = ?e,
"upsert_manga_from_source failed"
);
stats.mangas_failed += 1;
continue;
}
};
stats.upserted += 1;
tracing::info!(
key = %manga.source_manga_key,
manga_id = %upsert.manga_id,
status = ?upsert.status,
title = %manga.title,
"manga upserted"
);
// Cover image: download when missing in storage or when metadata
// signaled an update (cover URL is part of metadata_hash, so
// Updated implies the URL may have moved). Failures are non-fatal.
let needs_cover = upsert.cover_image_path.is_none()
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
if needs_cover {
if let Some(cover_url) = manga.cover_url.as_deref() {
match download_and_store_cover(
db,
storage,
http,
rate,
&r.url,
upsert.manga_id,
cover_url,
)
.await
{
Ok(()) => stats.covers_fetched += 1,
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"cover download failed"
),
}
}
}
if !skip_chapters {
match repo::crawler::sync_manga_chapters(
db,
source_id,
upsert.manga_id,
&manga.chapters,
)
.await
{
Ok(diff) => tracing::info!(
manga_id = %upsert.manga_id,
new = diff.new,
refreshed = diff.refreshed,
dropped = diff.dropped,
"chapters synced"
),
Err(e) => tracing::warn!(
manga_id = %upsert.manga_id,
error = ?e,
"chapter sync failed"
),
}
}
// Incremental stop: count consecutive Unchanged upserts and
// bail once the threshold is reached. New/Updated resets the
// streak so a fresh entry mid-batch doesn't accidentally trip
// the cutoff.
match upsert.status {
repo::crawler::UpsertStatus::Unchanged => {
consecutive_unchanged += 1;
}
repo::crawler::UpsertStatus::New | repo::crawler::UpsertStatus::Updated => {
consecutive_unchanged = 0;
}
}
if should_stop(mode, consecutive_unchanged) {
hit_incremental_stop = true;
tracing::info!(
consecutive_unchanged,
"incremental stop threshold reached; halting walk"
);
break 'outer;
}
}
}
// Drop pass: only when the walk truly covered everything the source
// surfaces. `last_seen_at` on un-visited rows is stale, so running
// the drop on a partial walk would soft-drop the tail of the index.
let full_walk = walked_to_completion && !hit_limit && !hit_incremental_stop;
let backfill_complete = full_walk && matches!(mode, DiscoverMode::Backfill);
if full_walk {
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
}
} else {
tracing::info!(
?mode,
hit_limit,
hit_incremental_stop,
"partial sync — skipping drop pass"
);
}
if backfill_complete {
if let Err(e) = repo::crawler::mark_seed_completed(db, source_id, run_started_at).await {
tracing::warn!(error = ?e, "mark_seed_completed failed");
} else {
tracing::info!(source_id, "seed marked complete");
}
}
tracing::info!(
?mode,
discovered = stats.discovered,
upserted = stats.upserted,
covers_fetched = stats.covers_fetched,
mangas_failed = stats.mangas_failed,
walked_to_completion,
hit_limit,
hit_incremental_stop,
"metadata pass complete"
);
drop(lease);
Ok(stats)
}
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
/// manga that still has `page_count = 0` and a non-dropped source row.
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
r#"
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
FROM chapters c
JOIN bookmarks b ON b.manga_id = c.manga_id
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.page_count = 0
AND cs.dropped_at IS NULL
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
ORDER BY c.manga_id, c.created_at ASC
"#,
)
.fetch_all(pool)
.await
.context("query bookmarked-pending chapters")?;
let mut summary = EnqueueSummary::default();
for (source_id, chapter_id, source_chapter_key) in rows {
let payload = JobPayload::SyncChapterContent {
source_id,
chapter_id,
source_chapter_key,
};
match jobs::enqueue(pool, &payload).await {
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
Err(e) => {
tracing::warn!(
%chapter_id,
error = ?e,
"enqueue chapter content failed"
);
summary.failed += 1;
}
}
}
Ok(summary)
}
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
pub async fn enqueue_pending_for_manga(
pool: &PgPool,
manga_id: Uuid,
) -> anyhow::Result<EnqueueSummary> {
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
r#"
SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
FROM chapters c
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE c.manga_id = $1
AND c.page_count = 0
AND cs.dropped_at IS NULL
ORDER BY cs.source_id, c.id
"#,
)
.bind(manga_id)
.fetch_all(pool)
.await
.context("query pending chapters for manga")?;
let mut summary = EnqueueSummary::default();
for (source_id, chapter_id, source_chapter_key) in rows {
let payload = JobPayload::SyncChapterContent {
source_id,
chapter_id,
source_chapter_key,
};
match jobs::enqueue(pool, &payload).await {
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
Err(e) => {
tracing::warn!(
%chapter_id,
error = ?e,
"enqueue chapter content failed"
);
summary.failed += 1;
}
}
}
Ok(summary)
}
#[derive(Debug, Default, Clone, Copy)]
pub struct EnqueueSummary {
pub inserted: usize,
pub skipped: usize,
pub failed: usize,
}
/// Download a cover image and persist its storage path. Local to the
/// pipeline because the CLI still calls it from its inline chapter-content
/// loop; once the worker pool fully replaces that path we can fold this
/// into `pipeline` proper.
async fn download_and_store_cover(
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
manga_url: &str,
manga_id: Uuid,
cover_url: &str,
) -> anyhow::Result<()> {
let absolute = reqwest::Url::parse(manga_url)
.context("parse manga URL")?
.join(cover_url)
.context("join cover URL onto manga URL")?;
rate.wait_for(absolute.as_str()).await?;
let resp = http
.get(absolute.clone())
.header(reqwest::header::REFERER, manga_url)
.send()
.await
.with_context(|| format!("GET {absolute}"))?
.error_for_status()
.with_context(|| format!("non-2xx for {absolute}"))?;
let bytes = resp.bytes().await.context("read cover body")?;
let kind = infer::get(&bytes);
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
let key = format!("mangas/{manga_id}/cover.{ext}");
storage
.put(&key, &bytes)
.await
.with_context(|| format!("store cover at {key}"))?;
repo::manga::set_cover_image_path(db, manga_id, &key)
.await
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
tracing::info!(
manga_id = %manga_id,
key = %key,
bytes = bytes.len(),
%absolute,
"cover stored"
);
Ok(())
}
fn origin_of(url: &str) -> Option<String> {
let (scheme, rest) = url.split_once("://")?;
let host = rest.split('/').next()?;
Some(format!("{scheme}://{host}"))
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn backfill_never_stops_regardless_of_streak() {
assert!(!should_stop(DiscoverMode::Backfill, 0));
assert!(!should_stop(DiscoverMode::Backfill, 100));
assert!(!should_stop(DiscoverMode::Backfill, usize::MAX));
}
#[test]
fn incremental_stops_when_streak_meets_threshold() {
let mode = DiscoverMode::Incremental {
stop_after_unchanged: 3,
};
assert!(!should_stop(mode, 0));
assert!(!should_stop(mode, 2));
assert!(should_stop(mode, 3), "stops at exactly the threshold");
assert!(should_stop(mode, 100), "stops at anything past threshold");
}
#[test]
fn incremental_with_zero_threshold_stops_immediately() {
// A nonsensical config (no Unchanged needed to stop) shouldn't
// panic — it just means the very first ref triggers the bail.
let mode = DiscoverMode::Incremental {
stop_after_unchanged: 0,
};
assert!(should_stop(mode, 0));
}
}