feat: in-process crawler daemon with cron and worker pool (0.28.0)
The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity. Modules: - crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn. - crawler::pipeline — run_metadata_pass (the shared discover/upsert /cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick. - crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source. Behavior: - CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests). - Catch-up tick fires on startup if the last persisted slot was missed. - A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID. - Worker dispatch wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker. - Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark. Dep additions: chrono-tz (IANA TZ parsing). CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place. Inline chapter-content sync semantics are unchanged — the queue is for the daemon, force-refetches and manual backfills still bypass it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
347
backend/src/crawler/pipeline.rs
Normal file
347
backend/src/crawler/pipeline.rs
Normal file
@@ -0,0 +1,347 @@
|
||||
//! Crawler pipeline — the reusable metadata pass and the enqueue helpers
|
||||
//! that fan out chapter-content work. Shared between the daemon (cron tick)
|
||||
//! and the CLI (`bin/crawler.rs`) so behavior stays in lockstep.
|
||||
|
||||
use anyhow::Context;
|
||||
use sqlx::PgPool;
|
||||
use uuid::Uuid;
|
||||
|
||||
use crate::crawler::browser_manager::BrowserManager;
|
||||
use crate::crawler::jobs::{self, EnqueueResult, JobPayload};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::source::target::TargetSource;
|
||||
use crate::crawler::source::{DiscoverMode, FetchContext, Source};
|
||||
use crate::repo;
|
||||
use crate::storage::Storage;
|
||||
|
||||
/// Coarse counters surfaced for logging at the end of a metadata pass.
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub struct MetadataStats {
|
||||
pub discovered: usize,
|
||||
pub upserted: usize,
|
||||
pub covers_fetched: usize,
|
||||
pub mangas_failed: usize,
|
||||
}
|
||||
|
||||
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
|
||||
/// for the target source. Pure metadata; chapter content is enqueued as
|
||||
/// separate `SyncChapterContent` jobs by the caller after this returns.
|
||||
///
|
||||
/// `limit == 0` means no cap (full backfill). `skip_chapters == true` is
|
||||
/// the "metadata-only" mode (parser doesn't extract chapters, and
|
||||
/// `sync_manga_chapters` is skipped — otherwise an empty chapter list
|
||||
/// would soft-drop existing rows).
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
pub async fn run_metadata_pass(
|
||||
browser_manager: &BrowserManager,
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
start_url: &str,
|
||||
limit: usize,
|
||||
skip_chapters: bool,
|
||||
) -> anyhow::Result<MetadataStats> {
|
||||
let lease = browser_manager
|
||||
.acquire()
|
||||
.await
|
||||
.context("acquire browser lease for metadata pass")?;
|
||||
let browser_ref: &chromiumoxide::Browser = &lease;
|
||||
|
||||
let source = {
|
||||
let s = TargetSource::new(start_url.to_string());
|
||||
if skip_chapters {
|
||||
s.without_chapter_parsing()
|
||||
} else {
|
||||
s
|
||||
}
|
||||
};
|
||||
let ctx = FetchContext {
|
||||
browser: browser_ref,
|
||||
rate,
|
||||
};
|
||||
|
||||
let source_id = source.id();
|
||||
repo::crawler::ensure_source(
|
||||
db,
|
||||
source_id,
|
||||
"Target Site",
|
||||
&origin_of(start_url).unwrap_or_else(|| start_url.to_string()),
|
||||
)
|
||||
.await
|
||||
.context("ensure_source")?;
|
||||
|
||||
let run_started_at = chrono::Utc::now();
|
||||
let max_refs = (limit > 0).then_some(limit);
|
||||
|
||||
tracing::info!(?max_refs, "discovering manga list");
|
||||
let refs = source
|
||||
.discover(&ctx, DiscoverMode::Backfill, max_refs)
|
||||
.await
|
||||
.context("discover failed")?;
|
||||
tracing::info!(count = refs.len(), "discovered manga list");
|
||||
|
||||
let mut stats = MetadataStats {
|
||||
discovered: refs.len(),
|
||||
..MetadataStats::default()
|
||||
};
|
||||
|
||||
for (i, r) in refs.iter().enumerate() {
|
||||
tracing::info!(
|
||||
idx = i + 1,
|
||||
total = stats.discovered,
|
||||
key = %r.source_manga_key,
|
||||
"fetching metadata"
|
||||
);
|
||||
let manga = match source.fetch_manga(&ctx, r).await {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
key = %r.source_manga_key,
|
||||
url = %r.url,
|
||||
error = ?e,
|
||||
"fetch_manga failed"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let upsert = match repo::crawler::upsert_manga_from_source(db, source_id, &r.url, &manga)
|
||||
.await
|
||||
{
|
||||
Ok(u) => u,
|
||||
Err(e) => {
|
||||
tracing::error!(
|
||||
key = %r.source_manga_key,
|
||||
error = ?e,
|
||||
"upsert_manga_from_source failed"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
continue;
|
||||
}
|
||||
};
|
||||
stats.upserted += 1;
|
||||
tracing::info!(
|
||||
key = %manga.source_manga_key,
|
||||
manga_id = %upsert.manga_id,
|
||||
status = ?upsert.status,
|
||||
title = %manga.title,
|
||||
"manga upserted"
|
||||
);
|
||||
|
||||
// Cover image: download when missing in storage or when metadata
|
||||
// signaled an update (cover URL is part of metadata_hash, so
|
||||
// Updated implies the URL may have moved). Failures are non-fatal.
|
||||
let needs_cover = upsert.cover_image_path.is_none()
|
||||
|| matches!(upsert.status, repo::crawler::UpsertStatus::Updated);
|
||||
if needs_cover {
|
||||
if let Some(cover_url) = manga.cover_url.as_deref() {
|
||||
match download_and_store_cover(
|
||||
db,
|
||||
storage,
|
||||
http,
|
||||
rate,
|
||||
&r.url,
|
||||
upsert.manga_id,
|
||||
cover_url,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(()) => stats.covers_fetched += 1,
|
||||
Err(e) => tracing::warn!(
|
||||
manga_id = %upsert.manga_id,
|
||||
error = ?e,
|
||||
"cover download failed"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !skip_chapters {
|
||||
match repo::crawler::sync_manga_chapters(
|
||||
db,
|
||||
source_id,
|
||||
upsert.manga_id,
|
||||
&manga.chapters,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(diff) => tracing::info!(
|
||||
manga_id = %upsert.manga_id,
|
||||
new = diff.new,
|
||||
refreshed = diff.refreshed,
|
||||
dropped = diff.dropped,
|
||||
"chapters synced"
|
||||
),
|
||||
Err(e) => tracing::warn!(
|
||||
manga_id = %upsert.manga_id,
|
||||
error = ?e,
|
||||
"chapter sync failed"
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if limit == 0 {
|
||||
match repo::crawler::mark_dropped_mangas(db, source_id, run_started_at).await {
|
||||
Ok(n) => tracing::info!(dropped = n, "marked unseen manga as dropped"),
|
||||
Err(e) => tracing::warn!(error = ?e, "drop-pass failed"),
|
||||
}
|
||||
} else {
|
||||
tracing::info!(limit, "partial sync — skipping drop pass");
|
||||
}
|
||||
|
||||
drop(lease);
|
||||
Ok(stats)
|
||||
}
|
||||
|
||||
/// Enqueue a `SyncChapterContent` job for every chapter of *any* bookmarked
|
||||
/// manga that still has `page_count = 0` and a non-dropped source row.
|
||||
/// Returns `(inserted, skipped)` counts. Dedup index handles repeats.
|
||||
pub async fn enqueue_bookmarked_pending(pool: &PgPool) -> anyhow::Result<EnqueueSummary> {
|
||||
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||
FROM chapters c
|
||||
JOIN bookmarks b ON b.manga_id = c.manga_id
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||
WHERE c.page_count = 0
|
||||
AND cs.dropped_at IS NULL
|
||||
GROUP BY cs.source_id, c.id, cs.source_chapter_key, c.manga_id, c.created_at
|
||||
ORDER BY c.manga_id, c.created_at ASC
|
||||
"#,
|
||||
)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.context("query bookmarked-pending chapters")?;
|
||||
|
||||
let mut summary = EnqueueSummary::default();
|
||||
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||
let payload = JobPayload::SyncChapterContent {
|
||||
source_id,
|
||||
chapter_id,
|
||||
source_chapter_key,
|
||||
};
|
||||
match jobs::enqueue(pool, &payload).await {
|
||||
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
%chapter_id,
|
||||
error = ?e,
|
||||
"enqueue chapter content failed"
|
||||
);
|
||||
summary.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
/// Enqueue chapter-content jobs for a *single* manga (the bookmark-create
|
||||
/// hook). Same dedup semantics as [`enqueue_bookmarked_pending`].
|
||||
pub async fn enqueue_pending_for_manga(
|
||||
pool: &PgPool,
|
||||
manga_id: Uuid,
|
||||
) -> anyhow::Result<EnqueueSummary> {
|
||||
let rows: Vec<(String, Uuid, String)> = sqlx::query_as(
|
||||
r#"
|
||||
SELECT DISTINCT cs.source_id, c.id AS chapter_id, cs.source_chapter_key
|
||||
FROM chapters c
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
||||
WHERE c.manga_id = $1
|
||||
AND c.page_count = 0
|
||||
AND cs.dropped_at IS NULL
|
||||
ORDER BY cs.source_id, c.id
|
||||
"#,
|
||||
)
|
||||
.bind(manga_id)
|
||||
.fetch_all(pool)
|
||||
.await
|
||||
.context("query pending chapters for manga")?;
|
||||
|
||||
let mut summary = EnqueueSummary::default();
|
||||
for (source_id, chapter_id, source_chapter_key) in rows {
|
||||
let payload = JobPayload::SyncChapterContent {
|
||||
source_id,
|
||||
chapter_id,
|
||||
source_chapter_key,
|
||||
};
|
||||
match jobs::enqueue(pool, &payload).await {
|
||||
Ok(EnqueueResult::Inserted(_)) => summary.inserted += 1,
|
||||
Ok(EnqueueResult::Skipped) => summary.skipped += 1,
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
%chapter_id,
|
||||
error = ?e,
|
||||
"enqueue chapter content failed"
|
||||
);
|
||||
summary.failed += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(summary)
|
||||
}
|
||||
|
||||
#[derive(Debug, Default, Clone, Copy)]
|
||||
pub struct EnqueueSummary {
|
||||
pub inserted: usize,
|
||||
pub skipped: usize,
|
||||
pub failed: usize,
|
||||
}
|
||||
|
||||
/// Download a cover image and persist its storage path. Local to the
|
||||
/// pipeline because the CLI still calls it from its inline chapter-content
|
||||
/// loop; once the worker pool fully replaces that path we can fold this
|
||||
/// into `pipeline` proper.
|
||||
async fn download_and_store_cover(
|
||||
db: &PgPool,
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
manga_url: &str,
|
||||
manga_id: Uuid,
|
||||
cover_url: &str,
|
||||
) -> anyhow::Result<()> {
|
||||
let absolute = reqwest::Url::parse(manga_url)
|
||||
.context("parse manga URL")?
|
||||
.join(cover_url)
|
||||
.context("join cover URL onto manga URL")?;
|
||||
|
||||
rate.wait_for(absolute.as_str()).await?;
|
||||
let resp = http
|
||||
.get(absolute.clone())
|
||||
.header(reqwest::header::REFERER, manga_url)
|
||||
.send()
|
||||
.await
|
||||
.with_context(|| format!("GET {absolute}"))?
|
||||
.error_for_status()
|
||||
.with_context(|| format!("non-2xx for {absolute}"))?;
|
||||
let bytes = resp.bytes().await.context("read cover body")?;
|
||||
let kind = infer::get(&bytes);
|
||||
let ext = kind.map(|k| k.extension()).unwrap_or("bin");
|
||||
let key = format!("mangas/{manga_id}/cover.{ext}");
|
||||
|
||||
storage
|
||||
.put(&key, &bytes)
|
||||
.await
|
||||
.with_context(|| format!("store cover at {key}"))?;
|
||||
repo::manga::set_cover_image_path(db, manga_id, &key)
|
||||
.await
|
||||
.with_context(|| format!("update cover_image_path for {manga_id}"))?;
|
||||
tracing::info!(
|
||||
manga_id = %manga_id,
|
||||
key = %key,
|
||||
bytes = bytes.len(),
|
||||
%absolute,
|
||||
"cover stored"
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn origin_of(url: &str) -> Option<String> {
|
||||
let (scheme, rest) = url.split_once("://")?;
|
||||
let host = rest.split('/').next()?;
|
||||
Some(format!("{scheme}://{host}"))
|
||||
}
|
||||
Reference in New Issue
Block a user