Files
Mangalord/backend/src/bin/crawler.rs
MechaCat02 9b4f3525f6 feat: incremental crawl mode with seed-completion gate (0.33.0)
Daemon now auto-detects mode per source: Backfill until the first
full walk records `seed_completed:<source>` in `crawler_state`, then
Incremental (newest-first, stops after N consecutive Unchanged
upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects
`auto` since it has no pre-run DB state.

`Source::discover` returns a lazy `DiscoverWalk` so Incremental can
break out mid-walk without prefetching pages. The drop pass and seed
marker are now gated on a true full walk — fixes a latent soft-drop
of the index tail under partial sweeps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 06:41:16 +02:00

499 lines
17 KiB
Rust

//! Crawler binary.
//!
//! Now an ops escape hatch sitting alongside the in-process daemon: walks
//! the source's manga listing (all pages), fetches each manga's metadata +
//! chapter list, downloads covers, reconciles chapters — and then, for any
//! chapter belonging to a bookmarked manga whose `page_count` is still 0,
//! fetches the chapter pages inline. The daemon does the same work through
//! `crawler_jobs`; the CLI is kept around for force-refetches and manual
//! backfills.
//!
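//! Configuration mirrors the daemon's `CRAWLER_*` env vars (see
//! `crate::config::CrawlerConfig`) plus the CLI-only:
//! - **Start URL**: first CLI positional arg, else `$CRAWLER_START_URL`.
//! - **Skip chapters / chapter content / force re-fetch / keep browser**:
//!   `CRAWLER_SKIP_CHAPTERS`, `CRAWLER_SKIP_CHAPTER_CONTENT`,
//!   `CRAWLER_FORCE_REFETCH_CHAPTERS`, `CRAWLER_KEEP_BROWSER_OPEN`.
//! - **Limit**: `CRAWLER_LIMIT` (max manga detail fetches per run).
//!
//! `CRAWLER_MODE` accepts `backfill` (the default here) or `incremental`;
//! `auto` is rejected because auto-detection lives in the daemon. The
//! chapter content phase is also skipped when no `CRAWLER_PHPSESSID` or
//! cookie domain is available.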
//!
//! See `crawler::pipeline::run_metadata_pass` for the shared metadata
//! flow.
use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;
use anyhow::{anyhow, Context};
use futures_util::stream::{self, StreamExt};
use mangalord::crawler::browser::{BrowserMode, LaunchOptions};
use mangalord::crawler::browser_manager::{self, BrowserManager};
use mangalord::crawler::content::{self, SyncOutcome};
use mangalord::crawler::pipeline;
use mangalord::crawler::rate_limit::HostRateLimiters;
use mangalord::crawler::session;
use mangalord::crawler::source::DiscoverMode;
use mangalord::storage::{LocalStorage, Storage};
use sqlx::postgres::PgPoolOptions;
use sqlx::PgPool;
use tracing_subscriber::EnvFilter;
use uuid::Uuid;
/// CLI entry point.
///
/// Reads configuration from the environment, connects to Postgres and runs
/// the migrations, builds the storage backend, the HTTP client and the
/// browser manager, then hands everything to [`run`]. The browser manager
/// is shut down before the outcome of [`run`] is returned.
#[tokio::main]
async fn main() -> anyhow::Result<()> {
// Load `.env`; the result is discarded, so a failure here is not fatal.
dotenvy::dotenv().ok();
// Use the env-provided log filter when it parses, else the literal below.
tracing_subscriber::fmt()
.with_env_filter(
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off"
.into()
}),
)
.init();
let start_url = resolve_start_url()?;
let database_url = std::env::var("DATABASE_URL")
.map_err(|_| anyhow!("DATABASE_URL must be set"))?;
let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
.unwrap_or_else(|_| "./data/storage".to_string())
.into();
let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
// Optional string settings below treat a blank value the same as unset.
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
.ok()
.filter(|s| !s.trim().is_empty());
// The CDN rate defaults to the general rate when not set separately.
let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms);
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
// Clamped to at least 1.
let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
let mode = parse_crawler_mode(incremental_stop_after)?;
let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false);
// Clamped to at least 1 worker.
let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize;
let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false);
let phpsessid = std::env::var("CRAWLER_PHPSESSID")
.ok()
.filter(|s| !s.trim().is_empty());
// Explicit cookie domain wins; otherwise one is derived from the start URL.
let cookie_domain = std::env::var("CRAWLER_COOKIE_DOMAIN")
.ok()
.filter(|s| !s.trim().is_empty())
.or_else(|| session::registrable_domain(&start_url));
let user_agent = std::env::var("CRAWLER_USER_AGENT")
.ok()
.filter(|s| !s.trim().is_empty());
let proxy_url = std::env::var("CRAWLER_PROXY")
.ok()
.filter(|s| !s.trim().is_empty());
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
let db = PgPoolOptions::new()
.max_connections(5)
.connect(&database_url)
.await
.context("connect to database")?;
sqlx::migrate!("./migrations").run(&db).await?;
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
// Seed the session cookie into the reqwest jar only when both the session
// id and a cookie domain are known.
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) {
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
let seed_url =
reqwest::Url::parse(&start_url).context("parse start URL for cookie seed")?;
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
tracing::info!(domain, "seeded PHPSESSID into reqwest cookie jar");
}
// The builder starts with `no_proxy()`; CRAWLER_PROXY, when set, is then
// added explicitly further down.
let mut http_builder = reqwest::Client::builder()
.timeout(Duration::from_secs(30))
.no_proxy()
.cookie_provider(cookie_jar);
if let Some(ua) = &user_agent {
http_builder = http_builder.user_agent(ua);
}
if let Some(proxy) = &proxy_url {
http_builder = http_builder
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
}
let http = http_builder.build().context("build http client")?;
// The same proxy is passed to the browser as a command-line argument.
let mut options = LaunchOptions::from_env();
if let Some(proxy) = &proxy_url {
options.extra_args.push(format!("--proxy-server={proxy}"));
}
// Keeping the browser open is honoured only in headed mode; in headless
// mode the request is ignored with a warning.
let keep_open = match (keep_browser_open, options.mode) {
(true, BrowserMode::Headed) => true,
(true, BrowserMode::Headless) => {
tracing::warn!(
"CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)"
);
false
}
_ => false,
};
// The session id itself is never logged, only whether it is set.
// NOTE(review): the proxy URL is logged verbatim here and in the parse
// error context above — confirm it can never carry credentials.
tracing::info!(
?options,
%start_url,
rate_ms,
cdn_host = ?cdn_host,
cdn_rate_ms,
limit,
skip_chapters,
skip_chapter_content,
chapter_workers,
force_refetch_chapters,
phpsessid_set = phpsessid.is_some(),
cookie_domain = ?cookie_domain,
user_agent = ?user_agent,
proxy = ?proxy_url,
keep_open,
?mode,
storage_dir = %storage_dir.display(),
"starting crawler"
);
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
// alive for the entire run — same lifecycle as the old direct
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
// session probe; bad cookies fail fast before any real work happens.
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
(Some(sid), Some(domain)) => {
// Owned copies are captured by the hook and cloned again on each
// invocation so every returned future owns its own data.
let sid = sid.clone();
let domain = domain.clone();
let start_url_clone = start_url.clone();
Arc::new(move |browser| {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url_clone.clone();
Box::pin(async move {
session::inject_phpsessid(&browser, &sid, &domain)
.await
.context("inject_phpsessid")?;
session::verify_session(&browser, &start_url)
.await
.context("verify_session")?;
Ok(())
})
})
}
_ => browser_manager::noop_on_launch(),
};
// Without both a session id and a cookie domain the chapter content phase
// is skipped (see the `||` passed to `run` below).
// NOTE(review): this skip is not logged — confirm that is intended.
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
let result = run(
Arc::clone(&manager),
&db,
Arc::clone(&storage),
&http,
&start_url,
rate_ms,
cdn_host.as_deref(),
cdn_rate_ms,
limit,
skip_chapters,
skip_chapter_content || !session_ready,
chapter_workers,
force_refetch_chapters,
mode,
)
.await;
// Whether the run succeeded or failed, optionally wait for Ctrl+C so the
// browser window can be inspected before shutdown.
if keep_open {
tracing::info!(
"crawler finished; browser kept open. Press Ctrl+C to close and exit."
);
let _ = tokio::signal::ctrl_c().await;
tracing::info!("Ctrl+C received; closing browser");
}
// Shut the browser manager down before surfacing the run's result.
manager.shutdown().await;
result
}
/// Drive one CLI crawl: the shared metadata pass first, then — unless
/// `skip_chapter_content` is set — the bookmarked-chapter content phase.
///
/// Both phases share one set of per-host rate limiters built from
/// `rate_ms`; when `cdn_host` is given, that host is overridden with
/// `cdn_rate_ms`.
///
/// # Errors
/// Propagates a failure from either phase.
#[allow(clippy::too_many_arguments)]
async fn run(
    manager: Arc<BrowserManager>,
    db: &PgPool,
    storage: Arc<dyn Storage>,
    http: &reqwest::Client,
    start_url: &str,
    rate_ms: u64,
    cdn_host: Option<&str>,
    cdn_rate_ms: u64,
    limit: usize,
    skip_chapters: bool,
    skip_chapter_content: bool,
    chapter_workers: usize,
    force_refetch_chapters: bool,
    mode: DiscoverMode,
) -> anyhow::Result<()> {
    let base_limiters = HostRateLimiters::new(Duration::from_millis(rate_ms));
    let limiters = Arc::new(match cdn_host {
        Some(host) => base_limiters.with_override(host, Duration::from_millis(cdn_rate_ms)),
        None => base_limiters,
    });

    let stats = pipeline::run_metadata_pass(
        manager.as_ref(),
        db,
        storage.as_ref(),
        http,
        limiters.as_ref(),
        start_url,
        limit,
        skip_chapters,
        mode,
    )
    .await?;
    tracing::info!(?stats, "metadata pass complete");

    if skip_chapter_content {
        return Ok(());
    }

    sync_bookmarked_chapter_content(
        Arc::clone(&manager),
        db,
        Arc::clone(&storage),
        http,
        Arc::clone(&limiters),
        "target",
        chapter_workers,
        force_refetch_chapters,
    )
    .await?;
    Ok(())
}
/// Fetch page content for chapters of bookmarked manga.
///
/// Selects every chapter that belongs to a manga with at least one
/// bookmark and has a non-dropped `chapter_sources` row for `source_id`,
/// restricted to chapters whose `page_count` is 0 unless `force_refetch`
/// is set. The rows are processed with up to `workers` (minimum 1)
/// concurrent tasks; each task takes a browser lease from `manager`, calls
/// `content::sync_chapter_content`, and releases the lease before the
/// outcome is tallied.
///
/// Per-chapter failures (browser acquire or sync errors) are logged and
/// counted but do not stop the phase. A `SessionExpired` outcome raises a
/// shared flag: tasks that start afterwards return without doing any work,
/// and the function returns an error once the stream has drained.
///
/// # Errors
/// Returns an error when the pending-chapter query fails or when the
/// session expired during the phase.
#[allow(clippy::too_many_arguments)]
async fn sync_bookmarked_chapter_content(
manager: Arc<BrowserManager>,
db: &PgPool,
storage: Arc<dyn Storage>,
http: &reqwest::Client,
rate: Arc<HostRateLimiters>,
source_id: &str,
workers: usize,
force_refetch: bool,
) -> anyhow::Result<()> {
// DISTINCT collapses the duplicates produced by joining one row per
// bookmark; `$2` (force_refetch) disables the `page_count = 0` filter.
let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
r#"
SELECT id, manga_id, source_url FROM (
SELECT DISTINCT c.id, c.manga_id, c.created_at, cs.source_url
FROM chapters c
JOIN bookmarks b ON b.manga_id = c.manga_id
JOIN chapter_sources cs ON cs.chapter_id = c.id
WHERE cs.source_id = $1
AND cs.dropped_at IS NULL
AND (c.page_count = 0 OR $2)
) sub
ORDER BY manga_id, created_at ASC
"#,
)
.bind(source_id)
.bind(force_refetch)
.fetch_all(db)
.await
.context("query pending chapter content")?;
if pending.is_empty() {
tracing::info!("chapter content: nothing pending");
return Ok(());
}
tracing::info!(count = pending.len(), workers, "chapter content phase starting");
// Shared across tasks: an abort flag for session expiry and the tallies.
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
let stats = std::sync::Mutex::new(WorkerStats::default());
stream::iter(pending.into_iter())
.for_each_concurrent(workers.max(1), |(chapter_id, manga_id, source_url)| {
// Per-task handles; `stats` is borrowed because it outlives the stream.
let session_expired = Arc::clone(&session_expired);
let storage = Arc::clone(&storage);
let rate = Arc::clone(&rate);
let manager = Arc::clone(&manager);
let stats = &stats;
async move {
// Once the session is known to be expired, remaining chapters are
// skipped without being counted in any tally.
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
return;
}
let lease = match manager.acquire().await {
Ok(l) => l,
Err(e) => {
tracing::error!(%chapter_id, error = ?e, "browser acquire failed");
let mut s = stats.lock().unwrap();
s.failed += 1;
return;
}
};
let outcome = content::sync_chapter_content(
&lease,
db,
storage.as_ref(),
http,
rate.as_ref(),
chapter_id,
manga_id,
&source_url,
force_refetch,
)
.await;
// Release the browser lease before taking the stats lock.
drop(lease);
// The guard is taken after the last await in this task, so it is never
// held across an await point.
let mut s = stats.lock().unwrap();
match outcome {
Ok(SyncOutcome::Fetched { pages }) => {
tracing::info!(%chapter_id, pages, "chapter content fetched");
s.fetched += 1;
}
Ok(SyncOutcome::Skipped) => s.skipped += 1,
Ok(SyncOutcome::SessionExpired) => {
// Not counted as fetched, skipped or failed; it only raises the flag.
tracing::error!(
%chapter_id,
"session expired mid-run — refresh CRAWLER_PHPSESSID and re-run"
);
session_expired
.store(true, std::sync::atomic::Ordering::Relaxed);
}
Err(e) => {
tracing::warn!(
%chapter_id, error = ?e, "chapter content sync failed"
);
s.failed += 1;
}
}
}
})
.await;
// All tasks have finished; take the tallies out of the mutex for the summary.
let total = stats.into_inner().unwrap();
tracing::info!(
fetched = total.fetched,
skipped = total.skipped,
failed = total.failed,
"chapter content phase done"
);
// Report session expiry only after the summary has been logged.
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
anyhow::bail!("session expired during chapter content phase");
}
Ok(())
}
/// Per-outcome tallies for the chapter content phase.
///
/// Derives `Debug` so the whole struct can be logged or inspected in one
/// go instead of field by field.
#[derive(Debug, Default, Clone, Copy)]
struct WorkerStats {
    /// Chapters whose content sync reported `Fetched`.
    fetched: usize,
    /// Chapters whose content sync reported `Skipped`.
    skipped: usize,
    /// Chapters that failed, either on browser acquire or during the sync.
    failed: usize,
}
/// Resolve the crawl start URL.
///
/// The first CLI positional argument wins; otherwise `$CRAWLER_START_URL`
/// is used. A blank (empty or whitespace-only) value from either place is
/// treated as unset — the same rule the other optional `CRAWLER_*` strings
/// follow — and the chosen value is returned trimmed.
///
/// # Errors
/// Returns an error when neither source yields a non-blank value.
fn resolve_start_url() -> anyhow::Result<String> {
    /// Trim `value`, mapping a blank result to `None`.
    fn non_blank(value: String) -> Option<String> {
        let trimmed = value.trim();
        if trimmed.is_empty() {
            None
        } else {
            Some(trimmed.to_owned())
        }
    }

    std::env::args()
        .nth(1)
        .and_then(non_blank)
        // A blank positional arg falls through to the env var instead of
        // being handed on as an unusable URL.
        .or_else(|| std::env::var("CRAWLER_START_URL").ok().and_then(non_blank))
        .ok_or_else(|| {
            anyhow!(
                "start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
            )
        })
}
/// Read `CRAWLER_MODE` for the CLI and turn it into a [`DiscoverMode`].
///
/// The binary is operator-driven (manual reseeds, force-refetches), so an
/// unset variable means `backfill`; auto-detection lives in the daemon.
/// `auto` is rejected because the CLI has no DB state to consult before
/// the run. The parsing rules themselves are in
/// [`parse_crawler_mode_str`].
fn parse_crawler_mode(incremental_stop_after: usize) -> anyhow::Result<DiscoverMode> {
    let configured = std::env::var("CRAWLER_MODE").ok();
    parse_crawler_mode_str(configured.as_deref(), incremental_stop_after)
}
/// Env-free core of [`parse_crawler_mode`], so it can be tested without
/// mutating process environment variables.
///
/// `raw` is trimmed and compared case-insensitively. A missing or blank
/// value selects `Backfill`; `incremental` selects `Incremental` with
/// `incremental_stop_after` as its stop threshold.
///
/// # Errors
/// Returns an error for `auto` (daemon-only) and for any other
/// unrecognized value.
fn parse_crawler_mode_str(
    raw: Option<&str>,
    incremental_stop_after: usize,
) -> anyhow::Result<DiscoverMode> {
    // A missing variable normalizes to "", the same as a blank one.
    let normalized = raw
        .map(|value| value.trim().to_ascii_lowercase())
        .unwrap_or_default();
    match normalized.as_str() {
        "" | "backfill" => Ok(DiscoverMode::Backfill),
        "incremental" => Ok(DiscoverMode::Incremental {
            stop_after_unchanged: incremental_stop_after,
        }),
        "auto" => Err(anyhow!(
            "CRAWLER_MODE=auto isn't supported by the CLI (use backfill or incremental); \
             the daemon does auto-detection"
        )),
        other => Err(anyhow!(
            "CRAWLER_MODE must be one of: backfill, incremental (got {other:?})"
        )),
    }
}
/// Read environment variable `name` as a `u64`, falling back to `default`.
///
/// The fallback applies when the variable is unset, not valid Unicode, or
/// does not parse as an unsigned integer. Surrounding whitespace is
/// ignored, so a padded value such as `" 500 "` is honoured instead of
/// silently becoming the default.
fn env_u64(name: &str, default: u64) -> u64 {
    parse_u64_or(std::env::var(name).ok().as_deref(), default)
}

/// Pure variant of [`env_u64`] — testable without env-var mutation.
fn parse_u64_or(raw: Option<&str>, default: u64) -> u64 {
    raw.and_then(|value| value.trim().parse().ok())
        .unwrap_or(default)
}
/// Read environment variable `name` as a boolean, falling back to
/// `default`.
///
/// Accepts `1` / `true` / `yes` and `0` / `false` / `no`, ignoring case
/// and surrounding whitespace — the same normalization
/// `parse_crawler_mode_str` applies to `CRAWLER_MODE`. Anything else,
/// including an unset variable, yields `default`.
fn env_bool(name: &str, default: bool) -> bool {
    parse_bool_or(std::env::var(name).ok().as_deref(), default)
}

/// Pure variant of [`env_bool`] — testable without env-var mutation.
fn parse_bool_or(raw: Option<&str>, default: bool) -> bool {
    match raw.map(|value| value.trim().to_ascii_lowercase()).as_deref() {
        Some("1") | Some("true") | Some("yes") => true,
        Some("0") | Some("false") | Some("no") => false,
        _ => default,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Unset and empty values both select the CLI default (backfill).
    #[test]
    fn cli_mode_defaults_to_backfill_when_unset_or_blank() {
        for raw in [None, Some("")] {
            let mode = parse_crawler_mode_str(raw, 20).unwrap();
            assert!(matches!(mode, DiscoverMode::Backfill));
        }
    }

    /// Both explicit mode names parse, and the stop threshold is carried
    /// into the incremental variant.
    #[test]
    fn cli_mode_recognizes_backfill_and_incremental() {
        assert!(matches!(
            parse_crawler_mode_str(Some("backfill"), 20).unwrap(),
            DiscoverMode::Backfill
        ));
        assert!(matches!(
            parse_crawler_mode_str(Some("incremental"), 9).unwrap(),
            DiscoverMode::Incremental { stop_after_unchanged: 9 }
        ));
    }

    /// `auto` is refused with a message that mentions the daemon.
    #[test]
    fn cli_mode_rejects_auto_explicitly() {
        let msg = parse_crawler_mode_str(Some("auto"), 20)
            .unwrap_err()
            .to_string();
        assert!(
            msg.contains("daemon"),
            "rejection should point operator at the daemon: {msg}"
        );
    }

    /// Unknown values are refused and the error lists the valid choices.
    #[test]
    fn cli_mode_rejects_unknown_value() {
        let msg = parse_crawler_mode_str(Some("garbage"), 20)
            .unwrap_err()
            .to_string();
        for expected in ["backfill", "incremental"] {
            assert!(msg.contains(expected));
        }
    }

    /// Case and surrounding whitespace do not matter.
    #[test]
    fn cli_mode_is_case_insensitive_and_trims() {
        let parsed = parse_crawler_mode_str(Some(" Incremental "), 4).unwrap();
        assert!(matches!(
            parsed,
            DiscoverMode::Incremental { stop_after_unchanged: 4 }
        ));
    }
}