Daemon now auto-detects mode per source: Backfill until the first full walk records `seed_completed:<source>` in `crawler_state`, then Incremental (newest-first, stops after N consecutive Unchanged upserts). `CRAWLER_MODE` overrides to a fixed mode; CLI rejects `auto` since it has no pre-run DB state. `Source::discover` returns a lazy `DiscoverWalk` so Incremental can break out mid-walk without prefetching pages. The drop pass and seed marker are now gated on a true full walk — fixes a latent soft-drop of the index tail under partial sweeps. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
499 lines
17 KiB
Rust
499 lines
17 KiB
Rust
//! Crawler binary.
|
|
//!
|
|
//! Now an ops escape hatch sitting alongside the in-process daemon: walks
|
|
//! the source's manga listing (all pages), fetches each manga's metadata +
|
|
//! chapter list, downloads covers, reconciles chapters — and then, for any
|
|
//! chapter belonging to a bookmarked manga whose `page_count` is still 0,
|
|
//! fetches the chapter pages inline. The daemon does the same work through
|
|
//! `crawler_jobs`; the CLI is kept around for force-refetches and manual
|
|
//! backfills.
|
|
//!
|
|
//! Configuration mirrors the daemon's `CRAWLER_*` env vars (see
|
|
//! `crate::config::CrawlerConfig`) plus the CLI-only:
|
|
//! - **Start URL**: first CLI positional arg, else `$CRAWLER_START_URL`.
|
|
//! - **Skip chapters / chapter content / force re-fetch / keep browser**:
|
|
//! `CRAWLER_SKIP_CHAPTERS`, `CRAWLER_SKIP_CHAPTER_CONTENT`,
|
|
//! `CRAWLER_FORCE_REFETCH_CHAPTERS`, `CRAWLER_KEEP_BROWSER_OPEN`.
|
|
//! - **Limit**: `CRAWLER_LIMIT` (max manga detail fetches per run).
|
|
//!
|
|
//! See `crawler::pipeline::run_metadata_pass` for the shared metadata
|
|
//! flow.
|
|
|
|
use std::path::PathBuf;
|
|
use std::sync::Arc;
|
|
use std::time::Duration;
|
|
|
|
use anyhow::{anyhow, Context};
|
|
use futures_util::stream::{self, StreamExt};
|
|
use mangalord::crawler::browser::{BrowserMode, LaunchOptions};
|
|
use mangalord::crawler::browser_manager::{self, BrowserManager};
|
|
use mangalord::crawler::content::{self, SyncOutcome};
|
|
use mangalord::crawler::pipeline;
|
|
use mangalord::crawler::rate_limit::HostRateLimiters;
|
|
use mangalord::crawler::session;
|
|
use mangalord::crawler::source::DiscoverMode;
|
|
use mangalord::storage::{LocalStorage, Storage};
|
|
use sqlx::postgres::PgPoolOptions;
|
|
use sqlx::PgPool;
|
|
use tracing_subscriber::EnvFilter;
|
|
use uuid::Uuid;
|
|
|
|
#[tokio::main]
|
|
async fn main() -> anyhow::Result<()> {
|
|
dotenvy::dotenv().ok();
|
|
tracing_subscriber::fmt()
|
|
.with_env_filter(
|
|
EnvFilter::try_from_default_env().unwrap_or_else(|_| {
|
|
"info,mangalord=debug,chromiumoxide::conn=off,chromiumoxide::handler=off"
|
|
.into()
|
|
}),
|
|
)
|
|
.init();
|
|
|
|
let start_url = resolve_start_url()?;
|
|
let database_url = std::env::var("DATABASE_URL")
|
|
.map_err(|_| anyhow!("DATABASE_URL must be set"))?;
|
|
let storage_dir: PathBuf = std::env::var("STORAGE_DIR")
|
|
.unwrap_or_else(|_| "./data/storage".to_string())
|
|
.into();
|
|
let rate_ms = env_u64("CRAWLER_RATE_MS", 1000);
|
|
let cdn_host = std::env::var("CRAWLER_CDN_HOST")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty());
|
|
let cdn_rate_ms = env_u64("CRAWLER_CDN_RATE_MS", rate_ms);
|
|
let limit = env_u64("CRAWLER_LIMIT", 0) as usize;
|
|
let skip_chapters = env_bool("CRAWLER_SKIP_CHAPTERS", false);
|
|
let incremental_stop_after = env_u64("CRAWLER_INCREMENTAL_STOP_AFTER", 20).max(1) as usize;
|
|
let mode = parse_crawler_mode(incremental_stop_after)?;
|
|
let skip_chapter_content = env_bool("CRAWLER_SKIP_CHAPTER_CONTENT", false);
|
|
let chapter_workers = env_u64("CRAWLER_CHAPTER_WORKERS", 1).max(1) as usize;
|
|
let force_refetch_chapters = env_bool("CRAWLER_FORCE_REFETCH_CHAPTERS", false);
|
|
let phpsessid = std::env::var("CRAWLER_PHPSESSID")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty());
|
|
let cookie_domain = std::env::var("CRAWLER_COOKIE_DOMAIN")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty())
|
|
.or_else(|| session::registrable_domain(&start_url));
|
|
let user_agent = std::env::var("CRAWLER_USER_AGENT")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty());
|
|
let proxy_url = std::env::var("CRAWLER_PROXY")
|
|
.ok()
|
|
.filter(|s| !s.trim().is_empty());
|
|
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
|
|
|
|
let db = PgPoolOptions::new()
|
|
.max_connections(5)
|
|
.connect(&database_url)
|
|
.await
|
|
.context("connect to database")?;
|
|
sqlx::migrate!("./migrations").run(&db).await?;
|
|
|
|
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(&storage_dir));
|
|
|
|
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
|
if let (Some(sid), Some(domain)) = (&phpsessid, &cookie_domain) {
|
|
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
|
let seed_url =
|
|
reqwest::Url::parse(&start_url).context("parse start URL for cookie seed")?;
|
|
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
|
tracing::info!(domain, "seeded PHPSESSID into reqwest cookie jar");
|
|
}
|
|
let mut http_builder = reqwest::Client::builder()
|
|
.timeout(Duration::from_secs(30))
|
|
.no_proxy()
|
|
.cookie_provider(cookie_jar);
|
|
if let Some(ua) = &user_agent {
|
|
http_builder = http_builder.user_agent(ua);
|
|
}
|
|
if let Some(proxy) = &proxy_url {
|
|
http_builder = http_builder
|
|
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy URL: {proxy}"))?);
|
|
}
|
|
let http = http_builder.build().context("build http client")?;
|
|
|
|
let mut options = LaunchOptions::from_env();
|
|
if let Some(proxy) = &proxy_url {
|
|
options.extra_args.push(format!("--proxy-server={proxy}"));
|
|
}
|
|
let keep_open = match (keep_browser_open, options.mode) {
|
|
(true, BrowserMode::Headed) => true,
|
|
(true, BrowserMode::Headless) => {
|
|
tracing::warn!(
|
|
"CRAWLER_KEEP_BROWSER_OPEN ignored in headless mode (no window to inspect)"
|
|
);
|
|
false
|
|
}
|
|
_ => false,
|
|
};
|
|
tracing::info!(
|
|
?options,
|
|
%start_url,
|
|
rate_ms,
|
|
cdn_host = ?cdn_host,
|
|
cdn_rate_ms,
|
|
limit,
|
|
skip_chapters,
|
|
skip_chapter_content,
|
|
chapter_workers,
|
|
force_refetch_chapters,
|
|
phpsessid_set = phpsessid.is_some(),
|
|
cookie_domain = ?cookie_domain,
|
|
user_agent = ?user_agent,
|
|
proxy = ?proxy_url,
|
|
keep_open,
|
|
?mode,
|
|
storage_dir = %storage_dir.display(),
|
|
"starting crawler"
|
|
);
|
|
|
|
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
|
|
// alive for the entire run — same lifecycle as the old direct
|
|
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
|
|
// session probe; bad cookies fail fast before any real work happens.
|
|
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
|
|
(Some(sid), Some(domain)) => {
|
|
let sid = sid.clone();
|
|
let domain = domain.clone();
|
|
let start_url_clone = start_url.clone();
|
|
Arc::new(move |browser| {
|
|
let sid = sid.clone();
|
|
let domain = domain.clone();
|
|
let start_url = start_url_clone.clone();
|
|
Box::pin(async move {
|
|
session::inject_phpsessid(&browser, &sid, &domain)
|
|
.await
|
|
.context("inject_phpsessid")?;
|
|
session::verify_session(&browser, &start_url)
|
|
.await
|
|
.context("verify_session")?;
|
|
Ok(())
|
|
})
|
|
})
|
|
}
|
|
_ => browser_manager::noop_on_launch(),
|
|
};
|
|
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
|
|
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
|
|
|
|
let result = run(
|
|
Arc::clone(&manager),
|
|
&db,
|
|
Arc::clone(&storage),
|
|
&http,
|
|
&start_url,
|
|
rate_ms,
|
|
cdn_host.as_deref(),
|
|
cdn_rate_ms,
|
|
limit,
|
|
skip_chapters,
|
|
skip_chapter_content || !session_ready,
|
|
chapter_workers,
|
|
force_refetch_chapters,
|
|
mode,
|
|
)
|
|
.await;
|
|
|
|
if keep_open {
|
|
tracing::info!(
|
|
"crawler finished; browser kept open. Press Ctrl+C to close and exit."
|
|
);
|
|
let _ = tokio::signal::ctrl_c().await;
|
|
tracing::info!("Ctrl+C received; closing browser");
|
|
}
|
|
manager.shutdown().await;
|
|
result
|
|
}
|
|
|
|
#[allow(clippy::too_many_arguments)]
|
|
async fn run(
|
|
manager: Arc<BrowserManager>,
|
|
db: &PgPool,
|
|
storage: Arc<dyn Storage>,
|
|
http: &reqwest::Client,
|
|
start_url: &str,
|
|
rate_ms: u64,
|
|
cdn_host: Option<&str>,
|
|
cdn_rate_ms: u64,
|
|
limit: usize,
|
|
skip_chapters: bool,
|
|
skip_chapter_content: bool,
|
|
chapter_workers: usize,
|
|
force_refetch_chapters: bool,
|
|
mode: DiscoverMode,
|
|
) -> anyhow::Result<()> {
|
|
let mut rate = HostRateLimiters::new(Duration::from_millis(rate_ms));
|
|
if let Some(host) = cdn_host {
|
|
rate = rate.with_override(host, Duration::from_millis(cdn_rate_ms));
|
|
}
|
|
let rate = Arc::new(rate);
|
|
|
|
let stats = pipeline::run_metadata_pass(
|
|
manager.as_ref(),
|
|
db,
|
|
storage.as_ref(),
|
|
http,
|
|
rate.as_ref(),
|
|
start_url,
|
|
limit,
|
|
skip_chapters,
|
|
mode,
|
|
)
|
|
.await?;
|
|
tracing::info!(?stats, "metadata pass complete");
|
|
|
|
if !skip_chapter_content {
|
|
sync_bookmarked_chapter_content(
|
|
Arc::clone(&manager),
|
|
db,
|
|
Arc::clone(&storage),
|
|
http,
|
|
Arc::clone(&rate),
|
|
"target",
|
|
chapter_workers,
|
|
force_refetch_chapters,
|
|
)
|
|
.await?;
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|
|
/// Find every chapter whose manga is bookmarked by at least one user and
|
|
/// that hasn't been content-synced yet, then fan them out across `workers`
|
|
/// concurrent tasks. Same as before except the browser comes from a
|
|
/// BrowserManager lease so it interleaves cleanly with the metadata pass.
|
|
///
|
|
/// A `SessionExpired` result aborts the phase.
|
|
#[allow(clippy::too_many_arguments)]
|
|
async fn sync_bookmarked_chapter_content(
|
|
manager: Arc<BrowserManager>,
|
|
db: &PgPool,
|
|
storage: Arc<dyn Storage>,
|
|
http: &reqwest::Client,
|
|
rate: Arc<HostRateLimiters>,
|
|
source_id: &str,
|
|
workers: usize,
|
|
force_refetch: bool,
|
|
) -> anyhow::Result<()> {
|
|
let pending: Vec<(Uuid, Uuid, String)> = sqlx::query_as(
|
|
r#"
|
|
SELECT id, manga_id, source_url FROM (
|
|
SELECT DISTINCT c.id, c.manga_id, c.created_at, cs.source_url
|
|
FROM chapters c
|
|
JOIN bookmarks b ON b.manga_id = c.manga_id
|
|
JOIN chapter_sources cs ON cs.chapter_id = c.id
|
|
WHERE cs.source_id = $1
|
|
AND cs.dropped_at IS NULL
|
|
AND (c.page_count = 0 OR $2)
|
|
) sub
|
|
ORDER BY manga_id, created_at ASC
|
|
"#,
|
|
)
|
|
.bind(source_id)
|
|
.bind(force_refetch)
|
|
.fetch_all(db)
|
|
.await
|
|
.context("query pending chapter content")?;
|
|
|
|
if pending.is_empty() {
|
|
tracing::info!("chapter content: nothing pending");
|
|
return Ok(());
|
|
}
|
|
tracing::info!(count = pending.len(), workers, "chapter content phase starting");
|
|
|
|
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
|
let stats = std::sync::Mutex::new(WorkerStats::default());
|
|
|
|
stream::iter(pending.into_iter())
|
|
.for_each_concurrent(workers.max(1), |(chapter_id, manga_id, source_url)| {
|
|
let session_expired = Arc::clone(&session_expired);
|
|
let storage = Arc::clone(&storage);
|
|
let rate = Arc::clone(&rate);
|
|
let manager = Arc::clone(&manager);
|
|
let stats = &stats;
|
|
async move {
|
|
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
|
return;
|
|
}
|
|
let lease = match manager.acquire().await {
|
|
Ok(l) => l,
|
|
Err(e) => {
|
|
tracing::error!(%chapter_id, error = ?e, "browser acquire failed");
|
|
let mut s = stats.lock().unwrap();
|
|
s.failed += 1;
|
|
return;
|
|
}
|
|
};
|
|
let outcome = content::sync_chapter_content(
|
|
&lease,
|
|
db,
|
|
storage.as_ref(),
|
|
http,
|
|
rate.as_ref(),
|
|
chapter_id,
|
|
manga_id,
|
|
&source_url,
|
|
force_refetch,
|
|
)
|
|
.await;
|
|
drop(lease);
|
|
let mut s = stats.lock().unwrap();
|
|
match outcome {
|
|
Ok(SyncOutcome::Fetched { pages }) => {
|
|
tracing::info!(%chapter_id, pages, "chapter content fetched");
|
|
s.fetched += 1;
|
|
}
|
|
Ok(SyncOutcome::Skipped) => s.skipped += 1,
|
|
Ok(SyncOutcome::SessionExpired) => {
|
|
tracing::error!(
|
|
%chapter_id,
|
|
"session expired mid-run — refresh CRAWLER_PHPSESSID and re-run"
|
|
);
|
|
session_expired
|
|
.store(true, std::sync::atomic::Ordering::Relaxed);
|
|
}
|
|
Err(e) => {
|
|
tracing::warn!(
|
|
%chapter_id, error = ?e, "chapter content sync failed"
|
|
);
|
|
s.failed += 1;
|
|
}
|
|
}
|
|
}
|
|
})
|
|
.await;
|
|
|
|
let total = stats.into_inner().unwrap();
|
|
tracing::info!(
|
|
fetched = total.fetched,
|
|
skipped = total.skipped,
|
|
failed = total.failed,
|
|
"chapter content phase done"
|
|
);
|
|
|
|
if session_expired.load(std::sync::atomic::Ordering::Relaxed) {
|
|
anyhow::bail!("session expired during chapter content phase");
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Outcome counters for one chapter-content phase, accumulated by
/// `sync_bookmarked_chapter_content` and logged when the phase finishes.
#[derive(Debug, Default, Clone, Copy)]
struct WorkerStats {
    /// Chapters whose sync returned `SyncOutcome::Fetched`.
    fetched: usize,
    /// Chapters whose sync returned `SyncOutcome::Skipped`.
    skipped: usize,
    /// Chapters that failed: browser lease acquisition or the sync itself
    /// returned an error.
    failed: usize,
}
|
|
|
|
fn resolve_start_url() -> anyhow::Result<String> {
|
|
if let Some(arg) = std::env::args().nth(1) {
|
|
return Ok(arg);
|
|
}
|
|
std::env::var("CRAWLER_START_URL").map_err(|_| {
|
|
anyhow!(
|
|
"start URL is required — pass as first CLI arg or set $CRAWLER_START_URL"
|
|
)
|
|
})
|
|
}
|
|
|
|
/// Parse the CLI's `CRAWLER_MODE`. Defaults to `backfill` because the
|
|
/// binary is operator-driven (manual reseeds, force-refetches) — the
|
|
/// auto-detect logic lives in the daemon. `auto` is rejected because
|
|
/// the CLI has no DB state to consult before the run.
|
|
fn parse_crawler_mode(incremental_stop_after: usize) -> anyhow::Result<DiscoverMode> {
|
|
parse_crawler_mode_str(
|
|
std::env::var("CRAWLER_MODE").ok().as_deref(),
|
|
incremental_stop_after,
|
|
)
|
|
}
|
|
|
|
/// Pure variant of [`parse_crawler_mode`] — testable without env-var
|
|
/// mutation.
|
|
fn parse_crawler_mode_str(
|
|
raw: Option<&str>,
|
|
incremental_stop_after: usize,
|
|
) -> anyhow::Result<DiscoverMode> {
|
|
match raw.map(|s| s.trim().to_ascii_lowercase()).as_deref() {
|
|
None | Some("") | Some("backfill") => Ok(DiscoverMode::Backfill),
|
|
Some("incremental") => Ok(DiscoverMode::Incremental {
|
|
stop_after_unchanged: incremental_stop_after,
|
|
}),
|
|
Some("auto") => Err(anyhow!(
|
|
"CRAWLER_MODE=auto isn't supported by the CLI (use backfill or incremental); \
|
|
the daemon does auto-detection"
|
|
)),
|
|
Some(other) => Err(anyhow!(
|
|
"CRAWLER_MODE must be one of: backfill, incremental (got {other:?})"
|
|
)),
|
|
}
|
|
}
|
|
|
|
/// Read environment variable `name` as a `u64`.
///
/// Surrounding whitespace is ignored. Returns `default` when the variable
/// is unset, not valid Unicode, or does not parse as an unsigned integer.
fn env_u64(name: &str, default: u64) -> u64 {
    std::env::var(name)
        .ok()
        // `str::parse` rejects padding, so trim first; otherwise a value
        // like " 500 " would silently fall back to the default.
        .and_then(|raw| raw.trim().parse().ok())
        .unwrap_or(default)
}
|
|
|
|
/// Read environment variable `name` as a boolean flag.
///
/// Accepts `1` / `true` / `yes` as true and `0` / `false` / `no` as false,
/// ignoring ASCII case and surrounding whitespace. Returns `default` when
/// the variable is unset, not valid Unicode, or holds any other value.
fn env_bool(name: &str, default: bool) -> bool {
    match std::env::var(name) {
        // Normalise first so `True`, `YES` or ` no ` are not silently
        // treated as "unset".
        Ok(raw) => match raw.trim().to_ascii_lowercase().as_str() {
            "1" | "true" | "yes" => true,
            "0" | "false" | "no" => false,
            _ => default,
        },
        Err(_) => default,
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Both an unset and an empty `CRAWLER_MODE` fall back to Backfill.
    #[test]
    fn cli_mode_defaults_to_backfill_when_unset_or_blank() {
        for raw in [None, Some("")] {
            let mode = parse_crawler_mode_str(raw, 20).unwrap();
            assert!(matches!(mode, DiscoverMode::Backfill));
        }
    }

    /// The two supported modes parse, and Incremental carries the threshold.
    #[test]
    fn cli_mode_recognizes_backfill_and_incremental() {
        match parse_crawler_mode_str(Some("backfill"), 20).unwrap() {
            DiscoverMode::Backfill => {}
            _ => panic!("expected Backfill"),
        }

        match parse_crawler_mode_str(Some("incremental"), 9).unwrap() {
            DiscoverMode::Incremental {
                stop_after_unchanged,
            } => assert_eq!(stop_after_unchanged, 9),
            _ => panic!("expected Incremental"),
        }
    }

    /// `auto` gets its own error that redirects the operator to the daemon.
    #[test]
    fn cli_mode_rejects_auto_explicitly() {
        let msg = parse_crawler_mode_str(Some("auto"), 20)
            .unwrap_err()
            .to_string();
        assert!(
            msg.contains("daemon"),
            "rejection should point operator at the daemon: {msg}"
        );
    }

    /// Unknown values are rejected with a message listing the valid modes.
    #[test]
    fn cli_mode_rejects_unknown_value() {
        let msg = parse_crawler_mode_str(Some("garbage"), 20)
            .unwrap_err()
            .to_string();
        for expected in ["backfill", "incremental"] {
            assert!(msg.contains(expected));
        }
    }

    /// Padding and mixed case are normalised before matching.
    #[test]
    fn cli_mode_is_case_insensitive_and_trims() {
        match parse_crawler_mode_str(Some(" Incremental "), 4).unwrap() {
            DiscoverMode::Incremental {
                stop_after_unchanged,
            } => assert_eq!(stop_after_unchanged, 4),
            _ => panic!("expected Incremental"),
        }
    }
}
|
|
|