feat: in-process crawler daemon with cron and worker pool (0.28.0)
The backend now boots an internal crawler daemon that runs a daily metadata pass (CRAWLER_DAILY_AT in CRAWLER_TZ, advisory-lock guarded for multi-replica safety) and drains SyncChapterContent jobs from crawler_jobs through a worker pool. Chromium launches lazily on first job and is torn down after CRAWLER_IDLE_TIMEOUT_S seconds of inactivity. Modules: - crawler::browser_manager — lazy-launch / idle-teardown wrapper around browser::Handle, with an on_launch hook that re-injects PHPSESSID on every fresh Chromium spawn. - crawler::pipeline — run_metadata_pass (the shared discover/upsert/cover/sync-chapters loop) and the enqueue_bookmarked_pending helper used by the cron tick. - crawler::daemon — cron task + worker pool, behind two trait seams (MetadataPass, ChapterDispatcher) so tests can inject stubs without standing up Chromium or a live source. Behavior: - CRAWLER_DAEMON=false skips daemon spawn entirely (default for tests). - Catch-up tick fires on startup if the last persisted slot was missed. - A SyncOutcome::SessionExpired sets a sticky AtomicBool; workers idle until operator restart with a refreshed PHPSESSID. - Worker dispatch wrapped in catch_unwind so a panicking handler marks the job failed instead of taking down the worker. - Migration 0015 adds a small crawler_state k-v table for the last_metadata_tick_at watermark. Dep additions: chrono-tz (IANA TZ parsing). CLI (bin/crawler) reuses pipeline::run_metadata_pass and now holds the browser via BrowserManager so the on_launch session injection flow stays in one place. Inline chapter-content sync semantics are unchanged — the queue is for the daemon, force-refetches and manual backfills still bypass it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,14 +1,25 @@
|
||||
use std::sync::Arc;
|
||||
use std::sync::atomic::AtomicBool;
|
||||
|
||||
use anyhow::Context;
|
||||
use async_trait::async_trait;
|
||||
use axum::extract::DefaultBodyLimit;
|
||||
use axum::http::{HeaderName, HeaderValue, Method};
|
||||
use axum::Router;
|
||||
use sqlx::postgres::PgPoolOptions;
|
||||
use sqlx::PgPool;
|
||||
use tokio_util::sync::CancellationToken;
|
||||
use tower_http::cors::{AllowOrigin, CorsLayer};
|
||||
use tower_http::trace::TraceLayer;
|
||||
|
||||
use crate::config::{AuthConfig, Config, UploadConfig};
|
||||
use crate::config::{AuthConfig, Config, CrawlerConfig, UploadConfig};
|
||||
use crate::crawler::browser_manager::{self, BrowserManager};
|
||||
use crate::crawler::content::{self, SyncOutcome};
|
||||
use crate::crawler::daemon::{self, ChapterDispatcher, DaemonConfig, MetadataPass};
|
||||
use crate::crawler::jobs::JobPayload;
|
||||
use crate::crawler::pipeline::{self, MetadataStats};
|
||||
use crate::crawler::rate_limit::HostRateLimiters;
|
||||
use crate::crawler::session;
|
||||
use crate::storage::{LocalStorage, Storage};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -19,7 +30,23 @@ pub struct AppState {
|
||||
pub upload: UploadConfig,
|
||||
}
|
||||
|
||||
pub async fn build(config: Config) -> anyhow::Result<Router> {
|
||||
/// Bundle returned by [`build`]. The router is what `axum::serve` consumes;
|
||||
/// the daemon (when enabled) outlives the HTTP server and is awaited via
|
||||
/// [`AppHandle::shutdown`] after the listener has finished gracefully.
|
||||
pub struct AppHandle {
|
||||
pub router: Router,
|
||||
pub daemon: Option<daemon::DaemonHandle>,
|
||||
}
|
||||
|
||||
impl AppHandle {
|
||||
pub async fn shutdown(self) {
|
||||
if let Some(d) = self.daemon {
|
||||
d.shutdown().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn build(config: Config) -> anyhow::Result<AppHandle> {
|
||||
let db = PgPoolOptions::new()
|
||||
.max_connections(10)
|
||||
.connect(&config.database_url)
|
||||
@@ -28,13 +55,235 @@ pub async fn build(config: Config) -> anyhow::Result<Router> {
|
||||
|
||||
let storage: Arc<dyn Storage> = Arc::new(LocalStorage::new(config.storage_dir.clone()));
|
||||
|
||||
let daemon = if config.crawler.daemon_enabled {
|
||||
Some(spawn_crawler_daemon(db.clone(), Arc::clone(&storage), &config.crawler).await?)
|
||||
} else {
|
||||
tracing::info!("crawler daemon disabled (CRAWLER_DAEMON=false)");
|
||||
None
|
||||
};
|
||||
|
||||
let state = AppState {
|
||||
db,
|
||||
storage,
|
||||
auth: config.auth.clone(),
|
||||
upload: config.upload.clone(),
|
||||
};
|
||||
Ok(router(state).layer(cors_layer(&config.cors_allowed_origins)))
|
||||
let router = router(state).layer(cors_layer(&config.cors_allowed_origins));
|
||||
Ok(AppHandle { router, daemon })
|
||||
}
|
||||
|
||||
async fn spawn_crawler_daemon(
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
cfg: &CrawlerConfig,
|
||||
) -> anyhow::Result<daemon::DaemonHandle> {
|
||||
// Reqwest client with cookie jar pre-seeded so CDN image fetches
|
||||
// include PHPSESSID. Same shape as bin/crawler.rs main().
|
||||
let cookie_jar = Arc::new(reqwest::cookie::Jar::default());
|
||||
if let (Some(sid), Some(domain), Some(start_url)) =
|
||||
(&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url)
|
||||
{
|
||||
let cookie_str = format!("PHPSESSID={sid}; Domain={domain}; Path=/");
|
||||
let seed_url = reqwest::Url::parse(start_url)
|
||||
.context("parse CRAWLER_START_URL for cookie seed")?;
|
||||
cookie_jar.add_cookie_str(&cookie_str, &seed_url);
|
||||
}
|
||||
let mut http_builder = reqwest::Client::builder()
|
||||
.timeout(std::time::Duration::from_secs(30))
|
||||
.no_proxy()
|
||||
.cookie_provider(cookie_jar);
|
||||
if let Some(ua) = &cfg.user_agent {
|
||||
http_builder = http_builder.user_agent(ua);
|
||||
}
|
||||
if let Some(proxy) = &cfg.proxy {
|
||||
http_builder = http_builder
|
||||
.proxy(reqwest::Proxy::all(proxy).with_context(|| format!("parse proxy: {proxy}"))?);
|
||||
}
|
||||
let http = http_builder.build().context("build crawler reqwest")?;
|
||||
|
||||
let mut rate = HostRateLimiters::new(std::time::Duration::from_millis(cfg.rate_ms));
|
||||
if let Some(host) = &cfg.cdn_host {
|
||||
rate = rate.with_override(host, std::time::Duration::from_millis(cfg.cdn_rate_ms));
|
||||
}
|
||||
let rate = Arc::new(rate);
|
||||
|
||||
// Browser manager. on_launch re-injects PHPSESSID on every fresh
|
||||
// chromium spawn so an idle teardown followed by re-launch stays
|
||||
// authenticated without operator action.
|
||||
let mut launch_opts = cfg.browser.clone();
|
||||
if let Some(proxy) = &cfg.proxy {
|
||||
launch_opts.extra_args.push(format!("--proxy-server={proxy}"));
|
||||
}
|
||||
let on_launch = match (&cfg.phpsessid, &cfg.cookie_domain, &cfg.start_url) {
|
||||
(Some(sid), Some(domain), Some(start_url)) => {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url = start_url.clone();
|
||||
let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| {
|
||||
let sid = sid.clone();
|
||||
let domain = domain.clone();
|
||||
let start_url = start_url.clone();
|
||||
Box::pin(async move {
|
||||
session::inject_phpsessid(&browser, &sid, &domain)
|
||||
.await
|
||||
.context("on_launch: inject_phpsessid")?;
|
||||
session::verify_session(&browser, &start_url)
|
||||
.await
|
||||
.context("on_launch: verify_session")?;
|
||||
Ok(())
|
||||
})
|
||||
});
|
||||
on_launch
|
||||
}
|
||||
_ => browser_manager::noop_on_launch(),
|
||||
};
|
||||
let browser_manager = BrowserManager::new(launch_opts, cfg.idle_timeout, on_launch);
|
||||
|
||||
let session_expired = Arc::new(AtomicBool::new(false));
|
||||
|
||||
let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
|
||||
let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass {
|
||||
browser_manager: Arc::clone(&browser_manager),
|
||||
db: db.clone(),
|
||||
storage: Arc::clone(&storage),
|
||||
http: http.clone(),
|
||||
rate: Arc::clone(&rate),
|
||||
start_url: url.clone(),
|
||||
});
|
||||
m
|
||||
});
|
||||
|
||||
let dispatcher: Arc<dyn ChapterDispatcher> = Arc::new(RealChapterDispatcher {
|
||||
browser_manager: Arc::clone(&browser_manager),
|
||||
db: db.clone(),
|
||||
storage: Arc::clone(&storage),
|
||||
http,
|
||||
rate: Arc::clone(&rate),
|
||||
});
|
||||
|
||||
// Shared cancellation: daemon shutdown cancels the BrowserManager's
|
||||
// idle reaper too. Reaper itself is added to the daemon's extra_tasks
|
||||
// so DaemonHandle::shutdown awaits its completion.
|
||||
let cancel = CancellationToken::new();
|
||||
let reaper_task = browser_manager::spawn_idle_reaper(
|
||||
Arc::clone(&browser_manager),
|
||||
cancel.clone(),
|
||||
);
|
||||
// Also close the browser explicitly on shutdown so we don't rely on
|
||||
// kill-on-drop when other Arc<Browser> holders may still exist.
|
||||
let shutdown_task = {
|
||||
let cancel = cancel.clone();
|
||||
let mgr = Arc::clone(&browser_manager);
|
||||
tokio::spawn(async move {
|
||||
cancel.cancelled().await;
|
||||
mgr.shutdown().await;
|
||||
})
|
||||
};
|
||||
|
||||
let daemon_handle = daemon::spawn(
|
||||
db,
|
||||
cancel,
|
||||
DaemonConfig {
|
||||
metadata_pass,
|
||||
dispatcher,
|
||||
chapter_workers: cfg.chapter_workers,
|
||||
daily_at: cfg.daily_at,
|
||||
tz: cfg.tz,
|
||||
retention_days: cfg.retention_days,
|
||||
session_expired,
|
||||
extra_tasks: vec![reaper_task, shutdown_task],
|
||||
},
|
||||
);
|
||||
|
||||
Ok(daemon_handle)
|
||||
}
|
||||
|
||||
// Real impls of the daemon traits, owning the browser manager + I/O. Kept
|
||||
// in app.rs because they need the same builder-side env wiring that
|
||||
// AppState gets — the daemon module itself stays free of reqwest / storage
|
||||
// details so its tests don't pull them in.
|
||||
|
||||
struct RealMetadataPass {
|
||||
browser_manager: Arc<BrowserManager>,
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
start_url: String,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl MetadataPass for RealMetadataPass {
|
||||
async fn run(&self) -> anyhow::Result<MetadataStats> {
|
||||
pipeline::run_metadata_pass(
|
||||
&self.browser_manager,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
&self.http,
|
||||
&self.rate,
|
||||
&self.start_url,
|
||||
0,
|
||||
false,
|
||||
)
|
||||
.await
|
||||
}
|
||||
}
|
||||
|
||||
struct RealChapterDispatcher {
|
||||
browser_manager: Arc<BrowserManager>,
|
||||
db: PgPool,
|
||||
storage: Arc<dyn Storage>,
|
||||
http: reqwest::Client,
|
||||
rate: Arc<HostRateLimiters>,
|
||||
}
|
||||
|
||||
#[async_trait]
|
||||
impl ChapterDispatcher for RealChapterDispatcher {
|
||||
async fn dispatch(&self, payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
match payload {
|
||||
JobPayload::SyncChapterContent {
|
||||
source_id: _,
|
||||
chapter_id,
|
||||
source_chapter_key: _,
|
||||
} => {
|
||||
// Look up manga_id + source_url for this chapter.
|
||||
let row: Option<(uuid::Uuid, String)> = sqlx::query_as(
|
||||
"SELECT c.manga_id, cs.source_url \
|
||||
FROM chapters c \
|
||||
JOIN chapter_sources cs ON cs.chapter_id = c.id \
|
||||
WHERE c.id = $1 \
|
||||
LIMIT 1",
|
||||
)
|
||||
.bind(chapter_id)
|
||||
.fetch_optional(&self.db)
|
||||
.await
|
||||
.context("look up chapter for dispatch")?;
|
||||
let Some((manga_id, source_url)) = row else {
|
||||
// Chapter (or its source row) is gone — ack done.
|
||||
return Ok(SyncOutcome::Skipped);
|
||||
};
|
||||
let lease = self.browser_manager.acquire().await?;
|
||||
let outcome = content::sync_chapter_content(
|
||||
&lease,
|
||||
&self.db,
|
||||
self.storage.as_ref(),
|
||||
&self.http,
|
||||
&self.rate,
|
||||
chapter_id,
|
||||
manga_id,
|
||||
&source_url,
|
||||
false,
|
||||
)
|
||||
.await?;
|
||||
drop(lease);
|
||||
Ok(outcome)
|
||||
}
|
||||
// Other payload kinds aren't dispatched by this daemon yet —
|
||||
// metadata-driven jobs (Discover/SyncManga/SyncChapterList)
|
||||
// are handled inline by the cron's metadata pass.
|
||||
_ => Ok(SyncOutcome::Skipped),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Build a router from a pre-assembled state. Used by integration tests
|
||||
|
||||
Reference in New Issue
Block a user