feat(crawler): reliability fixes — heartbeat, streaming, jitter, timeout, breaker
A1 Lease heartbeat: jobs::renew keeps a long-but-healthy job's lease fresh so it is never stolen mid-flight nor inflated toward max_attempts. A2 Stream chapter pages straight to storage (peak memory = one image) and persist rows + page_count in one short transaction off the network path (S3-ready); roll back stored blobs on failure via Storage::delete. A3 ±20% jitter on exponential backoff to avoid a retry thundering herd. A4 Outer per-dispatch timeout (CRAWLER_JOB_TIMEOUT_SECS, default 600) so a hung job is acked-failed instead of wedging a worker. A5 Metadata circuit-breaker (CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES, default 10): abort a pass on a source outage without marking a clean exit, so the next tick recovery-sweeps. Adds CRAWLER_BROWSER_RESTART_THRESHOLD config (used by the upcoming coordinated browser restart). Bumps version 0.52.0 -> 0.53.0. Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
2
backend/Cargo.lock
generated
2
backend/Cargo.lock
generated
@@ -1470,7 +1470,7 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4"
|
||||
|
||||
[[package]]
|
||||
name = "mangalord"
|
||||
version = "0.52.0"
|
||||
version = "0.53.0"
|
||||
dependencies = [
|
||||
"anyhow",
|
||||
"argon2",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "mangalord"
|
||||
version = "0.52.0"
|
||||
version = "0.53.0"
|
||||
edition = "2021"
|
||||
default-run = "mangalord"
|
||||
|
||||
|
||||
@@ -210,6 +210,7 @@ async fn spawn_crawler_daemon(
|
||||
manga_limit: cfg.manga_limit,
|
||||
download_allowlist: cfg.download_allowlist.clone(),
|
||||
max_image_bytes: cfg.max_image_bytes,
|
||||
metadata_max_consecutive_failures: cfg.metadata_max_consecutive_failures,
|
||||
tor: tor.as_ref().map(Arc::clone),
|
||||
});
|
||||
m
|
||||
@@ -267,6 +268,7 @@ async fn spawn_crawler_daemon(
|
||||
tz: cfg.tz,
|
||||
retention_days: cfg.retention_days,
|
||||
session_expired,
|
||||
job_timeout: cfg.job_timeout,
|
||||
extra_tasks: vec![reaper_task, shutdown_task],
|
||||
},
|
||||
);
|
||||
@@ -292,6 +294,7 @@ struct RealMetadataPass {
|
||||
manga_limit: usize,
|
||||
download_allowlist: DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
metadata_max_consecutive_failures: u32,
|
||||
tor: Option<Arc<crate::crawler::tor::TorController>>,
|
||||
}
|
||||
|
||||
@@ -309,6 +312,7 @@ impl MetadataPass for RealMetadataPass {
|
||||
false,
|
||||
&self.download_allowlist,
|
||||
self.max_image_bytes,
|
||||
self.metadata_max_consecutive_failures,
|
||||
self.tor.as_deref(),
|
||||
)
|
||||
.await;
|
||||
|
||||
@@ -303,6 +303,9 @@ async fn run(
|
||||
skip_chapters,
|
||||
allowlist.as_ref(),
|
||||
max_image_bytes,
|
||||
// Circuit-breaker disabled for the operator-driven CLI: a manual
|
||||
// sweep should push through transient failures, not self-abort.
|
||||
0,
|
||||
tor.as_deref(),
|
||||
)
|
||||
.await?;
|
||||
|
||||
@@ -132,6 +132,19 @@ pub struct CrawlerConfig {
|
||||
/// (full sweep up to the source's own bound). Sourced from
|
||||
/// `CRAWLER_LIMIT`, mirroring the CLI binary.
|
||||
pub manga_limit: usize,
|
||||
/// Hard upper bound on a single chapter-content job dispatch. A job
|
||||
/// exceeding this is acked failed (exponential backoff) instead of
|
||||
/// wedging a worker. Defaults to 600s. `CRAWLER_JOB_TIMEOUT_SECS`.
|
||||
pub job_timeout: Duration,
|
||||
/// Consecutive `fetch_manga` failures that abort a metadata pass
|
||||
/// (circuit-breaker for a source outage). The pass does NOT mark a
|
||||
/// clean exit, so the next tick does a recovery sweep. Defaults to
|
||||
/// 10. `CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES`.
|
||||
pub metadata_max_consecutive_failures: u32,
|
||||
/// Consecutive transient chapter failures (after TOR recircuit is
|
||||
/// exhausted) that trigger an automatic coordinated browser restart.
|
||||
/// Defaults to 3. `CRAWLER_BROWSER_RESTART_THRESHOLD`.
|
||||
pub browser_restart_threshold: u32,
|
||||
}
|
||||
|
||||
impl Default for CrawlerConfig {
|
||||
@@ -159,6 +172,9 @@ impl Default for CrawlerConfig {
|
||||
download_allowlist: DownloadAllowlist::new(),
|
||||
max_image_bytes: DEFAULT_MAX_IMAGE_BYTES,
|
||||
manga_limit: 0,
|
||||
job_timeout: Duration::from_secs(600),
|
||||
metadata_max_consecutive_failures: 10,
|
||||
browser_restart_threshold: 3,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -283,6 +299,13 @@ impl CrawlerConfig {
|
||||
download_allowlist,
|
||||
max_image_bytes: env_usize("CRAWLER_MAX_IMAGE_BYTES", DEFAULT_MAX_IMAGE_BYTES),
|
||||
manga_limit: env_usize("CRAWLER_LIMIT", 0),
|
||||
job_timeout: Duration::from_secs(env_u64("CRAWLER_JOB_TIMEOUT_SECS", 600).max(1)),
|
||||
metadata_max_consecutive_failures: env_u64(
|
||||
"CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES",
|
||||
10,
|
||||
) as u32,
|
||||
browser_restart_threshold: env_u64("CRAWLER_BROWSER_RESTART_THRESHOLD", 3).max(1)
|
||||
as u32,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -384,6 +407,33 @@ mod tests {
|
||||
assert_eq!(cfg.manga_limit, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reliability_knobs_default_when_unset() {
|
||||
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
|
||||
std::env::remove_var("CRAWLER_JOB_TIMEOUT_SECS");
|
||||
std::env::remove_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES");
|
||||
std::env::remove_var("CRAWLER_BROWSER_RESTART_THRESHOLD");
|
||||
let cfg = CrawlerConfig::from_env().expect("from_env");
|
||||
assert_eq!(cfg.job_timeout, Duration::from_secs(600));
|
||||
assert_eq!(cfg.metadata_max_consecutive_failures, 10);
|
||||
assert_eq!(cfg.browser_restart_threshold, 3);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn reliability_knobs_parse_from_env() {
|
||||
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
|
||||
std::env::set_var("CRAWLER_JOB_TIMEOUT_SECS", "120");
|
||||
std::env::set_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES", "5");
|
||||
std::env::set_var("CRAWLER_BROWSER_RESTART_THRESHOLD", "7");
|
||||
let cfg = CrawlerConfig::from_env().expect("from_env");
|
||||
std::env::remove_var("CRAWLER_JOB_TIMEOUT_SECS");
|
||||
std::env::remove_var("CRAWLER_METADATA_MAX_CONSECUTIVE_FAILURES");
|
||||
std::env::remove_var("CRAWLER_BROWSER_RESTART_THRESHOLD");
|
||||
assert_eq!(cfg.job_timeout, Duration::from_secs(120));
|
||||
assert_eq!(cfg.metadata_max_consecutive_failures, 5);
|
||||
assert_eq!(cfg.browser_restart_threshold, 7);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn private_mode_env_parses_true() {
|
||||
let _g = ENV_GUARD.lock().unwrap_or_else(|p| p.into_inner());
|
||||
|
||||
@@ -260,56 +260,118 @@ pub async fn sync_chapter_content(
|
||||
// Resolve image URLs against the chapter URL (they may be relative).
|
||||
let base = reqwest::Url::parse(source_url).context("parse chapter URL")?;
|
||||
|
||||
// Fetch every image bytes-first into memory before writing
|
||||
// anything. Lets us bail the whole chapter cleanly if any image
|
||||
// fails — DB stays at page_count=0, no partial rows persisted.
|
||||
let mut fetched: Vec<(i32, Vec<u8>, &'static str)> = Vec::with_capacity(images.len());
|
||||
// Stream each image straight to storage as it's fetched, capping peak
|
||||
// memory at a single image rather than the whole chapter. Track the
|
||||
// keys written so they can be rolled back if a later page (or the
|
||||
// final DB commit) fails — preserving the all-or-nothing guarantee
|
||||
// without holding a DB transaction open across the network puts
|
||||
// (which matters once `Storage` is backed by S3).
|
||||
let mut written_keys: Vec<String> = Vec::with_capacity(images.len());
|
||||
let mut stored: Vec<StoredPage> = Vec::with_capacity(images.len());
|
||||
for img in &images {
|
||||
let url = base.join(&img.url).with_context(|| {
|
||||
format!("join image URL {} onto {source_url}", img.url)
|
||||
})?;
|
||||
rate.wait_for(url.as_str()).await?;
|
||||
let bytes = fetch_bytes_capped(
|
||||
match download_and_store_page(
|
||||
storage,
|
||||
http,
|
||||
url.as_str(),
|
||||
Some(source_url),
|
||||
rate,
|
||||
&base,
|
||||
source_url,
|
||||
manga_id,
|
||||
chapter_id,
|
||||
img,
|
||||
allowlist,
|
||||
max_image_bytes,
|
||||
)
|
||||
.await?
|
||||
.to_vec();
|
||||
// Reject any non-image response: the only valid output of an
|
||||
// image URL is an image. `infer` returns None on truncated
|
||||
// bytes too, which also wants to be a failure not a silent
|
||||
// `.bin` extension.
|
||||
if !looks_like_image(&bytes) {
|
||||
anyhow::bail!(
|
||||
"image URL {url} returned non-image bytes \
|
||||
(first 16: {:?}); refusing to store as binary blob",
|
||||
&bytes.get(..16.min(bytes.len()))
|
||||
);
|
||||
.await
|
||||
{
|
||||
Ok(page) => {
|
||||
written_keys.push(page.storage_key.clone());
|
||||
stored.push(page);
|
||||
}
|
||||
Err(e) => {
|
||||
cleanup_orphans(storage, &written_keys).await;
|
||||
return Err(e);
|
||||
}
|
||||
}
|
||||
let ext = infer::get(&bytes)
|
||||
.map(|k| k.extension())
|
||||
.expect("looks_like_image asserted infer succeeded");
|
||||
fetched.push((img.page_number, bytes, ext));
|
||||
}
|
||||
|
||||
// Atomic write: storage puts + page row inserts + page_count
|
||||
// update, all in one transaction. If anything fails, rollback +
|
||||
// the chapter is retried next run. Storage orphans the bytes; a
|
||||
// reaper sweeps them later.
|
||||
let mut tx = db.begin().await.context("open chapter sync tx")?;
|
||||
for (page_number, bytes, ext) in &fetched {
|
||||
let key = format!(
|
||||
"mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
|
||||
page_number
|
||||
// Short transaction: page rows + page_count only, no network I/O. On
|
||||
// failure, roll back the stored bytes so the chapter stays at
|
||||
// page_count=0 and is retried cleanly next run.
|
||||
if let Err(e) = persist_pages(db, chapter_id, &stored).await {
|
||||
cleanup_orphans(storage, &written_keys).await;
|
||||
return Err(e);
|
||||
}
|
||||
|
||||
Ok(SyncOutcome::Fetched { pages: stored.len() })
|
||||
}
|
||||
|
||||
/// A page image that has been written to storage and is awaiting its DB
|
||||
/// row. Carries everything `persist_pages` needs.
|
||||
pub(crate) struct StoredPage {
|
||||
page_number: i32,
|
||||
storage_key: String,
|
||||
content_type: String,
|
||||
}
|
||||
|
||||
/// Download a single page image, validate it's really an image, and write
|
||||
/// it to storage. Returns the storage key + content type. Does not touch
|
||||
/// the DB — persistence is batched into one short transaction afterward.
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
async fn download_and_store_page(
|
||||
storage: &dyn Storage,
|
||||
http: &reqwest::Client,
|
||||
rate: &HostRateLimiters,
|
||||
base: &reqwest::Url,
|
||||
source_url: &str,
|
||||
manga_id: Uuid,
|
||||
chapter_id: Uuid,
|
||||
img: &ChapterImage,
|
||||
allowlist: &DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
) -> anyhow::Result<StoredPage> {
|
||||
let url = base
|
||||
.join(&img.url)
|
||||
.with_context(|| format!("join image URL {} onto {source_url}", img.url))?;
|
||||
rate.wait_for(url.as_str()).await?;
|
||||
let bytes = fetch_bytes_capped(http, url.as_str(), Some(source_url), allowlist, max_image_bytes)
|
||||
.await?;
|
||||
// Reject any non-image response: the only valid output of an image URL
|
||||
// is an image. `infer` returns None on truncated bytes too, which also
|
||||
// wants to be a failure not a silent `.bin` extension.
|
||||
if !looks_like_image(&bytes) {
|
||||
anyhow::bail!(
|
||||
"image URL {url} returned non-image bytes \
|
||||
(first 16: {:?}); refusing to store as binary blob",
|
||||
&bytes.get(..16.min(bytes.len()))
|
||||
);
|
||||
storage
|
||||
.put(&key, bytes)
|
||||
.await
|
||||
.with_context(|| format!("put {key}"))?;
|
||||
// (chapter_id, page_number) is unique — re-runs idempotent.
|
||||
}
|
||||
let ext = infer::get(&bytes)
|
||||
.map(|k| k.extension())
|
||||
.expect("looks_like_image asserted infer succeeded");
|
||||
let key = format!(
|
||||
"mangas/{manga_id}/chapters/{chapter_id}/pages/{:04}.{ext}",
|
||||
img.page_number
|
||||
);
|
||||
storage
|
||||
.put(&key, &bytes)
|
||||
.await
|
||||
.with_context(|| format!("put {key}"))?;
|
||||
Ok(StoredPage {
|
||||
page_number: img.page_number,
|
||||
storage_key: key,
|
||||
content_type: format!("image/{ext}"),
|
||||
})
|
||||
}
|
||||
|
||||
/// Persist the page rows + chapter `page_count` in one short transaction.
|
||||
/// `(chapter_id, page_number)` is unique so re-runs are idempotent.
|
||||
pub(crate) async fn persist_pages(
|
||||
db: &PgPool,
|
||||
chapter_id: Uuid,
|
||||
stored: &[StoredPage],
|
||||
) -> anyhow::Result<()> {
|
||||
let mut tx = db.begin().await.context("open chapter sync tx")?;
|
||||
for page in stored {
|
||||
sqlx::query(
|
||||
"INSERT INTO pages (chapter_id, page_number, storage_key, content_type)
|
||||
VALUES ($1, $2, $3, $4)
|
||||
@@ -318,22 +380,36 @@ pub async fn sync_chapter_content(
|
||||
content_type = EXCLUDED.content_type",
|
||||
)
|
||||
.bind(chapter_id)
|
||||
.bind(page_number)
|
||||
.bind(&key)
|
||||
.bind(format!("image/{ext}"))
|
||||
.bind(page.page_number)
|
||||
.bind(&page.storage_key)
|
||||
.bind(&page.content_type)
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.with_context(|| format!("insert page row {page_number}"))?;
|
||||
.with_context(|| format!("insert page row {}", page.page_number))?;
|
||||
}
|
||||
sqlx::query("UPDATE chapters SET page_count = $1 WHERE id = $2")
|
||||
.bind(fetched.len() as i32)
|
||||
.bind(stored.len() as i32)
|
||||
.bind(chapter_id)
|
||||
.execute(&mut *tx)
|
||||
.await
|
||||
.context("update page_count")?;
|
||||
tx.commit().await.context("commit chapter sync")?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
Ok(SyncOutcome::Fetched { pages: fetched.len() })
|
||||
/// Best-effort delete of partially-written page blobs after a chapter sync
|
||||
/// fails, so a retry doesn't accumulate orphans. Errors are logged, not
|
||||
/// raised — a leftover blob is harmless and a future reaper can sweep it.
|
||||
pub(crate) async fn cleanup_orphans(storage: &dyn Storage, keys: &[String]) {
|
||||
for key in keys {
|
||||
if let Err(e) = storage.delete(key).await {
|
||||
tracing::warn!(
|
||||
%key,
|
||||
error = ?e,
|
||||
"failed to delete orphaned page blob after chapter sync failure"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Suppress unused-import warning for `session::registrable_domain`
|
||||
@@ -347,6 +423,90 @@ fn _keep_session_in_scope() {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::storage::LocalStorage;
|
||||
|
||||
#[tokio::test]
|
||||
async fn cleanup_orphans_deletes_written_keys() {
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let storage = LocalStorage::new(dir.path());
|
||||
let keys = vec![
|
||||
"mangas/m/chapters/c/pages/0001.jpg".to_string(),
|
||||
"mangas/m/chapters/c/pages/0002.jpg".to_string(),
|
||||
];
|
||||
for k in &keys {
|
||||
storage.put(k, b"\xff\xd8\xff\xe0 jpeg-ish").await.unwrap();
|
||||
assert!(storage.exists(k).await.unwrap());
|
||||
}
|
||||
cleanup_orphans(&storage, &keys).await;
|
||||
for k in &keys {
|
||||
assert!(!storage.exists(k).await.unwrap(), "{k} should be deleted");
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn cleanup_orphans_tolerates_missing_keys() {
|
||||
// A key that was never written (e.g. the put itself failed) must
|
||||
// not make cleanup error — it's best-effort.
|
||||
let dir = tempfile::tempdir().unwrap();
|
||||
let storage = LocalStorage::new(dir.path());
|
||||
cleanup_orphans(&storage, &["never/written.jpg".to_string()]).await;
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn persist_pages_inserts_rows_and_sets_page_count(pool: PgPool) {
|
||||
let manga_id = Uuid::new_v4();
|
||||
let chapter_id = Uuid::new_v4();
|
||||
sqlx::query("INSERT INTO mangas (id, title) VALUES ($1, 'T')")
|
||||
.bind(manga_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
sqlx::query("INSERT INTO chapters (id, manga_id, number) VALUES ($1, $2, 1)")
|
||||
.bind(chapter_id)
|
||||
.bind(manga_id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let stored = vec![
|
||||
StoredPage {
|
||||
page_number: 1,
|
||||
storage_key: "k/0001.jpg".into(),
|
||||
content_type: "image/jpeg".into(),
|
||||
},
|
||||
StoredPage {
|
||||
page_number: 2,
|
||||
storage_key: "k/0002.jpg".into(),
|
||||
content_type: "image/jpeg".into(),
|
||||
},
|
||||
];
|
||||
persist_pages(&pool, chapter_id, &stored).await.unwrap();
|
||||
|
||||
let page_count: i32 =
|
||||
sqlx::query_scalar("SELECT page_count FROM chapters WHERE id = $1")
|
||||
.bind(chapter_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(page_count, 2);
|
||||
let rows: i64 =
|
||||
sqlx::query_scalar("SELECT COUNT(*) FROM pages WHERE chapter_id = $1")
|
||||
.bind(chapter_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(rows, 2);
|
||||
|
||||
// Idempotent re-run (force refetch path): same rows, page_count stable.
|
||||
persist_pages(&pool, chapter_id, &stored).await.unwrap();
|
||||
let rows2: i64 =
|
||||
sqlx::query_scalar("SELECT COUNT(*) FROM pages WHERE chapter_id = $1")
|
||||
.bind(chapter_id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(rows2, 2, "re-run is idempotent via ON CONFLICT");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_chapter_pages_skips_loader_and_sorts_by_id() {
|
||||
|
||||
@@ -56,6 +56,15 @@ pub const CRON_LOCK_KEY: i64 = 0x4D414E47414C5244;
|
||||
|
||||
const STATE_KEY_LAST_TICK: &str = "last_metadata_tick_at";
|
||||
|
||||
/// Lease window handed to `jobs::lease`. Kept short, but continuously
|
||||
/// extended by the per-job heartbeat (see [`WorkerContext::process_lease`])
|
||||
/// so a long-but-healthy job never lapses and gets stolen.
|
||||
const LEASE_DURATION: Duration = Duration::from_secs(60);
|
||||
|
||||
/// How often the heartbeat renews the lease while a job runs. A third of
|
||||
/// the lease window leaves two missed-beat's slack before expiry.
|
||||
const LEASE_HEARTBEAT: Duration = Duration::from_secs(20);
|
||||
|
||||
#[async_trait]
|
||||
pub trait MetadataPass: Send + Sync {
|
||||
async fn run(&self) -> anyhow::Result<pipeline::MetadataStats>;
|
||||
@@ -77,6 +86,11 @@ pub struct DaemonConfig {
|
||||
pub tz: Tz,
|
||||
pub retention_days: u32,
|
||||
pub session_expired: Arc<AtomicBool>,
|
||||
/// Hard upper bound on a single job's dispatch. A job that exceeds it
|
||||
/// is acked failed (exponential backoff) rather than wedging a worker
|
||||
/// forever. Must exceed [`LEASE_HEARTBEAT`] and the realistic
|
||||
/// single-job runtime.
|
||||
pub job_timeout: Duration,
|
||||
/// Tasks that should run alongside the cron + workers and be cancelled
|
||||
/// on shutdown. Used to hand the daemon ownership of the browser
|
||||
/// manager's idle reaper.
|
||||
@@ -123,6 +137,7 @@ pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> Daem
|
||||
tz,
|
||||
retention_days,
|
||||
session_expired,
|
||||
job_timeout,
|
||||
extra_tasks,
|
||||
} = cfg;
|
||||
|
||||
@@ -146,6 +161,7 @@ pub fn spawn(pool: PgPool, cancel: CancellationToken, cfg: DaemonConfig) -> Daem
|
||||
cancel: cancel.clone(),
|
||||
dispatcher: Arc::clone(&dispatcher),
|
||||
session_expired: Arc::clone(&session_expired),
|
||||
job_timeout,
|
||||
id: worker_id,
|
||||
};
|
||||
join.spawn(async move { ctx.run().await });
|
||||
@@ -283,6 +299,7 @@ struct WorkerContext {
|
||||
cancel: CancellationToken,
|
||||
dispatcher: Arc<dyn ChapterDispatcher>,
|
||||
session_expired: Arc<AtomicBool>,
|
||||
job_timeout: Duration,
|
||||
id: usize,
|
||||
}
|
||||
|
||||
@@ -303,7 +320,7 @@ impl WorkerContext {
|
||||
&self.pool,
|
||||
Some(KIND_SYNC_CHAPTER_CONTENT),
|
||||
1,
|
||||
Duration::from_secs(60),
|
||||
LEASE_DURATION,
|
||||
)
|
||||
.await
|
||||
{
|
||||
@@ -341,9 +358,54 @@ impl WorkerContext {
|
||||
}
|
||||
}
|
||||
|
||||
let outcome = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
|
||||
.catch_unwind()
|
||||
.await;
|
||||
// Heartbeat: keep the lease fresh while the (potentially long)
|
||||
// dispatch runs, so a slow-but-healthy job is never re-leased and
|
||||
// never inflates `attempts` toward `max_attempts`. Stops itself
|
||||
// once the job is no longer ours (renew returns false).
|
||||
let heartbeat = {
|
||||
let hb_pool = self.pool.clone();
|
||||
let hb_id = lease.id;
|
||||
tokio::spawn(async move {
|
||||
loop {
|
||||
tokio::time::sleep(LEASE_HEARTBEAT).await;
|
||||
match jobs::renew(&hb_pool, hb_id, LEASE_DURATION).await {
|
||||
Ok(true) => {}
|
||||
Ok(false) => break,
|
||||
Err(e) => {
|
||||
tracing::warn!(lease_id = %hb_id, ?e, "heartbeat renew failed");
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
};
|
||||
|
||||
// Outer timeout: a dispatch that exceeds `job_timeout` is acked
|
||||
// failed (exponential backoff) rather than wedging the worker.
|
||||
let dispatch = AssertUnwindSafe(self.dispatcher.dispatch(lease.payload.clone()))
|
||||
.catch_unwind();
|
||||
let outcome = tokio::time::timeout(self.job_timeout, dispatch).await;
|
||||
heartbeat.abort();
|
||||
|
||||
let outcome = match outcome {
|
||||
Ok(o) => o,
|
||||
Err(_elapsed) => {
|
||||
tracing::warn!(
|
||||
worker = self.id,
|
||||
lease_id = %lease.id,
|
||||
timeout_secs = self.job_timeout.as_secs(),
|
||||
"worker: dispatch timed out — ack failed"
|
||||
);
|
||||
let _ = jobs::ack_failed(
|
||||
&self.pool,
|
||||
lease.id,
|
||||
"dispatch timed out",
|
||||
lease.attempts,
|
||||
lease.max_attempts,
|
||||
)
|
||||
.await;
|
||||
return;
|
||||
}
|
||||
};
|
||||
match outcome {
|
||||
Ok(Ok(SyncOutcome::Fetched { .. } | SyncOutcome::Skipped)) => {
|
||||
let _ = jobs::ack_done(&self.pool, lease.id).await;
|
||||
|
||||
@@ -66,16 +66,33 @@ pub struct Lease {
|
||||
pub max_attempts: i32,
|
||||
}
|
||||
|
||||
/// Exponential backoff for `ack_failed` retries. `attempts` is the
|
||||
/// post-increment value reported by `lease()` (so the first failure has
|
||||
/// `attempts == 1` and waits 60s, the second 120s, etc.). Capped at 1h to
|
||||
/// avoid runaway long sleeps that would outlive the daemon process.
|
||||
fn backoff_for(attempts: i32) -> Duration {
|
||||
/// Deterministic exponential backoff base for `ack_failed` retries.
|
||||
/// `attempts` is the post-increment value reported by `lease()` (so the
|
||||
/// first failure has `attempts == 1` and waits 60s, the second 120s,
|
||||
/// etc.). Capped at 1h to avoid runaway long sleeps that would outlive
|
||||
/// the daemon process. Jitter is applied separately by [`apply_jitter`].
|
||||
fn backoff_base(attempts: i32) -> Duration {
|
||||
let shift = attempts.saturating_sub(1).clamp(0, 20) as u32;
|
||||
let secs = 60u64.saturating_mul(1u64 << shift);
|
||||
Duration::from_secs(secs.min(3600))
|
||||
}
|
||||
|
||||
/// Apply ±20% jitter to a backoff duration. `jitter` is a fraction in
|
||||
/// `[0.0, 1.0)` (e.g. `rand::random::<f64>()`), mapped to a multiplier in
|
||||
/// `[0.8, 1.2)`. Pure so the bounds stay unit-testable. Spreading retries
|
||||
/// avoids a thundering herd when a source outage fails many jobs at once.
|
||||
fn apply_jitter(base: Duration, jitter: f64) -> Duration {
|
||||
let frac = jitter.clamp(0.0, 1.0);
|
||||
let mult = 0.8 + 0.4 * frac; // [0.8, 1.2)
|
||||
Duration::from_secs((base.as_secs_f64() * mult).round() as u64)
|
||||
}
|
||||
|
||||
/// Jittered exponential backoff for `ack_failed`. Wraps [`backoff_base`]
|
||||
/// with a random ±20% spread.
|
||||
fn backoff_for(attempts: i32) -> Duration {
|
||||
apply_jitter(backoff_base(attempts), rand::random::<f64>())
|
||||
}
|
||||
|
||||
/// Insert a new pending job. For `SyncChapterContent` payloads the
|
||||
/// partial unique index `crawler_jobs_chapter_content_dedup_idx` blocks
|
||||
/// a second `(pending|running)` insert per chapter_id, returning
|
||||
@@ -159,6 +176,35 @@ pub async fn lease(
|
||||
Ok(leases)
|
||||
}
|
||||
|
||||
/// Extend the lease on a still-owned `running` job. Returns `true` if the
|
||||
/// row was updated (we still hold the lease), `false` if the job is no
|
||||
/// longer `running` (re-leased after a missed heartbeat, or already
|
||||
/// acked) — the caller's heartbeat loop should stop. The `state =
|
||||
/// 'running'` guard mirrors [`ack_done`]'s rationale.
|
||||
///
|
||||
/// This is the heartbeat primitive: a worker renews periodically while a
|
||||
/// long-but-healthy job runs so `leased_until` never lapses, which would
|
||||
/// otherwise let another worker steal the in-flight job and spuriously
|
||||
/// inflate `attempts` toward `max_attempts`.
|
||||
pub async fn renew(
|
||||
pool: &PgPool,
|
||||
lease_id: Uuid,
|
||||
lease_duration: Duration,
|
||||
) -> sqlx::Result<bool> {
|
||||
let lease_ms: i64 = lease_duration.as_millis().min(i64::MAX as u128) as i64;
|
||||
let res = sqlx::query(
|
||||
"UPDATE crawler_jobs \
|
||||
SET leased_until = now() + ($2::bigint || ' milliseconds')::interval, \
|
||||
updated_at = now() \
|
||||
WHERE id = $1 AND state = 'running'",
|
||||
)
|
||||
.bind(lease_id)
|
||||
.bind(lease_ms)
|
||||
.execute(pool)
|
||||
.await?;
|
||||
Ok(res.rows_affected() > 0)
|
||||
}
|
||||
|
||||
/// Mark a leased job as successfully completed. The `state = 'running'`
|
||||
/// predicate guards against a late ack from a worker whose lease expired
|
||||
/// and was already re-leased by another worker: without it, the late ack
|
||||
@@ -278,19 +324,48 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn backoff_grows_exponentially_and_caps_at_one_hour() {
|
||||
fn backoff_base_grows_exponentially_and_caps_at_one_hour() {
|
||||
// attempts == 1 → 60s, doubling each step.
|
||||
assert_eq!(backoff_for(1), Duration::from_secs(60));
|
||||
assert_eq!(backoff_for(2), Duration::from_secs(120));
|
||||
assert_eq!(backoff_for(3), Duration::from_secs(240));
|
||||
assert_eq!(backoff_for(4), Duration::from_secs(480));
|
||||
assert_eq!(backoff_for(5), Duration::from_secs(960));
|
||||
assert_eq!(backoff_for(6), Duration::from_secs(1920));
|
||||
assert_eq!(backoff_base(1), Duration::from_secs(60));
|
||||
assert_eq!(backoff_base(2), Duration::from_secs(120));
|
||||
assert_eq!(backoff_base(3), Duration::from_secs(240));
|
||||
assert_eq!(backoff_base(4), Duration::from_secs(480));
|
||||
assert_eq!(backoff_base(5), Duration::from_secs(960));
|
||||
assert_eq!(backoff_base(6), Duration::from_secs(1920));
|
||||
// 7th: 60 * 64 = 3840 → capped to 3600.
|
||||
assert_eq!(backoff_for(7), Duration::from_secs(3600));
|
||||
assert_eq!(backoff_for(20), Duration::from_secs(3600));
|
||||
assert_eq!(backoff_base(7), Duration::from_secs(3600));
|
||||
assert_eq!(backoff_base(20), Duration::from_secs(3600));
|
||||
// Garbage / zero / negatives stay sane.
|
||||
assert_eq!(backoff_for(0), Duration::from_secs(60));
|
||||
assert_eq!(backoff_for(-5), Duration::from_secs(60));
|
||||
assert_eq!(backoff_base(0), Duration::from_secs(60));
|
||||
assert_eq!(backoff_base(-5), Duration::from_secs(60));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn apply_jitter_stays_within_plus_minus_twenty_percent() {
|
||||
let base = Duration::from_secs(100);
|
||||
// Lower bound (jitter = 0.0) → 0.8x.
|
||||
assert_eq!(apply_jitter(base, 0.0), Duration::from_secs(80));
|
||||
// Midpoint (jitter = 0.5) → 1.0x.
|
||||
assert_eq!(apply_jitter(base, 0.5), Duration::from_secs(100));
|
||||
// Upper end (jitter → 1.0) → ~1.2x.
|
||||
assert_eq!(apply_jitter(base, 1.0), Duration::from_secs(120));
|
||||
// Out-of-range inputs are clamped, never panic.
|
||||
assert_eq!(apply_jitter(base, -3.0), Duration::from_secs(80));
|
||||
assert_eq!(apply_jitter(base, 9.0), Duration::from_secs(120));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backoff_for_random_jitter_stays_in_band() {
|
||||
// The production wrapper draws its own randomness; assert the
|
||||
// result for a mid-range attempt always lands within the jitter
|
||||
// band of the base, across many draws.
|
||||
let base = backoff_base(3).as_secs_f64(); // 240s
|
||||
for _ in 0..1000 {
|
||||
let v = backoff_for(3).as_secs_f64();
|
||||
assert!(
|
||||
v >= base * 0.8 - 1.0 && v <= base * 1.2 + 1.0,
|
||||
"jittered backoff {v} outside band of base {base}"
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -65,6 +65,17 @@ pub(crate) fn should_mark_clean_exit(
|
||||
walked_to_completion || hit_stop_condition
|
||||
}
|
||||
|
||||
/// Circuit-breaker: abort the walk once `consecutive` `fetch_manga`
|
||||
/// failures reach `threshold`. A `threshold` of 0 disables the breaker
|
||||
/// (unbounded — the legacy behaviour). When it fires the caller must NOT
|
||||
/// mark a clean exit, so the next tick does a recovery sweep over the
|
||||
/// catalog tail the aborted pass never reached.
|
||||
///
|
||||
/// Pure so the rule is unit-testable without the walker.
|
||||
pub(crate) fn should_abort_pass(consecutive: u32, threshold: u32) -> bool {
|
||||
threshold > 0 && consecutive >= threshold
|
||||
}
|
||||
|
||||
/// Runs the discover → fetch → upsert → cover → chapter-list-diff pipeline
|
||||
/// for the target source. Pure metadata; chapter content is enqueued as
|
||||
/// separate `SyncChapterContent` jobs by the caller after this returns.
|
||||
@@ -103,6 +114,7 @@ pub async fn run_metadata_pass(
|
||||
skip_chapters: bool,
|
||||
allowlist: &DownloadAllowlist,
|
||||
max_image_bytes: usize,
|
||||
max_consecutive_failures: u32,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
) -> anyhow::Result<MetadataStats> {
|
||||
let lease = browser_manager
|
||||
@@ -165,6 +177,11 @@ pub async fn run_metadata_pass(
|
||||
let mut walked_to_completion = false;
|
||||
let mut hit_limit = false;
|
||||
let mut hit_stop_condition = false;
|
||||
// Circuit-breaker state: consecutive fetch_manga failures. A sustained
|
||||
// run abort (source outage) leaves the pass un-clean → recovery sweep
|
||||
// next tick.
|
||||
let mut consecutive_failures = 0u32;
|
||||
let mut hit_failure_breaker = false;
|
||||
|
||||
'outer: loop {
|
||||
let batch = match walker.next_batch(&ctx).await? {
|
||||
@@ -204,7 +221,10 @@ pub async fn run_metadata_pass(
|
||||
"fetching metadata"
|
||||
);
|
||||
let manga = match source.fetch_manga(&ctx, &r).await {
|
||||
Ok(m) => m,
|
||||
Ok(m) => {
|
||||
consecutive_failures = 0;
|
||||
m
|
||||
}
|
||||
Err(e) => {
|
||||
tracing::warn!(
|
||||
key = %r.source_manga_key,
|
||||
@@ -213,6 +233,17 @@ pub async fn run_metadata_pass(
|
||||
"fetch_manga failed"
|
||||
);
|
||||
stats.mangas_failed += 1;
|
||||
consecutive_failures += 1;
|
||||
if should_abort_pass(consecutive_failures, max_consecutive_failures) {
|
||||
hit_failure_breaker = true;
|
||||
tracing::error!(
|
||||
consecutive_failures,
|
||||
threshold = max_consecutive_failures,
|
||||
"metadata pass: too many consecutive fetch_manga failures; \
|
||||
aborting (recovery sweep on next tick)"
|
||||
);
|
||||
break 'outer;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
};
|
||||
@@ -390,6 +421,7 @@ pub async fn run_metadata_pass(
|
||||
walked_to_completion,
|
||||
hit_limit,
|
||||
hit_stop_condition,
|
||||
hit_failure_breaker,
|
||||
exited_cleanly,
|
||||
"metadata pass complete"
|
||||
);
|
||||
@@ -756,6 +788,18 @@ mod tests {
|
||||
assert!(!should_stop(false, UpsertStatus::New, None));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn abort_pass_fires_at_threshold_and_respects_disable() {
|
||||
// Disabled (0) never fires, no matter how many failures.
|
||||
assert!(!should_abort_pass(0, 0));
|
||||
assert!(!should_abort_pass(100, 0));
|
||||
// Below threshold: keep going.
|
||||
assert!(!should_abort_pass(9, 10));
|
||||
// At/above threshold: abort.
|
||||
assert!(should_abort_pass(10, 10));
|
||||
assert!(should_abort_pass(11, 10));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn clean_exit_when_walked_to_completion() {
|
||||
// End-of-walk reached the catalog tail — the recovery flag may
|
||||
|
||||
@@ -40,6 +40,7 @@ fn make_cfg(
|
||||
tz: Tz::UTC,
|
||||
retention_days: 7,
|
||||
session_expired,
|
||||
job_timeout: Duration::from_secs(60),
|
||||
extra_tasks: Vec::new(),
|
||||
}
|
||||
}
|
||||
@@ -88,6 +89,52 @@ impl ChapterDispatcher for PanickingDispatcher {
|
||||
}
|
||||
}
|
||||
|
||||
/// Never completes — used to verify the worker's outer dispatch timeout.
|
||||
struct HangingDispatcher {
|
||||
seen: AtomicUsize,
|
||||
}
|
||||
#[async_trait::async_trait]
|
||||
impl ChapterDispatcher for HangingDispatcher {
|
||||
async fn dispatch(&self, _payload: JobPayload) -> anyhow::Result<SyncOutcome> {
|
||||
self.seen.fetch_add(1, Ordering::AcqRel);
|
||||
std::future::pending::<()>().await;
|
||||
unreachable!("hanging dispatcher never resolves");
|
||||
}
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn worker_times_out_a_hung_dispatch_and_acks_failed(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
let dispatcher = Arc::new(HangingDispatcher {
|
||||
seen: AtomicUsize::new(0),
|
||||
});
|
||||
let session_expired = Arc::new(std::sync::atomic::AtomicBool::new(false));
|
||||
let cancel = CancellationToken::new();
|
||||
let mut cfg = make_cfg(None, dispatcher.clone(), session_expired, 1);
|
||||
cfg.job_timeout = Duration::from_millis(300);
|
||||
let handle = daemon::spawn(pool.clone(), cancel.clone(), cfg);
|
||||
|
||||
// The hung job should time out and return to pending with backoff
|
||||
// (attempts=1 < max=5). Poll for the recorded error.
|
||||
let mut timed_out = false;
|
||||
for _ in 0..40 {
|
||||
let n: i64 = sqlx::query_scalar(
|
||||
"SELECT COUNT(*) FROM crawler_jobs WHERE last_error = 'dispatch timed out'",
|
||||
)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
if n == 1 {
|
||||
timed_out = true;
|
||||
break;
|
||||
}
|
||||
tokio::time::sleep(Duration::from_millis(50)).await;
|
||||
}
|
||||
handle.shutdown().await;
|
||||
assert!(timed_out, "hung dispatch must be acked failed with a timeout error");
|
||||
assert!(dispatcher.seen.load(Ordering::Acquire) >= 1);
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn workers_drain_jobs_through_dispatcher(pool: PgPool) {
|
||||
enqueue_chapter_job(&pool).await;
|
||||
|
||||
@@ -185,6 +185,68 @@ async fn lease_marks_running_and_bumps_attempts_and_sets_leased_until(pool: PgPo
|
||||
assert!(leased_until > chrono::Utc::now());
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn renew_extends_leased_until_while_running(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
EnqueueResult::Skipped => unreachable!(),
|
||||
};
|
||||
|
||||
// Lease with a short window, then collapse leased_until to the recent
|
||||
// past so the renew is unambiguously an extension.
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(5))
|
||||
.await
|
||||
.unwrap();
|
||||
assert_eq!(leases.len(), 1);
|
||||
sqlx::query("UPDATE crawler_jobs SET leased_until = now() - interval '1 second' WHERE id = $1")
|
||||
.bind(id)
|
||||
.execute(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let still_owned = jobs::renew(&pool, id, Duration::from_secs(120))
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(still_owned, "renew on a running job returns true");
|
||||
|
||||
let leased_until: chrono::DateTime<chrono::Utc> =
|
||||
sqlx::query_scalar("SELECT leased_until FROM crawler_jobs WHERE id = $1")
|
||||
.bind(id)
|
||||
.fetch_one(&pool)
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(
|
||||
leased_until > chrono::Utc::now() + chrono::Duration::seconds(60),
|
||||
"leased_until pushed ~120s into the future"
|
||||
);
|
||||
assert_eq!(job_state(&pool, id).await, "running");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn renew_is_noop_once_job_no_longer_running(pool: PgPool) {
|
||||
let id = match jobs::enqueue(&pool, &chapter_content_payload(Uuid::new_v4()))
|
||||
.await
|
||||
.unwrap()
|
||||
{
|
||||
EnqueueResult::Inserted(id) => id,
|
||||
EnqueueResult::Skipped => unreachable!(),
|
||||
};
|
||||
let leases = jobs::lease(&pool, None, 1, Duration::from_secs(60))
|
||||
.await
|
||||
.unwrap();
|
||||
// Job completes — heartbeat should now see it's no longer ours.
|
||||
jobs::ack_done(&pool, leases[0].id).await.unwrap();
|
||||
|
||||
let still_owned = jobs::renew(&pool, id, Duration::from_secs(120))
|
||||
.await
|
||||
.unwrap();
|
||||
assert!(!still_owned, "renew on a non-running job returns false");
|
||||
assert_eq!(job_state(&pool, id).await, "done");
|
||||
}
|
||||
|
||||
#[sqlx::test(migrations = "./migrations")]
|
||||
async fn lease_with_kind_filter_only_matches_that_kind(pool: PgPool) {
|
||||
let manga_id = match jobs::enqueue(&pool, &sync_manga_payload("foo"))
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "mangalord-frontend",
|
||||
"version": "0.52.0",
|
||||
"version": "0.53.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
Reference in New Issue
Block a user