From c30c7a546ff0cb8dcbb484a9ae5c40e632e60427 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Sun, 31 May 2026 20:25:25 +0200 Subject: [PATCH] =?UTF-8?q?fix(crawler):=20unify=20recircuit=20budget=20se?= =?UTF-8?q?mantics=20=E2=80=94=20N=20=3D=20total=20attempts?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The three retry-with-recircuit sites disagreed: detect.rs's retry_on_transient_with_hook used "N = total attempts" (3 → 3 fetches), but session.rs's unauth branch and content.rs's chapter loop used "N = recircuits" (3 → 4 fetches). At the same wall-clock "max=3", different sites hit the upstream a different number of times. Unify on N = total attempts (matching the existing retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS env var now means exactly what its name suggests. Disabling the recircuit feature collapses to max_attempts=1 (single attempt, no retry) — bit-for-bit pre-TOR behavior preserved. Adds a debug_assert!(max >= 1) on both helpers and a new content.rs test exercising the mixed Transient → Unauth → Ok sequence to lock in the shared-counter invariant. Audit ref: #5. Co-Authored-By: Claude Opus 4.7 (1M context) --- backend/src/crawler/content.rs | 135 +++++++++++++++++++++------------ backend/src/crawler/session.rs | 102 +++++++++++++------------ 2 files changed, 143 insertions(+), 94 deletions(-) diff --git a/backend/src/crawler/content.rs b/backend/src/crawler/content.rs index 4406ce9..1549786 100644 --- a/backend/src/crawler/content.rs +++ b/backend/src/crawler/content.rs @@ -73,9 +73,10 @@ pub enum SyncOutcome { SessionExpired, } -/// Per-chapter recircuit budget for both transient pages and -/// `Unauthenticated` outcomes. When TOR is not configured the budget -/// is effectively 0 (no recircuit attempted; original behavior). +/// Per-chapter max fetch attempts when TOR is configured. `N = 3` means +/// up to 3 total page fetches with 2 NEWNYM signals between them. When +/// TOR is not configured the effective budget collapses to 1 (single +/// attempt, no retry, no recircuit — bit-for-bit pre-TOR behavior). const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3; /// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the @@ -125,15 +126,23 @@ async fn fetch_chapter_html_once( Ok(html) } -/// Pure-over-IO loop: fetch + classify, with up to `recircuit_budget` -/// NEWNYM-and-retry cycles after a `Transient` or `Unauthenticated` -/// outcome. `recircuit_budget = 0` collapses to the original -/// single-shot behavior — `Unauthenticated` → `SessionExpired`, -/// `Transient` → `PersistentTransient` on the first hit, no recircuit. +/// Pure-over-IO loop: fetch + classify, up to `max_attempts` total +/// fetches. Between attempts, `recircuit` is invoked (a no-op when +/// TOR isn't configured). `max_attempts = 1` collapses to the +/// original single-shot behavior — `Unauthenticated` → +/// `SessionExpired`, `Transient` → `PersistentTransient` on the first +/// hit, no recircuit. +/// +/// Semantics match [`crate::crawler::detect::retry_on_transient`] and +/// [`run_session_probe_loop`]: `N` is **total attempts including the +/// first**, so `N = 3` means 3 fetches and up to 2 NEWNYM calls. +/// `Unauthenticated` and `Transient` share the budget — the loop +/// doesn't distinguish, so a sequence like Transient → Unauth → Ok +/// counts as 3 attempts. async fn fetch_chapter_html_with_recircuit( mut fetch: F, mut recircuit: R, - recircuit_budget: u32, + max_attempts: u32, source_url_for_msg: &str, ) -> anyhow::Result where @@ -142,38 +151,36 @@ where R: FnMut() -> RFut, RFut: std::future::Future, { - let mut recircuits = 0u32; + debug_assert!(max_attempts >= 1, "max_attempts must be at least 1"); + let mut attempt = 0u32; loop { + attempt += 1; let html = fetch().await?; match session::classify_chapter_probe(&html) { ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)), ChapterProbe::Unauthenticated => { - if recircuits < recircuit_budget { - recircuits += 1; - tracing::warn!( - attempt = recircuits, - max = recircuit_budget, - url = source_url_for_msg, - "chapter probe Unauthenticated; signaling TOR NEWNYM and retrying" - ); - recircuit().await; - continue; + if attempt >= max_attempts { + return Ok(ChapterFetchOutcome::SessionExpired); } - return Ok(ChapterFetchOutcome::SessionExpired); + tracing::warn!( + attempt, + max = max_attempts, + url = source_url_for_msg, + "chapter probe Unauthenticated; signaling TOR NEWNYM and retrying" + ); + recircuit().await; } ChapterProbe::Transient => { - if recircuits < recircuit_budget { - recircuits += 1; - tracing::warn!( - attempt = recircuits, - max = recircuit_budget, - url = source_url_for_msg, - "chapter probe Transient; signaling TOR NEWNYM and retrying" - ); - recircuit().await; - continue; + if attempt >= max_attempts { + return Ok(ChapterFetchOutcome::PersistentTransient); } - return Ok(ChapterFetchOutcome::PersistentTransient); + tracing::warn!( + attempt, + max = max_attempts, + url = source_url_for_msg, + "chapter probe Transient; signaling TOR NEWNYM and retrying" + ); + recircuit().await; } } } @@ -212,10 +219,11 @@ pub async fn sync_chapter_content( } } - // Fetch + classify with a recircuit budget when TOR is configured. - // Without TOR the closure-recircuit is a no-op and the loop reduces - // to the original single-attempt behavior. - let recircuit_budget = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 0 }; + // Fetch + classify. With TOR configured, allow up to + // CHAPTER_RECIRCUIT_MAX_ATTEMPTS total page fetches with NEWNYM + // between each. Without TOR, collapse to 1 attempt (no retry, no + // recircuit) — matches the pre-TOR single-shot behavior bit-for-bit. + let max_attempts = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 1 }; let html = match fetch_chapter_html_with_recircuit( || fetch_chapter_html_once(browser, rate, source_url), || async { @@ -225,7 +233,7 @@ pub async fn sync_chapter_content( } } }, - recircuit_budget, + max_attempts, source_url, ) .await? @@ -238,7 +246,7 @@ pub async fn sync_chapter_content( // session-expired sticky flag). anyhow::bail!( "chapter page at {source_url} returned a transient response after \ - {recircuit_budget} TOR recircuit(s); will retry" + {max_attempts} attempt(s); will retry" ); } }; @@ -431,7 +439,8 @@ mod tests { } #[tokio::test] - async fn recircuit_loop_unauth_with_zero_budget_returns_session_expired() { + async fn recircuit_loop_unauth_with_single_attempt_returns_session_expired() { + // max_attempts=1 = TOR disabled, fail-fast on first Unauthenticated. let mut recircuits = 0u32; let mut fetches = 0u32; let outcome = fetch_chapter_html_with_recircuit( @@ -443,18 +452,19 @@ mod tests { recircuits += 1; async {} }, - 0, + 1, "https://example/c", ) .await .expect("ok-result"); assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired)); assert_eq!(fetches, 1); - assert_eq!(recircuits, 0, "no recircuit when budget is 0 (TOR disabled)"); + assert_eq!(recircuits, 0, "no recircuit when budget is 1 (TOR disabled)"); } #[tokio::test] async fn recircuit_loop_unauth_then_ok_within_budget() { + // max_attempts=3 = up to 3 fetches with 2 recircuits between. let mut recircuits = 0u32; let mut fetch_n = 0u32; let outcome = fetch_chapter_html_with_recircuit( @@ -496,15 +506,14 @@ mod tests { recircuits += 1; async {} }, - 2, + 3, "https://example/c", ) .await .expect("ok-result"); assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired)); - // budget=2 → initial + 2 recircuit-and-retry = 3 fetches. - assert_eq!(fetch_n, 3); - assert_eq!(recircuits, 2); + assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total"); + assert_eq!(recircuits, 2, "2 recircuits between 3 fetches"); } #[tokio::test] @@ -556,8 +565,40 @@ mod tests { .await .expect("ok-result"); assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient)); - assert_eq!(fetch_n, 4, "budget=3 → 1 initial + 3 retries"); - assert_eq!(recircuits, 3); + assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total"); + assert_eq!(recircuits, 2, "2 recircuits between 3 fetches"); + } + + #[tokio::test] + async fn recircuit_loop_mixed_transient_then_unauth_then_ok_shares_budget() { + // Audit-prompted regression: outcomes share the attempt counter. + // Sequence: Transient (attempt 1) → Unauth (attempt 2) → Ok (3). + let mut recircuits = 0u32; + let mut fetch_n = 0u32; + let outcome = fetch_chapter_html_with_recircuit( + || { + fetch_n += 1; + let n = fetch_n; + async move { + match n { + 1 => Ok(TRANSIENT_HTML.to_string()), + 2 => Ok(UNAUTH_HTML.to_string()), + _ => Ok(OK_HTML.to_string()), + } + } + }, + || { + recircuits += 1; + async {} + }, + 3, + "https://example/c", + ) + .await + .expect("ok"); + assert!(matches!(outcome, ChapterFetchOutcome::Ok(_))); + assert_eq!(fetch_n, 3); + assert_eq!(recircuits, 2); } #[tokio::test] diff --git a/backend/src/crawler/session.rs b/backend/src/crawler/session.rs index 39d3835..e495d1d 100644 --- a/backend/src/crawler/session.rs +++ b/backend/src/crawler/session.rs @@ -167,19 +167,19 @@ pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Resul /// Like [`verify_session`] but, when `tor` is `Some`, signals /// `SIGNAL NEWNYM` between retries on transient pages AND treats -/// `Unauthenticated` as a recoverable failure (up to -/// `unauth_max_recircuit` recircuit cycles before giving up). The bare -/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`. +/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total +/// probes, calling NEWNYM between each). /// -/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth` -/// stays a hard fail, matching the original behavior. +/// `verify_session` is `verify_session_with_recircuit(..., None, _)`, +/// which collapses the `Unauthenticated` budget to 1 attempt — i.e. +/// fail-fast, exactly the pre-TOR behavior. pub async fn verify_session_with_recircuit( browser: &Browser, probe_url: &str, tor: Option<&crate::crawler::tor::TorController>, - unauth_max_recircuit: u32, + tor_max_attempts: u32, ) -> anyhow::Result<()> { - let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 }; + let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 }; run_session_probe_loop( || fetch_probe_html(browser, probe_url), || async { @@ -190,7 +190,7 @@ pub async fn verify_session_with_recircuit( } }, PROBE_MAX_ATTEMPTS, - effective_unauth_budget, + unauth_max_attempts, PROBE_RETRY_DELAY, probe_url, ) @@ -201,20 +201,25 @@ pub async fn verify_session_with_recircuit( /// fetch and recircuit closures so it can be unit-tested without a /// real browser or TOR daemon. /// -/// Semantics: +/// Both budgets count **total attempts**, including the first — so +/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits +/// between them, and `unauth_max_attempts = 1` means "fail-fast, no +/// retry". This matches [`crate::crawler::detect::retry_on_transient`] +/// and the content-path recircuit loop. +/// +/// Outcomes: /// - `SessionProbe::Ok` → return `Ok(())`. -/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0` -/// and budget remaining, call `recircuit` + sleep + retry. Otherwise -/// bail with the "PHPSESSID expired" diagnostic, mentioning the -/// recircuit count so a TOR-misconfig diagnosis is easier. -/// - `SessionProbe::Transient` → up to `transient_max_attempts` total -/// tries, calling `recircuit` between each. After the cap, bail with -/// the "site down or rate-limiting" diagnostic. +/// - `SessionProbe::Unauthenticated` → recircuit + retry while +/// under the unauth budget. After the cap, bail with the +/// "PHPSESSID expired" diagnostic, mentioning the attempt count so +/// a TOR-misconfig diagnosis is easier. +/// - `SessionProbe::Transient` → same shape against the transient +/// budget; bails with "site down or rate-limiting" after the cap. async fn run_session_probe_loop( mut fetch_html: F, mut recircuit: R, transient_max_attempts: u32, - unauth_max_recircuit: u32, + unauth_max_attempts: u32, retry_delay: Duration, probe_url_for_msg: &str, ) -> anyhow::Result<()> @@ -224,37 +229,38 @@ where R: FnMut() -> RFut, RFut: std::future::Future, { + debug_assert!(transient_max_attempts >= 1); + debug_assert!(unauth_max_attempts >= 1); let mut transient_attempts = 0u32; - let mut unauth_recircuits = 0u32; + let mut unauth_attempts = 0u32; loop { let html = fetch_html().await?; match classify_probe(&html) { SessionProbe::Ok => { tracing::info!( transient_attempts, - unauth_recircuits, + unauth_attempts, "session probe ok — #logo + #avatar_menu present" ); return Ok(()); } SessionProbe::Unauthenticated => { - if unauth_recircuits < unauth_max_recircuit { - unauth_recircuits += 1; - tracing::warn!( - attempt = unauth_recircuits, - max = unauth_max_recircuit, - "session probe Unauthenticated despite PHPSESSID; signaling TOR \ - NEWNYM and retrying" - ); - recircuit().await; - tokio::time::sleep(retry_delay).await; - continue; + unauth_attempts += 1; + if unauth_attempts >= unauth_max_attempts { + return Err(anyhow!( + "session probe failed — #avatar_menu not present at {probe_url_for_msg} \ + after {unauth_attempts} attempt(s); PHPSESSID is missing, \ + expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run." + )); } - return Err(anyhow!( - "session probe failed — #avatar_menu not present at {probe_url_for_msg} \ - after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \ - expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run." - )); + tracing::warn!( + attempt = unauth_attempts, + max_attempts = unauth_max_attempts, + "session probe Unauthenticated despite PHPSESSID; signaling TOR \ + NEWNYM and retrying" + ); + recircuit().await; + tokio::time::sleep(retry_delay).await; } SessionProbe::Transient => { transient_attempts += 1; @@ -451,7 +457,8 @@ mod tests { } #[tokio::test] - async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() { + async fn probe_loop_unauth_then_ok_when_attempt_budget_available() { + // Budget = 3 total attempts. Unauth on call 1, ok on call 2. let mut recircuits = 0u32; let mut call = 0u32; run_session_probe_loop( @@ -482,7 +489,8 @@ mod tests { } #[tokio::test] - async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() { + async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() { + // Budget = 1 total attempt = no retry (matches no-TOR behavior). let mut recircuits = 0u32; let mut call = 0u32; let err = run_session_probe_loop( @@ -495,20 +503,21 @@ mod tests { async {} }, 3, - 0, + 1, Duration::from_millis(0), "https://example/probe", ) .await - .expect_err("zero budget → fail"); - assert_eq!(call, 1, "no retry when budget is 0"); + .expect_err("budget=1 → fail-fast"); + assert_eq!(call, 1, "no retry when budget is 1"); assert_eq!(recircuits, 0); let msg = format!("{err:#}"); assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}"); + assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}"); } #[tokio::test] - async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() { + async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() { let mut recircuits = 0u32; let mut call = 0u32; let err = run_session_probe_loop( @@ -521,17 +530,16 @@ mod tests { async {} }, 10, // transient budget irrelevant here - 2, + 3, // 3 attempts total, 2 recircuits between Duration::from_millis(0), "https://example/probe", ) .await .expect_err("exhausts unauth budget"); - // 3 fetches total: initial + 2 recircuit-and-retry assert_eq!(call, 3); assert_eq!(recircuits, 2); let msg = format!("{err:#}"); - assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}"); + assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}"); } #[tokio::test] @@ -548,14 +556,14 @@ mod tests { async {} }, 3, - 0, + 1, Duration::from_millis(0), "https://example/probe", ) .await .expect_err("transient until max → fail"); assert_eq!(call, 3); - // recircuit fires between attempts: 3 attempts → 2 recircuits. + // Recircuit fires between attempts: 3 attempts → 2 recircuits. assert_eq!(recircuits, 2); let msg = format!("{err:#}"); assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}"); @@ -582,7 +590,7 @@ mod tests { async {} }, 3, - 0, + 1, Duration::from_millis(0), "https://example/probe", )