fix(crawler): unify recircuit budget semantics — N = total attempts

The three retry-with-recircuit sites disagreed: detect.rs's
retry_on_transient_with_hook used "N = total attempts" (3 → 3
fetches), but session.rs's unauth branch and content.rs's chapter
loop used "N = recircuits" (3 → 4 fetches). At the same wall-clock
"max=3", different sites hit the upstream a different number of times.

Unify on N = total attempts (matching the existing
retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS
env var now means exactly what its name suggests. Disabling the
recircuit feature collapses to max_attempts=1 (single attempt, no
retry) — bit-for-bit pre-TOR behavior preserved.

Adds a debug_assert!(max >= 1) on both helpers and a new
content.rs test exercising the mixed Transient → Unauth → Ok
sequence to lock in the shared-counter invariant.

Audit ref: #5.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 20:25:25 +02:00
parent a0db7beb81
commit c30c7a546f
2 changed files with 143 additions and 94 deletions

View File

@@ -73,9 +73,10 @@ pub enum SyncOutcome {
SessionExpired, SessionExpired,
} }
/// Per-chapter recircuit budget for both transient pages and /// Per-chapter max fetch attempts when TOR is configured. `N = 3` means
/// `Unauthenticated` outcomes. When TOR is not configured the budget /// up to 3 total page fetches with 2 NEWNYM signals between them. When
/// is effectively 0 (no recircuit attempted; original behavior). /// TOR is not configured the effective budget collapses to 1 (single
/// attempt, no retry, no recircuit — bit-for-bit pre-TOR behavior).
const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3; const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3;
/// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the /// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the
@@ -125,15 +126,23 @@ async fn fetch_chapter_html_once(
Ok(html) Ok(html)
} }
/// Pure-over-IO loop: fetch + classify, with up to `recircuit_budget` /// Pure-over-IO loop: fetch + classify, up to `max_attempts` total
/// NEWNYM-and-retry cycles after a `Transient` or `Unauthenticated` /// fetches. Between attempts, `recircuit` is invoked (a no-op when
/// outcome. `recircuit_budget = 0` collapses to the original /// TOR isn't configured). `max_attempts = 1` collapses to the
/// single-shot behavior — `Unauthenticated` → `SessionExpired`, /// original single-shot behavior — `Unauthenticated` →
/// `Transient` → `PersistentTransient` on the first hit, no recircuit. /// `SessionExpired`, `Transient` → `PersistentTransient` on the first
/// hit, no recircuit.
///
/// Semantics match [`crate::crawler::detect::retry_on_transient`] and
/// [`run_session_probe_loop`]: `N` is **total attempts including the
/// first**, so `N = 3` means 3 fetches and up to 2 NEWNYM calls.
/// `Unauthenticated` and `Transient` share the budget — the loop
/// doesn't distinguish, so a sequence like Transient → Unauth → Ok
/// counts as 3 attempts.
async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>( async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>(
mut fetch: F, mut fetch: F,
mut recircuit: R, mut recircuit: R,
recircuit_budget: u32, max_attempts: u32,
source_url_for_msg: &str, source_url_for_msg: &str,
) -> anyhow::Result<ChapterFetchOutcome> ) -> anyhow::Result<ChapterFetchOutcome>
where where
@@ -142,38 +151,36 @@ where
R: FnMut() -> RFut, R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>, RFut: std::future::Future<Output = ()>,
{ {
let mut recircuits = 0u32; debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
let mut attempt = 0u32;
loop { loop {
attempt += 1;
let html = fetch().await?; let html = fetch().await?;
match session::classify_chapter_probe(&html) { match session::classify_chapter_probe(&html) {
ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)), ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)),
ChapterProbe::Unauthenticated => { ChapterProbe::Unauthenticated => {
if recircuits < recircuit_budget { if attempt >= max_attempts {
recircuits += 1; return Ok(ChapterFetchOutcome::SessionExpired);
}
tracing::warn!( tracing::warn!(
attempt = recircuits, attempt,
max = recircuit_budget, max = max_attempts,
url = source_url_for_msg, url = source_url_for_msg,
"chapter probe Unauthenticated; signaling TOR NEWNYM and retrying" "chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
); );
recircuit().await; recircuit().await;
continue;
}
return Ok(ChapterFetchOutcome::SessionExpired);
} }
ChapterProbe::Transient => { ChapterProbe::Transient => {
if recircuits < recircuit_budget { if attempt >= max_attempts {
recircuits += 1; return Ok(ChapterFetchOutcome::PersistentTransient);
}
tracing::warn!( tracing::warn!(
attempt = recircuits, attempt,
max = recircuit_budget, max = max_attempts,
url = source_url_for_msg, url = source_url_for_msg,
"chapter probe Transient; signaling TOR NEWNYM and retrying" "chapter probe Transient; signaling TOR NEWNYM and retrying"
); );
recircuit().await; recircuit().await;
continue;
}
return Ok(ChapterFetchOutcome::PersistentTransient);
} }
} }
} }
@@ -212,10 +219,11 @@ pub async fn sync_chapter_content(
} }
} }
// Fetch + classify with a recircuit budget when TOR is configured. // Fetch + classify. With TOR configured, allow up to
// Without TOR the closure-recircuit is a no-op and the loop reduces // CHAPTER_RECIRCUIT_MAX_ATTEMPTS total page fetches with NEWNYM
// to the original single-attempt behavior. // between each. Without TOR, collapse to 1 attempt (no retry, no
let recircuit_budget = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 0 }; // recircuit) — matches the pre-TOR single-shot behavior bit-for-bit.
let max_attempts = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 1 };
let html = match fetch_chapter_html_with_recircuit( let html = match fetch_chapter_html_with_recircuit(
|| fetch_chapter_html_once(browser, rate, source_url), || fetch_chapter_html_once(browser, rate, source_url),
|| async { || async {
@@ -225,7 +233,7 @@ pub async fn sync_chapter_content(
} }
} }
}, },
recircuit_budget, max_attempts,
source_url, source_url,
) )
.await? .await?
@@ -238,7 +246,7 @@ pub async fn sync_chapter_content(
// session-expired sticky flag). // session-expired sticky flag).
anyhow::bail!( anyhow::bail!(
"chapter page at {source_url} returned a transient response after \ "chapter page at {source_url} returned a transient response after \
{recircuit_budget} TOR recircuit(s); will retry" {max_attempts} attempt(s); will retry"
); );
} }
}; };
@@ -431,7 +439,8 @@ mod tests {
} }
#[tokio::test] #[tokio::test]
async fn recircuit_loop_unauth_with_zero_budget_returns_session_expired() { async fn recircuit_loop_unauth_with_single_attempt_returns_session_expired() {
// max_attempts=1 = TOR disabled, fail-fast on first Unauthenticated.
let mut recircuits = 0u32; let mut recircuits = 0u32;
let mut fetches = 0u32; let mut fetches = 0u32;
let outcome = fetch_chapter_html_with_recircuit( let outcome = fetch_chapter_html_with_recircuit(
@@ -443,18 +452,19 @@ mod tests {
recircuits += 1; recircuits += 1;
async {} async {}
}, },
0, 1,
"https://example/c", "https://example/c",
) )
.await .await
.expect("ok-result"); .expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired)); assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
assert_eq!(fetches, 1); assert_eq!(fetches, 1);
assert_eq!(recircuits, 0, "no recircuit when budget is 0 (TOR disabled)"); assert_eq!(recircuits, 0, "no recircuit when budget is 1 (TOR disabled)");
} }
#[tokio::test] #[tokio::test]
async fn recircuit_loop_unauth_then_ok_within_budget() { async fn recircuit_loop_unauth_then_ok_within_budget() {
// max_attempts=3 = up to 3 fetches with 2 recircuits between.
let mut recircuits = 0u32; let mut recircuits = 0u32;
let mut fetch_n = 0u32; let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit( let outcome = fetch_chapter_html_with_recircuit(
@@ -496,15 +506,14 @@ mod tests {
recircuits += 1; recircuits += 1;
async {} async {}
}, },
2, 3,
"https://example/c", "https://example/c",
) )
.await .await
.expect("ok-result"); .expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired)); assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
// budget=2 → initial + 2 recircuit-and-retry = 3 fetches. assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
assert_eq!(fetch_n, 3); assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
assert_eq!(recircuits, 2);
} }
#[tokio::test] #[tokio::test]
@@ -556,8 +565,40 @@ mod tests {
.await .await
.expect("ok-result"); .expect("ok-result");
assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient)); assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient));
assert_eq!(fetch_n, 4, "budget=3 → 1 initial + 3 retries"); assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
assert_eq!(recircuits, 3); assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
}
#[tokio::test]
async fn recircuit_loop_mixed_transient_then_unauth_then_ok_shares_budget() {
// Audit-prompted regression: outcomes share the attempt counter.
// Sequence: Transient (attempt 1) → Unauth (attempt 2) → Ok (3).
let mut recircuits = 0u32;
let mut fetch_n = 0u32;
let outcome = fetch_chapter_html_with_recircuit(
|| {
fetch_n += 1;
let n = fetch_n;
async move {
match n {
1 => Ok(TRANSIENT_HTML.to_string()),
2 => Ok(UNAUTH_HTML.to_string()),
_ => Ok(OK_HTML.to_string()),
}
}
},
|| {
recircuits += 1;
async {}
},
3,
"https://example/c",
)
.await
.expect("ok");
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
assert_eq!(fetch_n, 3);
assert_eq!(recircuits, 2);
} }
#[tokio::test] #[tokio::test]

View File

@@ -167,19 +167,19 @@ pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Resul
/// Like [`verify_session`] but, when `tor` is `Some`, signals /// Like [`verify_session`] but, when `tor` is `Some`, signals
/// `SIGNAL NEWNYM` between retries on transient pages AND treats /// `SIGNAL NEWNYM` between retries on transient pages AND treats
/// `Unauthenticated` as a recoverable failure (up to /// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare /// probes, calling NEWNYM between each).
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
/// ///
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth` /// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
/// stays a hard fail, matching the original behavior. /// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
/// fail-fast, exactly the pre-TOR behavior.
pub async fn verify_session_with_recircuit( pub async fn verify_session_with_recircuit(
browser: &Browser, browser: &Browser,
probe_url: &str, probe_url: &str,
tor: Option<&crate::crawler::tor::TorController>, tor: Option<&crate::crawler::tor::TorController>,
unauth_max_recircuit: u32, tor_max_attempts: u32,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 }; let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
run_session_probe_loop( run_session_probe_loop(
|| fetch_probe_html(browser, probe_url), || fetch_probe_html(browser, probe_url),
|| async { || async {
@@ -190,7 +190,7 @@ pub async fn verify_session_with_recircuit(
} }
}, },
PROBE_MAX_ATTEMPTS, PROBE_MAX_ATTEMPTS,
effective_unauth_budget, unauth_max_attempts,
PROBE_RETRY_DELAY, PROBE_RETRY_DELAY,
probe_url, probe_url,
) )
@@ -201,20 +201,25 @@ pub async fn verify_session_with_recircuit(
/// fetch and recircuit closures so it can be unit-tested without a /// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon. /// real browser or TOR daemon.
/// ///
/// Semantics: /// Both budgets count **total attempts**, including the first — so
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
/// and the content-path recircuit loop.
///
/// Outcomes:
/// - `SessionProbe::Ok` → return `Ok(())`. /// - `SessionProbe::Ok` → return `Ok(())`.
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0` /// - `SessionProbe::Unauthenticated` → recircuit + retry while
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise /// under the unauth budget. After the cap, bail with the
/// bail with the "PHPSESSID expired" diagnostic, mentioning the /// "PHPSESSID expired" diagnostic, mentioning the attempt count so
/// recircuit count so a TOR-misconfig diagnosis is easier. /// a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total /// - `SessionProbe::Transient` → same shape against the transient
/// tries, calling `recircuit` between each. After the cap, bail with /// budget; bails with "site down or rate-limiting" after the cap.
/// the "site down or rate-limiting" diagnostic.
async fn run_session_probe_loop<F, Fut, R, RFut>( async fn run_session_probe_loop<F, Fut, R, RFut>(
mut fetch_html: F, mut fetch_html: F,
mut recircuit: R, mut recircuit: R,
transient_max_attempts: u32, transient_max_attempts: u32,
unauth_max_recircuit: u32, unauth_max_attempts: u32,
retry_delay: Duration, retry_delay: Duration,
probe_url_for_msg: &str, probe_url_for_msg: &str,
) -> anyhow::Result<()> ) -> anyhow::Result<()>
@@ -224,37 +229,38 @@ where
R: FnMut() -> RFut, R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>, RFut: std::future::Future<Output = ()>,
{ {
debug_assert!(transient_max_attempts >= 1);
debug_assert!(unauth_max_attempts >= 1);
let mut transient_attempts = 0u32; let mut transient_attempts = 0u32;
let mut unauth_recircuits = 0u32; let mut unauth_attempts = 0u32;
loop { loop {
let html = fetch_html().await?; let html = fetch_html().await?;
match classify_probe(&html) { match classify_probe(&html) {
SessionProbe::Ok => { SessionProbe::Ok => {
tracing::info!( tracing::info!(
transient_attempts, transient_attempts,
unauth_recircuits, unauth_attempts,
"session probe ok — #logo + #avatar_menu present" "session probe ok — #logo + #avatar_menu present"
); );
return Ok(()); return Ok(());
} }
SessionProbe::Unauthenticated => { SessionProbe::Unauthenticated => {
if unauth_recircuits < unauth_max_recircuit { unauth_attempts += 1;
unauth_recircuits += 1; if unauth_attempts >= unauth_max_attempts {
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
}
tracing::warn!( tracing::warn!(
attempt = unauth_recircuits, attempt = unauth_attempts,
max = unauth_max_recircuit, max_attempts = unauth_max_attempts,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \ "session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying" NEWNYM and retrying"
); );
recircuit().await; recircuit().await;
tokio::time::sleep(retry_delay).await; tokio::time::sleep(retry_delay).await;
continue;
}
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
} }
SessionProbe::Transient => { SessionProbe::Transient => {
transient_attempts += 1; transient_attempts += 1;
@@ -451,7 +457,8 @@ mod tests {
} }
#[tokio::test] #[tokio::test]
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() { async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
// Budget = 3 total attempts. Unauth on call 1, ok on call 2.
let mut recircuits = 0u32; let mut recircuits = 0u32;
let mut call = 0u32; let mut call = 0u32;
run_session_probe_loop( run_session_probe_loop(
@@ -482,7 +489,8 @@ mod tests {
} }
#[tokio::test] #[tokio::test]
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() { async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
// Budget = 1 total attempt = no retry (matches no-TOR behavior).
let mut recircuits = 0u32; let mut recircuits = 0u32;
let mut call = 0u32; let mut call = 0u32;
let err = run_session_probe_loop( let err = run_session_probe_loop(
@@ -495,20 +503,21 @@ mod tests {
async {} async {}
}, },
3, 3,
0, 1,
Duration::from_millis(0), Duration::from_millis(0),
"https://example/probe", "https://example/probe",
) )
.await .await
.expect_err("zero budget → fail"); .expect_err("budget=1 → fail-fast");
assert_eq!(call, 1, "no retry when budget is 0"); assert_eq!(call, 1, "no retry when budget is 1");
assert_eq!(recircuits, 0); assert_eq!(recircuits, 0);
let msg = format!("{err:#}"); let msg = format!("{err:#}");
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}"); assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
} }
#[tokio::test] #[tokio::test]
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() { async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
let mut recircuits = 0u32; let mut recircuits = 0u32;
let mut call = 0u32; let mut call = 0u32;
let err = run_session_probe_loop( let err = run_session_probe_loop(
@@ -521,17 +530,16 @@ mod tests {
async {} async {}
}, },
10, // transient budget irrelevant here 10, // transient budget irrelevant here
2, 3, // 3 attempts total, 2 recircuits between
Duration::from_millis(0), Duration::from_millis(0),
"https://example/probe", "https://example/probe",
) )
.await .await
.expect_err("exhausts unauth budget"); .expect_err("exhausts unauth budget");
// 3 fetches total: initial + 2 recircuit-and-retry
assert_eq!(call, 3); assert_eq!(call, 3);
assert_eq!(recircuits, 2); assert_eq!(recircuits, 2);
let msg = format!("{err:#}"); let msg = format!("{err:#}");
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}"); assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
} }
#[tokio::test] #[tokio::test]
@@ -548,14 +556,14 @@ mod tests {
async {} async {}
}, },
3, 3,
0, 1,
Duration::from_millis(0), Duration::from_millis(0),
"https://example/probe", "https://example/probe",
) )
.await .await
.expect_err("transient until max → fail"); .expect_err("transient until max → fail");
assert_eq!(call, 3); assert_eq!(call, 3);
// recircuit fires between attempts: 3 attempts → 2 recircuits. // Recircuit fires between attempts: 3 attempts → 2 recircuits.
assert_eq!(recircuits, 2); assert_eq!(recircuits, 2);
let msg = format!("{err:#}"); let msg = format!("{err:#}");
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}"); assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
@@ -582,7 +590,7 @@ mod tests {
async {} async {}
}, },
3, 3,
0, 1,
Duration::from_millis(0), Duration::from_millis(0),
"https://example/probe", "https://example/probe",
) )