fix(crawler): unify recircuit budget semantics — N = total attempts

The three retry-with-recircuit sites disagreed: detect.rs's
retry_on_transient_with_hook used "N = total attempts" (3 → 3
fetches), but session.rs's unauth branch and content.rs's chapter
loop used "N = recircuits" (3 → 4 fetches). At the same wall-clock
"max=3", different sites hit the upstream a different number of times.

Unify on N = total attempts (matching the existing
retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS
env var now means exactly what its name suggests. Disabling the
recircuit feature collapses to max_attempts=1 (single attempt, no
retry) — bit-for-bit pre-TOR behavior preserved.

Adds a debug_assert!(max >= 1) on both helpers and a new
content.rs test exercising the mixed Transient → Unauth → Ok
sequence to lock in the shared-counter invariant.

Audit ref: #5.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 20:25:25 +02:00
parent a0db7beb81
commit c30c7a546f
2 changed files with 143 additions and 94 deletions

View File

@@ -167,19 +167,19 @@ pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Resul
/// Like [`verify_session`] but, when `tor` is `Some`, signals
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
/// `Unauthenticated` as a recoverable failure (up to
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
/// probes, calling NEWNYM between each).
///
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
/// stays a hard fail, matching the original behavior.
/// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
/// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
/// fail-fast, exactly the pre-TOR behavior.
pub async fn verify_session_with_recircuit(
browser: &Browser,
probe_url: &str,
tor: Option<&crate::crawler::tor::TorController>,
unauth_max_recircuit: u32,
tor_max_attempts: u32,
) -> anyhow::Result<()> {
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
run_session_probe_loop(
|| fetch_probe_html(browser, probe_url),
|| async {
@@ -190,7 +190,7 @@ pub async fn verify_session_with_recircuit(
}
},
PROBE_MAX_ATTEMPTS,
effective_unauth_budget,
unauth_max_attempts,
PROBE_RETRY_DELAY,
probe_url,
)
@@ -201,20 +201,25 @@ pub async fn verify_session_with_recircuit(
/// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon.
///
/// Semantics:
/// Both budgets count **total attempts**, including the first — so
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
/// and the content-path recircuit loop.
///
/// Outcomes:
/// - `SessionProbe::Ok` → return `Ok(())`.
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
/// recircuit count so a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
/// tries, calling `recircuit` between each. After the cap, bail with
/// the "site down or rate-limiting" diagnostic.
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
/// under the unauth budget. After the cap, bail with the
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
/// a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → same shape against the transient
/// budget; bails with "site down or rate-limiting" after the cap.
async fn run_session_probe_loop<F, Fut, R, RFut>(
mut fetch_html: F,
mut recircuit: R,
transient_max_attempts: u32,
unauth_max_recircuit: u32,
unauth_max_attempts: u32,
retry_delay: Duration,
probe_url_for_msg: &str,
) -> anyhow::Result<()>
@@ -224,37 +229,38 @@ where
R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>,
{
debug_assert!(transient_max_attempts >= 1);
debug_assert!(unauth_max_attempts >= 1);
let mut transient_attempts = 0u32;
let mut unauth_recircuits = 0u32;
let mut unauth_attempts = 0u32;
loop {
let html = fetch_html().await?;
match classify_probe(&html) {
SessionProbe::Ok => {
tracing::info!(
transient_attempts,
unauth_recircuits,
unauth_attempts,
"session probe ok — #logo + #avatar_menu present"
);
return Ok(());
}
SessionProbe::Unauthenticated => {
if unauth_recircuits < unauth_max_recircuit {
unauth_recircuits += 1;
tracing::warn!(
attempt = unauth_recircuits,
max = unauth_max_recircuit,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
continue;
unauth_attempts += 1;
if unauth_attempts >= unauth_max_attempts {
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
}
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
tracing::warn!(
attempt = unauth_attempts,
max_attempts = unauth_max_attempts,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
}
SessionProbe::Transient => {
transient_attempts += 1;
@@ -451,7 +457,8 @@ mod tests {
}
#[tokio::test]
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
// Budget = 3 total attempts. Unauth on call 1, ok on call 2.
let mut recircuits = 0u32;
let mut call = 0u32;
run_session_probe_loop(
@@ -482,7 +489,8 @@ mod tests {
}
#[tokio::test]
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
// Budget = 1 total attempt = no retry (matches no-TOR behavior).
let mut recircuits = 0u32;
let mut call = 0u32;
let err = run_session_probe_loop(
@@ -495,20 +503,21 @@ mod tests {
async {}
},
3,
0,
1,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("zero budget → fail");
assert_eq!(call, 1, "no retry when budget is 0");
.expect_err("budget=1 → fail-fast");
assert_eq!(call, 1, "no retry when budget is 1");
assert_eq!(recircuits, 0);
let msg = format!("{err:#}");
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
}
#[tokio::test]
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
let mut recircuits = 0u32;
let mut call = 0u32;
let err = run_session_probe_loop(
@@ -521,17 +530,16 @@ mod tests {
async {}
},
10, // transient budget irrelevant here
2,
3, // 3 attempts total, 2 recircuits between
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("exhausts unauth budget");
// 3 fetches total: initial + 2 recircuit-and-retry
assert_eq!(call, 3);
assert_eq!(recircuits, 2);
let msg = format!("{err:#}");
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
}
#[tokio::test]
@@ -548,14 +556,14 @@ mod tests {
async {}
},
3,
0,
1,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("transient until max → fail");
assert_eq!(call, 3);
// recircuit fires between attempts: 3 attempts → 2 recircuits.
// Recircuit fires between attempts: 3 attempts → 2 recircuits.
assert_eq!(recircuits, 2);
let msg = format!("{err:#}");
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
@@ -582,7 +590,7 @@ mod tests {
async {}
},
3,
0,
1,
Duration::from_millis(0),
"https://example/probe",
)