fix(crawler): unify recircuit budget semantics — N = total attempts
The three retry-with-recircuit sites disagreed: detect.rs's retry_on_transient_with_hook used "N = total attempts" (3 → 3 fetches), but session.rs's unauth branch and content.rs's chapter loop used "N = recircuits" (3 → 4 fetches). With the same configured "max=3", different sites hit the upstream a different number of times. Unify on N = total attempts (matching the existing retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS env var now means exactly what its name suggests. Disabling the recircuit feature collapses the budget to max_attempts=1 (single attempt, no retry), preserving pre-TOR behavior bit-for-bit. Adds a debug_assert!(max >= 1) on both helpers and a new content.rs test exercising the mixed Transient → Unauth → Ok sequence to lock in the shared-counter invariant. Audit ref: #5. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -167,19 +167,19 @@ pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Resul
|
||||
|
||||
/// Like [`verify_session`] but, when `tor` is `Some`, signals
|
||||
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
|
||||
/// `Unauthenticated` as a recoverable failure (up to
|
||||
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
|
||||
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
|
||||
/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
|
||||
/// probes, calling NEWNYM between each).
|
||||
///
|
||||
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
|
||||
/// stays a hard fail, matching the original behavior.
|
||||
/// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
|
||||
/// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
|
||||
/// fail-fast, exactly the pre-TOR behavior.
|
||||
pub async fn verify_session_with_recircuit(
|
||||
browser: &Browser,
|
||||
probe_url: &str,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
unauth_max_recircuit: u32,
|
||||
tor_max_attempts: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
|
||||
let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
|
||||
run_session_probe_loop(
|
||||
|| fetch_probe_html(browser, probe_url),
|
||||
|| async {
|
||||
@@ -190,7 +190,7 @@ pub async fn verify_session_with_recircuit(
|
||||
}
|
||||
},
|
||||
PROBE_MAX_ATTEMPTS,
|
||||
effective_unauth_budget,
|
||||
unauth_max_attempts,
|
||||
PROBE_RETRY_DELAY,
|
||||
probe_url,
|
||||
)
|
||||
@@ -201,20 +201,25 @@ pub async fn verify_session_with_recircuit(
|
||||
/// fetch and recircuit closures so it can be unit-tested without a
|
||||
/// real browser or TOR daemon.
|
||||
///
|
||||
/// Semantics:
|
||||
/// Both budgets count **total attempts**, including the first — so
|
||||
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
|
||||
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
|
||||
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
|
||||
/// and the content-path recircuit loop.
|
||||
///
|
||||
/// Outcomes:
|
||||
/// - `SessionProbe::Ok` → return `Ok(())`.
|
||||
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
|
||||
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
|
||||
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
|
||||
/// recircuit count so a TOR-misconfig diagnosis is easier.
|
||||
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
|
||||
/// tries, calling `recircuit` between each. After the cap, bail with
|
||||
/// the "site down or rate-limiting" diagnostic.
|
||||
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
|
||||
/// under the unauth budget. After the cap, bail with the
|
||||
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
|
||||
/// a TOR-misconfig diagnosis is easier.
|
||||
/// - `SessionProbe::Transient` → same shape against the transient
|
||||
/// budget; bails with "site down or rate-limiting" after the cap.
|
||||
async fn run_session_probe_loop<F, Fut, R, RFut>(
|
||||
mut fetch_html: F,
|
||||
mut recircuit: R,
|
||||
transient_max_attempts: u32,
|
||||
unauth_max_recircuit: u32,
|
||||
unauth_max_attempts: u32,
|
||||
retry_delay: Duration,
|
||||
probe_url_for_msg: &str,
|
||||
) -> anyhow::Result<()>
|
||||
@@ -224,37 +229,38 @@ where
|
||||
R: FnMut() -> RFut,
|
||||
RFut: std::future::Future<Output = ()>,
|
||||
{
|
||||
debug_assert!(transient_max_attempts >= 1);
|
||||
debug_assert!(unauth_max_attempts >= 1);
|
||||
let mut transient_attempts = 0u32;
|
||||
let mut unauth_recircuits = 0u32;
|
||||
let mut unauth_attempts = 0u32;
|
||||
loop {
|
||||
let html = fetch_html().await?;
|
||||
match classify_probe(&html) {
|
||||
SessionProbe::Ok => {
|
||||
tracing::info!(
|
||||
transient_attempts,
|
||||
unauth_recircuits,
|
||||
unauth_attempts,
|
||||
"session probe ok — #logo + #avatar_menu present"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
SessionProbe::Unauthenticated => {
|
||||
if unauth_recircuits < unauth_max_recircuit {
|
||||
unauth_recircuits += 1;
|
||||
tracing::warn!(
|
||||
attempt = unauth_recircuits,
|
||||
max = unauth_max_recircuit,
|
||||
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||
NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
continue;
|
||||
unauth_attempts += 1;
|
||||
if unauth_attempts >= unauth_max_attempts {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
|
||||
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
}
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
|
||||
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
tracing::warn!(
|
||||
attempt = unauth_attempts,
|
||||
max_attempts = unauth_max_attempts,
|
||||
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||
NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
}
|
||||
SessionProbe::Transient => {
|
||||
transient_attempts += 1;
|
||||
@@ -451,7 +457,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
|
||||
async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
|
||||
// Budget = 3 total attempts. Unauth on call 1, ok on call 2.
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
run_session_probe_loop(
|
||||
@@ -482,7 +489,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
|
||||
async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
|
||||
// Budget = 1 total attempt = no retry (matches no-TOR behavior).
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
@@ -495,20 +503,21 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("zero budget → fail");
|
||||
assert_eq!(call, 1, "no retry when budget is 0");
|
||||
.expect_err("budget=1 → fail-fast");
|
||||
assert_eq!(call, 1, "no retry when budget is 1");
|
||||
assert_eq!(recircuits, 0);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
|
||||
assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
|
||||
async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
@@ -521,17 +530,16 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
10, // transient budget irrelevant here
|
||||
2,
|
||||
3, // 3 attempts total, 2 recircuits between
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("exhausts unauth budget");
|
||||
// 3 fetches total: initial + 2 recircuit-and-retry
|
||||
assert_eq!(call, 3);
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
|
||||
assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -548,14 +556,14 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("transient until max → fail");
|
||||
assert_eq!(call, 3);
|
||||
// recircuit fires between attempts: 3 attempts → 2 recircuits.
|
||||
// Recircuit fires between attempts: 3 attempts → 2 recircuits.
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
|
||||
@@ -582,7 +590,7 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user