fix(crawler): unify recircuit budget semantics — N = total attempts
The three retry-with-recircuit sites disagreed: detect.rs's retry_on_transient_with_hook used "N = total attempts" (3 → 3 fetches), but session.rs's unauth branch and content.rs's chapter loop used "N = recircuits" (3 → 4 fetches). At the same wall-clock "max=3", different sites hit the upstream a different number of times. Unify on N = total attempts (matching the existing retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS env var now means exactly what its name suggests. Disabling the recircuit feature collapses to max_attempts=1 (single attempt, no retry) — bit-for-bit pre-TOR behavior preserved. Adds a debug_assert!(max >= 1) on both helpers and a new content.rs test exercising the mixed Transient → Unauth → Ok sequence to lock in the shared-counter invariant. Audit ref: #5. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -73,9 +73,10 @@ pub enum SyncOutcome {
|
||||
SessionExpired,
|
||||
}
|
||||
|
||||
/// Per-chapter recircuit budget for both transient pages and
|
||||
/// `Unauthenticated` outcomes. When TOR is not configured the budget
|
||||
/// is effectively 0 (no recircuit attempted; original behavior).
|
||||
/// Per-chapter max fetch attempts when TOR is configured. `N = 3` means
|
||||
/// up to 3 total page fetches with 2 NEWNYM signals between them. When
|
||||
/// TOR is not configured the effective budget collapses to 1 (single
|
||||
/// attempt, no retry, no recircuit — bit-for-bit pre-TOR behavior).
|
||||
const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3;
|
||||
|
||||
/// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the
|
||||
@@ -125,15 +126,23 @@ async fn fetch_chapter_html_once(
|
||||
Ok(html)
|
||||
}
|
||||
|
||||
/// Pure-over-IO loop: fetch + classify, with up to `recircuit_budget`
|
||||
/// NEWNYM-and-retry cycles after a `Transient` or `Unauthenticated`
|
||||
/// outcome. `recircuit_budget = 0` collapses to the original
|
||||
/// single-shot behavior — `Unauthenticated` → `SessionExpired`,
|
||||
/// `Transient` → `PersistentTransient` on the first hit, no recircuit.
|
||||
/// Pure-over-IO loop: fetch + classify, up to `max_attempts` total
|
||||
/// fetches. Between attempts, `recircuit` is invoked (a no-op when
|
||||
/// TOR isn't configured). `max_attempts = 1` collapses to the
|
||||
/// original single-shot behavior — `Unauthenticated` →
|
||||
/// `SessionExpired`, `Transient` → `PersistentTransient` on the first
|
||||
/// hit, no recircuit.
|
||||
///
|
||||
/// Semantics match [`crate::crawler::detect::retry_on_transient`] and
|
||||
/// [`run_session_probe_loop`]: `N` is **total attempts including the
|
||||
/// first**, so `N = 3` means 3 fetches and up to 2 NEWNYM calls.
|
||||
/// `Unauthenticated` and `Transient` share the budget — the loop
|
||||
/// doesn't distinguish, so a sequence like Transient → Unauth → Ok
|
||||
/// counts as 3 attempts.
|
||||
async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>(
|
||||
mut fetch: F,
|
||||
mut recircuit: R,
|
||||
recircuit_budget: u32,
|
||||
max_attempts: u32,
|
||||
source_url_for_msg: &str,
|
||||
) -> anyhow::Result<ChapterFetchOutcome>
|
||||
where
|
||||
@@ -142,38 +151,36 @@ where
|
||||
R: FnMut() -> RFut,
|
||||
RFut: std::future::Future<Output = ()>,
|
||||
{
|
||||
let mut recircuits = 0u32;
|
||||
debug_assert!(max_attempts >= 1, "max_attempts must be at least 1");
|
||||
let mut attempt = 0u32;
|
||||
loop {
|
||||
attempt += 1;
|
||||
let html = fetch().await?;
|
||||
match session::classify_chapter_probe(&html) {
|
||||
ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)),
|
||||
ChapterProbe::Unauthenticated => {
|
||||
if recircuits < recircuit_budget {
|
||||
recircuits += 1;
|
||||
tracing::warn!(
|
||||
attempt = recircuits,
|
||||
max = recircuit_budget,
|
||||
url = source_url_for_msg,
|
||||
"chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
continue;
|
||||
if attempt >= max_attempts {
|
||||
return Ok(ChapterFetchOutcome::SessionExpired);
|
||||
}
|
||||
return Ok(ChapterFetchOutcome::SessionExpired);
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max = max_attempts,
|
||||
url = source_url_for_msg,
|
||||
"chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
}
|
||||
ChapterProbe::Transient => {
|
||||
if recircuits < recircuit_budget {
|
||||
recircuits += 1;
|
||||
tracing::warn!(
|
||||
attempt = recircuits,
|
||||
max = recircuit_budget,
|
||||
url = source_url_for_msg,
|
||||
"chapter probe Transient; signaling TOR NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
continue;
|
||||
if attempt >= max_attempts {
|
||||
return Ok(ChapterFetchOutcome::PersistentTransient);
|
||||
}
|
||||
return Ok(ChapterFetchOutcome::PersistentTransient);
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max = max_attempts,
|
||||
url = source_url_for_msg,
|
||||
"chapter probe Transient; signaling TOR NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -212,10 +219,11 @@ pub async fn sync_chapter_content(
|
||||
}
|
||||
}
|
||||
|
||||
// Fetch + classify with a recircuit budget when TOR is configured.
|
||||
// Without TOR the closure-recircuit is a no-op and the loop reduces
|
||||
// to the original single-attempt behavior.
|
||||
let recircuit_budget = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 0 };
|
||||
// Fetch + classify. With TOR configured, allow up to
|
||||
// CHAPTER_RECIRCUIT_MAX_ATTEMPTS total page fetches with NEWNYM
|
||||
// between each. Without TOR, collapse to 1 attempt (no retry, no
|
||||
// recircuit) — matches the pre-TOR single-shot behavior bit-for-bit.
|
||||
let max_attempts = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 1 };
|
||||
let html = match fetch_chapter_html_with_recircuit(
|
||||
|| fetch_chapter_html_once(browser, rate, source_url),
|
||||
|| async {
|
||||
@@ -225,7 +233,7 @@ pub async fn sync_chapter_content(
|
||||
}
|
||||
}
|
||||
},
|
||||
recircuit_budget,
|
||||
max_attempts,
|
||||
source_url,
|
||||
)
|
||||
.await?
|
||||
@@ -238,7 +246,7 @@ pub async fn sync_chapter_content(
|
||||
// session-expired sticky flag).
|
||||
anyhow::bail!(
|
||||
"chapter page at {source_url} returned a transient response after \
|
||||
{recircuit_budget} TOR recircuit(s); will retry"
|
||||
{max_attempts} attempt(s); will retry"
|
||||
);
|
||||
}
|
||||
};
|
||||
@@ -431,7 +439,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn recircuit_loop_unauth_with_zero_budget_returns_session_expired() {
|
||||
async fn recircuit_loop_unauth_with_single_attempt_returns_session_expired() {
|
||||
// max_attempts=1 = TOR disabled, fail-fast on first Unauthenticated.
|
||||
let mut recircuits = 0u32;
|
||||
let mut fetches = 0u32;
|
||||
let outcome = fetch_chapter_html_with_recircuit(
|
||||
@@ -443,18 +452,19 @@ mod tests {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
0,
|
||||
1,
|
||||
"https://example/c",
|
||||
)
|
||||
.await
|
||||
.expect("ok-result");
|
||||
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
|
||||
assert_eq!(fetches, 1);
|
||||
assert_eq!(recircuits, 0, "no recircuit when budget is 0 (TOR disabled)");
|
||||
assert_eq!(recircuits, 0, "no recircuit when budget is 1 (TOR disabled)");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn recircuit_loop_unauth_then_ok_within_budget() {
|
||||
// max_attempts=3 = up to 3 fetches with 2 recircuits between.
|
||||
let mut recircuits = 0u32;
|
||||
let mut fetch_n = 0u32;
|
||||
let outcome = fetch_chapter_html_with_recircuit(
|
||||
@@ -496,15 +506,14 @@ mod tests {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
2,
|
||||
3,
|
||||
"https://example/c",
|
||||
)
|
||||
.await
|
||||
.expect("ok-result");
|
||||
assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
|
||||
// budget=2 → initial + 2 recircuit-and-retry = 3 fetches.
|
||||
assert_eq!(fetch_n, 3);
|
||||
assert_eq!(recircuits, 2);
|
||||
assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
|
||||
assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -556,8 +565,40 @@ mod tests {
|
||||
.await
|
||||
.expect("ok-result");
|
||||
assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient));
|
||||
assert_eq!(fetch_n, 4, "budget=3 → 1 initial + 3 retries");
|
||||
assert_eq!(recircuits, 3);
|
||||
assert_eq!(fetch_n, 3, "max_attempts=3 → 3 fetches total");
|
||||
assert_eq!(recircuits, 2, "2 recircuits between 3 fetches");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn recircuit_loop_mixed_transient_then_unauth_then_ok_shares_budget() {
|
||||
// Audit-prompted regression: outcomes share the attempt counter.
|
||||
// Sequence: Transient (attempt 1) → Unauth (attempt 2) → Ok (3).
|
||||
let mut recircuits = 0u32;
|
||||
let mut fetch_n = 0u32;
|
||||
let outcome = fetch_chapter_html_with_recircuit(
|
||||
|| {
|
||||
fetch_n += 1;
|
||||
let n = fetch_n;
|
||||
async move {
|
||||
match n {
|
||||
1 => Ok(TRANSIENT_HTML.to_string()),
|
||||
2 => Ok(UNAUTH_HTML.to_string()),
|
||||
_ => Ok(OK_HTML.to_string()),
|
||||
}
|
||||
}
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
"https://example/c",
|
||||
)
|
||||
.await
|
||||
.expect("ok");
|
||||
assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
|
||||
assert_eq!(fetch_n, 3);
|
||||
assert_eq!(recircuits, 2);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
@@ -167,19 +167,19 @@ pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Resul
|
||||
|
||||
/// Like [`verify_session`] but, when `tor` is `Some`, signals
|
||||
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
|
||||
/// `Unauthenticated` as a recoverable failure (up to
|
||||
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
|
||||
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
|
||||
/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
|
||||
/// probes, calling NEWNYM between each).
|
||||
///
|
||||
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
|
||||
/// stays a hard fail, matching the original behavior.
|
||||
/// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
|
||||
/// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
|
||||
/// fail-fast, exactly the pre-TOR behavior.
|
||||
pub async fn verify_session_with_recircuit(
|
||||
browser: &Browser,
|
||||
probe_url: &str,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
unauth_max_recircuit: u32,
|
||||
tor_max_attempts: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
|
||||
let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
|
||||
run_session_probe_loop(
|
||||
|| fetch_probe_html(browser, probe_url),
|
||||
|| async {
|
||||
@@ -190,7 +190,7 @@ pub async fn verify_session_with_recircuit(
|
||||
}
|
||||
},
|
||||
PROBE_MAX_ATTEMPTS,
|
||||
effective_unauth_budget,
|
||||
unauth_max_attempts,
|
||||
PROBE_RETRY_DELAY,
|
||||
probe_url,
|
||||
)
|
||||
@@ -201,20 +201,25 @@ pub async fn verify_session_with_recircuit(
|
||||
/// fetch and recircuit closures so it can be unit-tested without a
|
||||
/// real browser or TOR daemon.
|
||||
///
|
||||
/// Semantics:
|
||||
/// Both budgets count **total attempts**, including the first — so
|
||||
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
|
||||
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
|
||||
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
|
||||
/// and the content-path recircuit loop.
|
||||
///
|
||||
/// Outcomes:
|
||||
/// - `SessionProbe::Ok` → return `Ok(())`.
|
||||
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
|
||||
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
|
||||
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
|
||||
/// recircuit count so a TOR-misconfig diagnosis is easier.
|
||||
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
|
||||
/// tries, calling `recircuit` between each. After the cap, bail with
|
||||
/// the "site down or rate-limiting" diagnostic.
|
||||
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
|
||||
/// under the unauth budget. After the cap, bail with the
|
||||
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
|
||||
/// a TOR-misconfig diagnosis is easier.
|
||||
/// - `SessionProbe::Transient` → same shape against the transient
|
||||
/// budget; bails with "site down or rate-limiting" after the cap.
|
||||
async fn run_session_probe_loop<F, Fut, R, RFut>(
|
||||
mut fetch_html: F,
|
||||
mut recircuit: R,
|
||||
transient_max_attempts: u32,
|
||||
unauth_max_recircuit: u32,
|
||||
unauth_max_attempts: u32,
|
||||
retry_delay: Duration,
|
||||
probe_url_for_msg: &str,
|
||||
) -> anyhow::Result<()>
|
||||
@@ -224,37 +229,38 @@ where
|
||||
R: FnMut() -> RFut,
|
||||
RFut: std::future::Future<Output = ()>,
|
||||
{
|
||||
debug_assert!(transient_max_attempts >= 1);
|
||||
debug_assert!(unauth_max_attempts >= 1);
|
||||
let mut transient_attempts = 0u32;
|
||||
let mut unauth_recircuits = 0u32;
|
||||
let mut unauth_attempts = 0u32;
|
||||
loop {
|
||||
let html = fetch_html().await?;
|
||||
match classify_probe(&html) {
|
||||
SessionProbe::Ok => {
|
||||
tracing::info!(
|
||||
transient_attempts,
|
||||
unauth_recircuits,
|
||||
unauth_attempts,
|
||||
"session probe ok — #logo + #avatar_menu present"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
SessionProbe::Unauthenticated => {
|
||||
if unauth_recircuits < unauth_max_recircuit {
|
||||
unauth_recircuits += 1;
|
||||
tracing::warn!(
|
||||
attempt = unauth_recircuits,
|
||||
max = unauth_max_recircuit,
|
||||
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||
NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
continue;
|
||||
unauth_attempts += 1;
|
||||
if unauth_attempts >= unauth_max_attempts {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
|
||||
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
}
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
|
||||
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
tracing::warn!(
|
||||
attempt = unauth_attempts,
|
||||
max_attempts = unauth_max_attempts,
|
||||
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||
NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
}
|
||||
SessionProbe::Transient => {
|
||||
transient_attempts += 1;
|
||||
@@ -451,7 +457,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
|
||||
async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
|
||||
// Budget = 3 total attempts. Unauth on call 1, ok on call 2.
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
run_session_probe_loop(
|
||||
@@ -482,7 +489,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
|
||||
async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
|
||||
// Budget = 1 total attempt = no retry (matches no-TOR behavior).
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
@@ -495,20 +503,21 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("zero budget → fail");
|
||||
assert_eq!(call, 1, "no retry when budget is 0");
|
||||
.expect_err("budget=1 → fail-fast");
|
||||
assert_eq!(call, 1, "no retry when budget is 1");
|
||||
assert_eq!(recircuits, 0);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
|
||||
assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
|
||||
async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
@@ -521,17 +530,16 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
10, // transient budget irrelevant here
|
||||
2,
|
||||
3, // 3 attempts total, 2 recircuits between
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("exhausts unauth budget");
|
||||
// 3 fetches total: initial + 2 recircuit-and-retry
|
||||
assert_eq!(call, 3);
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
|
||||
assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
@@ -548,14 +556,14 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("transient until max → fail");
|
||||
assert_eq!(call, 3);
|
||||
// recircuit fires between attempts: 3 attempts → 2 recircuits.
|
||||
// Recircuit fires between attempts: 3 attempts → 2 recircuits.
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
|
||||
@@ -582,7 +590,7 @@ mod tests {
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
1,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user