feat(crawler): recircuit TOR on transient pages and unauthenticated probes

- target.rs swaps retry_on_transient → retry_on_transient_with_hook,
  signaling NEWNYM via ctx.tor between attempts when configured.
- session.rs gains verify_session_with_recircuit; the bare
  verify_session is now a one-line wrapper passing tor=None,
  unauth_max_recircuit=0. The inner run_session_probe_loop is
  pure-over-IO and unit-tested with closure-based fakes.
- content.rs extracts fetch_chapter_html_once + the closure-driven
  fetch_chapter_html_with_recircuit, used by sync_chapter_content to
  retry on Transient or Unauthenticated up to a recircuit_budget.
  Budget = 0 (no TOR) preserves original behavior bit-for-bit.
- app.rs and bin/crawler.rs construct the controller before on_launch
  and pass it into verify_session_with_recircuit, so a transient
  hiccup at startup no longer requires PHPSESSID rotation.

Recircuit budget defaults to CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS (3).
Errors from NEWNYM are logged and swallowed — failing to recircuit
should not take down the crawl.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 18:52:28 +02:00
parent 8557e432a2
commit 8c6378b877
5 changed files with 699 additions and 114 deletions

View File

@@ -162,37 +162,117 @@ const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
/// minutes into a backfill costs 30 minutes.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
let mut attempt = 0u32;
verify_session_with_recircuit(browser, probe_url, None, 0).await
}
/// Like [`verify_session`] but, when `tor` is `Some`, signals
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
/// `Unauthenticated` as a recoverable failure (up to
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
///
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
/// stays a hard fail, matching the original behavior.
pub async fn verify_session_with_recircuit(
browser: &Browser,
probe_url: &str,
tor: Option<&crate::crawler::tor::TorController>,
unauth_max_recircuit: u32,
) -> anyhow::Result<()> {
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
run_session_probe_loop(
|| fetch_probe_html(browser, probe_url),
|| async {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
}
}
},
PROBE_MAX_ATTEMPTS,
effective_unauth_budget,
PROBE_RETRY_DELAY,
probe_url,
)
.await
}
/// Pure-over-IO loop body for the session probe. Generic over the
/// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon.
///
/// Semantics:
/// - `SessionProbe::Ok` → return `Ok(())`.
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
/// recircuit count so a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
/// tries, calling `recircuit` between each. After the cap, bail with
/// the "site down or rate-limiting" diagnostic.
async fn run_session_probe_loop<F, Fut, R, RFut>(
mut fetch_html: F,
mut recircuit: R,
transient_max_attempts: u32,
unauth_max_recircuit: u32,
retry_delay: Duration,
probe_url_for_msg: &str,
) -> anyhow::Result<()>
where
F: FnMut() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<String>>,
R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>,
{
let mut transient_attempts = 0u32;
let mut unauth_recircuits = 0u32;
loop {
attempt += 1;
let html = fetch_probe_html(browser, probe_url).await?;
let html = fetch_html().await?;
match classify_probe(&html) {
SessionProbe::Ok => {
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
tracing::info!(
transient_attempts,
unauth_recircuits,
"session probe ok — #logo + #avatar_menu present"
);
return Ok(());
}
SessionProbe::Unauthenticated => {
if unauth_recircuits < unauth_max_recircuit {
unauth_recircuits += 1;
tracing::warn!(
attempt = unauth_recircuits,
max = unauth_max_recircuit,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
continue;
}
return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url} \
(page rendered the normal layout); PHPSESSID is missing, expired, \
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
));
}
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
tracing::warn!(
attempt,
max_attempts = PROBE_MAX_ATTEMPTS,
"session probe got a transient page; retrying"
);
tokio::time::sleep(PROBE_RETRY_DELAY).await;
}
SessionProbe::Transient => {
return Err(anyhow!(
"session probe failed — probe page at {probe_url} returned a \
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
The site appears to be down or rate-limiting us; try again \
later before refreshing CRAWLER_PHPSESSID."
));
transient_attempts += 1;
if transient_attempts >= transient_max_attempts {
return Err(anyhow!(
"session probe failed — probe page at {probe_url_for_msg} returned \
a broken-page response after {transient_max_attempts} attempts. \
The site appears to be down or rate-limiting us; try again \
later before refreshing CRAWLER_PHPSESSID."
));
}
tracing::warn!(
attempt = transient_attempts,
max_attempts = transient_max_attempts,
"session probe got a transient page; recircuit + retry"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
}
}
}
@@ -336,6 +416,202 @@ mod tests {
assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
}
// --- run_session_probe_loop -----------------------------------------
//
// These tests exercise the recircuit-aware loop without a real
// browser. The fetch and recircuit closures are mocked over Vecs of
// canned outcomes / counters.
const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
#[tokio::test]
async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
let mut recircuits = 0u32;
let mut fetched = 0u32;
run_session_probe_loop(
|| {
fetched += 1;
async { Ok(OK_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
3,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect("ok on first attempt");
assert_eq!(fetched, 1);
assert_eq!(recircuits, 0);
}
#[tokio::test]
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
let mut recircuits = 0u32;
let mut call = 0u32;
run_session_probe_loop(
|| {
call += 1;
let n = call;
async move {
if n == 1 {
Ok(UNAUTH_HTML.to_string())
} else {
Ok(OK_HTML.to_string())
}
}
},
|| {
recircuits += 1;
async {}
},
3,
3,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect("recovers after one recircuit");
assert_eq!(call, 2);
assert_eq!(recircuits, 1);
}
#[tokio::test]
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
let mut recircuits = 0u32;
let mut call = 0u32;
let err = run_session_probe_loop(
|| {
call += 1;
async { Ok(UNAUTH_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
0,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("zero budget → fail");
assert_eq!(call, 1, "no retry when budget is 0");
assert_eq!(recircuits, 0);
let msg = format!("{err:#}");
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
}
#[tokio::test]
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
let mut recircuits = 0u32;
let mut call = 0u32;
let err = run_session_probe_loop(
|| {
call += 1;
async { Ok(UNAUTH_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
10, // transient budget irrelevant here
2,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("exhausts unauth budget");
// 3 fetches total: initial + 2 recircuit-and-retry
assert_eq!(call, 3);
assert_eq!(recircuits, 2);
let msg = format!("{err:#}");
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
}
#[tokio::test]
async fn probe_loop_transient_repeats_until_max_then_errors() {
let mut recircuits = 0u32;
let mut call = 0u32;
let err = run_session_probe_loop(
|| {
call += 1;
async { Ok(TRANSIENT_HTML.to_string()) }
},
|| {
recircuits += 1;
async {}
},
3,
0,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("transient until max → fail");
assert_eq!(call, 3);
// recircuit fires between attempts: 3 attempts → 2 recircuits.
assert_eq!(recircuits, 2);
let msg = format!("{err:#}");
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
}
#[tokio::test]
async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
let mut recircuits = 0u32;
let mut call = 0u32;
run_session_probe_loop(
|| {
call += 1;
let n = call;
async move {
if n == 1 {
Ok(TRANSIENT_HTML.to_string())
} else {
Ok(OK_HTML.to_string())
}
}
},
|| {
recircuits += 1;
async {}
},
3,
0,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect("ok on second try");
assert_eq!(call, 2);
assert_eq!(recircuits, 1);
}
#[tokio::test]
async fn probe_loop_propagates_fetch_errors_immediately() {
let mut call = 0u32;
let err = run_session_probe_loop(
|| {
call += 1;
async { Err(anyhow!("nav timeout")) }
},
|| async {},
5,
5,
Duration::from_millis(0),
"https://example/probe",
)
.await
.expect_err("fetch error bubbles");
assert_eq!(call, 1);
assert!(format!("{err:#}").contains("nav timeout"));
}
#[test]
fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
// Defensive: if a broken-page body somehow contains an