feat(crawler): recircuit TOR on transient pages and unauthenticated probes
- target.rs swaps retry_on_transient → retry_on_transient_with_hook, signaling NEWNYM via ctx.tor between attempts when configured.
- session.rs gains verify_session_with_recircuit; the bare verify_session is now a one-line wrapper passing tor=None, unauth_max_recircuit=0. The inner run_session_probe_loop is pure-over-IO and unit-tested with closure-based fakes.
- content.rs extracts fetch_chapter_html_once + the closure-driven fetch_chapter_html_with_recircuit, used by sync_chapter_content to retry on Transient or Unauthenticated up to a recircuit_budget. Budget = 0 (no TOR) preserves original behavior bit-for-bit.
- app.rs and bin/crawler.rs construct the controller before on_launch and pass it into verify_session_with_recircuit, so a transient hiccup at startup no longer requires PHPSESSID rotation.

Recircuit budget defaults to CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS (3). Errors from NEWNYM are logged and swallowed — failing to recircuit should not take down the crawl.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -162,37 +162,117 @@ const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
|
||||
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
||||
/// minutes into a backfill costs 30 minutes.
|
||||
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
||||
let mut attempt = 0u32;
|
||||
verify_session_with_recircuit(browser, probe_url, None, 0).await
|
||||
}
|
||||
|
||||
/// Like [`verify_session`] but, when `tor` is `Some`, signals
|
||||
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
|
||||
/// `Unauthenticated` as a recoverable failure (up to
|
||||
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
|
||||
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
|
||||
///
|
||||
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
|
||||
/// stays a hard fail, matching the original behavior.
|
||||
pub async fn verify_session_with_recircuit(
|
||||
browser: &Browser,
|
||||
probe_url: &str,
|
||||
tor: Option<&crate::crawler::tor::TorController>,
|
||||
unauth_max_recircuit: u32,
|
||||
) -> anyhow::Result<()> {
|
||||
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
|
||||
run_session_probe_loop(
|
||||
|| fetch_probe_html(browser, probe_url),
|
||||
|| async {
|
||||
if let Some(t) = tor {
|
||||
if let Err(e) = t.new_identity().await {
|
||||
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
|
||||
}
|
||||
}
|
||||
},
|
||||
PROBE_MAX_ATTEMPTS,
|
||||
effective_unauth_budget,
|
||||
PROBE_RETRY_DELAY,
|
||||
probe_url,
|
||||
)
|
||||
.await
|
||||
}
|
||||
|
||||
/// Pure-over-IO loop body for the session probe. Generic over the
|
||||
/// fetch and recircuit closures so it can be unit-tested without a
|
||||
/// real browser or TOR daemon.
|
||||
///
|
||||
/// Semantics:
|
||||
/// - `SessionProbe::Ok` → return `Ok(())`.
|
||||
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
|
||||
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
|
||||
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
|
||||
/// recircuit count so a TOR-misconfig diagnosis is easier.
|
||||
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
|
||||
/// tries, calling `recircuit` between each. After the cap, bail with
|
||||
/// the "site down or rate-limiting" diagnostic.
|
||||
async fn run_session_probe_loop<F, Fut, R, RFut>(
|
||||
mut fetch_html: F,
|
||||
mut recircuit: R,
|
||||
transient_max_attempts: u32,
|
||||
unauth_max_recircuit: u32,
|
||||
retry_delay: Duration,
|
||||
probe_url_for_msg: &str,
|
||||
) -> anyhow::Result<()>
|
||||
where
|
||||
F: FnMut() -> Fut,
|
||||
Fut: std::future::Future<Output = anyhow::Result<String>>,
|
||||
R: FnMut() -> RFut,
|
||||
RFut: std::future::Future<Output = ()>,
|
||||
{
|
||||
let mut transient_attempts = 0u32;
|
||||
let mut unauth_recircuits = 0u32;
|
||||
loop {
|
||||
attempt += 1;
|
||||
let html = fetch_probe_html(browser, probe_url).await?;
|
||||
let html = fetch_html().await?;
|
||||
match classify_probe(&html) {
|
||||
SessionProbe::Ok => {
|
||||
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
|
||||
tracing::info!(
|
||||
transient_attempts,
|
||||
unauth_recircuits,
|
||||
"session probe ok — #logo + #avatar_menu present"
|
||||
);
|
||||
return Ok(());
|
||||
}
|
||||
SessionProbe::Unauthenticated => {
|
||||
if unauth_recircuits < unauth_max_recircuit {
|
||||
unauth_recircuits += 1;
|
||||
tracing::warn!(
|
||||
attempt = unauth_recircuits,
|
||||
max = unauth_max_recircuit,
|
||||
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
||||
NEWNYM and retrying"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
continue;
|
||||
}
|
||||
return Err(anyhow!(
|
||||
"session probe failed — #avatar_menu not present at {probe_url} \
|
||||
(page rendered the normal layout); PHPSESSID is missing, expired, \
|
||||
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
||||
after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
|
||||
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
||||
));
|
||||
}
|
||||
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
|
||||
tracing::warn!(
|
||||
attempt,
|
||||
max_attempts = PROBE_MAX_ATTEMPTS,
|
||||
"session probe got a transient page; retrying"
|
||||
);
|
||||
tokio::time::sleep(PROBE_RETRY_DELAY).await;
|
||||
}
|
||||
SessionProbe::Transient => {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — probe page at {probe_url} returned a \
|
||||
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
|
||||
The site appears to be down or rate-limiting us; try again \
|
||||
later before refreshing CRAWLER_PHPSESSID."
|
||||
));
|
||||
transient_attempts += 1;
|
||||
if transient_attempts >= transient_max_attempts {
|
||||
return Err(anyhow!(
|
||||
"session probe failed — probe page at {probe_url_for_msg} returned \
|
||||
a broken-page response after {transient_max_attempts} attempts. \
|
||||
The site appears to be down or rate-limiting us; try again \
|
||||
later before refreshing CRAWLER_PHPSESSID."
|
||||
));
|
||||
}
|
||||
tracing::warn!(
|
||||
attempt = transient_attempts,
|
||||
max_attempts = transient_max_attempts,
|
||||
"session probe got a transient page; recircuit + retry"
|
||||
);
|
||||
recircuit().await;
|
||||
tokio::time::sleep(retry_delay).await;
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -336,6 +416,202 @@ mod tests {
|
||||
assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
|
||||
}
|
||||
|
||||
// --- run_session_probe_loop -----------------------------------------
|
||||
//
|
||||
// These tests exercise the recircuit-aware loop without a real
|
||||
// browser. The fetch and recircuit closures are mocked over Vecs of
|
||||
// canned outcomes / counters.
|
||||
|
||||
const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
|
||||
const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
|
||||
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut fetched = 0u32;
|
||||
run_session_probe_loop(
|
||||
|| {
|
||||
fetched += 1;
|
||||
async { Ok(OK_HTML.to_string()) }
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
3,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect("ok on first attempt");
|
||||
assert_eq!(fetched, 1);
|
||||
assert_eq!(recircuits, 0);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
let n = call;
|
||||
async move {
|
||||
if n == 1 {
|
||||
Ok(UNAUTH_HTML.to_string())
|
||||
} else {
|
||||
Ok(OK_HTML.to_string())
|
||||
}
|
||||
}
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
3,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect("recovers after one recircuit");
|
||||
assert_eq!(call, 2);
|
||||
assert_eq!(recircuits, 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
async { Ok(UNAUTH_HTML.to_string()) }
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("zero budget → fail");
|
||||
assert_eq!(call, 1, "no retry when budget is 0");
|
||||
assert_eq!(recircuits, 0);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
async { Ok(UNAUTH_HTML.to_string()) }
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
10, // transient budget irrelevant here
|
||||
2,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("exhausts unauth budget");
|
||||
// 3 fetches total: initial + 2 recircuit-and-retry
|
||||
assert_eq!(call, 3);
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_transient_repeats_until_max_then_errors() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
async { Ok(TRANSIENT_HTML.to_string()) }
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("transient until max → fail");
|
||||
assert_eq!(call, 3);
|
||||
// recircuit fires between attempts: 3 attempts → 2 recircuits.
|
||||
assert_eq!(recircuits, 2);
|
||||
let msg = format!("{err:#}");
|
||||
assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
|
||||
let mut recircuits = 0u32;
|
||||
let mut call = 0u32;
|
||||
run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
let n = call;
|
||||
async move {
|
||||
if n == 1 {
|
||||
Ok(TRANSIENT_HTML.to_string())
|
||||
} else {
|
||||
Ok(OK_HTML.to_string())
|
||||
}
|
||||
}
|
||||
},
|
||||
|| {
|
||||
recircuits += 1;
|
||||
async {}
|
||||
},
|
||||
3,
|
||||
0,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect("ok on second try");
|
||||
assert_eq!(call, 2);
|
||||
assert_eq!(recircuits, 1);
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn probe_loop_propagates_fetch_errors_immediately() {
|
||||
let mut call = 0u32;
|
||||
let err = run_session_probe_loop(
|
||||
|| {
|
||||
call += 1;
|
||||
async { Err(anyhow!("nav timeout")) }
|
||||
},
|
||||
|| async {},
|
||||
5,
|
||||
5,
|
||||
Duration::from_millis(0),
|
||||
"https://example/probe",
|
||||
)
|
||||
.await
|
||||
.expect_err("fetch error bubbles");
|
||||
assert_eq!(call, 1);
|
||||
assert!(format!("{err:#}").contains("nav timeout"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
|
||||
// Defensive: if a broken-page body somehow contains an
|
||||
|
||||
Reference in New Issue
Block a user