The three retry-with-recircuit sites disagreed: detect.rs's retry_on_transient_with_hook used "N = total attempts" (3 → 3 fetches), but session.rs's unauth branch and content.rs's chapter loop used "N = recircuits" (3 → 4 fetches). At the same wall-clock "max=3", different sites hit the upstream a different number of times. Unify on N = total attempts (matching the existing retry_on_transient convention). The CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS env var now means exactly what its name suggests. Disabling the recircuit feature collapses to max_attempts=1 (single attempt, no retry) — bit-for-bit pre-TOR behavior preserved. Adds a debug_assert!(max >= 1) on both helpers and a new content.rs test exercising the mixed Transient → Unauth → Ok sequence to lock in the shared-counter invariant. Audit ref: #5. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
636 lines
23 KiB
Rust
636 lines
23 KiB
Rust
//! PHPSESSID injection + login probe.
//!
//! The catalog site we crawl renders chapter pages as a single
//! multi-page list only for logged-in users. We don't try to bypass the
//! login (CAPTCHA wall) — instead the operator pastes their browser's
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
//! it into Chromium *and* reqwest before the first navigation.
//!
//! Two things the cookie alone doesn't give us:
//! 1. The cookie value is only meaningful to the *server* — we have
//!    no way to predict from the value alone whether it's still valid.
//!    `verify_session` does a navigation and inspects the probe page
//!    for three outcomes: broken-page response (transient — retry the
//!    probe), `#logo` present but `#avatar_menu` absent (genuine logout
//!    — bail loudly), or both present (authenticated). The earlier
//!    avatar-only check conflated "site is hiccuping" with "session is
//!    dead" and refused to start the crawler when the site had a brief
//!    503.
//! 2. The reqwest client (used for cover and chapter-image downloads)
//!    has its own cookie store; we seed it for the catalog host only.
//!    CDN hosts are deliberately *not* given the cookie — they serve
//!    image bytes by signed URLs and don't need it.
|
|
|
|
use std::time::Duration;
|
|
|
|
use anyhow::{anyhow, Context};
|
|
use chromiumoxide::browser::Browser;
|
|
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
|
|
|
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
|
|
|
|
/// Result of classifying the HTML returned by the session probe page.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
    /// Both `#logo` and `#avatar_menu` were found — the session is valid.
    Ok,
    /// `#logo` was found but `#avatar_menu` was not — the site rendered its
    /// normal layout for a logged-out visitor; PHPSESSID needs refreshing.
    Unauthenticated,
    /// The body matched the broken-page signature or `#logo` was missing —
    /// the site is hiccuping. The caller retries the probe instead of
    /// blaming the session.
    Transient,
}
|
|
|
|
/// Re-export so existing callers keep working after the helper moved
|
|
/// to `crawler::url_utils`. The body lives there.
|
|
pub use crate::crawler::url_utils::registrable_domain;
|
|
|
|
/// Inject the PHPSESSID cookie into the browser's cookie store for the
|
|
/// catalog domain. Must be called before any navigation that depends on
|
|
/// authentication; subsequent navigations include the cookie
|
|
/// automatically.
|
|
pub async fn inject_phpsessid(
|
|
browser: &Browser,
|
|
sid: &str,
|
|
cookie_domain: &str,
|
|
) -> anyhow::Result<()> {
|
|
let cookie = CookieParam {
|
|
name: "PHPSESSID".to_string(),
|
|
value: sid.to_string(),
|
|
url: None,
|
|
domain: Some(cookie_domain.to_string()),
|
|
path: Some("/".to_string()),
|
|
secure: None,
|
|
http_only: Some(true),
|
|
same_site: None,
|
|
expires: None,
|
|
priority: None,
|
|
same_party: None,
|
|
source_scheme: None,
|
|
source_port: None,
|
|
partition_key: None,
|
|
};
|
|
browser
|
|
.set_cookies(vec![cookie])
|
|
.await
|
|
.context("set PHPSESSID in chromium cookie store")?;
|
|
tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
|
|
Ok(())
|
|
}
|
|
|
|
/// Three-way classification of a probe-page response. Pure over HTML so
|
|
/// it's unit-testable without a real browser. Order matters: a body
|
|
/// matching the broken-page template is `Transient` even if the page
|
|
/// happens to contain `#avatar_menu` HTML somewhere — trust the universal
|
|
/// site signal over a stray selector match.
|
|
pub fn classify_probe(html: &str) -> SessionProbe {
|
|
if is_broken_page_body(html) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let doc = scraper::Html::parse_document(html);
|
|
if !has_logo_sentinel(&doc) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap();
|
|
if doc.select(&avatar_sel).next().is_some() {
|
|
SessionProbe::Ok
|
|
} else {
|
|
SessionProbe::Unauthenticated
|
|
}
|
|
}
|
|
|
|
/// Outcome of classifying a chapter (reader) page response.
///
/// Reader pages don't render `#logo`, so [`classify_probe`] can't be
/// reused for them; the chapter-specific marker is `a#pic_container`
/// (asserted by the reader-page parser at `parse_chapter_pages`).
/// Values of this type are produced by [`classify_chapter_probe`], which
/// lets a broken-page body win over any selector match so that a
/// site-wide 5xx which still renders the avatar widget never reaches `Ok`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChapterProbe {
    /// `a#pic_container` is present — the reader rendered. Whether
    /// `#avatar_menu` is also present is informational only: if the
    /// reader loaded, the session is by definition still good.
    Ok,
    /// The site rendered a "logged out" / "please log in" page: no
    /// reader, no broken-page body, and no avatar widget. This is the
    /// genuine expired-session case, as opposed to a transient hiccup.
    Unauthenticated,
    /// Broken-page body, or the reader didn't render although the user
    /// is still logged in (avatar widget present). The caller should
    /// retry rather than blame the session.
    Transient,
}
|
|
|
|
pub fn classify_chapter_probe(html: &str) -> ChapterProbe {
|
|
if is_broken_page_body(html) {
|
|
return ChapterProbe::Transient;
|
|
}
|
|
let doc = scraper::Html::parse_document(html);
|
|
let container = scraper::Selector::parse("a#pic_container").unwrap();
|
|
if doc.select(&container).next().is_some() {
|
|
return ChapterProbe::Ok;
|
|
}
|
|
let avatar = scraper::Selector::parse("#avatar_menu").unwrap();
|
|
if doc.select(&avatar).next().is_some() {
|
|
// Logged-in user, but the reader didn't render — most likely
|
|
// the layout shifted or the site is serving an interstitial.
|
|
ChapterProbe::Transient
|
|
} else {
|
|
// No reader, no avatar, no broken-body marker — site rendered
|
|
// the "please log in" page, which is the genuine session-
|
|
// expired signal on this route.
|
|
ChapterProbe::Unauthenticated
|
|
}
|
|
}
|
|
|
|
/// Attempt budget for transient outcomes of the startup session probe.
/// Deliberately small but non-zero: a 5-second site hiccup at startup
/// shouldn't fail the operator with "PHPSESSID expired" while the
/// session is actually fine.
const PROBE_MAX_ATTEMPTS: u32 = 3;

/// Pause inserted between consecutive probe attempts.
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
|
|
|
|
/// Navigate to `probe_url` and classify the response. Retries the probe
|
|
/// on `Transient` outcomes (broken-page body, missing `#logo`); fails
|
|
/// fast on `Unauthenticated`; returns `Ok(())` on success.
|
|
///
|
|
/// This burns one navigation per attempt against the catalog's rate
|
|
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
|
/// minutes into a backfill costs 30 minutes.
|
|
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
|
verify_session_with_recircuit(browser, probe_url, None, 0).await
|
|
}
|
|
|
|
/// Like [`verify_session`] but, when `tor` is `Some`, signals
|
|
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
|
|
/// `Unauthenticated` as recoverable (up to `tor_max_attempts` total
|
|
/// probes, calling NEWNYM between each).
|
|
///
|
|
/// `verify_session` is `verify_session_with_recircuit(..., None, _)`,
|
|
/// which collapses the `Unauthenticated` budget to 1 attempt — i.e.
|
|
/// fail-fast, exactly the pre-TOR behavior.
|
|
pub async fn verify_session_with_recircuit(
|
|
browser: &Browser,
|
|
probe_url: &str,
|
|
tor: Option<&crate::crawler::tor::TorController>,
|
|
tor_max_attempts: u32,
|
|
) -> anyhow::Result<()> {
|
|
let unauth_max_attempts = if tor.is_some() { tor_max_attempts.max(1) } else { 1 };
|
|
run_session_probe_loop(
|
|
|| fetch_probe_html(browser, probe_url),
|
|
|| async {
|
|
if let Some(t) = tor {
|
|
if let Err(e) = t.new_identity().await {
|
|
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
|
|
}
|
|
}
|
|
},
|
|
PROBE_MAX_ATTEMPTS,
|
|
unauth_max_attempts,
|
|
PROBE_RETRY_DELAY,
|
|
probe_url,
|
|
)
|
|
.await
|
|
}
|
|
|
|
/// Pure-over-IO loop body for the session probe. Generic over the
|
|
/// fetch and recircuit closures so it can be unit-tested without a
|
|
/// real browser or TOR daemon.
|
|
///
|
|
/// Both budgets count **total attempts**, including the first — so
|
|
/// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits
|
|
/// between them, and `unauth_max_attempts = 1` means "fail-fast, no
|
|
/// retry". This matches [`crate::crawler::detect::retry_on_transient`]
|
|
/// and the content-path recircuit loop.
|
|
///
|
|
/// Outcomes:
|
|
/// - `SessionProbe::Ok` → return `Ok(())`.
|
|
/// - `SessionProbe::Unauthenticated` → recircuit + retry while
|
|
/// under the unauth budget. After the cap, bail with the
|
|
/// "PHPSESSID expired" diagnostic, mentioning the attempt count so
|
|
/// a TOR-misconfig diagnosis is easier.
|
|
/// - `SessionProbe::Transient` → same shape against the transient
|
|
/// budget; bails with "site down or rate-limiting" after the cap.
|
|
async fn run_session_probe_loop<F, Fut, R, RFut>(
|
|
mut fetch_html: F,
|
|
mut recircuit: R,
|
|
transient_max_attempts: u32,
|
|
unauth_max_attempts: u32,
|
|
retry_delay: Duration,
|
|
probe_url_for_msg: &str,
|
|
) -> anyhow::Result<()>
|
|
where
|
|
F: FnMut() -> Fut,
|
|
Fut: std::future::Future<Output = anyhow::Result<String>>,
|
|
R: FnMut() -> RFut,
|
|
RFut: std::future::Future<Output = ()>,
|
|
{
|
|
debug_assert!(transient_max_attempts >= 1);
|
|
debug_assert!(unauth_max_attempts >= 1);
|
|
let mut transient_attempts = 0u32;
|
|
let mut unauth_attempts = 0u32;
|
|
loop {
|
|
let html = fetch_html().await?;
|
|
match classify_probe(&html) {
|
|
SessionProbe::Ok => {
|
|
tracing::info!(
|
|
transient_attempts,
|
|
unauth_attempts,
|
|
"session probe ok — #logo + #avatar_menu present"
|
|
);
|
|
return Ok(());
|
|
}
|
|
SessionProbe::Unauthenticated => {
|
|
unauth_attempts += 1;
|
|
if unauth_attempts >= unauth_max_attempts {
|
|
return Err(anyhow!(
|
|
"session probe failed — #avatar_menu not present at {probe_url_for_msg} \
|
|
after {unauth_attempts} attempt(s); PHPSESSID is missing, \
|
|
expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
|
));
|
|
}
|
|
tracing::warn!(
|
|
attempt = unauth_attempts,
|
|
max_attempts = unauth_max_attempts,
|
|
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
|
|
NEWNYM and retrying"
|
|
);
|
|
recircuit().await;
|
|
tokio::time::sleep(retry_delay).await;
|
|
}
|
|
SessionProbe::Transient => {
|
|
transient_attempts += 1;
|
|
if transient_attempts >= transient_max_attempts {
|
|
return Err(anyhow!(
|
|
"session probe failed — probe page at {probe_url_for_msg} returned \
|
|
a broken-page response after {transient_max_attempts} attempts. \
|
|
The site appears to be down or rate-limiting us; try again \
|
|
later before refreshing CRAWLER_PHPSESSID."
|
|
));
|
|
}
|
|
tracing::warn!(
|
|
attempt = transient_attempts,
|
|
max_attempts = transient_max_attempts,
|
|
"session probe got a transient page; recircuit + retry"
|
|
);
|
|
recircuit().await;
|
|
tokio::time::sleep(retry_delay).await;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
|
|
let page = browser
|
|
.new_page(probe_url)
|
|
.await
|
|
.with_context(|| format!("open probe page {probe_url}"))?;
|
|
crate::crawler::nav::wait_for_nav(&page)
|
|
.await
|
|
.context("wait for nav on probe")?;
|
|
// Best-effort wait for the layout marker. Timeout is fine — the
|
|
// probe classifier handles a missing `#logo` as Transient anyway,
|
|
// and the verify loop retries on Transient.
|
|
let _ = crate::crawler::nav::wait_for_selector(
|
|
&page,
|
|
"#logo",
|
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
|
)
|
|
.await;
|
|
let html = page.content().await.context("read probe html")?;
|
|
page.close().await.ok();
|
|
Ok(html)
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for the pure HTML classifiers and for the
    //! recircuit-aware probe loop (driven by mocked closures).

    use super::*;

    // registrable_domain tests live in crawler::url_utils now —
    // it's the canonical home for that helper.

    #[test]
    fn classify_probe_ok_when_logo_and_avatar_present() {
        let html = r#"<html><body>
<header><div id="logo">Target</div><div id="avatar_menu"></div></header>
</body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Ok);
    }

    #[test]
    fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
        // Real "logged out" response: site layout renders fine, just no
        // avatar widget. This is the only state that should blame the
        // session cookie.
        let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<main>Please log in.</main>
</body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Unauthenticated);
    }

    #[test]
    fn classify_probe_transient_on_broken_page_body() {
        let html = "<html><body>\
                    <p>we're sorry, the request file are not found.</p>\
                    </body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_when_logo_missing() {
        // No broken-body marker, but no site layout either — treat as
        // transient (could be a Cloudflare interstitial, a 5xx page,
        // etc.) rather than blaming the session.
        let html = "<html><body><h1>Service Unavailable</h1></body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_on_empty_response() {
        assert_eq!(classify_probe(""), SessionProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_ok_when_reader_rendered() {
        let html = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
    }

    #[test]
    fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() {
        // What a logged-out hit on a chapter URL renders: a normal
        // site layout (header etc.) with a "please log in" body, but
        // no reader and no avatar widget.
        let html = r#"
<html><body>
<header><div id="logo">Catalog</div></header>
<main>Please log in to read this chapter.</main>
</body></html>
"#;
        assert_eq!(
            classify_chapter_probe(html),
            ChapterProbe::Unauthenticated
        );
    }

    #[test]
    fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() {
        // Avatar shows the session is still valid; reader didn't
        // render — site is serving an interstitial or the layout
        // momentarily shifted. Retry, don't blame the session.
        let html = r#"
<html><body>
<header><div id="logo">Catalog</div><div id="avatar_menu"></div></header>
<main>Site maintenance — back in 5 minutes.</main>
</body></html>
"#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_transient_on_broken_page_body() {
        let html =
            "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() {
        // Regression for the original bug: the binary
        // find_element("#avatar_menu") check treated "no avatar" as
        // session-expired even when a transient hiccup was the real
        // cause. classify_chapter_probe must NOT trip on that pattern
        // when pic_container *is* present.
        let html = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
    }

    // --- run_session_probe_loop -----------------------------------------
    //
    // These tests exercise the recircuit-aware loop without a real
    // browser. The fetch and recircuit closures are mocked over
    // canned outcomes / counters.

    const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
    const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
    const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";

    #[tokio::test]
    async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
        let mut recircuits = 0u32;
        let mut fetched = 0u32;
        run_session_probe_loop(
            || {
                fetched += 1;
                async { Ok(OK_HTML.to_string()) }
            },
            || {
                recircuits += 1;
                async {}
            },
            3,
            3,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect("ok on first attempt");
        assert_eq!(fetched, 1);
        assert_eq!(recircuits, 0);
    }

    #[tokio::test]
    async fn probe_loop_unauth_then_ok_when_attempt_budget_available() {
        // Budget = 3 total attempts. Unauth on call 1, ok on call 2.
        let mut recircuits = 0u32;
        let mut call = 0u32;
        run_session_probe_loop(
            || {
                call += 1;
                let n = call;
                async move {
                    if n == 1 {
                        Ok(UNAUTH_HTML.to_string())
                    } else {
                        Ok(OK_HTML.to_string())
                    }
                }
            },
            || {
                recircuits += 1;
                async {}
            },
            3,
            3,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect("recovers after one recircuit");
        assert_eq!(call, 2);
        assert_eq!(recircuits, 1);
    }

    #[tokio::test]
    async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() {
        // Budget = 1 total attempt = no retry (matches no-TOR behavior).
        let mut recircuits = 0u32;
        let mut call = 0u32;
        let err = run_session_probe_loop(
            || {
                call += 1;
                async { Ok(UNAUTH_HTML.to_string()) }
            },
            || {
                recircuits += 1;
                async {}
            },
            3,
            1,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect_err("budget=1 → fail-fast");
        assert_eq!(call, 1, "no retry when budget is 1");
        assert_eq!(recircuits, 0);
        let msg = format!("{err:#}");
        assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
        assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() {
        let mut recircuits = 0u32;
        let mut call = 0u32;
        let err = run_session_probe_loop(
            || {
                call += 1;
                async { Ok(UNAUTH_HTML.to_string()) }
            },
            || {
                recircuits += 1;
                async {}
            },
            10, // transient budget irrelevant here
            3,  // 3 attempts total, 2 recircuits between
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect_err("exhausts unauth budget");
        assert_eq!(call, 3);
        assert_eq!(recircuits, 2);
        let msg = format!("{err:#}");
        assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_transient_repeats_until_max_then_errors() {
        let mut recircuits = 0u32;
        let mut call = 0u32;
        let err = run_session_probe_loop(
            || {
                call += 1;
                async { Ok(TRANSIENT_HTML.to_string()) }
            },
            || {
                recircuits += 1;
                async {}
            },
            3,
            1,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect_err("transient until max → fail");
        assert_eq!(call, 3);
        // Recircuit fires between attempts: 3 attempts → 2 recircuits.
        assert_eq!(recircuits, 2);
        let msg = format!("{err:#}");
        assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
    }

    #[tokio::test]
    async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
        let mut recircuits = 0u32;
        let mut call = 0u32;
        run_session_probe_loop(
            || {
                call += 1;
                let n = call;
                async move {
                    if n == 1 {
                        Ok(TRANSIENT_HTML.to_string())
                    } else {
                        Ok(OK_HTML.to_string())
                    }
                }
            },
            || {
                recircuits += 1;
                async {}
            },
            3,
            1,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect("ok on second try");
        assert_eq!(call, 2);
        assert_eq!(recircuits, 1);
    }

    #[tokio::test]
    async fn probe_loop_mixed_transient_and_unauth_use_separate_budgets() {
        // Transient → Unauthenticated → Ok with both budgets at 2. Each
        // failure kind bumps only its own counter (1 < 2 in both cases),
        // so neither cap is reached and the third fetch succeeds after
        // two recircuits.
        let mut recircuits = 0u32;
        let mut call = 0u32;
        run_session_probe_loop(
            || {
                call += 1;
                let n = call;
                async move {
                    match n {
                        1 => Ok(TRANSIENT_HTML.to_string()),
                        2 => Ok(UNAUTH_HTML.to_string()),
                        _ => Ok(OK_HTML.to_string()),
                    }
                }
            },
            || {
                recircuits += 1;
                async {}
            },
            2,
            2,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect("mixed failures stay under their separate budgets");
        assert_eq!(call, 3);
        assert_eq!(recircuits, 2);
    }

    #[tokio::test]
    async fn probe_loop_propagates_fetch_errors_immediately() {
        let mut call = 0u32;
        let err = run_session_probe_loop(
            || {
                call += 1;
                async { Err(anyhow!("nav timeout")) }
            },
            || async {},
            5,
            5,
            Duration::from_millis(0),
            "https://example/probe",
        )
        .await
        .expect_err("fetch error bubbles");
        assert_eq!(call, 1);
        assert!(format!("{err:#}").contains("nav timeout"));
    }

    #[test]
    fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
        // Defensive: if a broken-page body somehow contains an
        // #avatar_menu element (e.g. an unrelated debug page on the
        // same template), the body signature still wins.
        let html = r#"<html><body>
<p>we're sorry, the request file are not found.</p>
<div id="logo"></div>
<div id="avatar_menu"></div>
</body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }
}
|