//! PHPSESSID injection + login probe. //! //! The catalog site we crawl renders chapter pages as a single multi- //! page list only for logged-in users. We don't try to bypass the //! login (CAPTCHA wall) — instead the operator pastes their browser's //! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects //! it into Chromium *and* reqwest before the first navigation. //! //! Two things the cookie alone doesn't give us: //! 1. The cookie value is only meaningful to the *server* — we have //! no way to predict from the value alone whether it's still valid. //! `verify_session` does a navigation and inspects the probe page //! for three outcomes: broken-page response (transient — retry the //! probe), `#logo` present but `#avatar_menu` absent (genuine logout //! — bail loudly), or both present (authenticated). The earlier //! avatar-only check conflated "site is hiccuping" with "session is //! dead" and refused to start the crawler when the site had a brief //! 503. //! 2. The reqwest client (used for cover and chapter-image downloads) //! has its own cookie store; we seed it for the catalog host only. //! CDN hosts are deliberately *not* given the cookie — they serve //! image bytes by signed URLs and don't need it. use std::time::Duration; use anyhow::{anyhow, Context}; use chromiumoxide::browser::Browser; use chromiumoxide::cdp::browser_protocol::network::CookieParam; use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body}; /// Outcome of inspecting a probe-page response. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SessionProbe { /// `#logo` present and `#avatar_menu` present — session valid. Ok, /// `#logo` present but `#avatar_menu` absent — site rendered the /// normal layout for an unauthenticated visitor; refresh PHPSESSID. Unauthenticated, /// Broken-page body signature or `#logo` missing — site is hiccuping. /// Caller retries the probe rather than blaming the session. 
Transient, } /// Re-export so existing callers keep working after the helper moved /// to `crawler::url_utils`. The body lives there. pub use crate::crawler::url_utils::registrable_domain; /// Inject the PHPSESSID cookie into the browser's cookie store for the /// catalog domain. Must be called before any navigation that depends on /// authentication; subsequent navigations include the cookie /// automatically. pub async fn inject_phpsessid( browser: &Browser, sid: &str, cookie_domain: &str, ) -> anyhow::Result<()> { let cookie = CookieParam { name: "PHPSESSID".to_string(), value: sid.to_string(), url: None, domain: Some(cookie_domain.to_string()), path: Some("/".to_string()), secure: None, http_only: Some(true), same_site: None, expires: None, priority: None, same_party: None, source_scheme: None, source_port: None, partition_key: None, }; browser .set_cookies(vec![cookie]) .await .context("set PHPSESSID in chromium cookie store")?; tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser"); Ok(()) } /// Three-way classification of a probe-page response. Pure over HTML so /// it's unit-testable without a real browser. Order matters: a body /// matching the broken-page template is `Transient` even if the page /// happens to contain `#avatar_menu` HTML somewhere — trust the universal /// site signal over a stray selector match. pub fn classify_probe(html: &str) -> SessionProbe { if is_broken_page_body(html) { return SessionProbe::Transient; } let doc = scraper::Html::parse_document(html); if !has_logo_sentinel(&doc) { return SessionProbe::Transient; } let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap(); if doc.select(&avatar_sel).next().is_some() { SessionProbe::Ok } else { SessionProbe::Unauthenticated } } /// Three-way classification of a chapter page response. /// /// Reader pages don't render `#logo`, so [`classify_probe`] can't be /// reused as-is. 
The chapter-specific marker is `a#pic_container` /// (asserted by the reader-page parser at `parse_chapter_pages`). /// /// Order matters: broken-page body wins over selector matches, so a /// transient site-wide 5xx that happens to render the avatar widget /// elsewhere doesn't falsely reach `Ok`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ChapterProbe { /// `a#pic_container` present — reader rendered. Whether /// `#avatar_menu` is also there is informational; if the reader /// loaded the session is by definition still good. Ok, /// Site rendered a "logged out" or "please log in" page (no /// reader, no broken-page body, and no avatar widget either). /// Distinguishes the genuine expired-session case from a /// transient site hiccup. Unauthenticated, /// Broken-page body, or reader didn't render but the user is /// still logged in (avatar widget present). Caller should retry /// rather than blame the session. Transient, } pub fn classify_chapter_probe(html: &str) -> ChapterProbe { if is_broken_page_body(html) { return ChapterProbe::Transient; } let doc = scraper::Html::parse_document(html); let container = scraper::Selector::parse("a#pic_container").unwrap(); if doc.select(&container).next().is_some() { return ChapterProbe::Ok; } let avatar = scraper::Selector::parse("#avatar_menu").unwrap(); if doc.select(&avatar).next().is_some() { // Logged-in user, but the reader didn't render — most likely // the layout shifted or the site is serving an interstitial. ChapterProbe::Transient } else { // No reader, no avatar, no broken-body marker — site rendered // the "please log in" page, which is the genuine session- // expired signal on this route. ChapterProbe::Unauthenticated } } /// In-startup retry budget for the session probe. Small but non-zero — /// startup hitting a 5-second site hiccup shouldn't fail the operator /// with "PHPSESSID expired" when the session is actually fine. 
const PROBE_MAX_ATTEMPTS: u32 = 3;
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);

/// Open `probe_url`, classify what came back, and report whether the
/// session is usable.
///
/// `Transient` outcomes (broken-page body, missing `#logo`) are retried;
/// `Unauthenticated` fails immediately; success returns `Ok(())`.
///
/// Every attempt spends one navigation against the catalog's rate
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
/// minutes into a backfill costs 30 minutes.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
    verify_session_with_recircuit(browser, probe_url, None, 0).await
}

/// Variant of [`verify_session`] that can rotate the TOR circuit.
///
/// With `tor` set to `Some`, `SIGNAL NEWNYM` is requested between
/// retries on transient pages, and `Unauthenticated` becomes
/// recoverable too: up to `tor_max_attempts` total probes, with a
/// NEWNYM between each.
///
/// With `tor` set to `None` the `Unauthenticated` budget collapses to a
/// single attempt — fail-fast, exactly the pre-TOR behavior — and
/// `tor_max_attempts` is ignored. That is the form [`verify_session`]
/// uses.
pub async fn verify_session_with_recircuit(
    browser: &Browser,
    probe_url: &str,
    tor: Option<&crate::crawler::tor::TorController>,
    tor_max_attempts: u32,
) -> anyhow::Result<()> {
    // A budget of 0 would make no sense; clamp it to one attempt.
    let unauth_budget = match tor {
        Some(_) => tor_max_attempts.max(1),
        None => 1,
    };

    let fetch = || fetch_probe_html(browser, probe_url);

    // A NEWNYM failure is logged and otherwise ignored: the retry then
    // simply runs on the same circuit.
    let recircuit = || async {
        if let Some(controller) = tor {
            if let Err(err) = controller.new_identity().await {
                tracing::warn!(error = %err, "TOR NEWNYM failed; continuing with same circuit");
            }
        }
    };

    run_session_probe_loop(
        fetch,
        recircuit,
        PROBE_MAX_ATTEMPTS,
        unauth_budget,
        PROBE_RETRY_DELAY,
        probe_url,
    )
    .await
}

/// Pure-over-IO loop body for the session probe. Generic over the
/// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon.
/// /// Both budgets count **total attempts**, including the first — so /// `transient_max_attempts = 3` allows 3 fetches and 2 recircuits /// between them, and `unauth_max_attempts = 1` means "fail-fast, no /// retry". This matches [`crate::crawler::detect::retry_on_transient`] /// and the content-path recircuit loop. /// /// Outcomes: /// - `SessionProbe::Ok` → return `Ok(())`. /// - `SessionProbe::Unauthenticated` → recircuit + retry while /// under the unauth budget. After the cap, bail with the /// "PHPSESSID expired" diagnostic, mentioning the attempt count so /// a TOR-misconfig diagnosis is easier. /// - `SessionProbe::Transient` → same shape against the transient /// budget; bails with "site down or rate-limiting" after the cap. async fn run_session_probe_loop( mut fetch_html: F, mut recircuit: R, transient_max_attempts: u32, unauth_max_attempts: u32, retry_delay: Duration, probe_url_for_msg: &str, ) -> anyhow::Result<()> where F: FnMut() -> Fut, Fut: std::future::Future>, R: FnMut() -> RFut, RFut: std::future::Future, { debug_assert!(transient_max_attempts >= 1); debug_assert!(unauth_max_attempts >= 1); let mut transient_attempts = 0u32; let mut unauth_attempts = 0u32; loop { let html = fetch_html().await?; match classify_probe(&html) { SessionProbe::Ok => { tracing::info!( transient_attempts, unauth_attempts, "session probe ok — #logo + #avatar_menu present" ); return Ok(()); } SessionProbe::Unauthenticated => { unauth_attempts += 1; if unauth_attempts >= unauth_max_attempts { return Err(anyhow!( "session probe failed — #avatar_menu not present at {probe_url_for_msg} \ after {unauth_attempts} attempt(s); PHPSESSID is missing, \ expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run." 
)); } tracing::warn!( attempt = unauth_attempts, max_attempts = unauth_max_attempts, "session probe Unauthenticated despite PHPSESSID; signaling TOR \ NEWNYM and retrying" ); recircuit().await; tokio::time::sleep(retry_delay).await; } SessionProbe::Transient => { transient_attempts += 1; if transient_attempts >= transient_max_attempts { return Err(anyhow!( "session probe failed — probe page at {probe_url_for_msg} returned \ a broken-page response after {transient_max_attempts} attempts. \ The site appears to be down or rate-limiting us; try again \ later before refreshing CRAWLER_PHPSESSID." )); } tracing::warn!( attempt = transient_attempts, max_attempts = transient_max_attempts, "session probe got a transient page; recircuit + retry" ); recircuit().await; tokio::time::sleep(retry_delay).await; } } } } async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result { let page = browser .new_page(probe_url) .await .with_context(|| format!("open probe page {probe_url}"))?; crate::crawler::nav::wait_for_nav(&page) .await .context("wait for nav on probe")?; // Best-effort wait for the layout marker. Timeout is fine — the // probe classifier handles a missing `#logo` as Transient anyway, // and the verify loop retries on Transient. let _ = crate::crawler::nav::wait_for_selector( &page, "#logo", crate::crawler::nav::SELECTOR_TIMEOUT, ) .await; let html = page.content().await.context("read probe html")?; page.close().await.ok(); Ok(html) } #[cfg(test)] mod tests { use super::*; // registrable_domain tests live in crawler::url_utils now — // it's the canonical home for that helper. #[test] fn classify_probe_ok_when_logo_and_avatar_present() { let html = r#"
"#; assert_eq!(classify_probe(html), SessionProbe::Ok); } #[test] fn classify_probe_unauth_when_logo_present_but_avatar_absent() { // Real "logged out" response: site layout renders fine, just no // avatar widget. This is the only state that should blame the // session cookie. let html = r#"
Please log in.
"#; assert_eq!(classify_probe(html), SessionProbe::Unauthenticated); } #[test] fn classify_probe_transient_on_broken_page_body() { let html = "\

we're sorry, the request file are not found.

\ "; assert_eq!(classify_probe(html), SessionProbe::Transient); } #[test] fn classify_probe_transient_when_logo_missing() { // No broken-body marker, but no site layout either — treat as // transient (could be a Cloudflare interstitial, a 5xx page, // etc.) rather than blaming the session. let html = "

Service Unavailable

"; assert_eq!(classify_probe(html), SessionProbe::Transient); } #[test] fn classify_probe_transient_on_empty_response() { assert_eq!(classify_probe(""), SessionProbe::Transient); } #[test] fn classify_chapter_probe_ok_when_reader_rendered() { let html = r#" "#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); } #[test] fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() { // What a logged-out hit on a chapter URL renders: a normal // site layout (header etc.) with a "please log in" body, but // no reader and no avatar widget. let html = r#"
Please log in to read this chapter.
"#; assert_eq!( classify_chapter_probe(html), ChapterProbe::Unauthenticated ); } #[test] fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() { // Avatar shows the session is still valid; reader didn't // render — site is serving an interstitial or the layout // momentarily shifted. Retry, don't blame the session. let html = r#"
Site maintenance — back in 5 minutes.
"#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); } #[test] fn classify_chapter_probe_transient_on_broken_page_body() { let html = "

we're sorry, the request file are not found.

"; assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); } #[test] fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() { // Regression for the original bug: the binary // find_element("#avatar_menu") check treated "no avatar" as // session-expired even when a transient hiccup was the real // cause. classify_chapter_probe must NOT trip on that pattern // when pic_container *is* present. let html = r#" "#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); } // --- run_session_probe_loop ----------------------------------------- // // These tests exercise the recircuit-aware loop without a real // browser. The fetch and recircuit closures are mocked over Vecs of // canned outcomes / counters. const OK_HTML: &str = r#"
"#; const UNAUTH_HTML: &str = r#""#; const TRANSIENT_HTML: &str = "

we're sorry, the request file are not found.

"; #[tokio::test] async fn probe_loop_ok_on_first_attempt_does_not_recircuit() { let mut recircuits = 0u32; let mut fetched = 0u32; run_session_probe_loop( || { fetched += 1; async { Ok(OK_HTML.to_string()) } }, || { recircuits += 1; async {} }, 3, 3, Duration::from_millis(0), "https://example/probe", ) .await .expect("ok on first attempt"); assert_eq!(fetched, 1); assert_eq!(recircuits, 0); } #[tokio::test] async fn probe_loop_unauth_then_ok_when_attempt_budget_available() { // Budget = 3 total attempts. Unauth on call 1, ok on call 2. let mut recircuits = 0u32; let mut call = 0u32; run_session_probe_loop( || { call += 1; let n = call; async move { if n == 1 { Ok(UNAUTH_HTML.to_string()) } else { Ok(OK_HTML.to_string()) } } }, || { recircuits += 1; async {} }, 3, 3, Duration::from_millis(0), "https://example/probe", ) .await .expect("recovers after one recircuit"); assert_eq!(call, 2); assert_eq!(recircuits, 1); } #[tokio::test] async fn probe_loop_unauth_with_single_attempt_budget_fails_fast() { // Budget = 1 total attempt = no retry (matches no-TOR behavior). 
let mut recircuits = 0u32; let mut call = 0u32; let err = run_session_probe_loop( || { call += 1; async { Ok(UNAUTH_HTML.to_string()) } }, || { recircuits += 1; async {} }, 3, 1, Duration::from_millis(0), "https://example/probe", ) .await .expect_err("budget=1 → fail-fast"); assert_eq!(call, 1, "no retry when budget is 1"); assert_eq!(recircuits, 0); let msg = format!("{err:#}"); assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}"); assert!(msg.contains("after 1 attempt"), "expected attempt count in msg: {msg}"); } #[tokio::test] async fn probe_loop_unauth_after_exhausting_budget_emits_attempt_count() { let mut recircuits = 0u32; let mut call = 0u32; let err = run_session_probe_loop( || { call += 1; async { Ok(UNAUTH_HTML.to_string()) } }, || { recircuits += 1; async {} }, 10, // transient budget irrelevant here 3, // 3 attempts total, 2 recircuits between Duration::from_millis(0), "https://example/probe", ) .await .expect_err("exhausts unauth budget"); assert_eq!(call, 3); assert_eq!(recircuits, 2); let msg = format!("{err:#}"); assert!(msg.contains("after 3 attempt"), "expected attempt count in error, got: {msg}"); } #[tokio::test] async fn probe_loop_transient_repeats_until_max_then_errors() { let mut recircuits = 0u32; let mut call = 0u32; let err = run_session_probe_loop( || { call += 1; async { Ok(TRANSIENT_HTML.to_string()) } }, || { recircuits += 1; async {} }, 3, 1, Duration::from_millis(0), "https://example/probe", ) .await .expect_err("transient until max → fail"); assert_eq!(call, 3); // Recircuit fires between attempts: 3 attempts → 2 recircuits. 
assert_eq!(recircuits, 2); let msg = format!("{err:#}"); assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}"); } #[tokio::test] async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() { let mut recircuits = 0u32; let mut call = 0u32; run_session_probe_loop( || { call += 1; let n = call; async move { if n == 1 { Ok(TRANSIENT_HTML.to_string()) } else { Ok(OK_HTML.to_string()) } } }, || { recircuits += 1; async {} }, 3, 1, Duration::from_millis(0), "https://example/probe", ) .await .expect("ok on second try"); assert_eq!(call, 2); assert_eq!(recircuits, 1); } #[tokio::test] async fn probe_loop_propagates_fetch_errors_immediately() { let mut call = 0u32; let err = run_session_probe_loop( || { call += 1; async { Err(anyhow!("nav timeout")) } }, || async {}, 5, 5, Duration::from_millis(0), "https://example/probe", ) .await .expect_err("fetch error bubbles"); assert_eq!(call, 1); assert!(format!("{err:#}").contains("nav timeout")); } #[test] fn classify_probe_trusts_broken_body_over_stray_avatar_match() { // Defensive: if a broken-page body somehow contains an // #avatar_menu element (e.g. an unrelated debug page on the // same template), the body signature still wins. let html = r#"

we're sorry, the request file are not found.

"#; assert_eq!(classify_probe(html), SessionProbe::Transient); } }