//! PHPSESSID injection + login probe. //! //! The catalog site we crawl renders chapter pages as a single multi- //! page list only for logged-in users. We don't try to bypass the //! login (CAPTCHA wall) — instead the operator pastes their browser's //! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects //! it into Chromium *and* reqwest before the first navigation. //! //! Two things the cookie alone doesn't give us: //! 1. The cookie value is only meaningful to the *server* — we have //! no way to predict from the value alone whether it's still valid. //! `verify_session` does a navigation and inspects the probe page //! for three outcomes: broken-page response (transient — retry the //! probe), `#logo` present but `#avatar_menu` absent (genuine logout //! — bail loudly), or both present (authenticated). The earlier //! avatar-only check conflated "site is hiccuping" with "session is //! dead" and refused to start the crawler when the site had a brief //! 503. //! 2. The reqwest client (used for cover and chapter-image downloads) //! has its own cookie store; we seed it for the catalog host only. //! CDN hosts are deliberately *not* given the cookie — they serve //! image bytes by signed URLs and don't need it. use std::time::Duration; use anyhow::{anyhow, Context}; use chromiumoxide::browser::Browser; use chromiumoxide::cdp::browser_protocol::network::CookieParam; use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body}; /// Outcome of inspecting a probe-page response. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum SessionProbe { /// `#logo` present and `#avatar_menu` present — session valid. Ok, /// `#logo` present but `#avatar_menu` absent — site rendered the /// normal layout for an unauthenticated visitor; refresh PHPSESSID. Unauthenticated, /// Broken-page body signature or `#logo` missing — site is hiccuping. /// Caller retries the probe rather than blaming the session. Transient, } /// Re-export so existing callers keep working after the helper moved /// to `crawler::url_utils`. The body lives there. pub use crate::crawler::url_utils::registrable_domain; /// Inject the PHPSESSID cookie into the browser's cookie store for the /// catalog domain. Must be called before any navigation that depends on /// authentication; subsequent navigations include the cookie /// automatically. pub async fn inject_phpsessid( browser: &Browser, sid: &str, cookie_domain: &str, ) -> anyhow::Result<()> { let cookie = CookieParam { name: "PHPSESSID".to_string(), value: sid.to_string(), url: None, domain: Some(cookie_domain.to_string()), path: Some("/".to_string()), secure: None, http_only: Some(true), same_site: None, expires: None, priority: None, same_party: None, source_scheme: None, source_port: None, partition_key: None, }; browser .set_cookies(vec![cookie]) .await .context("set PHPSESSID in chromium cookie store")?; tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser"); Ok(()) } /// Three-way classification of a probe-page response. Pure over HTML so /// it's unit-testable without a real browser. Order matters: a body /// matching the broken-page template is `Transient` even if the page /// happens to contain `#avatar_menu` HTML somewhere — trust the universal /// site signal over a stray selector match. pub fn classify_probe(html: &str) -> SessionProbe { if is_broken_page_body(html) { return SessionProbe::Transient; } let doc = scraper::Html::parse_document(html); if !has_logo_sentinel(&doc) { return SessionProbe::Transient; } let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap(); if doc.select(&avatar_sel).next().is_some() { SessionProbe::Ok } else { SessionProbe::Unauthenticated } } /// Three-way classification of a chapter page response. /// /// Reader pages don't render `#logo`, so [`classify_probe`] can't be /// reused as-is. The chapter-specific marker is `a#pic_container` /// (asserted by the reader-page parser at `parse_chapter_pages`). /// /// Order matters: broken-page body wins over selector matches, so a /// transient site-wide 5xx that happens to render the avatar widget /// elsewhere doesn't falsely reach `Ok`. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum ChapterProbe { /// `a#pic_container` present — reader rendered. Whether /// `#avatar_menu` is also there is informational; if the reader /// loaded the session is by definition still good. Ok, /// Site rendered a "logged out" or "please log in" page (no /// reader, no broken-page body, and no avatar widget either). /// Distinguishes the genuine expired-session case from a /// transient site hiccup. Unauthenticated, /// Broken-page body, or reader didn't render but the user is /// still logged in (avatar widget present). Caller should retry /// rather than blame the session. Transient, } pub fn classify_chapter_probe(html: &str) -> ChapterProbe { if is_broken_page_body(html) { return ChapterProbe::Transient; } let doc = scraper::Html::parse_document(html); let container = scraper::Selector::parse("a#pic_container").unwrap(); if doc.select(&container).next().is_some() { return ChapterProbe::Ok; } let avatar = scraper::Selector::parse("#avatar_menu").unwrap(); if doc.select(&avatar).next().is_some() { // Logged-in user, but the reader didn't render — most likely // the layout shifted or the site is serving an interstitial. ChapterProbe::Transient } else { // No reader, no avatar, no broken-body marker — site rendered // the "please log in" page, which is the genuine session- // expired signal on this route. ChapterProbe::Unauthenticated } } /// In-startup retry budget for the session probe. Small but non-zero — /// startup hitting a 5-second site hiccup shouldn't fail the operator /// with "PHPSESSID expired" when the session is actually fine. const PROBE_MAX_ATTEMPTS: u32 = 3; const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2); /// Navigate to `probe_url` and classify the response. Retries the probe /// on `Transient` outcomes (broken-page body, missing `#logo`); fails /// fast on `Unauthenticated`; returns `Ok(())` on success. /// /// This burns one navigation per attempt against the catalog's rate /// limiter. The trade is worth it — failing here costs ~1s; failing 30 /// minutes into a backfill costs 30 minutes. pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> { let mut attempt = 0u32; loop { attempt += 1; let html = fetch_probe_html(browser, probe_url).await?; match classify_probe(&html) { SessionProbe::Ok => { tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present"); return Ok(()); } SessionProbe::Unauthenticated => { return Err(anyhow!( "session probe failed — #avatar_menu not present at {probe_url} \ (page rendered the normal layout); PHPSESSID is missing, expired, \ or revoked. Refresh CRAWLER_PHPSESSID and re-run." )); } SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => { tracing::warn!( attempt, max_attempts = PROBE_MAX_ATTEMPTS, "session probe got a transient page; retrying" ); tokio::time::sleep(PROBE_RETRY_DELAY).await; } SessionProbe::Transient => { return Err(anyhow!( "session probe failed — probe page at {probe_url} returned a \ broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \ The site appears to be down or rate-limiting us; try again \ later before refreshing CRAWLER_PHPSESSID." )); } } } } async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result { let page = browser .new_page(probe_url) .await .with_context(|| format!("open probe page {probe_url}"))?; crate::crawler::nav::wait_for_nav(&page) .await .context("wait for nav on probe")?; // Best-effort wait for the layout marker. Timeout is fine — the // probe classifier handles a missing `#logo` as Transient anyway, // and the verify loop retries on Transient. let _ = crate::crawler::nav::wait_for_selector( &page, "#logo", crate::crawler::nav::SELECTOR_TIMEOUT, ) .await; let html = page.content().await.context("read probe html")?; page.close().await.ok(); Ok(html) } #[cfg(test)] mod tests { use super::*; // registrable_domain tests live in crawler::url_utils now — // it's the canonical home for that helper. #[test] fn classify_probe_ok_when_logo_and_avatar_present() { let html = r#"
"#; assert_eq!(classify_probe(html), SessionProbe::Ok); } #[test] fn classify_probe_unauth_when_logo_present_but_avatar_absent() { // Real "logged out" response: site layout renders fine, just no // avatar widget. This is the only state that should blame the // session cookie. let html = r#"
Please log in.
"#; assert_eq!(classify_probe(html), SessionProbe::Unauthenticated); } #[test] fn classify_probe_transient_on_broken_page_body() { let html = "\

we're sorry, the request file are not found.

\ "; assert_eq!(classify_probe(html), SessionProbe::Transient); } #[test] fn classify_probe_transient_when_logo_missing() { // No broken-body marker, but no site layout either — treat as // transient (could be a Cloudflare interstitial, a 5xx page, // etc.) rather than blaming the session. let html = "

Service Unavailable

"; assert_eq!(classify_probe(html), SessionProbe::Transient); } #[test] fn classify_probe_transient_on_empty_response() { assert_eq!(classify_probe(""), SessionProbe::Transient); } #[test] fn classify_chapter_probe_ok_when_reader_rendered() { let html = r#" "#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); } #[test] fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() { // What a logged-out hit on a chapter URL renders: a normal // site layout (header etc.) with a "please log in" body, but // no reader and no avatar widget. let html = r#"
Please log in to read this chapter.
"#; assert_eq!( classify_chapter_probe(html), ChapterProbe::Unauthenticated ); } #[test] fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() { // Avatar shows the session is still valid; reader didn't // render — site is serving an interstitial or the layout // momentarily shifted. Retry, don't blame the session. let html = r#"
Site maintenance — back in 5 minutes.
"#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); } #[test] fn classify_chapter_probe_transient_on_broken_page_body() { let html = "

we're sorry, the request file are not found.

"; assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient); } #[test] fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() { // Regression for the original bug: the binary // find_element("#avatar_menu") check treated "no avatar" as // session-expired even when a transient hiccup was the real // cause. classify_chapter_probe must NOT trip on that pattern // when pic_container *is* present. let html = r#" "#; assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); } #[test] fn classify_probe_trusts_broken_body_over_stray_avatar_match() { // Defensive: if a broken-page body somehow contains an // #avatar_menu element (e.g. an unrelated debug page on the // same template), the body signature still wins. let html = r#"

we're sorry, the request file are not found.

"#; assert_eq!(classify_probe(html), SessionProbe::Transient); } }