Files
Mangalord/backend/src/crawler/session.rs
MechaCat02 8e0b638e3f fix(crawler): wait for page marker instead of fixed 1s sleep (0.36.2)
A chromium snapshot taken between the wrapper-render and row-render
phases let parse_chapter_list return Ok(vec![]) for a manga that
actually has chapters — the soft-drop branch in sync_manga_chapters
then flipped every existing chapter to dropped_at.

Add wait_for_selector to crawler::nav. navigate() now takes a CSS
marker matching the most-specific element the downstream parser will
look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER /
DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by
SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass
through because the parser's #chapter_table sentinel and the
universal broken-page body check stay in force.

Same pattern wired at the reader nav (a#pic_container) and probe
nav (#logo), replacing the implicit assumption that the post-load
JS had finished within 1 second.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-30 18:29:38 +02:00

352 lines
14 KiB
Rust

//! PHPSESSID injection + login probe.
//!
//! The catalog site we crawl renders chapter pages as a single multi-
//! page list only for logged-in users. We don't try to bypass the
//! login (CAPTCHA wall) — instead the operator pastes their browser's
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
//! it into Chromium *and* reqwest before the first navigation.
//!
//! Two things the cookie alone doesn't give us:
//! 1. The cookie value is only meaningful to the *server* — we have
//! no way to predict from the value alone whether it's still valid.
//! `verify_session` does a navigation and inspects the probe page
//! for three outcomes: broken-page response (transient — retry the
//! probe), `#logo` present but `#avatar_menu` absent (genuine logout
//! — bail loudly), or both present (authenticated). The earlier
//! avatar-only check conflated "site is hiccuping" with "session is
//! dead" and refused to start the crawler when the site had a brief
//! 503.
//! 2. The reqwest client (used for cover and chapter-image downloads)
//! has its own cookie store; we seed it for the catalog host only.
//! CDN hosts are deliberately *not* given the cookie — they serve
//! image bytes by signed URLs and don't need it.
use std::time::Duration;
use anyhow::{anyhow, Context};
use chromiumoxide::browser::Browser;
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
/// Outcome of inspecting a probe-page response.
///
/// Produced by [`classify_probe`] and acted on by [`verify_session`]:
/// `Ok` ends the probe successfully, `Unauthenticated` fails it
/// immediately, and `Transient` triggers a retry.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
/// `#logo` present and `#avatar_menu` present — session valid.
Ok,
/// `#logo` present but `#avatar_menu` absent — site rendered the
/// normal layout for an unauthenticated visitor; refresh PHPSESSID.
Unauthenticated,
/// Broken-page body signature or `#logo` missing — site is hiccuping.
/// Caller retries the probe rather than blaming the session.
Transient,
}
/// Re-export so existing callers keep working after the helper moved
/// to `crawler::url_utils`. The body lives there.
pub use crate::crawler::url_utils::registrable_domain;
/// Seed Chromium's cookie store with the operator-supplied `PHPSESSID`
/// so that every later navigation to the catalog is authenticated.
///
/// Call this before the first navigation that needs a logged-in view.
///
/// * `browser` — the browser whose cookie store receives the cookie.
/// * `sid` — the raw `PHPSESSID` value.
/// * `cookie_domain` — the domain the cookie is scoped to.
///
/// # Errors
///
/// Returns an error (with context) if the browser rejects the
/// `set_cookies` call.
pub async fn inject_phpsessid(
    browser: &Browser,
    sid: &str,
    cookie_domain: &str,
) -> anyhow::Result<()> {
    let session_cookie = CookieParam {
        // The fields we actually care about.
        name: String::from("PHPSESSID"),
        value: sid.to_owned(),
        domain: Some(cookie_domain.to_owned()),
        path: Some(String::from("/")),
        http_only: Some(true),
        // Everything else is left for the browser to default.
        url: None,
        secure: None,
        same_site: None,
        expires: None,
        priority: None,
        same_party: None,
        source_scheme: None,
        source_port: None,
        partition_key: None,
    };

    let stored = browser.set_cookies(vec![session_cookie]).await;
    stored.context("set PHPSESSID in chromium cookie store")?;

    tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
    Ok(())
}
/// Classify the HTML of a probe page into one of the three
/// [`SessionProbe`] outcomes.
///
/// The function is pure over its input, so it can be unit-tested
/// without a browser. Checks run in a fixed order:
///
/// 1. A broken-page body is `Transient`, even if the markup happens to
///    contain `#avatar_menu` — the site-wide signal outranks a stray
///    selector match.
/// 2. A page without the `#logo` sentinel is `Transient`.
/// 3. Otherwise `#avatar_menu` decides between `Ok` (present) and
///    `Unauthenticated` (absent).
pub fn classify_probe(html: &str) -> SessionProbe {
    if is_broken_page_body(html) {
        return SessionProbe::Transient;
    }

    let document = scraper::Html::parse_document(html);
    if !has_logo_sentinel(&document) {
        return SessionProbe::Transient;
    }

    let avatar_menu = scraper::Selector::parse("#avatar_menu").unwrap();
    match document.select(&avatar_menu).next() {
        Some(_) => SessionProbe::Ok,
        None => SessionProbe::Unauthenticated,
    }
}
/// Three-way classification of a chapter page response.
///
/// Reader pages don't render `#logo`, so [`classify_probe`] can't be
/// reused as-is. The chapter-specific marker is `a#pic_container`
/// (asserted by the reader-page parser at `parse_chapter_pages`).
///
/// Order matters: broken-page body wins over selector matches, so a
/// transient site-wide 5xx that happens to render the avatar widget
/// elsewhere doesn't falsely reach `Ok`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChapterProbe {
/// `a#pic_container` present — reader rendered. Whether
/// `#avatar_menu` is also there is informational; if the reader
/// loaded the session is by definition still good.
Ok,
/// Site rendered a "logged out" or "please log in" page (no
/// reader, no broken-page body, and no avatar widget either).
/// Distinguishes the genuine expired-session case from a
/// transient site hiccup.
Unauthenticated,
/// Broken-page body, or reader didn't render but the user is
/// still logged in (avatar widget present). Caller should retry
/// rather than blame the session.
Transient,
}
pub fn classify_chapter_probe(html: &str) -> ChapterProbe {
if is_broken_page_body(html) {
return ChapterProbe::Transient;
}
let doc = scraper::Html::parse_document(html);
let container = scraper::Selector::parse("a#pic_container").unwrap();
if doc.select(&container).next().is_some() {
return ChapterProbe::Ok;
}
let avatar = scraper::Selector::parse("#avatar_menu").unwrap();
if doc.select(&avatar).next().is_some() {
// Logged-in user, but the reader didn't render — most likely
// the layout shifted or the site is serving an interstitial.
ChapterProbe::Transient
} else {
// No reader, no avatar, no broken-body marker — site rendered
// the "please log in" page, which is the genuine session-
// expired signal on this route.
ChapterProbe::Unauthenticated
}
}
/// In-startup retry budget for the session probe. Small but non-zero —
/// startup hitting a 5-second site hiccup shouldn't fail the operator
/// with "PHPSESSID expired" when the session is actually fine.
///
/// This is the total number of probe navigations, counting the first
/// one, that `verify_session` makes before giving up on a page that
/// keeps classifying as `Transient`.
const PROBE_MAX_ATTEMPTS: u32 = 3;
/// Pause between two probe attempts after a `Transient` outcome.
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
/// Check that the injected session is still accepted by the site.
///
/// Each attempt loads `probe_url`, classifies the rendered HTML with
/// [`classify_probe`], and reacts to the outcome:
///
/// * `Ok` — returns `Ok(())`.
/// * `Unauthenticated` — fails immediately; retrying cannot help.
/// * `Transient` — sleeps for `PROBE_RETRY_DELAY` and tries again,
///   until `PROBE_MAX_ATTEMPTS` attempts have been made, then fails.
///
/// Every attempt costs one navigation against the catalog's rate
/// limiter. That is a deliberate trade: a failure here is cheap, a
/// failure deep into a backfill is not.
///
/// # Errors
///
/// Besides the two classification failures above, any error from
/// fetching the probe page is propagated as-is, without a retry.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
    let mut tries: u32 = 0;
    loop {
        tries += 1;
        let html = fetch_probe_html(browser, probe_url).await?;

        match classify_probe(&html) {
            SessionProbe::Ok => {
                tracing::info!(
                    attempt = tries,
                    "session probe ok — #logo + #avatar_menu present"
                );
                return Ok(());
            }
            SessionProbe::Unauthenticated => anyhow::bail!(
                "session probe failed — #avatar_menu not present at {probe_url} \
                 (page rendered the normal layout); PHPSESSID is missing, expired, \
                 or revoked. Refresh CRAWLER_PHPSESSID and re-run."
            ),
            SessionProbe::Transient => {
                // Budget exhausted: report the site, not the session.
                if tries >= PROBE_MAX_ATTEMPTS {
                    anyhow::bail!(
                        "session probe failed — probe page at {probe_url} returned a \
                         broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
                         The site appears to be down or rate-limiting us; try again \
                         later before refreshing CRAWLER_PHPSESSID."
                    );
                }
                tracing::warn!(
                    attempt = tries,
                    max_attempts = PROBE_MAX_ATTEMPTS,
                    "session probe got a transient page; retrying"
                );
                tokio::time::sleep(PROBE_RETRY_DELAY).await;
            }
        }
    }
}
/// Open `probe_url` in a fresh tab, wait for it to settle, and return
/// the rendered HTML.
///
/// Once the tab has been opened it is closed on every exit path,
/// including failures while waiting for navigation or reading the
/// content, so a failed probe does not leave a stray tab behind.
///
/// # Errors
///
/// Returns an error (with context) if the tab cannot be opened, if
/// waiting for navigation fails, or if the page content cannot be
/// read. A timeout while waiting for `#logo` and a failure to close
/// the tab are both deliberately ignored.
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
    let page = browser
        .new_page(probe_url)
        .await
        .with_context(|| format!("open probe page {probe_url}"))?;

    // Run the fallible steps as one unit so the tab can be closed
    // afterwards whether they succeeded or not.
    let outcome: anyhow::Result<String> = async {
        crate::crawler::nav::wait_for_nav(&page)
            .await
            .context("wait for nav on probe")?;
        // Best-effort wait for the layout marker. Timeout is fine — the
        // probe classifier handles a missing `#logo` as Transient anyway,
        // and the verify loop retries on Transient.
        let _ = crate::crawler::nav::wait_for_selector(
            &page,
            "#logo",
            crate::crawler::nav::SELECTOR_TIMEOUT,
        )
        .await;
        page.content().await.context("read probe html")
    }
    .await;

    // Closing is best-effort: a failure here must not mask `outcome`.
    page.close().await.ok();
    outcome
}
// Unit tests for the two pure HTML classifiers, `classify_probe` and
// `classify_chapter_probe`. Each test feeds a small hand-written HTML
// fixture and asserts the resulting variant; no browser is involved.
#[cfg(test)]
mod tests {
use super::*;
// registrable_domain tests live in crawler::url_utils now —
// it's the canonical home for that helper.
// Both sentinels present: the authenticated case.
#[test]
fn classify_probe_ok_when_logo_and_avatar_present() {
let html = r#"<html><body>
<header><div id="logo">Target</div><div id="avatar_menu"></div></header>
</body></html>"#;
assert_eq!(classify_probe(html), SessionProbe::Ok);
}
#[test]
fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
// Real "logged out" response: site layout renders fine, just no
// avatar widget. This is the only state that should blame the
// session cookie.
let html = r#"<html><body>
<header><div id="logo">Target</div></header>
<main>Please log in.</main>
</body></html>"#;
assert_eq!(classify_probe(html), SessionProbe::Unauthenticated);
}
// The broken-page body text alone is enough to classify as Transient.
#[test]
fn classify_probe_transient_on_broken_page_body() {
let html = "<html><body>\
<p>we're sorry, the request file are not found.</p>\
</body></html>";
assert_eq!(classify_probe(html), SessionProbe::Transient);
}
#[test]
fn classify_probe_transient_when_logo_missing() {
// No broken-body marker, but no site layout either — treat as
// transient (could be a Cloudflare interstitial, a 5xx page,
// etc.) rather than blaming the session.
let html = "<html><body><h1>Service Unavailable</h1></body></html>";
assert_eq!(classify_probe(html), SessionProbe::Transient);
}
// An empty document has no `#logo`, so it must not blame the session.
#[test]
fn classify_probe_transient_on_empty_response() {
assert_eq!(classify_probe(""), SessionProbe::Transient);
}
// Reader marker present: Ok, with no avatar widget needed.
#[test]
fn classify_chapter_probe_ok_when_reader_rendered() {
let html = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
}
#[test]
fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() {
// What a logged-out hit on a chapter URL renders: a normal
// site layout (header etc.) with a "please log in" body, but
// no reader and no avatar widget.
let html = r#"
<html><body>
<header><div id="logo">Catalog</div></header>
<main>Please log in to read this chapter.</main>
</body></html>
"#;
assert_eq!(
classify_chapter_probe(html),
ChapterProbe::Unauthenticated
);
}
#[test]
fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() {
// Avatar shows the session is still valid; reader didn't
// render — site is serving an interstitial or the layout
// momentarily shifted. Retry, don't blame the session.
let html = r#"
<html><body>
<header><div id="logo">Catalog</div><div id="avatar_menu"></div></header>
<main>Site maintenance — back in 5 minutes.</main>
</body></html>
"#;
assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
}
// Broken-page body is checked before any selector on the chapter route too.
#[test]
fn classify_chapter_probe_transient_on_broken_page_body() {
let html =
"<html><body><p>we're sorry, the request file are not found.</p></body></html>";
assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
}
#[test]
fn classify_chapter_probe_does_not_misfire_on_avatar_alone_without_reader() {
// Regression for the original bug: the binary
// find_element("#avatar_menu") check treated "no avatar" as
// session-expired even when the page was otherwise fine.
// The fixture below has the reader (pic_container) but NO
// avatar widget; classify_chapter_probe must still return Ok,
// because a rendered reader is the success signal on this
// route regardless of the avatar.
let html = r#"
<html><body>
<a id="pic_container">
<img id="page1" src="https://cdn/1.jpg">
</a>
</body></html>
"#;
assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
}
#[test]
fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
// Defensive: if a broken-page body somehow contains an
// #avatar_menu element (e.g. an unrelated debug page on the
// same template), the body signature still wins.
let html = r#"<html><body>
<p>we're sorry, the request file are not found.</p>
<div id="logo"></div>
<div id="avatar_menu"></div>
</body></html>"#;
assert_eq!(classify_probe(html), SessionProbe::Transient);
}
}