A chromium snapshot taken between the wrapper-render and row-render phases let parse_chapter_list return Ok(vec![]) for a manga that actually has chapters — the soft-drop branch in sync_manga_chapters then flipped every existing chapter to dropped_at. Add wait_for_selector to crawler::nav. navigate() now takes a CSS marker matching the most-specific element the downstream parser will look for (one of LIST_PAGE_MARKER / DETAIL_PAGE_CHAPTERS_MARKER / DETAIL_PAGE_LAYOUT_MARKER). The wait is best-effort and capped by SELECTOR_TIMEOUT (10s); a legitimately empty page can still pass through because the parser's #chapter_table sentinel and the universal broken-page body check stay in force. Same pattern wired at the reader nav (a#pic_container) and probe nav (#logo), replacing the implicit assumption that the post-load JS had finished within 1 second. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
352 lines
14 KiB
Rust
352 lines
14 KiB
Rust
//! PHPSESSID injection + login probe.
|
|
//!
|
|
//! The catalog site we crawl renders chapter pages as a single multi-
|
|
//! page list only for logged-in users. We don't try to bypass the
|
|
//! login (CAPTCHA wall) — instead the operator pastes their browser's
|
|
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
|
|
//! it into Chromium *and* reqwest before the first navigation.
|
|
//!
|
|
//! Two things the cookie alone doesn't give us:
|
|
//! 1. The cookie value is only meaningful to the *server* — we have
|
|
//! no way to predict from the value alone whether it's still valid.
|
|
//! `verify_session` does a navigation and inspects the probe page
|
|
//! for three outcomes: broken-page response (transient — retry the
|
|
//! probe), `#logo` present but `#avatar_menu` absent (genuine logout
|
|
//! — bail loudly), or both present (authenticated). The earlier
|
|
//! avatar-only check conflated "site is hiccuping" with "session is
|
|
//! dead" and refused to start the crawler when the site had a brief
|
|
//! 503.
|
|
//! 2. The reqwest client (used for cover and chapter-image downloads)
|
|
//! has its own cookie store; we seed it for the catalog host only.
|
|
//! CDN hosts are deliberately *not* given the cookie — they serve
|
|
//! image bytes by signed URLs and don't need it.
|
|
|
|
use std::time::Duration;
|
|
|
|
use anyhow::{anyhow, Context};
|
|
use chromiumoxide::browser::Browser;
|
|
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
|
|
|
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
|
|
|
|
/// Outcome of inspecting a probe-page response.
///
/// Produced by [`classify_probe`] and consumed by [`verify_session`],
/// which returns success on `Ok`, fails immediately on
/// `Unauthenticated`, and retries on `Transient`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
    /// `#logo` present and `#avatar_menu` present — session valid.
    Ok,
    /// `#logo` present but `#avatar_menu` absent — site rendered the
    /// normal layout for an unauthenticated visitor; refresh PHPSESSID.
    Unauthenticated,
    /// Broken-page body signature or `#logo` missing — site is hiccuping.
    /// Caller retries the probe rather than blaming the session.
    Transient,
}
|
|
|
|
/// Re-export so existing callers keep working after the helper moved
|
|
/// to `crawler::url_utils`. The body lives there.
|
|
pub use crate::crawler::url_utils::registrable_domain;
|
|
|
|
/// Inject the PHPSESSID cookie into the browser's cookie store for the
|
|
/// catalog domain. Must be called before any navigation that depends on
|
|
/// authentication; subsequent navigations include the cookie
|
|
/// automatically.
|
|
pub async fn inject_phpsessid(
|
|
browser: &Browser,
|
|
sid: &str,
|
|
cookie_domain: &str,
|
|
) -> anyhow::Result<()> {
|
|
let cookie = CookieParam {
|
|
name: "PHPSESSID".to_string(),
|
|
value: sid.to_string(),
|
|
url: None,
|
|
domain: Some(cookie_domain.to_string()),
|
|
path: Some("/".to_string()),
|
|
secure: None,
|
|
http_only: Some(true),
|
|
same_site: None,
|
|
expires: None,
|
|
priority: None,
|
|
same_party: None,
|
|
source_scheme: None,
|
|
source_port: None,
|
|
partition_key: None,
|
|
};
|
|
browser
|
|
.set_cookies(vec![cookie])
|
|
.await
|
|
.context("set PHPSESSID in chromium cookie store")?;
|
|
tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
|
|
Ok(())
|
|
}
|
|
|
|
/// Three-way classification of a probe-page response. Pure over HTML so
|
|
/// it's unit-testable without a real browser. Order matters: a body
|
|
/// matching the broken-page template is `Transient` even if the page
|
|
/// happens to contain `#avatar_menu` HTML somewhere — trust the universal
|
|
/// site signal over a stray selector match.
|
|
pub fn classify_probe(html: &str) -> SessionProbe {
|
|
if is_broken_page_body(html) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let doc = scraper::Html::parse_document(html);
|
|
if !has_logo_sentinel(&doc) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap();
|
|
if doc.select(&avatar_sel).next().is_some() {
|
|
SessionProbe::Ok
|
|
} else {
|
|
SessionProbe::Unauthenticated
|
|
}
|
|
}
|
|
|
|
/// Three-way classification of a chapter page response, produced by
/// [`classify_chapter_probe`].
///
/// Reader pages don't render `#logo`, so [`classify_probe`] can't be
/// reused as-is. The chapter-specific marker is `a#pic_container`
/// (asserted by the reader-page parser at `parse_chapter_pages`).
///
/// Order matters: the broken-page body check wins over selector
/// matches, so a broken-page response that happens to contain a stray
/// `a#pic_container` element doesn't falsely reach `Ok`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChapterProbe {
    /// `a#pic_container` present — reader rendered. `#avatar_menu` is
    /// not consulted in this case; if the reader loaded the session is
    /// by definition still good.
    Ok,
    /// Site rendered a "logged out" or "please log in" page (no
    /// reader, no broken-page body, and no avatar widget either).
    /// Distinguishes the genuine expired-session case from a
    /// transient site hiccup.
    Unauthenticated,
    /// Broken-page body, or reader didn't render but the user is
    /// still logged in (avatar widget present). Caller should retry
    /// rather than blame the session.
    Transient,
}
|
|
|
|
pub fn classify_chapter_probe(html: &str) -> ChapterProbe {
|
|
if is_broken_page_body(html) {
|
|
return ChapterProbe::Transient;
|
|
}
|
|
let doc = scraper::Html::parse_document(html);
|
|
let container = scraper::Selector::parse("a#pic_container").unwrap();
|
|
if doc.select(&container).next().is_some() {
|
|
return ChapterProbe::Ok;
|
|
}
|
|
let avatar = scraper::Selector::parse("#avatar_menu").unwrap();
|
|
if doc.select(&avatar).next().is_some() {
|
|
// Logged-in user, but the reader didn't render — most likely
|
|
// the layout shifted or the site is serving an interstitial.
|
|
ChapterProbe::Transient
|
|
} else {
|
|
// No reader, no avatar, no broken-body marker — site rendered
|
|
// the "please log in" page, which is the genuine session-
|
|
// expired signal on this route.
|
|
ChapterProbe::Unauthenticated
|
|
}
|
|
}
|
|
|
|
/// In-startup retry budget for the session probe: the maximum number
/// of probe attempts `verify_session` makes before giving up on
/// `Transient` results. Small but non-zero — startup hitting a
/// 5-second site hiccup shouldn't fail the operator with "PHPSESSID
/// expired" when the session is actually fine.
const PROBE_MAX_ATTEMPTS: u32 = 3;
/// Pause between a `Transient` probe result and the next attempt.
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
|
|
|
|
/// Navigate to `probe_url` and classify the response. Retries the probe
|
|
/// on `Transient` outcomes (broken-page body, missing `#logo`); fails
|
|
/// fast on `Unauthenticated`; returns `Ok(())` on success.
|
|
///
|
|
/// This burns one navigation per attempt against the catalog's rate
|
|
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
|
/// minutes into a backfill costs 30 minutes.
|
|
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
|
let mut attempt = 0u32;
|
|
loop {
|
|
attempt += 1;
|
|
let html = fetch_probe_html(browser, probe_url).await?;
|
|
match classify_probe(&html) {
|
|
SessionProbe::Ok => {
|
|
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
|
|
return Ok(());
|
|
}
|
|
SessionProbe::Unauthenticated => {
|
|
return Err(anyhow!(
|
|
"session probe failed — #avatar_menu not present at {probe_url} \
|
|
(page rendered the normal layout); PHPSESSID is missing, expired, \
|
|
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
|
));
|
|
}
|
|
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
|
|
tracing::warn!(
|
|
attempt,
|
|
max_attempts = PROBE_MAX_ATTEMPTS,
|
|
"session probe got a transient page; retrying"
|
|
);
|
|
tokio::time::sleep(PROBE_RETRY_DELAY).await;
|
|
}
|
|
SessionProbe::Transient => {
|
|
return Err(anyhow!(
|
|
"session probe failed — probe page at {probe_url} returned a \
|
|
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
|
|
The site appears to be down or rate-limiting us; try again \
|
|
later before refreshing CRAWLER_PHPSESSID."
|
|
));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
|
|
let page = browser
|
|
.new_page(probe_url)
|
|
.await
|
|
.with_context(|| format!("open probe page {probe_url}"))?;
|
|
crate::crawler::nav::wait_for_nav(&page)
|
|
.await
|
|
.context("wait for nav on probe")?;
|
|
// Best-effort wait for the layout marker. Timeout is fine — the
|
|
// probe classifier handles a missing `#logo` as Transient anyway,
|
|
// and the verify loop retries on Transient.
|
|
let _ = crate::crawler::nav::wait_for_selector(
|
|
&page,
|
|
"#logo",
|
|
crate::crawler::nav::SELECTOR_TIMEOUT,
|
|
)
|
|
.await;
|
|
let html = page.content().await.context("read probe html")?;
|
|
page.close().await.ok();
|
|
Ok(html)
|
|
}
|
|
|
|
/// Unit tests for the two pure HTML classifiers. No browser is needed:
/// every case feeds a small HTML fixture straight into the classifier.
#[cfg(test)]
mod tests {
    use super::*;

    // registrable_domain tests live in crawler::url_utils now —
    // it's the canonical home for that helper.

    #[test]
    fn classify_probe_ok_when_logo_and_avatar_present() {
        let html = r#"<html><body>
            <header><div id="logo">Target</div><div id="avatar_menu"></div></header>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Ok);
    }

    #[test]
    fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
        // Real "logged out" response: site layout renders fine, just no
        // avatar widget. This is the only state that should blame the
        // session cookie.
        let html = r#"<html><body>
            <header><div id="logo">Target</div></header>
            <main>Please log in.</main>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Unauthenticated);
    }

    #[test]
    fn classify_probe_transient_on_broken_page_body() {
        let html = "<html><body>\
                    <p>we're sorry, the request file are not found.</p>\
                    </body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_when_logo_missing() {
        // No broken-body marker, but no site layout either — treat as
        // transient (could be a Cloudflare interstitial, a 5xx page,
        // etc.) rather than blaming the session.
        let html = "<html><body><h1>Service Unavailable</h1></body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_on_empty_response() {
        assert_eq!(classify_probe(""), SessionProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_ok_when_reader_rendered() {
        let html = r#"
            <html><body>
              <a id="pic_container">
                <img id="page1" src="https://cdn/1.jpg">
              </a>
            </body></html>
        "#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
    }

    #[test]
    fn classify_chapter_probe_unauthenticated_when_no_reader_and_no_avatar() {
        // What a logged-out hit on a chapter URL renders: a normal
        // site layout (header etc.) with a "please log in" body, but
        // no reader and no avatar widget.
        let html = r#"
            <html><body>
              <header><div id="logo">Catalog</div></header>
              <main>Please log in to read this chapter.</main>
            </body></html>
        "#;
        assert_eq!(
            classify_chapter_probe(html),
            ChapterProbe::Unauthenticated
        );
    }

    #[test]
    fn classify_chapter_probe_transient_when_logged_in_but_reader_missing() {
        // Avatar shows the session is still valid; reader didn't
        // render — site is serving an interstitial or the layout
        // momentarily shifted. Retry, don't blame the session.
        let html = r#"
            <html><body>
              <header><div id="logo">Catalog</div><div id="avatar_menu"></div></header>
              <main>Site maintenance — back in 5 minutes.</main>
            </body></html>
        "#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_transient_on_broken_page_body() {
        let html =
            "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
    }

    #[test]
    fn classify_chapter_probe_ok_when_reader_present_without_avatar() {
        // Regression for the original bug: the binary
        // find_element("#avatar_menu") check treated "no avatar" as
        // session-expired even when a transient hiccup was the real
        // cause. classify_chapter_probe must NOT trip on a missing
        // avatar when pic_container *is* present.
        let html = r#"
            <html><body>
              <a id="pic_container">
                <img id="page1" src="https://cdn/1.jpg">
              </a>
            </body></html>
        "#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
    }

    #[test]
    fn classify_chapter_probe_trusts_broken_body_over_stray_reader_match() {
        // Defensive: the broken-page signature is checked before any
        // selector, so a broken page that happens to contain an
        // a#pic_container element must still be Transient, not Ok.
        let html = r#"<html><body>
            <p>we're sorry, the request file are not found.</p>
            <a id="pic_container"></a>
        </body></html>"#;
        assert_eq!(classify_chapter_probe(html), ChapterProbe::Transient);
    }

    #[test]
    fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
        // Defensive: if a broken-page body somehow contains an
        // #avatar_menu element (e.g. an unrelated debug page on the
        // same template), the body signature still wins.
        let html = r#"<html><body>
            <p>we're sorry, the request file are not found.</p>
            <div id="logo"></div>
            <div id="avatar_menu"></div>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }
}
|