Files
Mangalord/backend/src/crawler/session.rs
MechaCat02 c320eda7cd chore: dedupe is_unique_violation, lift SQL into repo, centralise URL parsing
Three layering cleanups from REVIEW.md §5 / §3:

- Drop the three private `is_unique_violation` helpers in
  repo::{user,chapter,bookmark} in favour of sqlx 0.8's
  `DatabaseError::is_unique_violation()` method (already used by
  repo::collection).
- Remove the unreachable 23505 branch in repo::chapter::create — the
  (manga_id, number) UNIQUE was dropped in 0013, so the defensive arm
  could no longer fire. A doc note records what to do if uniqueness
  is re-added.
- Move three inline SQL queries out of handlers/daemon into repo
  functions: bookmarks' chapter-belongs-to-manga guard
  (`repo::chapter::belongs_to_manga`), the daemon's dispatch lookup
  (`repo::chapter::dispatch_target`), and the daemon's page_count
  safety net (`repo::chapter::page_count`). Restores the
  handlers→repo layering invariant in CLAUDE.md.
- New `crawler::url_utils` module consolidates host_of / origin_of /
  registrable_domain — they used to live in three crawler submodules
  with diverging edge-case behaviour. Tests moved with them.
- Doc cross-references on repo::author::set_for_manga and
  repo::genre::set_for_manga pointing to the crawler's name-keyed
  variants, so the intentional duplication is discoverable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-28 20:24:05 +02:00

226 lines
8.8 KiB
Rust

//! PHPSESSID injection + login probe.
//!
//! The catalog site we crawl renders chapter pages as a single multi-
//! page list only for logged-in users. We don't try to bypass the
//! login (CAPTCHA wall) — instead the operator pastes their browser's
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
//! it into Chromium *and* reqwest before the first navigation.
//!
//! Two things the cookie alone doesn't give us:
//! 1. The cookie value is only meaningful to the *server* — we have
//! no way to predict from the value alone whether it's still valid.
//! `verify_session` does a navigation and inspects the probe page
//! for three outcomes: broken-page response (transient — retry the
//! probe), `#logo` present but `#avatar_menu` absent (genuine logout
//! — bail loudly), or both present (authenticated). The earlier
//! avatar-only check conflated "site is hiccuping" with "session is
//! dead" and refused to start the crawler when the site had a brief
//! 503.
//! 2. The reqwest client (used for cover and chapter-image downloads)
//! has its own cookie store; we seed it for the catalog host only.
//! CDN hosts are deliberately *not* given the cookie — they serve
//! image bytes by signed URLs and don't need it.
use std::time::Duration;
use anyhow::{anyhow, Context};
use chromiumoxide::browser::Browser;
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
/// Outcome of inspecting a probe-page response.
///
/// Produced by `classify_probe` from the probe page's HTML and consumed
/// by `verify_session`, which succeeds on `Ok`, fails immediately on
/// `Unauthenticated`, and retries on `Transient`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
/// `#logo` present and `#avatar_menu` present — session valid.
Ok,
/// `#logo` present but `#avatar_menu` absent — site rendered the
/// normal layout for an unauthenticated visitor; refresh PHPSESSID.
Unauthenticated,
/// Broken-page body signature or `#logo` missing — site is hiccuping.
/// Caller retries the probe rather than blaming the session.
Transient,
}
/// Re-export so existing callers keep working after the helper moved
/// to `crawler::url_utils`. The body lives there.
pub use crate::crawler::url_utils::registrable_domain;
/// Seed the browser's cookie store with the operator-supplied
/// `PHPSESSID` for the catalog domain.
///
/// Call this before the first navigation that needs an authenticated
/// session; once the cookie is in the store, later navigations send it
/// automatically.
///
/// * `browser` — the Chromium handle whose cookie store is written.
/// * `sid` — the raw `PHPSESSID` value.
/// * `cookie_domain` — the domain the cookie is scoped to.
///
/// # Errors
///
/// Returns an error (with context) if the browser rejects the
/// `set_cookies` call.
pub async fn inject_phpsessid(
    browser: &Browser,
    sid: &str,
    cookie_domain: &str,
) -> anyhow::Result<()> {
    let session_cookie = CookieParam {
        // The attributes we actually pin down.
        name: String::from("PHPSESSID"),
        value: sid.to_owned(),
        domain: Some(cookie_domain.to_owned()),
        path: Some(String::from("/")),
        http_only: Some(true),
        // Everything else is deliberately left unset.
        url: None,
        secure: None,
        same_site: None,
        expires: None,
        priority: None,
        same_party: None,
        source_scheme: None,
        source_port: None,
        partition_key: None,
    };

    let stored = browser.set_cookies(vec![session_cookie]).await;
    stored.context("set PHPSESSID in chromium cookie store")?;

    // Only the domain is logged — the session id itself never hits the logs.
    tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
    Ok(())
}
/// Classify a probe page's HTML into one of the three [`SessionProbe`]
/// outcomes. Takes plain HTML so it can be unit-tested without a
/// browser.
///
/// The checks run in a fixed order, and the order is significant:
///
/// 1. A body matching the broken-page template is `Transient`, even if
///    `#avatar_menu` markup happens to appear in it — the site-wide
///    failure signal outranks a stray selector hit.
/// 2. A document without the `#logo` sentinel is also `Transient`.
/// 3. Otherwise the presence of `#avatar_menu` decides between `Ok`
///    and `Unauthenticated`.
pub fn classify_probe(html: &str) -> SessionProbe {
    // Cheap body-signature check first; no need to build a DOM for it.
    if is_broken_page_body(html) {
        return SessionProbe::Transient;
    }

    let document = scraper::Html::parse_document(html);
    if !has_logo_sentinel(&document) {
        return SessionProbe::Transient;
    }

    let avatar_menu = scraper::Selector::parse("#avatar_menu").unwrap();
    match document.select(&avatar_menu).next() {
        Some(_) => SessionProbe::Ok,
        None => SessionProbe::Unauthenticated,
    }
}
/// In-startup retry budget for the session probe. Small but non-zero —
/// startup hitting a 5-second site hiccup shouldn't fail the operator
/// with "PHPSESSID expired" when the session is actually fine.
///
/// This counts total probe attempts (the first try included), not
/// additional retries: `verify_session` gives up once this many probes
/// have come back `Transient`.
const PROBE_MAX_ATTEMPTS: u32 = 3;
/// Pause `verify_session` sleeps for after a `Transient` probe result
/// before issuing the next attempt.
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
/// Probe `probe_url` and confirm the injected session is authenticated.
///
/// Each attempt loads the probe page and classifies it with
/// [`classify_probe`]:
///
/// * `Ok` — returns `Ok(())`.
/// * `Unauthenticated` — fails immediately; retrying cannot fix a dead
///   cookie.
/// * `Transient` — sleeps for `PROBE_RETRY_DELAY` and tries again, up
///   to `PROBE_MAX_ATTEMPTS` attempts in total.
///
/// Every attempt spends one navigation against the catalog's rate
/// limiter. That is a deliberate trade: finding out at startup that the
/// session is unusable is far cheaper than finding out well into a
/// backfill.
///
/// # Errors
///
/// Fails if the probe page cannot be fetched (propagated as-is, without
/// retry), if the page shows the logged-out layout, or if every attempt
/// came back transient.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
    for attempt in 1..=PROBE_MAX_ATTEMPTS {
        let body = fetch_probe_html(browser, probe_url).await?;

        match classify_probe(&body) {
            SessionProbe::Ok => {
                tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
                return Ok(());
            }
            SessionProbe::Unauthenticated => {
                return Err(anyhow!(
                    "session probe failed — #avatar_menu not present at {probe_url} \
                     (page rendered the normal layout); PHPSESSID is missing, expired, \
                     or revoked. Refresh CRAWLER_PHPSESSID and re-run."
                ));
            }
            SessionProbe::Transient => {
                // Budget spent: leave the loop and report below, without
                // a pointless final warn + sleep.
                if attempt == PROBE_MAX_ATTEMPTS {
                    break;
                }
                tracing::warn!(
                    attempt,
                    max_attempts = PROBE_MAX_ATTEMPTS,
                    "session probe got a transient page; retrying"
                );
                tokio::time::sleep(PROBE_RETRY_DELAY).await;
            }
        }
    }

    // Only reachable when every attempt classified as transient.
    Err(anyhow!(
        "session probe failed — probe page at {probe_url} returned a \
         broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
         The site appears to be down or rate-limiting us; try again \
         later before refreshing CRAWLER_PHPSESSID."
    ))
}
/// Open `probe_url` in a new page, wait for the navigation to finish,
/// and return the page's HTML.
///
/// Once the page has been opened it is closed on every path — success
/// or failure — so a failed probe does not leave a page open in the
/// browser. That matters because `verify_session` may call this several
/// times in a row. Closing is best-effort: a failure to close is
/// ignored and never masks the probe's own result.
///
/// # Errors
///
/// Returns an error (with context) if the page cannot be opened, the
/// navigation wait fails, or the page content cannot be read.
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
    let page = browser
        .new_page(probe_url)
        .await
        .with_context(|| format!("open probe page {probe_url}"))?;

    // Collect the outcome instead of `?`-returning from the function, so
    // the close below also runs when one of these steps fails.
    let html: anyhow::Result<String> = async {
        page.wait_for_navigation()
            .await
            .context("wait for nav on probe")?;
        page.content().await.context("read probe html")
    }
    .await;

    // Best-effort cleanup; the probe result is what the caller cares about.
    page.close().await.ok();
    html
}
#[cfg(test)]
mod tests {
    use super::*;

    // Coverage for registrable_domain sits next to its implementation
    // in crawler::url_utils; this module only exercises classify_probe.

    /// Both sentinels rendered → the session is authenticated.
    #[test]
    fn classify_probe_ok_when_logo_and_avatar_present() {
        let page = r#"<html><body>
<header><div id="logo">Target</div><div id="avatar_menu"></div></header>
</body></html>"#;
        let outcome = classify_probe(page);
        assert_eq!(outcome, SessionProbe::Ok);
    }

    /// The genuine logged-out page: the layout is intact but the avatar
    /// widget is gone. This is the one state that may blame the cookie.
    #[test]
    fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
        let page = r#"<html><body>
<header><div id="logo">Target</div></header>
<main>Please log in.</main>
</body></html>"#;
        let outcome = classify_probe(page);
        assert_eq!(outcome, SessionProbe::Unauthenticated);
    }

    /// The broken-page template text alone marks the response transient.
    #[test]
    fn classify_probe_transient_on_broken_page_body() {
        let page = "<html><body>\
<p>we're sorry, the request file are not found.</p>\
</body></html>";
        let outcome = classify_probe(page);
        assert_eq!(outcome, SessionProbe::Transient);
    }

    /// No broken-body marker, but no site chrome either (think a
    /// Cloudflare interstitial or a bare 5xx page): still transient,
    /// never a session verdict.
    #[test]
    fn classify_probe_transient_when_logo_missing() {
        let page = "<html><body><h1>Service Unavailable</h1></body></html>";
        let outcome = classify_probe(page);
        assert_eq!(outcome, SessionProbe::Transient);
    }

    /// An empty body carries no sentinel at all → transient.
    #[test]
    fn classify_probe_transient_on_empty_response() {
        let outcome = classify_probe("");
        assert_eq!(outcome, SessionProbe::Transient);
    }

    /// Defensive ordering check: should a broken-page body also carry
    /// `#logo` / `#avatar_menu` markup (say, a debug page built on the
    /// same template), the body signature must still take precedence.
    #[test]
    fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
        let page = r#"<html><body>
<p>we're sorry, the request file are not found.</p>
<div id="logo"></div>
<div id="avatar_menu"></div>
</body></html>"#;
        let outcome = classify_probe(page);
        assert_eq!(outcome, SessionProbe::Transient);
    }
}