Three layering cleanups from REVIEW.md §5 / §3:
- Drop the three private `is_unique_violation` helpers in
repo::{user,chapter,bookmark} in favour of sqlx 0.8's
`DatabaseError::is_unique_violation()` method (already used by
repo::collection).
- Remove the unreachable 23505 branch in repo::chapter::create — the
(manga_id, number) UNIQUE was dropped in 0013, so the defensive arm
could no longer fire. A doc note records what to do if uniqueness
is re-added.
- Move three inline SQL queries out of handlers/daemon into repo
functions: bookmarks' chapter-belongs-to-manga guard
(`repo::chapter::belongs_to_manga`), the daemon's dispatch lookup
(`repo::chapter::dispatch_target`), and the daemon's page_count
safety net (`repo::chapter::page_count`). Restores the
handlers→repo layering invariant in CLAUDE.md.
- New `crawler::url_utils` module consolidates host_of / origin_of /
registrable_domain — they used to live in three crawler submodules
with diverging edge-case behaviour. Tests moved with them.
- Doc cross-references on repo::author::set_for_manga and
repo::genre::set_for_manga pointing to the crawler's name-keyed
variants, so the intentional duplication is discoverable.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
226 lines
8.8 KiB
Rust
226 lines
8.8 KiB
Rust
//! PHPSESSID injection + login probe.
//!
//! The catalog site we crawl renders chapter pages as a single multi-
//! page list only for logged-in users. We don't try to bypass the
//! login (CAPTCHA wall) — instead the operator pastes their browser's
//! `PHPSESSID` cookie into `CRAWLER_PHPSESSID` and the crawler injects
//! it into Chromium *and* reqwest before the first navigation.
//!
//! Two things the cookie alone doesn't give us:
//!
//! 1. The cookie value is only meaningful to the *server* — we have
//!    no way to predict from the value alone whether it's still valid.
//!    `verify_session` does a navigation and inspects the probe page
//!    for three outcomes: broken-page response (transient — retry the
//!    probe), `#logo` present but `#avatar_menu` absent (genuine logout
//!    — bail loudly), or both present (authenticated). The earlier
//!    avatar-only check conflated "site is hiccuping" with "session is
//!    dead" and refused to start the crawler when the site had a brief
//!    503.
//! 2. The reqwest client (used for cover and chapter-image downloads)
//!    has its own cookie store; we seed it for the catalog host only.
//!    CDN hosts are deliberately *not* given the cookie — they serve
//!    image bytes by signed URLs and don't need it.
|
|
|
|
use std::time::Duration;
|
|
|
|
use anyhow::{anyhow, Context};
|
|
use chromiumoxide::browser::Browser;
|
|
use chromiumoxide::cdp::browser_protocol::network::CookieParam;
|
|
|
|
use crate::crawler::detect::{has_logo_sentinel, is_broken_page_body};
|
|
|
|
/// Outcome of inspecting a probe-page response.
///
/// Produced by `classify_probe` and consumed by `verify_session`, which
/// maps each variant to "proceed", "fail fast", or "retry".
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SessionProbe {
    /// `#logo` present and `#avatar_menu` present — session valid.
    Ok,
    /// `#logo` present but `#avatar_menu` absent — site rendered the
    /// normal layout for an unauthenticated visitor; refresh PHPSESSID.
    Unauthenticated,
    /// Broken-page body signature or `#logo` missing — site is hiccuping.
    /// Caller retries the probe rather than blaming the session.
    Transient,
}
|
|
|
|
/// Re-export so existing callers keep working after the helper moved
|
|
/// to `crawler::url_utils`. The body lives there.
|
|
pub use crate::crawler::url_utils::registrable_domain;
|
|
|
|
/// Inject the PHPSESSID cookie into the browser's cookie store for the
|
|
/// catalog domain. Must be called before any navigation that depends on
|
|
/// authentication; subsequent navigations include the cookie
|
|
/// automatically.
|
|
pub async fn inject_phpsessid(
|
|
browser: &Browser,
|
|
sid: &str,
|
|
cookie_domain: &str,
|
|
) -> anyhow::Result<()> {
|
|
let cookie = CookieParam {
|
|
name: "PHPSESSID".to_string(),
|
|
value: sid.to_string(),
|
|
url: None,
|
|
domain: Some(cookie_domain.to_string()),
|
|
path: Some("/".to_string()),
|
|
secure: None,
|
|
http_only: Some(true),
|
|
same_site: None,
|
|
expires: None,
|
|
priority: None,
|
|
same_party: None,
|
|
source_scheme: None,
|
|
source_port: None,
|
|
partition_key: None,
|
|
};
|
|
browser
|
|
.set_cookies(vec![cookie])
|
|
.await
|
|
.context("set PHPSESSID in chromium cookie store")?;
|
|
tracing::info!(domain = cookie_domain, "injected PHPSESSID into browser");
|
|
Ok(())
|
|
}
|
|
|
|
/// Three-way classification of a probe-page response. Pure over HTML so
|
|
/// it's unit-testable without a real browser. Order matters: a body
|
|
/// matching the broken-page template is `Transient` even if the page
|
|
/// happens to contain `#avatar_menu` HTML somewhere — trust the universal
|
|
/// site signal over a stray selector match.
|
|
pub fn classify_probe(html: &str) -> SessionProbe {
|
|
if is_broken_page_body(html) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let doc = scraper::Html::parse_document(html);
|
|
if !has_logo_sentinel(&doc) {
|
|
return SessionProbe::Transient;
|
|
}
|
|
let avatar_sel = scraper::Selector::parse("#avatar_menu").unwrap();
|
|
if doc.select(&avatar_sel).next().is_some() {
|
|
SessionProbe::Ok
|
|
} else {
|
|
SessionProbe::Unauthenticated
|
|
}
|
|
}
|
|
|
|
/// In-startup retry budget for the session probe. Small but non-zero —
/// startup hitting a 5-second site hiccup shouldn't fail the operator
/// with "PHPSESSID expired" when the session is actually fine.
const PROBE_MAX_ATTEMPTS: u32 = 3;
/// Pause between consecutive probe attempts after a transient result.
const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
|
|
|
|
/// Navigate to `probe_url` and classify the response. Retries the probe
|
|
/// on `Transient` outcomes (broken-page body, missing `#logo`); fails
|
|
/// fast on `Unauthenticated`; returns `Ok(())` on success.
|
|
///
|
|
/// This burns one navigation per attempt against the catalog's rate
|
|
/// limiter. The trade is worth it — failing here costs ~1s; failing 30
|
|
/// minutes into a backfill costs 30 minutes.
|
|
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
|
|
let mut attempt = 0u32;
|
|
loop {
|
|
attempt += 1;
|
|
let html = fetch_probe_html(browser, probe_url).await?;
|
|
match classify_probe(&html) {
|
|
SessionProbe::Ok => {
|
|
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present");
|
|
return Ok(());
|
|
}
|
|
SessionProbe::Unauthenticated => {
|
|
return Err(anyhow!(
|
|
"session probe failed — #avatar_menu not present at {probe_url} \
|
|
(page rendered the normal layout); PHPSESSID is missing, expired, \
|
|
or revoked. Refresh CRAWLER_PHPSESSID and re-run."
|
|
));
|
|
}
|
|
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
|
|
tracing::warn!(
|
|
attempt,
|
|
max_attempts = PROBE_MAX_ATTEMPTS,
|
|
"session probe got a transient page; retrying"
|
|
);
|
|
tokio::time::sleep(PROBE_RETRY_DELAY).await;
|
|
}
|
|
SessionProbe::Transient => {
|
|
return Err(anyhow!(
|
|
"session probe failed — probe page at {probe_url} returned a \
|
|
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \
|
|
The site appears to be down or rate-limiting us; try again \
|
|
later before refreshing CRAWLER_PHPSESSID."
|
|
));
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Open `probe_url` in a fresh page, wait for the navigation to finish,
/// and return the page's HTML content.
///
/// The page is closed on every path once it has been opened — including
/// when waiting for navigation or reading the content fails — so a failed
/// probe does not leave a tab behind. Closing is best-effort: a failure
/// to close is ignored.
///
/// # Errors
///
/// Returns an error (with context) if the page cannot be opened, the
/// navigation wait fails, or the content cannot be read.
async fn fetch_probe_html(browser: &Browser, probe_url: &str) -> anyhow::Result<String> {
    let page = browser
        .new_page(probe_url)
        .await
        .with_context(|| format!("open probe page {probe_url}"))?;

    // Collect the outcome first instead of using `?` directly, so the
    // page is closed below regardless of success or failure.
    let outcome: anyhow::Result<String> = async {
        page.wait_for_navigation()
            .await
            .context("wait for nav on probe")?;
        page.content().await.context("read probe html")
    }
    .await;

    page.close().await.ok();
    outcome
}
|
|
|
|
#[cfg(test)]
mod tests {
    //! Unit tests for `classify_probe`, driven by inline HTML fixtures so
    //! no browser is needed.

    use super::*;

    // registrable_domain tests live in crawler::url_utils now —
    // it's the canonical home for that helper.

    #[test]
    fn classify_probe_ok_when_logo_and_avatar_present() {
        let html = r#"<html><body>
            <header><div id="logo">Target</div><div id="avatar_menu"></div></header>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Ok);
    }

    #[test]
    fn classify_probe_unauth_when_logo_present_but_avatar_absent() {
        // Real "logged out" response: site layout renders fine, just no
        // avatar widget. This is the only state that should blame the
        // session cookie.
        let html = r#"<html><body>
            <header><div id="logo">Target</div></header>
            <main>Please log in.</main>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Unauthenticated);
    }

    #[test]
    fn classify_probe_transient_on_broken_page_body() {
        let html = "<html><body>\
            <p>we're sorry, the request file are not found.</p>\
            </body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_when_logo_missing() {
        // No broken-body marker, but no site layout either — treat as
        // transient (could be a Cloudflare interstitial, a 5xx page,
        // etc.) rather than blaming the session.
        let html = "<html><body><h1>Service Unavailable</h1></body></html>";
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_transient_on_empty_response() {
        assert_eq!(classify_probe(""), SessionProbe::Transient);
    }

    #[test]
    fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
        // Defensive: if a broken-page body somehow contains an
        // #avatar_menu element (e.g. an unrelated debug page on the
        // same template), the body signature still wins.
        let html = r#"<html><body>
            <p>we're sorry, the request file are not found.</p>
            <div id="logo"></div>
            <div id="avatar_menu"></div>
        </body></html>"#;
        assert_eq!(classify_probe(html), SessionProbe::Transient);
    }
}
|