Files
Mangalord/backend/src/crawler/source/target.rs
MechaCat02 d24e68c78d feat: chapter content sync via PHPSESSID + per-host pacing (0.25.0)
After the metadata pass, the crawler now fetches per-chapter image
content for chapters belonging to bookmarked mangas. Logged-in chapter
pages render every page image at once (no per-page navigation), so the
crawler reuses the operator's browser session via a pasted PHPSESSID
cookie. Each chapter sync is a single transaction: storage puts + page
row inserts + page_count update commit together, or roll back together
on any image error so the chapter stays at page_count=0 and is retried
next run.

New crawler modules:

- `rate_limit::HostRateLimiters`: per-host buckets keyed by URL host,
  with optional per-host overrides. Replaces the single shared
  `Mutex<RateLimiter>`. Catalog and CDN no longer share a budget;
  default 1 req/s per host.
- `session`: derives `.<registrable>.<tld>` from the start URL
  (override via `CRAWLER_COOKIE_DOMAIN` for multi-part TLDs), injects
  PHPSESSID into the Chromium cookie store, probes `#avatar_menu` at
  startup to fail fast on a bad/expired cookie.
- `content`: parses `a#pic_container img:not(.loading)` with `pageN`
  id-based sorting (DOM order isn't trusted), then performs the
  atomic chapter sync.

bin/crawler additions:

- Concurrent chapter content phase via `futures_util::for_each_concurrent`
  (`CRAWLER_CHAPTER_WORKERS`, default 1). Browser is borrowed across
  workers — chromiumoxide allows concurrent `new_page` on `&self` —
  and per-host rate limit gates total RPS regardless of worker count.
- reqwest gets the `cookies` feature, a `Jar` seeded with PHPSESSID
  for the catalog domain only (CDN intentionally not given the
  cookie), and `Referer` is set on cover + chapter image fetches.
- New env knobs: `CRAWLER_PHPSESSID`, `CRAWLER_COOKIE_DOMAIN`,
  `CRAWLER_USER_AGENT`, `CRAWLER_CHAPTER_WORKERS`,
  `CRAWLER_SKIP_CHAPTER_CONTENT`, `CRAWLER_FORCE_REFETCH_CHAPTERS`,
  `CRAWLER_CDN_HOST` + `CRAWLER_CDN_RATE_MS`.
- Mid-run session-expired detection: `#avatar_menu` is re-checked on
  every chapter page nav; first failure aborts the phase with a
  cookie-refresh message.

Bookmark-driven enqueueing is sync-on-crawl-tick only: the bookmarked
chapters with `page_count = 0` are queried at the start of the
chapter-content phase. Sync-on-bookmark via an API hook is deferred
to a follow-up branch — that needs a daemon consumer of crawler_jobs,
which doesn't exist yet.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-23 00:28:36 +02:00

793 lines
29 KiB
Rust

//! First concrete [`Source`] impl, modeled on the selectors of the
//! old Puppeteer crawler. The name "target" is a placeholder — rename
//! once the site is officially identified.
//!
//! `scraper`'s selector parser does not support `:has()` or
//! `:contains()`, so the labelled-`td` lookups from the old script
//! (`td:has(label:contains("Author:"))`) are implemented by walking
//! the parsed tree.
use std::time::Duration;
use anyhow::Context;
use async_trait::async_trait;
use sha2::{Digest, Sha256};
use super::{
DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
SourceMangaRef,
};
/// [`Source`] implementation for the placeholder "target" site.
pub struct TargetSource {
// Listing URL that `discover` navigates to first and that is handed to
// `page_url` as the template for every other listing page.
base_url: String,
// When `false`, detail-page parsing leaves `SourceManga.chapters` empty.
parse_chapters: bool,
}
impl TargetSource {
    /// Builds a source whose listing starts at `base_url`. Chapter-list
    /// parsing is enabled by default.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self {
            base_url,
            parse_chapters: true,
        }
    }

    /// The listing URL this source was constructed with.
    pub fn base_url(&self) -> &str {
        self.base_url.as_str()
    }

    /// Disables the chapter-list selector for detail pages.
    ///
    /// With this set, `SourceManga.chapters` comes back empty even when
    /// the page carries a chapter table. The caller must then also skip
    /// `repo::crawler::sync_manga_chapters` for these mangas — feeding it
    /// an empty list would soft-drop the manga's existing chapter rows.
    pub fn without_chapter_parsing(self) -> Self {
        Self {
            parse_chapters: false,
            ..self
        }
    }
}
#[async_trait]
impl Source for TargetSource {
    /// Stable identifier of this source.
    fn id(&self) -> &'static str {
        "target"
    }

    /// Walks the paginated listing and returns the manga refs found,
    /// capped at `max_results` when a cap is given.
    ///
    /// In `DiscoverMode::Backfill` pages are visited last → 1 and each
    /// page's entries are reversed; otherwise pages are visited 1 → last
    /// in listing order.
    async fn discover(
        &self,
        ctx: &FetchContext<'_>,
        mode: DiscoverMode,
        max_results: Option<usize>,
    ) -> anyhow::Result<Vec<SourceMangaRef>> {
        // Page 1 is always fetched up front: it is the only place the
        // highest page number can be read from. Its HTML is kept so the
        // walk below does not navigate to it a second time.
        let first_html = navigate(ctx, self.base_url.as_str()).await?;
        let last_page = {
            let first_doc = scraper::Html::parse_document(&first_html);
            parse_last_page(&first_doc)
        };

        let oldest_first = matches!(mode, DiscoverMode::Backfill);
        let order: Vec<i32> = match last_page {
            None => vec![1],
            // Oldest-first: the listing is update_date DESC, so the
            // bottom of the last page is the oldest entry still
            // surfaced. Walk pages last → 1 and flip each page below.
            Some(last) if oldest_first => (1..=last).rev().collect(),
            Some(last) => (1..=last).collect(),
        };
        tracing::info!(
            ?mode,
            last_page = ?last_page,
            page_count = order.len(),
            "walking pagination"
        );

        let mut collected = Vec::new();
        for page_num in order {
            let html = match page_num {
                1 => first_html.clone(),
                other => navigate(ctx, &page_url(&self.base_url, other)).await?,
            };
            let page_refs = {
                let doc = scraper::Html::parse_document(&html);
                let mut refs = parse_manga_list_from(&doc);
                if oldest_first {
                    refs.reverse();
                }
                refs
            };
            tracing::info!(page_num, count = page_refs.len(), "page walked");
            collected.extend(page_refs);
            if cap_reached(&collected, max_results) {
                tracing::info!(cap = ?max_results, "max_results reached; halting pagination");
                break;
            }
        }
        Ok(truncate_to_cap(collected, max_results))
    }

    /// Fetches the detail page behind `r` and parses it into a
    /// [`SourceManga`]; parse failures are annotated with the page URL.
    async fn fetch_manga(
        &self,
        ctx: &FetchContext<'_>,
        r: &SourceMangaRef,
    ) -> anyhow::Result<SourceManga> {
        let html = navigate(ctx, r.url.as_str()).await?;
        let parsed = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters);
        parsed.with_context(|| format!("parse manga detail at {}", r.url))
    }

    /// Not implemented for this source yet; always returns an error.
    async fn fetch_chapter_list(
        &self,
        _ctx: &FetchContext<'_>,
        _manga: &SourceManga,
    ) -> anyhow::Result<Vec<SourceChapterRef>> {
        Err(anyhow::anyhow!("fetch_chapter_list not implemented yet"))
    }

    /// Not implemented for this source yet; always returns an error.
    async fn fetch_chapter(
        &self,
        _ctx: &FetchContext<'_>,
        _r: &SourceChapterRef,
    ) -> anyhow::Result<SourceChapter> {
        Err(anyhow::anyhow!("fetch_chapter not implemented yet"))
    }
}
/// `true` once `buf` holds at least `max` items. With no cap (`None`)
/// the answer is always `false`.
fn cap_reached<T>(buf: &[T], max: Option<usize>) -> bool {
    match max {
        Some(limit) => buf.len() >= limit,
        None => false,
    }
}
/// Returns `buf` shortened to at most `max` items; with no cap the
/// vector is handed back untouched.
fn truncate_to_cap<T>(mut buf: Vec<T>, max: Option<usize>) -> Vec<T> {
    let keep = max.map_or(buf.len(), |limit| limit.min(buf.len()));
    buf.truncate(keep);
    buf
}
/// Single point of rate-limited navigation. Every Source request goes
/// through here, so the per-host limiter map is the only knob that
/// controls per-origin RPS.
async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result<String> {
ctx.rate.wait_for(url).await?;
let page = ctx.browser.new_page(url).await?;
page.wait_for_navigation().await?;
// Stopgap until we wait on a specific selector per page type —
// gives any post-load JS a beat to finish injecting content.
tokio::time::sleep(Duration::from_secs(1)).await;
let html = page.content().await?;
page.close().await?;
Ok(html)
}
/// Highest page number advertised by the listing's pagination links, or
/// `None` when no link text parses as an integer.
fn parse_last_page(doc: &scraper::Html) -> Option<i32> {
    // Each pagination link carries its page number as text. Taking the
    // numeric maximum keeps this independent of layout: Prev, Next,
    // ellipses and the like simply fail to parse and are skipped.
    let link_sel = scraper::Selector::parse("#left_side .pagination a").unwrap();
    let mut highest: Option<i32> = None;
    for link in doc.select(&link_sel) {
        let label = collapse_whitespace(&link.text().collect::<String>());
        if let Ok(n) = label.parse::<i32>() {
            highest = Some(highest.map_or(n, |best| best.max(n)));
        }
    }
    highest
}
/// Substitutes the first all-digit `/N/` path segment of `template_url`
/// with `page` and returns the resulting URL.
///
/// Only the part of the URL before the first `?` or `#` is scanned, so a
/// `/N/` that happens to appear inside a query value or fragment is never
/// rewritten. The segment must be closed by a trailing `/`. When no such
/// segment exists the URL is returned unchanged — for the modeled site
/// the segment is always present.
fn page_url(template_url: &str, page: i32) -> String {
    let path_end = template_url
        .find(['?', '#'])
        .unwrap_or(template_url.len());
    let bytes = template_url.as_bytes();

    let mut i = 0;
    while i + 1 < path_end {
        if bytes[i] == b'/' && bytes[i + 1].is_ascii_digit() {
            // End of the digit run that starts right after this slash.
            let digits_end = bytes[i + 1..path_end]
                .iter()
                .position(|b| !b.is_ascii_digit())
                .map_or(path_end, |offset| i + 1 + offset);
            if digits_end < path_end && bytes[digits_end] == b'/' {
                // `i` and `digits_end` both sit on ASCII '/', so the
                // slices below fall on char boundaries.
                return format!(
                    "{}/{page}/{}",
                    &template_url[..i],
                    &template_url[digits_end + 1..]
                );
            }
            // Digits not closed by '/': nothing inside the run can start
            // a segment, so resume scanning after it.
            i = digits_end;
            continue;
        }
        i += 1;
    }
    template_url.to_string()
}
/// Test-only convenience: parses `html` and extracts the listing refs.
#[cfg(test)]
fn parse_manga_list(html: &str) -> Vec<SourceMangaRef> {
    let parsed = scraper::Html::parse_document(html);
    parse_manga_list_from(&parsed)
}
/// Extracts one [`SourceMangaRef`] per listing link, in document order.
/// Links whose `href` is missing or blank, or whose text is blank, are
/// skipped.
fn parse_manga_list_from(doc: &scraper::Html) -> Vec<SourceMangaRef> {
    let link_sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
    let mut refs = Vec::new();
    for link in doc.select(&link_sel) {
        let Some(href) = link.value().attr("href") else {
            continue;
        };
        let url = href.trim().to_string();
        if url.is_empty() {
            continue;
        }
        let title = collapse_whitespace(&link.text().collect::<String>());
        if title.is_empty() {
            continue;
        }
        let source_manga_key = derive_key_from_url(&url);
        refs.push(SourceMangaRef {
            source_manga_key,
            title,
            url,
        });
    }
    refs
}
/// Parses a manga detail page into a [`SourceManga`] keyed by `key`.
///
/// The title (`.w-title h1`) is the only mandatory field; every other
/// field falls back to `None` or an empty list when its selector finds
/// nothing. With `include_chapters == false` the chapter table is not
/// read and `chapters` is empty.
///
/// # Errors
/// Fails with "missing .w-title h1" when the title cannot be found.
fn parse_manga_detail(
    html: &str,
    key: &str,
    include_chapters: bool,
) -> anyhow::Result<SourceManga> {
    let doc = scraper::Html::parse_document(html);

    let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;
    let summary = first_text(&doc, ".manga_summary");
    let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src");
    let authors = links_in_labelled_td(&doc, "Author");
    let genres = links_in_labelled_td(&doc, "Genre");

    let status = {
        let raw = labelled_td_child_text(&doc, "Status", "span");
        normalize_status(raw.as_deref(), key)
    };

    // The "Alternative:" cell is one free-text value; split it on the
    // separators the source uses and drop blank pieces.
    let alternative_titles = match labelled_td_value_after_label(&doc, "Alternative") {
        Some(raw) => raw
            .split([';', ',', '|'])
            .map(str::trim)
            .filter(|piece| !piece.is_empty())
            .map(String::from)
            .collect(),
        None => Default::default(),
    };

    let tag_sel = scraper::Selector::parse(".aside-body a.tag").unwrap();
    let mut tags: Vec<String> = Vec::new();
    for link in doc.select(&tag_sel) {
        let label = collapse_whitespace(&link.text().collect::<String>());
        let name = strip_tag_count(&label);
        if !name.is_empty() {
            tags.push(name);
        }
    }

    let chapters = match include_chapters {
        true => parse_chapter_list(&doc),
        false => Vec::new(),
    };

    // The hash is derived from the assembled record, so build it with a
    // blank hash first and fill the real value in afterwards.
    let mut manga = SourceManga {
        source_manga_key: key.to_string(),
        title,
        alternative_titles,
        authors,
        genres,
        tags,
        status,
        summary,
        cover_url,
        chapters,
        metadata_hash: String::new(),
    };
    let hash = compute_metadata_hash(&manga);
    manga.metadata_hash = hash;
    Ok(manga)
}
/// Maps the source's "Ongoing" / "Completed" (any ASCII case, surrounding
/// whitespace ignored) onto the lowercase form the `mangas.status` CHECK
/// constraint accepts.
///
/// A missing or blank value yields `None` silently. Any other value is
/// treated as a parse miss (selector drift, new value, etc.): it is
/// logged and `None` is returned so the manga sync can continue.
fn normalize_status(raw: Option<&str>, key: &str) -> Option<String> {
    let value = raw?.trim();
    if value.is_empty() {
        return None;
    }
    for known in ["ongoing", "completed"] {
        if value.eq_ignore_ascii_case(known) {
            return Some(known.to_string());
        }
    }
    tracing::error!(
        key,
        raw_status = value,
        "unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None"
    );
    None
}
/// Removes a trailing `(NN)` counter — digits only — from a tag name,
/// which is how the source displays per-tag counts. Parentheses holding
/// anything else (or nothing) are left in place. The result is trimmed.
fn strip_tag_count(s: &str) -> String {
    let name = s.trim();
    let without_count = name
        .strip_suffix(')')
        .and_then(|rest| rest.rsplit_once('('))
        .filter(|(_, count)| !count.is_empty() && count.bytes().all(|b| b.is_ascii_digit()))
        .map(|(label, _)| label.trim());
    without_count.unwrap_or(name).to_string()
}
/// Extracts one [`SourceChapterRef`] per chapter-table link, in document
/// order. Links with a missing or blank `href` are skipped; a link whose
/// text contains no parseable number gets `number = 0`, and blank link
/// text yields `title = None`.
fn parse_chapter_list(doc: &scraper::Html) -> Vec<SourceChapterRef> {
    let link_sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
    let mut chapters = Vec::new();
    for link in doc.select(&link_sel) {
        let url = match link.value().attr("href").map(str::trim) {
            Some(href) if !href.is_empty() => href.to_string(),
            _ => continue,
        };
        let label = collapse_whitespace(&link.text().collect::<String>());
        let number = parse_chapter_number(&label).unwrap_or(0);
        let source_chapter_key = derive_chapter_key_from_url(&url);
        let title = if label.is_empty() { None } else { Some(label) };
        chapters.push(SourceChapterRef {
            source_chapter_key,
            number,
            title,
            url,
        });
    }
    chapters
}
/// Parses the first run of ASCII digits in `text` as an `i32`.
/// Returns `None` when there are no digits or the run overflows `i32`.
fn parse_chapter_number(text: &str) -> Option<i32> {
    let start = text.find(|c: char| c.is_ascii_digit())?;
    let tail = &text[start..];
    let run_len = tail
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(tail.len());
    tail[..run_len].parse().ok()
}
/// Derives a manga key from `url`: the last non-empty `/`-separated
/// segment of the part before any `?`. Falls back to the whole `url`
/// when no such segment exists.
fn derive_key_from_url(url: &str) -> String {
    let path = match url.split_once('?') {
        Some((before_query, _)) => before_query,
        None => url,
    };
    // Walking segments from the right and skipping empty ones also steps
    // over any trailing slashes.
    for segment in path.rsplit('/') {
        if !segment.is_empty() {
            return segment.to_string();
        }
    }
    url.to_string()
}
/// Derives the chapter key from a chapter URL.
///
/// Chapter links on this source point at the reader's first page, e.g.
/// `.../uu/br_chapter-379272/pg-1/`. The chapter's identity is the
/// `br_chapter-N` (or `to_chapter-N`) segment, while `pg-\d+` names a
/// page *inside* the chapter. Taking the last path component as-is would
/// therefore yield `"pg-1"` for every chapter and collapse them all under
/// a single source_chapter_key downstream — so a trailing reader-page
/// segment is dropped before the last segment is taken.
fn derive_chapter_key_from_url(url: &str) -> String {
    let before_query = match url.split_once('?') {
        Some((head, _)) => head,
        None => url,
    };
    let path = before_query.trim_end_matches('/');
    let chapter_path = path
        .rsplit_once('/')
        .filter(|(_, last)| is_reader_page_segment(last))
        .map_or(path, |(prefix, _)| prefix);
    let key = chapter_path.rsplit('/').find(|segment| !segment.is_empty());
    key.unwrap_or(url).to_string()
}
/// `true` when `s` is `pg-` followed by one or more ASCII digits.
fn is_reader_page_segment(s: &str) -> bool {
    matches!(
        s.strip_prefix("pg-"),
        Some(digits) if !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit())
    )
}
/// Whitespace-collapsed text of the first element matching `sel`.
/// `None` when the selector is invalid, matches nothing, or the text is
/// blank.
fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
    let selector = scraper::Selector::parse(sel).ok()?;
    let element = doc.select(&selector).next()?;
    let text = collapse_whitespace(&element.text().collect::<String>());
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}
/// Value of attribute `attr` on the first element matching `sel`,
/// returned as-is. `None` when the selector is invalid, matches nothing,
/// or the element lacks the attribute.
fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option<String> {
    let selector = scraper::Selector::parse(sel).ok()?;
    let element = doc.select(&selector).next()?;
    let value = element.value().attr(attr)?;
    Some(value.to_string())
}
/// First `td` (in document order) containing a `label` whose trimmed
/// text starts with `label_prefix` — the `scraper`-friendly stand-in for
/// `td:has(label:contains("Foo"))`.
fn td_with_label<'a>(
    doc: &'a scraper::Html,
    label_prefix: &str,
) -> Option<scraper::ElementRef<'a>> {
    let td_sel = scraper::Selector::parse("td").unwrap();
    let label_sel = scraper::Selector::parse("label").unwrap();
    let found = doc.select(&td_sel).find(|td| {
        td.select(&label_sel).any(|label| {
            let text: String = label.text().collect();
            text.trim().starts_with(label_prefix)
        })
    });
    found
}
/// Whitespace-collapsed text of every non-blank `a` inside the `td`
/// labelled `label_prefix`; empty when no such `td` exists.
fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec<String> {
    let mut names = Vec::new();
    if let Some(td) = td_with_label(doc, label_prefix) {
        let link_sel = scraper::Selector::parse("a").unwrap();
        for link in td.select(&link_sel) {
            let name = collapse_whitespace(&link.text().collect::<String>());
            if !name.is_empty() {
                names.push(name);
            }
        }
    }
    names
}
/// Whitespace-collapsed text of the first `child_sel` match inside the
/// `td` labelled `label_prefix`. `None` when the `td` or child is
/// missing, `child_sel` is invalid, or the text is blank.
fn labelled_td_child_text(
    doc: &scraper::Html,
    label_prefix: &str,
    child_sel: &str,
) -> Option<String> {
    let td = td_with_label(doc, label_prefix)?;
    let selector = scraper::Selector::parse(child_sel).ok()?;
    let text = td
        .select(&selector)
        .next()
        .map(|el| collapse_whitespace(&el.text().collect::<String>()))
        .filter(|text| !text.is_empty());
    text
}
/// Text of the labelled `td` with everything up to and including the
/// first `:` removed, whitespace-collapsed. Used for "Alternative:",
/// whose value sits directly in the cell instead of in a child element.
/// `None` when the `td` is missing or the remaining text is blank.
fn labelled_td_value_after_label(
    doc: &scraper::Html,
    label_prefix: &str,
) -> Option<String> {
    let td = td_with_label(doc, label_prefix)?;
    let cell_text = td.text().collect::<String>();
    let raw_value = match cell_text.split_once(':') {
        Some((_, rest)) => rest,
        None => cell_text.as_str(),
    };
    let value = collapse_whitespace(raw_value);
    if value.is_empty() {
        None
    } else {
        Some(value)
    }
}
/// Trims `s` and squeezes every internal whitespace run down to a single
/// ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
/// Lower-case hex SHA-256 over the manga's metadata fields, fed in a
/// fixed order: title, alternative titles, authors, genres, tags,
/// status, summary, cover URL. Chapters and the key are not included.
fn compute_metadata_hash(m: &SourceManga) -> String {
    // ASCII unit separator closes every value and the record separator
    // closes every list, so a value that happens to contain a delimiter
    // character cannot be read as two smaller values.
    const UNIT_SEP: &[u8] = b"\x1F";
    const RECORD_SEP: &[u8] = b"\x1E";

    fn put_value(hasher: &mut Sha256, value: &str) {
        hasher.update(value.as_bytes());
        hasher.update(UNIT_SEP);
    }
    fn put_list(hasher: &mut Sha256, values: &[String]) {
        values.iter().for_each(|value| put_value(hasher, value));
        hasher.update(RECORD_SEP);
    }

    let mut hasher = Sha256::new();
    put_value(&mut hasher, &m.title);
    put_list(&mut hasher, &m.alternative_titles);
    put_list(&mut hasher, &m.authors);
    put_list(&mut hasher, &m.genres);
    put_list(&mut hasher, &m.tags);
    // Absent optional fields hash as the empty string.
    for optional in [&m.status, &m.summary, &m.cover_url] {
        put_value(&mut hasher, optional.as_deref().unwrap_or(""));
    }
    let digest = hasher.finalize();
    format!("{:x}", digest)
}
// Unit tests for the parsing and URL helpers in this file. Every test
// feeds inline HTML, a fixture file, or plain strings to the helpers;
// none of them calls `navigate`.
#[cfg(test)]
mod tests {
use super::*;
// Listing fixture: two valid entries (one with padded link text) plus
// one entry whose href is empty.
const LISTING_HTML: &str = r#"
<html><body>
<div id="left_side">
<div class="pic_list">
<div class="updatesli">
<span><a href="https://target.example/manga/foo">Foo Manga</a></span>
</div>
<div class="updatesli">
<span><a href="https://target.example/manga/bar-baz"> Bar Baz </a></span>
</div>
<div class="updatesli">
<span><a href="">Empty href ignored</a></span>
</div>
</div>
</div>
</body></html>
"#;
// Detail-page fixture covering every field `parse_manga_detail` reads:
// title, cover, summary, the four labelled cells, tags (with and without
// counts) and a three-row chapter table.
const DETAIL_HTML: &str = r#"
<html><body>
<div class="w-title"><h1>Test Manga Title</h1></div>
<div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
<div class="manga_summary">A summary of the manga.</div>
<table>
<tr><td><label>Author:</label><a href="/a/1">Author One</a><a href="/a/2">Author Two</a></td></tr>
<tr><td><label>Genre(s):</label><a href="/g/1">Action</a><a href="/g/2">Drama</a></td></tr>
<tr><td><label>Status:</label><span>Ongoing</span></td></tr>
<tr><td><label>Alternative:</label> Alt Title 1; Alt Title 2 </td></tr>
</table>
<aside><div class="aside-body">
<a class="tag">Fantasy (21)</a>
<a class="tag">Romance</a>
<a class="tag"> Action (5)</a>
<a class="not-a-tag">should-be-ignored</a>
</div></aside>
<table id="chapter_table">
<tr><td><h4><a class="chico" href="/manga/foo/chapter/1">Ch.1</a></h4></td></tr>
<tr><td><h4><a class="chico" href="/manga/foo/chapter/2">Ch.2 - The Beginning</a></h4></td></tr>
<tr><td><h4><a class="chico" href="/manga/foo/chapter/3">Chapter 3: Onward</a></h4></td></tr>
</table>
</body></html>
"#;
#[test]
fn parse_manga_list_extracts_title_url_and_derives_key() {
let refs = parse_manga_list(LISTING_HTML);
assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
assert_eq!(refs[0].title, "Foo Manga");
assert_eq!(refs[0].url, "https://target.example/manga/foo");
assert_eq!(refs[0].source_manga_key, "foo");
assert_eq!(refs[1].title, "Bar Baz");
assert_eq!(refs[1].source_manga_key, "bar-baz");
}
#[test]
fn parse_manga_detail_pulls_all_fields() {
let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
assert_eq!(m.source_manga_key, "test-key");
assert_eq!(m.title, "Test Manga Title");
assert_eq!(m.summary.as_deref(), Some("A summary of the manga."));
assert_eq!(m.authors, vec!["Author One", "Author Two"]);
assert_eq!(m.genres, vec!["Action", "Drama"]);
assert_eq!(m.status.as_deref(), Some("ongoing"));
assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]);
// Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy".
assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]);
assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg"));
assert!(!m.metadata_hash.is_empty());
assert_eq!(m.chapters.len(), 3);
assert_eq!(m.chapters[0].number, 1);
assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1"));
assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1");
assert_eq!(m.chapters[0].source_chapter_key, "1");
assert_eq!(m.chapters[1].number, 2);
assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning"));
assert_eq!(m.chapters[2].number, 3);
assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward"));
}
#[test]
fn status_normalized_case_insensitively() {
assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing"));
assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing"));
assert_eq!(normalize_status(Some(" completed "), "k").as_deref(), Some("completed"));
}
#[test]
fn unknown_status_logs_and_returns_none() {
// Logging is observable in test output via tracing-test, but
// here we just assert the contract: unknown becomes None
// (and the manga is therefore still synced by the caller).
assert!(normalize_status(Some("Hiatus"), "k").is_none());
assert!(normalize_status(Some(""), "k").is_none());
assert!(normalize_status(None, "k").is_none());
}
#[test]
fn strip_tag_count_drops_trailing_digit_parens_only() {
assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy");
assert_eq!(strip_tag_count(" Action (5) "), "Action");
assert_eq!(strip_tag_count("Romance"), "Romance");
// Non-numeric parens stay put.
assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)");
// Only the trailing paren is considered.
assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
}
#[test]
fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
// Real listing fixture from the target site. 15 rows: chapters
// with various Ch.N markup, one hiatus row, three "notice." rows,
// and duplicates of Ch.1 and Ch.52 from different uploaders.
// Every row must survive parsing and every chapter must have a
// distinct source_chapter_key — chapter URLs all end in `/pg-1/`
// (the reader's page-1 entry point), and a naive
// last-segment-of-URL derivation returns "pg-1" for every row,
// collapsing the whole list into one downstream chapter row.
let html = include_str!(
"../../../tests/fixtures/target/chapter_list_uu.html"
);
let doc = scraper::Html::parse_document(html);
let chapters = parse_chapter_list(&doc);
assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
// Sort the keys so any duplicate pair ends up adjacent.
let mut keys: Vec<&str> =
chapters.iter().map(|c| c.source_chapter_key.as_str()).collect();
keys.sort();
let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]);
assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
for c in &chapters {
assert_ne!(
c.source_chapter_key, "pg-1",
"key must not be the reader-page segment: {:?}", c
);
}
// Latest chapter is first (source orders newest → oldest).
assert_eq!(chapters[0].number, 67);
assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official"));
assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272");
// Duplicate-number chapters (different uploaders) survive as
// two rows. The (manga_id, number) UNIQUE collapse is a
// downstream schema concern handled separately.
assert_eq!(
chapters.iter().filter(|c| c.number == 52).count(),
2,
"two Ch.52 uploads must both survive parsing"
);
assert_eq!(
chapters.iter().filter(|c| c.number == 1).count(),
2,
"Ch.1 Official and Ch.1 Team Hazama are both kept"
);
// Notices / hiatus rows contain no digit at all, so
// `parse_chapter_number` returns None and they fall back to
// number=0. They are not filtered out.
let zero = chapters.iter().filter(|c| c.number == 0).count();
assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
}
#[test]
fn parse_chapter_number_grabs_first_integer_run() {
assert_eq!(parse_chapter_number("Ch.1"), Some(1));
assert_eq!(parse_chapter_number("Chapter 12"), Some(12));
assert_eq!(parse_chapter_number("Ch.2 - The Beginning"), Some(2));
// Decimal chapters keep the integer part (i32 storage).
assert_eq!(parse_chapter_number("Ch.12.5"), Some(12));
assert_eq!(parse_chapter_number("Special"), None);
}
#[test]
fn parse_last_page_picks_highest_pagination_link() {
// "Prev" and "Next" do not parse as integers and are ignored.
let html = r#"
<div id="left_side"><div class="pagination">
<a href="/list/1/">Prev</a>
<ol>
<li><a href="/list/1/">1</a></li>
<li><a href="/list/2/">2</a></li>
<li><a href="/list/47/">47</a></li>
<li><a href="/list/2/">Next</a></li>
</ol>
</div></div>
"#;
let doc = scraper::Html::parse_document(html);
assert_eq!(parse_last_page(&doc), Some(47));
}
#[test]
fn parse_last_page_none_when_no_pagination() {
let doc = scraper::Html::parse_document("<html></html>");
assert!(parse_last_page(&doc).is_none());
}
#[test]
fn page_url_substitutes_numeric_path_segment() {
assert_eq!(
page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5),
"https://site.example/list/5/?f=1&o=1&sortby=update_date&e="
);
// No numeric segment → URL returned unchanged.
assert_eq!(
page_url("https://site.example/list/?f=1", 5),
"https://site.example/list/?f=1"
);
}
#[test]
fn derive_key_strips_trailing_slash_and_query() {
assert_eq!(derive_key_from_url("https://x.example/manga/foo/"), "foo");
assert_eq!(derive_key_from_url("https://x.example/manga/foo?p=1"), "foo");
assert_eq!(derive_key_from_url("/manga/bar"), "bar");
}
#[test]
fn derive_chapter_key_strips_trailing_reader_page_segment() {
// Listing links go to page 1 of the reader; strip /pg-\d+/.
assert_eq!(
derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"),
"br_chapter-379272"
);
assert_eq!(
derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"),
"to_chapter-13"
);
// Defensive: deep-link to a non-first page should still resolve
// to the same chapter identity.
assert_eq!(
derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"),
"br_chapter-379272"
);
// No reader-page suffix → behaves like derive_key_from_url.
assert_eq!(
derive_chapter_key_from_url(".../uu/br_chapter-379272/"),
"br_chapter-379272"
);
// Query strings are stripped.
assert_eq!(
derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"),
"br_chapter-379272"
);
// `pg-foo` is not a valid reader-page segment; treated as identity.
assert_eq!(
derive_chapter_key_from_url(".../uu/something/pg-foo/"),
"pg-foo"
);
// Bare `pg-` (no digits) likewise not stripped.
assert_eq!(
derive_chapter_key_from_url(".../uu/something/pg-/"),
"pg-"
);
}
#[test]
fn metadata_hash_is_stable_and_field_sensitive() {
let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
let again = parse_manga_detail(DETAIL_HTML, "k", true).unwrap();
assert_eq!(base.metadata_hash, again.metadata_hash);
// Same fields except status flipped — hash must change.
let altered_html = DETAIL_HTML.replace("Ongoing", "Completed");
let altered = parse_manga_detail(&altered_html, "k", true).unwrap();
assert_ne!(base.metadata_hash, altered.metadata_hash);
}
#[test]
fn missing_optional_fields_parse_to_none() {
// Only the mandatory title is present.
let html = r#"<html><body><div class="w-title"><h1>Minimal</h1></div></body></html>"#;
let m = parse_manga_detail(html, "min", true).unwrap();
assert_eq!(m.title, "Minimal");
assert!(m.summary.is_none());
assert!(m.status.is_none());
assert!(m.authors.is_empty());
assert!(m.genres.is_empty());
assert!(m.tags.is_empty());
assert!(m.alternative_titles.is_empty());
assert!(m.chapters.is_empty());
}
#[test]
fn parse_manga_detail_skips_chapters_when_disabled() {
// Same fixture that yields 3 chapters above; with include_chapters=false
// the chapter table is ignored and the rest of the metadata still parses.
let m = parse_manga_detail(DETAIL_HTML, "k", false).unwrap();
assert!(m.chapters.is_empty(), "chapters should be empty when disabled");
assert_eq!(m.title, "Test Manga Title", "other fields still parse");
assert_eq!(m.authors, vec!["Author One", "Author Two"]);
}
#[test]
fn parse_manga_detail_errors_on_missing_title() {
let html = "<html><body><p>nothing</p></body></html>";
let err = parse_manga_detail(html, "x", true).unwrap_err();
assert!(err.to_string().contains("missing .w-title h1"));
}
}