//! First concrete [`Source`] impl, modeled on the selectors of the //! old Puppeteer crawler. The name "target" is a placeholder — rename //! once the site is officially identified. //! //! `scraper`'s selector parser does not support `:has()` or //! `:contains()`, so the labelled-`td` lookups from the old script //! (`td:has(label:contains("Author:"))`) are implemented by walking //! the parsed tree. use std::time::Duration; use anyhow::Context; use async_trait::async_trait; use sha2::{Digest, Sha256}; use super::{ DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga, SourceMangaRef, }; use crate::crawler::detect::{ has_logo_sentinel, is_broken_page_body, retry_on_transient_with_hook, PageError, }; use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT}; /// `sources.id` value for this Source impl. Exposed as a const so the /// daemon can look up per-source state (e.g. the recovery flag) before /// constructing the Source itself. pub const SOURCE_ID: &str = "target"; /// In-loop retry budget for transient pages encountered during a single /// `discover` walk. Bounded small because the next cron tick will pick up /// where this run left off via the recovery flag — these inline retries /// only need to absorb a brief site hiccup mid-walk, not a sustained /// outage. const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3; const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2); pub struct TargetSource { base_url: String, parse_chapters: bool, } impl TargetSource { pub fn new(base_url: impl Into) -> Self { Self { base_url: base_url.into(), parse_chapters: true, } } pub fn base_url(&self) -> &str { &self.base_url } /// Skip the chapter-list selector when parsing detail pages. /// The returned `SourceManga.chapters` will be empty even when the /// page has a chapter table. 
Caller must also avoid calling /// `repo::crawler::sync_manga_chapters` for these mangas — an /// empty list would otherwise soft-drop the manga's existing /// chapter rows. pub fn without_chapter_parsing(mut self) -> Self { self.parse_chapters = false; self } } #[async_trait] impl Source for TargetSource { fn id(&self) -> &'static str { SOURCE_ID } async fn discover( &self, ctx: &FetchContext<'_>, ) -> anyhow::Result> { // Probe page 1 up front (with transient retry) for two reasons: // a broken first page should abort cleanly rather than mid-walk, // and the HTML is handed straight to the first `next_batch` call // so the walker doesn't re-fetch it. Page count is discovered // incrementally — see `TargetSourceWalker::next_batch`. let first_html = retry_on_transient_with_hook( || async { navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await }, PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_DELAY, || async { recircuit_if_configured(ctx.tor).await }, ) .await?; Ok(Box::new(TargetSourceWalker { base_url: self.base_url.clone(), next_page: 1, first_page_html: Some(first_html), })) } async fn fetch_manga( &self, ctx: &FetchContext<'_>, r: &SourceMangaRef, ) -> anyhow::Result { // When we'll parse the chapter table, wait for at least one // chapter row to appear — that's the marker most sensitive to // the post-load JS partial-render race. When we won't, fall // back to the layout-level `#logo` so we still wait for the // page to settle. let marker = if self.parse_chapters { DETAIL_PAGE_CHAPTERS_MARKER } else { DETAIL_PAGE_LAYOUT_MARKER }; let html = navigate(ctx, r.url.as_str(), marker).await?; // Convert PageError → anyhow::Error via `?`. PageError stays // downcastable from the wrapped anyhow::Error so the pipeline // can still recognize Transient via `error.downcast_ref::()`. 
let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters) .with_context(|| format!("parse manga detail at {}", r.url))?; Ok(manga) } async fn fetch_chapter_list( &self, _ctx: &FetchContext<'_>, _manga: &SourceManga, ) -> anyhow::Result> { anyhow::bail!("fetch_chapter_list not implemented yet") } async fn fetch_chapter( &self, _ctx: &FetchContext<'_>, _r: &SourceChapterRef, ) -> anyhow::Result { anyhow::bail!("fetch_chapter not implemented yet") } } /// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in /// order, terminating as soon as a page renders cleanly with zero entries /// — that's the "we ran off the end of the index" signal. Page 1's HTML /// is cached at construction time (discover already had to fetch it for /// the transient probe) so the first batch doesn't re-fetch. /// /// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what /// stops us: the parser's `#logo` sentinel converts unrendered pages /// into transient errors before they reach this loop, so an empty /// parse result reliably means "no more entries." struct TargetSourceWalker { base_url: String, next_page: i32, first_page_html: Option, } #[async_trait] impl DiscoverWalk for TargetSourceWalker { async fn next_batch( &mut self, ctx: &FetchContext<'_>, ) -> anyhow::Result>> { let page_num = self.next_page; let page_refs = if page_num == 1 { // Reuse the cached page-1 HTML from the initial probe. Take // it (rather than clone) so a future re-entry that somehow // revisits page 1 still falls back to a real fetch. match self.first_page_html.take() { Some(html) => { let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc)? 
} None => { retry_on_transient_with_hook( || async { let html = navigate( ctx, self.base_url.as_str(), LIST_PAGE_MARKER, ) .await?; let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc) }, PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_DELAY, || async { recircuit_if_configured(ctx.tor).await }, ) .await? } } } else { retry_on_transient_with_hook( || async { let url = page_url(&self.base_url, page_num); let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?; let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc) }, PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_DELAY, || async { recircuit_if_configured(ctx.tor).await }, ) .await? }; tracing::info!(page_num, count = page_refs.len(), "page walked"); if page_refs.is_empty() { return Ok(None); } self.next_page += 1; Ok(Some(page_refs)) } } /// Per-page-type markers used by `navigate`'s post-navigation wait. /// Each is the most specific element the parser will later look for — /// waiting on it closes the partial-render race (e.g. `#chapter_table` /// wrapper present but rows still being injected by post-load JS) that /// the old fixed 1s sleep masked. See [`navigate`]. const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli"; const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico"; const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo"; /// Single point of rate-limited navigation. Every Source request goes /// through here, so the per-host limiter map is the only knob that /// controls per-origin RPS. Also the choke point for transient-page /// detection — every fetched body is screened by /// [`classify_navigate_html`] before being handed to a selector. /// /// `marker` is a CSS selector the caller expects to find on the loaded /// page. The wait is best-effort: a timeout is **not** an error /// (legitimately-empty pages may never render the marker), it just /// caps how long we'll hold for post-load JS to finish injecting /// content. 
The parser's own sentinels and the universal broken-page /// body check still catch real failures. async fn navigate( ctx: &FetchContext<'_>, url: &str, marker: &str, ) -> Result { ctx.rate.wait_for(url).await?; let page = ctx .browser .new_page(url) .await .map_err(|e| PageError::Other(anyhow::Error::from(e)))?; match wait_for_nav(&page).await { Ok(()) => {} Err(NavError::Timeout(_)) => { page.close().await.ok(); return Err(PageError::transient("nav timeout")); } Err(NavError::Cdp(e)) => { page.close().await.ok(); return Err(PageError::Other(anyhow::Error::from(e))); } } // Best-effort wait for the page-type marker. We deliberately // discard a timeout here — see fn-level doc. let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await; let html = page .content() .await .map_err(|e| PageError::Other(anyhow::Error::from(e)))?; page.close().await.ok(); classify_navigate_html(html) } /// Classify a fetched body. The broken-page template is universal across /// the site — every page type (list, detail, chapter list, reader) gets /// the same `we're sorry, the request file are not found` body when the /// server is hiccuping. Catching it here means individual parsers /// downstream don't have to repeat the check. fn classify_navigate_html(html: String) -> Result { if is_broken_page_body(&html) { return Err(PageError::transient("broken-page body signature")); } Ok(html) } /// Hook for [`retry_on_transient_with_hook`]: when TOR is configured, /// signal `NEWNYM` so the next navigation draws a fresh exit. Errors /// from the controller are logged and swallowed — failing to recircuit /// shouldn't take down the crawl, the next attempt just runs on the /// same circuit as before. 
async fn recircuit_if_configured(tor: Option<&crate::crawler::tor::TorController>) {
    if let Some(t) = tor {
        if let Err(e) = t.new_identity().await {
            tracing::warn!(error = %e, "TOR NEWNYM failed; retrying on same circuit");
        }
    }
}

/// Substitutes the first `/N/` path segment with the target page
/// number. Source impls that paginate via a different URL shape can
/// override this — for the modeled site the segment is always present.
///
/// A digit run only counts when it is bounded by `/` on both sides.
/// If no such segment exists the URL is returned unchanged.
fn page_url(template_url: &str, page: i32) -> String {
    let bytes = template_url.as_bytes();
    for (start, pair) in bytes.windows(2).enumerate() {
        if pair[0] != b'/' || !pair[1].is_ascii_digit() {
            continue;
        }
        let digits = bytes[start + 1..]
            .iter()
            .take_while(|b| b.is_ascii_digit())
            .count();
        let end = start + 1 + digits;
        // `start` and `end` both index ASCII `/` bytes, so slicing the
        // `&str` at them (and just past `end`) is always on a char boundary.
        if bytes.get(end) == Some(&b'/') {
            return format!(
                "{}/{page}/{}",
                &template_url[..start],
                &template_url[end + 1..]
            );
        }
    }
    template_url.to_string()
}

/// Test-only convenience wrapper: parse `html` into a document and run
/// [`parse_manga_list_from`] on it.
#[cfg(test)]
fn parse_manga_list(html: &str) -> Result<Vec<SourceMangaRef>, PageError> {
    let doc = scraper::Html::parse_document(html);
    parse_manga_list_from(&doc)
}

/// Parse a manga listing page. `#logo` is present on every well-formed
/// listing page on the source; its absence means the response is a
/// broken-page placeholder (transient) rather than a genuinely empty
/// listing. Empty listings (last-page tail, search with no hits) remain
/// `Ok(vec![])`.
fn parse_manga_list_from(doc: &scraper::Html) -> Result, PageError> { if !has_logo_sentinel(doc) { return Err(PageError::transient("manga list: #logo sentinel missing")); } let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap(); Ok(doc .select(&sel) .filter_map(|a| { let url = a.value().attr("href")?.trim().to_string(); if url.is_empty() { return None; } let title = collapse_whitespace(&a.text().collect::()); if title.is_empty() { return None; } Some(SourceMangaRef { source_manga_key: derive_key_from_url(&url), title, url, }) }) .collect()) } fn parse_manga_detail( html: &str, key: &str, include_chapters: bool, ) -> Result { let doc = scraper::Html::parse_document(html); // Sentinel first: a broken-page response will trip this before any // anyhow context is added for missing required fields. if !has_logo_sentinel(&doc) { return Err(PageError::transient("manga detail: #logo sentinel missing")); } let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?; let summary = first_text(&doc, ".manga_summary"); let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src"); let authors = links_in_labelled_td(&doc, "Author"); let genres = links_in_labelled_td(&doc, "Genre"); let raw_status = labelled_td_child_text(&doc, "Status", "span"); let status = normalize_status(raw_status.as_deref(), key); let alternative_titles = labelled_td_value_after_label(&doc, "Alternative") .map(|s| { s.split([';', ',', '|']) .map(str::trim) .filter(|p| !p.is_empty()) .map(String::from) .collect() }) .unwrap_or_default(); let tag_sel = scraper::Selector::parse(".aside-body a.tag").unwrap(); let tags: Vec = doc .select(&tag_sel) .map(|a| collapse_whitespace(&a.text().collect::())) .map(|s| strip_tag_count(&s)) .filter(|s| !s.is_empty()) .collect(); let chapters = if include_chapters { parse_chapter_list(&doc)? 
} else { Vec::new() }; let mut manga = SourceManga { source_manga_key: key.to_string(), title, alternative_titles, authors, genres, tags, status, summary, cover_url, chapters, metadata_hash: String::new(), }; manga.metadata_hash = compute_metadata_hash(&manga); Ok(manga) } /// Source advertises status as "Ongoing" or "Completed"; we normalize /// to the lowercase form the `mangas.status` CHECK constraint accepts. /// Anything else is a parse miss (selector drift, new value, etc.) and /// returns `None` after logging — the manga sync continues regardless. fn normalize_status(raw: Option<&str>, key: &str) -> Option { let trimmed = raw.map(str::trim).filter(|s| !s.is_empty())?; if trimmed.eq_ignore_ascii_case("ongoing") { Some("ongoing".to_string()) } else if trimmed.eq_ignore_ascii_case("completed") { Some("completed".to_string()) } else { tracing::error!( key, raw_status = trimmed, "unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None" ); None } } /// Strips a trailing digit-only `(NN)` suffix from a tag name, the form /// the source uses to display tag counts. Non-numeric parentheses are /// preserved. fn strip_tag_count(s: &str) -> String { let trimmed = s.trim(); if trimmed.ends_with(')') { if let Some(open) = trimmed.rfind('(') { let inside = &trimmed[open + 1..trimmed.len() - 1]; if !inside.is_empty() && inside.chars().all(|c| c.is_ascii_digit()) { return trimmed[..open].trim().to_string(); } } } trimmed.to_string() } /// Parse the chapter table on a manga detail page. Returns `Transient` if /// `#chapter_table` isn't in the DOM at all — the table is required even /// for mangas with no published chapters yet (the source renders an empty /// ``), so an absent table signals a partial render (post-load JS /// not done, layout drift) rather than a legitimately empty list. Without /// this sentinel, an empty `Vec` reaches `sync_manga_chapters` and the /// soft-drop branch flips every existing chapter to `dropped_at`. 
fn parse_chapter_list(doc: &scraper::Html) -> Result, PageError> { if !has_chapter_table_sentinel(doc) { return Err(PageError::transient( "manga detail: #chapter_table sentinel missing", )); } let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap(); Ok(doc .select(&sel) .filter_map(|a| { let url = a.value().attr("href")?.trim().to_string(); if url.is_empty() { return None; } let title_text = collapse_whitespace(&a.text().collect::()); let number = parse_chapter_number(&title_text).unwrap_or(0); Some(SourceChapterRef { source_chapter_key: derive_chapter_key_from_url(&url), number, title: (!title_text.is_empty()).then_some(title_text), url, }) }) .collect()) } /// Returns true when the chapter-table container is present in the DOM. /// Source-specific: the target site uses `#chapter_table` as the wrapper /// element. Distinguishes "table is present but empty" (legit edge case /// for new mangas) from "table is missing entirely" (partial render). fn has_chapter_table_sentinel(doc: &scraper::Html) -> bool { let sel = scraper::Selector::parse("#chapter_table").expect("valid selector"); doc.select(&sel).next().is_some() } fn parse_chapter_number(text: &str) -> Option { let mut buf = String::new(); for c in text.chars() { if c.is_ascii_digit() { buf.push(c); } else if !buf.is_empty() { break; } } buf.parse().ok() } fn derive_key_from_url(url: &str) -> String { url.split('?') .next() .unwrap_or(url) .trim_end_matches('/') .rsplit('/') .find(|s| !s.is_empty()) .unwrap_or(url) .to_string() } /// Chapter URLs on this source point at the reader's page 1, e.g. /// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the /// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment /// identifies a page *within* a chapter, so naively taking the last /// path component returns `"pg-1"` for every chapter and collapses /// them all under one source_chapter_key downstream. 
fn derive_chapter_key_from_url(url: &str) -> String { let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/'); let without_reader_page = match trimmed.rsplit_once('/') { Some((prefix, last)) if is_reader_page_segment(last) => prefix, _ => trimmed, }; without_reader_page .rsplit('/') .find(|s| !s.is_empty()) .unwrap_or(url) .to_string() } fn is_reader_page_segment(s: &str) -> bool { s.len() > 3 && s.starts_with("pg-") && s[3..].bytes().all(|b| b.is_ascii_digit()) } fn first_text(doc: &scraper::Html, sel: &str) -> Option { let s = scraper::Selector::parse(sel).ok()?; let el = doc.select(&s).next()?; let text = collapse_whitespace(&el.text().collect::()); (!text.is_empty()).then_some(text) } fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option { let s = scraper::Selector::parse(sel).ok()?; let el = doc.select(&s).next()?; el.value().attr(attr).map(str::to_string) } /// `td` whose contained `label` text begins with `label_prefix` — the /// `scraper`-friendly equivalent of `td:has(label:contains("Foo"))`. 
fn td_with_label<'a>( doc: &'a scraper::Html, label_prefix: &str, ) -> Option> { let td_sel = scraper::Selector::parse("td").unwrap(); let label_sel = scraper::Selector::parse("label").unwrap(); for td in doc.select(&td_sel) { for label in td.select(&label_sel) { let text: String = label.text().collect(); if text.trim().starts_with(label_prefix) { return Some(td); } } } None } fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec { let Some(td) = td_with_label(doc, label_prefix) else { return Vec::new(); }; let a_sel = scraper::Selector::parse("a").unwrap(); td.select(&a_sel) .map(|a| collapse_whitespace(&a.text().collect::())) .filter(|s| !s.is_empty()) .collect() } fn labelled_td_child_text( doc: &scraper::Html, label_prefix: &str, child_sel: &str, ) -> Option { let td = td_with_label(doc, label_prefix)?; let child = scraper::Selector::parse(child_sel).ok()?; let el = td.select(&child).next()?; let text = collapse_whitespace(&el.text().collect::()); (!text.is_empty()).then_some(text) } /// Returns the text content of the labelled `td` with the leading /// "Label:" portion stripped — used for "Alternative:" which puts the /// value directly in the cell rather than in a child element. fn labelled_td_value_after_label( doc: &scraper::Html, label_prefix: &str, ) -> Option { let td = td_with_label(doc, label_prefix)?; let full: String = td.text().collect(); let after = full.split_once(':').map(|(_, r)| r).unwrap_or(&full); let trimmed = collapse_whitespace(after); (!trimmed.is_empty()).then_some(trimmed) } fn collapse_whitespace(s: &str) -> String { s.split_whitespace().collect::>().join(" ") } fn compute_metadata_hash(m: &SourceManga) -> String { // Field separators are ASCII unit/record separators so a field // containing a delimiter character can't be mistaken for two // smaller fields. 
let mut h = Sha256::new(); fn feed(h: &mut Sha256, s: &str) { h.update(s.as_bytes()); h.update(b"\x1F"); } fn feed_list(h: &mut Sha256, xs: &[String]) { for s in xs { feed(h, s); } h.update(b"\x1E"); } feed(&mut h, &m.title); feed_list(&mut h, &m.alternative_titles); feed_list(&mut h, &m.authors); feed_list(&mut h, &m.genres); feed_list(&mut h, &m.tags); feed(&mut h, m.status.as_deref().unwrap_or("")); feed(&mut h, m.summary.as_deref().unwrap_or("")); feed(&mut h, m.cover_url.as_deref().unwrap_or("")); format!("{:x}", h.finalize()) } #[cfg(test)] mod tests { use super::*; const LISTING_HTML: &str = r#"

Target

Foo Manga

Bar Baz

Empty href ignored

"#; const DETAIL_HTML: &str = r#"

Target

Test Manga Title

A summary of the manga.

Author:Author One Author Two

Genre(s):Action Drama

Status:Ongoing

Alternative: Alt Title 1; Alt Title 2

Ch.1

Ch.2 - The Beginning

Chapter 3: Onward

"#; #[test] fn parse_manga_list_extracts_title_url_and_derives_key() { let refs = parse_manga_list(LISTING_HTML).expect("parse"); assert_eq!(refs.len(), 2, "third entry has empty href and is skipped"); assert_eq!(refs[0].title, "Foo Manga"); assert_eq!(refs[0].url, "https://target.example/manga/foo"); assert_eq!(refs[0].source_manga_key, "foo"); assert_eq!(refs[1].title, "Bar Baz"); assert_eq!(refs[1].source_manga_key, "bar-baz"); } #[test] fn parse_manga_list_returns_transient_when_logo_missing() { // Broken-page response: no #logo, no listing. Empty Vec would // hide this as "page has no mangas"; Transient is the signal // upstream code retries on. let html = r#"\

we're sorry, the request file are not found.

\ "#; let err = parse_manga_list(html).expect_err("expected Transient"); assert!(err.is_transient(), "got non-transient: {err}"); } #[test] fn parse_manga_list_ok_empty_when_logo_present_but_no_items() { // Last page of pagination, "no results" search, etc. Legitimately // empty must stay distinguishable from "page is broken". let html = r#"\

Target

\ "#; let refs = parse_manga_list(html).expect("logo present == not transient"); assert!(refs.is_empty()); } #[test] fn parse_manga_detail_pulls_all_fields() { let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse"); assert_eq!(m.source_manga_key, "test-key"); assert_eq!(m.title, "Test Manga Title"); assert_eq!(m.summary.as_deref(), Some("A summary of the manga.")); assert_eq!(m.authors, vec!["Author One", "Author Two"]); assert_eq!(m.genres, vec!["Action", "Drama"]); assert_eq!(m.status.as_deref(), Some("ongoing")); assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]); // Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy". assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]); assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg")); assert!(!m.metadata_hash.is_empty()); assert_eq!(m.chapters.len(), 3); assert_eq!(m.chapters[0].number, 1); assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1")); assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1"); assert_eq!(m.chapters[0].source_chapter_key, "1"); assert_eq!(m.chapters[1].number, 2); assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning")); assert_eq!(m.chapters[2].number, 3); assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward")); } #[test] fn status_normalized_case_insensitively() { assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing")); assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing")); assert_eq!(normalize_status(Some(" completed "), "k").as_deref(), Some("completed")); } #[test] fn unknown_status_logs_and_returns_none() { // Logging is observable in test output via tracing-test, but // here we just assert the contract: unknown becomes None // (and the manga is therefore still synced by the caller). 
assert!(normalize_status(Some("Hiatus"), "k").is_none()); assert!(normalize_status(Some(""), "k").is_none()); assert!(normalize_status(None, "k").is_none()); } #[test] fn strip_tag_count_drops_trailing_digit_parens_only() { assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy"); assert_eq!(strip_tag_count(" Action (5) "), "Action"); assert_eq!(strip_tag_count("Romance"), "Romance"); // Non-numeric parens stay put. assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)"); // Only the trailing paren is considered. assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)"); } #[test] fn parse_chapter_list_keeps_all_chapters_with_unique_keys() { // Real listing fixture from the target site. 15 rows: chapters // with various Ch.N markup, one hiatus row, three "notice." rows, // and duplicates of Ch.1 and Ch.52 from different uploaders. // Every row must survive parsing and every chapter must have a // distinct source_chapter_key — chapter URLs all end in `/pg-1/` // (the reader's page-1 entry point), and a naive // last-segment-of-URL derivation returns "pg-1" for every row, // collapsing the whole list into one downstream chapter row. let html = include_str!( "../../../tests/fixtures/target/chapter_list_uu.html" ); let doc = scraper::Html::parse_document(html); let chapters = parse_chapter_list(&doc).expect("fixture has the table"); assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)"); let mut keys: Vec<&str> = chapters.iter().map(|c| c.source_chapter_key.as_str()).collect(); keys.sort(); let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]); assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}"); for c in &chapters { assert_ne!( c.source_chapter_key, "pg-1", "key must not be the reader-page segment: {:?}", c ); } // Latest chapter is first (source orders newest → oldest). 
assert_eq!(chapters[0].number, 67); assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official")); assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272"); // Duplicate-number chapters (different uploaders) survive as // two rows. The (manga_id, number) UNIQUE collapse is a // downstream schema concern handled separately. assert_eq!( chapters.iter().filter(|c| c.number == 52).count(), 2, "two Ch.52 uploads must both survive parsing" ); assert_eq!( chapters.iter().filter(|c| c.number == 1).count(), 2, "Ch.1 Official and Ch.1 Team Hazama are both kept" ); // Notices / hiatus rows have no leading digit so they parse to // number=0. They are not filtered out. let zero = chapters.iter().filter(|c| c.number == 0).count(); assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}"); } #[test] fn parse_chapter_number_grabs_first_integer_run() { assert_eq!(parse_chapter_number("Ch.1"), Some(1)); assert_eq!(parse_chapter_number("Chapter 12"), Some(12)); assert_eq!(parse_chapter_number("Ch.2 - The Beginning"), Some(2)); // Decimal chapters keep the integer part (i32 storage). assert_eq!(parse_chapter_number("Ch.12.5"), Some(12)); assert_eq!(parse_chapter_number("Special"), None); } #[test] fn page_url_substitutes_numeric_path_segment() { assert_eq!( page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5), "https://site.example/list/5/?f=1&o=1&sortby=update_date&e=" ); // No numeric segment → URL returned unchanged. assert_eq!( page_url("https://site.example/list/?f=1", 5), "https://site.example/list/?f=1" ); } #[test] fn derive_key_strips_trailing_slash_and_query() { assert_eq!(derive_key_from_url("https://x.example/manga/foo/"), "foo"); assert_eq!(derive_key_from_url("https://x.example/manga/foo?p=1"), "foo"); assert_eq!(derive_key_from_url("/manga/bar"), "bar"); } #[test] fn derive_chapter_key_strips_trailing_reader_page_segment() { // Listing links go to page 1 of the reader; strip /pg-\d+/. 
assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"), "br_chapter-379272" ); assert_eq!( derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"), "to_chapter-13" ); // Defensive: deep-link to a non-first page should still resolve // to the same chapter identity. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"), "br_chapter-379272" ); // No reader-page suffix → behaves like derive_key_from_url. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/"), "br_chapter-379272" ); // Query strings are stripped. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"), "br_chapter-379272" ); // `pg-foo` is not a valid reader-page segment; treated as identity. assert_eq!( derive_chapter_key_from_url(".../uu/something/pg-foo/"), "pg-foo" ); // Bare `pg-` (no digits) likewise not stripped. assert_eq!( derive_chapter_key_from_url(".../uu/something/pg-/"), "pg-" ); } #[test] fn metadata_hash_is_stable_and_field_sensitive() { let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap(); let again = parse_manga_detail(DETAIL_HTML, "k", true).unwrap(); assert_eq!(base.metadata_hash, again.metadata_hash); // Same fields except status flipped — hash must change. let altered_html = DETAIL_HTML.replace("Ongoing", "Completed"); let altered = parse_manga_detail(&altered_html, "k", true).unwrap(); assert_ne!(base.metadata_hash, altered.metadata_hash); } #[test] fn missing_optional_fields_parse_to_none() { // Minimal but well-formed detail page: title is required, every // other field is optional, but the chapter table is structural — // its absence is treated as Transient (a freshly added manga // renders the table empty, not absent). See // `parse_chapter_list_returns_transient_when_table_missing` for // the negative case. let html = r#"\

Target

Minimal

\ "#; let m = parse_manga_detail(html, "min", true).unwrap(); assert_eq!(m.title, "Minimal"); assert!(m.summary.is_none()); assert!(m.status.is_none()); assert!(m.authors.is_empty()); assert!(m.genres.is_empty()); assert!(m.tags.is_empty()); assert!(m.alternative_titles.is_empty()); assert!(m.chapters.is_empty()); } #[test] fn parse_manga_detail_skips_chapters_when_disabled() { // Same fixture that yields 3 chapters above; with include_chapters=false // the chapter table is ignored and the rest of the metadata still parses. let m = parse_manga_detail(DETAIL_HTML, "k", false).unwrap(); assert!(m.chapters.is_empty(), "chapters should be empty when disabled"); assert_eq!(m.title, "Test Manga Title", "other fields still parse"); assert_eq!(m.authors, vec!["Author One", "Author Two"]); } #[test] fn parse_manga_detail_errors_on_missing_title() { // Logo present (page is alive) — failure here is a real parse // miss (Other), not Transient. let html = r#"\

Target

nothing

"#; let err = parse_manga_detail(html, "x", true).unwrap_err(); assert!(!err.is_transient(), "expected Other, got Transient: {err}"); assert!(err.to_string().contains("missing .w-title h1")); } #[test] fn classify_navigate_html_passes_normal_body_through() { let body = "

Target

content

" .to_string(); let out = classify_navigate_html(body.clone()).expect("ok"); assert_eq!(out, body); } #[test] fn classify_navigate_html_returns_transient_for_broken_template() { let body = "\

we're sorry, the request file are not found.

\ " .to_string(); let err = classify_navigate_html(body).expect_err("expected Transient"); assert!(err.is_transient(), "got non-transient: {err}"); } #[test] fn parse_manga_detail_returns_transient_when_logo_missing() { // Broken-page response on a detail URL — must be reported as // Transient so the job is retried rather than logging "missing // .w-title h1" against a permanently-skipped manga. let html = "\

we're sorry, the request file are not found.

\ "; let err = parse_manga_detail(html, "x", true).expect_err("expected Transient"); assert!(err.is_transient(), "got non-transient: {err}"); } #[test] fn parse_chapter_list_returns_transient_when_table_missing() { // Partial render (post-load JS hadn't injected the table, layout // drift, etc). Returning Vec::new() would silently soft-drop every // existing chapter for the manga via sync_manga_chapters; Transient // is the signal the job system retries on. let html = r#"

Target

Test

"#; let doc = scraper::Html::parse_document(html); let err = parse_chapter_list(&doc).expect_err("expected Transient"); assert!(err.is_transient(), "got non-transient: {err}"); } #[test] fn parse_chapter_list_ok_empty_when_table_present_but_no_rows() { // A freshly-added manga with no chapters yet — the source renders // the `` wrapper but no `` rows // inside. Must stay distinguishable from a missing-table render. let html = r#"

Target

"#; let doc = scraper::Html::parse_document(html); let chapters = parse_chapter_list(&doc).expect("present table is not transient"); assert!(chapters.is_empty()); } #[test] fn parse_manga_detail_propagates_chapter_table_transient() { // End-to-end: a detail page that survives the #logo sentinel but // has the chapter table stripped must fail Transient at the parser // boundary, not return a SourceManga with empty chapters. let html = r#"

Target

Test Title

"#; let err = parse_manga_detail(html, "key", true).expect_err("expected Transient"); assert!(err.is_transient(), "got non-transient: {err}"); } #[test] fn parse_manga_detail_skips_chapter_sentinel_when_include_chapters_false() { // Metadata-only mode (`skip_chapters` upstream) must not require // the chapter table — pipeline.rs avoids calling sync_manga_chapters // for these mangas, so the absent table is not a correctness issue // and shouldn't surface as Transient. let html = r#"

Target

Test Title

"#; let manga = parse_manga_detail(html, "key", false) .expect("metadata-only parse must not require chapter table"); assert!(manga.chapters.is_empty()); } }