//! First concrete [`Source`] impl, modeled on the selectors of the //! old Puppeteer crawler. The name "target" is a placeholder — rename //! once the site is officially identified. //! //! `scraper`'s selector parser does not support `:has()` or //! `:contains()`, so the labelled-`td` lookups from the old script //! (`td:has(label:contains("Author:"))`) are implemented by walking //! the parsed tree. use std::time::Duration; use anyhow::Context; use async_trait::async_trait; use sha2::{Digest, Sha256}; use super::{ DiscoverMode, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga, SourceMangaRef, }; pub struct TargetSource { base_url: String, parse_chapters: bool, } impl TargetSource { pub fn new(base_url: impl Into) -> Self { Self { base_url: base_url.into(), parse_chapters: true, } } pub fn base_url(&self) -> &str { &self.base_url } /// Skip the chapter-list selector when parsing detail pages. /// The returned `SourceManga.chapters` will be empty even when the /// page has a chapter table. Caller must also avoid calling /// `repo::crawler::sync_manga_chapters` for these mangas — an /// empty list would otherwise soft-drop the manga's existing /// chapter rows. pub fn without_chapter_parsing(mut self) -> Self { self.parse_chapters = false; self } } #[async_trait] impl Source for TargetSource { fn id(&self) -> &'static str { "target" } async fn discover( &self, ctx: &FetchContext<'_>, mode: DiscoverMode, max_results: Option, ) -> anyhow::Result> { // Always visit page 1 first because that's the only way to // discover `last_page`. We cache the HTML so we don't have to // re-navigate when the iteration reaches page 1 again. let first_html = navigate(ctx, self.base_url.as_str()).await?; let last_page = { let doc = scraper::Html::parse_document(&first_html); parse_last_page(&doc) }; let backfill = matches!(mode, DiscoverMode::Backfill); let order: Vec = match (last_page, backfill) { (None, _) => vec![1], // Backfill = oldest-first: walk pages last → 1, then // reverse within each page (the listing is update_date // DESC, so the bottom of the last page is the oldest // entry the source still surfaces). (Some(last), true) => (1..=last).rev().collect(), (Some(last), false) => (1..=last).collect(), }; tracing::info!( ?mode, last_page = ?last_page, page_count = order.len(), "walking pagination" ); let mut all = Vec::new(); for page_num in order { let html = if page_num == 1 { first_html.clone() } else { navigate(ctx, &page_url(&self.base_url, page_num)).await? }; let mut page_refs = { let doc = scraper::Html::parse_document(&html); parse_manga_list_from(&doc) }; if backfill { page_refs.reverse(); } tracing::info!(page_num, count = page_refs.len(), "page walked"); all.extend(page_refs); if cap_reached(&all, max_results) { tracing::info!(cap = ?max_results, "max_results reached; halting pagination"); break; } } Ok(truncate_to_cap(all, max_results)) } async fn fetch_manga( &self, ctx: &FetchContext<'_>, r: &SourceMangaRef, ) -> anyhow::Result { let html = navigate(ctx, r.url.as_str()).await?; parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters) .with_context(|| format!("parse manga detail at {}", r.url)) } async fn fetch_chapter_list( &self, _ctx: &FetchContext<'_>, _manga: &SourceManga, ) -> anyhow::Result> { anyhow::bail!("fetch_chapter_list not implemented yet") } async fn fetch_chapter( &self, _ctx: &FetchContext<'_>, _r: &SourceChapterRef, ) -> anyhow::Result { anyhow::bail!("fetch_chapter not implemented yet") } } fn cap_reached(buf: &[T], max: Option) -> bool { matches!(max, Some(m) if buf.len() >= m) } fn truncate_to_cap(mut buf: Vec, max: Option) -> Vec { if let Some(m) = max { buf.truncate(m); } buf } /// Single point of rate-limited navigation. Every Source request goes /// through here, so the per-host limiter map is the only knob that /// controls per-origin RPS. async fn navigate(ctx: &FetchContext<'_>, url: &str) -> anyhow::Result { ctx.rate.wait_for(url).await?; let page = ctx.browser.new_page(url).await?; page.wait_for_navigation().await?; // Stopgap until we wait on a specific selector per page type — // gives any post-load JS a beat to finish injecting content. tokio::time::sleep(Duration::from_secs(1)).await; let html = page.content().await?; page.close().await?; Ok(html) } fn parse_last_page(doc: &scraper::Html) -> Option { // Pagination links carry their page number as text. Take the // numeric maximum so we don't depend on a specific layout (Prev, // Next, ellipses, etc. all get filtered out by .parse). let sel = scraper::Selector::parse("#left_side .pagination a").unwrap(); doc.select(&sel) .filter_map(|a| { collapse_whitespace(&a.text().collect::()) .parse::() .ok() }) .max() } /// Substitutes the first `/N/` path segment with the target page /// number. Source impls that paginate via a different URL shape can /// override this — for the modeled site the segment is always present. fn page_url(template_url: &str, page: i32) -> String { let bytes = template_url.as_bytes(); let mut i = 0; while i + 1 < bytes.len() { if bytes[i] == b'/' && bytes[i + 1].is_ascii_digit() { let start = i; let mut j = i + 1; while j < bytes.len() && bytes[j].is_ascii_digit() { j += 1; } if j < bytes.len() && bytes[j] == b'/' { let mut out = String::with_capacity(template_url.len() + 4); out.push_str(&template_url[..start]); out.push_str(&format!("/{page}/")); out.push_str(&template_url[j + 1..]); return out; } } i += 1; } template_url.to_string() } #[cfg(test)] fn parse_manga_list(html: &str) -> Vec { let doc = scraper::Html::parse_document(html); parse_manga_list_from(&doc) } fn parse_manga_list_from(doc: &scraper::Html) -> Vec { let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap(); doc.select(&sel) .filter_map(|a| { let url = a.value().attr("href")?.trim().to_string(); if url.is_empty() { return None; } let title = collapse_whitespace(&a.text().collect::()); if title.is_empty() { return None; } Some(SourceMangaRef { source_manga_key: derive_key_from_url(&url), title, url, }) }) .collect() } fn parse_manga_detail( html: &str, key: &str, include_chapters: bool, ) -> anyhow::Result { let doc = scraper::Html::parse_document(html); let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?; let summary = first_text(&doc, ".manga_summary"); let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src"); let authors = links_in_labelled_td(&doc, "Author"); let genres = links_in_labelled_td(&doc, "Genre"); let raw_status = labelled_td_child_text(&doc, "Status", "span"); let status = normalize_status(raw_status.as_deref(), key); let alternative_titles = labelled_td_value_after_label(&doc, "Alternative") .map(|s| { s.split([';', ',', '|']) .map(str::trim) .filter(|p| !p.is_empty()) .map(String::from) .collect() }) .unwrap_or_default(); let tag_sel = scraper::Selector::parse(".aside-body a.tag").unwrap(); let tags: Vec = doc .select(&tag_sel) .map(|a| collapse_whitespace(&a.text().collect::())) .map(|s| strip_tag_count(&s)) .filter(|s| !s.is_empty()) .collect(); let chapters = if include_chapters { parse_chapter_list(&doc) } else { Vec::new() }; let mut manga = SourceManga { source_manga_key: key.to_string(), title, alternative_titles, authors, genres, tags, status, summary, cover_url, chapters, metadata_hash: String::new(), }; manga.metadata_hash = compute_metadata_hash(&manga); Ok(manga) } /// Source advertises status as "Ongoing" or "Completed"; we normalize /// to the lowercase form the `mangas.status` CHECK constraint accepts. /// Anything else is a parse miss (selector drift, new value, etc.) and /// returns `None` after logging — the manga sync continues regardless. fn normalize_status(raw: Option<&str>, key: &str) -> Option { let trimmed = raw.map(str::trim).filter(|s| !s.is_empty())?; if trimmed.eq_ignore_ascii_case("ongoing") { Some("ongoing".to_string()) } else if trimmed.eq_ignore_ascii_case("completed") { Some("completed".to_string()) } else { tracing::error!( key, raw_status = trimmed, "unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None" ); None } } /// Strips a trailing digit-only `(NN)` suffix from a tag name, the form /// the source uses to display tag counts. Non-numeric parentheses are /// preserved. fn strip_tag_count(s: &str) -> String { let trimmed = s.trim(); if trimmed.ends_with(')') { if let Some(open) = trimmed.rfind('(') { let inside = &trimmed[open + 1..trimmed.len() - 1]; if !inside.is_empty() && inside.chars().all(|c| c.is_ascii_digit()) { return trimmed[..open].trim().to_string(); } } } trimmed.to_string() } fn parse_chapter_list(doc: &scraper::Html) -> Vec { let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap(); doc.select(&sel) .filter_map(|a| { let url = a.value().attr("href")?.trim().to_string(); if url.is_empty() { return None; } let title_text = collapse_whitespace(&a.text().collect::()); let number = parse_chapter_number(&title_text).unwrap_or(0); Some(SourceChapterRef { source_chapter_key: derive_chapter_key_from_url(&url), number, title: (!title_text.is_empty()).then_some(title_text), url, }) }) .collect() } fn parse_chapter_number(text: &str) -> Option { let mut buf = String::new(); for c in text.chars() { if c.is_ascii_digit() { buf.push(c); } else if !buf.is_empty() { break; } } buf.parse().ok() } fn derive_key_from_url(url: &str) -> String { url.split('?') .next() .unwrap_or(url) .trim_end_matches('/') .rsplit('/') .find(|s| !s.is_empty()) .unwrap_or(url) .to_string() } /// Chapter URLs on this source point at the reader's page 1, e.g. /// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the /// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment /// identifies a page *within* a chapter, so naively taking the last /// path component returns `"pg-1"` for every chapter and collapses /// them all under one source_chapter_key downstream. fn derive_chapter_key_from_url(url: &str) -> String { let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/'); let without_reader_page = match trimmed.rsplit_once('/') { Some((prefix, last)) if is_reader_page_segment(last) => prefix, _ => trimmed, }; without_reader_page .rsplit('/') .find(|s| !s.is_empty()) .unwrap_or(url) .to_string() } fn is_reader_page_segment(s: &str) -> bool { s.len() > 3 && s.starts_with("pg-") && s[3..].bytes().all(|b| b.is_ascii_digit()) } fn first_text(doc: &scraper::Html, sel: &str) -> Option { let s = scraper::Selector::parse(sel).ok()?; let el = doc.select(&s).next()?; let text = collapse_whitespace(&el.text().collect::()); (!text.is_empty()).then_some(text) } fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option { let s = scraper::Selector::parse(sel).ok()?; let el = doc.select(&s).next()?; el.value().attr(attr).map(str::to_string) } /// `td` whose contained `label` text begins with `label_prefix` — the /// `scraper`-friendly equivalent of `td:has(label:contains("Foo"))`. fn td_with_label<'a>( doc: &'a scraper::Html, label_prefix: &str, ) -> Option> { let td_sel = scraper::Selector::parse("td").unwrap(); let label_sel = scraper::Selector::parse("label").unwrap(); for td in doc.select(&td_sel) { for label in td.select(&label_sel) { let text: String = label.text().collect(); if text.trim().starts_with(label_prefix) { return Some(td); } } } None } fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec { let Some(td) = td_with_label(doc, label_prefix) else { return Vec::new(); }; let a_sel = scraper::Selector::parse("a").unwrap(); td.select(&a_sel) .map(|a| collapse_whitespace(&a.text().collect::())) .filter(|s| !s.is_empty()) .collect() } fn labelled_td_child_text( doc: &scraper::Html, label_prefix: &str, child_sel: &str, ) -> Option { let td = td_with_label(doc, label_prefix)?; let child = scraper::Selector::parse(child_sel).ok()?; let el = td.select(&child).next()?; let text = collapse_whitespace(&el.text().collect::()); (!text.is_empty()).then_some(text) } /// Returns the text content of the labelled `td` with the leading /// "Label:" portion stripped — used for "Alternative:" which puts the /// value directly in the cell rather than in a child element. fn labelled_td_value_after_label( doc: &scraper::Html, label_prefix: &str, ) -> Option { let td = td_with_label(doc, label_prefix)?; let full: String = td.text().collect(); let after = full.split_once(':').map(|(_, r)| r).unwrap_or(&full); let trimmed = collapse_whitespace(after); (!trimmed.is_empty()).then_some(trimmed) } fn collapse_whitespace(s: &str) -> String { s.split_whitespace().collect::>().join(" ") } fn compute_metadata_hash(m: &SourceManga) -> String { // Field separators are ASCII unit/record separators so a field // containing a delimiter character can't be mistaken for two // smaller fields. let mut h = Sha256::new(); fn feed(h: &mut Sha256, s: &str) { h.update(s.as_bytes()); h.update(b"\x1F"); } fn feed_list(h: &mut Sha256, xs: &[String]) { for s in xs { feed(h, s); } h.update(b"\x1E"); } feed(&mut h, &m.title); feed_list(&mut h, &m.alternative_titles); feed_list(&mut h, &m.authors); feed_list(&mut h, &m.genres); feed_list(&mut h, &m.tags); feed(&mut h, m.status.as_deref().unwrap_or("")); feed(&mut h, m.summary.as_deref().unwrap_or("")); feed(&mut h, m.cover_url.as_deref().unwrap_or("")); format!("{:x}", h.finalize()) } #[cfg(test)] mod tests { use super::*; const LISTING_HTML: &str = r#"

Foo Manga

Bar Baz

Empty href ignored

"#; const DETAIL_HTML: &str = r#"

Test Manga Title

A summary of the manga.

Author:Author One Author Two

Genre(s):Action Drama

Status:Ongoing

Alternative: Alt Title 1; Alt Title 2

Ch.1

Ch.2 - The Beginning

Chapter 3: Onward

"#; #[test] fn parse_manga_list_extracts_title_url_and_derives_key() { let refs = parse_manga_list(LISTING_HTML); assert_eq!(refs.len(), 2, "third entry has empty href and is skipped"); assert_eq!(refs[0].title, "Foo Manga"); assert_eq!(refs[0].url, "https://target.example/manga/foo"); assert_eq!(refs[0].source_manga_key, "foo"); assert_eq!(refs[1].title, "Bar Baz"); assert_eq!(refs[1].source_manga_key, "bar-baz"); } #[test] fn parse_manga_detail_pulls_all_fields() { let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse"); assert_eq!(m.source_manga_key, "test-key"); assert_eq!(m.title, "Test Manga Title"); assert_eq!(m.summary.as_deref(), Some("A summary of the manga.")); assert_eq!(m.authors, vec!["Author One", "Author Two"]); assert_eq!(m.genres, vec!["Action", "Drama"]); assert_eq!(m.status.as_deref(), Some("ongoing")); assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]); // Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy". assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]); assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg")); assert!(!m.metadata_hash.is_empty()); assert_eq!(m.chapters.len(), 3); assert_eq!(m.chapters[0].number, 1); assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1")); assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1"); assert_eq!(m.chapters[0].source_chapter_key, "1"); assert_eq!(m.chapters[1].number, 2); assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning")); assert_eq!(m.chapters[2].number, 3); assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward")); } #[test] fn status_normalized_case_insensitively() { assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing")); assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing")); assert_eq!(normalize_status(Some(" completed "), "k").as_deref(), Some("completed")); } #[test] fn unknown_status_logs_and_returns_none() { // Logging is observable in test output via tracing-test, but // here we just assert the contract: unknown becomes None // (and the manga is therefore still synced by the caller). assert!(normalize_status(Some("Hiatus"), "k").is_none()); assert!(normalize_status(Some(""), "k").is_none()); assert!(normalize_status(None, "k").is_none()); } #[test] fn strip_tag_count_drops_trailing_digit_parens_only() { assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy"); assert_eq!(strip_tag_count(" Action (5) "), "Action"); assert_eq!(strip_tag_count("Romance"), "Romance"); // Non-numeric parens stay put. assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)"); // Only the trailing paren is considered. assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)"); } #[test] fn parse_chapter_list_keeps_all_chapters_with_unique_keys() { // Real listing fixture from the target site. 15 rows: chapters // with various Ch.N markup, one hiatus row, three "notice." rows, // and duplicates of Ch.1 and Ch.52 from different uploaders. // Every row must survive parsing and every chapter must have a // distinct source_chapter_key — chapter URLs all end in `/pg-1/` // (the reader's page-1 entry point), and a naive // last-segment-of-URL derivation returns "pg-1" for every row, // collapsing the whole list into one downstream chapter row. let html = include_str!( "../../../tests/fixtures/target/chapter_list_uu.html" ); let doc = scraper::Html::parse_document(html); let chapters = parse_chapter_list(&doc); assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)"); let mut keys: Vec<&str> = chapters.iter().map(|c| c.source_chapter_key.as_str()).collect(); keys.sort(); let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]); assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}"); for c in &chapters { assert_ne!( c.source_chapter_key, "pg-1", "key must not be the reader-page segment: {:?}", c ); } // Latest chapter is first (source orders newest → oldest). assert_eq!(chapters[0].number, 67); assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official")); assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272"); // Duplicate-number chapters (different uploaders) survive as // two rows. The (manga_id, number) UNIQUE collapse is a // downstream schema concern handled separately. assert_eq!( chapters.iter().filter(|c| c.number == 52).count(), 2, "two Ch.52 uploads must both survive parsing" ); assert_eq!( chapters.iter().filter(|c| c.number == 1).count(), 2, "Ch.1 Official and Ch.1 Team Hazama are both kept" ); // Notices / hiatus rows have no leading digit so they parse to // number=0. They are not filtered out. let zero = chapters.iter().filter(|c| c.number == 0).count(); assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}"); } #[test] fn parse_chapter_number_grabs_first_integer_run() { assert_eq!(parse_chapter_number("Ch.1"), Some(1)); assert_eq!(parse_chapter_number("Chapter 12"), Some(12)); assert_eq!(parse_chapter_number("Ch.2 - The Beginning"), Some(2)); // Decimal chapters keep the integer part (i32 storage). assert_eq!(parse_chapter_number("Ch.12.5"), Some(12)); assert_eq!(parse_chapter_number("Special"), None); } #[test] fn parse_last_page_picks_highest_pagination_link() { let html = r#"

"#; let doc = scraper::Html::parse_document(html); assert_eq!(parse_last_page(&doc), Some(47)); } #[test] fn parse_last_page_none_when_no_pagination() { let doc = scraper::Html::parse_document(""); assert!(parse_last_page(&doc).is_none()); } #[test] fn page_url_substitutes_numeric_path_segment() { assert_eq!( page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5), "https://site.example/list/5/?f=1&o=1&sortby=update_date&e=" ); // No numeric segment → URL returned unchanged. assert_eq!( page_url("https://site.example/list/?f=1", 5), "https://site.example/list/?f=1" ); } #[test] fn derive_key_strips_trailing_slash_and_query() { assert_eq!(derive_key_from_url("https://x.example/manga/foo/"), "foo"); assert_eq!(derive_key_from_url("https://x.example/manga/foo?p=1"), "foo"); assert_eq!(derive_key_from_url("/manga/bar"), "bar"); } #[test] fn derive_chapter_key_strips_trailing_reader_page_segment() { // Listing links go to page 1 of the reader; strip /pg-\d+/. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/"), "br_chapter-379272" ); assert_eq!( derive_chapter_key_from_url(".../uu/to_chapter-13/pg-1/"), "to_chapter-13" ); // Defensive: deep-link to a non-first page should still resolve // to the same chapter identity. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-25/"), "br_chapter-379272" ); // No reader-page suffix → behaves like derive_key_from_url. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/"), "br_chapter-379272" ); // Query strings are stripped. assert_eq!( derive_chapter_key_from_url(".../uu/br_chapter-379272/pg-1/?ref=x"), "br_chapter-379272" ); // `pg-foo` is not a valid reader-page segment; treated as identity. assert_eq!( derive_chapter_key_from_url(".../uu/something/pg-foo/"), "pg-foo" ); // Bare `pg-` (no digits) likewise not stripped. assert_eq!( derive_chapter_key_from_url(".../uu/something/pg-/"), "pg-" ); } #[test] fn metadata_hash_is_stable_and_field_sensitive() { let base = parse_manga_detail(DETAIL_HTML, "k", true).unwrap(); let again = parse_manga_detail(DETAIL_HTML, "k", true).unwrap(); assert_eq!(base.metadata_hash, again.metadata_hash); // Same fields except status flipped — hash must change. let altered_html = DETAIL_HTML.replace("Ongoing", "Completed"); let altered = parse_manga_detail(&altered_html, "k", true).unwrap(); assert_ne!(base.metadata_hash, altered.metadata_hash); } #[test] fn missing_optional_fields_parse_to_none() { let html = r#"

Minimal

"#; let m = parse_manga_detail(html, "min", true).unwrap(); assert_eq!(m.title, "Minimal"); assert!(m.summary.is_none()); assert!(m.status.is_none()); assert!(m.authors.is_empty()); assert!(m.genres.is_empty()); assert!(m.tags.is_empty()); assert!(m.alternative_titles.is_empty()); assert!(m.chapters.is_empty()); } #[test] fn parse_manga_detail_skips_chapters_when_disabled() { // Same fixture that yields 3 chapters above; with include_chapters=false // the chapter table is ignored and the rest of the metadata still parses. let m = parse_manga_detail(DETAIL_HTML, "k", false).unwrap(); assert!(m.chapters.is_empty(), "chapters should be empty when disabled"); assert_eq!(m.title, "Test Manga Title", "other fields still parse"); assert_eq!(m.authors, vec!["Author One", "Author Two"]); } #[test] fn parse_manga_detail_errors_on_missing_title() { let html = "

nothing

"; let err = parse_manga_detail(html, "x", true).unwrap_err(); assert!(err.to_string().contains("missing .w-title h1")); } }