- target.rs swaps retry_on_transient → retry_on_transient_with_hook, signaling NEWNYM via ctx.tor between attempts when configured. - session.rs gains verify_session_with_recircuit; the bare verify_session is now a one-line wrapper passing tor=None, unauth_max_recircuit=0. The inner run_session_probe_loop is pure-over-IO and unit-tested with closure-based fakes. - content.rs extracts fetch_chapter_html_once + the closure-driven fetch_chapter_html_with_recircuit, used by sync_chapter_content to retry on Transient or Unauthenticated up to a recircuit_budget. Budget = 0 (no TOR) preserves original behavior bit-for-bit. - app.rs and bin/crawler.rs construct the controller before on_launch and pass it into verify_session_with_recircuit, so a transient hiccup at startup no longer requires PHPSESSID rotation. Recircuit budget defaults to CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS (3). Errors from NEWNYM are logged and swallowed — failing to recircuit should not take down the crawl. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1053 lines
41 KiB
Rust
1053 lines
41 KiB
Rust
//! First concrete [`Source`] impl, modeled on the selectors of the
|
|
//! old Puppeteer crawler. The name "target" is a placeholder — rename
|
|
//! once the site is officially identified.
|
|
//!
|
|
//! `scraper`'s selector parser does not support `:has()` or
|
|
//! `:contains()`, so the labelled-`td` lookups from the old script
|
|
//! (`td:has(label:contains("Author:"))`) are implemented by walking
|
|
//! the parsed tree.
|
|
|
|
use std::time::Duration;
|
|
|
|
use anyhow::Context;
|
|
use async_trait::async_trait;
|
|
use sha2::{Digest, Sha256};
|
|
|
|
use super::{
|
|
DiscoverWalk, FetchContext, Source, SourceChapter, SourceChapterRef, SourceManga,
|
|
SourceMangaRef,
|
|
};
|
|
use crate::crawler::detect::{
|
|
has_logo_sentinel, is_broken_page_body, retry_on_transient_with_hook, PageError,
|
|
};
|
|
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
|
|
|
|
/// Value stored in `sources.id` for this Source impl. Public so the
/// daemon can look up per-source state (for example the recovery flag)
/// without first constructing the Source.
pub const SOURCE_ID: &str = "target";

/// Number of inline retries granted to a transient page during a single
/// `discover` walk. Kept deliberately small: the next cron tick resumes
/// from the recovery flag, so these retries only need to ride out a
/// brief site hiccup mid-walk, not a sustained outage.
const PAGE_TRANSIENT_RETRY_ATTEMPTS: u32 = 3;

/// Delay handed to the retry helper together with
/// [`PAGE_TRANSIENT_RETRY_ATTEMPTS`].
const PAGE_TRANSIENT_RETRY_DELAY: Duration = Duration::from_secs(2);
|
|
|
|
/// Source implementation for the placeholder "target" site.
pub struct TargetSource {
    /// Listing URL the walk starts from; later page URLs are derived from it.
    base_url: String,
    /// Whether detail-page parsing also reads the chapter table.
    parse_chapters: bool,
}

impl TargetSource {
    /// Builds a source rooted at `base_url` with chapter parsing enabled.
    pub fn new(base_url: impl Into<String>) -> Self {
        let base_url = base_url.into();
        Self {
            base_url,
            parse_chapters: true,
        }
    }

    /// Borrows the URL this source was constructed with.
    pub fn base_url(&self) -> &str {
        self.base_url.as_str()
    }

    /// Turns off the chapter-list selector for detail pages.
    ///
    /// With this set, the returned `SourceManga.chapters` is empty even
    /// when the page carries a chapter table. Callers must then also
    /// skip `repo::crawler::sync_manga_chapters` for these mangas:
    /// feeding it an empty list would soft-drop the manga's existing
    /// chapter rows.
    pub fn without_chapter_parsing(self) -> Self {
        Self {
            parse_chapters: false,
            ..self
        }
    }
}
|
|
|
|
#[async_trait]
|
|
impl Source for TargetSource {
|
|
fn id(&self) -> &'static str {
|
|
SOURCE_ID
|
|
}
|
|
|
|
async fn discover(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
) -> anyhow::Result<Box<dyn DiscoverWalk + Send>> {
|
|
// Probe page 1 up front (with transient retry) for two reasons:
|
|
// a broken first page should abort cleanly rather than mid-walk,
|
|
// and the HTML is handed straight to the first `next_batch` call
|
|
// so the walker doesn't re-fetch it. Page count is discovered
|
|
// incrementally — see `TargetSourceWalker::next_batch`.
|
|
let first_html = retry_on_transient_with_hook(
|
|
|| async {
|
|
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
|
|
},
|
|
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
|
PAGE_TRANSIENT_RETRY_DELAY,
|
|
|| async { recircuit_if_configured(ctx.tor).await },
|
|
)
|
|
.await?;
|
|
|
|
Ok(Box::new(TargetSourceWalker {
|
|
base_url: self.base_url.clone(),
|
|
next_page: 1,
|
|
first_page_html: Some(first_html),
|
|
}))
|
|
}
|
|
|
|
async fn fetch_manga(
|
|
&self,
|
|
ctx: &FetchContext<'_>,
|
|
r: &SourceMangaRef,
|
|
) -> anyhow::Result<SourceManga> {
|
|
// When we'll parse the chapter table, wait for at least one
|
|
// chapter row to appear — that's the marker most sensitive to
|
|
// the post-load JS partial-render race. When we won't, fall
|
|
// back to the layout-level `#logo` so we still wait for the
|
|
// page to settle.
|
|
let marker = if self.parse_chapters {
|
|
DETAIL_PAGE_CHAPTERS_MARKER
|
|
} else {
|
|
DETAIL_PAGE_LAYOUT_MARKER
|
|
};
|
|
let html = navigate(ctx, r.url.as_str(), marker).await?;
|
|
// Convert PageError → anyhow::Error via `?`. PageError stays
|
|
// downcastable from the wrapped anyhow::Error so the pipeline
|
|
// can still recognize Transient via `error.downcast_ref::<PageError>()`.
|
|
let manga = parse_manga_detail(&html, &r.source_manga_key, self.parse_chapters)
|
|
.with_context(|| format!("parse manga detail at {}", r.url))?;
|
|
Ok(manga)
|
|
}
|
|
|
|
async fn fetch_chapter_list(
|
|
&self,
|
|
_ctx: &FetchContext<'_>,
|
|
_manga: &SourceManga,
|
|
) -> anyhow::Result<Vec<SourceChapterRef>> {
|
|
anyhow::bail!("fetch_chapter_list not implemented yet")
|
|
}
|
|
|
|
async fn fetch_chapter(
|
|
&self,
|
|
_ctx: &FetchContext<'_>,
|
|
_r: &SourceChapterRef,
|
|
) -> anyhow::Result<SourceChapter> {
|
|
anyhow::bail!("fetch_chapter not implemented yet")
|
|
}
|
|
}
|
|
|
|
/// Walker returned by [`TargetSource::discover`]. Walks pages `1..` in
|
|
/// order, terminating as soon as a page renders cleanly with zero entries
|
|
/// — that's the "we ran off the end of the index" signal. Page 1's HTML
|
|
/// is cached at construction time (discover already had to fetch it for
|
|
/// the transient probe) so the first batch doesn't re-fetch.
|
|
///
|
|
/// A genuinely empty `Ok(vec![])` from `parse_manga_list_from` is what
|
|
/// stops us: the parser's `#logo` sentinel converts unrendered pages
|
|
/// into transient errors before they reach this loop, so an empty
|
|
/// parse result reliably means "no more entries."
|
|
struct TargetSourceWalker {
|
|
base_url: String,
|
|
next_page: i32,
|
|
first_page_html: Option<String>,
|
|
}
|
|
|
|
#[async_trait]
|
|
impl DiscoverWalk for TargetSourceWalker {
|
|
async fn next_batch(
|
|
&mut self,
|
|
ctx: &FetchContext<'_>,
|
|
) -> anyhow::Result<Option<Vec<SourceMangaRef>>> {
|
|
let page_num = self.next_page;
|
|
let page_refs = if page_num == 1 {
|
|
// Reuse the cached page-1 HTML from the initial probe. Take
|
|
// it (rather than clone) so a future re-entry that somehow
|
|
// revisits page 1 still falls back to a real fetch.
|
|
match self.first_page_html.take() {
|
|
Some(html) => {
|
|
let doc = scraper::Html::parse_document(&html);
|
|
parse_manga_list_from(&doc)?
|
|
}
|
|
None => {
|
|
retry_on_transient_with_hook(
|
|
|| async {
|
|
let html = navigate(
|
|
ctx,
|
|
self.base_url.as_str(),
|
|
LIST_PAGE_MARKER,
|
|
)
|
|
.await?;
|
|
let doc = scraper::Html::parse_document(&html);
|
|
parse_manga_list_from(&doc)
|
|
},
|
|
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
|
PAGE_TRANSIENT_RETRY_DELAY,
|
|
|| async { recircuit_if_configured(ctx.tor).await },
|
|
)
|
|
.await?
|
|
}
|
|
}
|
|
} else {
|
|
retry_on_transient_with_hook(
|
|
|| async {
|
|
let url = page_url(&self.base_url, page_num);
|
|
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
|
|
let doc = scraper::Html::parse_document(&html);
|
|
parse_manga_list_from(&doc)
|
|
},
|
|
PAGE_TRANSIENT_RETRY_ATTEMPTS,
|
|
PAGE_TRANSIENT_RETRY_DELAY,
|
|
|| async { recircuit_if_configured(ctx.tor).await },
|
|
)
|
|
.await?
|
|
};
|
|
tracing::info!(page_num, count = page_refs.len(), "page walked");
|
|
if page_refs.is_empty() {
|
|
return Ok(None);
|
|
}
|
|
self.next_page += 1;
|
|
Ok(Some(page_refs))
|
|
}
|
|
}
|
|
|
|
/// Selectors that `navigate` waits for after navigation, one per page
/// type. Each names the most specific element the parser will look for
/// afterwards; waiting on it closes the partial-render race (for
/// example the `#chapter_table` wrapper being present while its rows
/// are still being injected by post-load JS) that the old fixed 1s
/// sleep masked. See [`navigate`].
const LIST_PAGE_MARKER: &str = "#left_side .pic_list .updatesli";
const DETAIL_PAGE_CHAPTERS_MARKER: &str = "#chapter_table td h4 a.chico";
const DETAIL_PAGE_LAYOUT_MARKER: &str = "#logo";
|
|
|
|
/// Single point of rate-limited navigation. Every Source request goes
|
|
/// through here, so the per-host limiter map is the only knob that
|
|
/// controls per-origin RPS. Also the choke point for transient-page
|
|
/// detection — every fetched body is screened by
|
|
/// [`classify_navigate_html`] before being handed to a selector.
|
|
///
|
|
/// `marker` is a CSS selector the caller expects to find on the loaded
|
|
/// page. The wait is best-effort: a timeout is **not** an error
|
|
/// (legitimately-empty pages may never render the marker), it just
|
|
/// caps how long we'll hold for post-load JS to finish injecting
|
|
/// content. The parser's own sentinels and the universal broken-page
|
|
/// body check still catch real failures.
|
|
async fn navigate(
|
|
ctx: &FetchContext<'_>,
|
|
url: &str,
|
|
marker: &str,
|
|
) -> Result<String, PageError> {
|
|
ctx.rate.wait_for(url).await?;
|
|
let page = ctx
|
|
.browser
|
|
.new_page(url)
|
|
.await
|
|
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
|
match wait_for_nav(&page).await {
|
|
Ok(()) => {}
|
|
Err(NavError::Timeout(_)) => {
|
|
page.close().await.ok();
|
|
return Err(PageError::transient("nav timeout"));
|
|
}
|
|
Err(NavError::Cdp(e)) => {
|
|
page.close().await.ok();
|
|
return Err(PageError::Other(anyhow::Error::from(e)));
|
|
}
|
|
}
|
|
// Best-effort wait for the page-type marker. We deliberately
|
|
// discard a timeout here — see fn-level doc.
|
|
let _ = wait_for_selector(&page, marker, SELECTOR_TIMEOUT).await;
|
|
let html = page
|
|
.content()
|
|
.await
|
|
.map_err(|e| PageError::Other(anyhow::Error::from(e)))?;
|
|
page.close().await.ok();
|
|
classify_navigate_html(html)
|
|
}
|
|
|
|
/// Classify a fetched body. The broken-page template is universal across
|
|
/// the site — every page type (list, detail, chapter list, reader) gets
|
|
/// the same `we're sorry, the request file are not found` body when the
|
|
/// server is hiccuping. Catching it here means individual parsers
|
|
/// downstream don't have to repeat the check.
|
|
fn classify_navigate_html(html: String) -> Result<String, PageError> {
|
|
if is_broken_page_body(&html) {
|
|
return Err(PageError::transient("broken-page body signature"));
|
|
}
|
|
Ok(html)
|
|
}
|
|
|
|
/// Hook for [`retry_on_transient_with_hook`]: when TOR is configured,
|
|
/// signal `NEWNYM` so the next navigation draws a fresh exit. Errors
|
|
/// from the controller are logged and swallowed — failing to recircuit
|
|
/// shouldn't take down the crawl, the next attempt just runs on the
|
|
/// same circuit as before.
|
|
async fn recircuit_if_configured(tor: Option<&crate::crawler::tor::TorController>) {
|
|
if let Some(t) = tor {
|
|
if let Err(e) = t.new_identity().await {
|
|
tracing::warn!(error = %e, "TOR NEWNYM failed; retrying on same circuit");
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
/// Replaces the first `/N/` segment (a slash, one or more ASCII digits,
/// a slash) in `template_url` with `/{page}/`. Source impls that
/// paginate via a different URL shape can override this; for the
/// modeled site the segment is always present.
///
/// When no such segment exists the URL is returned unchanged.
fn page_url(template_url: &str, page: i32) -> String {
    for (slash, _) in template_url.match_indices('/') {
        let after_slash = &template_url[slash + 1..];
        let digit_run = after_slash
            .bytes()
            .take_while(u8::is_ascii_digit)
            .count();
        // A match needs at least one digit and a closing slash right
        // after the digit run.
        if digit_run > 0 && after_slash.as_bytes().get(digit_run) == Some(&b'/') {
            let head = &template_url[..slash];
            let tail = &after_slash[digit_run + 1..];
            return format!("{head}/{page}/{tail}");
        }
    }
    template_url.to_string()
}
|
|
|
|
/// Test-only convenience: parses `html` into a document and forwards to
/// [`parse_manga_list_from`].
#[cfg(test)]
fn parse_manga_list(html: &str) -> Result<Vec<SourceMangaRef>, PageError> {
    let parsed = scraper::Html::parse_document(html);
    parse_manga_list_from(&parsed)
}
|
|
|
|
/// Parse a manga listing page. `#logo` is present on every well-formed
|
|
/// listing page on the source; its absence means the response is a
|
|
/// broken-page placeholder (transient) rather than a genuinely empty
|
|
/// listing. Empty listings (last-page tail, search with no hits) remain
|
|
/// `Ok(vec![])`.
|
|
fn parse_manga_list_from(doc: &scraper::Html) -> Result<Vec<SourceMangaRef>, PageError> {
|
|
if !has_logo_sentinel(doc) {
|
|
return Err(PageError::transient("manga list: #logo sentinel missing"));
|
|
}
|
|
let sel = scraper::Selector::parse("#left_side .pic_list .updatesli span a").unwrap();
|
|
Ok(doc
|
|
.select(&sel)
|
|
.filter_map(|a| {
|
|
let url = a.value().attr("href")?.trim().to_string();
|
|
if url.is_empty() {
|
|
return None;
|
|
}
|
|
let title = collapse_whitespace(&a.text().collect::<String>());
|
|
if title.is_empty() {
|
|
return None;
|
|
}
|
|
Some(SourceMangaRef {
|
|
source_manga_key: derive_key_from_url(&url),
|
|
title,
|
|
url,
|
|
})
|
|
})
|
|
.collect())
|
|
}
|
|
|
|
fn parse_manga_detail(
|
|
html: &str,
|
|
key: &str,
|
|
include_chapters: bool,
|
|
) -> Result<SourceManga, PageError> {
|
|
let doc = scraper::Html::parse_document(html);
|
|
|
|
// Sentinel first: a broken-page response will trip this before any
|
|
// anyhow context is added for missing required fields.
|
|
if !has_logo_sentinel(&doc) {
|
|
return Err(PageError::transient("manga detail: #logo sentinel missing"));
|
|
}
|
|
|
|
let title = first_text(&doc, ".w-title h1").context("missing .w-title h1")?;
|
|
let summary = first_text(&doc, ".manga_summary");
|
|
let cover_url = first_attr(&doc, ".cover > img:nth-child(1)", "src");
|
|
|
|
let authors = links_in_labelled_td(&doc, "Author");
|
|
let genres = links_in_labelled_td(&doc, "Genre");
|
|
let raw_status = labelled_td_child_text(&doc, "Status", "span");
|
|
let status = normalize_status(raw_status.as_deref(), key);
|
|
|
|
let alternative_titles = labelled_td_value_after_label(&doc, "Alternative")
|
|
.map(|s| {
|
|
s.split([';', ',', '|'])
|
|
.map(str::trim)
|
|
.filter(|p| !p.is_empty())
|
|
.map(String::from)
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
let tag_sel = scraper::Selector::parse(".aside-body a.tag").unwrap();
|
|
let tags: Vec<String> = doc
|
|
.select(&tag_sel)
|
|
.map(|a| collapse_whitespace(&a.text().collect::<String>()))
|
|
.map(|s| strip_tag_count(&s))
|
|
.filter(|s| !s.is_empty())
|
|
.collect();
|
|
|
|
let chapters = if include_chapters {
|
|
parse_chapter_list(&doc)?
|
|
} else {
|
|
Vec::new()
|
|
};
|
|
|
|
let mut manga = SourceManga {
|
|
source_manga_key: key.to_string(),
|
|
title,
|
|
alternative_titles,
|
|
authors,
|
|
genres,
|
|
tags,
|
|
status,
|
|
summary,
|
|
cover_url,
|
|
chapters,
|
|
metadata_hash: String::new(),
|
|
};
|
|
manga.metadata_hash = compute_metadata_hash(&manga);
|
|
Ok(manga)
|
|
}
|
|
|
|
/// Source advertises status as "Ongoing" or "Completed"; we normalize
|
|
/// to the lowercase form the `mangas.status` CHECK constraint accepts.
|
|
/// Anything else is a parse miss (selector drift, new value, etc.) and
|
|
/// returns `None` after logging — the manga sync continues regardless.
|
|
fn normalize_status(raw: Option<&str>, key: &str) -> Option<String> {
|
|
let trimmed = raw.map(str::trim).filter(|s| !s.is_empty())?;
|
|
if trimmed.eq_ignore_ascii_case("ongoing") {
|
|
Some("ongoing".to_string())
|
|
} else if trimmed.eq_ignore_ascii_case("completed") {
|
|
Some("completed".to_string())
|
|
} else {
|
|
tracing::error!(
|
|
key,
|
|
raw_status = trimmed,
|
|
"unknown manga status (expected 'Ongoing' or 'Completed'); continuing with status=None"
|
|
);
|
|
None
|
|
}
|
|
}
|
|
|
|
/// Removes a trailing digit-only `(NN)` suffix from a tag name, which is
/// how the source displays tag counts. Parentheses with non-numeric or
/// empty content are left alone. The result is always trimmed.
fn strip_tag_count(s: &str) -> String {
    let trimmed = s.trim();
    let without_count = trimmed
        .strip_suffix(')')
        .and_then(|body| body.rsplit_once('('))
        .filter(|(_, count)| !count.is_empty() && count.bytes().all(|b| b.is_ascii_digit()))
        .map(|(name, _)| name.trim());
    without_count.unwrap_or(trimmed).to_string()
}
|
|
|
|
/// Parse the chapter table on a manga detail page. Returns `Transient` if
|
|
/// `#chapter_table` isn't in the DOM at all — the table is required even
|
|
/// for mangas with no published chapters yet (the source renders an empty
|
|
/// `<table>`), so an absent table signals a partial render (post-load JS
|
|
/// not done, layout drift) rather than a legitimately empty list. Without
|
|
/// this sentinel, an empty `Vec` reaches `sync_manga_chapters` and the
|
|
/// soft-drop branch flips every existing chapter to `dropped_at`.
|
|
fn parse_chapter_list(doc: &scraper::Html) -> Result<Vec<SourceChapterRef>, PageError> {
|
|
if !has_chapter_table_sentinel(doc) {
|
|
return Err(PageError::transient(
|
|
"manga detail: #chapter_table sentinel missing",
|
|
));
|
|
}
|
|
let sel = scraper::Selector::parse("#chapter_table td h4 a.chico").unwrap();
|
|
Ok(doc
|
|
.select(&sel)
|
|
.filter_map(|a| {
|
|
let url = a.value().attr("href")?.trim().to_string();
|
|
if url.is_empty() {
|
|
return None;
|
|
}
|
|
let title_text = collapse_whitespace(&a.text().collect::<String>());
|
|
let number = parse_chapter_number(&title_text).unwrap_or(0);
|
|
Some(SourceChapterRef {
|
|
source_chapter_key: derive_chapter_key_from_url(&url),
|
|
number,
|
|
title: (!title_text.is_empty()).then_some(title_text),
|
|
url,
|
|
})
|
|
})
|
|
.collect())
|
|
}
|
|
|
|
/// Returns true when the chapter-table container is present in the DOM.
|
|
/// Source-specific: the target site uses `#chapter_table` as the wrapper
|
|
/// element. Distinguishes "table is present but empty" (legit edge case
|
|
/// for new mangas) from "table is missing entirely" (partial render).
|
|
fn has_chapter_table_sentinel(doc: &scraper::Html) -> bool {
|
|
let sel = scraper::Selector::parse("#chapter_table").expect("valid selector");
|
|
doc.select(&sel).next().is_some()
|
|
}
|
|
|
|
/// Parses the first run of consecutive ASCII digits in `text` as an
/// `i32`. Returns `None` when there is no digit at all or when the run
/// does not fit in an `i32`. Decimal chapters such as "Ch.12.5" keep
/// only the integer part.
fn parse_chapter_number(text: &str) -> Option<i32> {
    let first_digit = text.find(|c: char| c.is_ascii_digit())?;
    let from_digit = &text[first_digit..];
    let run_len = from_digit
        .find(|c: char| !c.is_ascii_digit())
        .unwrap_or(from_digit.len());
    from_digit[..run_len].parse().ok()
}
|
|
|
|
/// Derives a manga key from its URL: drops the query string and any
/// trailing slashes, then takes the last path segment. When nothing is
/// left after that stripping, the original `url` is returned as-is.
fn derive_key_from_url(url: &str) -> String {
    let without_query = match url.split_once('?') {
        Some((before, _)) => before,
        None => url,
    };
    let path = without_query.trim_end_matches('/');
    // After trimming, a non-empty path cannot end in '/', so whatever
    // follows its last slash is the non-empty final segment.
    let key = if path.is_empty() {
        url
    } else {
        match path.rfind('/') {
            Some(slash) => &path[slash + 1..],
            None => path,
        }
    };
    key.to_string()
}
|
|
|
|
/// Chapter URLs on this source point at the reader's page 1, e.g.
|
|
/// `.../uu/br_chapter-379272/pg-1/`. The chapter identity is the
|
|
/// `br_chapter-N` (or `to_chapter-N`) segment — the `pg-\d+` segment
|
|
/// identifies a page *within* a chapter, so naively taking the last
|
|
/// path component returns `"pg-1"` for every chapter and collapses
|
|
/// them all under one source_chapter_key downstream.
|
|
fn derive_chapter_key_from_url(url: &str) -> String {
|
|
let trimmed = url.split('?').next().unwrap_or(url).trim_end_matches('/');
|
|
let without_reader_page = match trimmed.rsplit_once('/') {
|
|
Some((prefix, last)) if is_reader_page_segment(last) => prefix,
|
|
_ => trimmed,
|
|
};
|
|
without_reader_page
|
|
.rsplit('/')
|
|
.find(|s| !s.is_empty())
|
|
.unwrap_or(url)
|
|
.to_string()
|
|
}
|
|
|
|
/// True when `s` is `pg-` followed by one or more ASCII digits.
fn is_reader_page_segment(s: &str) -> bool {
    match s.strip_prefix("pg-") {
        Some(digits) => !digits.is_empty() && digits.bytes().all(|b| b.is_ascii_digit()),
        None => false,
    }
}
|
|
|
|
fn first_text(doc: &scraper::Html, sel: &str) -> Option<String> {
|
|
let s = scraper::Selector::parse(sel).ok()?;
|
|
let el = doc.select(&s).next()?;
|
|
let text = collapse_whitespace(&el.text().collect::<String>());
|
|
(!text.is_empty()).then_some(text)
|
|
}
|
|
|
|
/// Value of attribute `attr` on the first element matching `sel`.
/// Returns `None` when the selector fails to parse, nothing matches, or
/// that element lacks the attribute. The value is returned untrimmed.
fn first_attr(doc: &scraper::Html, sel: &str, attr: &str) -> Option<String> {
    let selector = scraper::Selector::parse(sel).ok()?;
    let element = doc.select(&selector).next()?;
    let value = element.value().attr(attr)?;
    Some(value.to_owned())
}
|
|
|
|
/// `td` whose contained `label` text begins with `label_prefix` — the
|
|
/// `scraper`-friendly equivalent of `td:has(label:contains("Foo"))`.
|
|
fn td_with_label<'a>(
|
|
doc: &'a scraper::Html,
|
|
label_prefix: &str,
|
|
) -> Option<scraper::ElementRef<'a>> {
|
|
let td_sel = scraper::Selector::parse("td").unwrap();
|
|
let label_sel = scraper::Selector::parse("label").unwrap();
|
|
for td in doc.select(&td_sel) {
|
|
for label in td.select(&label_sel) {
|
|
let text: String = label.text().collect();
|
|
if text.trim().starts_with(label_prefix) {
|
|
return Some(td);
|
|
}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
fn links_in_labelled_td(doc: &scraper::Html, label_prefix: &str) -> Vec<String> {
|
|
let Some(td) = td_with_label(doc, label_prefix) else {
|
|
return Vec::new();
|
|
};
|
|
let a_sel = scraper::Selector::parse("a").unwrap();
|
|
td.select(&a_sel)
|
|
.map(|a| collapse_whitespace(&a.text().collect::<String>()))
|
|
.filter(|s| !s.is_empty())
|
|
.collect()
|
|
}
|
|
|
|
fn labelled_td_child_text(
|
|
doc: &scraper::Html,
|
|
label_prefix: &str,
|
|
child_sel: &str,
|
|
) -> Option<String> {
|
|
let td = td_with_label(doc, label_prefix)?;
|
|
let child = scraper::Selector::parse(child_sel).ok()?;
|
|
let el = td.select(&child).next()?;
|
|
let text = collapse_whitespace(&el.text().collect::<String>());
|
|
(!text.is_empty()).then_some(text)
|
|
}
|
|
|
|
/// Returns the text content of the labelled `td` with the leading
|
|
/// "Label:" portion stripped — used for "Alternative:" which puts the
|
|
/// value directly in the cell rather than in a child element.
|
|
fn labelled_td_value_after_label(
|
|
doc: &scraper::Html,
|
|
label_prefix: &str,
|
|
) -> Option<String> {
|
|
let td = td_with_label(doc, label_prefix)?;
|
|
let full: String = td.text().collect();
|
|
let after = full.split_once(':').map(|(_, r)| r).unwrap_or(&full);
|
|
let trimmed = collapse_whitespace(after);
|
|
(!trimmed.is_empty()).then_some(trimmed)
|
|
}
|
|
|
|
/// Trims `s` and squeezes every internal whitespace run down to a
/// single ASCII space.
fn collapse_whitespace(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for word in s.split_whitespace() {
        // Words are never empty, so an empty buffer means "first word".
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
|
|
|
|
fn compute_metadata_hash(m: &SourceManga) -> String {
|
|
// Field separators are ASCII unit/record separators so a field
|
|
// containing a delimiter character can't be mistaken for two
|
|
// smaller fields.
|
|
let mut h = Sha256::new();
|
|
fn feed(h: &mut Sha256, s: &str) {
|
|
h.update(s.as_bytes());
|
|
h.update(b"\x1F");
|
|
}
|
|
fn feed_list(h: &mut Sha256, xs: &[String]) {
|
|
for s in xs {
|
|
feed(h, s);
|
|
}
|
|
h.update(b"\x1E");
|
|
}
|
|
feed(&mut h, &m.title);
|
|
feed_list(&mut h, &m.alternative_titles);
|
|
feed_list(&mut h, &m.authors);
|
|
feed_list(&mut h, &m.genres);
|
|
feed_list(&mut h, &m.tags);
|
|
feed(&mut h, m.status.as_deref().unwrap_or(""));
|
|
feed(&mut h, m.summary.as_deref().unwrap_or(""));
|
|
feed(&mut h, m.cover_url.as_deref().unwrap_or(""));
|
|
format!("{:x}", h.finalize())
|
|
}
|
|
|
|
#[cfg(test)]
|
|
mod tests {
|
|
use super::*;
|
|
|
|
const LISTING_HTML: &str = r#"
|
|
<html><body>
|
|
<header><div id="logo">Target</div></header>
|
|
<div id="left_side">
|
|
<div class="pic_list">
|
|
<div class="updatesli">
|
|
<span><a href="https://target.example/manga/foo">Foo Manga</a></span>
|
|
</div>
|
|
<div class="updatesli">
|
|
<span><a href="https://target.example/manga/bar-baz"> Bar Baz </a></span>
|
|
</div>
|
|
<div class="updatesli">
|
|
<span><a href="">Empty href ignored</a></span>
|
|
</div>
|
|
</div>
|
|
</div>
|
|
</body></html>
|
|
"#;
|
|
|
|
const DETAIL_HTML: &str = r#"
|
|
<html><body>
|
|
<header><div id="logo">Target</div></header>
|
|
<div class="w-title"><h1>Test Manga Title</h1></div>
|
|
<div class="cover"><img src="/cover.jpg"><img src="/extra-not-cover.jpg"></div>
|
|
<div class="manga_summary">A summary of the manga.</div>
|
|
<table>
|
|
<tr><td><label>Author:</label><a href="/a/1">Author One</a><a href="/a/2">Author Two</a></td></tr>
|
|
<tr><td><label>Genre(s):</label><a href="/g/1">Action</a><a href="/g/2">Drama</a></td></tr>
|
|
<tr><td><label>Status:</label><span>Ongoing</span></td></tr>
|
|
<tr><td><label>Alternative:</label> Alt Title 1; Alt Title 2 </td></tr>
|
|
</table>
|
|
<aside><div class="aside-body">
|
|
<a class="tag">Fantasy (21)</a>
|
|
<a class="tag">Romance</a>
|
|
<a class="tag"> Action (5)</a>
|
|
<a class="not-a-tag">should-be-ignored</a>
|
|
</div></aside>
|
|
<table id="chapter_table">
|
|
<tr><td><h4><a class="chico" href="/manga/foo/chapter/1">Ch.1</a></h4></td></tr>
|
|
<tr><td><h4><a class="chico" href="/manga/foo/chapter/2">Ch.2 - The Beginning</a></h4></td></tr>
|
|
<tr><td><h4><a class="chico" href="/manga/foo/chapter/3">Chapter 3: Onward</a></h4></td></tr>
|
|
</table>
|
|
</body></html>
|
|
"#;
|
|
|
|
#[test]
|
|
fn parse_manga_list_extracts_title_url_and_derives_key() {
|
|
let refs = parse_manga_list(LISTING_HTML).expect("parse");
|
|
assert_eq!(refs.len(), 2, "third entry has empty href and is skipped");
|
|
assert_eq!(refs[0].title, "Foo Manga");
|
|
assert_eq!(refs[0].url, "https://target.example/manga/foo");
|
|
assert_eq!(refs[0].source_manga_key, "foo");
|
|
assert_eq!(refs[1].title, "Bar Baz");
|
|
assert_eq!(refs[1].source_manga_key, "bar-baz");
|
|
}
|
|
|
|
#[test]
|
|
fn parse_manga_list_returns_transient_when_logo_missing() {
|
|
// Broken-page response: no #logo, no listing. Empty Vec would
|
|
// hide this as "page has no mangas"; Transient is the signal
|
|
// upstream code retries on.
|
|
let html = r#"<html><body>\
|
|
<p>we're sorry, the request file are not found.</p>\
|
|
</body></html>"#;
|
|
let err = parse_manga_list(html).expect_err("expected Transient");
|
|
assert!(err.is_transient(), "got non-transient: {err}");
|
|
}
|
|
|
|
#[test]
|
|
fn parse_manga_list_ok_empty_when_logo_present_but_no_items() {
|
|
// Last page of pagination, "no results" search, etc. Legitimately
|
|
// empty must stay distinguishable from "page is broken".
|
|
let html = r#"<html><body>\
|
|
<header><div id="logo">Target</div></header>\
|
|
<div id="left_side"><div class="pic_list"></div></div>\
|
|
</body></html>"#;
|
|
let refs = parse_manga_list(html).expect("logo present == not transient");
|
|
assert!(refs.is_empty());
|
|
}
|
|
|
|
#[test]
|
|
fn parse_manga_detail_pulls_all_fields() {
|
|
let m = parse_manga_detail(DETAIL_HTML, "test-key", true).expect("parse");
|
|
assert_eq!(m.source_manga_key, "test-key");
|
|
assert_eq!(m.title, "Test Manga Title");
|
|
assert_eq!(m.summary.as_deref(), Some("A summary of the manga."));
|
|
assert_eq!(m.authors, vec!["Author One", "Author Two"]);
|
|
assert_eq!(m.genres, vec!["Action", "Drama"]);
|
|
assert_eq!(m.status.as_deref(), Some("ongoing"));
|
|
assert_eq!(m.alternative_titles, vec!["Alt Title 1", "Alt Title 2"]);
|
|
// Counts in parentheses are stripped — "Fantasy (21)" → "Fantasy".
|
|
assert_eq!(m.tags, vec!["Fantasy", "Romance", "Action"]);
|
|
assert_eq!(m.cover_url.as_deref(), Some("/cover.jpg"));
|
|
assert!(!m.metadata_hash.is_empty());
|
|
|
|
assert_eq!(m.chapters.len(), 3);
|
|
assert_eq!(m.chapters[0].number, 1);
|
|
assert_eq!(m.chapters[0].title.as_deref(), Some("Ch.1"));
|
|
assert_eq!(m.chapters[0].url, "/manga/foo/chapter/1");
|
|
assert_eq!(m.chapters[0].source_chapter_key, "1");
|
|
assert_eq!(m.chapters[1].number, 2);
|
|
assert_eq!(m.chapters[1].title.as_deref(), Some("Ch.2 - The Beginning"));
|
|
assert_eq!(m.chapters[2].number, 3);
|
|
assert_eq!(m.chapters[2].title.as_deref(), Some("Chapter 3: Onward"));
|
|
}
|
|
|
|
#[test]
|
|
fn status_normalized_case_insensitively() {
|
|
assert_eq!(normalize_status(Some("Ongoing"), "k").as_deref(), Some("ongoing"));
|
|
assert_eq!(normalize_status(Some("ONGOING"), "k").as_deref(), Some("ongoing"));
|
|
assert_eq!(normalize_status(Some(" completed "), "k").as_deref(), Some("completed"));
|
|
}
|
|
|
|
#[test]
|
|
fn unknown_status_logs_and_returns_none() {
|
|
// Logging is observable in test output via tracing-test, but
|
|
// here we just assert the contract: unknown becomes None
|
|
// (and the manga is therefore still synced by the caller).
|
|
assert!(normalize_status(Some("Hiatus"), "k").is_none());
|
|
assert!(normalize_status(Some(""), "k").is_none());
|
|
assert!(normalize_status(None, "k").is_none());
|
|
}
|
|
|
|
#[test]
|
|
fn strip_tag_count_drops_trailing_digit_parens_only() {
|
|
assert_eq!(strip_tag_count("Fantasy (21)"), "Fantasy");
|
|
assert_eq!(strip_tag_count(" Action (5) "), "Action");
|
|
assert_eq!(strip_tag_count("Romance"), "Romance");
|
|
// Non-numeric parens stay put.
|
|
assert_eq!(strip_tag_count("Slice of Life (sub)"), "Slice of Life (sub)");
|
|
// Only the trailing paren is considered.
|
|
assert_eq!(strip_tag_count("Tag (a) (12)"), "Tag (a)");
|
|
}
|
|
|
|
#[test]
|
|
fn parse_chapter_list_keeps_all_chapters_with_unique_keys() {
|
|
// Real listing fixture from the target site. 15 rows: chapters
|
|
// with various Ch.N markup, one hiatus row, three "notice." rows,
|
|
// and duplicates of Ch.1 and Ch.52 from different uploaders.
|
|
// Every row must survive parsing and every chapter must have a
|
|
// distinct source_chapter_key — chapter URLs all end in `/pg-1/`
|
|
// (the reader's page-1 entry point), and a naive
|
|
// last-segment-of-URL derivation returns "pg-1" for every row,
|
|
// collapsing the whole list into one downstream chapter row.
|
|
let html = include_str!(
|
|
"../../../tests/fixtures/target/chapter_list_uu.html"
|
|
);
|
|
let doc = scraper::Html::parse_document(html);
|
|
let chapters = parse_chapter_list(&doc).expect("fixture has the table");
|
|
|
|
assert_eq!(chapters.len(), 15, "every row kept (notices/hiatus included)");
|
|
|
|
let mut keys: Vec<&str> =
|
|
chapters.iter().map(|c| c.source_chapter_key.as_str()).collect();
|
|
keys.sort();
|
|
let dupe = keys.windows(2).find(|w| w[0] == w[1]).map(|w| w[0]);
|
|
assert!(dupe.is_none(), "duplicate chapter key: {dupe:?}");
|
|
for c in &chapters {
|
|
assert_ne!(
|
|
c.source_chapter_key, "pg-1",
|
|
"key must not be the reader-page segment: {:?}", c
|
|
);
|
|
}
|
|
|
|
// Latest chapter is first (source orders newest → oldest).
|
|
assert_eq!(chapters[0].number, 67);
|
|
assert_eq!(chapters[0].title.as_deref(), Some("Ch.67 : Official"));
|
|
assert_eq!(chapters[0].source_chapter_key, "br_chapter-379272");
|
|
|
|
// Duplicate-number chapters (different uploaders) survive as
|
|
// two rows. The (manga_id, number) UNIQUE collapse is a
|
|
// downstream schema concern handled separately.
|
|
assert_eq!(
|
|
chapters.iter().filter(|c| c.number == 52).count(),
|
|
2,
|
|
"two Ch.52 uploads must both survive parsing"
|
|
);
|
|
assert_eq!(
|
|
chapters.iter().filter(|c| c.number == 1).count(),
|
|
2,
|
|
"Ch.1 Official and Ch.1 Team Hazama are both kept"
|
|
);
|
|
|
|
// Notices / hiatus rows have no leading digit so they parse to
|
|
// number=0. They are not filtered out.
|
|
let zero = chapters.iter().filter(|c| c.number == 0).count();
|
|
assert!(zero >= 4, "hiatus + 3 notices kept; got {zero}");
|
|
}
|
|
|
|
#[test]
fn parse_chapter_number_grabs_first_integer_run() {
    // Each row pairs a chapter label with the value `parse_chapter_number`
    // must report for it.
    let cases = [
        ("Ch.1", Some(1)),
        ("Chapter 12", Some(12)),
        ("Ch.2 - The Beginning", Some(2)),
        // Decimal chapters keep the integer part (i32 storage).
        ("Ch.12.5", Some(12)),
        // A label with no digits yields no number at all.
        ("Special", None),
    ];

    for (label, expected) in cases {
        assert_eq!(parse_chapter_number(label), expected);
    }
}
|
|
|
|
#[test]
fn page_url_substitutes_numeric_path_segment() {
    // The listing URL carries its page number as a path segment; asking
    // for page 5 swaps that segment and leaves the query string alone.
    let paged = page_url("https://site.example/list/1/?f=1&o=1&sortby=update_date&e=", 5);
    assert_eq!(paged, "https://site.example/list/5/?f=1&o=1&sortby=update_date&e=");

    // No numeric segment → URL returned unchanged.
    let unpaged = page_url("https://site.example/list/?f=1", 5);
    assert_eq!(unpaged, "https://site.example/list/?f=1");
}
|
|
|
|
#[test]
fn derive_key_strips_trailing_slash_and_query() {
    // (input URL, expected key) pairs.
    let cases = [
        // URL ending in a slash.
        ("https://x.example/manga/foo/", "foo"),
        // URL carrying a query string.
        ("https://x.example/manga/foo?p=1", "foo"),
        // Host-less, path-only input.
        ("/manga/bar", "bar"),
    ];

    for (url, expected) in cases {
        assert_eq!(derive_key_from_url(url), expected);
    }
}
|
|
|
|
#[test]
fn derive_chapter_key_strips_trailing_reader_page_segment() {
    // (chapter URL, expected chapter key) pairs, checked in order.
    let cases = [
        // Listing links go to page 1 of the reader; strip /pg-\d+/.
        (".../uu/br_chapter-379272/pg-1/", "br_chapter-379272"),
        (".../uu/to_chapter-13/pg-1/", "to_chapter-13"),
        // Defensive: deep-link to a non-first page should still resolve
        // to the same chapter identity.
        (".../uu/br_chapter-379272/pg-25/", "br_chapter-379272"),
        // No reader-page suffix → behaves like derive_key_from_url.
        (".../uu/br_chapter-379272/", "br_chapter-379272"),
        // Query strings are stripped.
        (".../uu/br_chapter-379272/pg-1/?ref=x", "br_chapter-379272"),
        // `pg-foo` is not a valid reader-page segment; treated as identity.
        (".../uu/something/pg-foo/", "pg-foo"),
        // Bare `pg-` (no digits) likewise not stripped.
        (".../uu/something/pg-/", "pg-"),
    ];

    for (url, expected) in cases {
        assert_eq!(derive_chapter_key_from_url(url), expected);
    }
}
|
|
|
|
#[test]
fn metadata_hash_is_stable_and_field_sensitive() {
    // Parse a detail page under the key "k" (chapters included) and keep
    // only the resulting metadata hash.
    let hash_of = |html: &str| parse_manga_detail(html, "k", true).unwrap().metadata_hash;

    // Stability: parsing the same document twice yields the same hash.
    let base = hash_of(DETAIL_HTML);
    let again = hash_of(DETAIL_HTML);
    assert_eq!(base, again);

    // Same fields except status flipped — hash must change.
    let altered_html = DETAIL_HTML.replace("Ongoing", "Completed");
    let altered = hash_of(&altered_html);
    assert_ne!(base, altered);
}
|
|
|
|
#[test]
fn missing_optional_fields_parse_to_none() {
    // Minimal but well-formed detail page: title is required, every
    // other field is optional, but the chapter table is structural —
    // its absence is treated as Transient (a freshly added manga
    // renders the table empty, not absent). See
    // `parse_chapter_list_returns_transient_when_table_missing` for
    // the negative case.
    //
    // The fixture is a raw string, and raw strings process no escapes:
    // a trailing `\` does not join lines, it lands in the markup as a
    // literal backslash character. Lines are therefore separated by
    // plain newlines so the document contains only the intended
    // elements.
    let html = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Minimal</h1></div>
        <table id="chapter_table"></table>
        </body></html>"#;

    let m = parse_manga_detail(html, "min", true).unwrap();

    // The one required field is picked up…
    assert_eq!(m.title, "Minimal");

    // …and everything the page does not carry comes back empty
    // rather than failing the parse.
    assert!(m.summary.is_none());
    assert!(m.status.is_none());
    assert!(m.authors.is_empty());
    assert!(m.genres.is_empty());
    assert!(m.tags.is_empty());
    assert!(m.alternative_titles.is_empty());

    // The table is present but has no rows, so the chapter list is empty.
    assert!(m.chapters.is_empty());
}
|
|
|
|
#[test]
fn parse_manga_detail_skips_chapters_when_disabled() {
    // DETAIL_HTML is the shared detail-page fixture; passing
    // include_chapters=false must leave the chapter list empty while
    // the remaining metadata is still extracted.
    let include_chapters = false;
    let parsed = parse_manga_detail(DETAIL_HTML, "k", include_chapters).unwrap();

    assert!(parsed.chapters.is_empty(), "chapters should be empty when disabled");
    assert_eq!(parsed.title, "Test Manga Title", "other fields still parse");
    assert_eq!(parsed.authors, vec!["Author One", "Author Two"]);
}
|
|
|
|
#[test]
fn parse_manga_detail_errors_on_missing_title() {
    // Logo present (page is alive) — failure here is a real parse
    // miss (Other), not Transient.
    //
    // The fixture is a raw string, and raw strings process no escapes:
    // a trailing `\` would be a literal backslash in the markup, not a
    // line continuation. Lines are separated by plain newlines instead.
    let html = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <p>nothing</p></body></html>"#;

    let err = parse_manga_detail(html, "x", true).unwrap_err();

    // The error must be classified as non-transient…
    assert!(!err.is_transient(), "expected Other, got Transient: {err}");
    // …and must name the selector that failed to match.
    assert!(err.to_string().contains("missing .w-title h1"));
}
|
|
|
|
#[test]
fn classify_navigate_html_passes_normal_body_through() {
    // A body carrying the `#logo` header must come back unchanged.
    let page = String::from(concat!(
        "<html><body><header><div id='logo'>Target</div></header>",
        "<p>content</p></body></html>",
    ));

    let returned = classify_navigate_html(page.clone()).expect("ok");
    assert_eq!(returned, page);
}
|
|
|
|
#[test]
fn classify_navigate_html_returns_transient_for_broken_template() {
    // The "request file are not found" page must be rejected, and the
    // rejection must be flagged as transient.
    let page = String::from(concat!(
        "<html><head></head><body>",
        "<p>we're sorry, the request file are not found.</p>",
        "</body></html>",
    ));

    let err = classify_navigate_html(page).expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
|
|
|
#[test]
fn parse_manga_detail_returns_transient_when_logo_missing() {
    // Broken-page response on a detail URL — must be reported as
    // Transient so the job is retried rather than logging "missing
    // .w-title h1" against a permanently-skipped manga.
    let html = concat!(
        "<html><body>",
        "<p>we're sorry, the request file are not found.</p>",
        "</body></html>",
    );

    let outcome = parse_manga_detail(html, "x", true);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
|
|
|
#[test]
fn parse_chapter_list_returns_transient_when_table_missing() {
    // Partial render (post-load JS hadn't injected the table, layout
    // drift, etc). Returning Vec::new() would silently soft-drop every
    // existing chapter for the manga via sync_manga_chapters; Transient
    // is the signal the job system retries on.
    const PAGE: &str = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Test</h1></div>
        </body></html>"#;

    let parsed = scraper::Html::parse_document(PAGE);
    let err = parse_chapter_list(&parsed).expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
|
|
|
#[test]
fn parse_chapter_list_ok_empty_when_table_present_but_no_rows() {
    // A freshly-added manga with no chapters yet — the source renders
    // the `<table id="chapter_table">` wrapper but no `<tr>` rows
    // inside. Must stay distinguishable from a missing-table render.
    const PAGE: &str = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <table id="chapter_table"></table>
        </body></html>"#;

    let parsed = scraper::Html::parse_document(PAGE);
    let rows = parse_chapter_list(&parsed).expect("present table is not transient");
    assert!(rows.is_empty());
}
|
|
|
|
#[test]
fn parse_manga_detail_propagates_chapter_table_transient() {
    // End-to-end: a detail page that survives the #logo sentinel but
    // has the chapter table stripped must fail Transient at the parser
    // boundary, not return a SourceManga with empty chapters.
    const PAGE: &str = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Test Title</h1></div>
        <div class="cover"><img src="/cover.jpg"></div>
        <!-- intentionally no #chapter_table -->
        </body></html>"#;

    // Chapters are requested, so the missing table has to surface.
    let outcome = parse_manga_detail(PAGE, "key", true);
    let err = outcome.expect_err("expected Transient");
    assert!(err.is_transient(), "got non-transient: {err}");
}
|
|
|
|
#[test]
fn parse_manga_detail_skips_chapter_sentinel_when_include_chapters_false() {
    // Metadata-only mode (`skip_chapters` upstream) must not require
    // the chapter table — pipeline.rs avoids calling sync_manga_chapters
    // for these mangas, so the absent table is not a correctness issue
    // and shouldn't surface as Transient.
    const PAGE: &str = r#"<html><body>
        <header><div id="logo">Target</div></header>
        <div class="w-title"><h1>Test Title</h1></div>
        <div class="cover"><img src="/cover.jpg"></div>
        </body></html>"#;

    let include_chapters = false;
    let parsed = parse_manga_detail(PAGE, "key", include_chapters)
        .expect("metadata-only parse must not require chapter table");
    assert!(parsed.chapters.is_empty());
}
|
|
}
|