feat(crawler): recircuit TOR on transient pages and unauthenticated probes

- target.rs swaps retry_on_transient → retry_on_transient_with_hook,
  signaling NEWNYM via ctx.tor between attempts when configured.
- session.rs gains verify_session_with_recircuit; the bare
  verify_session is now a one-line wrapper passing tor=None,
  unauth_max_recircuit=0. The inner run_session_probe_loop is
  pure-over-IO and unit-tested with closure-based fakes.
- content.rs extracts fetch_chapter_html_once + the closure-driven
  fetch_chapter_html_with_recircuit, used by sync_chapter_content to
  retry on Transient or Unauthenticated up to a recircuit_budget.
  Budget = 0 (no TOR) preserves original behavior bit-for-bit.
- app.rs and bin/crawler.rs construct the controller before on_launch
  and pass it into verify_session_with_recircuit, so a transient
  hiccup at startup no longer requires PHPSESSID rotation.

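The session-probe recircuit budget is read from
CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS (default 3; bin/crawler.rs clamps it
to at least 1). The per-chapter budget in content.rs is the fixed
constant CHAPTER_RECIRCUIT_MAX_ATTEMPTS (3) and is not driven by the
environment variable.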
Errors from NEWNYM are logged and swallowed — failing to recircuit
should not take down the crawl.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
MechaCat02
2026-05-31 18:52:28 +02:00
parent 8557e432a2
commit 8c6378b877
5 changed files with 699 additions and 114 deletions

View File

@@ -123,6 +123,18 @@ async fn spawn_crawler_daemon(
} }
let rate = Arc::new(rate); let rate = Arc::new(rate);
let tor = crate::crawler::tor::TorController::from_parts(
cfg.tor_control_url.as_deref(),
cfg.tor_control_password.as_deref(),
cfg.tor_control_cookie_path.as_deref(),
)
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
.map(Arc::new);
if let Some(t) = &tor {
tracing::info!(?t, "TOR control configured; transient pages will trigger NEWNYM");
}
let tor_recircuit_max = cfg.tor_recircuit_max_attempts;
// Browser manager. on_launch re-injects PHPSESSID on every fresh // Browser manager. on_launch re-injects PHPSESSID on every fresh
// chromium spawn so an idle teardown followed by re-launch stays // chromium spawn so an idle teardown followed by re-launch stays
// authenticated without operator action. // authenticated without operator action.
@@ -135,15 +147,22 @@ async fn spawn_crawler_daemon(
let sid = sid.clone(); let sid = sid.clone();
let domain = domain.clone(); let domain = domain.clone();
let start_url = start_url.clone(); let start_url = start_url.clone();
let tor_for_launch = tor.as_ref().map(Arc::clone);
let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| { let on_launch: browser_manager::OnLaunch = Arc::new(move |browser| {
let sid = sid.clone(); let sid = sid.clone();
let domain = domain.clone(); let domain = domain.clone();
let start_url = start_url.clone(); let start_url = start_url.clone();
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
Box::pin(async move { Box::pin(async move {
session::inject_phpsessid(&browser, &sid, &domain) session::inject_phpsessid(&browser, &sid, &domain)
.await .await
.context("on_launch: inject_phpsessid")?; .context("on_launch: inject_phpsessid")?;
session::verify_session(&browser, &start_url) session::verify_session_with_recircuit(
&browser,
&start_url,
tor_for_launch.as_deref(),
tor_recircuit_max,
)
.await .await
.context("on_launch: verify_session")?; .context("on_launch: verify_session")?;
Ok(()) Ok(())
@@ -157,17 +176,6 @@ async fn spawn_crawler_daemon(
let session_expired = Arc::new(AtomicBool::new(false)); let session_expired = Arc::new(AtomicBool::new(false));
let tor = crate::crawler::tor::TorController::from_parts(
cfg.tor_control_url.as_deref(),
cfg.tor_control_password.as_deref(),
cfg.tor_control_cookie_path.as_deref(),
)
.context("build TorController from CRAWLER_TOR_CONTROL_* env")?
.map(Arc::new);
if let Some(t) = &tor {
tracing::info!(?t, "TOR control configured; transient pages will trigger NEWNYM");
}
let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| { let metadata_pass: Option<Arc<dyn MetadataPass>> = cfg.start_url.as_ref().map(|url| {
let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass { let m: Arc<dyn MetadataPass> = Arc::new(RealMetadataPass {
browser_manager: Arc::clone(&browser_manager), browser_manager: Arc::clone(&browser_manager),

View File

@@ -88,6 +88,11 @@ async fn main() -> anyhow::Result<()> {
.ok() .ok()
.filter(|s| !s.trim().is_empty()) .filter(|s| !s.trim().is_empty())
.map(std::path::PathBuf::from); .map(std::path::PathBuf::from);
let tor_recircuit_max_attempts: u32 = std::env::var("CRAWLER_TOR_RECIRCUIT_MAX_ATTEMPTS")
.ok()
.and_then(|s| s.parse().ok())
.unwrap_or(3)
.max(1);
let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false); let keep_browser_open = env_bool("CRAWLER_KEEP_BROWSER_OPEN", false);
let db = PgPoolOptions::new() let db = PgPoolOptions::new()
@@ -154,35 +159,6 @@ async fn main() -> anyhow::Result<()> {
"starting crawler" "starting crawler"
); );
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
// alive for the entire run — same lifecycle as the old direct
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
// session probe; bad cookies fail fast before any real work happens.
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
(Some(sid), Some(domain)) => {
let sid = sid.clone();
let domain = domain.clone();
let start_url_clone = start_url.clone();
Arc::new(move |browser| {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url_clone.clone();
Box::pin(async move {
session::inject_phpsessid(&browser, &sid, &domain)
.await
.context("inject_phpsessid")?;
session::verify_session(&browser, &start_url)
.await
.context("verify_session")?;
Ok(())
})
})
}
_ => browser_manager::noop_on_launch(),
};
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
let tor = mangalord::crawler::tor::TorController::from_parts( let tor = mangalord::crawler::tor::TorController::from_parts(
tor_control_url.as_deref(), tor_control_url.as_deref(),
tor_control_password.as_deref(), tor_control_password.as_deref(),
@@ -194,6 +170,42 @@ async fn main() -> anyhow::Result<()> {
tracing::info!(?t, "TOR control configured"); tracing::info!(?t, "TOR control configured");
} }
// BrowserManager with idle_timeout = ZERO so the CLI keeps Chromium
// alive for the entire run — same lifecycle as the old direct
// `browser::launch()` flow. on_launch re-injects PHPSESSID + runs the
// session probe; bad cookies fail fast before any real work happens.
let on_launch: browser_manager::OnLaunch = match (&phpsessid, &cookie_domain) {
(Some(sid), Some(domain)) => {
let sid = sid.clone();
let domain = domain.clone();
let start_url_clone = start_url.clone();
let tor_for_launch = tor.as_ref().map(Arc::clone);
Arc::new(move |browser| {
let sid = sid.clone();
let domain = domain.clone();
let start_url = start_url_clone.clone();
let tor_for_launch = tor_for_launch.as_ref().map(Arc::clone);
Box::pin(async move {
session::inject_phpsessid(&browser, &sid, &domain)
.await
.context("inject_phpsessid")?;
session::verify_session_with_recircuit(
&browser,
&start_url,
tor_for_launch.as_deref(),
tor_recircuit_max_attempts,
)
.await
.context("verify_session")?;
Ok(())
})
})
}
_ => browser_manager::noop_on_launch(),
};
let session_ready = phpsessid.is_some() && cookie_domain.is_some();
let manager = BrowserManager::new(options, Duration::ZERO, on_launch);
let result = run( let result = run(
Arc::clone(&manager), Arc::clone(&manager),
&db, &db,

View File

@@ -73,40 +73,35 @@ pub enum SyncOutcome {
SessionExpired, SessionExpired,
} }
/// Fetch all images for one chapter and persist them atomically. On /// Per-chapter recircuit budget for both transient pages and
/// any error after the first storage put, the DB transaction rolls /// `Unauthenticated` outcomes. When TOR is not configured the budget
/// back so the chapter stays at `page_count = 0` and is retried on the /// is effectively 0 (no recircuit attempted; original behavior).
/// next run. Bytes already written to storage become orphans; a future const CHAPTER_RECIRCUIT_MAX_ATTEMPTS: u32 = 3;
/// reaper sweeps them.
#[allow(clippy::too_many_arguments)] /// Outcome of [`fetch_chapter_html_with_recircuit`]. `Ok` carries the
pub async fn sync_chapter_content( /// final reader HTML; the other two map to `sync_chapter_content`'s
browser: &chromiumoxide::Browser, /// existing failure modes.
db: &PgPool, #[derive(Debug)]
storage: &dyn Storage, enum ChapterFetchOutcome {
http: &reqwest::Client, Ok(String),
rate: &HostRateLimiters, /// `ChapterProbe::Unauthenticated` after exhausting recircuit
chapter_id: Uuid, /// budget (or with budget=0). Caller returns
manga_id: Uuid, /// `SyncOutcome::SessionExpired`.
source_url: &str, SessionExpired,
force_refetch: bool, /// `ChapterProbe::Transient` after exhausting recircuit budget
allowlist: &DownloadAllowlist, /// (or with budget=0). Caller bails so the dispatcher does
max_image_bytes: usize, /// exponential backoff.
_tor: Option<&crate::crawler::tor::TorController>, PersistentTransient,
) -> anyhow::Result<SyncOutcome> {
// Skip if already fetched, unless caller explicitly forces.
if !force_refetch {
let (page_count,): (i32,) =
sqlx::query_as("SELECT page_count FROM chapters WHERE id = $1")
.bind(chapter_id)
.fetch_one(db)
.await
.context("read chapter page_count")?;
if page_count > 0 {
return Ok(SyncOutcome::Skipped);
}
} }
// Nav to chapter page (rate-limited per host). /// Single rate-limited Chromium navigation to the chapter URL,
/// returning the page HTML. Extracted from `sync_chapter_content` so
/// the recircuit loop can call it once per attempt.
async fn fetch_chapter_html_once(
browser: &chromiumoxide::Browser,
rate: &HostRateLimiters,
source_url: &str,
) -> anyhow::Result<String> {
rate.wait_for(source_url).await?; rate.wait_for(source_url).await?;
let page = browser let page = browser
.new_page(source_url) .new_page(source_url)
@@ -125,28 +120,128 @@ pub async fn sync_chapter_content(
crate::crawler::nav::SELECTOR_TIMEOUT, crate::crawler::nav::SELECTOR_TIMEOUT,
) )
.await; .await;
let html = page.content().await.context("read chapter html")?; let html = page.content().await.context("read chapter html")?;
page.close().await.ok(); page.close().await.ok();
Ok(html)
}
// Three-way session classification: distinguishes a transient /// Pure-over-IO loop: fetch + classify, with up to `recircuit_budget`
// hiccup (broken-page body or logged-in-but-no-reader) from a /// NEWNYM-and-retry cycles after a `Transient` or `Unauthenticated`
// genuine PHPSESSID expiry (no reader and no avatar widget). The /// outcome. `recircuit_budget = 0` collapses to the original
// earlier binary `#avatar_menu` check conflated both and froze /// single-shot behavior — `Unauthenticated` → `SessionExpired`,
// every worker on a layout shift. /// `Transient` → `PersistentTransient` on the first hit, no recircuit.
async fn fetch_chapter_html_with_recircuit<F, Fut, R, RFut>(
mut fetch: F,
mut recircuit: R,
recircuit_budget: u32,
source_url_for_msg: &str,
) -> anyhow::Result<ChapterFetchOutcome>
where
F: FnMut() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<String>>,
R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>,
{
let mut recircuits = 0u32;
loop {
let html = fetch().await?;
match session::classify_chapter_probe(&html) { match session::classify_chapter_probe(&html) {
ChapterProbe::Unauthenticated => return Ok(SyncOutcome::SessionExpired), ChapterProbe::Ok => return Ok(ChapterFetchOutcome::Ok(html)),
ChapterProbe::Unauthenticated => {
if recircuits < recircuit_budget {
recircuits += 1;
tracing::warn!(
attempt = recircuits,
max = recircuit_budget,
url = source_url_for_msg,
"chapter probe Unauthenticated; signaling TOR NEWNYM and retrying"
);
recircuit().await;
continue;
}
return Ok(ChapterFetchOutcome::SessionExpired);
}
ChapterProbe::Transient => { ChapterProbe::Transient => {
if recircuits < recircuit_budget {
recircuits += 1;
tracing::warn!(
attempt = recircuits,
max = recircuit_budget,
url = source_url_for_msg,
"chapter probe Transient; signaling TOR NEWNYM and retrying"
);
recircuit().await;
continue;
}
return Ok(ChapterFetchOutcome::PersistentTransient);
}
}
}
}
/// Fetch all images for one chapter and persist them atomically. On
/// any error after the first storage put, the DB transaction rolls
/// back so the chapter stays at `page_count = 0` and is retried on the
/// next run. Bytes already written to storage become orphans; a future
/// reaper sweeps them.
#[allow(clippy::too_many_arguments)]
pub async fn sync_chapter_content(
browser: &chromiumoxide::Browser,
db: &PgPool,
storage: &dyn Storage,
http: &reqwest::Client,
rate: &HostRateLimiters,
chapter_id: Uuid,
manga_id: Uuid,
source_url: &str,
force_refetch: bool,
allowlist: &DownloadAllowlist,
max_image_bytes: usize,
tor: Option<&crate::crawler::tor::TorController>,
) -> anyhow::Result<SyncOutcome> {
// Skip if already fetched, unless caller explicitly forces.
if !force_refetch {
let (page_count,): (i32,) =
sqlx::query_as("SELECT page_count FROM chapters WHERE id = $1")
.bind(chapter_id)
.fetch_one(db)
.await
.context("read chapter page_count")?;
if page_count > 0 {
return Ok(SyncOutcome::Skipped);
}
}
// Fetch + classify with a recircuit budget when TOR is configured.
// Without TOR the closure-recircuit is a no-op and the loop reduces
// to the original single-attempt behavior.
let recircuit_budget = if tor.is_some() { CHAPTER_RECIRCUIT_MAX_ATTEMPTS } else { 0 };
let html = match fetch_chapter_html_with_recircuit(
|| fetch_chapter_html_once(browser, rate, source_url),
|| async {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
}
}
},
recircuit_budget,
source_url,
)
.await?
{
ChapterFetchOutcome::Ok(html) => html,
ChapterFetchOutcome::SessionExpired => return Ok(SyncOutcome::SessionExpired),
ChapterFetchOutcome::PersistentTransient => {
// Surface as a typed Err so the dispatcher path runs // Surface as a typed Err so the dispatcher path runs
// ack_failed with exponential backoff (rather than the // ack_failed with exponential backoff (rather than the
// session-expired sticky flag). // session-expired sticky flag).
anyhow::bail!( anyhow::bail!(
"chapter page at {source_url} returned a transient response \ "chapter page at {source_url} returned a transient response after \
(broken-page body or reader didn't render); will retry" {recircuit_budget} TOR recircuit(s); will retry"
); );
} }
ChapterProbe::Ok => {} };
}
let images = parse_chapter_pages(&html) let images = parse_chapter_pages(&html)
.with_context(|| format!("parse chapter pages at {source_url}"))?; .with_context(|| format!("parse chapter pages at {source_url}"))?;
@@ -305,4 +400,181 @@ mod tests {
let err = parse_chapter_pages(html).expect_err("expected Transient"); let err = parse_chapter_pages(html).expect_err("expected Transient");
assert!(err.is_transient(), "got non-transient: {err}"); assert!(err.is_transient(), "got non-transient: {err}");
} }
// --- fetch_chapter_html_with_recircuit -------------------------------
const OK_HTML: &str = r#"<html><body><a id="pic_container"><img id="page1" src="x"/></a></body></html>"#;
const UNAUTH_HTML: &str = r#"<html><body><header><div id="logo">x</div></header><main>please log in</main></body></html>"#;
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
/// Happy path: the very first fetch classifies as `Ok`, so the loop
/// returns after one fetch and never invokes the recircuit hook.
#[tokio::test]
async fn recircuit_loop_ok_first_attempt() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            std::future::ready(Ok(OK_HTML.to_string()))
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok");
    assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
    assert_eq!(fetch_calls, 1);
    assert_eq!(recircuit_calls, 0);
}
/// With a budget of 0 an `Unauthenticated` page is reported as
/// `SessionExpired` straight away: one fetch, no recircuit call.
#[tokio::test]
async fn recircuit_loop_unauth_with_zero_budget_returns_session_expired() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            let body = UNAUTH_HTML.to_string();
            async move { Ok(body) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        0,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok-result");
    assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
    assert_eq!(fetch_calls, 1);
    assert_eq!(recircuit_calls, 0, "no recircuit when budget is 0 (TOR disabled)");
}
/// An `Unauthenticated` first fetch followed by a good page recovers
/// within the budget: two fetches, exactly one recircuit.
#[tokio::test]
async fn recircuit_loop_unauth_then_ok_within_budget() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            // Only the first fetch is served the logged-out page.
            let body = if fetch_calls == 1 { UNAUTH_HTML } else { OK_HTML };
            async move { Ok(body.to_string()) }
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok");
    assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
    assert_eq!(fetch_calls, 2);
    assert_eq!(recircuit_calls, 1);
}
/// A page that stays `Unauthenticated` uses up the whole budget and
/// then surfaces as `SessionExpired`.
#[tokio::test]
async fn recircuit_loop_unauth_exhausts_budget_returns_session_expired() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            std::future::ready(Ok(UNAUTH_HTML.to_string()))
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        2,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok-result");
    assert!(matches!(outcome, ChapterFetchOutcome::SessionExpired));
    // A budget of 2 means the initial fetch plus two recircuit-and-retry
    // rounds, i.e. three fetches in total.
    assert_eq!(fetch_calls, 3);
    assert_eq!(recircuit_calls, 2);
}
/// Two `Transient` pages followed by a good one recover within a
/// budget of 3: three fetches, two recircuits.
#[tokio::test]
async fn recircuit_loop_transient_then_ok_within_budget() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            // Fetches 1 and 2 get the broken page; the third succeeds.
            let body = if fetch_calls < 3 { TRANSIENT_HTML } else { OK_HTML };
            async move { Ok(body.to_string()) }
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok");
    assert!(matches!(outcome, ChapterFetchOutcome::Ok(_)));
    assert_eq!(fetch_calls, 3);
    assert_eq!(recircuit_calls, 2);
}
/// A page that stays `Transient` burns the full budget and is then
/// reported as `PersistentTransient`.
#[tokio::test]
async fn recircuit_loop_transient_exhausts_budget_returns_persistent() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            let body = TRANSIENT_HTML.to_string();
            async move { Ok(body) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        "https://example/c",
    )
    .await;
    let outcome = result.expect("ok-result");
    assert!(matches!(outcome, ChapterFetchOutcome::PersistentTransient));
    assert_eq!(fetch_calls, 4, "budget=3 → 1 initial + 3 retries");
    assert_eq!(recircuit_calls, 3);
}
/// A fetch error is returned to the caller after the first attempt;
/// the loop does not retry it.
#[tokio::test]
async fn recircuit_loop_propagates_fetch_errors() {
    let mut fetch_calls = 0u32;
    let result = fetch_chapter_html_with_recircuit(
        || {
            fetch_calls += 1;
            std::future::ready(Err(anyhow::anyhow!("nav timeout")))
        },
        || std::future::ready(()),
        3,
        "https://example/c",
    )
    .await;
    let err = result.expect_err("fetch error bubbles");
    assert_eq!(fetch_calls, 1);
    let rendered = format!("{err:#}");
    assert!(rendered.contains("nav timeout"));
}
} }

View File

@@ -162,38 +162,118 @@ const PROBE_RETRY_DELAY: Duration = Duration::from_secs(2);
/// limiter. The trade is worth it — failing here costs ~1s; failing 30 /// limiter. The trade is worth it — failing here costs ~1s; failing 30
/// minutes into a backfill costs 30 minutes. /// minutes into a backfill costs 30 minutes.
pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> { pub async fn verify_session(browser: &Browser, probe_url: &str) -> anyhow::Result<()> {
let mut attempt = 0u32; verify_session_with_recircuit(browser, probe_url, None, 0).await
}
/// Like [`verify_session`] but, when `tor` is `Some`, signals
/// `SIGNAL NEWNYM` between retries on transient pages AND treats
/// `Unauthenticated` as a recoverable failure (up to
/// `unauth_max_recircuit` recircuit cycles before giving up). The bare
/// `verify_session` is `verify_session_with_recircuit(..., None, 0)`.
///
/// When `tor` is `None`, `unauth_max_recircuit` is ignored — `Unauth`
/// stays a hard fail, matching the original behavior.
pub async fn verify_session_with_recircuit(
browser: &Browser,
probe_url: &str,
tor: Option<&crate::crawler::tor::TorController>,
unauth_max_recircuit: u32,
) -> anyhow::Result<()> {
let effective_unauth_budget = if tor.is_some() { unauth_max_recircuit } else { 0 };
run_session_probe_loop(
|| fetch_probe_html(browser, probe_url),
|| async {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; continuing with same circuit");
}
}
},
PROBE_MAX_ATTEMPTS,
effective_unauth_budget,
PROBE_RETRY_DELAY,
probe_url,
)
.await
}
/// Pure-over-IO loop body for the session probe. Generic over the
/// fetch and recircuit closures so it can be unit-tested without a
/// real browser or TOR daemon.
///
/// Semantics:
/// - `SessionProbe::Ok` → return `Ok(())`.
/// - `SessionProbe::Unauthenticated` → if `unauth_max_recircuit > 0`
/// and budget remaining, call `recircuit` + sleep + retry. Otherwise
/// bail with the "PHPSESSID expired" diagnostic, mentioning the
/// recircuit count so a TOR-misconfig diagnosis is easier.
/// - `SessionProbe::Transient` → up to `transient_max_attempts` total
/// tries, calling `recircuit` between each. After the cap, bail with
/// the "site down or rate-limiting" diagnostic.
async fn run_session_probe_loop<F, Fut, R, RFut>(
mut fetch_html: F,
mut recircuit: R,
transient_max_attempts: u32,
unauth_max_recircuit: u32,
retry_delay: Duration,
probe_url_for_msg: &str,
) -> anyhow::Result<()>
where
F: FnMut() -> Fut,
Fut: std::future::Future<Output = anyhow::Result<String>>,
R: FnMut() -> RFut,
RFut: std::future::Future<Output = ()>,
{
let mut transient_attempts = 0u32;
let mut unauth_recircuits = 0u32;
loop { loop {
attempt += 1; let html = fetch_html().await?;
let html = fetch_probe_html(browser, probe_url).await?;
match classify_probe(&html) { match classify_probe(&html) {
SessionProbe::Ok => { SessionProbe::Ok => {
tracing::info!(attempt, "session probe ok — #logo + #avatar_menu present"); tracing::info!(
transient_attempts,
unauth_recircuits,
"session probe ok — #logo + #avatar_menu present"
);
return Ok(()); return Ok(());
} }
SessionProbe::Unauthenticated => { SessionProbe::Unauthenticated => {
if unauth_recircuits < unauth_max_recircuit {
unauth_recircuits += 1;
tracing::warn!(
attempt = unauth_recircuits,
max = unauth_max_recircuit,
"session probe Unauthenticated despite PHPSESSID; signaling TOR \
NEWNYM and retrying"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
continue;
}
return Err(anyhow!( return Err(anyhow!(
"session probe failed — #avatar_menu not present at {probe_url} \ "session probe failed — #avatar_menu not present at {probe_url_for_msg} \
(page rendered the normal layout); PHPSESSID is missing, expired, \ after {unauth_recircuits} TOR recircuit(s); PHPSESSID is missing, \
or revoked. Refresh CRAWLER_PHPSESSID and re-run." expired, or revoked. Refresh CRAWLER_PHPSESSID and re-run."
)); ));
} }
SessionProbe::Transient if attempt < PROBE_MAX_ATTEMPTS => {
tracing::warn!(
attempt,
max_attempts = PROBE_MAX_ATTEMPTS,
"session probe got a transient page; retrying"
);
tokio::time::sleep(PROBE_RETRY_DELAY).await;
}
SessionProbe::Transient => { SessionProbe::Transient => {
transient_attempts += 1;
if transient_attempts >= transient_max_attempts {
return Err(anyhow!( return Err(anyhow!(
"session probe failed — probe page at {probe_url} returned a \ "session probe failed — probe page at {probe_url_for_msg} returned \
broken-page response after {PROBE_MAX_ATTEMPTS} attempts. \ a broken-page response after {transient_max_attempts} attempts. \
The site appears to be down or rate-limiting us; try again \ The site appears to be down or rate-limiting us; try again \
later before refreshing CRAWLER_PHPSESSID." later before refreshing CRAWLER_PHPSESSID."
)); ));
} }
tracing::warn!(
attempt = transient_attempts,
max_attempts = transient_max_attempts,
"session probe got a transient page; recircuit + retry"
);
recircuit().await;
tokio::time::sleep(retry_delay).await;
}
} }
} }
} }
@@ -336,6 +416,202 @@ mod tests {
assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok); assert_eq!(classify_chapter_probe(html), ChapterProbe::Ok);
} }
// --- run_session_probe_loop -----------------------------------------
//
// These tests exercise the recircuit-aware loop without a real
// browser or TOR daemon. The fetch and recircuit closures are fakes
// that return canned HTML and bump local call counters.
const OK_HTML: &str = r#"<html><body><div id="logo"></div><div id="avatar_menu"></div></body></html>"#;
const UNAUTH_HTML: &str = r#"<html><body><div id="logo"></div></body></html>"#;
const TRANSIENT_HTML: &str = "<html><body><p>we're sorry, the request file are not found.</p></body></html>";
/// A probe that succeeds immediately needs one fetch and must not
/// touch the recircuit hook.
#[tokio::test]
async fn probe_loop_ok_on_first_attempt_does_not_recircuit() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            std::future::ready(Ok(OK_HTML.to_string()))
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        3,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    result.expect("ok on first attempt");
    assert_eq!(fetch_calls, 1);
    assert_eq!(recircuit_calls, 0);
}
/// With unauth budget available, one `Unauthenticated` probe followed
/// by a good one succeeds after a single recircuit.
#[tokio::test]
async fn probe_loop_unauth_then_ok_when_recircuit_budget_available() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            // Only the first probe sees the logged-out page.
            let body = if fetch_calls == 1 { UNAUTH_HTML } else { OK_HTML };
            async move { Ok(body.to_string()) }
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        3,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    result.expect("recovers after one recircuit");
    assert_eq!(fetch_calls, 2);
    assert_eq!(recircuit_calls, 1);
}
/// With an unauth budget of 0, an `Unauthenticated` probe fails on the
/// first fetch and the error tells the operator to refresh the cookie.
#[tokio::test]
async fn probe_loop_unauth_with_zero_recircuit_budget_fails_fast() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            let body = UNAUTH_HTML.to_string();
            async move { Ok(body) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        0,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    let err = result.expect_err("zero budget → fail");
    assert_eq!(fetch_calls, 1, "no retry when budget is 0");
    assert_eq!(recircuit_calls, 0);
    let msg = format!("{err:#}");
    assert!(msg.contains("Refresh CRAWLER_PHPSESSID"), "msg: {msg}");
}
/// A probe that stays `Unauthenticated` exhausts the unauth budget and
/// the resulting error reports how many recircuits were attempted.
#[tokio::test]
async fn probe_loop_unauth_after_exhausting_budget_emits_recircuit_count() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            std::future::ready(Ok(UNAUTH_HTML.to_string()))
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        10, // the transient cap plays no role in this scenario
        2,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    let err = result.expect_err("exhausts unauth budget");
    // Initial fetch plus two recircuit-and-retry rounds: three fetches.
    assert_eq!(fetch_calls, 3);
    assert_eq!(recircuit_calls, 2);
    let msg = format!("{err:#}");
    assert!(msg.contains("2 TOR recircuit"), "expected recircuit count in error, got: {msg}");
}
/// A probe that stays `Transient` is retried up to the attempt cap and
/// then fails with the broken-page diagnostic.
#[tokio::test]
async fn probe_loop_transient_repeats_until_max_then_errors() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            let body = TRANSIENT_HTML.to_string();
            async move { Ok(body) }
        },
        || {
            recircuit_calls += 1;
            async {}
        },
        3,
        0,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    let err = result.expect_err("transient until max → fail");
    assert_eq!(fetch_calls, 3);
    // The hook only runs between attempts, so three attempts give two
    // recircuits.
    assert_eq!(recircuit_calls, 2);
    let msg = format!("{err:#}");
    assert!(msg.contains("broken-page response after 3 attempts"), "msg: {msg}");
}
/// One `Transient` probe followed by a good one succeeds on the second
/// fetch after a single recircuit, even with an unauth budget of 0.
#[tokio::test]
async fn probe_loop_transient_then_ok_returns_ok_after_one_recircuit() {
    let mut fetch_calls = 0u32;
    let mut recircuit_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            // Only the first probe gets the broken page.
            let body = if fetch_calls == 1 { TRANSIENT_HTML } else { OK_HTML };
            async move { Ok(body.to_string()) }
        },
        || {
            recircuit_calls += 1;
            std::future::ready(())
        },
        3,
        0,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    result.expect("ok on second try");
    assert_eq!(fetch_calls, 2);
    assert_eq!(recircuit_calls, 1);
}
/// A fetch error leaves the loop after the first attempt and reaches
/// the caller with its message intact.
#[tokio::test]
async fn probe_loop_propagates_fetch_errors_immediately() {
    let mut fetch_calls = 0u32;
    let result = run_session_probe_loop(
        || {
            fetch_calls += 1;
            std::future::ready(Err(anyhow!("nav timeout")))
        },
        || std::future::ready(()),
        5,
        5,
        Duration::from_millis(0),
        "https://example/probe",
    )
    .await;
    let err = result.expect_err("fetch error bubbles");
    assert_eq!(fetch_calls, 1);
    let rendered = format!("{err:#}");
    assert!(rendered.contains("nav timeout"));
}
#[test] #[test]
fn classify_probe_trusts_broken_body_over_stray_avatar_match() { fn classify_probe_trusts_broken_body_over_stray_avatar_match() {
// Defensive: if a broken-page body somehow contains an // Defensive: if a broken-page body somehow contains an

View File

@@ -18,7 +18,7 @@ use super::{
SourceMangaRef, SourceMangaRef,
}; };
use crate::crawler::detect::{ use crate::crawler::detect::{
has_logo_sentinel, is_broken_page_body, retry_on_transient, PageError, has_logo_sentinel, is_broken_page_body, retry_on_transient_with_hook, PageError,
}; };
use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT}; use crate::crawler::nav::{wait_for_nav, wait_for_selector, NavError, SELECTOR_TIMEOUT};
@@ -79,12 +79,13 @@ impl Source for TargetSource {
// and the HTML is handed straight to the first `next_batch` call // and the HTML is handed straight to the first `next_batch` call
// so the walker doesn't re-fetch it. Page count is discovered // so the walker doesn't re-fetch it. Page count is discovered
// incrementally — see `TargetSourceWalker::next_batch`. // incrementally — see `TargetSourceWalker::next_batch`.
let first_html = retry_on_transient( let first_html = retry_on_transient_with_hook(
|| async { || async {
navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await navigate(ctx, self.base_url.as_str(), LIST_PAGE_MARKER).await
}, },
PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY, PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
) )
.await?; .await?;
@@ -169,7 +170,7 @@ impl DiscoverWalk for TargetSourceWalker {
parse_manga_list_from(&doc)? parse_manga_list_from(&doc)?
} }
None => { None => {
retry_on_transient( retry_on_transient_with_hook(
|| async { || async {
let html = navigate( let html = navigate(
ctx, ctx,
@@ -182,12 +183,13 @@ impl DiscoverWalk for TargetSourceWalker {
}, },
PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY, PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
) )
.await? .await?
} }
} }
} else { } else {
retry_on_transient( retry_on_transient_with_hook(
|| async { || async {
let url = page_url(&self.base_url, page_num); let url = page_url(&self.base_url, page_num);
let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?; let html = navigate(ctx, &url, LIST_PAGE_MARKER).await?;
@@ -196,6 +198,7 @@ impl DiscoverWalk for TargetSourceWalker {
}, },
PAGE_TRANSIENT_RETRY_ATTEMPTS, PAGE_TRANSIENT_RETRY_ATTEMPTS,
PAGE_TRANSIENT_RETRY_DELAY, PAGE_TRANSIENT_RETRY_DELAY,
|| async { recircuit_if_configured(ctx.tor).await },
) )
.await? .await?
}; };
@@ -274,6 +277,20 @@ fn classify_navigate_html(html: String) -> Result<String, PageError> {
Ok(html) Ok(html)
} }
/// Hook for [`retry_on_transient_with_hook`]: when TOR is configured,
/// signal `NEWNYM` so the next navigation draws a fresh exit. Errors
/// from the controller are logged and swallowed — failing to recircuit
/// shouldn't take down the crawl, the next attempt just runs on the
/// same circuit as before.
async fn recircuit_if_configured(tor: Option<&crate::crawler::tor::TorController>) {
if let Some(t) = tor {
if let Err(e) = t.new_identity().await {
tracing::warn!(error = %e, "TOR NEWNYM failed; retrying on same circuit");
}
}
}
/// Substitutes the first `/N/` path segment with the target page /// Substitutes the first `/N/` path segment with the target page
/// number. Source impls that paginate via a different URL shape can /// number. Source impls that paginate via a different URL shape can
/// override this — for the modeled site the segment is always present. /// override this — for the modeled site the segment is always present.