feat: crawler manga-list & metadata sync with cover download (0.23.0)
- TargetSource: first concrete impl of the Source trait, modeled on
the old Puppeteer crawler's selectors (+ status normalization,
tag-count stripping, chapter list)
- DiscoverMode::Backfill walks pagination last->1, reverse within each
page (oldest-first); Incremental walks forward
- RateLimiter (tokio-time aware) plumbed through FetchContext so the
pagination walk honors the same per-host budget as the outer loop
- repo::crawler: ensure_source, upsert_manga_from_source (returns
New/Updated/Unchanged + current cover_image_path for backfill
decisions), sync_manga_chapters, mark_dropped_mangas — all
transactional, with case-insensitive lookups and source-insertable
genres
- Cover image download via reqwest+infer; stored under
mangas/{id}/cover.{ext} via the Storage trait
- Single CRAWLER_PROXY env wires both Chromium (--proxy-server) and
reqwest::Proxy::all (HTTP/HTTPS/SOCKS5)
- Crawler binary: positional start URL or $CRAWLER_START_URL,
$CRAWLER_LIMIT (cap fetches + skip drop pass on partial runs),
$CRAWLER_SKIP_CHAPTERS (disable selector AND sync), $CRAWLER_RATE_MS
- Silences chromiumoxide 0.7's known CDP deserialize log spam via
default tracing filter + CdpError::Serde downgrade
- 9 sqlx integration tests + 11 selector/rate-limit unit tests
This commit is contained in:
69
backend/src/crawler/rate_limit.rs
Normal file
69
backend/src/crawler/rate_limit.rs
Normal file
@@ -0,0 +1,69 @@
|
||||
//! Per-host request pacing.
//!
//! Single-token bucket: each `wait().await` either returns immediately
//! (on the very first call, or when at least `interval` has elapsed since
//! the previous call completed) or sleeps just long enough to make up the
//! difference. Uses `tokio::time::Instant` so tests can run under
//! `start_paused` virtual time without sleeping for real.
|
||||
|
||||
use std::time::Duration;
|
||||
use tokio::time::Instant;
|
||||
|
||||
#[derive(Debug)]
|
||||
pub struct RateLimiter {
|
||||
interval: Duration,
|
||||
last: Option<Instant>,
|
||||
}
|
||||
|
||||
impl RateLimiter {
|
||||
pub fn new(interval: Duration) -> Self {
|
||||
Self {
|
||||
interval,
|
||||
last: None,
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn wait(&mut self) {
|
||||
if let Some(last) = self.last {
|
||||
let elapsed = last.elapsed();
|
||||
if elapsed < self.interval {
|
||||
tokio::time::sleep(self.interval - elapsed).await;
|
||||
}
|
||||
}
|
||||
self.last = Some(Instant::now());
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Interval used by every limiter constructed in these tests.
    const STEP: Duration = Duration::from_millis(100);

    /// A fresh limiter has no previous call to space against.
    #[tokio::test(start_paused = true)]
    async fn first_call_does_not_sleep() {
        let mut limiter = RateLimiter::new(STEP);
        let started = Instant::now();

        limiter.wait().await;

        assert_eq!(started.elapsed(), Duration::ZERO);
    }

    /// Two back-to-back calls: the second one has to absorb the whole
    /// interval because the first returned instantly.
    #[tokio::test(start_paused = true)]
    async fn second_call_sleeps_to_fill_interval() {
        let mut limiter = RateLimiter::new(STEP);
        let started = Instant::now();

        limiter.wait().await;
        limiter.wait().await;

        assert_eq!(started.elapsed(), Duration::from_millis(100));
    }

    /// When the caller has already idled for longer than the interval, the
    /// next call must not add any further delay.
    #[tokio::test(start_paused = true)]
    async fn no_sleep_if_interval_already_elapsed() {
        let mut limiter = RateLimiter::new(STEP);
        limiter.wait().await;

        // Idle for 250ms of virtual time, well past the 100ms interval.
        tokio::time::sleep(Duration::from_millis(250)).await;

        let resumed = Instant::now();
        limiter.wait().await;

        assert_eq!(resumed.elapsed(), Duration::ZERO);
    }
}
|
||||
Reference in New Issue
Block a user