The target site orders by update_date DESC, and any new or updated manga pushes every other entry down by one slot. The paginated walker was blind to this drift:

* Backfill (page last -> 1): shifts push items into pages that have already been finished. The displaced manga was silently missed; and because mark_dropped_mangas runs on a fully completed walk, such items were even falsely marked as dropped, since their last_seen_at was stale.
* Incremental (page 1 -> last): a shift causes the last-slot item of an already-read page to reappear on the next page, leading to a redundant fetch_manga and an inflated consecutive_unchanged streak.

The fix is two-pronged:

1. Backfill boundary re-check. After fetching each page P, re-fetch the previously walked page P+1 and check where its old slot-0 key now sits. If it slid to slot K, the first K entries are items that used to live on P and slid past us; they are appended to the batch. If the anchor is gone entirely (a multi-page shift, or it was bumped to page 1), the whole re-fetched page is processed conservatively and the pipeline dedup absorbs the noise. The re-check must be the *last* navigation of the iteration to close the within-iteration race.

2. Run-scoped dedup in run_metadata_pass. A HashSet<String> of source_manga_keys avoids double-processing. The set uses a contains-then-insert pattern, with the insert firing *after* a successful upsert, so a transient fetch/upsert failure leaves the key retryable if it reappears later in the same pass (via the boundary re-check or another batch).

Incremental mode does not run the re-check (shifts move in the same direction as the walk); only the dedup helps it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
58 lines
1.6 KiB
TOML
58 lines
1.6 KiB
TOML
[package]
name = "mangalord"
version = "0.35.1"
# This manifest declares two [[bin]] targets; `default-run` selects the one
# that a bare `cargo run` executes.
default-run = "mangalord"
edition = "2021"

# Library target of the package, rooted at src/lib.rs.
[lib]
path = "src/lib.rs"

# Binary target `mangalord`; this is the name referenced by `default-run`
# in [package].
[[bin]]
name = "mangalord"
path = "src/main.rs"

# Second binary target; not the default, so it is run explicitly with
# `cargo run --bin crawler`.
[[bin]]
name = "crawler"
path = "src/bin/crawler.rs"

# Runtime dependencies, sorted alphabetically by crate name.
[dependencies]
anyhow = "1"
argon2 = "0.5"
async-trait = "0.1"
axum = { version = "0.7", features = ["macros", "multipart"] }
axum-extra = { version = "0.9", features = ["cookie", "typed-header"] }
base64 = "0.22"
bytes = "1"
# Default features are disabled; only the listed features are enabled.
chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false }
chrono = { version = "0.4", features = ["serde"] }
chrono-tz = "0.9"
dotenvy = "0.15"
futures-core = "0.3"
futures-util = "0.3"
infer = "0.16"
rand = "0.8"
# Default features are disabled; only the listed features are enabled.
reqwest = { version = "0.12", default-features = false, features = ["rustls-tls", "socks", "cookies", "stream"] }
scraper = "0.20"
serde = { version = "1", features = ["derive"] }
serde_json = "1"
sha2 = "0.10"
sqlx = { version = "0.8", features = ["runtime-tokio", "postgres", "uuid", "chrono", "macros", "migrate"] }
subtle = "2"
thiserror = "1"
time = "0.3"
tokio = { version = "1", features = ["full"] }
tokio-util = { version = "0.7", features = ["io"] }
tower = { version = "0.5", features = ["util"] }
tower-http = { version = "0.6", features = ["trace", "cors"] }
tracing = "0.1"
tracing-subscriber = { version = "0.3", features = ["env-filter"] }
uuid = { version = "1", features = ["v4", "serde"] }

# Dependencies used only when building tests, examples and benchmarks.
# `futures-util` and `tower` repeat their [dependencies] entries verbatim;
# `tokio` is listed again to add the `test-util` feature for test builds.
[dev-dependencies]
futures-util = "0.3"
http-body-util = "0.1"
mime = "0.3"
tempfile = "3"
tokio = { version = "1", features = ["test-util"] }
tower = { version = "0.5", features = ["util"] }