From 26eccd0abe4a2dbfac31faf23d97f8acb6cc8077 Mon Sep 17 00:00:00 2001 From: MechaCat02 Date: Wed, 20 May 2026 22:07:56 +0200 Subject: [PATCH] feat: crawler scaffold with chromium launcher (0.22.0) - crawler module (browser, source trait, jobs, diff) + binary - chromiumoxide launcher with fetcher feature (auto-downloads Chromium on first run, caches under ~/.cache/mangalord/chromium) - LaunchOptions struct with extra_args, parseable from CRAWLER_BROWSER_MODE and CRAWLER_BROWSER_ARGS - migration 0012 introduces sources, manga_sources, chapter_sources, crawler_jobs - integration tests for headed + headless launch, ipify load+parse, and extra-args propagation (all #[ignore], opt-in) --- backend/Cargo.lock | 1297 +++++++++++++++++++++++- backend/Cargo.toml | 9 +- backend/migrations/0012_crawler.sql | 72 ++ backend/src/bin/crawler.rs | 29 + backend/src/crawler/browser.rs | 217 ++++ backend/src/crawler/diff.rs | 15 + backend/src/crawler/jobs.rs | 55 + backend/src/crawler/mod.rs | 19 + backend/src/crawler/source.rs | 105 ++ backend/src/lib.rs | 1 + backend/tests/crawler_browser_smoke.rs | 157 +++ frontend/package.json | 2 +- 12 files changed, 1951 insertions(+), 27 deletions(-) create mode 100644 backend/migrations/0012_crawler.sql create mode 100644 backend/src/bin/crawler.rs create mode 100644 backend/src/crawler/browser.rs create mode 100644 backend/src/crawler/diff.rs create mode 100644 backend/src/crawler/jobs.rs create mode 100644 backend/src/crawler/mod.rs create mode 100644 backend/src/crawler/source.rs create mode 100644 backend/tests/crawler_browser_smoke.rs diff --git a/backend/Cargo.lock b/backend/Cargo.lock index 18b4001..18c40f1 100644 --- a/backend/Cargo.lock +++ b/backend/Cargo.lock @@ -2,6 +2,25 @@ # It is not intended for manual editing. 
version = 4 +[[package]] +name = "adler2" +version = "2.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "320119579fcad9c21884f5c4861d16174d0e06250625266f50fe6898340abefa" + +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "getrandom 0.3.4", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -55,6 +74,20 @@ dependencies = [ "syn", ] +[[package]] +name = "async-tungstenite" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5359381fd414fbdb272c48f2111c16cb0bb3447bfacd59311ff3736da9f6664" +dependencies = [ + "futures-io", + "futures-util", + "log", + "pin-project-lite", + "tokio", + "tungstenite", +] + [[package]] name = "atoi" version = "2.0.0" @@ -208,6 +241,15 @@ dependencies = [ "generic-array", ] +[[package]] +name = "block2" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cdeb9d870516001442e364c5220d3574d2da8dc765554b4a617230d33fa58ef5" +dependencies = [ + "objc2", +] + [[package]] name = "bumpalo" version = "3.20.2" @@ -253,6 +295,94 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "cfg_aliases" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "613afe47fcd5fac7ccf1db93babcb082c5994d996f20b8b159f2ad1658eb5724" + +[[package]] +name = "chromiumoxide" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8380ce7721cc895fe8a184c49d615fe755b0c9a3d7986355cee847439fff907f" +dependencies = [ + "async-tungstenite", + "base64", + "cfg-if", + "chromiumoxide_cdp", + 
"chromiumoxide_fetcher", + "chromiumoxide_types", + "dunce", + "fnv", + "futures", + "futures-timer", + "pin-project-lite", + "reqwest", + "serde", + "serde_json", + "thiserror 1.0.69", + "tokio", + "tracing", + "url", + "which", + "winreg", +] + +[[package]] +name = "chromiumoxide_cdp" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cadbfb52fa0aeca43626f6c42ca04184b108b786f8e45198dc41a42aedcf2e50" +dependencies = [ + "chromiumoxide_pdl", + "chromiumoxide_types", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_fetcher" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "948c2f4660e2b68b876180a21ed37b6c695bf3b9c2cbc77e32bcc3d6ff295601" +dependencies = [ + "anyhow", + "directories", + "os_info", + "reqwest", + "thiserror 1.0.69", + "tokio", + "zip", +] + +[[package]] +name = "chromiumoxide_pdl" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c197aeb42872c5d4c923e7d8ad46d99a58fd0fec37f6491554ff677a6791d3c9" +dependencies = [ + "chromiumoxide_types", + "either", + "heck 0.4.1", + "once_cell", + "proc-macro2", + "quote", + "regex", + "serde", + "serde_json", +] + +[[package]] +name = "chromiumoxide_types" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "923486888790528d55ac37ec2f7483ed19eb8ccbb44701878e5856d1ceadf5d8" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "chrono" version = "0.4.44" @@ -323,6 +453,15 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "217698eaf96b4a3f0bc4f3662aaa55bdf913cd54d7204591faa790070c6d0853" +[[package]] +name = "crc32fast" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9481c1c90cbf2ac953f07c8d4a58aa3945c425b7185c9154d67a65e4230da511" +dependencies = [ + "cfg-if", +] + [[package]] name = "crossbeam-queue" 
version = "0.3.12" @@ -348,6 +487,35 @@ dependencies = [ "typenum", ] +[[package]] +name = "cssparser" +version = "0.31.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +dependencies = [ + "cssparser-macros", + "dtoa-short", + "itoa", + "phf 0.11.3", + "smallvec", +] + +[[package]] +name = "cssparser-macros" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331" +dependencies = [ + "quote", + "syn", +] + +[[package]] +name = "data-encoding" +version = "2.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4ae5f15dda3c708c0ade84bfee31ccab44a3da4f88015ed22f63732abe300c8" + [[package]] name = "der" version = "0.7.10" @@ -368,6 +536,17 @@ dependencies = [ "powerfmt", ] +[[package]] +name = "derive_more" +version = "0.99.20" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6edb4b64a43d977b8e99788fe3a04d483834fba1215a7e02caa415b626497f7f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "digest" version = "0.10.7" @@ -380,6 +559,37 @@ dependencies = [ "subtle", ] +[[package]] +name = "directories" +version = "5.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9a49173b84e034382284f27f1af4dcbbd231ffa358c0fe316541a7337f376a35" +dependencies = [ + "dirs-sys", +] + +[[package]] +name = "dirs-sys" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "520f05a5cbd335fae5a99ff7a6ab8627577660ee5cfd6a94a6a929b52ff0321c" +dependencies = [ + "libc", + "option-ext", + "redox_users", + "windows-sys 0.48.0", +] + +[[package]] +name = "dispatch2" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e0e367e4e7da84520dedcac1901e4da967309406d1e51017ae1abfb97adbd38" 
+dependencies = [ + "bitflags", + "objc2", +] + [[package]] name = "displaydoc" version = "0.2.5" @@ -397,6 +607,33 @@ version = "0.15.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1aaf95b3e5c8f23aa320147307562d361db0ae0d51242340f558153b4eb2439b" +[[package]] +name = "dtoa" +version = "1.0.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4c3cf4824e2d5f025c7b531afcb2325364084a16806f6d47fbc1f5fbd9960590" + +[[package]] +name = "dtoa-short" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cd1511a7b6a56299bd043a9c167a6d2bfb37bf84a6dfceaba651168adfb43c87" +dependencies = [ + "dtoa", +] + +[[package]] +name = "dunce" +version = "1.0.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92773504d58c093f6de2459af4af33faa518c13451eb8f2b5698ed3d36e7c813" + +[[package]] +name = "ego-tree" +version = "0.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "12a0bb14ac04a9fcf170d0bbbef949b44cc492f4452bd20c095636956f653642" + [[package]] name = "either" version = "1.15.0" @@ -465,6 +702,16 @@ version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" +[[package]] +name = "flate2" +version = "1.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "843fba2746e448b37e26a819579957415c8cef339bf08564fe8b7ddbd959573c" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + [[package]] name = "flume" version = "0.11.1" @@ -497,6 +744,31 @@ dependencies = [ "percent-encoding", ] +[[package]] +name = "futf" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df420e2e84819663797d1ec6544b13c5be84629e7bb00dc960d6917db2987843" +dependencies = [ + "mac", + "new_debug_unreachable", +] + +[[package]] +name = "futures" +version = "0.3.32" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b147ee9d1f6d097cef9ce628cd2ee62288d963e16fb287bd9286455b241382d" +dependencies = [ + "futures-channel", + "futures-core", + "futures-executor", + "futures-io", + "futures-sink", + "futures-task", + "futures-util", +] + [[package]] name = "futures-channel" version = "0.3.32" @@ -564,12 +836,19 @@ version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "037711b3d59c33004d3856fbdc83b99d4ff37a24768fa1be9ce3538a1cde4393" +[[package]] +name = "futures-timer" +version = "3.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af43fadb8a98512d547e37b4e92e0ced13e205c061b87b4623eff01d918d6968" + [[package]] name = "futures-util" version = "0.3.32" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "389ca41296e6190b48053de0321d02a77f32f8a5d2461dd38762c0593805c6d6" dependencies = [ + "futures-channel", "futures-core", "futures-io", "futures-macro", @@ -580,6 +859,15 @@ dependencies = [ "slab", ] +[[package]] +name = "fxhash" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c31b6d751ae2c7f11320402d34e41349dd1016f8d5d45e48c4312bc8625af50c" +dependencies = [ + "byteorder", +] + [[package]] name = "generic-array" version = "0.14.7" @@ -590,6 +878,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.24" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cfe4fbac503b8d1f88e6676011885f34b7174f46e59956bba534ba83abded4df" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.17" @@ -597,8 +894,24 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi", + "wasm-bindgen", +] + +[[package]] +name = "getrandom" +version = "0.3.4" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "899def5c37c4fd7b2664648c28120ecec138e4d395b459e5ca34f9cce2dd77fd" +dependencies = [ + "cfg-if", + "js-sys", + "libc", + "r-efi 5.3.0", + "wasip2", + "wasm-bindgen", ] [[package]] @@ -609,7 +922,7 @@ checksum = "0de51e6874e94e7bf76d726fc5d13ba782deca734ff60d5bb2fb2607c7406555" dependencies = [ "cfg-if", "libc", - "r-efi", + "r-efi 6.0.0", "wasip2", "wasip3", ] @@ -664,6 +977,12 @@ dependencies = [ "http", ] +[[package]] +name = "heck" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" + [[package]] name = "heck" version = "0.5.0" @@ -703,6 +1022,20 @@ dependencies = [ "windows-sys 0.61.2", ] +[[package]] +name = "html5ever" +version = "0.27.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +dependencies = [ + "log", + "mac", + "markup5ever", + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "http" version = "1.4.0" @@ -766,6 +1099,23 @@ dependencies = [ "pin-project-lite", "smallvec", "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "tokio", + "tokio-rustls", + "tower-service", + "webpki-roots", ] [[package]] @@ -774,13 +1124,21 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ + "base64", "bytes", + "futures-channel", + "futures-util", "http", "http-body", "hyper", + "ipnet", + "libc", + "percent-encoding", "pin-project-lite", + "socket2", "tokio", "tower-service", + "tracing", ] [[package]] @@ -937,6 +1295,12 
@@ dependencies = [ "cfb", ] +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + [[package]] name = "itoa" version = "1.0.18" @@ -1004,6 +1368,12 @@ dependencies = [ "vcpkg", ] +[[package]] +name = "linux-raw-sys" +version = "0.4.15" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d26c52dbd32dccf2d10cac7725f8eae5296885fb5703b261f7d0a0739ec807ab" + [[package]] name = "linux-raw-sys" version = "0.12.1" @@ -1031,9 +1401,21 @@ version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru-slab" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "112b39cec0b298b6c1999fee3e31427f74f676e4cb9879ed1a121b43661a4154" + +[[package]] +name = "mac" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" + [[package]] name = "mangalord" -version = "0.21.3" +version = "0.22.0" dependencies = [ "anyhow", "argon2", @@ -1042,6 +1424,7 @@ dependencies = [ "axum-extra", "base64", "bytes", + "chromiumoxide", "chrono", "dotenvy", "futures-core", @@ -1049,7 +1432,8 @@ dependencies = [ "http-body-util", "infer", "mime", - "rand", + "rand 0.8.6", + "scraper", "serde", "serde_json", "sha2", @@ -1067,6 +1451,20 @@ dependencies = [ "uuid", ] +[[package]] +name = "markup5ever" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +dependencies = [ + "log", + "phf 0.11.3", + "phf_codegen 0.11.3", + "string_cache", + "string_cache_codegen", + "tendril", +] + [[package]] name = "matchers" version = "0.2.0" @@ -1104,6 +1502,16 @@ version = 
"0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "miniz_oxide" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fa76a2c86f704bdb222d66965fb3d63269ce38518b83cb0575fca855ebb6316" +dependencies = [ + "adler2", + "simd-adler32", +] + [[package]] name = "mio" version = "1.2.0" @@ -1132,6 +1540,24 @@ dependencies = [ "version_check", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + +[[package]] +name = "nix" +version = "0.31.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cf20d2fde8ff38632c426f1165ed7436270b44f199fc55284c38276f9db47c3d" +dependencies = [ + "bitflags", + "cfg-if", + "cfg_aliases", + "libc", +] + [[package]] name = "nu-ansi-term" version = "0.50.3" @@ -1152,7 +1578,7 @@ dependencies = [ "num-integer", "num-iter", "num-traits", - "rand", + "rand 0.8.6", "smallvec", "zeroize", ] @@ -1193,12 +1619,192 @@ dependencies = [ "libm", ] +[[package]] +name = "objc2" +version = "0.6.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a12a8ed07aefc768292f076dc3ac8c48f3781c8f2d5851dd3d98950e8c5a89f" +dependencies = [ + "objc2-encode", +] + +[[package]] +name = "objc2-cloud-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "73ad74d880bb43877038da939b7427bba67e9dd42004a18b809ba7d87cee241c" +dependencies = [ + "bitflags", + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-data" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b402a653efbb5e82ce4df10683b6b28027616a2715e90009947d50b8dd298fa" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] 
+name = "objc2-core-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2a180dd8642fa45cdb7dd721cd4c11b1cadd4929ce112ebd8b9f5803cc79d536" +dependencies = [ + "bitflags", + "dispatch2", + "objc2", +] + +[[package]] +name = "objc2-core-graphics" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e022c9d066895efa1345f8e33e584b9f958da2fd4cd116792e15e07e4720a807" +dependencies = [ + "bitflags", + "dispatch2", + "objc2", + "objc2-core-foundation", + "objc2-io-surface", +] + +[[package]] +name = "objc2-core-image" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5d563b38d2b97209f8e861173de434bd0214cf020e3423a52624cd1d989f006" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-location" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ca347214e24bc973fc025fd0d36ebb179ff30536ed1f80252706db19ee452009" +dependencies = [ + "objc2", + "objc2-foundation", +] + +[[package]] +name = "objc2-core-text" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0cde0dfb48d25d2b4862161a4d5fcc0e3c24367869ad306b0c9ec0073bfed92d" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", + "objc2-core-graphics", +] + +[[package]] +name = "objc2-encode" +version = "4.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef25abbcd74fb2609453eb695bd2f860d389e457f67dc17cafc8b8cbc89d0c33" + +[[package]] +name = "objc2-foundation" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3e0adef53c21f888deb4fa59fc59f7eb17404926ee8a6f59f5df0fd7f9f3272" +dependencies = [ + "bitflags", + "block2", + "libc", + "objc2", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-io-surface" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "180788110936d59bab6bd83b6060ffdfffb3b922ba1396b312ae795e1de9d81d" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", +] + +[[package]] +name = "objc2-quartz-core" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96c1358452b371bf9f104e21ec536d37a650eb10f7ee379fff67d2e08d537f1f" +dependencies = [ + "bitflags", + "objc2", + "objc2-core-foundation", + "objc2-foundation", +] + +[[package]] +name = "objc2-ui-kit" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d87d638e33c06f577498cbcc50491496a3ed4246998a7fbba7ccb98b1e7eab22" +dependencies = [ + "bitflags", + "block2", + "objc2", + "objc2-cloud-kit", + "objc2-core-data", + "objc2-core-foundation", + "objc2-core-graphics", + "objc2-core-image", + "objc2-core-location", + "objc2-core-text", + "objc2-foundation", + "objc2-quartz-core", + "objc2-user-notifications", +] + +[[package]] +name = "objc2-user-notifications" +version = "0.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9df9128cbbfef73cda168416ccf7f837b62737d748333bfe9ab71c245d76613e" +dependencies = [ + "objc2", + "objc2-foundation", +] + [[package]] name = "once_cell" version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "option-ext" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "04744f49eae99ab78e0d5c0b603ab218f515ea8cfe5a456d7629ad883a3b6e7d" + +[[package]] +name = "os_info" +version = "3.15.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9cf20a545b305cf1da722b236b5155c9bb35f1d5ceb28c048bd96ca842f41b5b" +dependencies = [ + "android_system_properties", + "log", + "nix", + "objc2", + "objc2-foundation", + "objc2-ui-kit", + "windows-sys 
0.61.2", +] + [[package]] name = "parking" version = "2.2.1" @@ -1235,7 +1841,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "346f04948ba92c43e8469c1ee6736c7563d71012b17d40745260fe106aac2166" dependencies = [ "base64ct", - "rand_core", + "rand_core 0.6.4", "subtle", ] @@ -1254,6 +1860,96 @@ version = "2.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b4f627cb1b25917193a259e49bdad08f671f8d9708acfd5fe0a8c1455d87220" +[[package]] +name = "phf" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" +dependencies = [ + "phf_shared 0.10.0", +] + +[[package]] +name = "phf" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd6780a80ae0c52cc120a26a1a42c1ae51b247a253e4e06113d23d2c2edd078" +dependencies = [ + "phf_macros", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_codegen" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" +dependencies = [ + "phf_generator 0.10.0", + "phf_shared 0.10.0", +] + +[[package]] +name = "phf_codegen" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aef8048c789fa5e851558d709946d6d79a8ff88c0440c587967f8e94bfb1216a" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", +] + +[[package]] +name = "phf_generator" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d5285893bb5eb82e6aaf5d59ee909a06a16737a8970984dd7746ba9283498d6" +dependencies = [ + "phf_shared 0.10.0", + "rand 0.8.6", +] + +[[package]] +name = "phf_generator" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3c80231409c20246a13fddb31776fb942c38553c51e871f8cbd687a4cfb5843d" 
+dependencies = [ + "phf_shared 0.11.3", + "rand 0.8.6", +] + +[[package]] +name = "phf_macros" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f84ac04429c13a7ff43785d75ad27569f2951ce0ffd30a3321230db2fc727216" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.10.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b6796ad771acdc0123d2a88dc428b5e38ef24456743ddb1744ed628f9815c096" +dependencies = [ + "siphasher 0.3.11", +] + +[[package]] +name = "phf_shared" +version = "0.11.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67eabc2ef2a60eb7faa00097bd1ffdb5bd28e62bf39990626a582201b7a754e5" +dependencies = [ + "siphasher 1.0.3", +] + [[package]] name = "pin-project-lite" version = "0.2.17" @@ -1317,6 +2013,12 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "prettyplease" version = "0.2.37" @@ -1336,6 +2038,61 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "quinn" +version = "0.11.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b9e20a958963c291dc322d98411f541009df2ced7b5a4f2bd52337638cfccf20" +dependencies = [ + "bytes", + "cfg_aliases", + "pin-project-lite", + "quinn-proto", + "quinn-udp", + "rustc-hash", + "rustls", + "socket2", + "thiserror 2.0.18", + "tokio", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-proto" +version = "0.11.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "434b42fec591c96ef50e21e886936e66d3cc3f737104fdb9b737c40ffb94c098" +dependencies = [ + "bytes", + "getrandom 0.3.4", + "lru-slab", + "rand 0.9.4", + "ring", + "rustc-hash", + 
"rustls", + "rustls-pki-types", + "slab", + "thiserror 2.0.18", + "tinyvec", + "tracing", + "web-time", +] + +[[package]] +name = "quinn-udp" +version = "0.5.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "addec6a0dcad8a8d96a771f815f0eaf55f9d1805756410b39f5fa81332574cbd" +dependencies = [ + "cfg_aliases", + "libc", + "once_cell", + "socket2", + "tracing", + "windows-sys 0.59.0", +] + [[package]] name = "quote" version = "1.0.45" @@ -1345,6 +2102,12 @@ dependencies = [ "proc-macro2", ] +[[package]] +name = "r-efi" +version = "5.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f" + [[package]] name = "r-efi" version = "6.0.0" @@ -1358,8 +2121,18 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ca0ecfa931c29007047d1bc58e623ab12e5590e8c7cc53200d5202b69266d8a" dependencies = [ "libc", - "rand_chacha", - "rand_core", + "rand_chacha 0.3.1", + "rand_core 0.6.4", +] + +[[package]] +name = "rand" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea" +dependencies = [ + "rand_chacha 0.9.0", + "rand_core 0.9.5", ] [[package]] @@ -1369,7 +2142,17 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" dependencies = [ "ppv-lite86", - "rand_core", + "rand_core 0.6.4", +] + +[[package]] +name = "rand_chacha" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3022b5f1df60f26e1ffddd6c66e8aa15de382ae63b3a0c1bfc0e4d3e3f325cb" +dependencies = [ + "ppv-lite86", + "rand_core 0.9.5", ] [[package]] @@ -1381,6 +2164,15 @@ dependencies = [ "getrandom 0.2.17", ] +[[package]] +name = "rand_core" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "76afc826de14238e6e8c374ddcc1fa19e374fd8dd986b0d2af0d02377261d83c" +dependencies = [ + "getrandom 0.3.4", +] + [[package]] name = "redox_syscall" version = "0.5.18" @@ -1399,6 +2191,29 @@ dependencies = [ "bitflags", ] +[[package]] +name = "redox_users" +version = "0.4.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" +dependencies = [ + "getrandom 0.2.17", + "libredox", + "thiserror 1.0.69", +] + +[[package]] +name = "regex" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e10754a14b9137dd7b1e3e5b0493cc9171fdd105e0ab477f51b72e7f3ac0e276" +dependencies = [ + "aho-corasick", + "memchr", + "regex-automata", + "regex-syntax", +] + [[package]] name = "regex-automata" version = "0.4.14" @@ -1416,6 +2231,58 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" +dependencies = [ + "base64", + "bytes", + "futures-core", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-util", + "js-sys", + "log", + "percent-encoding", + "pin-project-lite", + "quinn", + "rustls", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-rustls", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", + "webpki-roots", +] + +[[package]] +name = "ring" +version = "0.17.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a4689e6c2294d81e88dc6261c768b63bc4fcdb852be6d1352498b114f61383b7" +dependencies = [ + "cc", + "cfg-if", + "getrandom 0.2.17", + "libc", + 
"untrusted", + "windows-sys 0.52.0", +] + [[package]] name = "rsa" version = "0.9.10" @@ -1429,13 +2296,32 @@ dependencies = [ "num-traits", "pkcs1", "pkcs8", - "rand_core", + "rand_core 0.6.4", "signature", "spki", "subtle", "zeroize", ] +[[package]] +name = "rustc-hash" +version = "2.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe" + +[[package]] +name = "rustix" +version = "0.38.44" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" +dependencies = [ + "bitflags", + "errno", + "libc", + "linux-raw-sys 0.4.15", + "windows-sys 0.59.0", +] + [[package]] name = "rustix" version = "1.1.4" @@ -1445,10 +2331,45 @@ dependencies = [ "bitflags", "errno", "libc", - "linux-raw-sys", + "linux-raw-sys 0.12.1", "windows-sys 0.61.2", ] +[[package]] +name = "rustls" +version = "0.23.40" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ef86cd5876211988985292b91c96a8f2d298df24e75989a43a3c73f2d4d8168b" +dependencies = [ + "once_cell", + "ring", + "rustls-pki-types", + "rustls-webpki", + "subtle", + "zeroize", +] + +[[package]] +name = "rustls-pki-types" +version = "1.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "30a7197ae7eb376e574fe940d068c30fe0462554a3ddbe4eca7838e049c937a9" +dependencies = [ + "web-time", + "zeroize", +] + +[[package]] +name = "rustls-webpki" +version = "0.103.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "61c429a8649f110dddef65e2a5ad240f747e85f7758a6bccc7e5777bd33f756e" +dependencies = [ + "ring", + "rustls-pki-types", + "untrusted", +] + [[package]] name = "rustversion" version = "1.0.22" @@ -1467,6 +2388,41 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" +[[package]] +name = "scraper" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b90460b31bfe1fc07be8262e42c665ad97118d4585869de9345a84d501a9eaf0" +dependencies = [ + "ahash", + "cssparser", + "ego-tree", + "getopts", + "html5ever", + "once_cell", + "selectors", + "tendril", +] + +[[package]] +name = "selectors" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +dependencies = [ + "bitflags", + "cssparser", + "derive_more", + "fxhash", + "log", + "new_debug_unreachable", + "phf 0.10.1", + "phf_codegen 0.10.0", + "precomputed-hash", + "servo_arc", + "smallvec", +] + [[package]] name = "semver" version = "1.0.28" @@ -1539,6 +2495,15 @@ dependencies = [ "serde", ] +[[package]] +name = "servo_arc" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +dependencies = [ + "stable_deref_trait", +] + [[package]] name = "sha1" version = "0.10.6" @@ -1593,9 +2558,27 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "77549399552de45a898a580c1b41d445bf730df867cc44e6c0233bbc4b8329de" dependencies = [ "digest", - "rand_core", + "rand_core 0.6.4", ] +[[package]] +name = "simd-adler32" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "703d5c7ef118737c72f1af64ad2f6f8c5e1921f818cdcb97b8fe6fc69bf66214" + +[[package]] +name = "siphasher" +version = "0.3.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "38b58827f4464d87d377d175e90bf58eb00fd8716ff0a62f80356b5e61555d0d" + +[[package]] +name = "siphasher" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"8ee5873ec9cce0195efcb7a4e9507a04cd49aec9c83d0389df45b1ef7ba2e649" + [[package]] name = "slab" version = "0.4.12" @@ -1710,7 +2693,7 @@ checksum = "19a9c1841124ac5a61741f96e1d9e2ec77424bf323962dd894bdb93f37d5219b" dependencies = [ "dotenvy", "either", - "heck", + "heck 0.5.0", "hex", "once_cell", "proc-macro2", @@ -1757,7 +2740,7 @@ dependencies = [ "memchr", "once_cell", "percent-encoding", - "rand", + "rand 0.8.6", "rsa", "serde", "sha1", @@ -1797,7 +2780,7 @@ dependencies = [ "md-5", "memchr", "once_cell", - "rand", + "rand 0.8.6", "serde", "serde_json", "sha2", @@ -1842,6 +2825,31 @@ version = "1.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ce2be8dc25455e1f91df71bfa12ad37d7af1092ae736f3a6cd0e37bc7810596" +[[package]] +name = "string_cache" +version = "0.8.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bf776ba3fa74f83bf4b63c3dcbbf82173db2632ed8452cb2d891d33f459de70f" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared 0.11.3", + "precomputed-hash", + "serde", +] + +[[package]] +name = "string_cache_codegen" +version = "0.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c711928715f1fe0fe509c53b43e993a9a557babc2d0a3567d0a3006f1ac931a0" +dependencies = [ + "phf_generator 0.11.3", + "phf_shared 0.11.3", + "proc-macro2", + "quote", +] + [[package]] name = "stringprep" version = "0.1.5" @@ -1875,6 +2883,9 @@ name = "sync_wrapper" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -1896,10 +2907,21 @@ dependencies = [ "fastrand", "getrandom 0.4.2", "once_cell", - "rustix", + "rustix 1.1.4", "windows-sys 0.61.2", ] +[[package]] +name = "tendril" +version = "0.4.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d24a120c5fc464a3458240ee02c299ebcb9d67b5249c8848b09d639dca8d7bb0" +dependencies = [ + "futf", + "mac", + "utf-8", +] + [[package]] name = "thiserror" version = "1.0.69" @@ -2033,6 +3055,16 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + [[package]] name = "tokio-stream" version = "0.1.18" @@ -2081,12 +3113,15 @@ checksum = "68d6fdd9f81c2819c9a8b0e0cd91660e7746a8e6ea2ba7c6b2b057985f6bcb51" dependencies = [ "bitflags", "bytes", + "futures-util", "http", "http-body", "pin-project-lite", + "tower", "tower-layer", "tower-service", "tracing", + "url", ] [[package]] @@ -2163,6 +3198,30 @@ dependencies = [ "tracing-log", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + +[[package]] +name = "tungstenite" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6e2e2ce1e47ed2994fd43b04c8f618008d4cabdd5ee34027cf14f9d918edd9c8" +dependencies = [ + "byteorder", + "bytes", + "data-encoding", + "http", + "httparse", + "log", + "rand 0.8.6", + "sha1", + "thiserror 1.0.69", + "utf-8", +] + [[package]] name = "typenum" version = "1.20.0" @@ -2196,12 +3255,24 @@ version = "0.1.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7df058c713841ad818f1dc5d3fd88063241cc61f49f5fbea4b951e8cf5a8d71d" +[[package]] +name = "unicode-width" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ac048d71ede7ee76d585517add45da530660ef4390e49b098733c6e897f254" + [[package]] name = "unicode-xid" version = "0.2.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = 
"ebc1c04c71510c7f702b52b7c350734c9ff1295c464a03335b00bb84fc54f853" +[[package]] +name = "untrusted" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" + [[package]] name = "url" version = "2.5.8" @@ -2214,6 +3285,12 @@ dependencies = [ "serde", ] +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + [[package]] name = "utf8_iter" version = "1.0.4" @@ -2250,6 +3327,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -2293,6 +3379,16 @@ dependencies = [ "wasm-bindgen-shared", ] +[[package]] +name = "wasm-bindgen-futures" +version = "0.4.71" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96492d0d3ffba25305a7dc88720d250b1401d7edca02cc3bcd50633b424673b8" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + [[package]] name = "wasm-bindgen-macro" version = "0.2.121" @@ -2359,6 +3455,47 @@ dependencies = [ "semver", ] +[[package]] +name = "web-sys" +version = "0.3.98" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b572dff8bcf38bad0fa19729c89bb5748b2b9b1d8be70cf90df697e3a8f32aa" +dependencies = [ + "js-sys", + "wasm-bindgen", +] + +[[package]] +name = "web-time" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb" +dependencies = [ + "js-sys", 
+ "wasm-bindgen", +] + +[[package]] +name = "webpki-roots" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "52f5ee44c96cf55f1b349600768e3ece3a8f26010c05265ab73f945bb1a2eb9d" +dependencies = [ + "rustls-pki-types", +] + +[[package]] +name = "which" +version = "6.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b4ee928febd44d98f2f459a4a79bd4d928591333a494a10a868418ac1b39cf1f" +dependencies = [ + "either", + "home", + "rustix 0.38.44", + "winsafe", +] + [[package]] name = "whoami" version = "1.6.1" @@ -2434,7 +3571,25 @@ version = "0.48.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "677d2418bec65e3338edb076e806bc1ec15693c5d0104683f2efe857f61056a9" dependencies = [ - "windows-targets", + "windows-targets 0.48.5", +] + +[[package]] +name = "windows-sys" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "282be5f36a8ce781fad8c8ae18fa3f9beff57ec1b52cb3de0789201425d9a33d" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", ] [[package]] @@ -2452,13 +3607,29 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a2fa6e2155d7247be68c096456083145c183cbbbc2764150dda45a87197940c" dependencies = [ - "windows_aarch64_gnullvm", - "windows_aarch64_msvc", - "windows_i686_gnu", - "windows_i686_msvc", - "windows_x86_64_gnu", - "windows_x86_64_gnullvm", - "windows_x86_64_msvc", + "windows_aarch64_gnullvm 0.48.5", + "windows_aarch64_msvc 0.48.5", + "windows_i686_gnu 0.48.5", + "windows_i686_msvc 0.48.5", + "windows_x86_64_gnu 0.48.5", + "windows_x86_64_gnullvm 0.48.5", + "windows_x86_64_msvc 0.48.5", +] + +[[package]] +name = 
"windows-targets" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9b724f72796e036ab90c1021d4780d4d3d648aca59e491e6b98e725b84e99973" +dependencies = [ + "windows_aarch64_gnullvm 0.52.6", + "windows_aarch64_msvc 0.52.6", + "windows_i686_gnu 0.52.6", + "windows_i686_gnullvm", + "windows_i686_msvc 0.52.6", + "windows_x86_64_gnu 0.52.6", + "windows_x86_64_gnullvm 0.52.6", + "windows_x86_64_msvc 0.52.6", ] [[package]] @@ -2467,42 +3638,106 @@ version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2b38e32f0abccf9987a4e3079dfb67dcd799fb61361e53e2882c3cbaf0d905d8" +[[package]] +name = "windows_aarch64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc35310971f3b2dbbf3f0690a219f40e2d9afcf64f9ab7cc1be722937c26b4bc" +[[package]] +name = "windows_aarch64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" + [[package]] name = "windows_i686_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a75915e7def60c94dcef72200b9a8e58e5091744960da64ec734a6c6e9b3743e" +[[package]] +name = "windows_i686_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e9b5ad5ab802e97eb8e295ac6720e509ee4c243f69d781394014ebfe8bbfa0b" + +[[package]] +name = "windows_i686_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" + [[package]] name = "windows_i686_msvc" version = "0.48.5" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "8f55c233f70c4b27f66c523580f78f1004e8b5a8b659e05a4eb49d4166cca406" +[[package]] +name = "windows_i686_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "53d40abd2583d23e4718fddf1ebec84dbff8381c07cae67ff7768bbf19c6718e" +[[package]] +name = "windows_x86_64_gnu" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "147a5c80aabfbf0c7d901cb5895d1de30ef2907eb21fbbab29ca94c5b08b1a78" + [[package]] name = "windows_x86_64_gnullvm" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0b7b52767868a23d5bab768e390dc5f5c55825b6d30b86c844ff2dc7414044cc" +[[package]] +name = "windows_x86_64_gnullvm" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" + [[package]] name = "windows_x86_64_msvc" version = "0.48.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ed94fce61571a4006852b7389a063ab983c02eb1bb37b47f8272ce92d06d9538" +[[package]] +name = "windows_x86_64_msvc" +version = "0.52.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" + +[[package]] +name = "winreg" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" +dependencies = [ + "cfg-if", + "windows-sys 0.48.0", +] + +[[package]] +name = "winsafe" +version = "0.0.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d135d17ab770252ad95e9a872d365cf3090e3be864a34ab46f48555993efc904" + [[package]] name = "wit-bindgen" version = "0.51.0" @@ -2525,7 +3760,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ea61de684c3ea68cb082b7a88508a8b27fcc8b797d738bfc99a82facf1d752dc" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "wit-parser", ] @@ -2536,7 +3771,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7c566e0f4b284dd6561c786d9cb0142da491f46a9fbed79ea69cdad5db17f21" dependencies = [ "anyhow", - "heck", + "heck 0.5.0", "indexmap", "prettyplease", "syn", @@ -2706,6 +3941,18 @@ dependencies = [ "syn", ] +[[package]] +name = "zip" +version = "0.6.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261" +dependencies = [ + "byteorder", + "crc32fast", + "crossbeam-utils", + "flate2", +] + [[package]] name = "zmij" version = "1.0.21" diff --git a/backend/Cargo.toml b/backend/Cargo.toml index 3b2d8bf..19e08c2 100644 --- a/backend/Cargo.toml +++ b/backend/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "mangalord" -version = "0.21.3" +version = "0.22.0" edition = "2021" [lib] @@ -10,6 +10,10 @@ path = "src/lib.rs" name = "mangalord" path = "src/main.rs" +[[bin]] +name = "crawler" +path = "src/bin/crawler.rs" + [dependencies] axum = { version = "0.7", features = ["macros", "multipart"] } tokio = { version = "1", features = ["full"] } @@ -36,7 +40,10 @@ time = "0.3" infer = "0.16" tokio-util = { version = "0.7", features = ["io"] } futures-core = "0.3" +futures-util = "0.3" bytes = "1" +chromiumoxide = { version = "0.7", features = ["tokio-runtime", "_fetcher-rusttls-tokio"], default-features = false } +scraper = "0.20" [dev-dependencies] tempfile = "3" diff --git a/backend/migrations/0012_crawler.sql b/backend/migrations/0012_crawler.sql new file mode 100644 index 0000000..91c425e --- /dev/null +++ b/backend/migrations/0012_crawler.sql @@ 
-0,0 +1,72 @@
+-- Crawler tables.
+--
+-- Same philosophy as 0001_init.sql: new concepts go in new tables
+-- joined to existing ones, not jammed onto `mangas`/`chapters`. A
+-- crawled manga IS a manga; the only thing the source-link tables
+-- carry is "where did this come from and when did we last see it".
+-- That keeps the API and frontend source-agnostic.
+
+-- 1. Source registry. One row per site the crawler knows about.
+-- `config` carries per-site knobs (base URL, rate limits, custom
+-- selectors) so adding a source is a row insert plus a `Source`
+-- trait impl — no schema change.
+CREATE TABLE sources (
+    id text PRIMARY KEY,
+    name text NOT NULL,
+    base_url text NOT NULL,
+    enabled boolean NOT NULL DEFAULT true,
+    config jsonb NOT NULL DEFAULT '{}'::jsonb,
+    created_at timestamptz NOT NULL DEFAULT now()
+);
+
+-- 2. Link tables. `(source_id, source_*_key)` is the natural key the
+-- source itself exposes; the FK to `mangas`/`chapters` is what
+-- threads it back into our domain. `metadata_hash` is the signal
+-- used by `crawler::diff` to detect updates without re-comparing
+-- every field. `last_seen_at` + `dropped_at` is the soft-drop pair.
+CREATE TABLE manga_sources (
+    source_id text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
+    source_manga_key text NOT NULL,
+    manga_id uuid NOT NULL REFERENCES mangas(id) ON DELETE CASCADE,
+    source_url text NOT NULL,
+    metadata_hash text,
+    first_seen_at timestamptz NOT NULL DEFAULT now(),
+    last_seen_at timestamptz NOT NULL DEFAULT now(),
+    dropped_at timestamptz,
+    PRIMARY KEY (source_id, source_manga_key)
+);
+CREATE INDEX manga_sources_manga_idx ON manga_sources (manga_id);
+CREATE INDEX manga_sources_last_seen_idx ON manga_sources (source_id, last_seen_at);
+
+CREATE TABLE chapter_sources (
+    source_id text NOT NULL REFERENCES sources(id) ON DELETE CASCADE,
+    source_chapter_key text NOT NULL,
+    chapter_id uuid NOT NULL REFERENCES chapters(id) ON DELETE CASCADE,
+    source_url text NOT NULL,
+    first_seen_at timestamptz NOT NULL DEFAULT now(),
+    last_seen_at timestamptz NOT NULL DEFAULT now(),
+    dropped_at timestamptz,
+    PRIMARY KEY (source_id, source_chapter_key)
+);
+CREATE INDEX chapter_sources_chapter_idx ON chapter_sources (chapter_id);
+
+-- 3. Persistent job queue. Workers lease with
+-- `FOR UPDATE SKIP LOCKED`, heartbeat via `leased_until`, and ack
+-- by transitioning state. The partial index covers only ready rows
+-- (`pending`/`failed`), keeping the hot path off running/done/dead rows.
+CREATE TABLE crawler_jobs (
+    id uuid PRIMARY KEY DEFAULT gen_random_uuid(),
+    payload jsonb NOT NULL,
+    state text NOT NULL DEFAULT 'pending'
+        CHECK (state IN ('pending','running','done','failed','dead')),
+    attempts integer NOT NULL DEFAULT 0,
+    max_attempts integer NOT NULL DEFAULT 5,
+    scheduled_at timestamptz NOT NULL DEFAULT now(),
+    leased_until timestamptz,
+    last_error text,
+    created_at timestamptz NOT NULL DEFAULT now(),
+    updated_at timestamptz NOT NULL DEFAULT now()
+);
+CREATE INDEX crawler_jobs_ready_idx
+    ON crawler_jobs (scheduled_at)
+    WHERE state IN ('pending', 'failed');
diff --git a/backend/src/bin/crawler.rs b/backend/src/bin/crawler.rs
new file mode 100644
index 0000000..42e8227
--- /dev/null
+++ b/backend/src/bin/crawler.rs
@@ -0,0 +1,29 @@
+//! Crawler binary.
+//!
+//! Today: a thin shell that launches Chromium (options read from the
+//! environment via `LaunchOptions::from_env`), closes it, and exits. An
+//! ad-hoc smoke test for the launcher, complementing the integration
+//! tests in `tests/crawler_browser_smoke.rs`.
+//!
+//! Future: reads config, picks `Source` impls, runs the job loop.
+
+use mangalord::crawler::browser::{self, LaunchOptions};
+use tracing_subscriber::EnvFilter;
+
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    dotenvy::dotenv().ok();
+    tracing_subscriber::fmt()
+        .with_env_filter(
+            EnvFilter::try_from_default_env()
+                .unwrap_or_else(|_| "info,mangalord=debug".into()),
+        )
+        .init();
+
+    let options = LaunchOptions::from_env();
+    tracing::info!(?options, "launching browser");
+    let handle = browser::launch(options).await?;
+    tracing::info!("browser launched; closing");
+    handle.close().await?;
+    Ok(())
+}
diff --git a/backend/src/crawler/browser.rs b/backend/src/crawler/browser.rs
new file mode 100644
index 0000000..68e1846
--- /dev/null
+++ b/backend/src/crawler/browser.rs
@@ -0,0 +1,217 @@
+//! Chromium launcher and lifecycle.
+//!
+//! Uses `chromiumoxide`'s `fetcher` feature so we don't depend on a
+//!
system Chrome install — first call downloads a known-good revision
+//! into a cache dir and reuses it forever after. `BrowserMode` toggles
+//! headed vs headless; the headed path needs a display (real `$DISPLAY`
+//! or `xvfb-run`).
+//!
+//! Extra Chromium command-line flags can be supplied through
+//! [`LaunchOptions::extra_args`] in code, or via the
+//! `CRAWLER_BROWSER_ARGS` env var (whitespace-separated) when going
+//! through [`LaunchOptions::from_env`]. The launcher always also
+//! injects `--no-sandbox` and `--disable-dev-shm-usage` because they're
+//! near-mandatory for containerized Chromium; everything else is
+//! caller-provided.
+
+use std::path::PathBuf;
+
+use anyhow::Context;
+use chromiumoxide::browser::{Browser, BrowserConfig};
+use chromiumoxide::fetcher::{BrowserFetcher, BrowserFetcherOptions};
+use futures_util::StreamExt;
+use tokio::task::JoinHandle;
+
+#[derive(Clone, Copy, Debug, PartialEq, Eq)]
+pub enum BrowserMode {
+    /// Real window. Needs `$DISPLAY` (or `xvfb-run` wrapping the
+    /// binary). This is the default the old Puppeteer crawler used and
+    /// the assumed mode for the target site until we prove headless
+    /// works against it.
+    Headed,
+    /// No window. Faster, lower resource use, but more likely to trip
+    /// fingerprinting on hostile sites.
+    Headless,
+}
+
+/// Configuration for a single browser launch.
+///
+/// Public fields rather than a builder — there are only two of them
+/// and callers benefit from struct literal syntax for clarity.
+#[derive(Clone, Debug)]
+pub struct LaunchOptions {
+    pub mode: BrowserMode,
+    /// Extra Chromium flags, appended after the launcher's own
+    /// defaults. Example: `vec!["--lang=de-DE".into(),
+    /// "--window-size=1280,800".into()]`.
+    pub extra_args: Vec<String>,
+}
+
+impl LaunchOptions {
+    pub fn headed() -> Self {
+        Self {
+            mode: BrowserMode::Headed,
+            extra_args: Vec::new(),
+        }
+    }
+
+    pub fn headless() -> Self {
+        Self {
+            mode: BrowserMode::Headless,
+            extra_args: Vec::new(),
+        }
+    }
+
+    /// Reads `CRAWLER_BROWSER_MODE` (`headless`|`headed`, default
+    /// `headed`) and `CRAWLER_BROWSER_ARGS` (whitespace-separated
+    /// Chromium flags). Flags containing whitespace aren't supported
+    /// through the env var — use the programmatic API for those.
+    pub fn from_env() -> Self {
+        let mode = match std::env::var("CRAWLER_BROWSER_MODE").as_deref() {
+            Ok("headless") => BrowserMode::Headless,
+            _ => BrowserMode::Headed,
+        };
+        let extra_args = std::env::var("CRAWLER_BROWSER_ARGS")
+            .map(|s| parse_args(&s))
+            .unwrap_or_default();
+        Self { mode, extra_args }
+    }
+}
+
+impl Default for LaunchOptions {
+    fn default() -> Self {
+        Self::headed()
+    }
+}
+
+/// Whitespace-split a CRAWLER_BROWSER_ARGS-style string. Exposed
+/// separately from `from_env` so it can be unit-tested without
+/// touching process environment.
+pub(crate) fn parse_args(s: &str) -> Vec<String> {
+    s.split_whitespace().map(str::to_string).collect()
+}
+
+/// Owned browser plus the spawned task that drives its CDP event loop.
+/// Dropping `Handle` without calling `close` leaks the Chromium process
+/// — always call `close().await` in production paths.
+pub struct Handle {
+    browser: Browser,
+    driver: JoinHandle<()>,
+}
+
+impl Handle {
+    pub fn browser(&self) -> &Browser {
+        &self.browser
+    }
+
+    pub fn browser_mut(&mut self) -> &mut Browser {
+        &mut self.browser
+    }
+
+    /// Closes the browser and awaits the driver task. Consumes the
+    /// handle; shutdown errors are ignored, so this always returns `Ok`.
+    pub async fn close(mut self) -> anyhow::Result<()> {
+        let _ = self.browser.close().await;
+        let _ = self.browser.wait().await;
+        let _ = self.driver.await;
+        Ok(())
+    }
+}
+
+/// Launches Chromium. Downloads it on first run via the `fetcher`
+/// feature; subsequent runs hit the cache. The cache dir is
+/// `$CRAWLER_CHROMIUM_DIR` if set, else `$HOME/.cache/mangalord/chromium`,
+/// else `./.chromium-cache` as a last resort; empty values count as unset.
+pub async fn launch(options: LaunchOptions) -> anyhow::Result<Handle> {
+    let cache = cache_dir()?;
+    tokio::fs::create_dir_all(&cache)
+        .await
+        .with_context(|| format!("create cache dir {}", cache.display()))?;
+
+    let fetcher = BrowserFetcher::new(
+        BrowserFetcherOptions::builder()
+            .with_path(&cache)
+            .build()
+            .map_err(|e| anyhow::anyhow!("fetcher options: {e}"))?,
+    );
+    tracing::info!(path = %cache.display(), "ensuring chromium revision is present");
+    let info = fetcher
+        .fetch()
+        .await
+        .context("download chromium via fetcher")?;
+    tracing::info!(executable = %info.executable_path.display(), "chromium ready");
+
+    let mut builder = BrowserConfig::builder()
+        .chrome_executable(info.executable_path)
+        // Linux containers / CI commonly lack the user namespaces
+        // Chromium's sandbox wants. Disable it; the crawler runs in its
+        // own container anyway.
+        .arg("--no-sandbox")
+        .arg("--disable-dev-shm-usage");
+    for arg in &options.extra_args {
+        builder = builder.arg(arg);
+    }
+    if matches!(options.mode, BrowserMode::Headed) {
+        builder = builder.with_head();
+    }
+    tracing::info!(
+        mode = ?options.mode,
+        extra_args = ?options.extra_args,
+        "building browser config"
+    );
+    let config = builder
+        .build()
+        .map_err(|e| anyhow::anyhow!("browser config: {e}"))?;
+
+    let (browser, mut handler) = Browser::launch(config)
+        .await
+        .context("launch chromium")?;
+
+    let driver = tokio::spawn(async move {
+        while let Some(event) = handler.next().await {
+            if let Err(err) = event {
+                tracing::warn!(?err, "chromium handler event error");
+            }
+        }
+    });
+
+    Ok(Handle { browser, driver })
+}
+
+fn cache_dir() -> anyhow::Result<PathBuf> {
+    if let Some(dir) = std::env::var_os("CRAWLER_CHROMIUM_DIR").filter(|d| !d.is_empty()) {
+        return Ok(PathBuf::from(dir));
+    }
+    if let Some(home) = std::env::var_os("HOME").filter(|h| !h.is_empty()) {
+        return Ok(PathBuf::from(home).join(".cache/mangalord/chromium"));
+    }
+    Ok(PathBuf::from("./.chromium-cache"))
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_args_splits_on_whitespace() {
+        assert_eq!(
+            parse_args("--lang=de-DE --window-size=1280,800"),
+            vec!["--lang=de-DE", "--window-size=1280,800"]
+        );
+    }
+
+    #[test]
+    fn parse_args_tolerates_irregular_whitespace() {
+        // tabs, multiple spaces, leading/trailing — all collapsed.
+        assert_eq!(
+            parse_args("  --a\t--b   --c=1\n"),
+            vec!["--a", "--b", "--c=1"]
+        );
+    }
+
+    #[test]
+    fn parse_args_empty_string_yields_empty_vec() {
+        assert!(parse_args("").is_empty());
+        assert!(parse_args(" \t\n").is_empty());
+    }
+}
diff --git a/backend/src/crawler/diff.rs b/backend/src/crawler/diff.rs
new file mode 100644
index 0000000..2fbb587
--- /dev/null
+++ b/backend/src/crawler/diff.rs
@@ -0,0 +1,15 @@
+//! Change-detection rules between the source and our DB.
+//!
+//! | Event | Signal |
+//!
|--------------------|----------------------------------------------------------------------------------------|
+//! | New manga | `(source_id, source_manga_key)` not in `manga_sources` |
+//! | Updated metadata | freshly computed `metadata_hash` differs from the stored one |
+//! | Dropped manga | `last_seen_at < discover_run_started_at` for N consecutive successful discover runs |
+//! | New chapter | `(source_id, source_chapter_key)` not in `chapter_sources` |
+//! | Dropped chapter | present in DB but absent from the latest `fetch_chapter_list` for the same manga |
+//!
+//! Dropped is always a soft flag (`dropped_at`), never a row delete —
+//! restoring is a matter of clearing the flag if the source brings the
+//! item back.
+//!
+//! Scaffold only — implementations land once `repo::crawler` exists.
diff --git a/backend/src/crawler/jobs.rs b/backend/src/crawler/jobs.rs
new file mode 100644
index 0000000..8b1dc26
--- /dev/null
+++ b/backend/src/crawler/jobs.rs
@@ -0,0 +1,55 @@
+//! Persistent job queue and the four job kinds.
+//!
+//! Backed by Postgres (the `crawler_jobs` table). Workers lease rows
+//! with `SELECT ... FOR UPDATE SKIP LOCKED`, heartbeat via
+//! `leased_until`, and ack by transitioning to `done` (or backoff /
+//! `dead`). Handlers are idempotent so a crash mid-run is recoverable
+//! by replay.
+//!
+//! Scaffold only — the actual queue wrapper and handler dispatch land
+//! once we have the first `Source` impl exercising the pipeline.
+
+use serde::{Deserialize, Serialize};
+use uuid::Uuid;
+
+use super::source::DiscoverMode;
+
+#[derive(Clone, Debug, Serialize, Deserialize)]
+#[serde(tag = "kind", rename_all = "snake_case")]
+pub enum JobPayload {
+    /// Walk the source index and enqueue `SyncManga` jobs.
+    Discover {
+        source_id: String,
+        mode: DiscoverMode,
+    },
+    /// Fetch one manga's detail page, upsert metadata, enqueue
+    /// `SyncChapterList`.
+    SyncManga {
+        source_id: String,
+        source_manga_key: String,
+    },
+    /// Diff the chapter list, enqueue `SyncChapterContent` for new
+    /// chapters, soft-drop vanished ones.
+    SyncChapterList {
+        source_id: String,
+        manga_id: Uuid,
+        source_manga_key: String,
+    },
+    /// Download a single chapter's page images into storage.
+    SyncChapterContent {
+        source_id: String,
+        chapter_id: Uuid,
+        source_chapter_key: String,
+    },
+}
+
+#[derive(Clone, Copy, Debug, sqlx::Type, Serialize, Deserialize)]
+#[sqlx(type_name = "text", rename_all = "snake_case")]
+#[serde(rename_all = "snake_case")]
+pub enum JobState {
+    Pending,
+    Running,
+    Done,
+    Failed,
+    Dead,
+}
diff --git a/backend/src/crawler/mod.rs b/backend/src/crawler/mod.rs
new file mode 100644
index 0000000..f6a962e
--- /dev/null
+++ b/backend/src/crawler/mod.rs
@@ -0,0 +1,19 @@
+//! Crawler subsystem.
+//!
+//! Runs as its own binary (`src/bin/crawler.rs`) and shares `domain`,
+//! `repo`, and `storage` with the API binary. Layering mirrors the
+//! `Storage` trait pattern: callers depend on the `source::Source`
+//! trait, not on a concrete site; new sites plug in as additional
+//! impls without touching the job runner.
+//!
+//! Submodules:
+//! - [`browser`]: launches Chromium via `chromiumoxide` (no pooling yet).
+//!   First run downloads a known-good build via the `fetcher` feature.
+//! - [`source`]: the `Source` trait. Per-site impls live alongside it.
+//! - [`jobs`]: job payload/state types; queue wrapper and dispatch to follow.
+//! - [`diff`]: change detection — new / updated / dropped semantics.
+
+pub mod browser;
+pub mod diff;
+pub mod jobs;
+pub mod source;
diff --git a/backend/src/crawler/source.rs b/backend/src/crawler/source.rs
new file mode 100644
index 0000000..23e3279
--- /dev/null
+++ b/backend/src/crawler/source.rs
@@ -0,0 +1,105 @@
+//! `Source` trait — the per-site abstraction.
+//!
+//! Job handlers depend on this trait, not on a concrete site. Adding a
+//!
new site is: implement `Source`, register it in a `sources` table
+//! row, and the existing job pipeline picks it up unchanged.
+//!
+//! Scaffold only — the first concrete impl lands in a follow-up commit
+//! once the target site is locked in.
+
+use async_trait::async_trait;
+use chromiumoxide::browser::Browser;
+use serde::{Deserialize, Serialize};
+
+/// How a `discover` job should walk the source's index.
+#[derive(Clone, Copy, Debug, Serialize, Deserialize)]
+pub enum DiscoverMode {
+    /// Walk every index page from last back to first. Used for the
+    /// initial seed of a source.
+    Backfill,
+    /// Walk index pages from page 1 forward, stopping after
+    /// `stop_after_unchanged` consecutive mangas whose `metadata_hash`
+    /// matches storage. Used for the recurring cron tick.
+    Incremental { stop_after_unchanged: usize },
+}
+
+/// Pointer at a manga in the source's index, before we've fetched the
+/// detail page. The `source_manga_key` is whatever stable id the source
+/// uses (slug, numeric id, etc).
+#[derive(Clone, Debug)]
+pub struct SourceMangaRef {
+    pub source_manga_key: String,
+    pub title: String,
+    pub url: String,
+}
+
+/// Full metadata returned by `fetch_manga`. The hash is computed by the
+/// source impl (typically over the normalized field set) and is the
+/// signal `diff` uses to detect metadata updates.
+#[derive(Clone, Debug)]
+pub struct SourceManga {
+    pub source_manga_key: String,
+    pub title: String,
+    pub alternative_titles: Vec<String>,
+    pub authors: Vec<String>,
+    pub genres: Vec<String>,
+    pub tags: Vec<String>,
+    pub status: Option<String>,
+    pub summary: Option<String>,
+    pub cover_url: Option<String>,
+    pub metadata_hash: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct SourceChapterRef {
+    pub source_chapter_key: String,
+    pub number: i32,
+    pub title: Option<String>,
+    pub url: String,
+}
+
+#[derive(Clone, Debug)]
+pub struct SourceChapter {
+    pub source_chapter_key: String,
+    pub number: i32,
+    pub title: Option<String>,
+    /// Ordered list of page image URLs, ready to be fetched and put
+    /// into `Storage`.
+    pub page_urls: Vec<String>,
+}
+
+/// Context passed to every `Source` call. Borrows the browser, so
+/// impls can `browser.new_page(...)` without bringing their own.
+pub struct FetchContext<'a> {
+    pub browser: &'a Browser,
+}
+
+#[async_trait]
+pub trait Source: Send + Sync {
+    /// Stable identifier — also the row key in the `sources` table.
+    fn id(&self) -> &'static str;
+
+    async fn discover(
+        &self,
+        ctx: &FetchContext<'_>,
+        mode: DiscoverMode,
+    ) -> anyhow::Result<Vec<SourceMangaRef>>;
+
+    async fn fetch_manga(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceMangaRef,
+    ) -> anyhow::Result<SourceManga>;
+
+    async fn fetch_chapter_list(
+        &self,
+        ctx: &FetchContext<'_>,
+        manga: &SourceManga,
+    ) -> anyhow::Result<Vec<SourceChapterRef>>;
+
+    async fn fetch_chapter(
+        &self,
+        ctx: &FetchContext<'_>,
+        r: &SourceChapterRef,
+    ) -> anyhow::Result<SourceChapter>;
+}
diff --git a/backend/src/lib.rs b/backend/src/lib.rs
index 42c23ee..637daab 100644
--- a/backend/src/lib.rs
+++ b/backend/src/lib.rs
@@ -2,6 +2,7 @@ pub mod api;
 pub mod app;
 pub mod auth;
 pub mod config;
+pub mod crawler;
 pub mod domain;
 pub mod error;
 pub mod repo;
diff --git a/backend/tests/crawler_browser_smoke.rs b/backend/tests/crawler_browser_smoke.rs
new file mode 100644
index 0000000..1323618
--- /dev/null
+++ b/backend/tests/crawler_browser_smoke.rs
@@ -0,0 +1,157 @@
+//!
Smoke test for the Chromium launcher. +//! +//! Marked `#[ignore]` because it (a) downloads ~150 MB of Chromium on +//! first run via the `fetcher` feature and (b) requires a real `$DISPLAY` +//! for the headed path. Run it explicitly: +//! +//! ```sh +//! cargo test --test crawler_browser_smoke -- --ignored --nocapture +//! ``` +//! +//! Override the cache location with `CRAWLER_CHROMIUM_DIR=/some/path` if +//! `$HOME/.cache/mangalord/chromium` isn't writable. + +use mangalord::crawler::browser::{self, LaunchOptions}; + +#[tokio::test] +#[ignore = "downloads Chromium and needs a display; run with --ignored"] +async fn headed_browser_can_navigate_and_read_title() { + // A data URL avoids any network dependency — we're testing the + // browser launcher, not connectivity. + const PAGE: &str = "data:text/html,Mangalord%20SmokeOK"; + + let handle = browser::launch(LaunchOptions::headed()) + .await + .expect("launch headed chromium"); + + let page = handle + .browser() + .new_page(PAGE) + .await + .expect("open new page"); + page.wait_for_navigation() + .await + .expect("wait for navigation"); + + let title = page.get_title().await.expect("get title"); + assert_eq!(title.as_deref(), Some("Mangalord Smoke")); + + handle.close().await.expect("close cleanly"); +} + +#[tokio::test] +#[ignore = "downloads Chromium; run with --ignored"] +async fn headless_browser_can_navigate_and_read_title() { + const PAGE: &str = "data:text/html,Headless%20OK"; + + let handle = browser::launch(LaunchOptions::headless()) + .await + .expect("launch headless chromium"); + + let page = handle.browser().new_page(PAGE).await.expect("open new page"); + page.wait_for_navigation().await.expect("wait for navigation"); + + let title = page.get_title().await.expect("get title"); + assert_eq!(title.as_deref(), Some("Headless OK")); + + handle.close().await.expect("close cleanly"); +} + +/// Live end-to-end: navigate to a real page, get the rendered HTML, and +/// parse it with `scraper`. 
ipify.org renders the visitor's public IP +/// into the page DOM, so a successful run proves browser → render → +/// `Html::parse_document` → selector → text extraction all work +/// against a real site. This is the same path each future `Source` +/// impl will take. +#[tokio::test] +#[ignore = "needs network; run with --ignored"] +async fn fetches_public_ip_from_ipify() { + use std::time::Duration; + + let handle = browser::launch(LaunchOptions::headless()) + .await + .expect("launch headless chromium"); + + let page = handle + .browser() + .new_page("https://www.ipify.org") + .await + .expect("open ipify"); + page.wait_for_navigation().await.expect("wait for navigation"); + // ipify injects the IP via JS after load, so the navigation event + // alone isn't enough — give the script a beat to run. + tokio::time::sleep(Duration::from_secs(2)).await; + + let html = page.content().await.expect("get rendered html"); + let doc = scraper::Html::parse_document(&html); + let body_sel = scraper::Selector::parse("body").unwrap(); + let body_text: String = doc + .select(&body_sel) + .next() + .map(|n| n.text().collect::>().join(" ")) + .unwrap_or_default(); + + let ip = extract_ipv4(&body_text) + .unwrap_or_else(|| panic!("no IPv4 found in ipify body: {body_text}")); + eprintln!("ipify says our public IP is: {ip}"); + + handle.close().await.expect("close cleanly"); +} + +/// Proves that `LaunchOptions::extra_args` actually reach Chromium and +/// influence its runtime. `--user-agent=...` overrides `navigator.userAgent`, +/// observable from JS — read it back via `page.evaluate`. 
/// Tiny dotted-quad finder — avoids pulling `regex` in just for one
/// test.
///
/// Returns the first IPv4 address found in `s`, or `None` if there is
/// none. The text is cut into maximal runs of ASCII digits and dots;
/// a run counts as an address when, after dropping dots at either end,
/// it is exactly four dot-separated octets that each parse as a `u8`
/// (0..=255).
///
/// Dropping the surrounding dots is what lets an address at the end of
/// a sentence (`"... is 203.0.113.7."`) match; without it the trailing
/// full stop adds an empty fifth component and the address is missed.
/// Longer dotted sequences such as `1.2.3.4.5` are still rejected as a
/// whole rather than truncated to their first four components.
fn extract_ipv4(s: &str) -> Option<String> {
    s.split(|c: char| !(c.is_ascii_digit() || c == '.'))
        .map(|run| run.trim_matches('.'))
        .find(|candidate| is_dotted_quad(candidate))
        .map(str::to_owned)
}

/// True when `candidate` is exactly four `.`-separated parts that each
/// parse as a `u8`.
fn is_dotted_quad(candidate: &str) -> bool {
    let mut octets = 0usize;
    for part in candidate.split('.') {
        octets += 1;
        // An empty part (from "1..2") fails the parse, as does >255.
        if octets > 4 || part.parse::<u8>().is_err() {
            return false;
        }
    }
    octets == 4
}