From 6c44e7a2f5d7b98fae87fdb558aa40c8fdfa6dc7 Mon Sep 17 00:00:00 2001 From: RoloEdits Date: Tue, 10 Jan 2023 15:27:28 -0800 Subject: [PATCH] downloading story panels and making chapter lists concurrent (#5) * initial implementation * fixed file date_folder creation * changed date to published * made chapter_list_info concurrent * updated documentation * updated test * commented out tests that need mocking --- Cargo.lock | 350 ++++++++++++++++++++++++++++- Cargo.toml | 8 +- README.md | 13 +- cli_core/Cargo.toml | 2 +- kubera/README.md | 4 +- kubera/src/csv.rs | 2 +- kubera/src/lib.rs | 24 +- kubera/src/main.rs | 6 +- line_core/Cargo.toml | 9 +- line_core/src/chapter_list.rs | 61 +++-- line_core/src/daily_schedule.rs | 2 + line_core/src/lib.rs | 2 +- line_core/src/panels.rs | 304 +++++++++++++++++++++++++ line_core/src/series_info.rs | 105 ++++++--- lore-olympus/README.md | 4 +- lore-olympus/src/csv.rs | 2 +- lore-olympus/src/lib.rs | 24 +- lore-olympus/src/main.rs | 6 +- project_core/Cargo.toml | 5 +- project_core/src/lib.rs | 23 +- scrapetoon/Cargo.toml | 1 - scrapetoon/src/args.rs | 31 ++- scrapetoon/src/csv.rs | 2 +- scrapetoon/src/main.rs | 32 ++- the-god-of-high-school/README.md | 6 +- the-god-of-high-school/src/csv.rs | 2 +- the-god-of-high-school/src/lib.rs | 30 ++- the-god-of-high-school/src/main.rs | 6 +- tower-of-god/README.md | 4 +- tower-of-god/src/csv.rs | 2 +- tower-of-god/src/lib.rs | 24 +- tower-of-god/src/main.rs | 6 +- true-beauty/README.md | 4 +- true-beauty/src/csv.rs | 2 +- true-beauty/src/lib.rs | 31 ++- true-beauty/src/main.rs | 6 +- unordinary/README.md | 6 +- unordinary/src/csv.rs | 2 +- unordinary/src/lib.rs | 23 +- unordinary/src/main.rs | 6 +- 40 files changed, 1007 insertions(+), 175 deletions(-) create mode 100644 line_core/src/panels.rs diff --git a/Cargo.lock b/Cargo.lock index f641a3e..459af6e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,6 +2,12 @@ # It is not intended for manual editing. 
version = 3 +[[package]] +name = "adler" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f26201604c87b1e01bd3d98f8d5d9a8fcbb815e8cedb41ffccbeb4bf593a35fe" + [[package]] name = "aho-corasick" version = "0.7.19" @@ -54,6 +60,12 @@ version = "0.13.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" +[[package]] +name = "bit_field" +version = "0.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dcb6dd1c2376d2e096796e234a70e17e94cc2d5d54ff8ce42b28cef1d0d359a4" + [[package]] name = "bitflags" version = "1.3.2" @@ -78,6 +90,12 @@ version = "3.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" +[[package]] +name = "bytemuck" +version = "1.12.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aaa3a8d9a1ca92e282c96a32d6511b695d7d994d1d102ba85d279f9b2756947f" + [[package]] name = "byteorder" version = "1.4.3" @@ -173,6 +191,12 @@ dependencies = [ "unicode-width", ] +[[package]] +name = "color_quant" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d7b894f5411737b7867f4827955924d7c254fc9f4d91a6aad6b097804b1018b" + [[package]] name = "console" version = "0.15.2" @@ -220,6 +244,88 @@ version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5827cebf4670468b8772dd191856768aedcb1b0278a04f989f7766351917b9dc" +[[package]] +name = "crc32fast" +version = "1.3.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b540bd8bc810d3885c6ea91e2018302f68baba2129ab3e88f32389ee9370880d" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crossbeam" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2801af0d36612ae591caa9568261fddce32ce6e08a7275ea334a06a4ad021a2c" +dependencies = [ + "cfg-if", + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-epoch", + "crossbeam-queue", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-channel" +version = "0.5.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c2dd04ddaf88237dc3b8d8f9a3c1004b506b54b3313403944054d23c0870c521" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-deque" +version = "0.8.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "715e8152b692bba2d374b53d4875445368fdf21a94751410af607a5ac677d1fc" +dependencies = [ + "cfg-if", + "crossbeam-epoch", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-epoch" +version = "0.9.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "01a9af1f4c2ef74bb8aa1f7e19706bc72d03598c8a570bb5de72243c7a9d9d5a" +dependencies = [ + "autocfg", + "cfg-if", + "crossbeam-utils", + "memoffset", + "scopeguard", +] + +[[package]] +name = "crossbeam-queue" +version = "0.3.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cfb3ea8a53f37c40dea2c7bedcbd88bdfae54f5e2175d6ecaff1c988353add" +dependencies = [ + "cfg-if", + "crossbeam-utils", +] + +[[package]] +name = "crossbeam-utils" +version = "0.8.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4fb766fa798726286dbbb842f174001dab8abc7b627a1dd86e0b7222a95d929f" +dependencies = [ + "cfg-if", +] + +[[package]] +name = "crunchy" 
+version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" + [[package]] name = "cssparser" version = "0.27.2" @@ -347,6 +453,12 @@ version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3a68a4904193147e0a8dec3314640e6db742afd5f6e634f428a6af230d9b3591" +[[package]] +name = "either" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "90e5c1c8368803113bf0c9584fc495a58b86dc8a29edbf8fe877d21d9507e797" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -362,6 +474,21 @@ dependencies = [ "cfg-if", ] +[[package]] +name = "exr" +version = "1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8eb5f255b5980bb0c8cf676b675d1a99be40f316881444f44e0462eaf5df5ded" +dependencies = [ + "bit_field", + "flume", + "half", + "lebe", + "miniz_oxide", + "smallvec", + "threadpool", +] + [[package]] name = "fantoccini" version = "0.19.3" @@ -393,6 +520,29 @@ dependencies = [ "instant", ] +[[package]] +name = "flate2" +version = "1.0.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8a2db397cb1c8772f31494cb8917e48cd1e64f0fa7efac59fbd741a0a8ce841" +dependencies = [ + "crc32fast", + "miniz_oxide", +] + +[[package]] +name = "flume" +version = "0.10.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1657b4441c3403d9f7b3409e47575237dac27b1b5726df654a6ecbf92f0f7577" +dependencies = [ + "futures-core", + "futures-sink", + "nanorand", + "pin-project", + "spin 0.9.4", +] + [[package]] name = "fnv" version = "1.0.7" @@ -558,8 +708,20 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c05aeb6a22b8f62540c194aac980f2115af067bfe15a0734d7277a768d396b31" dependencies = [ "cfg-if", + "js-sys", "libc", "wasi 0.11.0+wasi-snapshot-preview1", + "wasm-bindgen", +] + +[[package]] +name = "gif" +version = "0.11.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3edd93c6756b4dfaf2709eafcc345ba2636565295c198a9cfbf75fa5e3e00b06" +dependencies = [ + "color_quant", + "weezl", ] [[package]] @@ -581,6 +743,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0" +dependencies = [ + "crunchy", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -736,6 +907,25 @@ dependencies = [ "unicode-normalization", ] +[[package]] +name = "image" +version = "0.24.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69b7ea949b537b0fd0af141fff8c77690f2ce96f4f41f042ccb6c69c6c965945" +dependencies = [ + "bytemuck", + "byteorder", + "color_quant", + "exr", + "gif", + "jpeg-decoder", + "num-rational", + "num-traits", + "png", + "scoped_threadpool", + "tiff", +] + [[package]] name = "indexmap" version = "1.9.2" @@ -755,6 +945,7 @@ dependencies = [ "console", "number_prefix", "portable-atomic", + "rayon", "unicode-width", ] @@ -785,6 +976,15 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" +[[package]] +name = "jpeg-decoder" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc0000e42512c92e31c2252315bda326620a4e034105e900c98ec492fa077b3e" +dependencies = 
[ + "rayon", +] + [[package]] name = "js-sys" version = "0.3.60" @@ -817,6 +1017,12 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" +[[package]] +name = "lebe" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "03087c2bad5e1034e8cace5926dec053fb3790248370865f5117a7d0213354c8" + [[package]] name = "libc" version = "0.2.137" @@ -829,9 +1035,16 @@ version = "0.1.0" dependencies = [ "chrono", "cli_core", + "crossbeam", + "image", + "indicatif", "project_core", + "rand 0.8.5", + "rayon", "regex", + "reqwest", "scraper", + "tokio", ] [[package]] @@ -911,12 +1124,30 @@ version = "2.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" +[[package]] +name = "memoffset" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5de893c32cde5f383baa4c04c5d6dbdd735cfd4a794b0debdb2bb1b421da5ff4" +dependencies = [ + "autocfg", +] + [[package]] name = "mime" version = "0.3.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d" +[[package]] +name = "miniz_oxide" +version = "0.6.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b275950c28b37e794e8c55d88aeb5e139d0ce23fdbbeda68f8d7174abdf9e8fa" +dependencies = [ + "adler", +] + [[package]] name = "mio" version = "0.8.5" @@ -929,6 +1160,15 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "nanorand" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a51313c5820b0b02bd422f4b44776fbf47961755c74ce64afc73bfad10226c3" +dependencies = [ + "getrandom 0.2.8", +] + [[package]] name = "native-tls" version = "0.2.11" @@ -969,6 +1209,17 @@ dependencies = [ "num-traits", ] +[[package]] +name = "num-rational" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" +dependencies = [ + "autocfg", + "num-integer", + "num-traits", +] + [[package]] name = "num-traits" version = "0.2.15" @@ -1172,6 +1423,26 @@ dependencies = [ "siphasher", ] +[[package]] +name = "pin-project" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad29a609b6bcd67fee905812e544992d216af9d755757c05ed2d0e15a74c6ecc" +dependencies = [ + "pin-project-internal", +] + +[[package]] +name = "pin-project-internal" +version = "1.0.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "069bdb1e05adc7a8990dce9cc75370895fbe4e3d58b9b73bf1aee56359344a55" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "pin-project-lite" version = "0.2.9" @@ -1190,6 +1461,18 @@ version = "0.3.26" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6ac9a59f73473f1b8d852421e59e64809f025994837ef743615c6d0c5b305160" +[[package]] +name = "png" +version = "0.17.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5d708eaf860a19b19ce538740d2b4bdeeb8337fa53f7738455e706623ad5c638" +dependencies = [ + "bitflags", + "crc32fast", + "flate2", + "miniz_oxide", +] + [[package]] name = "portable-atomic" version = "0.3.15" @@ -1252,6 +1535,7 @@ name = "project_core" version = "0.1.0" dependencies = [ "chrono", + 
"rand 0.8.5", "reqwest", ] @@ -1345,6 +1629,28 @@ dependencies = [ "rand_core 0.5.1", ] +[[package]] +name = "rayon" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6db3a213adf02b3bcfd2d3846bb41cb22857d131789e01df434fb7e7bc0759b7" +dependencies = [ + "either", + "rayon-core", +] + +[[package]] +name = "rayon-core" +version = "1.10.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "cac410af5d00ab6884528b4ab69d1e8e146e8d471201800fa1b4524126de6ad3" +dependencies = [ + "crossbeam-channel", + "crossbeam-deque", + "crossbeam-utils", + "num_cpus", +] + [[package]] name = "redox_syscall" version = "0.2.16" @@ -1433,7 +1739,7 @@ dependencies = [ "cc", "libc", "once_cell", - "spin", + "spin 0.5.2", "untrusted", "web-sys", "winapi", @@ -1497,6 +1803,12 @@ dependencies = [ "windows-sys 0.36.1", ] +[[package]] +name = "scoped_threadpool" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d51f5df5af43ab3f1360b429fa5e0152ac5ce8c0bd6485cae490332e96846a8" + [[package]] name = "scopeguard" version = "1.1.0" @@ -1529,7 +1841,6 @@ dependencies = [ "line_core", "project_core", "static_assertions", - "tokio", ] [[package]] @@ -1708,6 +2019,15 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6e63cff320ae2c57904679ba7cb63280a3dc4613885beafb148ee7bf9aa9042d" +[[package]] +name = "spin" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7f6002a767bff9e83f8eeecf883ecb8011875a21ae8da43bffb817a57e78cc09" +dependencies = [ + "lock_api", +] + [[package]] name = "stable_deref_trait" version = "1.2.0" @@ -1897,6 +2217,26 @@ dependencies = [ "syn", ] +[[package]] +name = "threadpool" +version = "1.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d050e60b33d41c19108b32cea32164033a9013fe3b46cbd4457559bfbf77afaa" +dependencies = [ + "num_cpus", +] + +[[package]] +name = "tiff" +version = "0.8.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7449334f9ff2baf290d55d73983a7d6fa15e01198faef72af07e2a8db851e471" +dependencies = [ + "flate2", + "jpeg-decoder", + "weezl", +] + [[package]] name = "time" version = "0.1.44" @@ -2306,6 +2646,12 @@ dependencies = [ "untrusted", ] +[[package]] +name = "weezl" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9193164d4de03a926d909d3bc7c30543cecb35400c02114792c2cae20d5e2dbb" + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 2fcb68a..4887efd 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -4,10 +4,16 @@ members = ["project_core", "line_core", "cli_core", "scrapetoon", "tower-of-god" [workspace.dependencies] clap = {version = "4.0.25", features = ["derive"]} +reqwest = {version = "0.11.13", features = ["rustls"]} scraper = "0.13.0" csv = "1.1.6" regex = "1.7.0" tokio = { version = "1.21.2", features = ["full"] } chrono = "0.4.23" thirtyfour = "0.31.0" -static_assertions = "1.1.0" \ No newline at end of file +static_assertions = "1.1.0" +rayon = "1.6.1" +indicatif = { version = "0.17.2", features = ["rayon"]} +image = "0.24.5" +rand = "0.8.5" +crossbeam = "0.8.2" \ No newline at end of file diff --git a/README.md b/README.md index d0ccb07..42a53dc 100644 --- a/README.md +++ b/README.md @@ -29,8 +29,8 @@ The likes information, once it gets to the millions, is truncated, i.e. 1.1M. 
Th The data gathered from here is organized like so: -| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | date | scrape_date | -|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | ---- | ----------- | +| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | published | scrape_date | +|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | --------- | ----------- | The `chapter`, `likes`, and `date` are all relative to one chapter, with a new chapter on each row. The date is in the ISO 8601 format. @@ -237,8 +237,6 @@ Once this is done, open up the `main.rs` file inside the `src` folder. It should look something like this, with red squiggles indicating errors. -![main.rs initial state with errors](imgs/lore_olympus_main_no_changes.png) - The fix is to change `tower_of_god` to the new name we have been using. A format like this: `use ::config;` and `::parse_chapters`. In our case we change to `lore_olympus` like so: @@ -250,7 +248,6 @@ use lore_olympus::config; mod csv; -#[tokio::main] async fn main() { let args = StoryCliArgs::parse(); @@ -260,7 +257,7 @@ async fn main() { args.pages, &config::CONFIG, config::TO_SKIP, - ).await; + ); csv::write( &args.output, @@ -725,7 +722,7 @@ writer.write_record([ "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", @@ -838,8 +835,6 @@ If you want to contribute, first of all, thank you, communities can only grow wi Contributing are pretty simple. For features, open a branch with a name schema listed below, and then make whatever change you wanted, then begin the pull request process. 
- - | Type | Example | |:-------------- |:--------------------------------------------------:| | Story | `story/` | diff --git a/cli_core/Cargo.toml b/cli_core/Cargo.toml index 3e34cc4..6e02b49 100644 --- a/cli_core/Cargo.toml +++ b/cli_core/Cargo.toml @@ -6,5 +6,5 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -indicatif = "0.17.2" +indicatif = { workspace = true } clap = { workspace = true} \ No newline at end of file diff --git a/kubera/README.md b/kubera/README.md index 12a6a6f..2c262d8 100644 --- a/kubera/README.md +++ b/kubera/README.md @@ -2,5 +2,5 @@ Data Format: -|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season|season_chapter|arc| -|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:|:------------:|:-:| +| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season | season_chapter | arc | +|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:|:--------------:|:---:| diff --git a/kubera/src/csv.rs b/kubera/src/csv.rs index 7a51e46..d923759 100644 --- a/kubera/src/csv.rs +++ b/kubera/src/csv.rs @@ -30,7 +30,7 @@ pub fn write( "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", diff --git a/kubera/src/lib.rs b/kubera/src/lib.rs index bd6f0aa..7064f56 100644 --- a/kubera/src/lib.rs +++ b/kubera/src/lib.rs @@ -1,8 +1,9 @@ use cli_core::ProgressBarFactory; use core::time; -use line_core::{chapter_height_pixels, comments, SeriesInfo}; +use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo}; use project_core::SeriesConfiguration; use scraper::Html; +use std::collections::HashMap; use std::{collections::LinkedList, thread}; use thirtyfour::prelude::*; @@ -18,7 +19,7 @@ mod story_specific_parsing; /// # Errors /// /// Returns a tuple if Ok and if there is any progress made, else returns a `WebDriver` error. 
-pub async fn parse_chapters( +pub fn parse_chapters( start: u16, end: u16, pages: u16, @@ -26,8 +27,21 @@ pub async fn parse_chapters( need_to_skip: fn(u16) -> bool, ) -> (SeriesInfo, LinkedList) { let (series_info, chapter_likes_date_map) = - line_core::series_info::get_extra_info(pages, config.page_url).await; + line_core::series_info::get_extra_info(pages, config.page_url); + let result = work(start, end, config, need_to_skip, &chapter_likes_date_map); + + (series_info, result) +} + +#[tokio::main] +async fn work( + start: u16, + end: u16, + config: &SeriesConfiguration<'_>, + need_to_skip: fn(u16) -> bool, + chapter_likes_date_map: &HashMap, +) -> LinkedList { let capabilities = DesiredCapabilities::chrome(); let driver = WebDriver::new("http://localhost:9515", capabilities) .await @@ -63,7 +77,7 @@ pub async fn parse_chapters( } else { // If fails to connect it will return any already scraping eprintln!("Error connecting to webpage, saving progress and exiting..."); - return (series_info, result); + return result; } } Ok(ok) => break ok, @@ -113,5 +127,5 @@ pub async fn parse_chapters( driver.quit().await.unwrap(); - (series_info, result) + result } diff --git a/kubera/src/main.rs b/kubera/src/main.rs index 2770691..a9cb598 100644 --- a/kubera/src/main.rs +++ b/kubera/src/main.rs @@ -5,8 +5,7 @@ use project_core::path_enforcer; mod csv; -#[tokio::main] -async fn main() { +fn main() { let args = StoryCliArgs::parse(); let (series_info, parsed_chapters) = kubera::parse_chapters( @@ -15,8 +14,7 @@ async fn main() { args.pages, &config::CONFIG, config::TO_SKIP, - ) - .await; + ); csv::write( path_enforcer(&args.output), diff --git a/line_core/Cargo.toml b/line_core/Cargo.toml index 6e07d71..750fd28 100644 --- a/line_core/Cargo.toml +++ b/line_core/Cargo.toml @@ -10,4 +10,11 @@ project_core = { path = "../project_core" } cli_core = { path = "../cli_core" } regex = { workspace = true } scraper = { workspace = true} -chrono = { workspace = true} \ No newline at end of file +chrono = { workspace = true} +tokio = { workspace = true } +image = { workspace = true } +rayon = { workspace = true } +rand = { workspace = true } +indicatif = { workspace = true } +reqwest = { workspace = true } +crossbeam = { workspace = true } \ No newline at end of file diff --git a/line_core/src/chapter_list.rs b/line_core/src/chapter_list.rs index 9fc9f35..b56b315 100644 --- a/line_core/src/chapter_list.rs +++ b/line_core/src/chapter_list.rs @@ -1,42 +1,57 @@ use chrono::NaiveDate; -use cli_core::ProgressBarFactory; -use core::time; +use crossbeam::queue::SegQueue; +use indicatif::ParallelProgressIterator; use project_core::ResponseFactory; +use rayon::prelude::*; use scraper::{ElementRef, Html, Selector}; -use std::{collections::LinkedList, thread}; + +use std::collections::LinkedList; use crate::ChapterListInfo; + ///# Panics /// /// Will panic if there was a response but at the same time, the html text somehow didn't come with it unwrapping to a None. 
-pub async fn parse(end: u16, input_url: &str, chapter_info: &mut LinkedList) { - let bar = ProgressBarFactory::get_bar(end); +#[must_use] +pub fn parse(end: u16, input_url: &str) -> LinkedList { + // 8 Threads is around the line at which problems start to occur when pinging out too many times at once as all getting blocked + rayon::ThreadPoolBuilder::new() + .num_threads(6) + .build_global() + .unwrap(); - for page in 1..=end { - let url = format!("{input_url}&page={page}"); + let range: Vec<_> = (1..=end).collect(); + let total = range.len() as u64; - let html_response = if let Ok(ok) = ResponseFactory::get(&url).await { - ok - } else { - eprintln!("Error connecting to webpage, attempting to save progress and exit..."); + let chapter_info: SegQueue = SegQueue::new(); - assert!(!chapter_info.is_empty(), "Nothing to save, exiting."); + range + .into_par_iter() + .progress_count(total) + .for_each(|page| { + let url = format!("{input_url}&page={page}"); + work(&url, &chapter_info); + }); - break; - } - .text() - .await - .unwrap(); + let mut result: LinkedList = LinkedList::new(); - parse_each_chapters_chapter_info(&html_response, chapter_info); + for info in chapter_info { + result.push_back(info); + } - thread::sleep(time::Duration::from_secs(3)); + result +} - bar.inc(1); - } +#[tokio::main] +async fn work(url: &str, chapter_info: &SegQueue) { + if let Ok(response) = ResponseFactory::get(url).await { + let html = response.text().await.unwrap(); + + parse_each_chapters_chapter_info(&html, chapter_info); + }; } -fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList) { +fn parse_each_chapters_chapter_info(html: &str, chapter_info: &SegQueue) { let html = Html::parse_document(html); let chapter_selector = Selector::parse("ul#_listUl>li").unwrap(); @@ -45,7 +60,7 @@ fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList LinkedList { const DAILY_SCHEDULE: &str = "https://www.webtoons.com/en/dailySchedule"; diff --git a/line_core/src/lib.rs b/line_core/src/lib.rs index aca9ad8..327922c 100644 --- a/line_core/src/lib.rs +++ b/line_core/src/lib.rs @@ -2,9 +2,9 @@ pub mod chapter_height_pixels; pub mod chapter_list; pub mod comments; pub mod daily_schedule; +pub mod panels; pub mod series_info; -use project_core::regex; use std::collections::LinkedList; #[derive(Debug)] diff --git a/line_core/src/panels.rs b/line_core/src/panels.rs new file mode 100644 index 0000000..06598ae --- /dev/null +++ b/line_core/src/panels.rs @@ -0,0 +1,304 @@ +use crate::comments::parse_chapter_number; +use core::time; +use image::{GenericImage, ImageBuffer, RgbImage}; +use indicatif::ParallelProgressIterator; +use project_core::ResponseFactory; +use rand::prelude::*; +use rayon::prelude::*; +use reqwest::StatusCode; +use scraper::{Html, Selector}; +use std::{collections::VecDeque, fs, path::Path, thread}; + +/// # Panics +pub fn get(url: &str, path: &str, start: u16, end: u16) { + let path = Path::new(path); + + // 8 Threads is around the line at which problems start to occur when pinging out too many times at once as all getting blocked + rayon::ThreadPoolBuilder::new() + .num_threads(6) + .build_global() + .unwrap(); + + let range: Vec<_> = (start..=end).collect(); + + let total = range.len() as u64; + + range + .into_par_iter() + .progress_count(total) + .for_each(|chapter| { + get_chapter_panels(url, path, chapter); + }); +} + +#[tokio::main] +async fn get_chapter_panels(url: &str, path: &Path, chapter: u16) { + let url = url_builder(url, chapter); + + let response = 
ResponseFactory::get(&url).await.unwrap(); + + if response.status() != StatusCode::OK { + return; + } + + let body = response.text().await.unwrap(); + + let html = Html::parse_document(&body); + + let links = get_image_links(&html); + + let chapter_number = parse_chapter_number(&html); + + let downloaded_images = download_links_async(&links, &url, chapter_number).await; + + let image = stitch_images(&downloaded_images); + + write_images(&image, path, chapter_number); +} + +fn get_image_links(html: &Html) -> VecDeque { + let link_selector = Selector::parse(r#"img._images"#).unwrap(); + let mut links: VecDeque = VecDeque::new(); + + for link in html.select(&link_selector) { + let url = link.value().attr("data-url").unwrap().to_string(); + + let width = link + .value() + .attr("width") + .unwrap() + .to_string() + .parse::() + .unwrap() as u32; + + let height = link + .value() + .attr("height") + .unwrap() + .to_string() + .parse::() + .unwrap() as u32; + + let extension = parse_extension(&url); + + links.push_back(WebtoonHtmlImageData { + url, + height, + width, + extension, + }); + } + + links +} + +async fn download_links_async<'a>( + webtoon_image_data: &'a VecDeque, + url: &'a str, + chapter_number: u16, +) -> VecDeque> { + let mut rng = thread_rng(); + // 1-5 seconds + let random_wait = rng.gen_range(1..5); + + // So all the requests aren't sent at the same time + thread::sleep(time::Duration::from_secs(random_wait)); + + let client = reqwest::Client::new(); + + let mut images: VecDeque = VecDeque::new(); + + for image in webtoon_image_data { + let mut retries = 5; + let mut wait = 1; + let bytes = loop { + match client + .get(&image.url) + .header("referer", "https://www.webtoons.com/") + .send() + .await + { + Err(_) => { + if retries > 0 { + retries -= 1; + thread::sleep(time::Duration::from_secs(wait)); + wait *= 2; + } else { + panic!("Cannot connect. Check URL: {}", image.url) + } + } + Ok(ok) => break ok, + } + } + .bytes() + .await + .unwrap_or_else(|err| { + panic!( + "Error: {err}. 
Image Url: {} on Chapter {chapter_number}", + image.url + ) + }) + .to_vec(); + + let height = image.height; + let width = image.width; + let extension = &image.extension; + + images.push_back(IntermediateImageInfo { + bytes, + height, + width, + extension, + url, + }); + } + + images +} + +fn url_builder(base_url: &str, chapter: u16) -> String { + const BASE_URL: &str = r"https://www.webtoons.com/*/*/*/*/viewer?"; + + const EP_NO: &str = "&episode_no="; + + // The 'title_no=' portion + let title = base_url.split('?').collect::>()[1]; + + let fully_formed = format!("{BASE_URL}{title}{EP_NO}{chapter}"); + + fully_formed +} + +fn parse_extension(url: &str) -> String { + let path = Path::new(url); + + path.extension() + .unwrap() + .to_owned() + .into_string() + .unwrap() + .split('?') + .collect::>()[0] + .to_string() +} + +fn write_images(image: &BufferImage, path: &Path, chapter_number: u16) { + if !path.try_exists().expect("Check if chapter folder exists") { + fs::create_dir(path).expect("Create chapter folder"); + } + + let name = path.join(chapter_number.to_string()).with_extension("png"); + + image + .buffer + .save_with_format(name, image::ImageFormat::Png) + .expect("Write out final, large PNG"); +} + +fn stitch_images(images: &VecDeque>) -> BufferImage { + let min_width = images.get_min_width(); + let first_width = images.get_first_width(); + let max_height = images.calculate_max_height(); + + let mut offset: u32 = 0; + + let mut buffer: RgbImage = ImageBuffer::new(first_width, max_height); + + for image in images { + // Range of 50 pixels from the smallest width. + if image.width > min_width + 50 { + continue; + } + + let ext = match image.extension { + "jpg" => image::ImageFormat::Jpeg, + "png" => image::ImageFormat::Png, + "gif" => image::ImageFormat::Gif, + "webp" => image::ImageFormat::WebP, + _ => panic!("Unhandled File Type"), + }; + + let holder = image::load_from_memory_with_format(&image.bytes, ext) + .expect("Error Decoding Jpeg, got {}"); + + if holder.width() > first_width { + let resized = holder.resize( + first_width, + max_height, + image::imageops::FilterType::Lanczos3, + ); + + buffer + .copy_from(&resized.to_rgb8(), 0, offset) + .unwrap_or_else(|err| panic!("Error {err}: From: '{:?}'", image.url)); + + offset += resized.height(); + } else { + buffer + .copy_from(&holder.to_rgb8(), 0, offset) + .unwrap_or_else(|err| panic!("Error {err}: From: '{:?}'", image.url)); + offset += image.height; + } + } + + BufferImage { buffer } +} + +#[derive(Debug)] +struct IntermediateImageInfo<'a> { + bytes: Vec, + height: u32, + width: u32, + extension: &'a str, + url: &'a str, +} + +trait WebtoonImage { + fn calculate_max_height(&self) -> u32; + + fn get_min_width(&self) -> u32; + + fn get_first_width(&self) -> u32; +} + +impl<'a> WebtoonImage for VecDeque> { + fn calculate_max_height(&self) -> u32 { + let mut accum: u32 = 0; + for image in self { + accum += image.height; + } + + accum + } + + fn get_min_width(&self) -> u32 { + let mut min = 0; + + for image in self { + if min == 0 || min > image.width { + min = image.width; + } + } + + min + } + + fn get_first_width(&self) -> u32 { + return self + .iter() + .next() + .unwrap_or_else(|| panic!("Error from: {self:?}")) + .width; + } +} + +struct BufferImage { + buffer: RgbImage, +} + +struct WebtoonHtmlImageData { + url: String, + height: u32, + width: u32, + extension: String, +} diff --git a/line_core/src/series_info.rs b/line_core/src/series_info.rs index 398ca59..2a36289 100644 --- a/line_core/src/series_info.rs +++ 
b/line_core/src/series_info.rs @@ -6,17 +6,13 @@ use std::thread; use crate::{chapter_list, LikesDate}; -use super::{regex, LinkedList, SeriesInfo}; +use super::SeriesInfo; -pub async fn parse(end: u16, input_url: &str) -> SeriesInfo { - let (genre, _id) = parse_url_info(input_url); - - let mut chapter_list_info = LinkedList::new(); - - let (title, author, status, release_day, views, subscribers, rating) = - parse_series_page_info(input_url).await; - - chapter_list::parse(end, input_url, &mut chapter_list_info).await; +#[must_use] +pub fn parse(end: u16, input_url: &str) -> SeriesInfo { + let (title, author, status, release_day, views, subscribers, rating, genre) = + parse_series_page_info(input_url); + let chapter_list_info = chapter_list::parse(end, input_url); SeriesInfo { title, @@ -31,9 +27,10 @@ pub async fn parse(end: u16, input_url: &str) -> SeriesInfo { } } -pub async fn get_extra_info(pages: u16, url: &str) -> (SeriesInfo, HashMap) { +#[must_use] +pub fn get_extra_info(pages: u16, url: &str) -> (SeriesInfo, HashMap) { println!("Pre-Fetching Necessary Data"); - let series_info = parse(pages, url).await; + let series_info = parse(pages, url); println!("Completed Pre-Fetch"); let mut likes_date_hashmap: HashMap = HashMap::new(); @@ -50,19 +47,11 @@ pub async fn get_extra_info(pages: u16, url: &str) -> (SeriesInfo, HashMap (String, u16) { - let reg = regex![ - r"https://www.webtoons.com/../(?P.+)/(?P.+)/list\?title_no=(?P<id>\d+)" - ]; - - let cap = reg.captures(url).unwrap(); - - (cap["genre"].to_string(), cap["id"].parse::<u16>().unwrap()) -} - // Series Page -async fn parse_series_page_info(url: &str) -> (String, String, String, String, u64, u32, f32) { +#[tokio::main] +async fn parse_series_page_info( + url: &str, +) -> (String, String, String, String, u64, u32, f32, String) { let html = ResponseFactory::get(url) .await .map_or_else( @@ -73,6 +62,7 @@ async fn parse_series_page_info(url: &str) -> (String, String, String, String, u .await .expect("Error getting HTML from response"); + let genre = parse_genre(&html); let title = parse_series_page_title(&html); let author = parse_series_page_author(&html); let (release_day, status) = parse_series_page_release_day_and_status(&html); @@ -90,6 +80,7 @@ async fn parse_series_page_info(url: &str) -> (String, String, String, String, u views, subscribers, rating, + genre, ) } @@ -110,6 +101,22 @@ fn parse_series_page_title(html: &str) -> String { result.replace(':', ": ") } +fn parse_genre(html: &str) -> String { + let html = Html::parse_document(html); + let genre_selector = Selector::parse(r"h2.genre").unwrap(); + + let genre = html + .select(&genre_selector) + .next() + .unwrap() + .text() + .next() + .unwrap() + .to_string(); + + genre +} + fn parse_series_page_rating(html: &str) -> f32 { let html = Html::parse_document(html); let rating_selector = Selector::parse(r"em#_starScoreAverage").unwrap(); @@ -208,15 +215,18 @@ fn parse_series_page_release_day_and_status(html: &str) -> (String, String) { } } + const ONGOING: &str = "Ongoing"; + + // TODO: Make Day a Vec so stories with more than one day can show as such. 
let (day, status) = match result { - sub_text if sub_text.starts_with("SUN") => ("sunday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("MON") => ("monday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("TUE") => ("tuesday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("WED") => ("wednesday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("THU") => ("thursday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("FRI") => ("friday".to_string(), "ongoing".to_string()), - sub_text if sub_text.starts_with("SAT") => ("saturday".to_string(), "ongoing".to_string()), - _ => ("completed".to_string(), "completed".to_string()), + sub_text if sub_text.starts_with("SUN") => ("Sunday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("MON") => ("Monday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("TUE") => ("Tuesday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("WED") => ("Wednesday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("THU") => ("Thursday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("FRI") => ("Friday".to_string(), ONGOING.to_string()), + sub_text if sub_text.starts_with("SAT") => ("Saturday".to_string(), ONGOING.to_string()), + _ => ("Completed".to_string(), "Completed".to_string()), }; (day, status) @@ -251,6 +261,17 @@ fn parse_series_page_author(html: &str) -> String { result.trim().to_string() } +// Series Helpers +// fn parse_url_info(url: &str) -> (String, u16) { +// let reg = regex![ +// r"https://www.webtoons.com/../(?P<genre>.+)/(?P<title>.+)/list\?title_no=(?P<id>\d+)" +// ]; +// +// let cap = reg.captures(url).unwrap(); +// +// (cap["genre"].to_string(), cap["id"].parse::<u16>().unwrap()) +// } + #[cfg(test)] mod series_info_parsing_tests { use super::*; @@ -478,10 +499,10 @@ mod series_info_parsing_tests { let monday = parse_series_page_release_day_and_status(DAY); let completed = parse_series_page_release_day_and_status(COMPLETED); - assert_eq!(monday, ("monday".to_string(), "ongoing".to_string())); + assert_eq!(monday, ("Monday".to_string(), "Ongoing".to_string())); assert_eq!( completed, - ("completed".to_string(), "completed".to_string()) + ("Completed".to_string(), "Completed".to_string()) ); } @@ -539,4 +560,20 @@ mod series_info_parsing_tests { assert_eq!(result, "DARK MOON: THE BLOOD ALTAR"); } + + #[test] + fn should_parse_genre() { + const GENRE: &str = r#"<div class="info"> + <h2 class="genre g_romance">Romance</h2> + <h1 class="subj">Lore Olympus</h1> + <div class="author_area"> + <a href="https://www.webtoons.com/en/creator/rachelsmythe" class="author NPI=a:creator,g:en_en _gaLoggingLink">Rachel Smythe</a> + <button type="button" class="ico_info2 _btnAuthorInfo">author info</button> + </div> + </div>"#; + + let result = parse_genre(GENRE); + + assert_eq!(result, "Romance"); + } } diff --git a/lore-olympus/README.md b/lore-olympus/README.md index 6188d91..8fa4c83 100644 --- a/lore-olympus/README.md +++ b/lore-olympus/README.md @@ -2,5 +2,5 @@ Data Format: -|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season| 
-|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:| +| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season | +|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:| diff --git a/lore-olympus/src/csv.rs b/lore-olympus/src/csv.rs index aaa5a1f..0348a07 100644 --- a/lore-olympus/src/csv.rs +++ b/lore-olympus/src/csv.rs @@ -28,7 +28,7 @@ pub fn write( "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", diff --git a/lore-olympus/src/lib.rs b/lore-olympus/src/lib.rs index 2c95ccc..7db1a39 100644 --- a/lore-olympus/src/lib.rs +++ b/lore-olympus/src/lib.rs @@ -1,8 +1,9 @@ use cli_core::ProgressBarFactory; use core::time; -use line_core::{chapter_height_pixels, comments, SeriesInfo}; +use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo}; use project_core::SeriesConfiguration; use scraper::Html; +use std::collections::HashMap; use std::{collections::LinkedList, thread}; use thirtyfour::prelude::*; @@ -14,7 +15,7 @@ mod story_specific_parsing; /// # Panics /// /// Will panic if `ChromeDriver` isn't running -pub async fn parse_chapters( +pub fn parse_chapters( start: u16, end: u16, pages: u16, @@ -22,8 +23,21 @@ pub async fn parse_chapters( need_to_skip: fn(u16) -> bool, ) -> (SeriesInfo, LinkedList<ChapterInfo>) { let (series_info, chapter_likes_date_map) = - line_core::series_info::get_extra_info(pages, config.page_url).await; + line_core::series_info::get_extra_info(pages, config.page_url); + let result = work(start, end, config, need_to_skip, &chapter_likes_date_map); + + (series_info, result) +} + +#[tokio::main] +async fn work( + start: u16, + end: u16, + config: &SeriesConfiguration<'_>, + need_to_skip: fn(u16) -> bool, + chapter_likes_date_map: &HashMap<u16, LikesDate>, +) -> LinkedList<ChapterInfo> { let capabilities = DesiredCapabilities::chrome(); let driver = WebDriver::new("http://localhost:9515", capabilities) .await @@ -59,7 +73,7 @@ pub async fn parse_chapters( } else { // If fails to connect it will return any already scraping eprintln!("Error connecting to webpage, saving progress and exiting..."); - return (series_info, result); + return result; } } Ok(ok) => break ok, @@ -109,5 +123,5 @@ pub async fn parse_chapters( driver.quit().await.unwrap(); - (series_info, result) + result } diff --git a/lore-olympus/src/main.rs b/lore-olympus/src/main.rs index cfef1e3..21cf6d8 100644 --- a/lore-olympus/src/main.rs +++ b/lore-olympus/src/main.rs @@ -5,8 +5,7 @@ use project_core::path_enforcer; mod csv; -#[tokio::main] -async fn main() { +fn main() { let args = StoryCliArgs::parse(); let (series_info, parsed_chapters) = lore_olympus::parse_chapters( @@ -15,8 +14,7 @@ async fn main() { args.pages, &config::CONFIG, config::TO_SKIP, - ) - .await; + ); csv::write( path_enforcer(&args.output), diff --git a/project_core/Cargo.toml b/project_core/Cargo.toml index e05f03b..8a50a84 100644 --- a/project_core/Cargo.toml +++ b/project_core/Cargo.toml @@ -6,5 +6,6 @@ 
edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -reqwest = {version = "0.11.12", features = ["rustls"]} -chrono = { workspace = true} \ No newline at end of file +reqwest = { workspace = true} +chrono = { workspace = true} +rand = { workspace = true} \ No newline at end of file diff --git a/project_core/src/lib.rs b/project_core/src/lib.rs index f225efe..a18746a 100644 --- a/project_core/src/lib.rs +++ b/project_core/src/lib.rs @@ -1,7 +1,9 @@ use chrono::Utc; use core::time; +use rand::Rng; use reqwest::{Error, Response}; use std::path::{Path, PathBuf}; +use std::time::Duration; use std::{fs, thread}; pub mod regex; @@ -15,16 +17,23 @@ impl ResponseFactory { /// # Errors /// pub async fn get(url: &str) -> Result<Response, Error> { + let mut rng = rand::thread_rng(); + let mut retries = 5; let mut wait = 1; + let stop_rng = rng.gen_range(1..3); + + thread::sleep(Duration::from_secs(stop_rng)); + let response: Response = loop { + let wait_rng = rng.gen_range(1..3); match reqwest::get(url).await { Err(_) => { if retries > 0 { retries -= 1; - thread::sleep(time::Duration::from_secs(wait)); - wait *= 2; + thread::sleep(time::Duration::from_secs(wait + wait_rng)); + wait *= wait_rng; } else { panic!("Cannot connect. Check URL: {url}") } @@ -50,6 +59,7 @@ pub fn get_current_utc_date() -> String { } #[must_use] +/// # Panics pub fn create_date_folder(filepath: &str) -> PathBuf { let path = Path::new(filepath); @@ -57,7 +67,7 @@ pub fn create_date_folder(filepath: &str) -> PathBuf { let date_path = path.join(date_now); - if !path.exists() { + if !date_path.try_exists().unwrap() { fs::create_dir(&date_path).expect("Create date folder"); } @@ -87,7 +97,10 @@ pub fn path_enforcer(filepath: &str) -> &Path { // // let date = get_current_utc_date(); // -// let result = create_date_folder(given_path); +// let result = create_date_folder(given_path) +// .into_os_string() +// .into_string() +// .unwrap(); // // let test = format!("{given_path}\\{date}"); // @@ -98,7 +111,7 @@ pub fn path_enforcer(filepath: &str) -> &Path { // fn should_create_valid_folder() { // let given_path = r"D:\temp\temp"; // -// let result = path_enforcer(given_path); +// let result = path_enforcer(given_path).to_str().unwrap(); // // assert_eq!(result, given_path); // } diff --git a/scrapetoon/Cargo.toml b/scrapetoon/Cargo.toml index 7aadc68..fb5c38a 100644 --- a/scrapetoon/Cargo.toml +++ b/scrapetoon/Cargo.toml @@ -11,5 +11,4 @@ line_core = { path = "../line_core" } cli_core = { path = "../cli_core"} csv = { workspace = true } clap = { workspace = true } -tokio = { workspace = true } static_assertions = { workspace = true } \ No newline at end of file diff --git a/scrapetoon/src/args.rs b/scrapetoon/src/args.rs index ba92e3e..7fa71fb 100644 --- a/scrapetoon/src/args.rs +++ b/scrapetoon/src/args.rs @@ -5,30 +5,49 @@ use clap::{Parser, Subcommand}; pub struct Scrapetoon { /// Which source of data you wish to scrape #[command(subcommand)] - pub source: SourceData, + pub source: Source, } #[derive(Subcommand, Debug)] -pub enum SourceData { +pub enum Source { /// Scrapes Daily Schedule Daily { /// Path to save the output file #[arg(short = 'o', long = "output")] - output: String, + path: String, }, /// Scrapes a stories page - Story { + Page { /// Path to save the output file #[arg(short = 'o', long = "output")] - output: String, + path: String, /// URL to the Story Page #[arg(short, long)] url: String, /// The final page, where the story starts - 
#[arg(short = 'e', long = "end-page")] + #[arg(short = 'e', long = "end")] + end: u16, + }, + + /// Scrapes chapters to download as an image file + Panels { + /// Path to save the output image files + #[arg(short = 'o', long = "output")] + path: String, + + /// URL to the Story Page + #[arg(short, long)] + url: String, + + /// The earliest of the chapters to download + #[arg(short = 's', long = "start")] + start: u16, + + /// The latest chapter to download + #[arg(short = 'e', long = "end")] end: u16, }, } diff --git a/scrapetoon/src/csv.rs b/scrapetoon/src/csv.rs index f3c98fd..4cffbc7 100644 --- a/scrapetoon/src/csv.rs +++ b/scrapetoon/src/csv.rs @@ -76,7 +76,7 @@ pub fn write_series_info(path: &Path, series_info: &SeriesInfo) { "total_chapters", "chapter", "likes", - "chapter_release_date", + "published", "scrape_date", ]; diff --git a/scrapetoon/src/main.rs b/scrapetoon/src/main.rs index b7f2222..363de15 100644 --- a/scrapetoon/src/main.rs +++ b/scrapetoon/src/main.rs @@ -1,4 +1,4 @@ -use args::{Scrapetoon, SourceData}; +use args::{Scrapetoon, Source}; use clap::Parser; use project_core::create_date_folder; use std::path::Path; @@ -6,12 +6,11 @@ use std::path::Path; mod args; mod csv; -#[tokio::main] -async fn main() { +fn main() { let cli = Scrapetoon::parse(); match cli.source { - SourceData::Daily { output } => { + Source::Daily { path: output } => { let date_path = create_date_folder(&output); if !Path::new(&date_path).exists() { @@ -21,21 +20,30 @@ async fn main() { println!("Connecting to Daily Schedule..."); let daily = line_core::daily_schedule::parse(); - csv::write_daily_schedule(&date_path, &daily.await); + csv::write_daily_schedule(&date_path, &daily); println!("Finished scraping Daily Schedule!"); } - SourceData::Story { url, output, end } => { + Source::Page { + path: output, + url, + end, + } => { let date_path = create_date_folder(&output); - if !Path::new(&date_path).exists() { - eprintln!("Error! 
Invalid output path!"); - return; - } - println!("Connecting to Story Page..."); - let info = line_core::series_info::parse(end, &url).await; + let info = line_core::series_info::parse(end, &url); csv::write_series_info(&date_path, &info); println!("Finished scraping {}!", info.title); } + Source::Panels { + path, + url, + start, + end, + } => { + println!("Connecting..."); + line_core::panels::get(&url, &path, start, end); + println!("Finished Downloading Panels!"); + } } } diff --git a/the-god-of-high-school/README.md b/the-god-of-high-school/README.md index 770676e..7ea1bb5 100644 --- a/the-god-of-high-school/README.md +++ b/the-god-of-high-school/README.md @@ -1,6 +1,6 @@ -# The God of High School Chapter Scraping +# The God of High School Scraping Data Format: -|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date| -|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:| +| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | +|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:| diff --git a/the-god-of-high-school/src/csv.rs b/the-god-of-high-school/src/csv.rs index 222ef2f..687159e 100644 --- a/the-god-of-high-school/src/csv.rs +++ b/the-god-of-high-school/src/csv.rs @@ -28,7 +28,7 @@ pub fn write( "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", diff --git a/the-god-of-high-school/src/lib.rs b/the-god-of-high-school/src/lib.rs index 74c69dc..a2c8b04 100644 --- a/the-god-of-high-school/src/lib.rs +++ b/the-god-of-high-school/src/lib.rs @@ -1,9 +1,10 @@ use cli_core::ProgressBarFactory; use core::time; -use line_core::chapter_height_pixels; +use line_core::{chapter_height_pixels, LikesDate}; use line_core::{comments, SeriesInfo}; use project_core::SeriesConfiguration; use scraper::Html; +use std::collections::HashMap; use std::{collections::LinkedList, thread}; use thirtyfour::prelude::*; @@ -13,7 +14,7 @@ use config::ChapterInfo; /// # Panics /// /// Will panic if `ChromeDriver` isn't running -pub async fn parse_chapters( +pub fn parse_chapters( start: u16, end: u16, pages: u16, @@ -21,12 +22,25 @@ pub async fn parse_chapters( need_to_skip: fn(u16) -> bool, ) -> (SeriesInfo, LinkedList<ChapterInfo>) { let (series_info, chapter_likes_date_map) = - line_core::series_info::get_extra_info(pages, config.page_url).await; + line_core::series_info::get_extra_info(pages, config.page_url); + let result = work(start, end, config, need_to_skip, &chapter_likes_date_map); + + (series_info, result) +} + +#[tokio::main] +async fn work( + start: u16, + end: u16, + config: &SeriesConfiguration<'_>, + need_to_skip: fn(u16) -> bool, + chapter_likes_date_map: &HashMap<u16, LikesDate>, +) -> LinkedList<ChapterInfo> { let capabilities = DesiredCapabilities::chrome(); let driver = WebDriver::new("http://localhost:9515", capabilities) .await - .expect("ChromeDriver 
not running."); + .unwrap(); let mut result: LinkedList<ChapterInfo> = LinkedList::new(); @@ -45,6 +59,7 @@ pub async fn parse_chapters( let url = format!("{}{chapter}", config.episode_url); + // Exponential back-off let mut retries = 5; let mut wait = 1; loop { @@ -55,14 +70,17 @@ pub async fn parse_chapters( thread::sleep(time::Duration::from_secs(wait)); wait *= 2; } else { + // If fails to connect it will return any already scraping eprintln!("Error connecting to webpage, saving progress and exiting..."); - return (series_info, result); + return result; } } Ok(ok) => break ok, }; } + // Needs a delay to wait for everything to load on the page. Go no lower than 3 seconds. Recommend 5. + // If you notice inconsistent behavior, can increase to see if that solves it. thread::sleep(time::Duration::from_secs(5)); let html = Html::parse_document(&driver.source().await.unwrap()); @@ -94,5 +112,5 @@ pub async fn parse_chapters( driver.quit().await.unwrap(); - (series_info, result) + result } diff --git a/the-god-of-high-school/src/main.rs b/the-god-of-high-school/src/main.rs index 0f4b98e..2750c6e 100644 --- a/the-god-of-high-school/src/main.rs +++ b/the-god-of-high-school/src/main.rs @@ -5,8 +5,7 @@ use the_god_of_high_school::config; mod csv; -#[tokio::main] -async fn main() { +fn main() { let args = StoryCliArgs::parse(); let (series_info, parsed_chapters) = the_god_of_high_school::parse_chapters( @@ -15,8 +14,7 @@ async fn main() { args.pages, &config::CONFIG, config::TO_SKIP, - ) - .await; + ); csv::write( path_enforcer(&args.output), diff --git a/tower-of-god/README.md b/tower-of-god/README.md index 5677df1..82a1256 100644 --- a/tower-of-god/README.md +++ b/tower-of-god/README.md @@ -2,5 +2,5 @@ Data Format: -|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season|season_chapter| -|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:|:------------:| +| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season | season_chapter | +|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:|:--------------:| diff --git a/tower-of-god/src/csv.rs b/tower-of-god/src/csv.rs index 06892e5..4499718 100644 --- a/tower-of-god/src/csv.rs +++ b/tower-of-god/src/csv.rs @@ -30,7 +30,7 @@ pub fn write( "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", diff --git a/tower-of-god/src/lib.rs b/tower-of-god/src/lib.rs index fca9846..22e3310 100644 --- a/tower-of-god/src/lib.rs +++ b/tower-of-god/src/lib.rs @@ -1,8 +1,9 @@ use cli_core::ProgressBarFactory; use core::time; -use line_core::{chapter_height_pixels, comments, SeriesInfo}; +use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo}; use project_core::SeriesConfiguration; use scraper::Html; +use std::collections::HashMap; use std::{collections::LinkedList, thread}; use 
thirtyfour::prelude::*; @@ -18,7 +19,7 @@ mod story_specific_parsing; /// # Errors /// /// Returns a tuple if Ok and if there is any progress made, else returns a `WebDriver` error. -pub async fn parse_chapters( +pub fn parse_chapters( start: u16, end: u16, pages: u16, @@ -26,8 +27,21 @@ pub async fn parse_chapters( need_to_skip: fn(u16) -> bool, ) -> (SeriesInfo, LinkedList<ChapterInfo>) { let (series_info, chapter_likes_date_map) = - line_core::series_info::get_extra_info(pages, config.page_url).await; + line_core::series_info::get_extra_info(pages, config.page_url); + let result = work(start, end, config, need_to_skip, &chapter_likes_date_map); + + (series_info, result) +} + +#[tokio::main] +async fn work( + start: u16, + end: u16, + config: &SeriesConfiguration<'_>, + need_to_skip: fn(u16) -> bool, + chapter_likes_date_map: &HashMap<u16, LikesDate>, +) -> LinkedList<ChapterInfo> { let capabilities = DesiredCapabilities::chrome(); let driver = WebDriver::new("http://localhost:9515", capabilities) .await @@ -63,7 +77,7 @@ pub async fn parse_chapters( } else { // If fails to connect it will return any already scraping eprintln!("Error connecting to webpage, saving progress and exiting..."); - return (series_info, result); + return result; } } Ok(ok) => break ok, @@ -112,5 +126,5 @@ pub async fn parse_chapters( driver.quit().await.unwrap(); - (series_info, result) + result } diff --git a/tower-of-god/src/main.rs b/tower-of-god/src/main.rs index 34a9868..2458d55 100644 --- a/tower-of-god/src/main.rs +++ b/tower-of-god/src/main.rs @@ -5,8 +5,7 @@ use tower_of_god::config; mod csv; -#[tokio::main] -async fn main() { +fn main() { let args = StoryCliArgs::parse(); let (series_info, parsed_chapters) = tower_of_god::parse_chapters( @@ -15,8 +14,7 @@ async fn main() { args.pages, &config::CONFIG, config::TO_SKIP, - ) - .await; + ); csv::write( path_enforcer(&args.output), diff --git a/true-beauty/README.md b/true-beauty/README.md index c51230c..f166c21 100644 --- a/true-beauty/README.md +++ b/true-beauty/README.md @@ -2,5 +2,5 @@ Data Format: -|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date| -|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:| +| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | +|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:| diff --git a/true-beauty/src/csv.rs b/true-beauty/src/csv.rs index 63eade0..cf53dcd 100644 --- a/true-beauty/src/csv.rs +++ b/true-beauty/src/csv.rs @@ -28,7 +28,7 @@ pub fn write( "total_comments", "likes", "total_likes", - "date", + "published", "user", "comment_body", "post_date", diff --git a/true-beauty/src/lib.rs b/true-beauty/src/lib.rs index 497836b..e69aefa 100644 --- a/true-beauty/src/lib.rs +++ b/true-beauty/src/lib.rs @@ -1,8 +1,9 @@ use cli_core::ProgressBarFactory; use core::time; -use line_core::{chapter_height_pixels, comments, 
+use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo};
 use project_core::SeriesConfiguration;
 use scraper::Html;
+use std::collections::HashMap;
 use std::{collections::LinkedList, thread};
 use thirtyfour::prelude::*;
 
@@ -12,7 +13,8 @@ use config::ChapterInfo;
 
 /// # Panics
 ///
 /// Will panic if `ChromeDriver` isn't running
-pub async fn parse_chapters(
+
+pub fn parse_chapters(
     start: u16,
     end: u16,
     pages: u16,
@@ -20,12 +22,25 @@ pub async fn parse_chapters(
     need_to_skip: fn(u16) -> bool,
 ) -> (SeriesInfo, LinkedList<ChapterInfo>) {
     let (series_info, chapter_likes_date_map) =
-        line_core::series_info::get_extra_info(pages, config.page_url).await;
+        line_core::series_info::get_extra_info(pages, config.page_url);
+
+    let result = work(start, end, config, need_to_skip, &chapter_likes_date_map);
+
+    (series_info, result)
+}
+#[tokio::main]
+async fn work(
+    start: u16,
+    end: u16,
+    config: &SeriesConfiguration<'_>,
+    need_to_skip: fn(u16) -> bool,
+    chapter_likes_date_map: &HashMap<u16, LikesDate>,
+) -> LinkedList<ChapterInfo> {
 
     let capabilities = DesiredCapabilities::chrome();
     let driver = WebDriver::new("http://localhost:9515", capabilities)
         .await
-        .expect("ChromeDriver not running.");
+        .unwrap();
 
     let mut result: LinkedList<ChapterInfo> = LinkedList::new();
 
@@ -44,6 +59,7 @@ pub async fn parse_chapters(
 
         let url = format!("{}{chapter}", config.episode_url);
 
+        // Exponential back-off
         let mut retries = 5;
         let mut wait = 1;
         loop {
@@ -54,14 +70,17 @@ pub async fn parse_chapters(
                         thread::sleep(time::Duration::from_secs(wait));
                         wait *= 2;
                     } else {
+                        // If it fails to connect, it returns whatever has already been scraped
                         eprintln!("Error connecting to webpage, saving progress and exiting...");
-                        return (series_info, result);
+                        return result;
                     }
                 }
                 Ok(ok) => break ok,
             };
         }
 
+        // Needs a delay so everything on the page can load. Go no lower than 3 seconds; 5 is recommended.
+        // If you notice inconsistent behavior, increase the delay to see if that resolves it.
         thread::sleep(time::Duration::from_secs(5));
 
         let html = Html::parse_document(&driver.source().await.unwrap());
@@ -93,5 +112,5 @@ pub async fn parse_chapters(
 
     driver.quit().await.unwrap();
 
-    (series_info, result)
+    result
 }
diff --git a/true-beauty/src/main.rs b/true-beauty/src/main.rs
index 338d4ef..d012333 100644
--- a/true-beauty/src/main.rs
+++ b/true-beauty/src/main.rs
@@ -5,8 +5,7 @@ use true_beauty::config;
 
 mod csv;
 
-#[tokio::main]
-async fn main() {
+fn main() {
     let args = StoryCliArgs::parse();
 
     let (series_info, parsed_chapters) = true_beauty::parse_chapters(
@@ -15,8 +14,7 @@ async fn main() {
         args.pages,
         &config::CONFIG,
         config::TO_SKIP,
-    )
-    .await;
+    );
 
     csv::write(
         path_enforcer(&args.output),
diff --git a/unordinary/README.md b/unordinary/README.md
index 6188d91..a851299 100644
--- a/unordinary/README.md
+++ b/unordinary/README.md
@@ -1,6 +1,6 @@
-# Lore Olympus Scraping
+# Unordinary Scraping
 
 Data Format:
 
-|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season|
-|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:|
+| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season |
+|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:--------------:|:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:|
diff --git a/unordinary/src/csv.rs b/unordinary/src/csv.rs
index 6ff7250..2d7b3f2 100644
--- a/unordinary/src/csv.rs
+++ b/unordinary/src/csv.rs
@@ -28,7 +28,7 @@ pub fn write(
         "total_comments",
         "likes",
         "total_likes",
-        "date",
+        "published",
         "user",
         "comment_body",
         "post_date",
diff --git a/unordinary/src/lib.rs b/unordinary/src/lib.rs
index 92d1597..7bea9a7 100644
--- a/unordinary/src/lib.rs
+++ b/unordinary/src/lib.rs
@@ -1,8 +1,9 @@
 use cli_core::ProgressBarFactory;
 use core::time;
-use line_core::{chapter_height_pixels, comments, SeriesInfo};
+use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo};
 use project_core::SeriesConfiguration;
 use scraper::Html;
+use std::collections::HashMap;
 use std::{collections::LinkedList, thread};
 use thirtyfour::prelude::*;
 
@@ -14,7 +15,7 @@ mod story_specific_parsing;
 /// # Panics
 ///
 /// Will panic if `ChromeDriver` isn't running
-pub async fn parse_chapters(
+pub fn parse_chapters(
     start: u16,
     end: u16,
     pages: u16,
@@ -22,8 +23,20 @@ pub async fn parse_chapters(
     need_to_skip: fn(u16) -> bool,
 ) -> (SeriesInfo, LinkedList<ChapterInfo>) {
     let (series_info, chapter_likes_date_map) =
-        line_core::series_info::get_extra_info(pages, config.page_url).await;
+        line_core::series_info::get_extra_info(pages, config.page_url);
+    let result = work(start, end, config, need_to_skip, &chapter_likes_date_map);
+    (series_info, result)
+}
+
+#[tokio::main]
+async fn work(
+    start: u16,
+    end: u16,
+    config: &SeriesConfiguration<'_>,
+    need_to_skip: fn(u16) -> bool,
+    chapter_likes_date_map: &HashMap<u16, LikesDate>,
+) -> LinkedList<ChapterInfo> {
 
     let capabilities = DesiredCapabilities::chrome();
     let driver = WebDriver::new("http://localhost:9515", capabilities)
         .await
@@ -59,7 +72,7 @@ pub async fn parse_chapters(
                     } else {
                         // If fails to connect it will return any already scraping
                         eprintln!("Error connecting to webpage, saving progress and exiting...");
-                        return (series_info, result);
+                        return result;
                     }
                 }
                 Ok(ok) => break ok,
@@ -107,5 +120,5 @@ pub async fn parse_chapters(
 
     driver.quit().await.unwrap();
 
-    (series_info, result)
+    result
 }
diff --git a/unordinary/src/main.rs b/unordinary/src/main.rs
index ec45410..98edcbc 100644
--- a/unordinary/src/main.rs
+++ b/unordinary/src/main.rs
@@ -5,8 +5,7 @@ use unordinary::config;
 
 mod csv;
 
-#[tokio::main]
-async fn main() {
+fn main() {
     let args = StoryCliArgs::parse();
 
     let (series_info, parsed_chapters) = unordinary::parse_chapters(
@@ -15,8 +14,7 @@ async fn main() {
         args.pages,
         &config::CONFIG,
         config::TO_SKIP,
-    )
-    .await;
+    );
 
     csv::write(
         path_enforcer(&args.output),