downloading story panels and making chapter lists concurrent (#5)
* initial implementation

* fixed file date_folder creation

* changed date to published

* made chapter_list_info concurrent

* updated documentation

* updated test

* commented out tests that need mocking
RoloEdits authored Jan 10, 2023
1 parent 3f231ed commit 6c44e7a
Showing 40 changed files with 1,007 additions and 175 deletions.
350 changes: 348 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion Cargo.toml
@@ -4,10 +4,16 @@ members = ["project_core", "line_core", "cli_core", "scrapetoon", "tower-of-god"

[workspace.dependencies]
clap = {version = "4.0.25", features = ["derive"]}
reqwest = {version = "0.11.13", features = ["rustls"]}
scraper = "0.13.0"
csv = "1.1.6"
regex = "1.7.0"
tokio = { version = "1.21.2", features = ["full"] }
chrono = "0.4.23"
thirtyfour = "0.31.0"
static_assertions = "1.1.0"
rayon = "1.6.1"
indicatif = { version = "0.17.2", features = ["rayon"]}
image = "0.24.5"
rand = "0.8.5"
crossbeam = "0.8.2"
13 changes: 4 additions & 9 deletions README.md
@@ -29,8 +29,8 @@ The likes information, once it gets to the millions, is truncated, i.e. 1.1M. Th

The data gathered from here is organized like so:

| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | date | scrape_date |
|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | ---- | ----------- |
| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | published | scrape_date |
|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | --------- | ----------- |

The `chapter`, `likes`, and `published` columns are each relative to one chapter, with a new chapter on each row. The published date is in ISO 8601 format (see the chrono sketch after this file's diff).

@@ -237,8 +237,6 @@ Once this is done, open up the `main.rs` file inside the `src` folder.
It should look something like this, with red squiggles indicating errors.
![main.rs initial state with errors](imgs/lore_olympus_main_no_changes.png)
The fix is to change `tower_of_god` to the new name we have been using, following this format: `use <project name>::config;` and `<project name>::parse_chapters`.
In our case we change it to `lore_olympus` like so:
@@ -250,7 +248,6 @@ use lore_olympus::config;
mod csv;
#[tokio::main]
async fn main() {
let args = StoryCliArgs::parse();
@@ -260,7 +257,7 @@ async fn main() {
args.pages,
&config::CONFIG,
config::TO_SKIP,
).await;
);
csv::write(
&args.output,
@@ -725,7 +722,7 @@ writer.write_record([
"total_comments",
"likes",
"total_likes",
"date",
"published",
"user",
"comment_body",
"post_date",
@@ -838,8 +835,6 @@ If you want to contribute, first of all, thank you, communities can only grow wi
Contributing is pretty simple. For features, open a branch named after the schema listed below, make whatever change you want, then begin the pull request process.
| Type | Example |
|:-------------- |:--------------------------------------------------:|
| Story | `story/<STORY_NAME>` |
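The `published` column keeps the ISO 8601 format the README calls out above. A minimal chrono sketch of producing such a value (the input display format here is an illustrative assumption, not the crates' actual parsing code):

```rust
use chrono::NaiveDate;

fn main() {
    // Parse a webtoon-style display date, then rely on NaiveDate's
    // Display impl, which prints ISO 8601 (YYYY-MM-DD).
    let published = NaiveDate::parse_from_str("Jan 10, 2023", "%b %d, %Y")
        .expect("date should match the display format");
    assert_eq!(published.to_string(), "2023-01-10");
}
```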
2 changes: 1 addition & 1 deletion cli_core/Cargo.toml
@@ -6,5 +6,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
indicatif = "0.17.2"
indicatif = { workspace = true }
clap = { workspace = true}
4 changes: 2 additions & 2 deletions kubera/README.md
@@ -2,5 +2,5 @@

Data Format:

|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season|season_chapter|arc|
|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:|:------------:|:-:|
| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season | season_chapter | arc |
|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:|:--------------:|:---:|
2 changes: 1 addition & 1 deletion kubera/src/csv.rs
@@ -30,7 +30,7 @@ pub fn write(
"total_comments",
"likes",
"total_likes",
"date",
"published",
"user",
"comment_body",
"post_date",
24 changes: 19 additions & 5 deletions kubera/src/lib.rs
@@ -1,8 +1,9 @@
use cli_core::ProgressBarFactory;
use core::time;
use line_core::{chapter_height_pixels, comments, SeriesInfo};
use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo};
use project_core::SeriesConfiguration;
use scraper::Html;
use std::collections::HashMap;
use std::{collections::LinkedList, thread};
use thirtyfour::prelude::*;

@@ -18,16 +19,29 @@ mod story_specific_parsing;
/// # Errors
///
/// Returns a tuple if Ok, along with any progress already made; otherwise returns a `WebDriver` error.
pub async fn parse_chapters(
pub fn parse_chapters(
start: u16,
end: u16,
pages: u16,
config: &SeriesConfiguration<'_>,
need_to_skip: fn(u16) -> bool,
) -> (SeriesInfo, LinkedList<ChapterInfo>) {
let (series_info, chapter_likes_date_map) =
line_core::series_info::get_extra_info(pages, config.page_url).await;
line_core::series_info::get_extra_info(pages, config.page_url);

let result = work(start, end, config, need_to_skip, &chapter_likes_date_map);

(series_info, result)
}

#[tokio::main]
async fn work(
start: u16,
end: u16,
config: &SeriesConfiguration<'_>,
need_to_skip: fn(u16) -> bool,
chapter_likes_date_map: &HashMap<u16, LikesDate>,
) -> LinkedList<ChapterInfo> {
let capabilities = DesiredCapabilities::chrome();
let driver = WebDriver::new("http://localhost:9515", capabilities)
.await
@@ -63,7 +77,7 @@ pub async fn parse_chapters(
} else {
// If it fails to connect, return whatever has already been scraped
eprintln!("Error connecting to webpage, saving progress and exiting...");
return (series_info, result);
return result;
}
}
Ok(ok) => break ok,
@@ -113,5 +127,5 @@ pub async fn parse_chapters(

driver.quit().await.unwrap();

(series_info, result)
result
}
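The kubera change above reduces to a sync-over-async bridge: the public `parse_chapters` loses its `async`, and the private `work` fn carries `#[tokio::main]`, which expands into a synchronous function that builds a runtime and blocks on the async body. A minimal sketch of that shape, with a stand-in body rather than the crate's WebDriver logic:

```rust
// Sync-over-async bridge: callers stay synchronous while `work`
// owns its own Tokio runtime (needs tokio with "macros" + "rt").
pub fn parse_chapters(start: u16, end: u16) -> Vec<u16> {
    work(start, end)
}

#[tokio::main]
async fn work(start: u16, end: u16) -> Vec<u16> {
    // The real `work` drives a WebDriver session here.
    (start..=end).collect()
}

fn main() {
    assert_eq!(parse_chapters(1, 3), vec![1, 2, 3]);
}
```

One runtime is created per `work` call, which is cheap enough here since the dominant cost is the scraping itself.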
6 changes: 2 additions & 4 deletions kubera/src/main.rs
@@ -5,8 +5,7 @@ use project_core::path_enforcer;

mod csv;

#[tokio::main]
async fn main() {
fn main() {
let args = StoryCliArgs::parse();

let (series_info, parsed_chapters) = kubera::parse_chapters(
@@ -15,8 +14,7 @@ async fn main() {
args.pages,
&config::CONFIG,
config::TO_SKIP,
)
.await;
);

csv::write(
path_enforcer(&args.output),
9 changes: 8 additions & 1 deletion line_core/Cargo.toml
@@ -10,4 +10,11 @@ project_core = { path = "../project_core" }
cli_core = { path = "../cli_core" }
regex = { workspace = true }
scraper = { workspace = true}
chrono = { workspace = true}
tokio = { workspace = true }
image = { workspace = true }
rayon = { workspace = true }
rand = { workspace = true }
indicatif = { workspace = true }
reqwest = { workspace = true }
crossbeam = { workspace = true }
61 changes: 38 additions & 23 deletions line_core/src/chapter_list.rs
@@ -1,42 +1,57 @@
use chrono::NaiveDate;
use cli_core::ProgressBarFactory;
use core::time;
use crossbeam::queue::SegQueue;
use indicatif::ParallelProgressIterator;
use project_core::ResponseFactory;
use rayon::prelude::*;
use scraper::{ElementRef, Html, Selector};
use std::{collections::LinkedList, thread};

use std::collections::LinkedList;

use crate::ChapterListInfo;

/// # Panics
///
/// Will panic if a response was received but its HTML text was missing, causing an unwrap of `None`.
pub async fn parse(end: u16, input_url: &str, chapter_info: &mut LinkedList<ChapterListInfo>) {
let bar = ProgressBarFactory::get_bar(end);
#[must_use]
pub fn parse(end: u16, input_url: &str) -> LinkedList<ChapterListInfo> {
// Around 8 threads, requests start getting blocked for pinging the server too many times at once, so cap the pool below that
rayon::ThreadPoolBuilder::new()
.num_threads(6)
.build_global()
.unwrap();

for page in 1..=end {
let url = format!("{input_url}&page={page}");
let range: Vec<_> = (1..=end).collect();
let total = range.len() as u64;

let html_response = if let Ok(ok) = ResponseFactory::get(&url).await {
ok
} else {
eprintln!("Error connecting to webpage, attempting to save progress and exit...");
let chapter_info: SegQueue<ChapterListInfo> = SegQueue::new();

assert!(!chapter_info.is_empty(), "Nothing to save, exiting.");
range
.into_par_iter()
.progress_count(total)
.for_each(|page| {
let url = format!("{input_url}&page={page}");
work(&url, &chapter_info);
});

break;
}
.text()
.await
.unwrap();
let mut result: LinkedList<ChapterListInfo> = LinkedList::new();

parse_each_chapters_chapter_info(&html_response, chapter_info);
for info in chapter_info {
result.push_back(info);
}

thread::sleep(time::Duration::from_secs(3));
result
}

bar.inc(1);
}
#[tokio::main]
async fn work(url: &str, chapter_info: &SegQueue<ChapterListInfo>) {
if let Ok(response) = ResponseFactory::get(url).await {
let html = response.text().await.unwrap();

parse_each_chapters_chapter_info(&html, chapter_info);
};
}

fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList<ChapterListInfo>) {
fn parse_each_chapters_chapter_info(html: &str, chapter_info: &SegQueue<ChapterListInfo>) {
let html = Html::parse_document(html);

let chapter_selector = Selector::parse("ul#_listUl>li").unwrap();
@@ -45,7 +60,7 @@ fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList<Ch
let chapter_number = parse_chapter_number(&chapter);
let likes = parse_chapter_like_amount(&chapter);
let date = parse_chapter_date(&chapter);
chapter_info.push_back(ChapterListInfo {
chapter_info.push(ChapterListInfo {
chapter_number,
likes,
date,
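In outline, the new `chapter_list::parse` caps the global rayon pool, fans pages out with `into_par_iter`, lets each worker block on its own runtime inside `work`, and funnels results through a lock-free `SegQueue` shared across threads. A compact sketch of that pattern, with the HTTP fetch replaced by a stand-in:

```rust
use crossbeam::queue::SegQueue;
use indicatif::ParallelProgressIterator;
use rayon::prelude::*;

fn main() {
    // Cap the pool, as the diff does: past roughly 8 concurrent
    // requests the server starts blocking, so 6 leaves headroom.
    rayon::ThreadPoolBuilder::new()
        .num_threads(6)
        .build_global()
        .unwrap();

    let pages: Vec<u16> = (1..=20).collect();
    let total = pages.len() as u64;
    let results: SegQueue<String> = SegQueue::new();

    pages.into_par_iter().progress_count(total).for_each(|page| {
        // Stand-in for fetching and parsing one chapter-list page.
        results.push(format!("page {page}"));
    });

    // Drain once all workers finish; order reflects completion
    // order, not page order.
    let collected: Vec<String> = std::iter::from_fn(|| results.pop()).collect();
    println!("collected {} pages", collected.len());
}
```

`SegQueue` needs only `&self` to push, which is what lets every rayon worker share it without a mutex.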
2 changes: 2 additions & 0 deletions line_core/src/daily_schedule.rs
@@ -7,6 +7,8 @@ use crate::DailyScheduleInfo;
/// # Panics
///
/// Will panic if a response was received but its HTML text was missing, causing an unwrap of `None`.
#[tokio::main]
#[must_use]
pub async fn parse() -> LinkedList<DailyScheduleInfo> {
const DAILY_SCHEDULE: &str = "https://www.webtoons.com/en/dailySchedule";

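With `#[tokio::main]` now on `parse` itself, a caller needs no async context of its own. A hypothetical caller, assuming the module path used in this workspace:

```rust
// Hypothetical caller: `parse` blocks on its own runtime internally,
// so plain synchronous code can call it directly.
fn main() {
    let schedule = line_core::daily_schedule::parse();
    println!("{} entries in today's schedule", schedule.len());
}
```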
2 changes: 1 addition & 1 deletion line_core/src/lib.rs
@@ -2,9 +2,9 @@ pub mod chapter_height_pixels;
pub mod chapter_list;
pub mod comments;
pub mod daily_schedule;
pub mod panels;
pub mod series_info;

use project_core::regex;
use std::collections::LinkedList;

#[derive(Debug)]
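The new `panels` module registered above is among the files this view does not render. Purely as a hedged illustration of what concurrent panel downloading could look like with the workspace's tokio and reqwest dependencies (every name below is hypothetical, not the crate's API):

```rust
use std::path::Path;
use tokio::task::JoinSet;

// Hypothetical sketch only, not the actual `panels` module.
async fn download_panels(
    urls: Vec<String>,
    out_dir: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
    let mut set = JoinSet::new();
    for (i, url) in urls.into_iter().enumerate() {
        // Real code would cap in-flight requests to avoid rate limits.
        set.spawn(async move {
            let bytes = reqwest::get(&url).await?.bytes().await?;
            Ok::<_, reqwest::Error>((i, bytes))
        });
    }
    while let Some(joined) = set.join_next().await {
        let (i, bytes) = joined??;
        tokio::fs::write(out_dir.join(format!("{i:04}.jpg")), &bytes).await?;
    }
    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder invocation; supply real panel URLs in practice.
    download_panels(vec![], Path::new(".")).await
}
```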