downloading story panels and making chapter lists concurrent (#5)
* initial implementation

* fixed file date_folder creation

* changed date to published

* made chapter_list_info concurrent

* updated documentation

* updated test

* commented out tests that need mocking
RoloEdits authored Jan 10, 2023
1 parent 3f231ed commit 6c44e7a
Showing 40 changed files with 1,007 additions and 175 deletions.
350 changes: 348 additions & 2 deletions Cargo.lock

Large diffs are not rendered by default.

8 changes: 7 additions & 1 deletion Cargo.toml
@@ -4,10 +4,16 @@ members = ["project_core", "line_core", "cli_core", "scrapetoon", "tower-of-god"

[workspace.dependencies]
clap = {version = "4.0.25", features = ["derive"]}
reqwest = {version = "0.11.13", features = ["rustls"]}
scraper = "0.13.0"
csv = "1.1.6"
regex = "1.7.0"
tokio = { version = "1.21.2", features = ["full"] }
chrono = "0.4.23"
thirtyfour = "0.31.0"
static_assertions = "1.1.0"
rayon = "1.6.1"
indicatif = { version = "0.17.2", features = ["rayon"]}
image = "0.24.5"
rand = "0.8.5"
crossbeam = "0.8.2"
13 changes: 4 additions & 9 deletions README.md
@@ -29,8 +29,8 @@ The likes information, once it gets to the millions, is truncated, i.e. 1.1M. Th

The data gathered from here is organized like so:

| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | date | scrape_date |
|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | ---- | ----------- |
| title | author | genre | total_likes | status | release_day | views | subscribers | rating | chapter | likes | published | scrape_date |
|:-----:|:------:|:-----:|:-----------:|:------:|:-----------:| ----- | ----------- | ------ | ------- | ----- | --------- | ----------- |

The `chapter`, `likes`, and `published` columns are each relative to one chapter, with a new chapter on each row. The published date is in ISO 8601 format (see the chrono sketch after this file's diff).

@@ -237,8 +237,6 @@ Once this is done, open up the `main.rs` file inside the `src` folder.
It should look something like this, with red squiggles indicating errors.
![main.rs initial state with errors](imgs/lore_olympus_main_no_changes.png)
The fix is to change `tower_of_god` to the new name we have been using, following this format: `use <project name>::config;` and `<project name>::parse_chapters`.
In our case we change it to `lore_olympus` like so:
@@ -250,7 +248,6 @@ use lore_olympus::config;
mod csv;
#[tokio::main]
async fn main() {
let args = StoryCliArgs::parse();
@@ -260,7 +257,7 @@ async fn main() {
args.pages,
&config::CONFIG,
config::TO_SKIP,
).await;
);
csv::write(
&args.output,
@@ -725,7 +722,7 @@ writer.write_record([
"total_comments",
"likes",
"total_likes",
"date",
"published",
"user",
"comment_body",
"post_date",
@@ -838,8 +835,6 @@ If you want to contribute, first of all, thank you, communities can only grow wi
Contributing is pretty simple. For features, open a branch named after the schema listed below, make whatever change you want, then begin the pull request process.
| Type | Example |
|:-------------- |:--------------------------------------------------:|
| Story | `story/<STORY_NAME>` |
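The `published` column keeps the ISO 8601 format the README calls out above. A minimal chrono sketch of producing such a value (the input display format here is an illustrative assumption, not the crates' actual parsing code):

```rust
use chrono::NaiveDate;

fn main() {
    // Parse a webtoon-style display date, then rely on NaiveDate's
    // Display impl, which prints ISO 8601 (YYYY-MM-DD).
    let published = NaiveDate::parse_from_str("Jan 10, 2023", "%b %d, %Y")
        .expect("date should match the display format");
    assert_eq!(published.to_string(), "2023-01-10");
}
```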
2 changes: 1 addition & 1 deletion cli_core/Cargo.toml
@@ -6,5 +6,5 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
indicatif = "0.17.2"
indicatif = { workspace = true }
clap = { workspace = true}
4 changes: 2 additions & 2 deletions kubera/README.md
@@ -2,5 +2,5 @@

Data Format:

|title|author|genre|status|release_day|views|subscribers|rating|chapter|chapter_length|comments|total_comments|likes|total_likes|date|user|comment_body|post_date|upvotes|downvotes|reply_count| scrape_date|season|season_chapter|arc|
|:---:|:----:|:---:|:----:|:---------:|:---:|:---------:|:----:|:-----:|:------------:|:------:|:-------------|:---:|:---------:|:--:|:--:|:----------:|:-------:|:-----:|:-------:|:---------:|:----------:|:----:|:------------:|:-:|
| title | author | genre | status | release_day | views | subscribers | rating | chapter | chapter_length | comments | total_comments | likes | total_likes | published | user | comment_body | post_date | upvotes | downvotes | reply_count | scrape_date | season | season_chapter | arc |
|:-----:|:------:|:-----:|:------:|:-----------:|:-----:|:-----------:|:------:|:-------:|:--------------:|:--------:|:-------------- |:-----:|:-----------:|:---------:|:----:|:------------:|:---------:|:-------:|:---------:|:-----------:|:-----------:|:------:|:--------------:|:---:|
2 changes: 1 addition & 1 deletion kubera/src/csv.rs
@@ -30,7 +30,7 @@ pub fn write(
"total_comments",
"likes",
"total_likes",
"date",
"published",
"user",
"comment_body",
"post_date",
24 changes: 19 additions & 5 deletions kubera/src/lib.rs
@@ -1,8 +1,9 @@
use cli_core::ProgressBarFactory;
use core::time;
use line_core::{chapter_height_pixels, comments, SeriesInfo};
use line_core::{chapter_height_pixels, comments, LikesDate, SeriesInfo};
use project_core::SeriesConfiguration;
use scraper::Html;
use std::collections::HashMap;
use std::{collections::LinkedList, thread};
use thirtyfour::prelude::*;

@@ -18,16 +19,29 @@ mod story_specific_parsing;
/// # Errors
///
/// Returns a tuple if Ok, along with any progress already made; otherwise returns a `WebDriver` error.
pub async fn parse_chapters(
pub fn parse_chapters(
start: u16,
end: u16,
pages: u16,
config: &SeriesConfiguration<'_>,
need_to_skip: fn(u16) -> bool,
) -> (SeriesInfo, LinkedList<ChapterInfo>) {
let (series_info, chapter_likes_date_map) =
line_core::series_info::get_extra_info(pages, config.page_url).await;
line_core::series_info::get_extra_info(pages, config.page_url);

let result = work(start, end, config, need_to_skip, &chapter_likes_date_map);

(series_info, result)
}

#[tokio::main]
async fn work(
start: u16,
end: u16,
config: &SeriesConfiguration<'_>,
need_to_skip: fn(u16) -> bool,
chapter_likes_date_map: &HashMap<u16, LikesDate>,
) -> LinkedList<ChapterInfo> {
let capabilities = DesiredCapabilities::chrome();
let driver = WebDriver::new("http://localhost:9515", capabilities)
.await
@@ -63,7 +77,7 @@ pub async fn parse_chapters(
} else {
// If it fails to connect, return whatever has already been scraped
eprintln!("Error connecting to webpage, saving progress and exiting...");
return (series_info, result);
return result;
}
}
Ok(ok) => break ok,
@@ -113,5 +127,5 @@ pub async fn parse_chapters(

driver.quit().await.unwrap();

(series_info, result)
result
}
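The kubera change above reduces to a sync-over-async bridge: the public `parse_chapters` loses its `async`, and the private `work` fn carries `#[tokio::main]`, which expands into a synchronous function that builds a runtime and blocks on the async body. A minimal sketch of that shape, with a stand-in body rather than the crate's WebDriver logic:

```rust
// Sync-over-async bridge: callers stay synchronous while `work`
// owns its own Tokio runtime (needs tokio with "macros" + "rt").
pub fn parse_chapters(start: u16, end: u16) -> Vec<u16> {
    work(start, end)
}

#[tokio::main]
async fn work(start: u16, end: u16) -> Vec<u16> {
    // The real `work` drives a WebDriver session here.
    (start..=end).collect()
}

fn main() {
    assert_eq!(parse_chapters(1, 3), vec![1, 2, 3]);
}
```

One runtime is created per `work` call, which is cheap enough here since the dominant cost is the scraping itself.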
6 changes: 2 additions & 4 deletions kubera/src/main.rs
@@ -5,8 +5,7 @@ use project_core::path_enforcer;

mod csv;

#[tokio::main]
async fn main() {
fn main() {
let args = StoryCliArgs::parse();

let (series_info, parsed_chapters) = kubera::parse_chapters(
@@ -15,8 +14,7 @@ async fn main() {
args.pages,
&config::CONFIG,
config::TO_SKIP,
)
.await;
);

csv::write(
path_enforcer(&args.output),
9 changes: 8 additions & 1 deletion line_core/Cargo.toml
@@ -10,4 +10,11 @@ project_core = { path = "../project_core" }
cli_core = { path = "../cli_core" }
regex = { workspace = true }
scraper = { workspace = true}
chrono = { workspace = true}
tokio = { workspace = true }
image = { workspace = true }
rayon = { workspace = true }
rand = { workspace = true }
indicatif = { workspace = true }
reqwest = { workspace = true }
crossbeam = { workspace = true }
61 changes: 38 additions & 23 deletions line_core/src/chapter_list.rs
@@ -1,42 +1,57 @@
use chrono::NaiveDate;
use cli_core::ProgressBarFactory;
use core::time;
use crossbeam::queue::SegQueue;
use indicatif::ParallelProgressIterator;
use project_core::ResponseFactory;
use rayon::prelude::*;
use scraper::{ElementRef, Html, Selector};
use std::{collections::LinkedList, thread};

use std::collections::LinkedList;

use crate::ChapterListInfo;

/// # Panics
///
/// Will panic if a response was received but its HTML text was missing, causing an unwrap of `None`.
pub async fn parse(end: u16, input_url: &str, chapter_info: &mut LinkedList<ChapterListInfo>) {
let bar = ProgressBarFactory::get_bar(end);
#[must_use]
pub fn parse(end: u16, input_url: &str) -> LinkedList<ChapterListInfo> {
// Around 8 threads, requests start getting blocked for pinging the server too many times at once, so cap the pool below that
rayon::ThreadPoolBuilder::new()
.num_threads(6)
.build_global()
.unwrap();

for page in 1..=end {
let url = format!("{input_url}&page={page}");
let range: Vec<_> = (1..=end).collect();
let total = range.len() as u64;

let html_response = if let Ok(ok) = ResponseFactory::get(&url).await {
ok
} else {
eprintln!("Error connecting to webpage, attempting to save progress and exit...");
let chapter_info: SegQueue<ChapterListInfo> = SegQueue::new();

assert!(!chapter_info.is_empty(), "Nothing to save, exiting.");
range
.into_par_iter()
.progress_count(total)
.for_each(|page| {
let url = format!("{input_url}&page={page}");
work(&url, &chapter_info);
});

break;
}
.text()
.await
.unwrap();
let mut result: LinkedList<ChapterListInfo> = LinkedList::new();

parse_each_chapters_chapter_info(&html_response, chapter_info);
for info in chapter_info {
result.push_back(info);
}

thread::sleep(time::Duration::from_secs(3));
result
}

bar.inc(1);
}
#[tokio::main]
async fn work(url: &str, chapter_info: &SegQueue<ChapterListInfo>) {
if let Ok(response) = ResponseFactory::get(url).await {
let html = response.text().await.unwrap();

parse_each_chapters_chapter_info(&html, chapter_info);
};
}

fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList<ChapterListInfo>) {
fn parse_each_chapters_chapter_info(html: &str, chapter_info: &SegQueue<ChapterListInfo>) {
let html = Html::parse_document(html);

let chapter_selector = Selector::parse("ul#_listUl>li").unwrap();
@@ -45,7 +60,7 @@ fn parse_each_chapters_chapter_info(html: &str, chapter_info: &mut LinkedList<Ch
let chapter_number = parse_chapter_number(&chapter);
let likes = parse_chapter_like_amount(&chapter);
let date = parse_chapter_date(&chapter);
chapter_info.push_back(ChapterListInfo {
chapter_info.push(ChapterListInfo {
chapter_number,
likes,
date,
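In outline, the new `chapter_list::parse` caps the global rayon pool, fans pages out with `into_par_iter`, lets each worker block on its own runtime inside `work`, and funnels results through a lock-free `SegQueue` shared across threads. A compact sketch of that pattern, with the HTTP fetch replaced by a stand-in:

```rust
use crossbeam::queue::SegQueue;
use indicatif::ParallelProgressIterator;
use rayon::prelude::*;

fn main() {
    // Cap the pool, as the diff does: past roughly 8 concurrent
    // requests the server starts blocking, so 6 leaves headroom.
    rayon::ThreadPoolBuilder::new()
        .num_threads(6)
        .build_global()
        .unwrap();

    let pages: Vec<u16> = (1..=20).collect();
    let total = pages.len() as u64;
    let results: SegQueue<String> = SegQueue::new();

    pages.into_par_iter().progress_count(total).for_each(|page| {
        // Stand-in for fetching and parsing one chapter-list page.
        results.push(format!("page {page}"));
    });

    // Drain once all workers finish; order reflects completion
    // order, not page order.
    let collected: Vec<String> = std::iter::from_fn(|| results.pop()).collect();
    println!("collected {} pages", collected.len());
}
```

`SegQueue` needs only `&self` to push, which is what lets every rayon worker share it without a mutex.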
2 changes: 2 additions & 0 deletions line_core/src/daily_schedule.rs
@@ -7,6 +7,8 @@ use crate::DailyScheduleInfo;
/// # Panics
///
/// Will panic if a response was received but its HTML text was missing, causing an unwrap of `None`.
#[tokio::main]
#[must_use]
pub async fn parse() -> LinkedList<DailyScheduleInfo> {
const DAILY_SCHEDULE: &str = "https://www.webtoons.com/en/dailySchedule";

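With `#[tokio::main]` now on `parse` itself, a caller needs no async context of its own. A hypothetical caller, assuming the module path used in this workspace:

```rust
// Hypothetical caller: `parse` blocks on its own runtime internally,
// so plain synchronous code can call it directly.
fn main() {
    let schedule = line_core::daily_schedule::parse();
    println!("{} entries in today's schedule", schedule.len());
}
```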
2 changes: 1 addition & 1 deletion line_core/src/lib.rs
@@ -2,9 +2,9 @@ pub mod chapter_height_pixels;
pub mod chapter_list;
pub mod comments;
pub mod daily_schedule;
pub mod panels;
pub mod series_info;

use project_core::regex;
use std::collections::LinkedList;

#[derive(Debug)]
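The new `panels` module registered above is among the files this view does not render. Purely as a hedged illustration of what concurrent panel downloading could look like with the workspace's tokio and reqwest dependencies (every name below is hypothetical, not the crate's API):

```rust
use std::path::Path;
use tokio::task::JoinSet;

// Hypothetical sketch only, not the actual `panels` module.
async fn download_panels(
    urls: Vec<String>,
    out_dir: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
    let mut set = JoinSet::new();
    for (i, url) in urls.into_iter().enumerate() {
        // Real code would cap in-flight requests to avoid rate limits.
        set.spawn(async move {
            let bytes = reqwest::get(&url).await?.bytes().await?;
            Ok::<_, reqwest::Error>((i, bytes))
        });
    }
    while let Some(joined) = set.join_next().await {
        let (i, bytes) = joined??;
        tokio::fs::write(out_dir.join(format!("{i:04}.jpg")), &bytes).await?;
    }
    Ok(())
}

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Placeholder invocation; supply real panel URLs in practice.
    download_panels(vec![], Path::new(".")).await
}
```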