Skip to content

Commit

Permalink
feat: sort files into folders of their domains (#187)
Browse files Browse the repository at this point in the history
  • Loading branch information
EdJoPaTo authored Sep 5, 2023
1 parent 2ed6b21 commit fd9163f
Show file tree
Hide file tree
Showing 9 changed files with 148 additions and 113 deletions.
1 change: 1 addition & 0 deletions .github/workflows/run.yml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ jobs:
- run: cargo build

- name: website-stalker run --all
working-directory: sites
env:
WEBSITE_STALKER_FROM: ${{ secrets.WEBSITE_STALKER_FROM }}
run: cargo run -- run --all
2 changes: 1 addition & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
/target

/sites
/sites/*/*
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

## [Unreleased]

### Changed

- Files are sorted into folders of their domains

## [0.20.0] - 2023-04-11

### Added
Expand Down
File renamed without changes.
2 changes: 1 addition & 1 deletion src/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use crate::final_message::FinalMessage;
use crate::http::validate_from;
use crate::site::{Options, Site};

pub const EXAMPLE_CONF: &str = include_str!("../website-stalker.yaml");
pub const EXAMPLE_CONF: &str = include_str!("../sites/website-stalker.yaml");

#[derive(Debug, Deserialize, Serialize, PartialEq, Eq)]
pub struct Config {
Expand Down
55 changes: 35 additions & 20 deletions src/filename.rs
Original file line number Diff line number Diff line change
@@ -1,49 +1,64 @@
use lazy_regex::{lazy_regex, Lazy, Regex};
use url::Url;

pub fn basename(url: &Url) -> String {
static NON_ALPHANUM: Lazy<Regex> = lazy_regex!(r"[^a-zA-Z\d]+");

let host_part = url.domain().map_or_else(
|| url.host_str().expect("url has a host").to_string(),
pub fn domainfolder(url: &Url) -> Vec<String> {
let mut parts = url.domain().map_or_else(
|| vec![alphanum(url.host_str().expect("url has a host"))],
|domain| {
domain
.trim_start_matches("www.")
.split('.')
.rev()
.map(alphanum)
.collect::<Vec<_>>()
.join("-")
},
);
if let Some(port) = url.port() {
parts.push(port.to_string());
}
parts
}

/// Builds the file stem from the URL's path and query.
/// Falls back to `"index"` when both sanitize to nothing (e.g. `https://host/`).
pub fn filename(url: &Url) -> String {
    let path = url.path();
    let query = url.query().unwrap_or_default();
    let output = alphanum(&format!("{path}-{query}"));
    if output.is_empty() {
        "index".to_string()
    } else {
        output
    }
}

let raw = format!("{host_part}-{port}-{path}-{query}");
let only_ascii = NON_ALPHANUM.replace_all(&raw, "-");
only_ascii.trim_matches('-').to_string()
/// Collapses every run of non-alphanumeric characters into a single `-`
/// and strips any leading/trailing dashes from the result.
fn alphanum(str: &str) -> String {
    static NON_ALPHANUM: Lazy<Regex> = lazy_regex!(r"[^a-zA-Z\d]+");
    let dashed = NON_ALPHANUM.replace_all(str, "-");
    dashed.trim_matches('-').to_string()
}

#[cfg(test)]
/// Test helper: renders the full relative path ("folder/file") for a URL.
fn tb(url: &str) -> String {
    let url = Url::parse(url).expect("url is valid");
    println!("{url}");
    let folder = domainfolder(&url).join("/");
    let file = filename(&url);
    format!("{folder}/{file}")
}

#[test]
fn examples() {
    // Domain becomes nested folders; an empty path/query falls back to "index".
    assert_eq!(tb("https://edjopato.de/"), "de/edjopato/index");
    assert_eq!(tb("https://edjopato.de/post/"), "de/edjopato/post");
}

#[test]
fn query_does_matter() {
    // The query string is part of the sanitized file name.
    assert_eq!(
        tb("http://edjopato.de/?something=true"),
        "de/edjopato/something-true",
    );
}

Expand Down Expand Up @@ -71,10 +86,10 @@ fn ending_slash_doesnt_matter() {

#[test]
fn extension_is_still_in_basename() {
    // The path's original extension survives, dash-separated, in the stem.
    assert_eq!(tb("http://edjopato.de/robot.txt"), "de/edjopato/robot-txt");
    assert_eq!(
        tb("http://edjopato.de/robot.html"),
        "de/edjopato/robot-html",
    );
}

Expand All @@ -85,21 +100,21 @@ fn domain_prefix_www_doesnt_matter() {

#[test]
fn works_with_ipv4() {
    // An IP host is a single sanitized folder segment, not split per octet.
    assert_eq!(tb("http://127.0.0.1/test/"), "127-0-0-1/test");
}

#[test]
fn works_with_ipv4_with_port() {
    // The explicit port becomes its own folder segment.
    assert_eq!(tb("http://127.0.0.1:12345/test/"), "127-0-0-1/12345/test");
}

#[test]
fn works_with_ipv6() {
    // `[::1]` sanitizes down to its only alphanumeric run.
    assert_eq!(tb("http://[::1]/test/"), "1/test");
}

#[test]
#[should_panic = "url is valid"]
fn fails_on_ipv6_with_interface() {
    // Url::parse rejects IPv6 scope identifiers, so `tb` panics before asserting.
    assert_eq!(tb("http://[fe80::1234%eth0]/test/"), "fe80-1234-eth0/test");
}
19 changes: 6 additions & 13 deletions src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use tokio::time::sleep;
use crate::cli::SubCommand;
use crate::config::{Config, EXAMPLE_CONF};
use crate::site::Site;
use crate::site_store::SiteStore;

mod cli;
mod config;
Expand All @@ -24,8 +23,6 @@ mod logger;
mod site;
mod site_store;

const SITE_FOLDER: &str = "sites";

#[derive(Debug)]
pub enum ChangeKind {
Init,
Expand Down Expand Up @@ -139,12 +136,9 @@ Hint: Change the filter or use all sites with 'run --all'."
}
}

let site_store = site_store::SiteStore::new(SITE_FOLDER.to_string())
.expect("failed to create sites directory");

if sites_amount == sites_total {
let basenames = Site::get_all_file_basenames(&sites);
let removed = site_store.remove_gone(&basenames)?;
let paths = Site::get_all_file_paths(&sites);
let removed = site_store::remove_gone(&paths)?;
for filename in removed {
logger::warn(&format!("Remove superfluous {filename:?}"));
}
Expand All @@ -165,15 +159,14 @@ Hint: Change the filter or use all sites with 'run --all'."
.group_by(|a| a.url.host_str().unwrap().to_string());
for (_, group) in &groups {
let from = config.from.clone();
let site_store = site_store.clone();
let sites = group.collect::<Vec<_>>();
let tx = tx.clone();
tokio::spawn(async move {
for (i, site) in sites.into_iter().enumerate() {
if i > 0 {
sleep(Duration::from_secs(5)).await;
}
let result = stalk_and_save_site(&site_store, &from, &site).await;
let result = stalk_and_save_site(&from, &site).await;
tx.send((site.url, result, site.options.ignore_error))
.await
.expect("failed to send stalking result");
Expand Down Expand Up @@ -234,7 +227,6 @@ Hint: Change the filter or use all sites with 'run --all'."
}

async fn stalk_and_save_site(
site_store: &SiteStore,
from: &str,
site: &Site,
) -> anyhow::Result<(ChangeKind, http::IpVersion, Duration)> {
Expand Down Expand Up @@ -263,8 +255,9 @@ async fn stalk_and_save_site(
let extension = content.extension.unwrap_or("txt");

// Use site.url as the file basename should only change when the config changes (manually)
let basename = site.to_file_base_name();
let changed = site_store.write_only_changed(&basename, extension, &content.text)?;
let mut path = site.to_file_path();
path.set_extension(extension);
let changed = site_store::write_only_changed(&path, &content.text)?;
Ok((changed, ip_version, took))
}

Expand Down
45 changes: 32 additions & 13 deletions src/site.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
use std::path::{Path, PathBuf};

use serde::{Deserialize, Serialize};
use url::Url;

Expand All @@ -19,7 +21,7 @@ pub struct Options {
pub ignore_error: bool,

#[serde(default, skip_serializing_if = "Option::is_none")]
pub filename: Option<String>,
pub filename: Option<PathBuf>,

#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub headers: Vec<String>,
Expand All @@ -33,24 +35,41 @@ impl Site {
self.options.is_valid()
}

pub fn to_file_base_name(&self) -> String {
self.options
.filename
.clone()
.unwrap_or_else(|| filename::basename(&self.url))
/// Determines where this site's content is stored on disk.
///
/// An explicitly configured `filename` wins; otherwise the path is derived
/// from the URL: one folder per domain segment plus the sanitized file name.
pub fn to_file_path(&self) -> PathBuf {
    if let Some(explicit) = self.options.filename.clone() {
        return explicit;
    }
    let folder = filename::domainfolder(&self.url);
    let (first, rest) = folder.split_first().unwrap_or_else(|| {
        unreachable!(
            "domain has to have at least one segment {folder:?} {:?}",
            self.url
        )
    });
    let mut path = PathBuf::from(first);
    for part in rest {
        path.push(part);
    }
    path.push(filename::filename(&self.url));
    path
}

/// String form of the target file path, used as the duplicate-detection key.
// NOTE(review): keeps the existing "idenfier" spelling — callers in this file use it.
fn unique_idenfier(&self) -> String {
    let path = self.to_file_path();
    path.to_str()
        .expect("the path is unicode already")
        .to_owned()
}

pub fn get_all_file_basenames(sites: &[Self]) -> Vec<String> {
sites.iter().map(Self::to_file_base_name).collect()
pub fn get_all_file_paths(sites: &[Self]) -> Vec<PathBuf> {
sites.iter().map(Self::to_file_path).collect()
}

pub fn validate_no_duplicate(sites: &[Self]) -> Result<(), String> {
// TODO: return url or something of specific duplicates
let mut file_basenames = Self::get_all_file_basenames(sites);
file_basenames.sort_unstable();
let total = file_basenames.len();
file_basenames.dedup();
if file_basenames.len() == total {
let mut uniq = sites.iter().map(Self::unique_idenfier).collect::<Vec<_>>();
uniq.sort_unstable();
let total = uniq.len();
uniq.dedup();
if uniq.len() == total {
Ok(())
} else {
Err(
Expand Down
Loading

0 comments on commit fd9163f

Please sign in to comment.