Skip to content

Commit f208dcd

Browse files
authored
feat: css_sort (#190)
1 parent b7d0a88 commit f208dcd

File tree

4 files changed

+264
-0
lines changed

4 files changed

+264
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
99

1010
### Added
1111

12+
- new editor: `css_sort`
1213
- new editor: `debug_files`
1314

1415
### Fixed

README.md

Lines changed: 65 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -268,6 +268,71 @@ editors:
268268
- css_select: h1 > a
269269
```
270270

271+
#### `css_sort`
272+
273+
Sort elements matching the given [CSS Selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors).
274+
Other elements not matching are kept, but the sorted elements are currently moved in front of them within their parent.
275+
Elements below different parents are sorted independently.
276+
277+
Basic example:
278+
279+
```html
280+
<div><p>C</p><p>B</p></div>
281+
<div><p>D</p><p>A</p></div>
282+
```
283+
284+
with `p` as the selector will sort into this:
285+
286+
```html
287+
<div><p>B</p><p>C</p></div>
288+
<div><p>A</p><p>D</p></div>
289+
```
290+
291+
Examples:
292+
293+
```yaml
294+
editors:
295+
# Sort all articles
296+
- css_sort:
297+
selector: article
298+
```
299+
300+
The above example sorts by the whole element ([`outerHTML`](https://developer.mozilla.org/en-US/docs/Web/API/Element/outerHTML)).
301+
In order to sort by something specific for a given HTML element, editors can be used.
302+
303+
```yaml
304+
editors:
305+
# Sort articles by their heading
306+
- css_sort:
307+
selector: article
308+
sort_by: # the specified editors are applied to every selected HTML element independently
309+
- css_select: h2
310+
```
311+
312+
This might still sort in surprising ways as things like attributes are still included (`<h2 class="a">Z</h2>` is sorted before `<h2 class="z">A</h2>`).
313+
Therefore, editors like [`html_textify`](#html_textify) or [`html_sanitize`](#html_sanitize) are likely a good idea to use in `sort_by`.
314+
315+
Tip: [`debug_files`](#debug_files) can help you understand what is happening. But don't forget to remove it after you are done testing:
316+
317+
```yaml
318+
editors:
319+
- css_sort:
320+
selector: article
321+
sort_by:
322+
- css_select: h2
323+
- html_sanitize
324+
- debug_files: /tmp/website-stalker/
325+
```
326+
327+
You can also reverse the sorting:
328+
329+
```yaml
330+
editors:
331+
- css_sort:
332+
selector: article
333+
reverse: true
334+
```
335+
271336
#### `debug_files`
272337

273338
This editor passes its input through without modifying it.

src/editor/css_sort.rs

Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
use std::collections::HashMap;
2+
3+
use scraper::{ElementRef, Html, Selector};
4+
use serde::Deserialize;
5+
use url::Url;
6+
7+
use super::Editor;
8+
use crate::logger;
9+
10+
/// Configuration of the `css_sort` editor.
///
/// Elements matching `selector` are sorted among the matching siblings of the
/// same parent; see [`CssSort::apply`] for the details.
#[derive(Debug, Clone, Deserialize)]
11+
pub struct CssSort {
12+
/// CSS selector picking the elements which should be sorted.
#[serde(deserialize_with = "super::deserialize_selector")]
13+
pub selector: Selector,
14+
15+
/// When `true` the sorted order of each group is reversed.
/// Falls back to `bool::default()` (`false`) when omitted.
#[serde(default)]
16+
pub reverse: bool,
17+
18+
/// Editors applied to the HTML of every selected element in order to
/// produce its sort key. Empty when omitted, in which case the element's
/// own HTML is the key.
// NOTE(review): only `Deserialize` is derived here, so `skip_serializing_if`
// presumably has no effect at the moment — confirm whether it can be dropped.
#[serde(default, skip_serializing_if = "Vec::is_empty")]
19+
pub sort_by: Vec<Editor>,
20+
}
21+
22+
impl CssSort {
23+
/// Sorts the elements selected by `self.selector` and returns the
/// resulting document serialized as HTML.
///
/// The input is parsed as a full HTML document. Selected elements are
/// grouped by their parent node and every group is sorted independently by
/// the key from `get_sort_key_from_element`. The sorted elements of a group
/// are then moved to the beginning of their parent's children, so they end
/// up in front of all siblings which were not selected.
///
/// `url` is only used for the warning message and is passed on to the
/// `sort_by` editors.
pub fn apply(&self, url: &Url, html: &str) -> String {
24+
let mut html = Html::parse_document(html);
25+
let selected = html.select(&self.selector).collect::<Vec<_>>();
26+
27+
// Group the selected elements by the id of their parent node.
// Elements without a parent are skipped.
let mut grouped_by_parent: HashMap<_, Vec<ElementRef>> = HashMap::new();
28+
for element in selected {
29+
if let Some(key) = element.parent().map(|parent| parent.id()) {
30+
grouped_by_parent.entry(key).or_default().push(element);
31+
}
32+
}
33+
34+
// Only warn: the document is still returned (re-serialized) below.
if grouped_by_parent.is_empty() {
35+
logger::warn(&format!("css_sort selector selected nothing to sort {url}"));
36+
}
37+
38+
// Get the order of the elements as ids
39+
// This removes the reference to html allowing to take mut references later on
40+
let sorted = grouped_by_parent
41+
.into_iter()
42+
.map(|(parent, mut elements)| {
43+
// The key is computed once per element (cached) as it runs the `sort_by` editors.
elements.sort_by_cached_key(|element| self.get_sort_key_from_element(url, element));
44+
// NOTE(review): the sort above is stable, but reversing the whole list
// also flips the relative order of elements with equal keys.
if self.reverse {
45+
elements.reverse();
46+
}
47+
let elements = elements
48+
.iter()
49+
.map(|element| element.id())
50+
.collect::<Vec<_>>();
51+
(parent, elements)
52+
})
53+
.collect::<HashMap<_, _>>();
54+
55+
for (parent, sorted) in sorted {
56+
// Take the sorted elements out of the tree before re-inserting them.
for id in &sorted {
57+
html.tree.get_mut(*id).unwrap().detach();
58+
}
59+
60+
// Insert them at the beginning of the parent's children.
61+
// This destroys the order relative to the other (not selected) elements in there, but it's way simpler to do for now.
62+
let mut parent_mut = html.tree.get_mut(parent).unwrap();
63+
// Prepending in reverse order leaves the first sorted element at the front.
for id in sorted.into_iter().rev() {
64+
parent_mut.prepend_id(id);
65+
}
66+
}
67+
68+
html.html()
69+
}
70+
71+
/// Computes the sort key of a single selected element.
///
/// The element's HTML is wrapped in a `Content` with the `html` extension
/// and run through the `sort_by` editors; the resulting text is the key.
/// When the editors fail the error is logged and an empty string is used
/// as the key instead.
fn get_sort_key_from_element(&self, url: &Url, element: &ElementRef) -> String {
72+
let content = super::Content {
73+
extension: Some("html"),
74+
text: element.html(),
75+
};
76+
Editor::apply_many(&self.sort_by, url, content).map_or_else(
77+
|error| {
78+
logger::error(&format!("css_sort sort_by failed {error}"));
79+
String::new()
80+
},
81+
|content| content.text,
82+
)
83+
}
84+
}
85+
86+
#[cfg(test)]
87+
mod tests {
88+
use super::*;
89+
90+
/// Applies `css_sort` to `input` and compares the body content with `expected`.
///
/// `apply` returns a whole document, so the surrounding
/// `<html><head></head><body>` / `</body></html>` is asserted and stripped
/// before the comparison.
#[track_caller]
91+
fn case(css_sort: &CssSort, input: &str, expected: &str) {
92+
const PREFIX: &str = "<html><head></head><body>";
93+
const SUFFIX: &str = "</body></html>";
94+
95+
let url = Url::parse("https://edjopato.de/").unwrap();
96+
let html = css_sort.apply(&url, input);
97+
98+
assert!(html.starts_with(PREFIX));
99+
assert!(html.ends_with(SUFFIX));
100+
let end_index = html.len() - SUFFIX.len();
101+
let html = html.get(PREFIX.len()..end_index).unwrap();
102+
103+
assert_eq!(html, expected);
104+
}
105+
106+
/// Without `sort_by` the elements are ordered by their own HTML.
#[test]
107+
fn simple_example() {
108+
let input = "<p>A</p><p>C</p><p>B</p>";
109+
let expected = "<p>A</p><p>B</p><p>C</p>";
110+
let sort_by = CssSort {
111+
selector: Selector::parse("p").unwrap(),
112+
sort_by: Vec::new(),
113+
reverse: false,
114+
};
115+
case(&sort_by, input, expected);
116+
}
117+
118+
/// `reverse: true` yields the descending order.
#[test]
119+
fn reverse() {
120+
let input = "<p>A</p><p>C</p><p>B</p>";
121+
let expected = "<p>C</p><p>B</p><p>A</p>";
122+
let sort_by = CssSort {
123+
selector: Selector::parse("p").unwrap(),
124+
sort_by: Vec::new(),
125+
reverse: true,
126+
};
127+
case(&sort_by, input, expected);
128+
}
129+
130+
/// The key comes from the `a` element selected via `sort_by`, not from the whole article.
#[test]
131+
fn sort_by() {
132+
let input = r#"<article><h3>A</h3><a id="Y">Bla</a></article><article><h3>B</h3><a id="X">Bla</a></article>"#;
133+
let expected = r#"<article><h3>B</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="Y">Bla</a></article>"#;
134+
let sort_by = CssSort {
135+
selector: Selector::parse("article").unwrap(),
136+
sort_by: vec![Editor::CssSelect(Selector::parse("a").unwrap())],
137+
reverse: false,
138+
};
139+
case(&sort_by, input, expected);
140+
}
141+
142+
/// Both articles produce the same key, so the input order has to be kept.
#[test]
143+
fn sort_by_same_key_keeps_order() {
144+
let input = r#"<article><h3>C</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="X">Bla</a></article>"#;
145+
let expected = r#"<article><h3>C</h3><a id="X">Bla</a></article><article><h3>A</h3><a id="X">Bla</a></article>"#;
146+
let sort_by = CssSort {
147+
selector: Selector::parse("article").unwrap(),
148+
sort_by: vec![Editor::CssSelect(Selector::parse("a").unwrap())],
149+
reverse: false,
150+
};
151+
case(&sort_by, input, expected);
152+
}
153+
154+
/// Sorting the `div`s reorders them but leaves the `p`s inside untouched.
#[test]
155+
fn sorting_toplevel_keeps_children_unsorted() {
156+
let input = "<div><p>D</p><p>A</p></div><div><p>C</p><p>B</p></div>";
157+
let expected = "<div><p>C</p><p>B</p></div><div><p>D</p><p>A</p></div>";
158+
let sort_by = CssSort {
159+
selector: Selector::parse("div").unwrap(),
160+
sort_by: Vec::new(),
161+
reverse: false,
162+
};
163+
case(&sort_by, input, expected);
164+
}
165+
166+
/// Sorting the `p`s happens per parent; the order of the `div`s is unchanged.
#[test]
167+
fn sorting_bottomlevel_keeps_parents_unsorted() {
168+
let input = "<div><p>D</p><p>A</p></div><div><p>C</p><p>B</p></div>";
169+
let expected = "<div><p>A</p><p>D</p></div><div><p>B</p><p>C</p></div>";
170+
let sort_by = CssSort {
171+
selector: Selector::parse("p").unwrap(),
172+
sort_by: Vec::new(),
173+
reverse: false,
174+
};
175+
case(&sort_by, input, expected);
176+
}
177+
178+
/// Documents current sorting order when other elements are there.
179+
/// Needs to be adapted when sorting order is improved.
180+
#[test]
181+
fn sort_with_other_elements() {
182+
let input = "<div>1</div><p>A</p><img><p>B</p>";
183+
let expected = "<p>A</p><p>B</p><div>1</div><img>";
184+
let sort_by = CssSort {
185+
selector: Selector::parse("p").unwrap(),
186+
sort_by: Vec::new(),
187+
reverse: false,
188+
};
189+
case(&sort_by, input, expected);
190+
}
191+
}

src/editor/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ use url::Url;
66

77
pub mod css_remove;
88
pub mod css_selector;
9+
pub mod css_sort;
910
pub mod debug_files;
1011
pub mod html_markdown;
1112
pub mod html_pretty;
@@ -26,6 +27,7 @@ pub struct Content {
2627
pub enum Editor {
2728
CssRemove(#[serde(deserialize_with = "deserialize_selector")] scraper::Selector),
2829
CssSelect(#[serde(deserialize_with = "deserialize_selector")] scraper::Selector),
30+
CssSort(css_sort::CssSort),
2931
DebugFiles(PathBuf),
3032
HtmlMarkdownify,
3133
HtmlPrettify,
@@ -42,6 +44,7 @@ impl Editor {
4244
match self {
4345
Self::CssRemove(_) => "css_remove",
4446
Self::CssSelect(_) => "css_select",
47+
Self::CssSort(_) => "css_sort",
4548
Self::DebugFiles(_) => "debug_files",
4649
Self::HtmlMarkdownify => "html_markdownify",
4750
Self::HtmlPrettify => "html_prettify",
@@ -64,6 +67,10 @@ impl Editor {
6467
extension: Some("html"),
6568
text: css_selector::apply(selector, &input.text)?,
6669
}),
70+
Self::CssSort(sort) => Ok(Content {
71+
extension: Some("html"),
72+
text: sort.apply(url, &input.text),
73+
}),
6774
Self::DebugFiles(path) => debug_files::debug_files(path, input),
6875
Self::HtmlMarkdownify => Ok(Content {
6976
extension: Some("md"),

0 commit comments

Comments
 (0)