Skip to content

Commit 23a31b5

Browse files
authored
feat(css_tag_replace): new editor (#211)
1 parent 492ee52 commit 23a31b5

File tree

4 files changed

+129
-0
lines changed

4 files changed

+129
-0
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
1010
### Added
1111

1212
- Generate the JSON Schema for the `website-stalker.yaml` with `website-stalker json-schema`.
13+
- new editor: `css_tag_replace`
1314
- Support zstd response body decompression
1415

1516
## [0.25.1] - 2025-01-30

README.md

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,57 @@ editors:
347347
reverse: true
348348
```
349349

350+
#### `css_tag_replace`
351+
352+
Replace [HTML tags](https://developer.mozilla.org/en-US/docs/Glossary/Tag) matching a given [CSS Selector](https://developer.mozilla.org/en-US/docs/Learn/CSS/Building_blocks/Selectors).
353+
354+
For example, the following config will replace all `h3` tags with `h2` tags.
355+
356+
```yaml
357+
editors:
358+
- css_tag_replace:
359+
selector: h3
360+
replace: h2
361+
```
362+
363+
```diff
364+
<html>
365+
<head></head>
366+
<body>
367+
- <h3 class="green">
368+
+ <h2 class="green">
369+
Hello
370+
- </h3>
371+
+ </h2>
372+
World
373+
</body>
374+
</html>
375+
```
376+
377+
This can be helpful to ensure some kind of structure especially when editors like [`html_markdownify`](#html_markdownify) are used.
378+
Think about a website where only some subsections are of interest and selected via the [`css_select`](#css_select).
379+
While the `header` contains some `h1` the selected part skips the `h2` headings and continues with `h3` headings.
380+
Also, `<strong>` tags are (incorrectly) used as subheadings.
381+
Parsing this to Markdown results in a less than optimal structure, as the `h2` level is skipped and `<strong>` tags don't result in headings.
382+
Ideally, there would be a single `h1`, followed continuously by `h2`, `h3` and so on, depending on the depth.
383+
The following can help with that:
384+
385+
```yaml
386+
editors:
387+
# Select the header and some interesting sections
388+
- css_select: header, main section.interesting
389+
# First migrate the h3 tags to h2 so there is no gap
390+
- css_tag_replace:
391+
selector: h3
392+
replace: h2
393+
# Then migrate all strong tags to proper headings
394+
- css_tag_replace:
395+
selector: strong
396+
replace: h3
397+
# In the end parse to Markdown
398+
- html_markdownify
399+
```
400+
350401
#### `debug_files`
351402
352403
This editor passes its input through without modifying it.

src/editor/css_tag_replace.rs

Lines changed: 70 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
use html5ever::{namespace_url, ns, LocalName, QualName};
2+
use schemars::JsonSchema;
3+
use scraper::{Html, Node, Selector};
4+
use serde::Deserialize;
5+
6+
/// Configuration of the `css_tag_replace` editor: rename the tag of every
/// element matched by a CSS selector.
///
/// Deserialization rejects unknown fields (`deny_unknown_fields`).
#[derive(Debug, Clone, Deserialize, JsonSchema)]
7+
#[serde(deny_unknown_fields)]
8+
pub struct CssTagReplace {
9+
/// CSS selector choosing the elements whose tag is replaced.
/// Described as a plain string in the generated JSON Schema.
#[schemars(with = "String")]
10+
pub selector: Selector,
11+
12+
/// Tag name assigned to every selected element (for example `h2`).
/// Described as a plain string in the generated JSON Schema.
#[schemars(with = "String")]
13+
pub replace: LocalName,
14+
}
15+
16+
impl CssTagReplace {
17+
/// Parses `html` as a document, renames every element matched by
/// `self.selector` to the tag `self.replace` and returns the serialized
/// document.
///
/// Only the element name is reassigned; the rest of the element is left
/// untouched.
///
/// # Errors
///
/// Returns an error with the message `selected nothing` when the selector
/// matches no element.
pub fn apply(&self, html: &str) -> anyhow::Result<String> {
18+
let mut html = Html::parse_document(html);
19+
// Collect the ids of all matches first, so the selection is finished
// before any node is mutated below.
let selected = html
20+
.select(&self.selector)
21+
.map(|element| element.id())
22+
.collect::<Vec<_>>();
23+
// An empty selection is reported as an error instead of returning the
// input unchanged.
anyhow::ensure!(!selected.is_empty(), "selected nothing");
24+
for node_id in selected {
25+
let mut node = html
26+
.tree
27+
.get_mut(node_id)
28+
.expect("Element ID should exist as it was just taken from the given HTML");
29+
let Node::Element(element) = node.value() else {
30+
unreachable!("Select only selects elements");
31+
};
32+
// The new name is built without a prefix and in the HTML namespace.
element.name = QualName::new(None, ns!(html), self.replace.clone());
33+
}
34+
35+
Ok(html.html())
36+
}
37+
}
38+
39+
/// Test helper: builds a `CssTagReplace` from `selectors` and `replace`,
/// applies it to `html` and asserts that the output equals `expected`.
///
/// Panics when `selectors` fails to parse or when `apply` returns an error.
/// `#[track_caller]` is intended to attribute failures to the calling test.
#[cfg(test)]
40+
#[track_caller]
41+
fn case<TAG: Into<LocalName>>(selectors: &str, replace: TAG, html: &str, expected: &str) {
42+
let result = CssTagReplace {
43+
selector: Selector::parse(selectors).unwrap(),
44+
replace: replace.into(),
45+
}
46+
.apply(html)
47+
.expect("Should select something");
48+
assert_eq!(result, expected);
49+
}
50+
51+
/// Replaces `h3` with `h2` while the unselected `h1` and the text nodes stay
/// as they are. The expected output is a full document with `html` and `head`.
#[test]
52+
fn only_tag() {
53+
let html = "<body><h1>Hello</h1>World<h3>Foo</h3>Bar</body>";
54+
let expected = "<html><head></head><body><h1>Hello</h1>World<h2>Foo</h2>Bar</body></html>";
55+
case("h3", "h2", html, expected);
56+
}
57+
58+
/// Replacing the tag must keep the attributes of the element (here `class`).
#[test]
59+
fn keeps_attributes() {
60+
let html = r#"<body><h2 class="green">Hello</h2>World</body>"#;
61+
let expected = r#"<html><head></head><body><h1 class="green">Hello</h1>World</body></html>"#;
62+
case("h2", "h1", html, expected);
63+
}
64+
65+
/// A descendant selector (`main div`) only renames the `div` inside `main`;
/// the `div` inside `header` is expected to stay a `div`.
#[test]
66+
fn more_specific_selector() {
67+
let html = "<body><header><div>Headline</div></header><main><div>Something</div></main></body>";
68+
let expected = "<html><head></head><body><header><div>Headline</div></header><main><p>Something</p></main></body></html>";
69+
case("main div", "p", html, expected);
70+
}

src/editor/mod.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pub mod css_flatten;
99
pub mod css_remove;
1010
pub mod css_selector;
1111
pub mod css_sort;
12+
pub mod css_tag_replace;
1213
pub mod debug_files;
1314
pub mod html_markdown;
1415
pub mod html_pretty;
@@ -34,6 +35,7 @@ pub enum Editor {
3435
CssRemove(#[schemars(with = "String")] scraper::Selector),
3536
CssSelect(#[schemars(with = "String")] scraper::Selector),
3637
CssSort(css_sort::CssSort),
38+
CssTagReplace(css_tag_replace::CssTagReplace),
3739
DebugFiles(PathBuf),
3840
HtmlMarkdownify,
3941
HtmlPrettify,
@@ -52,6 +54,7 @@ impl Editor {
5254
Self::CssRemove(_) => "css_remove",
5355
Self::CssSelect(_) => "css_select",
5456
Self::CssSort(_) => "css_sort",
57+
Self::CssTagReplace(_) => "css_tag_replace",
5558
Self::DebugFiles(_) => "debug_files",
5659
Self::HtmlMarkdownify => "html_markdownify",
5760
Self::HtmlPrettify => "html_prettify",
@@ -82,6 +85,10 @@ impl Editor {
8285
extension: Some("html"),
8386
text: sort.apply(url, &input.text),
8487
}),
88+
Self::CssTagReplace(replace) => Ok(Content {
89+
extension: Some("html"),
90+
text: replace.apply(&input.text)?,
91+
}),
8592
Self::DebugFiles(path) => debug_files::debug_files(path, input),
8693
Self::HtmlMarkdownify => Ok(Content {
8794
extension: Some("md"),

0 commit comments

Comments
 (0)