/// Shows how to use html5gum in combination with scraper.
///
/// Usage:
///
/// ```sh
/// echo '<h1><span class=hello>Hello</span></h1>' | cargo run --all-features --example scraper -- '.hello'
/// ```
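///
/// To compare against scraper's regular html5ever-based parsing, pass the switch defined below
/// (argh renders the `use_html5ever` field as `--use-html5ever`):
///
/// ```sh
/// echo '<h1><span class=hello>Hello</span></h1>' | cargo run --all-features --example scraper -- --use-html5ever '.hello'
/// ```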
///
/// Essentially, your HTML parsing will be powered by a combination of html5gum and html5ever. This
/// has no immediate benefit over using scraper normally and is mostly done as a transitional step
/// until html5gum has its own implementation of tree building and the DOM.
///
/// Requires the tree-builder feature.
use std::io::{stdin, Read};

use argh::FromArgs;
use html5ever::interface::tree_builder::TreeSink;
use html5gum::emitters::html5ever::parse_document;
use scraper::{Html, HtmlTreeSink, Selector};

/// Read some HTML from stdin and parse it according to the given selector.
#[derive(FromArgs)]
struct Cli {
    /// turn off html5gum and just use regular scraper.
    ///
    /// This can be useful for comparing the two in performance and correctness.
    #[argh(switch)]
    use_html5ever: bool,

    /// a CSS selector, like ".hello"
    #[argh(positional)]
    selector: String,
}

fn main() {
    let cli: Cli = argh::from_env();

    let mut input = String::new();
    stdin().read_to_string(&mut input).unwrap();
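
    // Build the DOM either with scraper's regular html5ever parser
    // (`--use-html5ever`) or by driving scraper's `HtmlTreeSink` with
    // html5gum's tokenizer via `parse_document`.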
    let dom = if cli.use_html5ever {
        Html::parse_document(&input)
    } else {
        let dom = Html::new_document();
        let tree_sink = HtmlTreeSink::new(dom);
        let Ok(tree_sink) = parse_document(&input, tree_sink, Default::default());
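        // Parsing from an in-memory string cannot fail, so the `Ok` pattern
        // above is irrefutable and needs no `else` branch.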
        tree_sink.finish()
    };
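
    // Compile the CSS selector passed on the command line; an invalid
    // selector aborts with a panic via `unwrap`.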
    let selector = Selector::parse(&cli.selector).unwrap();

    for element in dom.select(&selector) {
        println!("{:?}", element);
    }
}