-
Notifications
You must be signed in to change notification settings - Fork 1
/
scrape.py
79 lines (64 loc) · 2.7 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# scrape: Extract HTML elements using an XPath query or CSS3 selector.
#
# Example usage:
# $ curl 'http://en.wikipedia.org/wiki/List_of_sovereign_states' -s \
# | scrape -be 'table.wikitable > tr > td > b > a'
#
# Dependencies: lxml and optionally cssselect
#
# Author: http://jeroenjanssens.com
import sys
import argparse
from lxml import etree
from sys import exit
def _to_xpath(expressions):
    """Return the XPath form of every expression, translating CSS selectors.

    Expressions that start with ``//`` are taken to be XPath and are passed
    through unchanged; anything else is treated as a CSS selector and is
    translated with cssselect.
    """
    translator = None
    queries = []
    for expr in expressions:
        # Python 2 hands command-line arguments over as bytes; Python 3
        # already provides text, which has no ``decode`` method.
        if isinstance(expr, bytes):
            expr = expr.decode('utf-8')
        if not expr.startswith('//'):
            if translator is None:
                # Imported on first use so that XPath-only invocations do
                # not require cssselect to be installed.
                from cssselect import GenericTranslator
                translator = GenericTranslator()
            expr = translator.css_to_xpath(expr)
        queries.append(expr)
    return queries


def _render(node, argument):
    """Return one XPath result as stripped UTF-8 bytes, or None to skip it.

    String results are used as they are.  For element results, the
    serialized element is used when ``argument`` is empty; otherwise the
    value of the attribute named ``argument`` is used, and None is returned
    when the element does not carry that attribute.
    """
    if isinstance(node, (bytes, type(u''))):
        text = node
    elif not argument:
        text = etree.tostring(node)
    else:
        text = node.get(argument)
    if text is None:
        return None
    # Encode only real text; values that are already bytes are left alone.
    if not isinstance(text, bytes):
        text = text.encode('utf-8')
    return text.strip()


def main():
    """Run the scrape command-line interface.

    Reads a document from the file given with ``-f``, from the positional
    HTML argument, or from standard input, evaluates every ``-e`` expression
    against it and writes each result to standard output followed by a tab.
    With ``-b`` the output is wrapped in HTML and BODY tags.

    With ``-x`` and at least one expression, nothing is written: the process
    exits with status 0 when the first expression matches something and with
    status 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('html', nargs='?', type=argparse.FileType('rb'),
                        # Use the byte stream behind stdin where one exists
                        # (Python 3) so the default matches the 'rb' mode.
                        default=getattr(sys.stdin, 'buffer', sys.stdin),
                        help="HTML", metavar="HTML")
    parser.add_argument('-a', '--argument', default="",
                        help="argument to extract from tag")
    parser.add_argument('-b', '--body', action='store_true', default=False,
                        help="Enclose output with HTML and BODY tags")
    parser.add_argument('-e', '--expression', default=[], action='append',
                        help="XPath query or CSS3 selector")
    parser.add_argument('-f', '--file', default='',
                        help="File to read input from")
    parser.add_argument('-x', '--check-existance', action='store_true',
                        default=False,
                        help="Process return value signifying existance")
    parser.add_argument('-r', '--rawinput', action='store_true', default=False,
                        help="Do not parse HTML before feeding etree (useful "
                             "for escaping CData)")
    args = parser.parse_args()

    queries = _to_xpath(args.expression)

    html_parser = etree.HTMLParser(encoding='utf-8', recover=True,
                                   strip_cdata=True)

    # Open -f input in binary mode, like the positional argument, and make
    # sure the handle opened here is closed again once the document is read.
    opened = open(args.file, 'rb') if args.file else None
    source = args.html if opened is None else opened
    try:
        if args.rawinput:
            document = etree.fromstring(source.read())
        else:
            document = etree.parse(source, html_parser)
    finally:
        if opened is not None:
            opened.close()

    # Decide the existence check before any output is produced, so that
    # combining -x with -b does not leave an unterminated HTML prologue on
    # standard output.  Only the first expression is consulted.
    if args.check_existance and queries:
        sys.exit(0 if list(document.xpath(queries[0])) else 1)

    # All output is bytes; on Python 3 that has to go to the underlying
    # binary stream rather than to the text wrapper.
    out = getattr(sys.stdout, 'buffer', sys.stdout)

    if args.body:
        out.write(b"<!DOCTYPE html>\n<html>\n<body>\n")

    for query in queries:
        for node in document.xpath(query):
            rendered = _render(node, args.argument)
            if rendered is not None:
                out.write(rendered + b"\t")

    if args.body:
        out.write(b"</body>\n</html>")

    out.write(b'\n')
    out.flush()
# Script entry point: run the command-line interface and pass whatever
# main() returns to the interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())