Skip to content

Commit

Permalink
Add xpath text baseline (without output as it's huge)
Browse files Browse the repository at this point in the history
  • Loading branch information
lopuhin committed Jan 28, 2020
1 parent 8c1f1bd commit 4283a02
Showing 1 changed file with 30 additions and 0 deletions.
30 changes: 30 additions & 0 deletions run_xpath_text.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/usr/bin/env python3
import gzip
import json
from pathlib import Path

import lxml.html


def xpath_text(html: str) -> str:
    """Return all text nodes of an HTML document joined by single spaces.

    The markup is parsed with ``lxml.html.fromstring``. If the parsed tree
    contains at least one ``<body>`` element, text is collected from the
    first one; otherwise it is collected from the parsed root itself. The
    text nodes are joined as-is, without stripping or filtering.
    """
    document = lxml.html.fromstring(html)
    body_elements = document.xpath('//body')
    # Prefer the first <body>; fall back to the whole parsed tree.
    scope = body_elements[0] if body_elements else document
    text_nodes = scope.xpath('.//text()')
    return ' '.join(text_nodes)


def main():
    """Build the xpath-text baseline for every gzipped page under ``html/``.

    Each ``html/*.html.gz`` file is decompressed and decoded as UTF-8, its
    text is extracted with ``xpath_text``, and the result is stored under the
    item id (the file name up to its first dot) as
    ``{'articleBody': <text>}``. The mapping is then written as key-sorted,
    4-space-indented JSON to ``output/xpath-text.json``.
    """
    output_dir = Path('output')
    # Create the target directory up front: otherwise a missing ``output/``
    # only surfaces as FileNotFoundError after every page has been parsed.
    output_dir.mkdir(parents=True, exist_ok=True)

    output = {}
    for path in Path('html').glob('*.html.gz'):
        with gzip.open(path, 'rt', encoding='utf8') as f:
            html = f.read()
        # 'abc.html.gz' -> stem 'abc.html' -> item id 'abc'
        item_id = path.stem.split('.')[0]
        output[item_id] = {'articleBody': xpath_text(html)}

    (output_dir / 'xpath-text.json').write_text(
        json.dumps(output, sort_keys=True, ensure_ascii=False, indent=4),
        encoding='utf8')


# Run the baseline only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

0 comments on commit 4283a02

Please sign in to comment.