-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathdata.py
67 lines (54 loc) · 2.14 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# -*- coding: utf-8 -*-
import os
from os import listdir
from os.path import isfile, join
import json
from urllib.request import urlopen, Request
from lxml import html
class PlatoData(object):
    """Crawl the Stanford Encyclopedia of Philosophy and build a text corpus.

    Each entry linked from the table of contents is downloaded once and
    cached as ``<DATA_DIR>/<slug>.json`` (a JSON list of paragraph text
    fragments). ``prepare_corpus`` merges every cached entry into the single
    plain-text file ``<DATA_DIR>/data``.
    """

    # Root URL every relative href from the index is appended to.
    BASE_URL = 'http://plato.stanford.edu/'
    # User-Agent header sent with every request.
    USER_AGENT = 'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0)'
    # Directory holding the per-entry JSON cache and the merged corpus.
    DATA_DIR = './data'

    def _get_page(self, url):
        """Fetch *url* and return the raw response body as bytes.

        Raises whatever ``urlopen`` raises (e.g. ``urllib.error.URLError``).
        """
        request = Request(url, headers={'User-Agent': self.USER_AGENT})
        # The context manager closes the connection even if read() fails.
        with urlopen(request) as response:
            return response.read()

    def get_index(self):
        """Return the hrefs of all links listed on the contents page."""
        url = '{}{}'.format(self.BASE_URL, 'contents.html')
        tree = html.fromstring(self._get_page(url))
        return tree.xpath('//div[@id="content"]//li/a/@href')

    def _get_entry(self, entry):
        """Download one entry and cache its paragraph text as JSON.

        *entry* is a relative href such as ``entries/<slug>/``. Entries that
        are already cached are not downloaded again. Hrefs without a slug
        component are skipped.
        """
        parts = entry.split('/')
        # The second path component names the cache file; without it the
        # href is not something we can store under a meaningful name.
        if len(parts) < 2 or not parts[1]:
            return

        entry_filename = '{}/{}.json'.format(self.DATA_DIR, parts[1])
        if os.path.isfile(entry_filename):
            return

        url = '{}{}'.format(self.BASE_URL, entry)
        tree = html.fromstring(self._get_page(url))
        # Progress output: the entry title as extracted from the page.
        print(tree.xpath('//div[@id="aueditable"]//h1//text()'))
        data = tree.xpath('//div[@id="main-text"]//p//text()')

        # Make sure the cache directory exists before the first write.
        os.makedirs(self.DATA_DIR, exist_ok=True)
        with open(entry_filename, 'w', encoding='utf-8') as outfile:
            json.dump(data, outfile)

    def get_entries(self):
        """Download and cache every entry listed in the index."""
        for entry in self.get_index():
            self._get_entry(entry)

    def _read_entry_text(self, filename):
        """Return the text of cached entry *filename* with newlines removed."""
        path = os.path.join(self.DATA_DIR, filename)
        with open(path, 'r', encoding='utf-8') as infile:
            fragments = json.load(infile)
        return ''.join(fragments).replace('\n', '')

    def prepare_corpus(self, download=False):
        """Merge all cached entries into ``<DATA_DIR>/data``.

        When *download* is true the entries are fetched first. Entry texts
        are joined with ``'. '`` in sorted file-name order so the output is
        deterministic.

        Raises ``FileNotFoundError`` if ``DATA_DIR`` does not exist.
        """
        if download:
            self.get_entries()

        # Only ``*.json`` files are entries. This keeps the merged ``data``
        # file written below (and any other stray file) from being parsed as
        # JSON on a later run.
        filenames = sorted(
            name for name in os.listdir(self.DATA_DIR)
            if name.endswith('.json')
            and not name.startswith('.')
            and os.path.isfile(os.path.join(self.DATA_DIR, name))
        )
        corpus = '. '.join(self._read_entry_text(name) for name in filenames)

        corpus_path = os.path.join(self.DATA_DIR, 'data')
        with open(corpus_path, 'w', encoding='utf-8') as outfile:
            outfile.write(corpus)