forked from Parsely/serpextract
-
Notifications
You must be signed in to change notification settings - Fork 5
/
update_list.py
100 lines (75 loc) · 2.89 KB
/
update_list.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
"""Update the search_engines.pickle list contained within the package.
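Run with ``php`` to refresh the local SearchEngines.php from Piwik, or with
``pickle`` to regenerate serpextract/search_engines.py<N>.pickle from it.
Use this before deploying an update.
"""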
from __future__ import absolute_import, division, print_function

import argparse
import os
import sys
from collections import OrderedDict
from contextlib import closing
from subprocess import Popen, PIPE

try:
    import cPickle as pickle
except ImportError:
    import pickle

try:
    import simplejson as json
except ImportError:
    import json

from six.moves.urllib.request import urlopen
def _here(*paths):
    """Return a path anchored at the directory that contains this script.

    :param paths: zero or more path components appended to that directory.
    :returns: the joined path; with no components, the directory itself.
    """
    script_dir = os.path.dirname(os.path.abspath(__file__))
    return os.path.join(script_dir, *paths)
def array(*args, **kwargs):
    """Emulate PHP's ``array()`` constructor for evaluated PHP definitions.

    :param args: positional items; when any are given they form a list.
    :param kwargs: named items; only used when no positional items are given.
    :returns: ``list(args)`` if positional arguments were passed, otherwise an
        ``OrderedDict`` of the keyword arguments, otherwise an empty list.
    """
    if args:
        # Positional items take precedence; keyword items are ignored here.
        return list(args)
    if kwargs:
        return OrderedDict(kwargs)
    # PHP's ``array()`` is an empty array, not null, so never fall through
    # to an implicit ``None``.
    return []
# (key, value) pairs produced by the most recent parse_php() call.  Kept at
# module level for backward compatibility; parse_php() replaces its contents
# on every call instead of appending to it.
piwik_search_engines = []
# PHP's ``null`` literal, so that evaluated definitions may reference it.
null = None


def _strip_php_line_comment(line):
    """Return *line* without a trailing ``//`` or ``#`` comment.

    Quote state is tracked so that comment markers inside a string literal
    (for example the ``//`` of a URL) are left untouched.
    """
    quote = None
    i = 0
    while i < len(line):
        char = line[i]
        if quote is not None:
            if char == '\\':
                i += 1  # skip the escaped character
            elif char == quote:
                quote = None
        elif char in '\'"':
            quote = char
        elif char == '#' or line.startswith('//', i):
            return line[:i]
        i += 1
    return line


def parse_php(php_script):
    """Parse the text of SearchEngines.php into an ordered mapping.

    The text between the first ``array(`` that follows the first ``if `` and
    the last ``);`` is read line by line.  Comment and blank lines are
    skipped; every other line must look like ``'key' => <value>,``.

    :param php_script: contents of SearchEngines.php as text.
    :returns: ``OrderedDict`` mapping each key to its evaluated value, in the
        order the entries appear.
    :raises ValueError: if the array delimiters cannot be found or an entry
        line contains no ``=>``.
    """
    if_index = php_script.find('if ')
    # Without an ``if `` marker fall back to searching from the start; a
    # negative start offset would search from the end of the text.
    array_index = php_script.find('array(', max(if_index, 0))
    end_index = php_script.rfind(');')
    start_index = array_index + len('array(')
    if array_index == -1 or end_index < start_index:
        raise ValueError('Could not locate the "array(" ... ");" definition block')

    engines = []
    for raw_line in php_script[start_index:end_index].split('\n'):
        line = _strip_php_line_comment(raw_line.strip()).strip()
        # What remains to skip are blank lines and block-comment lines.
        if not line or line.startswith(('/*', '*')):
            continue
        key, separator, value = line.partition('=>')
        if not separator:
            raise ValueError('Expected "key => value" but got: %r' % line)
        key = key.strip().strip("'")
        # NOTE(security): eval() executes the value as Python code with this
        # module's globals (``array`` and ``null`` emulate PHP).  Only run
        # this on a SearchEngines.php that has been reviewed and is trusted.
        # The entry separator is dropped first so that a final entry without
        # a trailing comma is still returned whole.
        value = eval(value.strip().rstrip(','))
        engines.append((key, value))

    # Mirror the result into the module-level list without accumulating
    # entries from earlier calls.
    piwik_search_engines[:] = engines
    return OrderedDict(engines)
def main():
    """Command-line entry point.

    With target ``php`` the Piwik SearchEngines.php is downloaded and saved
    next to this script.  With target ``pickle`` that local file is parsed
    and the result is pickled into
    ``serpextract/search_engines.py<major version>.pickle``.
    """
    parser = argparse.ArgumentParser(description='Update SearchEngines.php or search_engines.*.pickle')
    parser.add_argument('target',
                        choices=['php', 'pickle'],
                        help='php: SearchEngines.php, pickle: search_engines.*.pickle')
    args = parser.parse_args()

    local_filename = _here('SearchEngines.php')

    if args.target == 'php':
        print('Updating search engine parser definitions from piwik (requires PHP).')

        # load from piwik
        piwik_url = 'https://raw.githubusercontent.com/piwik/piwik/2.14.3/core/DataFiles/SearchEngines.php'
        # closing() guarantees the connection is released; on Python 2 the
        # object returned by urlopen() is not itself a context manager.
        with closing(urlopen(piwik_url)) as piwik_f:
            php_script = piwik_f.read()

        # save into local SearchEngines.php
        # read() yields bytes, so write in binary mode; a text-mode handle
        # raises TypeError on Python 3.
        with open(local_filename, 'wb') as local_f:
            local_f.write(php_script)

        print('Saved piwik search engine parser definitions into %s' % local_filename)

    elif args.target == 'pickle':
        with open(local_filename) as local_f:
            php_script = local_f.read()

        piwik_engines = parse_php(php_script)

        # One pickle per Python major version.
        py_version = sys.version_info[0]
        filename = _here('serpextract', 'search_engines.py{}.pickle'.format(py_version))
        with open(filename, 'wb') as pickle_file:
            pickle.dump(piwik_engines, pickle_file)

        print('Saved {} search engine parser definitions into {}.'
              .format(len(piwik_engines), filename))


if __name__ == '__main__':
    main()