-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindex
executable file
·102 lines (86 loc) · 4.45 KB
/
index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
#!/usr/bin/env python3
import argparse
import subprocess
import os
import json
import psutil
print("Indexing...")

# Keep 512 MiB back for the rest of the system; everything else goes to the JVM.
reserved_memory = 512 * 1024 ** 2
# First field of psutil's virtual-memory tuple, i.e. total physical memory (bytes).
available_memory = psutil.virtual_memory().total
# Remaining memory expressed in whole mebibytes.
usable_memory = (available_memory - reserved_memory) // (1024 * 1024)

# Environment passed to the subprocesses launched later in this script;
# TERRIER_HEAP_MEM carries the computed heap size (e.g. "7680M").
my_env = dict(os.environ)
my_env["TERRIER_HEAP_MEM"] = "%dM" % usable_memory
# Command line: a single required --json argument holding the run description.
# The rest of the script reads its "collections" list and optional "opts" dict.
# Unrecognised arguments are tolerated and collected in `unknown`.
parser = argparse.ArgumentParser()
parser.add_argument("--json", type=json.loads, required=True, help="json with arguments")
args, unknown = parser.parse_known_args()

# Make sure the index output directory exists.
# The mode must be an octal literal: decimal 777 is 0o1411, which yields the
# wrong permission bits. exist_ok avoids the check-then-create race.
os.makedirs("/work/indexes/", mode=0o777, exist_ok=True)
# Stream class named in every robust04 "files.mappings" entry below.
_Z_INPUT_STREAM = "org.apache.commons.compress.compressors.z.ZCompressorInputStream"

# Terrier properties per collection name. The indexing loop turns each
# key/value pair of the selected entry into a -Dkey=value argument.
index_options = {
    "default": {},
    # http://ir.dcs.gla.ac.uk/wiki/Terrier/Disks1%262
    "robust04": {
        "terrier.mvn.coords": "org.apache.commons:commons-compress:1.18",
        # One "<extension>:<stream class>:null" mapping per extension.
        "files.mappings": ",".join(
            f"{extension}:{_Z_INPUT_STREAM}:null"
            for extension in ("z", "0z", "1z", "2z")
        ),
        "TrecDocTags.process": "TEXT,H3,DOCTITLE,HEADLINE,TTL",
    },
    # http://ir.dcs.gla.ac.uk/wiki/Terrier/DOTGOV2
    "gov2": {
        "trec.collection.class": "TRECWebCollection",
        "indexer.meta.forward.keys": "docno,url",
        "indexer.meta.forward.keylens": "26,256",
        "metaindex.compressed.crop.long": "true",
    },
    # http://ir.dcs.gla.ac.uk/wiki/Terrier/ClueWeb09-B
    "cw09b": {
        "trec.collection.class": "WARC018Collection",
        "indexer.meta.forward.keys": "docno,url",
        "indexer.meta.forward.keylens": "26,256",
        "indexer.meta.reverse.keys": "docno",
        "metaindex.compressed.crop.long": "true",
    },
    # http://ir.dcs.gla.ac.uk/wiki/Terrier/ClueWeb12
    "cw12b": {
        "trec.collection.class": "WARC10Collection",
        "indexer.meta.forward.keys": "docno,url",
        "indexer.meta.forward.keylens": "26,256",
        "indexer.meta.reverse.keys": "docno",
        "metaindex.compressed.crop.long": "true",
    },
    "core18": {
        "terrier.mvn.coords": "uk.ac.gla.dcs.terrierteam:terrier-wapo:0.1",
        "trec.collection.class": "uk.ac.gla.terrier.indexing.WAPOCollection",
        "indexer.meta.forward.keylens": "40",
    },
}
def _filter_collection_spec(pattern):
    """Remove lines matching *pattern* from Terrier's collection.spec.

    *pattern* is an extended regular expression matched case-insensitively
    (``egrep -vi``). The file is rewritten via a temporary ``.new`` file.
    Only call this with trusted, constant patterns: the value is interpolated
    into a shell command.
    """
    spec = "/work/terrier-core/etc/collection.spec"
    subprocess.run(
        ["/bin/sh", "-c", f"egrep -vi '{pattern}' {spec} > {spec}.new; mv {spec}.new {spec}"],
        env=my_env,
    )


# Extra Terrier properties supplied by the caller. "opts" is optional, so fall
# back to an empty dict instead of raising KeyError. Work on a copy so the
# parsed arguments are never mutated.
user_opts = dict(args.json.get("opts") or {})
print(user_opts)

# Block indexing is requested with the -b flag rather than as a -D property.
# Decide this once, outside the loop, so every collection is treated the same
# (deleting the key inside the loop only affected the first collection).
block_index = user_opts.get("block.indexing") == "true"
if block_index:
    del user_opts["block.indexing"]
opt_args = [f"-D{key}={value}" for key, value in user_opts.items()]

for collection in args.json["collections"]:
    name, path = collection["name"], collection["path"]
    # Collections without a dedicated profile use the (empty) "default" one.
    collection_props = index_options.get(name, index_options["default"])
    param_args = [f"-D{key}={value}" for key, value in collection_props.items()]
    print(" ".join(opt_args))

    # Argument lists (not split strings) keep paths with whitespace intact.
    # Presumably trec_setup.sh (re)generates etc/collection.spec for `path`.
    subprocess.run(["/work/terrier-core/bin/trec_setup.sh", path], env=my_env)

    # grep out any readmes
    _filter_collection_spec("readme")
    # grep out congressional record
    if name == "robust04":
        _filter_collection_spec("cr93|read|dtd")
    # grep out non-json files
    if name == "core18":
        _filter_collection_spec("md5sum|scripts")

    print("Files to index...")
    subprocess.run(["wc", "-l", "/work/terrier-core/etc/collection.spec"])

    # Build argv directly: -b is inserted explicitly instead of by textual
    # replacement of "-p", which could also rewrite names or option values.
    cmd = ["/work/terrier-core/bin/terrier", "batchindexing"]
    if block_index:
        cmd.append("-b")
    cmd += ["-p", "-I", f"/work/indexes/{name}", *param_args, *opt_args]
    print("Indexing command is " + " ".join(cmd))
    subprocess.run(cmd, env=my_env)

    # Report statistics for the index that was just built.
    subprocess.run(
        ["/work/terrier-core/bin/terrier", "indexstats", "-I", f"/work/indexes/{name}"],
        env=my_env,
    )