-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaddRelatedEntries.py
140 lines (105 loc) · 4.6 KB
/
addRelatedEntries.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
"""
This script generates related entries, using three metrics:
* Sharing dependencies
* Sharing keywords
* Sharing topics
These are weighted and used to find entries which are likely similar.
These are then added to the entries to improve site navigation.
"""
import os
import json
from writeFile import writeFile
def addRelatedEntries():
    """Compute the related entries of every entry and hand them to writeFile.

    First three dictionaries are created, each mapping a feature to the
    shortnames of the entries that have it:

        dependencies = {"dependency": [list-of-entries, ...], ...}
        keywords = {"keyword": [list-of-entries, ...], ...}
        topics = {"topic": [list-of-entries, ...], ...}

    A keyword is attributed to an entry when it occurs as a substring of
    the entry's lower-cased abstract; an entry without an abstract
    matches no keywords. Keywords that feature in more than 10 entries
    are dropped.

    The three dictionaries are then folded into pairwise relatedness
    scores (a shared keyword adds 1, a shared dependency 1.5, a shared
    topic 0.5). Pairs scoring 2.5 or less are discarded, and for each
    entry the three highest-scoring remaining entries are passed to
    ``writeFile`` under the key ``"relatedEntries"``. Entries with no
    remaining related entries are not written.
    """
    hugoDir = "hugo/"
    entriesDir = hugoDir + "content/entries/"
    keywordsFile = hugoDir + "themes/afp/static/data/keywords.json"

    # A keyword shared by more entries than this is too generic to count.
    maxEntriesPerKeyword = 10
    # A pair must score strictly above this to be considered related.
    minScore = 2.5

    with open(keywordsFile) as file:
        keywords = {obj["keyword"]: [] for obj in json.load(file)}

    dependencies = {}
    topics = {}
    # os.listdir returns names in arbitrary order. Sorting keeps the
    # order in which entries are indexed, and therefore the tie-breaking
    # between equally related entries, reproducible across runs.
    for entry in sorted(os.listdir(entriesDir)):
        # Drop the last three characters (the ".md" suffix that is
        # re-appended when the result is written below).
        shortname = entry[:-3]
        with open(os.path.join(entriesDir, entry)) as file:
            data = json.load(file)

        for dep in data.get("dependencies", []):
            dependencies.setdefault(dep, []).append(shortname)
        for topic in data.get("topics", []):
            topics.setdefault(topic, []).append(shortname)

        # Lower-case the abstract once per entry, not once per keyword.
        abstract = data.get("abstract", "").lower()
        for keyword, matches in keywords.items():
            if keyword in abstract:
                matches.append(shortname)

    keywords = {
        keyword: matches
        for keyword, matches in keywords.items()
        if len(matches) <= maxEntriesPerKeyword
    }

    relatedEntries = {}
    for dataSet, weight in ((keywords, 1), (dependencies, 1.5), (topics, 0.5)):
        populateRelated(dataSet, relatedEntries, weight)

    for entry, scores in relatedEntries.items():
        strongScores = {
            other: score for other, score in scores.items() if score > minScore
        }
        related = topThree(strongScores)
        if related:
            writeFile(entriesDir + entry + ".md", {"relatedEntries": related})
def populateRelated(dataSet, relatedEntries, modifier=1):
    """Accumulate pairwise relatedness scores into ``relatedEntries``.

    ``dataSet`` maps a feature to the list of entries sharing it. Every
    ordered pair of distinct entries within one such list has
    ``modifier`` added to its score, stored as
    ``relatedEntries[first][second]``. Because each list is walked in
    both roles, the result is symmetric: the score for A->B always
    equals the score for B->A.

    ``relatedEntries`` is updated in place, so successive calls with
    different data sets and modifiers add up; nothing is returned.
    """
    for group in dataSet.values():
        for source in group:
            for target in group:
                if target == source:
                    # An entry is never related to itself.
                    continue
                scores = relatedEntries.setdefault(source, {})
                if target in scores:
                    scores[target] += modifier
                else:
                    scores[target] = modifier
def topThree(dictionary):
    """Return up to three keys of ``dictionary``, highest value first.

    Keys with equal values keep their relative order from the
    dictionary. Fewer than three keys are returned when the dictionary
    has fewer than three items.
    """
    ranked = list(dictionary)
    ranked.sort(key=lambda name: dictionary[name], reverse=True)
    return ranked[:3]
# def dataAppend(data, entry, relatedEntries):
# for related in relatedEntries:
# data.append( entry + "->" + related)
# Generate the related entries only when this file is run as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    addRelatedEntries()