This repository has been archived by the owner on Jan 23, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 13
/
utils.py
256 lines (209 loc) · 8.91 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
"""
Perform actions related to typosquatting.
These are the important misfits. They don't fit in elsewhere but these
functions need to be in a module somewhere.
"""
import collections
import datetime
import glob
import json
import os
import sys
from time import gmtime, localtime, strftime, time
from mrs_spellings import MrsWord
from termcolor import colored
import constants
from filters import distance_calculations, homophone_attack_screen, order_attack_screen
from scrapers import get_metadata
# Module-level alias for the project-wide setting in ``constants``; used as
# the default ``max_distance`` (maximum edit distance) argument of
# ``create_suspicious_package_dict`` below.
MAX_DISTANCE = constants.MAX_DISTANCE
def compare_metadata(pkg1, pkg2):
    """Retrieve and compare metadata of two PyPI packages.

    Determine whether the package metadata has no identical fields
    (i.e. no risk) or has at least one identical field (i.e. some risk).
    This function operates on the theory that typosquatting packages
    sometimes, perhaps often, borrow package metadata of the original
    package in order to trick unsuspecting users.

    A field only counts as identical when it is actually populated:
    empty strings and missing values (``None``) are ignored, so two
    packages that both leave a field blank are not flagged.

    Args:
        pkg1 (str): name of first package to compare
        pkg2 (str): name of second package to compare

    Returns:
        str: a value of "no_risk" or "some_risk"
    """
    # Retrieve metadata for both packages
    pkg1_metadata = get_metadata(pkg1)
    pkg2_metadata = get_metadata(pkg2)
    pkg1_info = pkg1_metadata["info"]
    pkg2_info = pkg2_metadata["info"]

    # TODO: Decide if I should use any other fields?
    # NOTE(review): "package_url" presumably embeds the package name and so
    # can never match for two differently named packages -- confirm.
    fields_to_compare = (
        "author_email",
        "author",
        "package_url",
        "description",
        "home_page",
        "summary",
    )

    for field in fields_to_compare:
        pkg1_value = pkg1_info[field]
        # A blank field carries no signal. Testing truthiness (rather than
        # only `== ""`) also skips None, which would otherwise compare
        # equal to another None and be reported as shared metadata.
        if not pkg1_value:
            continue
        # One identical, populated field is enough to flag the pair.
        if pkg1_value == pkg2_info[field]:
            return "some_risk"
    return "no_risk"
def create_suspicious_package_dict(
    all_packages, top_packages, max_distance=MAX_DISTANCE
):
    """Examine all top packages for typosquatters.

    Loop through all top packages and check for instances of
    typosquatting. Three screens are applied to every top package:
    misspelling attacks (``distance_calculations``), confusion attacks
    (``order_attack_screen``) and homophone attacks
    (``homophone_attack_screen``). The results of all three screens are
    merged into one list per top package.

    Args:
        all_packages (list): all package names
        top_packages (list): package names to perform comparison
        max_distance (int): maximum edit distance to check for typosquatting

    Returns:
        dict: top packages (key) and potential typosquatters (value)
    """
    suspicious_packages = collections.OrderedDict()
    for top_package in top_packages:
        # Check for misspelling attacks
        close_packages = distance_calculations(top_package, all_packages, max_distance)

        # Check for confusion attacks, then homophone attacks. Previously the
        # homophone result was computed but never added to the output.
        for screen in (order_attack_screen, homophone_attack_screen):
            matches = screen(top_package, all_packages)
            # Only merge when the screen actually found something
            if not matches:
                continue
            for name in matches:
                # A package caught by several screens is listed only once
                if name not in close_packages:
                    close_packages.append(name)

        suspicious_packages[top_package] = close_packages
    return suspicious_packages
def store_squatting_candidates(squat_candidates):
    """Persist results of squatting candidate search.

    Dump typosquatter candidate list to a json file. Store
    with time-stamped file name (local time) in the ``results`` folder,
    which is created relative to the current working directory if it
    does not exist yet.

    Args:
        squat_candidates (dict): top packages and potential typosquatters
    """
    results_folder = "results"
    # Without this, the very first run on a fresh checkout fails in open()
    os.makedirs(results_folder, exist_ok=True)

    timestamp = strftime("%d-%b-%Y-%H-%M-%S", localtime())
    full_file_name = timestamp + "-record" + ".json"
    file_name = os.path.join(results_folder, full_file_name)
    with open(file_name, "w", encoding="utf-8") as output_file:
        json.dump(squat_candidates, output_file)
def create_potential_squatter_names(module_name):
    """Build the set of candidate typosquatting names for a module.

    The candidates come from ``MrsWord(module_name).qwerty_swap()``,
    i.e. they are based on qwerty distance, a measure of how close keys
    are to each other on the keyboard (described by the original author
    as more sophisticated than levenshtein distance for this purpose).

    Args:
        module_name (str): a name for a module

    Returns:
        set: potential typosquatting names
    """
    swapped_names = MrsWord(module_name).qwerty_swap()
    # Joining on a space and splitting again breaks any candidate that
    # itself contains a space into separate names before de-duplicating.
    return set(" ".join(swapped_names).split(" "))
def store_recent_scan_results(packages, folder="package_lists"):
    """Store results of scanning packages recently added to PyPI.

    Save timestamped version of JSON file to allow analysis of packages
    recently added to PyPI. The timestamp in the file name is in UTC
    (``gmtime``). The target folder is created if it does not exist.

    Args:
        packages (list): Packages on PyPI
        folder (str): Folder in which to store JSON file
    """
    timestamp = strftime("%Y-%m-%d-%H-%M-%S", gmtime())
    filename = "pypi-package-list-" + timestamp + ".json"

    # An empty folder means "current directory"; makedirs("") would raise.
    if folder:
        os.makedirs(folder, exist_ok=True)

    # Platform-independent path joining
    path = os.path.join(folder, filename)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(packages, f, ensure_ascii=False, indent=4)
def load_most_recent_packages(folder="package_lists"):
    """Load the most recent package list from at least 24 hours ago.

    Load the JSON file containing PyPI packages with the most recent
    timestamp that was created at least 24 hours ago. The creation time
    is read from the file name, whose last six hyphen-separated parts
    are expected to be ``YYYY-MM-DD-HH-MM-SS``; it is compared against
    the current unix time, so the name is interpreted as UTC (matching
    the ``gmtime`` stamp written by ``store_recent_scan_results``).
    JSON files whose names do not end in such a timestamp are ignored.

    Args:
        folder (str): Folder in which to check for file

    Returns:
        package_set (set): Packages loaded from JSON file

    Raises:
        FileNotFoundError: if no timestamped JSON file in ``folder`` is
            at least one day old.
    """
    DAY_IN_SECONDS = 60 * 60 * 24
    cutoff = time() - DAY_IN_SECONDS
    unix_epoch = datetime.datetime(1970, 1, 1)

    newest_file_older_than_1day = ""
    newest_timestamp = None
    # glob returns files in arbitrary order, so every candidate must be
    # inspected to find the newest one that is old enough.
    for file_path in glob.glob(os.path.join(folder, "*.json")):
        # Parse only the file name so hyphens in the folder path are harmless
        stem = os.path.splitext(os.path.basename(file_path))[0]
        try:
            yr, mon, day, hr, minute, sec = (
                int(part) for part in stem.split("-")[-6:]
            )
            file_dt = datetime.datetime(yr, mon, day, hr, minute, sec)
        except ValueError:
            # Not a timestamped package list (wrong part count, non-numeric
            # part or impossible date); skip it rather than abort the search.
            continue

        # Seconds since the epoch, computed without any local-time conversion
        file_timestamp = (file_dt - unix_epoch).total_seconds()
        if file_timestamp > cutoff:
            continue  # younger than one day
        if newest_timestamp is None or file_timestamp > newest_timestamp:
            newest_timestamp = file_timestamp
            newest_file_older_than_1day = file_path

    # Check for existence of file and, if it exists, load it
    if not newest_file_older_than_1day:
        raise FileNotFoundError("No json files older than one day found.")
    # Files are written as UTF-8 with ensure_ascii=False, so read them the same way
    with open(newest_file_older_than_1day, "r", encoding="utf-8") as f:
        package_set = set(json.load(f))
    return package_set
def print_suspicious_packages(packages):
    """Pretty print a suspicious package list.

    Each package is printed on its own line followed by a bracketed,
    quoted list of its potential typosquatters. A typosquatter whose
    metadata check returns "some_risk" is printed in red; all others use
    the normal ink color. Summary counts are printed before and after.

    Args:
        packages (dict): (key) package and (value) potential typosquatters
    """
    print("Number of packages to examine: " + str(len(packages)))
    total_squatters = 0

    for pkg, squatters in packages.items():
        print(pkg, ": ", end="")
        squatter_count = len(squatters)
        total_squatters += squatter_count

        # No potential typosquatters: print the null set and move on
        if squatter_count == 0:
            print("[]")
            continue

        print("[", end="")
        final_position = squatter_count - 1
        for position, squatter in enumerate(squatters):
            # Color accommodates the decision to highlight packages that
            # share at least some metadata with the original package
            flagged = compare_metadata(pkg, squatter) == "some_risk"
            shown = colored(squatter, "red") if flagged else squatter
            # The final entry gets no trailing separator
            closing = "'" if position == final_position else "', "
            print("'", shown, closing, sep="", end="")
        print("]")

    print("Number of potential typosquatters: " + str(total_squatters))