-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcalculate_all_difficulties.py
104 lines (76 loc) · 3.42 KB
/
calculate_all_difficulties.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
calculate_all_difficulties.py: applies the difficulty calculating code to every dataset in the current working directory.
In the paper corresponding to this code, we present a measure of difficulty for text classification datasets. In the
directory alongside this code are 90 different datasets which we made use of in varying capacity in the paper. This code
applies the difficulty calculating code to every dataset in the current working directory and writes the results to
"all_difficulties.csv" in the current working directory.
This code requires that the "edm" package be installed to run.
"""
# ======================================================================================================================
#
# CODE SETUP
#
# ======================================================================================================================
# ====>> Python Native Imports <<====
import os
import sys
import csv
# ====>> Own Package Imports <<====
from edm import report
import data_loader
# ====>> Authorship Info <<====
__author__ = ["Ed Collins", "Nikolai Rozanov", "Bingbing Zhang"]
__licence__ = "MIT"
__version__ = "0.0.1"
# ======================================================================================================================
# ======================================================================================================================
#
# FUNCTIONS
#
# ======================================================================================================================
def iterate_datasets(startDir):
    """
    Yield the training CSV path and directory name for every dataset under ``startDir``.

    Each dataset directory must have the same directory structure for this function:
        DATASET_NAME/
        |
        |_ README.md
        |
        |_ eval/
        |  |
        |  |_ DATASET_NAME__TEST.csv
        |  |
        |  |_ DATASET_NAME__DEV.csv (optional)
        |
        |_ training/
           |
           |_ DATASET_NAME__FULL.csv

    Plain files, hidden directories (leading ".") and Python artefacts such as
    ``__pycache__`` (leading "__") are skipped. The training file's existence is
    NOT verified here — callers get the expected path regardless.

    :param startDir : the directory to iterate over datasets from.
    :type startDir  : str
    :returns        : generator of (training dataset path, dataset directory name) tuples.
    """
    # sorted() makes iteration order deterministic; os.listdir order is
    # otherwise arbitrary and platform-dependent.
    for dirname in sorted(os.listdir(startDir)):
        # os.path.join is portable, unlike hand-built "/"-separated paths.
        dirpath = os.path.join(startDir, dirname)
        if not os.path.isdir(dirpath) or dirname.startswith((".", "__")):
            continue
        dsetPath = os.path.join(dirpath, "training", dirname + "__FULL.csv")
        yield dsetPath, dirname
# ======================================================================================================================
if __name__ == '__main__':
    # Open the output file ONCE, outside the loop (the original reopened it per
    # dataset). Append mode is kept so repeated runs accumulate rows, matching
    # the original behaviour. newline="" is required by the csv module to avoid
    # spurious blank rows on Windows.
    with open("all_difficulties.csv", "a", newline="") as f:
        writer = csv.writer(f)
        for dsetPath, dsetName in iterate_datasets(os.getcwd()):
            print("----> Loading {} data...".format(dsetName), end=" ")
            sys.stdout.flush()
            sents, labels = data_loader.load_two_column_csv_data(dsetPath)
            print("Done.")
            difficultyResults = report.get_difficulty_components_dict(sents, labels)
            # NOTE(review): each dict value is assumed to be a sequence whose
            # first element is the component score — confirm against edm.report.
            result = (
                dsetName,
                difficultyResults["Distinct Words : Total Words"][0],
                difficultyResults["Class Imbalance"][0],
                difficultyResults["Class Diversity"][0],
                # The report stores "1 - Min. Hell. Dist."; take the complement
                # so the CSV holds the raw minimum Hellinger distance.
                1 - difficultyResults["1 - Min. Hell. Dist."][0],
                difficultyResults["Mutual Information"][0],
                difficultyResults["Difficulty"][0],
            )
            writer.writerow(result)
            # Flush after each dataset so partial results survive a crash on a
            # later (possibly very large) dataset.
            f.flush()