Skip to content

Commit e85642e

Browse files
authored
Create convert_icepahc.py
1 parent a9593d5 commit e85642e

File tree

1 file changed

+48
-0
lines changed

1 file changed

+48
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
from stanza.utils.datasets.constituency import utils
2+
3+
def read_psd_file(input_file):
    """
    Convert the IcePaHC .psd file to text

    A tree starts on a line beginning with "(ROOT" and continues over the
    following lines until the next such line or the end of the file.
    Each tree is collapsed onto a single line, with every run of
    whitespace (including newlines) replaced by one space.

    input_file: path of the .psd file, read as UTF-8

    Returns a list of sentences, one string per tree
    """
    output_trees = []
    current_lines = []

    def finish_tree():
        # split() with no arguments discards all whitespace, so the
        # rejoined text has no leading, trailing, or repeated spaces
        cleaned_tree = ' '.join(''.join(current_lines).split())
        # Blank lines before the first tree (or a blank file) would
        # otherwise be emitted as an empty "tree"
        if cleaned_tree:
            output_trees.append(cleaned_tree)
        current_lines.clear()

    with open(input_file, encoding='utf-8') as file:
        # Add the trees as parsed sentences to the output_trees list
        for line in file:
            if line.startswith("(ROOT"):
                finish_tree()
            current_lines.append(line)

    # Can't forget the last tree
    finish_tree()

    return output_trees
31+
32+
33+
def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
    """
    Read the trees from an IcePaHC .psd file and split them into three sets

    The trees are read with read_psd_file and then handed to
    utils.split_treebank along with train_size and dev_size.
    NOTE(review): presumably these are the fractions of trees assigned
    to train and dev, with the remainder going to test - confirm in utils

    Returns a tuple of (train_trees, dev_trees, test_trees)
    """
    all_trees = read_psd_file(input_file)
    print("Read %d trees" % len(all_trees))

    splits = utils.split_treebank(all_trees, train_size, dev_size)
    train_trees, dev_trees, test_trees = splits

    # Report the size of each split next to the overall total
    split_counts = (len(all_trees), len(train_trees), len(dev_trees), len(test_trees))
    print("Split %d trees into %d train %d dev %d test" % split_counts)

    return train_trees, dev_trees, test_trees
42+
43+
44+
def main():
    """
    Convert the treebank in simpleicepahc24.psd

    The path is relative, so the file is looked up in the current
    working directory.  The returned splits are not used any further here.
    """
    input_file = "simpleicepahc24.psd"
    treebank = convert_icepahc_treebank(input_file)


# Run the conversion only when executed as a script, not on import
if __name__ == '__main__':
    main()

0 commit comments

Comments
 (0)