Skip to content

Commit

Permalink
Update prepare_con_dataset.py for the training of an Icelandic consti…
Browse files Browse the repository at this point in the history
…tuency parser

Create convert_icepahc.py

Update default_packages.py

Added a BERT model for Icelandic

Update default_packages.py
  • Loading branch information
ingunnjk committed Apr 26, 2024
1 parent 6e442a6 commit 77d753d
Show file tree
Hide file tree
Showing 3 changed files with 88 additions and 0 deletions.
15 changes: 15 additions & 0 deletions stanza/resources/default_packages.py
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,17 @@ def build_default_pretrains(default_treebanks):
# xlm-roberta-base : 89.31
"hy": "xlm-roberta-base",

# https://huggingface.co/mideind/IceBERT
# IceBERT-large is also available:
# https://huggingface.co/mideind/IceBERT-large
# Constituency F1 scores:
# No bert (in-order): 84.40%
# IceBERT (top-down): 88.66%
# IceBERT (finetuning, top-down): 90.38%
# IceBERT-large (top-down): 88.80%
# IceBERT-large (ft, top-down): 90.29%
"is": "mideind/IceBERT",

# Indonesian POS experiments: dev set of GSD
# python3 stanza/utils/training/run_pos.py id_gsd --no_bert
# python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
Expand Down Expand Up @@ -811,6 +822,10 @@ def build_default_pretrains(default_treebanks):
# hy
"xlm-roberta-base": "xlm-roberta-base",

# is
"mideind/IceBERT": "icebert",
"mideind/IceBERT-large": "icebert-large",

# id
"indolem/indobert-base-uncased": "indobert",
"indobenchmark/indobert-large-p1": "indobenchmark-large-p1",
Expand Down
48 changes: 48 additions & 0 deletions stanza/utils/datasets/constituency/convert_icepahc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
from stanza.utils.datasets.constituency import utils

def read_psd_file(input_file):
    """
    Convert the IcePaHC .psd file to text.

    Each tree starts on a line beginning with "(ROOT" and runs until the
    next such line (or end of file).  All internal whitespace, including
    newlines, is collapsed to single spaces.

    Any preamble lines before the first "(ROOT" are ignored; the original
    version fused them into the first tree.

    Returns a list of one-line tree strings.
    """
    output_trees = []
    current_lines = []

    with open(input_file, encoding='utf-8') as fin:
        for line in fin:
            if line.startswith("(ROOT"):
                # Flush the previous tree, if any, before starting a new one
                if current_lines:
                    output_trees.append(' '.join(''.join(current_lines).split()))
                current_lines = [line]
            elif current_lines:
                # Only accumulate once we are inside a tree; lines before
                # the first "(ROOT" are file preamble, not tree content
                current_lines.append(line)

    # Can't forget the last tree
    if current_lines:
        output_trees.append(' '.join(''.join(current_lines).split()))

    return output_trees


def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
    """
    Read the IcePaHC .psd file and split its trees into train/dev/test.

    train_size and dev_size are fractions of the whole treebank; the
    remainder becomes the test set.  Returns a tuple of three tree lists:
    (train_trees, dev_trees, test_trees).
    """
    all_trees = read_psd_file(input_file)
    print("Read %d trees" % len(all_trees))

    train, dev, test = utils.split_treebank(all_trees, train_size, dev_size)
    print("Split %d trees into %d train %d dev %d test" % (len(all_trees), len(train), len(dev), len(test)))

    return train, dev, test


def main():
    """Convert the default simplified IcePaHC file as a standalone run.

    The converted splits are discarded; this entry point exists as a
    quick smoke test of the conversion (the real pipeline goes through
    prepare_con_dataset.py).
    """
    # The return value was previously bound to an unused local; drop it
    convert_icepahc_treebank("simpleicepahc24.psd")

if __name__ == '__main__':
    main()
25 changes: 25 additions & 0 deletions stanza/utils/datasets/constituency/prepare_con_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,14 @@
Currently only German is converted, the German version being a
version of the Tiger Treebank
python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl
is_icepahc
The Icelandic Parsed Historical Corpus (IcePaHC), available at:
https://clarin.is/en/resources/icepahc/
A simplified/clean version of the IcePaHC treebank is used for the training
of the constituency parser, where for example empty phrases (traces and zero
subjects) and lemmas have been removed. This version is available at:
https://github.com/ingunnjk/IceConParse/tree/main/data
"""

import argparse
Expand Down Expand Up @@ -198,6 +206,7 @@
from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
import stanza.utils.datasets.constituency.vtb_split as vtb_split
from stanza.utils.datasets.constituency.convert_icepahc import convert_icepahc_treebank

class UnknownDatasetError(ValueError):
def __init__(self, dataset, text):
Expand Down Expand Up @@ -470,6 +479,20 @@ def process_spmrl(paths, dataset_name, *args):

convert_spmrl(input_directory, output_directory, dataset_name)

def process_icepahc(paths, dataset_name, *args):
    """
    Processes the Icelandic dataset, IcePaHC

    Expects the simplified treebank file simpleicepahc24.psd under
    CONSTITUENCY_BASE and writes the train/dev/test splits to
    CONSTITUENCY_DATA_DIR.  Raises FileNotFoundError if the input
    file is missing.
    """
    assert dataset_name == 'is_icepahc'

    input_file = os.path.join(paths["CONSTITUENCY_BASE"], "simpleicepahc24.psd")
    if not os.path.exists(input_file):
        raise FileNotFoundError("Unable to find input file for IcePaHC. Expected in {}".format(input_file))

    splits = convert_icepahc_treebank(input_file)
    write_dataset(splits, paths["CONSTITUENCY_DATA_DIR"], dataset_name)

DATASET_MAPPING = {
'da_arboretum': process_arboretum,

Expand All @@ -495,6 +518,8 @@ def process_spmrl(paths, dataset_name, *args):

'zh-hans_ctb-51': process_ctb_51,
'zh-hans_ctb-90': process_ctb_90,

'is_icepahc': process_icepahc,
}

def main(dataset_name, *args):
Expand Down

0 comments on commit 77d753d

Please sign in to comment.