diff --git a/stanza/resources/default_packages.py b/stanza/resources/default_packages.py
index a51d68be6f..b99a724218 100644
--- a/stanza/resources/default_packages.py
+++ b/stanza/resources/default_packages.py
@@ -635,6 +635,17 @@ def build_default_pretrains(default_treebanks):
     # xlm-roberta-base : 89.31
     "hy": "xlm-roberta-base",
 
+    # https://huggingface.co/mideind/IceBERT
+    # IceBERT-large is also available:
+    # https://huggingface.co/mideind/IceBERT-large
+    # Constituency F1 scores:
+    #   No bert (in-order):                   84.40%
+    #   IceBERT (top-down):                   88.66%
+    #   IceBERT (finetuning, top-down):       90.38%
+    #   IceBERT-large (top-down):             88.80%
+    #   IceBERT-large (finetuning, top-down): 90.29%
+    "is": "mideind/IceBERT",
+
     # Indonesian POS experiments: dev set of GSD
     # python3 stanza/utils/training/run_pos.py id_gsd --no_bert
     # python3 stanza/utils/training/run_pos.py id_gsd --bert_model ...
@@ -811,6 +822,10 @@ def build_default_pretrains(default_treebanks):
     # hy
     "xlm-roberta-base": "xlm-roberta-base",
 
+    # is
+    "mideind/IceBERT": "icebert",
+    "mideind/IceBERT-large": "icebert-large",
+
     # id
     "indolem/indobert-base-uncased": "indobert",
     "indobenchmark/indobert-large-p1": "indobenchmark-large-p1",
diff --git a/stanza/utils/datasets/constituency/convert_icepahc.py b/stanza/utils/datasets/constituency/convert_icepahc.py
new file mode 100644
index 0000000000..94c67ff006
--- /dev/null
+++ b/stanza/utils/datasets/constituency/convert_icepahc.py
@@ -0,0 +1,54 @@
+"""
+Converts an IcePaHC .psd file to one-line bracketed trees and splits
+them into train/dev/test sections for the constituency parser.
+"""
+
+from stanza.utils.datasets.constituency import utils
+
+def read_psd_file(input_file):
+    """
+    Read the IcePaHC .psd file
+
+    Returns a list of trees, one string per tree
+    """
+    with open(input_file, encoding='utf-8') as file:
+        lines = file.readlines()
+
+    output_trees = []
+    current_tree = ''
+
+    # Each tree starts at a "(ROOT" line.  Collect the lines of the
+    # current tree, then collapse its whitespace so the whole tree
+    # fits on a single line
+    for line in lines:
+        if line.startswith("(ROOT"):
+            if current_tree:
+                cleaned_tree = ' '.join(current_tree.split())
+                output_trees.append(cleaned_tree)
+            current_tree = line
+        else:
+            current_tree += line
+
+    # Can't forget the last tree
+    if current_tree:
+        cleaned_tree = ' '.join(current_tree.split())
+        output_trees.append(cleaned_tree)
+
+    return output_trees
+
+
+def convert_icepahc_treebank(input_file, train_size=0.8, dev_size=0.1):
+    trees = read_psd_file(input_file)
+
+    print("Read %d trees" % len(trees))
+    train_trees, dev_trees, test_trees = utils.split_treebank(trees, train_size, dev_size)
+    print("Split %d trees into %d train %d dev %d test" % (len(trees), len(train_trees), len(dev_trees), len(test_trees)))
+
+    return train_trees, dev_trees, test_trees
+
+
+def main():
+    convert_icepahc_treebank("simpleicepahc24.psd")
+
+if __name__ == '__main__':
+    main()
diff --git a/stanza/utils/datasets/constituency/prepare_con_dataset.py b/stanza/utils/datasets/constituency/prepare_con_dataset.py
index 532b2f260b..20698f29b8 100644
--- a/stanza/utils/datasets/constituency/prepare_con_dataset.py
+++ b/stanza/utils/datasets/constituency/prepare_con_dataset.py
@@ -171,6 +171,16 @@
 Currently only German is converted, the German version
 being a version of the Tiger Treebank
     python3 -m stanza.utils.datasets.constituency.prepare_con_dataset de_spmrl
+
+is_icepahc
+    The Icelandic Parsed Historical Corpus (IcePaHC), available at:
+    https://clarin.is/en/resources/icepahc/
+    A simplified version of the IcePaHC treebank is used to train the
+    constituency parser; for example, empty phrases (traces and zero
+    subjects) and lemmas have been removed.
+    This version is available at:
+    https://github.com/ingunnjk/IceConParse/tree/main/data
+    python3 -m stanza.utils.datasets.constituency.prepare_con_dataset is_icepahc
 """
 
 import argparse
@@ -198,6 +206,7 @@ from stanza.utils.datasets.constituency.utils import SHARDS, write_dataset
 import stanza.utils.datasets.constituency.vtb_convert as vtb_convert
 import stanza.utils.datasets.constituency.vtb_split as vtb_split
+from stanza.utils.datasets.constituency.convert_icepahc import convert_icepahc_treebank
 
 
 class UnknownDatasetError(ValueError):
     def __init__(self, dataset, text):
@@ -470,6 +479,20 @@ def process_spmrl(paths, dataset_name, *args):
     convert_spmrl(input_directory, output_directory, dataset_name)
 
 
+def process_icepahc(paths, dataset_name, *args):
+    """
+    Processes the Icelandic dataset, IcePaHC
+    """
+    assert dataset_name == 'is_icepahc'
+
+    input_file = os.path.join(paths["CONSTITUENCY_BASE"], "simpleicepahc24.psd")
+    if not os.path.exists(input_file):
+        raise FileNotFoundError("Unable to find the input file for IcePaHC. Expected at {}".format(input_file))
+    output_dir = paths["CONSTITUENCY_DATA_DIR"]
+
+    datasets = convert_icepahc_treebank(input_file)
+    write_dataset(datasets, output_dir, dataset_name)
+
 DATASET_MAPPING = {
     'da_arboretum': process_arboretum,
@@ -495,6 +518,8 @@ def process_spmrl(paths, dataset_name, *args):
     'zh-hans_ctb-51': process_ctb_51,
     'zh-hans_ctb-90': process_ctb_90,
+
+    'is_icepahc': process_icepahc,
 }
 
 
 def main(dataset_name, *args):
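As a quick sanity check of the conversion, here is a minimal sketch of what `read_psd_file` produces. The two-tree `.psd` excerpt is invented for illustration (real IcePaHC trees are much larger); the sketch only assumes the new `convert_icepahc` module from this patch is importable.

```python
# Minimal sanity-check sketch for read_psd_file.  The .psd excerpt below
# is invented for illustration; real IcePaHC trees are much larger.
import os
import tempfile

from stanza.utils.datasets.constituency.convert_icepahc import read_psd_file

SAMPLE = """\
(ROOT (IP-MAT (NP-SBJ (PRO-N Hann))
              (VBDI kom)))
(ROOT (IP-MAT (NP-SBJ (PRO-N Hún))
              (VBDI fór)))
"""

with tempfile.TemporaryDirectory() as tmp:
    sample_file = os.path.join(tmp, "sample.psd")
    with open(sample_file, "w", encoding="utf-8") as fout:
        fout.write(SAMPLE)
    trees = read_psd_file(sample_file)

# Each tree comes back collapsed onto a single line with whitespace normalized
assert len(trees) == 2
assert trees[0] == "(ROOT (IP-MAT (NP-SBJ (PRO-N Hann)) (VBDI kom)))"
assert trees[1] == "(ROOT (IP-MAT (NP-SBJ (PRO-N Hún)) (VBDI fór)))"
```

With `simpleicepahc24.psd` placed under `$CONSTITUENCY_BASE`, the full dataset is then built with `python3 -m stanza.utils.datasets.constituency.prepare_con_dataset is_icepahc`, which splits the trees 80/10/10 into train/dev/test and writes them via `write_dataset`.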