From 8b3f6f590e0002065eb943d987acdf3fd3ebb5eb Mon Sep 17 00:00:00 2001 From: bichomartiano Date: Fri, 27 Sep 2024 14:07:40 +0200 Subject: [PATCH 1/5] upload extraction example script --- run_extraction_example.py | 55 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 run_extraction_example.py diff --git a/run_extraction_example.py b/run_extraction_example.py new file mode 100644 index 00000000..378e7b72 --- /dev/null +++ b/run_extraction_example.py @@ -0,0 +1,55 @@ +import sys + +from musif.extract.features.core.constants import FILE_NAME +import os +from pathlib import Path + +import pandas as pd +from feature_extraction.custom_conf import CustomConf +from feature_extraction.processor_didone import DataProcessorDidone +from musif.extract.extract import FeaturesExtractor + +# MAIN FILE to run extractions of data by Didone Project. + +# directory containing xml files +data_dir = Path("data") / "xml" +DEST_PATH = "destination_path" + + +# directory containing .pkl files in case of previous extractions for cache +cache_dir = None + +# csv file containing files which raised error and need to be reextracted +path_error = 'martiser/error_files.csv' +errored_files = list(pd.read_csv(path_error, low_memory=False)[FILE_NAME]) + +# In case a partial extraction has been run, set here the previous df to avoid re-extracting these files. +# prev_path = str(prefix / NAME) + '.csv' +# exclude_files = list(pd.read_csv('martiser/extractions/total_8_12.csv', low_memory=False)['FileName']) + +# In case only some files need to be extracted. +# xml_files = [filename for filename in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, filename)) and filename.endswith('.xml')] +# limit_files = xml_files[0:len(xml_files)//4] + +extracted_df = FeaturesExtractor( + CustomConf("config_extraction_example.yml"), + data_dir = str(data_dir), + # musescore_dir = Path("data") / "musescore", #only for harmonic analysis + # exclude_files = exclude_files, + # limit_files = limit_files, + cache_dir=cache_dir, +).extract() + +extracted_df.to_csv(str(DEST_PATH)+'.csv', index=False) + +# The raw df will be now saved in the DEST_PATH, and now post-processed +# by the Didone Processor, and ALSO saved in 4 separated csv files. +p = DataProcessorDidone(str(DEST_PATH) + '.csv', "config_postprocess_example.yml") +p.process() + +p.data.drop('level_0', axis='columns') +p.save(str(DEST_PATH)) +final_name = f'{DEST_PATH}'+'_alldata'+'.csv' + +# Running tests to ensure features values make sense +os.system(f'python tests/test_of_test.py {final_name}') \ No newline at end of file From 09a811fb595e07fda1c4234ddc43c6b899d96a34 Mon Sep 17 00:00:00 2001 From: bichomartiano Date: Fri, 27 Sep 2024 19:40:45 +0200 Subject: [PATCH 2/5] update script --- run_extraction_example.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/run_extraction_example.py b/run_extraction_example.py index 378e7b72..c596f665 100644 --- a/run_extraction_example.py +++ b/run_extraction_example.py @@ -6,9 +6,10 @@ import pandas as pd from feature_extraction.custom_conf import CustomConf -from feature_extraction.processor_didone import DataProcessorDidone from musif.extract.extract import FeaturesExtractor +from musiF.musif.process.processor import DataProcessor + # MAIN FILE to run extractions of data by Didone Project. # directory containing xml files @@ -44,7 +45,7 @@ # The raw df will be now saved in the DEST_PATH, and now post-processed # by the Didone Processor, and ALSO saved in 4 separated csv files. -p = DataProcessorDidone(str(DEST_PATH) + '.csv', "config_postprocess_example.yml") +p = DataProcessor(str(DEST_PATH) + '.csv', "config_postprocess_example.yml") p.process() p.data.drop('level_0', axis='columns') From 92271b05f3bea083aca9fe8b39a87fc74485bb76 Mon Sep 17 00:00:00 2001 From: carlos Date: Sun, 29 Sep 2024 10:28:34 +0200 Subject: [PATCH 3/5] store error files in a file --- musif/__main__.py | 4 +++- musif/common/exceptions.py | 2 +- musif/extract/extract.py | 5 +++++ 3 files changed, 9 insertions(+), 2 deletions(-) diff --git a/musif/__main__.py b/musif/__main__.py index 2bbec352..493e993b 100644 --- a/musif/__main__.py +++ b/musif/__main__.py @@ -10,6 +10,7 @@ from musif.logs import perr, pinfo from musif.process.processor import DataProcessor +import pandas as pd def main( *paths, @@ -143,7 +144,7 @@ def main( if len(config.basic_modules) == 0: config.basic_modules = ["scoring"] extract_c.MUSIC21_FILE_EXTENSIONS = extension - raw_df = FeaturesExtractor(config, limit_files=paths).extract() + raw_df, error_files = FeaturesExtractor(config, limit_files=paths).extract() output_path = Path(output_path).with_suffix(".csv") # raw_df.to_csv(output_path.with_suffix(".raw.csv"), index=False) @@ -174,6 +175,7 @@ def main( string_cols = processed_df.select_dtypes(include=["object", "string"]).columns processed_df[string_cols] = processed_df[string_cols].replace("", "-") processed_df.to_csv(output_path, index=False) + pd.DataFrame(error_files).to_csv('errors_file.csv', index=False) if __name__ == "__main__": diff --git a/musif/common/exceptions.py b/musif/common/exceptions.py index c109057e..b55991e8 100644 --- a/musif/common/exceptions.py +++ b/musif/common/exceptions.py @@ -6,7 +6,7 @@ def __init__(self, file_path: str): class ParseFileError(Exception): - """Exception informing that an file couldn't be parsed with the specified attributes.""" + """Exception informing that a file couldn't be parsed with the specified attributes.""" def __init__(self, file_path: str): super().__init__(f"Parse error with file '{file_path}'") diff --git a/musif/extract/extract.py b/musif/extract/extract.py index d96153d6..b2fae376 100644 --- a/musif/extract/extract.py +++ b/musif/extract/extract.py @@ -310,12 +310,17 @@ def _process_corpus( self, filenames: List[PurePath] ) -> Tuple[List[dict], List[dict]]: def process_corpus_par(idx, filename): + error_files = [] try: if self._cfg.window_size is not None: score_features = self._process_score_windows(idx, filename) else: score_features = self._process_score(idx, filename) except Exception as e: + print('error!') + error_files.append(filename) + df = pd.DataFrame(error_files, columns=['ErrorFiles']) + df.to_csv(str(self._cfg.output_dir)+'/error_files.csv', mode='a', index=False) if self._cfg.ignore_errors: lerr( f"Error while extracting features for file {filename}, skipping it because `ignore_errors` is True!" From 5b083c235e89b2a6addac9a17efd58e094a07b0f Mon Sep 17 00:00:00 2001 From: carlos Date: Mon, 30 Sep 2024 20:49:16 +0200 Subject: [PATCH 4/5] FeaturesExtractor returning one variable --- musif/__main__.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/musif/__main__.py b/musif/__main__.py index 493e993b..94f6ec06 100644 --- a/musif/__main__.py +++ b/musif/__main__.py @@ -144,7 +144,7 @@ def main( if len(config.basic_modules) == 0: config.basic_modules = ["scoring"] extract_c.MUSIC21_FILE_EXTENSIONS = extension - raw_df, error_files = FeaturesExtractor(config, limit_files=paths).extract() + raw_df = FeaturesExtractor(config, limit_files=paths).extract() output_path = Path(output_path).with_suffix(".csv") # raw_df.to_csv(output_path.with_suffix(".raw.csv"), index=False) @@ -175,8 +175,6 @@ def main( string_cols = processed_df.select_dtypes(include=["object", "string"]).columns processed_df[string_cols] = processed_df[string_cols].replace("", "-") processed_df.to_csv(output_path, index=False) - pd.DataFrame(error_files).to_csv('errors_file.csv', index=False) - if __name__ == "__main__": import fire From 8856fbd5f203d143cba716500a3e572b2b2bc3cd Mon Sep 17 00:00:00 2001 From: carlos vaquero Date: Tue, 1 Oct 2024 12:25:37 +0200 Subject: [PATCH 5/5] removed unnecessary variables and saving --- musif/__main__.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/musif/__main__.py b/musif/__main__.py index 493e993b..c9fb7a94 100644 --- a/musif/__main__.py +++ b/musif/__main__.py @@ -10,8 +10,6 @@ from musif.logs import perr, pinfo from musif.process.processor import DataProcessor -import pandas as pd - def main( *paths, output_path: str = "musif_features.csv", @@ -144,7 +142,7 @@ def main( if len(config.basic_modules) == 0: config.basic_modules = ["scoring"] extract_c.MUSIC21_FILE_EXTENSIONS = extension - raw_df, error_files = FeaturesExtractor(config, limit_files=paths).extract() + raw_df = FeaturesExtractor(config, limit_files=paths).extract() output_path = Path(output_path).with_suffix(".csv") # raw_df.to_csv(output_path.with_suffix(".raw.csv"), index=False) @@ -175,7 +173,6 @@ def main( string_cols = processed_df.select_dtypes(include=["object", "string"]).columns processed_df[string_cols] = processed_df[string_cols].replace("", "-") processed_df.to_csv(output_path, index=False) - pd.DataFrame(error_files).to_csv('errors_file.csv', index=False) if __name__ == "__main__":