diff --git a/musif/__main__.py b/musif/__main__.py
index 2bbec352..f0f1cb42 100644
--- a/musif/__main__.py
+++ b/musif/__main__.py
@@ -10,7 +10,6 @@
 from musif.logs import perr, pinfo
 from musif.process.processor import DataProcessor
 
-
 def main(
     *paths,
     output_path: str = "musif_features.csv",
diff --git a/musif/common/exceptions.py b/musif/common/exceptions.py
index c109057e..b55991e8 100644
--- a/musif/common/exceptions.py
+++ b/musif/common/exceptions.py
@@ -6,7 +6,7 @@ def __init__(self, file_path: str):
 
 
 class ParseFileError(Exception):
-    """Exception informing that an file couldn't be parsed with the specified attributes."""
+    """Exception informing that a file couldn't be parsed with the specified attributes."""
 
     def __init__(self, file_path: str):
         super().__init__(f"Parse error with file '{file_path}'")
diff --git a/musif/extract/extract.py b/musif/extract/extract.py
index d96153d6..0ab852c9 100644
--- a/musif/extract/extract.py
+++ b/musif/extract/extract.py
@@ -194,6 +194,10 @@ def find_files(
     else:
         return []
 
+import warnings
+# Suppress all DeprecationWarning messages, particularly for .flat method
+warnings.filterwarnings("ignore", category=DeprecationWarning, module='music21')
+
 # sorted(obj.glob(f"*{extension}"))
 
 class FeaturesExtractor:
@@ -310,12 +314,20 @@ def _process_corpus(
         self, filenames: List[PurePath]
     ) -> Tuple[List[dict], List[dict]]:
         def process_corpus_par(idx, filename):
+            error_files = []
+            errors = []
             try:
                 if self._cfg.window_size is not None:
                     score_features = self._process_score_windows(idx, filename)
                 else:
                     score_features = self._process_score(idx, filename)
             except Exception as e:
+                print(f"Error found on {filename}. Saving the filename and error print to {str(self._cfg.output_dir)}/error_files.csv for later tracking")
+                error_files.append(filename)
+                errors.append(e)
+                df = pd.DataFrame({'ErrorFiles': error_files,
+                                   'Errors': errors})
+                df.to_csv(str(self._cfg.output_dir)+'/error_files.csv', mode='a', index=False)
                 if self._cfg.ignore_errors:
                     lerr(
                         f"Error while extracting features for file {filename}, skipping it because `ignore_errors` is True!"
diff --git a/run_extraction_example.py b/run_extraction_example.py
new file mode 100644
index 00000000..c596f665
--- /dev/null
+++ b/run_extraction_example.py
@@ -0,0 +1,56 @@
+import sys
+
+from musif.extract.features.core.constants import FILE_NAME
+import os
+from pathlib import Path
+
+import pandas as pd
+from feature_extraction.custom_conf import CustomConf
+from musif.extract.extract import FeaturesExtractor
+
+from musif.process.processor import DataProcessor
+
+# MAIN FILE to run extractions of data by Didone Project.
+
+# directory containing xml files
+data_dir = Path("data") / "xml"
+DEST_PATH = "destination_path"
+
+
+# directory containing .pkl files in case of previous extractions for cache
+cache_dir = None
+
+# csv file containing files which raised error and need to be reextracted
+path_error = 'martiser/error_files.csv'
+errored_files = list(pd.read_csv(path_error, low_memory=False)[FILE_NAME])
+
+# In case a partial extraction has been run, set here the previous df to avoid re-extracting these files.
+# prev_path = str(prefix / NAME) + '.csv'
+# exclude_files = list(pd.read_csv('martiser/extractions/total_8_12.csv', low_memory=False)['FileName'])
+
+# In case only some files need to be extracted.
+# xml_files = [filename for filename in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, filename)) and filename.endswith('.xml')]
+# limit_files = xml_files[0:len(xml_files)//4]
+
+extracted_df = FeaturesExtractor(
+    CustomConf("config_extraction_example.yml"),
+    data_dir = str(data_dir),
+    # musescore_dir = Path("data") / "musescore", #only for harmonic analysis
+    # exclude_files = exclude_files,
+    # limit_files = limit_files,
+    cache_dir=cache_dir,
+).extract()
+
+extracted_df.to_csv(str(DEST_PATH)+'.csv', index=False)
+
+# The raw df will be now saved in the DEST_PATH, and now post-processed
+# by the Didone Processor, and ALSO saved in 4 separated csv files.
+p = DataProcessor(str(DEST_PATH) + '.csv', "config_postprocess_example.yml")
+p.process()
+
+p.data = p.data.drop('level_0', axis='columns')
+p.save(str(DEST_PATH))
+final_name = f'{DEST_PATH}'+'_alldata'+'.csv'
+
+# Running tests to ensure features values make sense
+os.system(f'python tests/test_of_test.py {final_name}')