Merge pull request #74 from DIDONEproject/develop

Error tracking plus run extraction with martiser
DIDONEproject · Oct 2, 2024 · 6f0a434 · 6f0a434
2 parents a082247 + 75d6cf1
commit 6f0a434
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 2 deletions.
diff --git a/musif/__main__.py b/musif/__main__.py
@@ -10,7 +10,6 @@
 from musif.logs import perr, pinfo
 from musif.process.processor import DataProcessor
 
-
 def main(
     *paths,
     output_path: str = "musif_features.csv",
@@ -144,6 +143,7 @@ def main(
         config.basic_modules = ["scoring"]
     extract_c.MUSIC21_FILE_EXTENSIONS = extension
     raw_df = FeaturesExtractor(config, limit_files=paths).extract()
+    raw_df = FeaturesExtractor(config, limit_files=paths).extract()
 
     output_path = Path(output_path).with_suffix(".csv")
     # raw_df.to_csv(output_path.with_suffix(".raw.csv"), index=False)

diff --git a/musif/common/exceptions.py b/musif/common/exceptions.py
@@ -6,7 +6,7 @@ def __init__(self, file_path: str):
 
 
 class ParseFileError(Exception):
-    """Exception informing that an file couldn't be parsed with the specified attributes."""
+    """Exception informing that a file couldn't be parsed with the specified attributes."""
 
     def __init__(self, file_path: str):
         super().__init__(f"Parse error with file '{file_path}'")

diff --git a/musif/extract/extract.py b/musif/extract/extract.py
@@ -194,6 +194,10 @@ def find_files(
     else:
         return []
 
+import warnings
+# Ignore DeprecationWarning messages attributed to music21 modules only (not every module), particularly for the .flat method
+warnings.filterwarnings("ignore", category=DeprecationWarning, module='music21')
+
 
 #  sorted(obj.glob(f"*{extension}"))
 class FeaturesExtractor:
@@ -310,12 +314,20 @@ def _process_corpus(
         self, filenames: List[PurePath]
     ) -> Tuple[List[dict], List[dict]]:
         def process_corpus_par(idx, filename):
+            error_files = []
+            errors = []
             try:
                 if self._cfg.window_size is not None:
                     score_features = self._process_score_windows(idx, filename)
                 else:
                     score_features = self._process_score(idx, filename)
             except Exception as e:
+                print(f"Error found on {filename}. Saving the filename and error print to {str(self._cfg.output_dir)}/error_files.csv for latter tracking")
+                error_files.append(filename)
+                errors.append(e)
+                df = pd.DataFrame({'ErrorFiles': error_files,
+                                   'Errors': errors})
+                df.to_csv(str(self._cfg.output_dir)+'/error_files.csv', mode='a', index=False)
                 if self._cfg.ignore_errors:
                     lerr(
                         f"Error while extracting features for file {filename}, skipping it because `ignore_errors` is True!"

diff --git a/run_extraction_example.py b/run_extraction_example.py
@@ -0,0 +1,56 @@
+"""MAIN FILE to run extractions of data by Didone Project, then post-process them."""
+import os
+import subprocess
+import sys
+from pathlib import Path
+
+import pandas as pd
+from feature_extraction.custom_conf import CustomConf
+from musif.extract.extract import FeaturesExtractor
+from musif.extract.features.core.constants import FILE_NAME
+from musif.process.processor import DataProcessor
+
+# directory containing xml files
+data_dir = Path("data") / "xml"
+DEST_PATH = "destination_path"
+
+# directory containing .pkl files in case of previous extractions for cache
+cache_dir = None
+
+# csv file listing files which raised an error and need to be re-extracted; may not exist yet.
+# Presumably written by the extractor, which appends an 'ErrorFiles' header row per failure.
+path_error = 'martiser/error_files.csv'
+errored_files = []
+if os.path.isfile(path_error):
+    error_df = pd.read_csv(path_error, low_memory=False)
+    column = 'ErrorFiles' if 'ErrorFiles' in error_df.columns else FILE_NAME
+    errored_files = [name for name in error_df[column] if name != column]
+
+# In case a partial extraction has been run, set here the previous df to avoid re-extracting these files.
+# prev_path = str(prefix / NAME) + '.csv'
+# exclude_files = list(pd.read_csv('martiser/extractions/total_8_12.csv', low_memory=False)['FileName'])
+
+# In case only some files need to be extracted.
+# xml_files = [filename for filename in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, filename)) and filename.endswith('.xml')]
+# limit_files = xml_files[0:len(xml_files)//4]
+
+extracted_df = FeaturesExtractor(
+    CustomConf("config_extraction_example.yml"),
+    data_dir=str(data_dir),
+    # musescore_dir = Path("data") / "musescore", #only for harmonic analysis
+    # exclude_files = exclude_files,
+    # limit_files = limit_files,
+    cache_dir=cache_dir,
+).extract()
+
+extracted_df.to_csv(str(DEST_PATH) + '.csv', index=False)
+# The raw df is now saved in the DEST_PATH, and is now post-processed
+# by the Didone Processor, and ALSO saved in 4 separated csv files.
+p = DataProcessor(str(DEST_PATH) + '.csv', "config_postprocess_example.yml")
+p.process()
+# Drop in place: DataFrame.drop returns a copy, so a bare call is a no-op.
+p.data.drop(columns='level_0', inplace=True, errors='ignore')
+p.save(str(DEST_PATH))
+final_name = f'{DEST_PATH}_alldata.csv'
+# Running tests to ensure features values make sense (same interpreter, no shell)
+subprocess.run([sys.executable, 'tests/test_of_test.py', final_name], check=False)