Skip to content

Commit

Permalink
Merge pull request #74 from DIDONEproject/develop
Browse files Browse the repository at this point in the history
Error tracking plus run extraction with martiser
  • Loading branch information
CarlosVaquero authored Oct 2, 2024
2 parents a082247 + 75d6cf1 commit 6f0a434
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 2 deletions.
2 changes: 1 addition & 1 deletion musif/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from musif.logs import perr, pinfo
from musif.process.processor import DataProcessor


def main(
*paths,
output_path: str = "musif_features.csv",
Expand Down Expand Up @@ -144,6 +143,7 @@ def main(
config.basic_modules = ["scoring"]
extract_c.MUSIC21_FILE_EXTENSIONS = extension
raw_df = FeaturesExtractor(config, limit_files=paths).extract()
raw_df = FeaturesExtractor(config, limit_files=paths).extract()

output_path = Path(output_path).with_suffix(".csv")
# raw_df.to_csv(output_path.with_suffix(".raw.csv"), index=False)
Expand Down
2 changes: 1 addition & 1 deletion musif/common/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ def __init__(self, file_path: str):


class ParseFileError(Exception):
"""Exception informing that an file couldn't be parsed with the specified attributes."""
"""Exception informing that a file couldn't be parsed with the specified attributes."""

def __init__(self, file_path: str):
super().__init__(f"Parse error with file '{file_path}'")
Expand Down
12 changes: 12 additions & 0 deletions musif/extract/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ def find_files(
else:
return []

import warnings
# Suppress DeprecationWarning messages attributed to music21 modules (e.g. the deprecated `.flat`)
warnings.filterwarnings("ignore", category=DeprecationWarning, module='music21')


# sorted(obj.glob(f"*{extension}"))
class FeaturesExtractor:
Expand Down Expand Up @@ -310,12 +314,20 @@ def _process_corpus(
self, filenames: List[PurePath]
) -> Tuple[List[dict], List[dict]]:
def process_corpus_par(idx, filename):
error_files = []
errors = []
try:
if self._cfg.window_size is not None:
score_features = self._process_score_windows(idx, filename)
else:
score_features = self._process_score(idx, filename)
except Exception as e:
print(f"Error found on {filename}. Saving the filename and error print to {str(self._cfg.output_dir)}/error_files.csv for latter tracking")
error_files.append(filename)
errors.append(e)
df = pd.DataFrame({'ErrorFiles': error_files,
'Errors': errors})
df.to_csv(str(self._cfg.output_dir)+'/error_files.csv', mode='a', index=False)
if self._cfg.ignore_errors:
lerr(
f"Error while extracting features for file {filename}, skipping it because `ignore_errors` is True!"
Expand Down
56 changes: 56 additions & 0 deletions run_extraction_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Example script: run a musif feature extraction for the Didone Project.

Steps performed, in order:

1. Extract features from the files in ``data_dir`` with ``FeaturesExtractor``.
2. Save the raw dataframe to ``<DEST_PATH>.csv``.
3. Post-process that csv with ``DataProcessor`` and save the results.
4. Run ``tests/test_of_test.py`` on the ``<DEST_PATH>_alldata.csv`` file.
"""
import os  # only needed by the optional file-selection snippet below
import subprocess
import sys
from pathlib import Path

import pandas as pd
from feature_extraction.custom_conf import CustomConf
from musif.extract.extract import FeaturesExtractor
from musif.extract.features.core.constants import FILE_NAME
from musif.process.processor import DataProcessor

# MAIN FILE to run extractions of data by Didone Project.

# Directory containing the xml files.
data_dir = Path("data") / "xml"
DEST_PATH = "destination_path"

# Directory containing .pkl files from previous extractions, used as cache.
cache_dir = None

# csv file listing the files which raised an error and need to be re-extracted.
# The file only exists after a previous run hit errors, so a missing file
# simply means there is nothing to re-extract.
# NOTE(review): the file is read through the FILE_NAME column -- confirm this
# matches the header of the csv you point `path_error` at.
path_error = 'martiser/error_files.csv'
if Path(path_error).is_file():
    errored_files = list(pd.read_csv(path_error, low_memory=False)[FILE_NAME])
else:
    errored_files = []

# In case a partial extraction has been run, set here the previous df to avoid
# re-extracting these files.
# prev_path = str(prefix / NAME) + '.csv'
# exclude_files = list(pd.read_csv('martiser/extractions/total_8_12.csv', low_memory=False)['FileName'])

# In case only some files need to be extracted.
# xml_files = [filename for filename in os.listdir(data_dir) if os.path.isfile(os.path.join(data_dir, filename)) and filename.endswith('.xml')]
# limit_files = xml_files[0:len(xml_files)//4]

extracted_df = FeaturesExtractor(
    CustomConf("config_extraction_example.yml"),
    data_dir=str(data_dir),
    # musescore_dir = Path("data") / "musescore", #only for harmonic analysis
    # exclude_files = exclude_files,
    # limit_files = limit_files,
    cache_dir=cache_dir,
).extract()

raw_csv_path = f"{DEST_PATH}.csv"
extracted_df.to_csv(raw_csv_path, index=False)

# The raw df is now saved in DEST_PATH; next it is post-processed by the
# Didone Processor and ALSO saved in 4 separated csv files.
p = DataProcessor(raw_csv_path, "config_postprocess_example.yml")
p.process()

# DataFrame.drop returns a new frame, so the result must be assigned back;
# errors="ignore" keeps the script running when the column is absent.
p.data = p.data.drop(columns="level_0", errors="ignore")
p.save(str(DEST_PATH))
final_name = f"{DEST_PATH}_alldata.csv"

# Running tests to ensure features values make sense. Use the current
# interpreter and an argument list (no shell) so paths with spaces are safe.
# check=False: a failing test run is reported but does not abort the script.
subprocess.run([sys.executable, "tests/test_of_test.py", final_name], check=False)

0 comments on commit 6f0a434

Please sign in to comment.