diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 8dfcfcbcfc..f772a642e1 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -30,6 +30,7 @@ jobs: WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} WML_APIKEY: ${{ secrets.WML_APIKEY }} GENAI_KEY: ${{ secrets.GENAI_KEY }} + SKIP_HEAVY_LOCAL: "True" steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml index 2407bdf1ba..014926cfe4 100644 --- a/.github/workflows/inference_tests.yml +++ b/.github/workflows/inference_tests.yml @@ -32,6 +32,8 @@ jobs: WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} # Similar to WML_PROJECT_ID WX_API_KEY: ${{ secrets.WML_APIKEY }} # Similar to WML_APIKEY GENAI_KEY: ${{ secrets.GENAI_KEY }} + SKIP_HEAVY_LOCAL: "True" + steps: - uses: actions/checkout@v4 diff --git a/examples/evaluate_speech_recognition.py b/examples/evaluate_speech_recognition.py new file mode 100644 index 0000000000..1abaaa9362 --- /dev/null +++ b/examples/evaluate_speech_recognition.py @@ -0,0 +1,67 @@ +# this python script shows an example of running speech recognition evaluation for Granite Speech using the Hugging Face ESB datasets and the CommonVoice datasets + +import os + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + CrossProviderInferenceEngine, + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +USE_RITS = False # whether to use RITS service +USE_WML = False # whether to use WML service + +test_dataset = load_dataset( + # select (uncomment) only one of the following cards (datasets) + # for evaluating a benchmark with multiple cards - see evaluate_speech_recognition_benchmark.py in the same directory (examples) + card="cards.esb.ami", + # card="cards.esb.voxpopuli", + # card="cards.esb.librispeech", + # card="cards.esb.spgispeech", + # card="cards.esb.earnings22", + # card="cards.esb.tedlium", + # card="cards.commonvoice.en" + # card="cards.commonvoice.de" + # card="cards.commonvoice.fr" + # card="cards.commonvoice.es" + # card="cards.commonvoice.pt" + split="test", + format="formats.chat_api", + max_test_instances=5, # to tun limited part of the test set + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if os.environ.get("SKIP_HEAVY_LOCAL", False): + exit() + +if not USE_RITS and not USE_WML: + # locally running the model, it needs GPU to run properly + model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + revision="granite-speech-3.3.2-2b", + max_new_tokens=200, + ) +if USE_RITS: + # using the RITS remote service for inferencing + model = CrossProviderInferenceEngine( + model="granite-speech-3-3-8b", # in RITS only the 8b version of Granite Speech is available + provider="rits", + # provider_specific_args={"rits": {"max_new_tokens": 200}}, + max_new_tokens=200, + ) +if USE_WML: + # using the WML remote service for inferencing + # code to be completed + model = None + + +predictions = model(test_dataset) +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/examples/evaluate_speech_recognition_benchmark.py b/examples/evaluate_speech_recognition_benchmark.py new file mode 100644 index 0000000000..1e4808daac --- /dev/null +++ b/examples/evaluate_speech_recognition_benchmark.py @@ -0,0 +1,45 @@ +# this python script shows an example of running speech recognition benchmark evaluation for Granite Speech +# using the Hugging Face ESB datasets (English) and the multilingial CommonVoice datasets + +# to run on a single test set use subset=... below; the list of subsets is: +# voxpopuli, ami, librispeech, spgispeech, tedlium, earnings22, +# commonvoice_en, commonvoice_de, commonvoice_es, commonvoice_fr, commonvoice_pt + +import os + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +dataset = load_dataset( + "benchmarks.speech_recognition", + max_samples_per_subset=5, # while this is commented out, the entire test set is used + # subset="ami", #to tun only a single dataset + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant" + ), + split="test", +) + + +if os.environ.get("SKIP_HEAVY_LOCAL", False): + exit() + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-2b", # two options for Granite Speech 3.3: 2b and 8b + revision="granite-speech-3.3.2-2b", + max_new_tokens=200, +) + +predictions = model(dataset) +results = evaluate( + predictions=predictions, data=dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) + +print("Subsets scores:") +print(results.subsets_scores.summary) diff --git a/examples/evaluate_speech_translation_benchmark.py b/examples/evaluate_speech_translation_benchmark.py new file mode 100644 index 0000000000..7c2deb4d68 --- /dev/null +++ b/examples/evaluate_speech_translation_benchmark.py @@ -0,0 +1,38 @@ +# this python script shows an example of running speech translation benchmark evaluation for Granite Speech +# using the Fleurs and Covost2 datasets + +# to run on a single test set use subset=... 
below; the list of subsets is: +# fleurs_en_de, fleurs_en_es, fleurs_en_fr, fleurs_en_it, fleurs_en_ja, fleurs_en_pt, fleurs_en_pt, +# covost2_en_de, covost2_en_ja, covost2_de_en, covost2_es_en, covost2_fr_en, covost2_pt_en + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +dataset = load_dataset( + "benchmarks.speech_translation", + # max_samples_per_subset=100, # while this is commented out, the entire test set is used + # subset="fleurs_en_fr", #to run only a single test set + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant" + ), + split="test", +) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(dataset) +results = evaluate( + predictions=predictions, data=dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) + +print("Subsets scores:") +print(results.subsets_scores.summary) diff --git a/examples/evaluate_speech_translation_covost2.py b/examples/evaluate_speech_translation_covost2.py new file mode 100644 index 0000000000..775e0b3df1 --- /dev/null +++ b/examples/evaluate_speech_translation_covost2.py @@ -0,0 +1,58 @@ +# this python script shows an example of running speech translation evaluation for Granite Speech + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +debug = True # True for extra printing, set to False when commenting out max_test_instances below +max_test_instances = 8 + +# the available calanguages for the covost2 dataset dataset, are: +# translation from English to target language: +# de German +# ja Japanese +# translation from source language to English: +# de German +# es Spanish +# fr French +# pt Portuguese +test_dataset = load_dataset( # select (un-comment) one of the test sets below + card="cards.covost2.from_en.en_de", + # card="cards.covost2.from_en.en_ja", + # card="cards.covost2.to_en.de_en", + # card="cards.covost2.to_en.es_en", + # card="cards.covost2.to_en.fr_en", + # card="cards.covost2.to_en.pt_en", + split="test", + format="formats.chat_api", + max_test_instances=max_test_instances, # comment out for running the entire test + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if debug: + print(">>>>>>>>>>>>>> first test references >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>> references {idx}: ", test_dataset["references"][idx]) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(test_dataset) + +if debug: # print translation reference texts for debug and inspection + print(">>>>>>>>>>>>>> first predictions >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>>>>>>> {idx}: ", predictions[idx]) + +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/examples/evaluate_speech_translation_fleurs.py b/examples/evaluate_speech_translation_fleurs.py new file mode 100644 index 0000000000..574ec6c2eb --- /dev/null +++ b/examples/evaluate_speech_translation_fleurs.py @@ -0,0 +1,58 @@ +# this python script shows an example of running speech translation evaluation for Granite Speech + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +debug = False # True for extra printing, set to False when commenting out max_test_instances below +max_test_instances = 20 + +# the available cards for the fleurs dataset, reflecting the target language, are: +# de_de German +# es_419 Spanish, South America +# fr_fr French +# it_it Italian +# ja_jp Japanese +# pt_br Portuguese, Brazil +# cmn_hans_cn Chinese, Mandarin +test_dataset = load_dataset( # select (un-comment) one of the test sets below + # card="cards.fleurs.en_us.de_de", + # card="cards.fleurs.en_us.es_419", + # card="cards.fleurs.en_us.fr_fr", + # card="cards.fleurs.en_us.it_it", + # card="cards.fleurs.en_us.pt_br", + card="cards.fleurs.en_us.ja_jp", + # card="cards.fleurs.en_us.cmn_hans_cn", + split="test", + format="formats.chat_api", + # max_test_instances=max_test_instances, # comment out for running the entire test + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if debug: + print(">>>>>>>>>>>>>> test references >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>> references {idx}: ", test_dataset["references"][idx]) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(test_dataset) + +if debug: # print translation reference texts for debug and inspection + print(">>>>>>>>>>>>>> model predictions >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>>>>>>> {idx}: ", predictions[idx]) + +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/prepare/benchmarks/speech_recognition_english.py b/prepare/benchmarks/speech_recognition_english.py new file mode 100644 index 0000000000..114adf48f8 --- /dev/null +++ b/prepare/benchmarks/speech_recognition_english.py @@ -0,0 +1,38 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +benchmark = Benchmark( + subsets={ + "voxpopuli": DatasetRecipe( + card="cards.esb.voxpopuli", + format="formats.chat_api", + ), + "ami": DatasetRecipe( + card="cards.esb.ami", + format="formats.chat_api", + ), + "librispeech": DatasetRecipe( + card="cards.esb.librispeech", + format="formats.chat_api", + ), + "spgispeech": DatasetRecipe( + card="cards.esb.spgispeech", + format="formats.chat_api", + ), + "tedlium": DatasetRecipe( + card="cards.esb.tedlium", + format="formats.chat_api", + ), + "earnings22": DatasetRecipe( + card="cards.esb.earnings22", + format="formats.chat_api", + ), + "commonvoice": DatasetRecipe( + card="cards.esb.commonvoice", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_recognition", overwrite=True) diff --git a/prepare/benchmarks/speech_recognition_multilingual.py b/prepare/benchmarks/speech_recognition_multilingual.py new file mode 100644 index 0000000000..e0857e6839 --- /dev/null +++ b/prepare/benchmarks/speech_recognition_multilingual.py @@ -0,0 +1,26 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +benchmark = Benchmark( + subsets={ + "commonvoice_de": DatasetRecipe( + card="cards.commonvoice.de", + format="formats.chat_api", + ), + "commonvoice_es": DatasetRecipe( + card="cards.commonvoice.es", + format="formats.chat_api", + ), + "commonvoice_fr": DatasetRecipe( + card="cards.commonvoice.fr", + format="formats.chat_api", + ), + "commonvoice_pt": DatasetRecipe( + card="cards.commonvoice.pt", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_recognition_multilingual", overwrite=True) diff --git a/prepare/benchmarks/speech_translation.py b/prepare/benchmarks/speech_translation.py new file mode 100644 index 0000000000..3fd68e47c8 --- /dev/null +++ b/prepare/benchmarks/speech_translation.py @@ -0,0 +1,65 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +# running benchmarks with fleurs dataset, en-->xx +# running benchmarks with covost2 dataset, en-->xx +# running benchmarks with covost2 dataset, xx-->en +benchmark = Benchmark( + subsets={ + "fleurs_en_de": DatasetRecipe( + card="cards.fleurs.en_us.de_de", + format="formats.chat_api", + ), + 
"fleurs_en_es": DatasetRecipe( + card="cards.fleurs.en_us.es_419", + format="formats.chat_api", + ), + "fleurs_en_fr": DatasetRecipe( + card="cards.fleurs.en_us.fr_fr", + format="formats.chat_api", + ), + "fleurs_en_it": DatasetRecipe( + card="cards.fleurs.en_us.it_it", + format="formats.chat_api", + ), + "fleurs_en_ja": DatasetRecipe( + card="cards.fleurs.en_us.ja_jp", + format="formats.chat_api", + ), + "fleurs_en_pt": DatasetRecipe( + card="cards.fleurs.en_us.pt_br", + format="formats.chat_api", + ), + "fleurs_en_zh": DatasetRecipe( + card="cards.fleurs.en_us.cmn_hans_cn", + format="formats.chat_api", + ), + "covost2_en_de": DatasetRecipe( + card="cards.covost2.from_en.en_de", + format="formats.chat_api", + ), + "covost2_en_ja": DatasetRecipe( + card="cards.covost2.from_en.en_ja", + format="formats.chat_api", + ), + "covost2_de_en": DatasetRecipe( + card="cards.covost2.to_en.de_en", + format="formats.chat_api", + ), + "covost2_es_en": DatasetRecipe( + card="cards.covost2.to_en.es_en", + format="formats.chat_api", + ), + "covost2_fr_en": DatasetRecipe( + card="cards.covost2.to_en.fr_en", + format="formats.chat_api", + ), + "covost2_pt_en": DatasetRecipe( + card="cards.covost2.to_en.pt_en", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_translation", overwrite=True) diff --git a/prepare/cards/ami.py b/prepare/cards/ami.py new file mode 100644 index 0000000000..ada958eb3e --- /dev/null +++ b/prepare/cards/ami.py @@ -0,0 +1,42 @@ +from unitxt.audio_operators import ToAudio +from unitxt.card import TaskCard +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadHF +from unitxt.test_utils.card import test_card + +for subset in ["ihm", "sdm"]: + card = TaskCard( + loader=LoadHF( + path="edinburghcstr/ami", + data_dir=subset, + revision="refs/convert/parquet", + splits=["train", "validation", "test"], + data_classification_policy=["public"], + streaming=True, + ), + preprocess_steps=[ + ToAudio(field="audio"), + ], + task="tasks.speech_recognition", + templates=[ + "templates.speech_recognition.default", + ], + __tags__={ + "license": "cc-by-4.0", + "language": "en", + "task_categories": ["automatic-speech-recognition"], + "size_categories": ["10K Audio: + return { + "audio": value, + } + + +def audio_to_base64(audio_data: Audio): + """Convert a HuggingFace Audio instance to a base64-encoded WAV string. 
+ + Args: + audio_data (dict): The Audio instance from HuggingFace datasets + Contains 'array', 'sampling_rate', and 'path' keys + + Returns: + str: Base64-encoded WAV audio + """ + import base64 + from io import BytesIO + + import soundfile as sf + + # Create a BytesIO buffer to hold the WAV data + buffer = BytesIO() + audio = audio_data["audio"] + # Write the audio array to the buffer in WAV format + sf.write(buffer, audio["array"], audio["sampling_rate"], format="wav") + + # Get the bytes from the buffer + wav_bytes = buffer.getvalue() + + # Encode to base64 + return base64.b64encode(wav_bytes).decode("utf-8") + + +def base64_to_audio(base64_string, sampling_rate: Optional[int] = None): + import base64 + + from datasets import Audio + + audio_bytes = base64.b64decode(base64_string) + audio_feature = Audio(sampling_rate=sampling_rate) + return audio_feature.decode_example({"bytes": audio_bytes, "path": None}) diff --git a/src/unitxt/catalog/benchmarks/speech_recognition.json b/src/unitxt/catalog/benchmarks/speech_recognition.json new file mode 100644 index 0000000000..9a98348ca4 --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_recognition.json @@ -0,0 +1,40 @@ +{ + "__type__": "benchmark", + "subsets": { + "voxpopuli": { + "__type__": "dataset_recipe", + "card": "cards.esb.voxpopuli", + "format": "formats.chat_api" + }, + "ami": { + "__type__": "dataset_recipe", + "card": "cards.esb.ami", + "format": "formats.chat_api" + }, + "librispeech": { + "__type__": "dataset_recipe", + "card": "cards.esb.librispeech", + "format": "formats.chat_api" + }, + "spgispeech": { + "__type__": "dataset_recipe", + "card": "cards.esb.spgispeech", + "format": "formats.chat_api" + }, + "tedlium": { + "__type__": "dataset_recipe", + "card": "cards.esb.tedlium", + "format": "formats.chat_api" + }, + "earnings22": { + "__type__": "dataset_recipe", + "card": "cards.esb.earnings22", + "format": "formats.chat_api" + }, + "commonvoice": { + "__type__": "dataset_recipe", + "card": "cards.esb.commonvoice", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json b/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json new file mode 100644 index 0000000000..22d01d9d6a --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json @@ -0,0 +1,25 @@ +{ + "__type__": "benchmark", + "subsets": { + "commonvoice_de": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.de", + "format": "formats.chat_api" + }, + "commonvoice_es": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.es", + "format": "formats.chat_api" + }, + "commonvoice_fr": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.fr", + "format": "formats.chat_api" + }, + "commonvoice_pt": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.pt", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/benchmarks/speech_translation.json b/src/unitxt/catalog/benchmarks/speech_translation.json new file mode 100644 index 0000000000..922e1a4f49 --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_translation.json @@ -0,0 +1,70 @@ +{ + "__type__": "benchmark", + "subsets": { + "fleurs_en_de": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.de_de", + "format": "formats.chat_api" + }, + "fleurs_en_es": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.es_419", + "format": "formats.chat_api" + }, + "fleurs_en_fr": { + "__type__": "dataset_recipe", + "card": 
"cards.fleurs.en_us.fr_fr", + "format": "formats.chat_api" + }, + "fleurs_en_it": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.it_it", + "format": "formats.chat_api" + }, + "fleurs_en_ja": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.ja_jp", + "format": "formats.chat_api" + }, + "fleurs_en_pt": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.pt_br", + "format": "formats.chat_api" + }, + "fleurs_en_zh": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.cmn_hans_cn", + "format": "formats.chat_api" + }, + "covost2_en_de": { + "__type__": "dataset_recipe", + "card": "cards.covost2.from_en.en_de", + "format": "formats.chat_api" + }, + "covost2_en_ja": { + "__type__": "dataset_recipe", + "card": "cards.covost2.from_en.en_ja", + "format": "formats.chat_api" + }, + "covost2_de_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.de_en", + "format": "formats.chat_api" + }, + "covost2_es_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.es_en", + "format": "formats.chat_api" + }, + "covost2_fr_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.fr_en", + "format": "formats.chat_api" + }, + "covost2_pt_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.pt_en", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/cards/ami/ihm.json b/src/unitxt/catalog/cards/ami/ihm.json new file mode 100644 index 0000000000..6afd19f0a7 --- /dev/null +++ b/src/unitxt/catalog/cards/ami/ihm.json @@ -0,0 +1,45 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "edinburghcstr/ami", + "data_dir": "ihm", + "revision": "refs/convert/parquet", + "splits": [ + "train", + "validation", + "test" + ], + "data_classification_policy": [ + "public" + ], + "streaming": true + }, + "preprocess_steps": [ + { + "__type__": "to_audio", + "field": "audio" + } + ], + "task": "tasks.speech_recognition", + "templates": [ + "templates.speech_recognition.default" + ], + "__tags__": { + "license": "cc-by-4.0", + "language": "en", + "task_categories": [ + "automatic-speech-recognition" + ], + "size_categories": [ + "10K Union[str, List[Content]]: - # Regular expression to find tags with src attribute - img_tag_pattern = re.compile( - r"<" + f"{constants.image_tag}" + r'\s+[^>]*src=["\']([^"\']+)["\'][^>]*>', + image_tag = constants.image_tag + audio_tag = constants.audio_tag + + # Unified regex for both tags + tag_pattern = re.compile( + rf"<(?P{re.escape(image_tag)}|{re.escape(audio_tag)})\s+[^>]*src=[\"'](?P[^\"']+)[\"'][^>]*>", re.IGNORECASE, ) - # Find all matches of tags and their positions - matches = list(img_tag_pattern.finditer(text)) + matches = list(tag_pattern.finditer(text)) - # If no images are found, return the text as a plain string if not matches: return text contents: List[dict] = [] last_pos = 0 - # Process each match for match in matches: start, end = match.span() - img_url = match.group(1) + tag = match.group("tag").lower() + src = match.group("src") - # Add preceding text, if any + # Add preceding text if last_pos < start: contents.append({"type": "text", "text": text[last_pos:start]}) - # Add image content with a default detail level - if img_url.startswith("media/"): - image = dict_get(media, img_url[6:]) - data_url = image_to_data_url(image) + is_local = src.startswith("media/") + media_key = src[6:] if is_local else None + + if tag == image_tag: + if is_local: + image = dict_get(media, media_key) + data_url = 
image_to_data_url(image) + else: + data_url = src contents.append( { "type": "image_url", "image_url": {"url": data_url, "detail": "low"}, } ) - else: + + elif tag == audio_tag: + if is_local: + audio = dict_get(media, media_key) + data_url = audio_to_base64(audio) + else: + data_url = src contents.append( { - "type": "image_url", - "image_url": {"url": img_url, "detail": "low"}, + "type": "input_audio", + "input_audio": {"data": data_url, "format": "wav"}, } ) - # Update the last processed position last_pos = end - # Add any remaining text after the last image + # Add any trailing text if last_pos < len(text): contents.append({"type": "text", "text": text[last_pos:]}) @@ -512,6 +534,7 @@ def _format_instance_to_source( turns, ) media["images"] = [] + media["audios"] = [] return chat diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 87488d1da7..13169dbf3d 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -36,6 +36,7 @@ from tqdm.asyncio import tqdm_asyncio from .artifact import Artifact +from .audio_operators import base64_to_audio from .base_metric import Metric from .dataclass import InternalField, NonPositionalField from .deprecation_utils import deprecation @@ -69,6 +70,7 @@ class StandardAPIParamsMixin(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -610,6 +612,12 @@ def get_logprobs( logprobs: List[List[Dict[str, Any]]] = [] + tokenizer = ( + self.processor.tokenizer + if hasattr(self.processor, "tokenizer") + else self.processor + ) + for sample_no, sample_scores in enumerate(transition_scores.detach().cpu()): sample_logprobs: List[Dict[str, Any]] = [] @@ -620,7 +628,7 @@ def get_logprobs( "logprob": float(score.cpu()), "top_tokens": [ { - "text": self.processor.decode(idx), + "text": tokenizer.decode(idx), "logprob": float( predictions.scores[n][sample_no][idx].cpu() ), @@ -1042,6 +1050,161 @@ def _infer_log_probs( return self._infer_fn(dataset, return_meta_data, True) +class HFGraniteSpeechInferenceEngine(HFInferenceEngineBase): + revision: str = None + lazy_load: bool = True + label: str = "hf_granite_speech" + audio_token: str = "<|audio|>" + sampling_rate: int = 16000 + + _requirements_list = ["torchaudio"] + + def compute_transition_scores( + self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int] + ) -> Sequence: + if not hasattr(self.model.config, "vocab_size"): + try: + self.model.config.vocab_size = self.model.vocab_size + except: + self.model.config.vocab_size = self.model.config.text_config.vocab_size + + return super().compute_transition_scores(sequences, scores, beam_indices) + + def _init_processor(self): + from transformers import AutoProcessor + + self.processor = AutoProcessor.from_pretrained(self.model_name) + + if not self.pad_token_id and hasattr(self.processor, "eos_token_id"): + self.pad_token_id = self.processor.eos_token_id + + def _init_model(self): + from transformers import AutoModelForSpeechSeq2Seq + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.model_name, + revision=self.revision, + torch_dtype=self._get_torch_dtype(), + low_cpu_mem_usage=self.low_cpu_mem_usage, + device_map=self.device_map, + ) + if self.device_map is None: + self.model.to(self.device) + + def _get_input(self, instance): + if isinstance(instance["source"], list): + # Chat API format - 
extract audio from content + audios = [] + chat = [] + for turn in instance["source"]: + if isinstance(turn["content"], list): + turn_content = "" + for content in turn["content"]: + if content["type"] == "input_audio": + audios.append( + base64_to_audio( + content["input_audio"]["data"], + sampling_rate=self.sampling_rate, + ) + ) + turn_content += self.audio_token + elif content["type"] == "text": + turn_content += content["text"] + else: + raise ValueError( + f"Unsupported content type:{content['type']}" + ) + turn = {"role": turn["role"], "content": turn_content} + + chat.append(turn) + + if len(audios) > 1: + raise ValueError(f"Unsupported number of audio contents:{len(audios)}") + + audio = audios[0] + + return chat, audio + raise ValueError("Supports only chat api.") + + def prepare_inputs(self, instance) -> Mapping: + chat, audio = self._get_input(instance) + + text = self.processor.tokenizer.apply_chat_template( + chat, tokenize=False, add_generation_prompt=True + ) + + inputs: Mapping = self.processor( + text=[text], audio=audio["array"], return_tensors="pt" + ).to(self.device or self.device_map, self._get_torch_dtype()) + + return inputs + + def _infer_fn( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool, + return_logprobs: bool, + ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]: + results = [] + + for instance in tqdm(dataset): + processed_inputs = self.prepare_inputs(instance) + input_len = len(processed_inputs["input_ids"][0]) + + predictions = self.make_predictions(processed_inputs) + + sequences = predictions.sequences + + output_tokens = sequences[:, input_len:] + + output_tokens_strings = [] + for tokens in output_tokens: + output_tokens_strings.append( + [ + self.processor.tokenizer.decode(token, skip_special_tokens=True) + for token in tokens + ] + ) + + output_strings = [] + for tokens in output_tokens: + output_strings.append( + self.processor.tokenizer.decode(tokens, skip_special_tokens=True) + ) + + if return_logprobs: + final_outputs = self.get_logprobs(predictions, output_tokens_strings) + else: + final_outputs = output_strings + + results.append( + self.get_return_object( + output=final_outputs[0], + generated_text=output_strings[0], + output_tokens=len(output_tokens_strings[0]), + inp=instance["source"], + inp_tokens=None, + return_meta_data=return_meta_data, + ) + ) + + return results + + def _infer( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + return self._infer_fn(dataset, return_meta_data, False) + + def _infer_log_probs( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool = False, + ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]: + return self._infer_fn(dataset, return_meta_data, True) + + class HFPeftInferenceEngine(HFAutoModelInferenceEngine): label: str = "hf_peft_auto_model" @@ -1571,6 +1734,7 @@ class OpenAiInferenceEngineParamsMixin(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -1588,6 +1752,7 @@ class OpenAiInferenceEngineParams(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None 
stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -1911,6 +2076,7 @@ def _get_model_name_for_endpoint(cls, model_name: str): class TogetherAiInferenceEngineParamsMixin(Artifact): max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None stop: Optional[List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None @@ -2071,6 +2237,7 @@ class WMLChatParamsMixin(Artifact): response_format: Optional[Dict[str, Any]] = None temperature: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None time_limit: Optional[int] = None top_p: Optional[float] = None n: Optional[int] = None @@ -3028,6 +3195,7 @@ class VLLMParamsMixin(Artifact): bad_words: Optional[List[str]] = None ignore_eos: bool = False max_tokens: Optional[int] = 16 + max_new_tokens: Optional[int] = None min_tokens: int = 0 logprobs: Optional[int] = None prompt_logprobs: Optional[int] = None @@ -3282,6 +3450,7 @@ class CrossProviderInferenceEngine( "granite-guardian-3-1-8b": "ibm/granite-guardian-3-8b", "granite-guardian-3-2-5b": "ibm/granite-guardian-3-2-5b", "granite-vision-3-2-2b": "ibm/granite-vision-3-2-2b", + "granite-speech-3-3-8b": "ibm-granite/granite-speech-3.3-8b", "llama-3-1-8b-instruct": "meta-llama/llama-3-1-8b-instruct", "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct", "llama-3-1-405b-instruct": "meta-llama/llama-3-405b-instruct", @@ -3342,6 +3511,7 @@ class CrossProviderInferenceEngine( "granite-guardian-3-2-3b": "ibm-granite/granite-guardian-3.2-3b-a800m", "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b", "granite-guardian-3-3-8b": "ibm-granite/granite-guardian-3.3-8b", + "granite-speech-3-3-8b": "ibm-granite/granite-speech-3.3-8b", "llama-3-1-8b-instruct": "meta-llama/Llama-3.1-8B-Instruct", "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct", "llama-3-1-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8", diff --git a/src/unitxt/metric.py b/src/unitxt/metric.py index 822340fbcd..67ad255a0e 100644 --- a/src/unitxt/metric.py +++ b/src/unitxt/metric.py @@ -4,6 +4,7 @@ from .api import __file__ as _ from .artifact import __file__ as _ +from .audio_operators import __file__ as _ from .augmentors import __file__ as _ from .base_metric import __file__ as _ from .benchmark import __file__ as _ diff --git a/src/unitxt/metric_utils.py b/src/unitxt/metric_utils.py index 3bfc41094e..31aebce81d 100644 --- a/src/unitxt/metric_utils.py +++ b/src/unitxt/metric_utils.py @@ -4,13 +4,15 @@ from collections import defaultdict from functools import lru_cache from statistics import mean -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Set import pandas as pd +from datasets import Dataset as HfDataset from datasets import Features, Value +from datasets import IterableDataset as HfIterableDataset -from .dataclass import Dataclass -from .error_utils import Documentation, UnitxtError, error_context +from .dataclass import Dataclass, Field +from .error_utils import Documentation, UnitxtError from .operator import ( InstanceOperator, MultiStreamOperator, @@ -22,8 +24,8 @@ ApplyMetric, ApplyOperatorsField, ArtifactFetcherMixin, - FlattenInstances, RecursiveCopy, + RemoveFields, Rename, ) from .register import _reset_env_local_catalogs, register_all_artifacts @@ -145,6 +147,19 @@ def group_str(json_str): return ",".join(f"{k}:{v}" for k, v in data.items()) +@lru_cache(maxsize=None) +def subset_stream_name(stream_name, subset, 
subset_depth): + return ( + stream_name + DEFAULT_STREAM_SUBSET_SEPARATOR + "/".join(subset[:subset_depth]) + ) + + +@lru_cache(maxsize=None) +def subset_group_stream_name(stream_name, subset, subset_depth, group): + subset_name = subset_stream_name(stream_name, subset, subset_depth) + return subset_name + "?" + group_str(group) + + class SplitSubsetsAndGroups(MultiStreamOperator): """Splits a MultiStream that is small - for metrics, hence: whole stream can sit in memory, split by the value of field 'group'. @@ -157,35 +172,69 @@ class SplitSubsetsAndGroups(MultiStreamOperator): subsets_depth specifies the depth of the prefix by which to split the stream. """ - subsets_field: str = "subset" - groups_field: str = "groups" + subset_groups: Dict[str, Set[str]] subset_depth: Optional[int] = None - def process(self, multi_stream: MultiStream) -> MultiStream: - result = defaultdict(list) - - for stream_name, stream in multi_stream.items(): - for i, instance in enumerate(stream): - instance["__idx__"] = i - - for field in [self.subsets_field, self.groups_field]: - if field not in instance: - raise ValueError( - f"Field {field} is missing from instance {instance}" - ) - - subset_stream_name = ( - stream_name - + DEFAULT_STREAM_SUBSET_SEPARATOR - + "/".join(instance[self.subsets_field][: self.subset_depth]) + def get_new_streams(self, stream_name): + new_streams = [] + for subset, groups in self.subset_groups.items(): + new_streams.append( + subset_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, ) + ) + for group in groups: + new_streams.append( + subset_group_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, + group=group, + ) + ) + return new_streams - result[subset_stream_name].append(instance) + def is_instance_included(self, instance, stream_name, new_stream_name) -> bool: + subset = tuple(instance.get("subset")) - for group in instance[self.groups_field]: - result[subset_stream_name + "?" 
+ group_str(group)].append(instance) + if new_stream_name == subset_stream_name( + stream_name=stream_name, subset=subset, subset_depth=self.subset_depth + ): + return True - return MultiStream.from_iterables(result, copying=True) + for group in instance.get("groups"): + if new_stream_name == subset_group_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, + group=group, + ): + return True + + return False + + def filter_stream(self, stream, stream_name, new_stream_name): + for i, instance in enumerate(stream): + instance["__idx__"] = i + if self.is_instance_included(instance, stream_name, new_stream_name): + yield instance + + def process(self, multi_stream: MultiStream) -> MultiStream: + streams = {} + + for stream_name, stream in multi_stream.items(): + for new_stream_name in self.get_new_streams(stream_name): + streams[new_stream_name] = DynamicStream( + generator=self.filter_stream, + gen_kwargs={ + "stream": stream, + "stream_name": stream_name, + "new_stream_name": new_stream_name, + }, + ) + return MultiStream(streams) @lru_cache(maxsize=None) @@ -235,7 +284,15 @@ def process(self, multi_stream: MultiStream) -> MultiStream: idx = instance.pop("__idx__") if idx not in instances[origin]: - instances[origin][idx] = instance + instances[origin][idx] = {"score": instance["score"]} + if "processed_prediction" in instance: + instances[origin][idx]["processed_prediction"] = instance[ + "processed_prediction" + ] + if "processed_references" in instance: + instances[origin][idx]["processed_references"] = instance[ + "processed_references" + ] # from here below setting the global scores from that stream # can be done with first instance only @@ -340,36 +397,59 @@ def _inference_post_process( return [instance["processed_prediction"] for instance in multi_stream[split_name]] -class MetricRecipe(SequentialOperatorInitializer): +class PreProcessForEvaluation(SequentialOperatorInitializer): + def prepare(self): + register_all_artifacts() + self.steps = [ + FromPredictionsAndOriginalData(), + LoadJson(field="task_data"), + RecursiveCopy( + field="source", + to_field="task_data/source", + ), + ] + + +class PostProcessAfterEvaluation(SequentialOperator): + steps = [ + Rename( + field="raw_prediction", + to_field="prediction", + not_exist_ok=True, + not_exist_do_nothing=True, + ), + Rename( + field="raw_references", + to_field="references", + not_exist_ok=True, + not_exist_do_nothing=True, + ), + RecursiveCopy( + field="source", + to_field="task_data/source", + ), + RemoveFields(fields=["__idx__"], not_exist_ok=True), + ] + + +class MainEvaluationPipeline(SequentialOperator): calc_confidence_intervals: bool = True subset_depth: int = 2 + subset_groups: Dict[str, Set[str]] = Field(default_factory=dict) def prepare(self): register_all_artifacts() self.steps = [ - FromPredictionsAndOriginalData(), - LoadJson(field="task_data"), _post_process_steps, SplitSubsetsAndGroups( subset_depth=self.subset_depth, + subset_groups=self.subset_groups, ), ApplyMetric( "metrics", calc_confidence_intervals=self.calc_confidence_intervals, ), JoinSubsetsAndGroups(), - Rename( - field="raw_prediction", - to_field="prediction", - ), - Rename( - field="raw_references", - to_field="references", - ), - RecursiveCopy( - field="source", - to_field="task_data/source", - ), ] @@ -436,7 +516,7 @@ def __repr__(self): @property def summary(self): - df = self.to_df().round(2).fillna("") + df = self.to_df().round(4).fillna("") df = df.sort_index() df = df.drop("num_of_instances", axis=0) df = 
df.reset_index() @@ -744,16 +824,139 @@ def __repr__(self): class EvaluationResults(list): - def __init__(self, *args, metadata=None, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, stream, metadata=None): + """Initialize EvaluationResults with lazy evaluation. + + Args: + stream: An iterator or generator + metadata: Optional metadata dictionary + """ + super().__init__() + self._generator = iter(stream) + self._realized = False self.metadata = metadata if metadata is not None else {} + def _realize_up_to(self, index): + """Realize elements from generator up to the given index.""" + if self._realized: + return + + current_len = super().__len__() + + # For negative indices, we need to realize everything + if index < 0: + self._realize_all() + return + + # Calculate how many more elements we need + needed = index + 1 - current_len + + # Realize the needed elements + try: + for _ in range(needed): + item = next(self._generator) + super().append(item) + except StopIteration: + self._realized = True + self._generator = None + + def _realize_all(self): + """Realize all remaining elements from the generator.""" + if self._realized: + return + + try: + while True: + item = next(self._generator) + super().append(item) + except StopIteration: + self._realized = True + self._generator = None + + def __getitem__(self, index): + if isinstance(index, slice): + # For slices, we need to realize up to the maximum index + start, stop, step = index.indices(len(self)) + if stop > 0: + self._realize_up_to(stop - 1) + else: + self._realize_all() + else: + self._realize_up_to(index) + return super().__getitem__(index) + + def __len__(self): + # If not fully realized, we need to realize everything to know the length + if not self._realized: + self._realize_all() + return super().__len__() + + def __iter__(self): + # Yield already realized elements + for i in range(super().__len__()): + yield super().__getitem__(i) + + # Continue with unrealized elements + if self._generator is not None: + try: + while True: + item = next(self._generator) + super().append(item) + yield item + except StopIteration: + self._realized = True + self._generator = None + + def append(self, item): + self._realize_all() + super().append(item) + + def extend(self, iterable): + self._realize_all() + super().extend(iterable) + + def insert(self, index, item): + self._realize_up_to(index) + super().insert(index, item) + + def remove(self, value): + self._realize_all() + super().remove(value) + + def pop(self, index=-1): + self._realize_up_to(index) + return super().pop(index) + + def index(self, value, start=0, stop=None): + if stop is None: + self._realize_all() + else: + self._realize_up_to(stop) + return super().index(value, start, stop) + + def count(self, value): + self._realize_all() + return super().count(value) + + def sort(self, key=None, reverse=False): + self._realize_all() + super().sort(key=key, reverse=reverse) + + def reverse(self): + self._realize_all() + super().reverse() + + def clear(self): + super().clear() + self._generator = None + self._realized = True + @property def global_scores(self): return GlobalScores(self[0]["score"]["global"]) @property - def instance_scores(self) -> InstanceScores: + def instance_scores(self): return InstanceScores(self) @property @@ -775,6 +978,35 @@ def subsets_scores(self): return SubsetsScores(self[0]["score"]["subsets"]) +def extract_subset_groups(dataset): + # Column-wise access types + subset_groups = {} + if not isinstance(dataset, (HfDataset, HfIterableDataset, 
pd.DataFrame)): + dataset = { + "subset": (item.get("subset", []) for item in dataset), + "groups": (item.get("groups", []) for item in dataset), + } + + for subset, groups in zip(dataset["subset"], dataset["groups"]): + if len(subset) == 0: + subset = "" + else: + subset = tuple(subset) + + if subset not in subset_groups: + subset_groups[subset] = set() + for group in groups: + subset_groups[subset].add(group) + + return subset_groups + + +def merge_evaluation_results(results, dataset): + for result, instance in zip(results, dataset): + instance.update(result) + yield instance + + def _compute( predictions: List[Any], references: Iterable, @@ -784,19 +1016,34 @@ def _compute( ): _reset_env_local_catalogs() register_all_artifacts() - recipe = MetricRecipe(calc_confidence_intervals=calc_confidence_intervals) - with error_context(stage="Metric Processing"): - multi_stream = recipe( - predictions=predictions, references=references, split_name=split_name - ) + if not isinstance(references, (HfDataset, HfIterableDataset, pd.DataFrame, list)): + raise ValueError(f"Unsupported data type: {type(references)}") - if flatten: - operator = FlattenInstances() - multi_stream = operator(multi_stream) + subset_groups = extract_subset_groups(references) + + preprocess = PreProcessForEvaluation() - stream = multi_stream[split_name] - return EvaluationResults(stream) + preprocess_multi_stream = preprocess( + predictions=predictions, references=references, split_name=split_name + ) + + evaluate = MainEvaluationPipeline( + calc_confidence_intervals=calc_confidence_intervals, subset_groups=subset_groups + ) + + results_multi_stream = evaluate(preprocess_multi_stream) + + post_process = PostProcessAfterEvaluation() + + data_multi_stream = post_process(preprocess_multi_stream) + + return EvaluationResults( + merge_evaluation_results( + results_multi_stream[split_name], + data_multi_stream[split_name], + ) + ) """ diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 7e93751d4b..552b3fb006 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3469,6 +3469,37 @@ def compute( return {self.main_score: result} +class WerFast(MapReduceMetric[Tuple[float, float], float]): + """Computes mean squared error between predictions and references. + + Range: [0, ∞) (lower is better) + Measures average squared differences between predicted and true values. + """ + + main_score = "wer" + prediction_type = str + single_reference_per_prediction = True + _requirements_list = ["jiwer>=3.0.0"] # added process_words function + + def prepare(self): + super().prepare() + import jiwer + + self._metric = jiwer.process_words + + def map( + self, prediction: str, references: List[str], task_data: Dict[str, Any] + ) -> Tuple[float, float]: + measures = self._metric(references[0], prediction) + incorrect = measures.substitutions + measures.deletions + measures.insertions + total = measures.substitutions + measures.deletions + measures.hits + return incorrect, total + + def reduce(self, intermediates: List[float]) -> Dict[str, Any]: + incorrect, total = map(sum, zip(*intermediates)) + return {self.main_score: incorrect / total if total > 0 else np.nan} + + class MeanSquaredError(MapReduceMetric[float, float]): """Computes mean squared error between predictions and references. 
@@ -3537,6 +3568,16 @@ def reduce(self, intermediates: List[Tuple[float, float]]) -> Dict[str, Any]: score, p_value = self.get_correlation_func()(list_a, list_b) + try: + score = float(score) + except: + pass + + try: + p_value = float(p_value) + except: + pass + return { self.main_score: score, f"{self.main_score}_p_value": p_value, diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 410b940df1..a6256bb176 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -366,15 +366,21 @@ class RemoveFields(InstanceOperator): Args: fields (List[str]): The fields to remove from each instance. + not_exist_ok (bool): If True, do not raise an error if a field does not exist. Defaults to False. """ fields: List[str] + not_exist_ok: bool = False def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None ) -> Dict[str, Any]: for field_name in self.fields: - del instance[field_name] + try: + del instance[field_name] + except: + if not self.not_exist_ok: + raise return instance @@ -608,7 +614,12 @@ def process( if (not is_subpath(from_field, to_field)) and ( not is_subpath(to_field, from_field) ): - dict_delete(res, from_field, remove_empty_ancestors=True) + dict_delete( + res, + from_field, + remove_empty_ancestors=True, + not_exist_ok=self.not_exist_ok, + ) return res diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py index 6f13e10a33..87aaeb0f2e 100644 --- a/src/unitxt/processors.py +++ b/src/unitxt/processors.py @@ -566,3 +566,35 @@ def process_value(self, text: Any) -> Any: class ExtractVerbalJudgementBadGood(ExtractVerbalJudgment): classes = ["very bad", "bad", "mediocre", "good", "very good"] + + +class NormalizeTextWithWhisper(FieldOperator): + """A processor that uses uses whisper english normalizer.""" + + _requirements_list = ["transformers"] + + def prepare(self): + super().prepare() + from transformers import WhisperTokenizer + + self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base") + self._normalize = self.tokenizer.normalize + + def process_value(self, value: str) -> str: + return self._normalize(value) + + +class NormalizeTextBasicWithWhisper(FieldOperator): + """A processor that uses uses whisper multilingual normalizer.""" + + _requirements_list = ["transformers"] + + def prepare(self): + super().prepare() + from transformers import WhisperTokenizer + + self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base") + self._normalize = self.tokenizer.basic_normalize + + def process_value(self, value: str) -> str: + return self._normalize(value) diff --git a/src/unitxt/schema.py b/src/unitxt/schema.py index 5926b120cc..fe72f22852 100644 --- a/src/unitxt/schema.py +++ b/src/unitxt/schema.py @@ -130,6 +130,9 @@ def _prepare_media(self, instance): if isoftype(instance["media"]["images"][i], Image): instance["media"]["images"][i] = instance["media"]["images"][i]["image"] + for i in range(len(instance["media"]["audios"])): + instance["media"]["audios"][i] = instance["media"]["audios"][i]["audio"] + return instance def _get_instance_task_data( diff --git a/src/unitxt/serializers.py b/src/unitxt/serializers.py index 84a0258ba0..0f5e0c3f05 100644 --- a/src/unitxt/serializers.py +++ b/src/unitxt/serializers.py @@ -9,6 +9,7 @@ from .settings_utils import get_constants from .type_utils import isoftype, to_type_string from .types import ( + Audio, Conversation, Dialog, Document, @@ -151,6 +152,20 @@ def serialize(self, value: Image, instance: Dict[str, Any]) -> str: return f'<{constants.image_tag} 
src="media/images/{idx}">' +class AudioSerializer(SingleTypeSerializer): + serialized_type = Audio + + def serialize(self, value: Audio, instance: Dict[str, Any]) -> str: + if "media" not in instance: + instance["media"] = {} + if "audios" not in instance["media"]: + instance["media"]["audios"] = [] + idx = len(instance["media"]["audios"]) + instance["media"]["audios"].append({"audio": value["audio"]}) + value["audio"] = f"media/audios/{idx}" + return f'<{constants.audio_tag} src="media/audios/{idx}">' + + class VideoSerializer(ImageSerializer): serialized_type = Video @@ -205,6 +220,7 @@ class MultiTypeSerializer(Serializer): ToolCallSerializer(), DialogSerializer(), MultiDocumentSerializer(), + AudioSerializer(), ImageSerializer(), VideoSerializer(), TableSerializer(), diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index b7e07be99d..ed4c9a996d 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -261,6 +261,7 @@ def __getattr__(self, key): constants.inference_stream = "__INFERENCE_STREAM__" constants.instance_stream = "__INSTANCE_STREAM__" constants.image_tag = "unitxt-img" + constants.audio_tag = "unitxt-audio" constants.demos_pool_field = "_demos_pool_" constants.demos_field = "demos" constants.instruction_field = "instruction" diff --git a/src/unitxt/stream_operators.py b/src/unitxt/stream_operators.py index b769a5d3b0..c675891cf1 100644 --- a/src/unitxt/stream_operators.py +++ b/src/unitxt/stream_operators.py @@ -127,6 +127,99 @@ def process(self, multi_stream: MultiStream) -> MultiStream: return multi_stream +class JoinStreamsFleurs(MultiStreamOperator): + """Join multiple streams into a single stream - special version for the fleurs speech translation dataset. + + left_stream contains the input speech in English, 1-3 spoken samples per unique text instance + The inference and the evaluation are done on all the spoken samples, including the repetitions, as they + represennt different speakers and different recording conditions, potentially resulting in different + output translations + right_stream contains the target translation text, 1-3 repetitions of the same target text translation + The code below removes the redundant target translations, to avoid duplications during later merging + of the two parts - the left and the right + + Args: + left_stream (str): the left stream. + right_stream (str): the right stream. + how: use "inner". + on: use "id". + new_stream_name (str): The name of the new stream resulting from the merge. 
+ """ + + left_stream: str + right_stream: str + how: Literal["inner"] + on: Optional[List[str]] = None + left_on: Optional[List[str]] = None + right_on: Optional[List[str]] = None + new_stream_name: str + + def merge(self, multi_stream) -> List: + assert self.right_stream in multi_stream and self.left_stream in multi_stream + stream_dict = dict(multi_stream.items()) + left_stream = list(stream_dict[self.left_stream]) + right_stream = list(stream_dict[self.right_stream]) + left_stream_df = pd.DataFrame(left_stream) + # right_stream_df = pd.DataFrame(right_stream) + + # remove duplications from the right stream, this is intended for the FLEURS dataset (spoken translation) + # it removes unnecessary repetitions of the target translations, before merging the input speech with the target translations + right_deduplicate = [] + seen_ids = set() + for item in right_stream: + if item["id"] not in seen_ids: + seen_ids.add(item["id"]) + text = item["transcription"].strip() + if text[-1] not in [".", "!", "?"]: + item["transcription"] = ( + text + "." + ) # add '.' at the end of the reference translations + right_deduplicate.append(item) + right_stream_df = pd.DataFrame(right_deduplicate) + + merged_df = pd.merge( + left_stream_df, + right_stream_df, + how=self.how, + on=self.on, + left_on=self.left_on, + right_on=self.right_on, + ) + + def assert_col_values_are_identical(df: pd.DataFrame, col_name): + (col_name_1, col_name_2) = (f"{col_name}_x", f"{col_name}_y") + if not df.apply( + lambda row: str(row[col_name_1]) == str(row[col_name_2]), + axis=1, + ).all(): + raise UnitxtError( + f"'{col_name}' field is not identical in both left and right instances merged in JoinStreams." + ) + + # If 2 streams / Dataframes contains column with the same names, which are not the columns the join is operated + # on they will be renamed to "[column_name]_x" and "[column_name]_y". Some of these columns are metadsta + # columns that unitxt adds, which must be kept the same. This code verify that all datasets have + # the same metadata values and rename the columns accordingly. + common_cols_to_verify = ["data_classification_policy", "recipe_metadata"] + for common_col in common_cols_to_verify: + assert_col_values_are_identical(merged_df, common_col) + merged_df[common_col] = merged_df[f"{common_col}_x"] + merged_df = merged_df.drop( + columns=[f"{common_col}_x", f"{common_col}_y"], errors="ignore" + ) + + if len(merged_df) == 0: + raise UnitxtError( + f"JoinStreams resulted in an empty stream. It means that that keys in fields '{self.on}' on the left and on right streams do not match the merge policy of '{self.how}'." + ) + return merged_df.to_dict(orient="records") + + def process(self, multi_stream: MultiStream) -> MultiStream: + merged_records = self.merge(multi_stream) + multi_stream[self.new_stream_name] = ListStream(instances_list=merged_records) + return multi_stream + + class DeleteSplits(MultiStreamOperator): """Operator which delete splits in stream. 
diff --git a/src/unitxt/string_operators.py b/src/unitxt/string_operators.py index 03a1c3b2b3..fd20e17b64 100644 --- a/src/unitxt/string_operators.py +++ b/src/unitxt/string_operators.py @@ -93,6 +93,20 @@ def process_value(self, value: str) -> str: return value.strip() +class StripQuotation(FieldOperator): + def process_value(self, value: str) -> str: + if value.startswith('"') and value.endswith('"'): + return value.strip('"') + return value + + +class AddFullStop(FieldOperator): + def process_value(self, value: str) -> str: + if value[-1] not in [".", "?", "!"]: + return value + "." + return value + + class Replace(FieldOperator): old: str new: str diff --git a/src/unitxt/templates.py b/src/unitxt/templates.py index fd2685f6a9..a072302ac8 100644 --- a/src/unitxt/templates.py +++ b/src/unitxt/templates.py @@ -11,6 +11,7 @@ from .operator import InstanceOperator, Operator from .random_utils import new_random_generator from .serializers import ( + AudioSerializer, ConversationSerializer, DialogSerializer, ImageSerializer, @@ -63,6 +64,7 @@ class Template(InstanceOperator): serializer: Serializer = NonPositionalField( default_factory=lambda: MultiTypeSerializer( serializers=[ + AudioSerializer(), ImageSerializer(), VideoSerializer(), TableSerializer(), diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index 48261b8f01..adce996e33 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -11,6 +11,7 @@ from unitxt.error_utils import UnitxtError from unitxt.inference import ( HFAutoModelInferenceEngine, + HFGraniteSpeechInferenceEngine, HFLlavaInferenceEngine, HFOptionSelectingInferenceEngine, HFPipelineBasedInferenceEngine, @@ -67,6 +68,44 @@ def get_image_dataset(format=None): ) +@lru_cache +def get_audio_dataset(format=None): + import numpy as np + + # Generate synthetic audio data (1 second of 16kHz audio) + sample_rate = 16000 + duration = 1.0 + num_samples = int(sample_rate * duration) + + # Generate synthetic audio (simple sine wave) + frequency = 440 # A4 note + time_values = np.linspace(0, duration, num_samples) + audio_data = np.sin(2 * np.pi * frequency * time_values).astype(np.float32) + + data = [ + { + "context": {"audio": {"array": audio_data, "sampling_rate": sample_rate}}, + "context_type": "audio", + "question": "What is the main topic of this audio?", + "answers": ["Music"], + }, + { + "context": {"audio": {"array": audio_data, "sampling_rate": sample_rate}}, + "context_type": "audio", + "question": "Describe the audio content", + "answers": ["Tone"], + }, + ] + + return create_dataset( + task="tasks.qa.with_context", + format=format, + test_set=data, + split="test", + data_classification_policy=["public"], + ) + + @lru_cache def get_text_dataset(format=None): instances = [ @@ -157,6 +196,36 @@ def test_llava_inference_engine(self): ["text", "logprob", "top_tokens"], ) + def test_granite_speech_inference_engine(self): + if os.environ.get("SKIP_HEAVY_LOCAL"): + return + + model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-2b", + revision="granite-speech-3.3.2-2b", + max_new_tokens=10, + temperature=0.0, + ) + + # Test with chat API format + dataset = get_audio_dataset(format="formats.chat_api") + + predictions = model.infer(dataset) + + # Check that we get predictions for both instances + self.assertEqual(len(predictions), 2) + self.assertIsInstance(predictions[0], str) + self.assertIsInstance(predictions[1], str) + + # Test log probabilities 
+        prediction = model.infer_log_probs(dataset)
+
+        assert isoftype(prediction, List[List[Dict[str, Any]]])
+        self.assertListEqual(
+            list(prediction[0][0].keys()),
+            ["text", "logprob", "top_tokens"],
+        )
+
     def test_watsonx_inference(self):
         model = WMLInferenceEngineGeneration(
             model_name="google/flan-t5-xl",
diff --git a/tests/library/test_formats.py b/tests/library/test_formats.py
index 8b22dc071c..d80089dfec 100644
--- a/tests/library/test_formats.py
+++ b/tests/library/test_formats.py
@@ -147,9 +147,10 @@ def test_openai_format_with_images(self):
                 "target_prefix": "The answer is ",
                 "system_prompt": "You are a smart assistant.",
                 "media": {
+                    "audios": [],
                     "images": [
                         {"image": create_random_jpeg_image(2, 2, 1), "format": "JPEG"}
-                    ]
+                    ],
                 },
             },
         ]
@@ -259,7 +260,7 @@ def test_openai_format_with_images(self):
                     },
                 ],
                 "demos": demo_instances,
-                "media": {"images": []},
+                "media": {"audios": [], "images": []},
             },
         ]
 
diff --git a/tests/library/test_metric_utils.py b/tests/library/test_metric_utils.py
index c1bc844483..6f0a53defa 100644
--- a/tests/library/test_metric_utils.py
+++ b/tests/library/test_metric_utils.py
@@ -9,7 +9,7 @@
 
 class TestMetricUtils(UnitxtTestCase):
     def test_split_none(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(subset_groups={"": {}})
 
         ms = MultiStream.from_iterables(
             {
@@ -61,7 +61,15 @@ def test_split_none(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_groups(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(
+            subset_groups={
+                "": {
+                    '{"template":"templates.t1"}',
+                    '{"num_demos": 1}',
+                    '{"template":"templates.t2"}',
+                }
+            }
+        )
 
         ms = MultiStream.from_iterables(
             {
@@ -143,7 +151,7 @@ def test_split_groups(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_subsets(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(subset_groups={("mnli",): {}, ("squad",): {}})
 
         ms = MultiStream.from_iterables(
             {
@@ -197,7 +205,16 @@ def test_split_subsets(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_subset_and_groups(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(
+            subset_groups={
+                ("mnli",): {
+                    '{"template":"templates.t1"}',
+                    '{"template":"templates.t2"}',
+                    '{"num_demos": 1}',
+                },
+                ("squad",): {'{"template":"templates.t1"}', '{"num_demos": 1}'},
+            }
+        )
 
         ms = MultiStream.from_iterables(
             {
@@ -339,9 +356,6 @@ def test_join_none(self):
             list(result["test"]),
             [
                 {
-                    "subset": [],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
                     "score": {
                         "instance": {
                             "accuracy": 1.0,
@@ -358,9 +372,6 @@ def test_join_none(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
                     "score": {
                         "instance": {
                             "accuracy": 0.0,
@@ -571,8 +582,6 @@ def test_join_groups(self):
             list(result["test"]),
             [
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "accuracy": 1.0,
@@ -613,8 +622,6 @@ def test_join_groups(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t2"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "accuracy": 0.0,
@@ -655,8 +662,6 @@ def test_join_groups(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "f1": 1.0,
@@ -776,9 +781,9 @@ def test_join_subsets(self):
             list(result["test"]),
            [
                {
-                    "subset": ["mnli"],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
+                    # "subset": ["mnli"],
"subset": ["mnli"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 1.0, @@ -810,9 +815,9 @@ def test_join_subsets(self): }, }, { - "subset": ["mnli"], - "groups": [], - "media": {"audios": [], "images": []}, + # "subset": ["mnli"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 0.0, @@ -844,9 +849,9 @@ def test_join_subsets(self): }, }, { - "subset": ["squad"], - "groups": [], - "media": {"audios": [], "images": []}, + # "subset": ["squad"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "f1": 1.0, @@ -977,9 +982,6 @@ def test_join_nested_subsets(self): list(result["test"]), [ { - "subset": ["mnli", "first"], - "groups": [], - "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 1.0, @@ -1015,9 +1017,6 @@ def test_join_nested_subsets(self): }, }, { - "subset": ["mnli", "first"], - "groups": [], - "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 0.0, @@ -1254,8 +1253,6 @@ def test_join_subsets_and_groups(self): list(result["test"]), [ { - "subset": ["mnli"], - "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'], "score": { "instance": { "accuracy": 1.0, @@ -1329,8 +1326,6 @@ def test_join_subsets_and_groups(self): }, }, { - "subset": ["mnli"], - "groups": ['{"template":"templates.t2"}', '{"num_demos": 1}'], "score": { "instance": { "accuracy": 0.0, @@ -1404,8 +1399,6 @@ def test_join_subsets_and_groups(self): }, }, { - "subset": ["squad"], - "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'], "score": { "instance": { "f1": 1.0, diff --git a/utils/install.sh b/utils/install.sh index 2e37fb1298..8f90d58d99 100644 --- a/utils/install.sh +++ b/utils/install.sh @@ -1,6 +1,8 @@ +sudo apt-get update +sudo apt-get install -y ffmpeg echo "blis==0" > constraints.txt curl -LsSf https://astral.sh/uv/install.sh | sh -uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu +uv pip install --upgrade --system torchcodec torchaudio torch --index-url https://download.pytorch.org/whl/cpu uv pip install --system -c constraints.txt -e ".[tests]" pip install --only-binary :all: spacy pip install coverage[toml]