diff --git a/.github/workflows/examples_tests.yml b/.github/workflows/examples_tests.yml index 8dfcfcbcfc..f772a642e1 100644 --- a/.github/workflows/examples_tests.yml +++ b/.github/workflows/examples_tests.yml @@ -30,6 +30,7 @@ jobs: WML_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} WML_APIKEY: ${{ secrets.WML_APIKEY }} GENAI_KEY: ${{ secrets.GENAI_KEY }} + SKIP_HEAVY_LOCAL: "True" steps: - uses: actions/checkout@v4 diff --git a/.github/workflows/inference_tests.yml b/.github/workflows/inference_tests.yml index 2407bdf1ba..014926cfe4 100644 --- a/.github/workflows/inference_tests.yml +++ b/.github/workflows/inference_tests.yml @@ -32,6 +32,8 @@ jobs: WX_PROJECT_ID: ${{ secrets.WML_PROJECT_ID }} # Similar to WML_PROJECT_ID WX_API_KEY: ${{ secrets.WML_APIKEY }} # Similar to WML_APIKEY GENAI_KEY: ${{ secrets.GENAI_KEY }} + SKIP_HEAVY_LOCAL: "True" + steps: - uses: actions/checkout@v4 diff --git a/examples/evaluate_speech_recognition.py b/examples/evaluate_speech_recognition.py new file mode 100644 index 0000000000..1abaaa9362 --- /dev/null +++ b/examples/evaluate_speech_recognition.py @@ -0,0 +1,67 @@ +# this python script shows an example of running speech recognition evaluation for Granite Speech using the Hugging Face ESB datasets and the CommonVoice datasets + +import os + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + CrossProviderInferenceEngine, + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +USE_RITS = False # whether to use RITS service +USE_WML = False # whether to use WML service + +test_dataset = load_dataset( + # select (uncomment) only one of the following cards (datasets) + # for evaluating a benchmark with multiple cards - see evaluate_speech_recognition_benchmark.py in the same directory (examples) + card="cards.esb.ami", + # card="cards.esb.voxpopuli", + # card="cards.esb.librispeech", + # card="cards.esb.spgispeech", + # card="cards.esb.earnings22", + # card="cards.esb.tedlium", + # card="cards.commonvoice.en" + # card="cards.commonvoice.de" + # card="cards.commonvoice.fr" + # card="cards.commonvoice.es" + # card="cards.commonvoice.pt" + split="test", + format="formats.chat_api", + max_test_instances=5, # to tun limited part of the test set + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if os.environ.get("SKIP_HEAVY_LOCAL", False): + exit() + +if not USE_RITS and not USE_WML: + # locally running the model, it needs GPU to run properly + model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + revision="granite-speech-3.3.2-2b", + max_new_tokens=200, + ) +if USE_RITS: + # using the RITS remote service for inferencing + model = CrossProviderInferenceEngine( + model="granite-speech-3-3-8b", # in RITS only the 8b version of Granite Speech is available + provider="rits", + # provider_specific_args={"rits": {"max_new_tokens": 200}}, + max_new_tokens=200, + ) +if USE_WML: + # using the WML remote service for inferencing + # code to be completed + model = None + + +predictions = model(test_dataset) +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/examples/evaluate_speech_recognition_benchmark.py b/examples/evaluate_speech_recognition_benchmark.py new file mode 100644 index 0000000000..1e4808daac --- /dev/null +++ b/examples/evaluate_speech_recognition_benchmark.py @@ -0,0 +1,45 @@ +# this python script shows an example of running speech recognition benchmark evaluation for Granite Speech +# using the Hugging Face ESB datasets (English) and the multilingial CommonVoice datasets + +# to run on a single test set use subset=... below; the list of subsets is: +# voxpopuli, ami, librispeech, spgispeech, tedlium, earnings22, +# commonvoice_en, commonvoice_de, commonvoice_es, commonvoice_fr, commonvoice_pt + +import os + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +dataset = load_dataset( + "benchmarks.speech_recognition", + max_samples_per_subset=5, # while this is commented out, the entire test set is used + # subset="ami", #to tun only a single dataset + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant" + ), + split="test", +) + + +if os.environ.get("SKIP_HEAVY_LOCAL", False): + exit() + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-2b", # two options for Granite Speech 3.3: 2b and 8b + revision="granite-speech-3.3.2-2b", + max_new_tokens=200, +) + +predictions = model(dataset) +results = evaluate( + predictions=predictions, data=dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) + +print("Subsets scores:") +print(results.subsets_scores.summary) diff --git a/examples/evaluate_speech_translation_benchmark.py b/examples/evaluate_speech_translation_benchmark.py new file mode 100644 index 0000000000..7c2deb4d68 --- /dev/null +++ b/examples/evaluate_speech_translation_benchmark.py @@ -0,0 +1,38 @@ +# this python script shows an example of running speech translation benchmark evaluation for Granite Speech +# using the Fleurs and Covost2 datasets + +# to run on a single test set use subset=... 
below; the list of subsets is: +# fleurs_en_de, fleurs_en_es, fleurs_en_fr, fleurs_en_it, fleurs_en_ja, fleurs_en_pt, fleurs_en_pt, +# covost2_en_de, covost2_en_ja, covost2_de_en, covost2_es_en, covost2_fr_en, covost2_pt_en + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +dataset = load_dataset( + "benchmarks.speech_translation", + # max_samples_per_subset=100, # while this is commented out, the entire test set is used + # subset="fleurs_en_fr", #to run only a single test set + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: December 19, 2024.\nYou are Granite, developed by IBM. You are a helpful AI assistant" + ), + split="test", +) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(dataset) +results = evaluate( + predictions=predictions, data=dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) + +print("Subsets scores:") +print(results.subsets_scores.summary) diff --git a/examples/evaluate_speech_translation_covost2.py b/examples/evaluate_speech_translation_covost2.py new file mode 100644 index 0000000000..775e0b3df1 --- /dev/null +++ b/examples/evaluate_speech_translation_covost2.py @@ -0,0 +1,58 @@ +# this python script shows an example of running speech translation evaluation for Granite Speech + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +debug = True # True for extra printing, set to False when commenting out max_test_instances below +max_test_instances = 8 + +# the available calanguages for the covost2 dataset dataset, are: +# translation from English to target language: +# de German +# ja Japanese +# translation from source language to English: +# de German +# es Spanish +# fr French +# pt Portuguese +test_dataset = load_dataset( # select (un-comment) one of the test sets below + card="cards.covost2.from_en.en_de", + # card="cards.covost2.from_en.en_ja", + # card="cards.covost2.to_en.de_en", + # card="cards.covost2.to_en.es_en", + # card="cards.covost2.to_en.fr_en", + # card="cards.covost2.to_en.pt_en", + split="test", + format="formats.chat_api", + max_test_instances=max_test_instances, # comment out for running the entire test + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if debug: + print(">>>>>>>>>>>>>> first test references >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>> references {idx}: ", test_dataset["references"][idx]) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(test_dataset) + +if debug: # print translation reference texts for debug and inspection + print(">>>>>>>>>>>>>> first predictions >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>>>>>>> {idx}: ", predictions[idx]) + +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/examples/evaluate_speech_translation_fleurs.py b/examples/evaluate_speech_translation_fleurs.py new file mode 100644 index 0000000000..574ec6c2eb --- /dev/null +++ b/examples/evaluate_speech_translation_fleurs.py @@ -0,0 +1,58 @@ +# this python script shows an example of running speech translation evaluation for Granite Speech + +from unitxt import evaluate, load_dataset +from unitxt.inference import ( + HFGraniteSpeechInferenceEngine, +) +from unitxt.system_prompts import TextualSystemPrompt + +debug = False # True for extra printing, set to False when commenting out max_test_instances below +max_test_instances = 20 + +# the available cards for the fleurs dataset, reflecting the target language, are: +# de_de German +# es_419 Spanish, South America +# fr_fr French +# it_it Italian +# ja_jp Japanese +# pt_br Portuguese, Brazil +# cmn_hans_cn Chinese, Mandarin +test_dataset = load_dataset( # select (un-comment) one of the test sets below + # card="cards.fleurs.en_us.de_de", + # card="cards.fleurs.en_us.es_419", + # card="cards.fleurs.en_us.fr_fr", + # card="cards.fleurs.en_us.it_it", + # card="cards.fleurs.en_us.pt_br", + card="cards.fleurs.en_us.ja_jp", + # card="cards.fleurs.en_us.cmn_hans_cn", + split="test", + format="formats.chat_api", + # max_test_instances=max_test_instances, # comment out for running the entire test + system_prompt=TextualSystemPrompt( + text="Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. 
You are a helpful AI assistant" + ), +) + +if debug: + print(">>>>>>>>>>>>>> test references >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>> references {idx}: ", test_dataset["references"][idx]) + +model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-8b", # two options for Granite Speech 3.3: 2b and 8b + max_new_tokens=200, +) + +predictions = model(test_dataset) + +if debug: # print translation reference texts for debug and inspection + print(">>>>>>>>>>>>>> model predictions >>>>>>>>>>>>") + for idx in range(max_test_instances): + print(f">>>>>>>>>>> {idx}: ", predictions[idx]) + +results = evaluate( + predictions=predictions, data=test_dataset, calc_confidence_intervals=False +) + +print("Global scores:") +print(results.global_scores.summary) diff --git a/prepare/benchmarks/speech_recognition_english.py b/prepare/benchmarks/speech_recognition_english.py new file mode 100644 index 0000000000..114adf48f8 --- /dev/null +++ b/prepare/benchmarks/speech_recognition_english.py @@ -0,0 +1,38 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +benchmark = Benchmark( + subsets={ + "voxpopuli": DatasetRecipe( + card="cards.esb.voxpopuli", + format="formats.chat_api", + ), + "ami": DatasetRecipe( + card="cards.esb.ami", + format="formats.chat_api", + ), + "librispeech": DatasetRecipe( + card="cards.esb.librispeech", + format="formats.chat_api", + ), + "spgispeech": DatasetRecipe( + card="cards.esb.spgispeech", + format="formats.chat_api", + ), + "tedlium": DatasetRecipe( + card="cards.esb.tedlium", + format="formats.chat_api", + ), + "earnings22": DatasetRecipe( + card="cards.esb.earnings22", + format="formats.chat_api", + ), + "commonvoice": DatasetRecipe( + card="cards.esb.commonvoice", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_recognition", overwrite=True) diff --git a/prepare/benchmarks/speech_recognition_multilingual.py b/prepare/benchmarks/speech_recognition_multilingual.py new file mode 100644 index 0000000000..e0857e6839 --- /dev/null +++ b/prepare/benchmarks/speech_recognition_multilingual.py @@ -0,0 +1,26 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +benchmark = Benchmark( + subsets={ + "commonvoice_de": DatasetRecipe( + card="cards.commonvoice.de", + format="formats.chat_api", + ), + "commonvoice_es": DatasetRecipe( + card="cards.commonvoice.es", + format="formats.chat_api", + ), + "commonvoice_fr": DatasetRecipe( + card="cards.commonvoice.fr", + format="formats.chat_api", + ), + "commonvoice_pt": DatasetRecipe( + card="cards.commonvoice.pt", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_recognition_multilingual", overwrite=True) diff --git a/prepare/benchmarks/speech_translation.py b/prepare/benchmarks/speech_translation.py new file mode 100644 index 0000000000..3fd68e47c8 --- /dev/null +++ b/prepare/benchmarks/speech_translation.py @@ -0,0 +1,65 @@ +from unitxt.benchmark import Benchmark +from unitxt.catalog import add_to_catalog +from unitxt.standard import DatasetRecipe + +# running benchmarks with fleurs dataset, en-->xx +# running benchmarks with covost2 dataset, en-->xx +# running benchmarks with covost2 dataset, xx-->en +benchmark = Benchmark( + subsets={ + "fleurs_en_de": DatasetRecipe( + card="cards.fleurs.en_us.de_de", + format="formats.chat_api", + ), + 
"fleurs_en_es": DatasetRecipe( + card="cards.fleurs.en_us.es_419", + format="formats.chat_api", + ), + "fleurs_en_fr": DatasetRecipe( + card="cards.fleurs.en_us.fr_fr", + format="formats.chat_api", + ), + "fleurs_en_it": DatasetRecipe( + card="cards.fleurs.en_us.it_it", + format="formats.chat_api", + ), + "fleurs_en_ja": DatasetRecipe( + card="cards.fleurs.en_us.ja_jp", + format="formats.chat_api", + ), + "fleurs_en_pt": DatasetRecipe( + card="cards.fleurs.en_us.pt_br", + format="formats.chat_api", + ), + "fleurs_en_zh": DatasetRecipe( + card="cards.fleurs.en_us.cmn_hans_cn", + format="formats.chat_api", + ), + "covost2_en_de": DatasetRecipe( + card="cards.covost2.from_en.en_de", + format="formats.chat_api", + ), + "covost2_en_ja": DatasetRecipe( + card="cards.covost2.from_en.en_ja", + format="formats.chat_api", + ), + "covost2_de_en": DatasetRecipe( + card="cards.covost2.to_en.de_en", + format="formats.chat_api", + ), + "covost2_es_en": DatasetRecipe( + card="cards.covost2.to_en.es_en", + format="formats.chat_api", + ), + "covost2_fr_en": DatasetRecipe( + card="cards.covost2.to_en.fr_en", + format="formats.chat_api", + ), + "covost2_pt_en": DatasetRecipe( + card="cards.covost2.to_en.pt_en", + format="formats.chat_api", + ), + }, +) + +add_to_catalog(benchmark, "benchmarks.speech_translation", overwrite=True) diff --git a/prepare/cards/ami.py b/prepare/cards/ami.py new file mode 100644 index 0000000000..ada958eb3e --- /dev/null +++ b/prepare/cards/ami.py @@ -0,0 +1,42 @@ +from unitxt.audio_operators import ToAudio +from unitxt.card import TaskCard +from unitxt.catalog import add_to_catalog +from unitxt.loaders import LoadHF +from unitxt.test_utils.card import test_card + +for subset in ["ihm", "sdm"]: + card = TaskCard( + loader=LoadHF( + path="edinburghcstr/ami", + data_dir=subset, + revision="refs/convert/parquet", + splits=["train", "validation", "test"], + data_classification_policy=["public"], + streaming=True, + ), + preprocess_steps=[ + ToAudio(field="audio"), + ], + task="tasks.speech_recognition", + templates=[ + "templates.speech_recognition.default", + ], + __tags__={ + "license": "cc-by-4.0", + "language": "en", + "task_categories": ["automatic-speech-recognition"], + "size_categories": ["10K Audio: + return { + "audio": value, + } + + +def audio_to_base64(audio_data: Audio): + """Convert a HuggingFace Audio instance to a base64-encoded WAV string. 
+ + Args: + audio_data (dict): The Audio instance from HuggingFace datasets + Contains 'array', 'sampling_rate', and 'path' keys + + Returns: + str: Base64-encoded WAV audio + """ + import base64 + from io import BytesIO + + import soundfile as sf + + # Create a BytesIO buffer to hold the WAV data + buffer = BytesIO() + audio = audio_data["audio"] + # Write the audio array to the buffer in WAV format + sf.write(buffer, audio["array"], audio["sampling_rate"], format="wav") + + # Get the bytes from the buffer + wav_bytes = buffer.getvalue() + + # Encode to base64 + return base64.b64encode(wav_bytes).decode("utf-8") + + +def base64_to_audio(base64_string, sampling_rate: Optional[int] = None): + import base64 + + from datasets import Audio + + audio_bytes = base64.b64decode(base64_string) + audio_feature = Audio(sampling_rate=sampling_rate) + return audio_feature.decode_example({"bytes": audio_bytes, "path": None}) diff --git a/src/unitxt/catalog/benchmarks/speech_recognition.json b/src/unitxt/catalog/benchmarks/speech_recognition.json new file mode 100644 index 0000000000..9a98348ca4 --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_recognition.json @@ -0,0 +1,40 @@ +{ + "__type__": "benchmark", + "subsets": { + "voxpopuli": { + "__type__": "dataset_recipe", + "card": "cards.esb.voxpopuli", + "format": "formats.chat_api" + }, + "ami": { + "__type__": "dataset_recipe", + "card": "cards.esb.ami", + "format": "formats.chat_api" + }, + "librispeech": { + "__type__": "dataset_recipe", + "card": "cards.esb.librispeech", + "format": "formats.chat_api" + }, + "spgispeech": { + "__type__": "dataset_recipe", + "card": "cards.esb.spgispeech", + "format": "formats.chat_api" + }, + "tedlium": { + "__type__": "dataset_recipe", + "card": "cards.esb.tedlium", + "format": "formats.chat_api" + }, + "earnings22": { + "__type__": "dataset_recipe", + "card": "cards.esb.earnings22", + "format": "formats.chat_api" + }, + "commonvoice": { + "__type__": "dataset_recipe", + "card": "cards.esb.commonvoice", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json b/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json new file mode 100644 index 0000000000..22d01d9d6a --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_recognition_multilingual.json @@ -0,0 +1,25 @@ +{ + "__type__": "benchmark", + "subsets": { + "commonvoice_de": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.de", + "format": "formats.chat_api" + }, + "commonvoice_es": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.es", + "format": "formats.chat_api" + }, + "commonvoice_fr": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.fr", + "format": "formats.chat_api" + }, + "commonvoice_pt": { + "__type__": "dataset_recipe", + "card": "cards.commonvoice.pt", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/benchmarks/speech_translation.json b/src/unitxt/catalog/benchmarks/speech_translation.json new file mode 100644 index 0000000000..922e1a4f49 --- /dev/null +++ b/src/unitxt/catalog/benchmarks/speech_translation.json @@ -0,0 +1,70 @@ +{ + "__type__": "benchmark", + "subsets": { + "fleurs_en_de": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.de_de", + "format": "formats.chat_api" + }, + "fleurs_en_es": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.es_419", + "format": "formats.chat_api" + }, + "fleurs_en_fr": { + "__type__": "dataset_recipe", + "card": 
"cards.fleurs.en_us.fr_fr", + "format": "formats.chat_api" + }, + "fleurs_en_it": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.it_it", + "format": "formats.chat_api" + }, + "fleurs_en_ja": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.ja_jp", + "format": "formats.chat_api" + }, + "fleurs_en_pt": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.pt_br", + "format": "formats.chat_api" + }, + "fleurs_en_zh": { + "__type__": "dataset_recipe", + "card": "cards.fleurs.en_us.cmn_hans_cn", + "format": "formats.chat_api" + }, + "covost2_en_de": { + "__type__": "dataset_recipe", + "card": "cards.covost2.from_en.en_de", + "format": "formats.chat_api" + }, + "covost2_en_ja": { + "__type__": "dataset_recipe", + "card": "cards.covost2.from_en.en_ja", + "format": "formats.chat_api" + }, + "covost2_de_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.de_en", + "format": "formats.chat_api" + }, + "covost2_es_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.es_en", + "format": "formats.chat_api" + }, + "covost2_fr_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.fr_en", + "format": "formats.chat_api" + }, + "covost2_pt_en": { + "__type__": "dataset_recipe", + "card": "cards.covost2.to_en.pt_en", + "format": "formats.chat_api" + } + } +} diff --git a/src/unitxt/catalog/cards/ami/ihm.json b/src/unitxt/catalog/cards/ami/ihm.json new file mode 100644 index 0000000000..6afd19f0a7 --- /dev/null +++ b/src/unitxt/catalog/cards/ami/ihm.json @@ -0,0 +1,45 @@ +{ + "__type__": "task_card", + "loader": { + "__type__": "load_hf", + "path": "edinburghcstr/ami", + "data_dir": "ihm", + "revision": "refs/convert/parquet", + "splits": [ + "train", + "validation", + "test" + ], + "data_classification_policy": [ + "public" + ], + "streaming": true + }, + "preprocess_steps": [ + { + "__type__": "to_audio", + "field": "audio" + } + ], + "task": "tasks.speech_recognition", + "templates": [ + "templates.speech_recognition.default" + ], + "__tags__": { + "license": "cc-by-4.0", + "language": "en", + "task_categories": [ + "automatic-speech-recognition" + ], + "size_categories": [ + "10K Union[str, List[Content]]: - # Regular expression to find tags with src attribute - img_tag_pattern = re.compile( - r"<" + f"{constants.image_tag}" + r'\s+[^>]*src=["\']([^"\']+)["\'][^>]*>', + image_tag = constants.image_tag + audio_tag = constants.audio_tag + + # Unified regex for both tags + tag_pattern = re.compile( + rf"<(?P{re.escape(image_tag)}|{re.escape(audio_tag)})\s+[^>]*src=[\"'](?P[^\"']+)[\"'][^>]*>", re.IGNORECASE, ) - # Find all matches of tags and their positions - matches = list(img_tag_pattern.finditer(text)) + matches = list(tag_pattern.finditer(text)) - # If no images are found, return the text as a plain string if not matches: return text contents: List[dict] = [] last_pos = 0 - # Process each match for match in matches: start, end = match.span() - img_url = match.group(1) + tag = match.group("tag").lower() + src = match.group("src") - # Add preceding text, if any + # Add preceding text if last_pos < start: contents.append({"type": "text", "text": text[last_pos:start]}) - # Add image content with a default detail level - if img_url.startswith("media/"): - image = dict_get(media, img_url[6:]) - data_url = image_to_data_url(image) + is_local = src.startswith("media/") + media_key = src[6:] if is_local else None + + if tag == image_tag: + if is_local: + image = dict_get(media, media_key) + data_url = 
image_to_data_url(image) + else: + data_url = src contents.append( { "type": "image_url", "image_url": {"url": data_url, "detail": "low"}, } ) - else: + + elif tag == audio_tag: + if is_local: + audio = dict_get(media, media_key) + data_url = audio_to_base64(audio) + else: + data_url = src contents.append( { - "type": "image_url", - "image_url": {"url": img_url, "detail": "low"}, + "type": "input_audio", + "input_audio": {"data": data_url, "format": "wav"}, } ) - # Update the last processed position last_pos = end - # Add any remaining text after the last image + # Add any trailing text if last_pos < len(text): contents.append({"type": "text", "text": text[last_pos:]}) @@ -512,6 +534,7 @@ def _format_instance_to_source( turns, ) media["images"] = [] + media["audios"] = [] return chat diff --git a/src/unitxt/inference.py b/src/unitxt/inference.py index 87488d1da7..13169dbf3d 100644 --- a/src/unitxt/inference.py +++ b/src/unitxt/inference.py @@ -36,6 +36,7 @@ from tqdm.asyncio import tqdm_asyncio from .artifact import Artifact +from .audio_operators import base64_to_audio from .base_metric import Metric from .dataclass import InternalField, NonPositionalField from .deprecation_utils import deprecation @@ -69,6 +70,7 @@ class StandardAPIParamsMixin(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -610,6 +612,12 @@ def get_logprobs( logprobs: List[List[Dict[str, Any]]] = [] + tokenizer = ( + self.processor.tokenizer + if hasattr(self.processor, "tokenizer") + else self.processor + ) + for sample_no, sample_scores in enumerate(transition_scores.detach().cpu()): sample_logprobs: List[Dict[str, Any]] = [] @@ -620,7 +628,7 @@ def get_logprobs( "logprob": float(score.cpu()), "top_tokens": [ { - "text": self.processor.decode(idx), + "text": tokenizer.decode(idx), "logprob": float( predictions.scores[n][sample_no][idx].cpu() ), @@ -1042,6 +1050,161 @@ def _infer_log_probs( return self._infer_fn(dataset, return_meta_data, True) +class HFGraniteSpeechInferenceEngine(HFInferenceEngineBase): + revision: str = None + lazy_load: bool = True + label: str = "hf_granite_speech" + audio_token: str = "<|audio|>" + sampling_rate: int = 16000 + + _requirements_list = ["torchaudio"] + + def compute_transition_scores( + self, sequences: Sequence, scores: Sequence, beam_indices: Optional[int] + ) -> Sequence: + if not hasattr(self.model.config, "vocab_size"): + try: + self.model.config.vocab_size = self.model.vocab_size + except: + self.model.config.vocab_size = self.model.config.text_config.vocab_size + + return super().compute_transition_scores(sequences, scores, beam_indices) + + def _init_processor(self): + from transformers import AutoProcessor + + self.processor = AutoProcessor.from_pretrained(self.model_name) + + if not self.pad_token_id and hasattr(self.processor, "eos_token_id"): + self.pad_token_id = self.processor.eos_token_id + + def _init_model(self): + from transformers import AutoModelForSpeechSeq2Seq + + self.model = AutoModelForSpeechSeq2Seq.from_pretrained( + self.model_name, + revision=self.revision, + torch_dtype=self._get_torch_dtype(), + low_cpu_mem_usage=self.low_cpu_mem_usage, + device_map=self.device_map, + ) + if self.device_map is None: + self.model.to(self.device) + + def _get_input(self, instance): + if isinstance(instance["source"], list): + # Chat API format - 
extract audio from content + audios = [] + chat = [] + for turn in instance["source"]: + if isinstance(turn["content"], list): + turn_content = "" + for content in turn["content"]: + if content["type"] == "input_audio": + audios.append( + base64_to_audio( + content["input_audio"]["data"], + sampling_rate=self.sampling_rate, + ) + ) + turn_content += self.audio_token + elif content["type"] == "text": + turn_content += content["text"] + else: + raise ValueError( + f"Unsupported content type:{content['type']}" + ) + turn = {"role": turn["role"], "content": turn_content} + + chat.append(turn) + + if len(audios) > 1: + raise ValueError(f"Unsupported number of audio contents:{len(audios)}") + + audio = audios[0] + + return chat, audio + raise ValueError("Supports only chat api.") + + def prepare_inputs(self, instance) -> Mapping: + chat, audio = self._get_input(instance) + + text = self.processor.tokenizer.apply_chat_template( + chat, tokenize=False, add_generation_prompt=True + ) + + inputs: Mapping = self.processor( + text=[text], audio=audio["array"], return_tensors="pt" + ).to(self.device or self.device_map, self._get_torch_dtype()) + + return inputs + + def _infer_fn( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool, + return_logprobs: bool, + ) -> Union[List[str], List[Dict], List[TextGenerationInferenceOutput]]: + results = [] + + for instance in tqdm(dataset): + processed_inputs = self.prepare_inputs(instance) + input_len = len(processed_inputs["input_ids"][0]) + + predictions = self.make_predictions(processed_inputs) + + sequences = predictions.sequences + + output_tokens = sequences[:, input_len:] + + output_tokens_strings = [] + for tokens in output_tokens: + output_tokens_strings.append( + [ + self.processor.tokenizer.decode(token, skip_special_tokens=True) + for token in tokens + ] + ) + + output_strings = [] + for tokens in output_tokens: + output_strings.append( + self.processor.tokenizer.decode(tokens, skip_special_tokens=True) + ) + + if return_logprobs: + final_outputs = self.get_logprobs(predictions, output_tokens_strings) + else: + final_outputs = output_strings + + results.append( + self.get_return_object( + output=final_outputs[0], + generated_text=output_strings[0], + output_tokens=len(output_tokens_strings[0]), + inp=instance["source"], + inp_tokens=None, + return_meta_data=return_meta_data, + ) + ) + + return results + + def _infer( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool = False, + ) -> Union[List[str], List[TextGenerationInferenceOutput]]: + return self._infer_fn(dataset, return_meta_data, False) + + def _infer_log_probs( + self, + dataset: Union[List[Dict[str, Any]], Dataset], + return_meta_data: bool = False, + ) -> Union[List[Dict], List[TextGenerationInferenceOutput]]: + return self._infer_fn(dataset, return_meta_data, True) + + class HFPeftInferenceEngine(HFAutoModelInferenceEngine): label: str = "hf_peft_auto_model" @@ -1571,6 +1734,7 @@ class OpenAiInferenceEngineParamsMixin(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -1588,6 +1752,7 @@ class OpenAiInferenceEngineParams(Artifact): frequency_penalty: Optional[float] = None presence_penalty: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None seed: Optional[int] = None 
stop: Union[Optional[str], List[str]] = None temperature: Optional[float] = None @@ -1911,6 +2076,7 @@ def _get_model_name_for_endpoint(cls, model_name: str): class TogetherAiInferenceEngineParamsMixin(Artifact): max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None stop: Optional[List[str]] = None temperature: Optional[float] = None top_p: Optional[float] = None @@ -2071,6 +2237,7 @@ class WMLChatParamsMixin(Artifact): response_format: Optional[Dict[str, Any]] = None temperature: Optional[float] = None max_tokens: Optional[int] = None + max_new_tokens: Optional[int] = None time_limit: Optional[int] = None top_p: Optional[float] = None n: Optional[int] = None @@ -3028,6 +3195,7 @@ class VLLMParamsMixin(Artifact): bad_words: Optional[List[str]] = None ignore_eos: bool = False max_tokens: Optional[int] = 16 + max_new_tokens: Optional[int] = None min_tokens: int = 0 logprobs: Optional[int] = None prompt_logprobs: Optional[int] = None @@ -3282,6 +3450,7 @@ class CrossProviderInferenceEngine( "granite-guardian-3-1-8b": "ibm/granite-guardian-3-8b", "granite-guardian-3-2-5b": "ibm/granite-guardian-3-2-5b", "granite-vision-3-2-2b": "ibm/granite-vision-3-2-2b", + "granite-speech-3-3-8b": "ibm-granite/granite-speech-3.3-8b", "llama-3-1-8b-instruct": "meta-llama/llama-3-1-8b-instruct", "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct", "llama-3-1-405b-instruct": "meta-llama/llama-3-405b-instruct", @@ -3342,6 +3511,7 @@ class CrossProviderInferenceEngine( "granite-guardian-3-2-3b": "ibm-granite/granite-guardian-3.2-3b-a800m", "granite-guardian-3-2-5b": "ibm-granite/granite-guardian-3.2-5b", "granite-guardian-3-3-8b": "ibm-granite/granite-guardian-3.3-8b", + "granite-speech-3-3-8b": "ibm-granite/granite-speech-3.3-8b", "llama-3-1-8b-instruct": "meta-llama/Llama-3.1-8B-Instruct", "llama-3-1-70b-instruct": "meta-llama/llama-3-1-70b-instruct", "llama-3-1-405b-instruct": "meta-llama/llama-3-1-405b-instruct-fp8", diff --git a/src/unitxt/metric.py b/src/unitxt/metric.py index 822340fbcd..67ad255a0e 100644 --- a/src/unitxt/metric.py +++ b/src/unitxt/metric.py @@ -4,6 +4,7 @@ from .api import __file__ as _ from .artifact import __file__ as _ +from .audio_operators import __file__ as _ from .augmentors import __file__ as _ from .base_metric import __file__ as _ from .benchmark import __file__ as _ diff --git a/src/unitxt/metric_utils.py b/src/unitxt/metric_utils.py index 3bfc41094e..31aebce81d 100644 --- a/src/unitxt/metric_utils.py +++ b/src/unitxt/metric_utils.py @@ -4,13 +4,15 @@ from collections import defaultdict from functools import lru_cache from statistics import mean -from typing import Any, Dict, Iterable, List, Optional +from typing import Any, Dict, Iterable, List, Optional, Set import pandas as pd +from datasets import Dataset as HfDataset from datasets import Features, Value +from datasets import IterableDataset as HfIterableDataset -from .dataclass import Dataclass -from .error_utils import Documentation, UnitxtError, error_context +from .dataclass import Dataclass, Field +from .error_utils import Documentation, UnitxtError from .operator import ( InstanceOperator, MultiStreamOperator, @@ -22,8 +24,8 @@ ApplyMetric, ApplyOperatorsField, ArtifactFetcherMixin, - FlattenInstances, RecursiveCopy, + RemoveFields, Rename, ) from .register import _reset_env_local_catalogs, register_all_artifacts @@ -145,6 +147,19 @@ def group_str(json_str): return ",".join(f"{k}:{v}" for k, v in data.items()) +@lru_cache(maxsize=None) +def subset_stream_name(stream_name, subset, 
subset_depth): + return ( + stream_name + DEFAULT_STREAM_SUBSET_SEPARATOR + "/".join(subset[:subset_depth]) + ) + + +@lru_cache(maxsize=None) +def subset_group_stream_name(stream_name, subset, subset_depth, group): + subset_name = subset_stream_name(stream_name, subset, subset_depth) + return subset_name + "?" + group_str(group) + + class SplitSubsetsAndGroups(MultiStreamOperator): """Splits a MultiStream that is small - for metrics, hence: whole stream can sit in memory, split by the value of field 'group'. @@ -157,35 +172,69 @@ class SplitSubsetsAndGroups(MultiStreamOperator): subsets_depth specifies the depth of the prefix by which to split the stream. """ - subsets_field: str = "subset" - groups_field: str = "groups" + subset_groups: Dict[str, Set[str]] subset_depth: Optional[int] = None - def process(self, multi_stream: MultiStream) -> MultiStream: - result = defaultdict(list) - - for stream_name, stream in multi_stream.items(): - for i, instance in enumerate(stream): - instance["__idx__"] = i - - for field in [self.subsets_field, self.groups_field]: - if field not in instance: - raise ValueError( - f"Field {field} is missing from instance {instance}" - ) - - subset_stream_name = ( - stream_name - + DEFAULT_STREAM_SUBSET_SEPARATOR - + "/".join(instance[self.subsets_field][: self.subset_depth]) + def get_new_streams(self, stream_name): + new_streams = [] + for subset, groups in self.subset_groups.items(): + new_streams.append( + subset_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, ) + ) + for group in groups: + new_streams.append( + subset_group_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, + group=group, + ) + ) + return new_streams - result[subset_stream_name].append(instance) + def is_instance_included(self, instance, stream_name, new_stream_name) -> bool: + subset = tuple(instance.get("subset")) - for group in instance[self.groups_field]: - result[subset_stream_name + "?" 
+ group_str(group)].append(instance) + if new_stream_name == subset_stream_name( + stream_name=stream_name, subset=subset, subset_depth=self.subset_depth + ): + return True - return MultiStream.from_iterables(result, copying=True) + for group in instance.get("groups"): + if new_stream_name == subset_group_stream_name( + stream_name=stream_name, + subset=subset, + subset_depth=self.subset_depth, + group=group, + ): + return True + + return False + + def filter_stream(self, stream, stream_name, new_stream_name): + for i, instance in enumerate(stream): + instance["__idx__"] = i + if self.is_instance_included(instance, stream_name, new_stream_name): + yield instance + + def process(self, multi_stream: MultiStream) -> MultiStream: + streams = {} + + for stream_name, stream in multi_stream.items(): + for new_stream_name in self.get_new_streams(stream_name): + streams[new_stream_name] = DynamicStream( + generator=self.filter_stream, + gen_kwargs={ + "stream": stream, + "stream_name": stream_name, + "new_stream_name": new_stream_name, + }, + ) + return MultiStream(streams) @lru_cache(maxsize=None) @@ -235,7 +284,15 @@ def process(self, multi_stream: MultiStream) -> MultiStream: idx = instance.pop("__idx__") if idx not in instances[origin]: - instances[origin][idx] = instance + instances[origin][idx] = {"score": instance["score"]} + if "processed_prediction" in instance: + instances[origin][idx]["processed_prediction"] = instance[ + "processed_prediction" + ] + if "processed_references" in instance: + instances[origin][idx]["processed_references"] = instance[ + "processed_references" + ] # from here below setting the global scores from that stream # can be done with first instance only @@ -340,36 +397,59 @@ def _inference_post_process( return [instance["processed_prediction"] for instance in multi_stream[split_name]] -class MetricRecipe(SequentialOperatorInitializer): +class PreProcessForEvaluation(SequentialOperatorInitializer): + def prepare(self): + register_all_artifacts() + self.steps = [ + FromPredictionsAndOriginalData(), + LoadJson(field="task_data"), + RecursiveCopy( + field="source", + to_field="task_data/source", + ), + ] + + +class PostProcessAfterEvaluation(SequentialOperator): + steps = [ + Rename( + field="raw_prediction", + to_field="prediction", + not_exist_ok=True, + not_exist_do_nothing=True, + ), + Rename( + field="raw_references", + to_field="references", + not_exist_ok=True, + not_exist_do_nothing=True, + ), + RecursiveCopy( + field="source", + to_field="task_data/source", + ), + RemoveFields(fields=["__idx__"], not_exist_ok=True), + ] + + +class MainEvaluationPipeline(SequentialOperator): calc_confidence_intervals: bool = True subset_depth: int = 2 + subset_groups: Dict[str, Set[str]] = Field(default_factory=dict) def prepare(self): register_all_artifacts() self.steps = [ - FromPredictionsAndOriginalData(), - LoadJson(field="task_data"), _post_process_steps, SplitSubsetsAndGroups( subset_depth=self.subset_depth, + subset_groups=self.subset_groups, ), ApplyMetric( "metrics", calc_confidence_intervals=self.calc_confidence_intervals, ), JoinSubsetsAndGroups(), - Rename( - field="raw_prediction", - to_field="prediction", - ), - Rename( - field="raw_references", - to_field="references", - ), - RecursiveCopy( - field="source", - to_field="task_data/source", - ), ] @@ -436,7 +516,7 @@ def __repr__(self): @property def summary(self): - df = self.to_df().round(2).fillna("") + df = self.to_df().round(4).fillna("") df = df.sort_index() df = df.drop("num_of_instances", axis=0) df = 
df.reset_index() @@ -744,16 +824,139 @@ def __repr__(self): class EvaluationResults(list): - def __init__(self, *args, metadata=None, **kwargs): - super().__init__(*args, **kwargs) + def __init__(self, stream, metadata=None): + """Initialize EvaluationResults with lazy evaluation. + + Args: + stream: An iterator or generator + metadata: Optional metadata dictionary + """ + super().__init__() + self._generator = iter(stream) + self._realized = False self.metadata = metadata if metadata is not None else {} + def _realize_up_to(self, index): + """Realize elements from generator up to the given index.""" + if self._realized: + return + + current_len = super().__len__() + + # For negative indices, we need to realize everything + if index < 0: + self._realize_all() + return + + # Calculate how many more elements we need + needed = index + 1 - current_len + + # Realize the needed elements + try: + for _ in range(needed): + item = next(self._generator) + super().append(item) + except StopIteration: + self._realized = True + self._generator = None + + def _realize_all(self): + """Realize all remaining elements from the generator.""" + if self._realized: + return + + try: + while True: + item = next(self._generator) + super().append(item) + except StopIteration: + self._realized = True + self._generator = None + + def __getitem__(self, index): + if isinstance(index, slice): + # For slices, we need to realize up to the maximum index + start, stop, step = index.indices(len(self)) + if stop > 0: + self._realize_up_to(stop - 1) + else: + self._realize_all() + else: + self._realize_up_to(index) + return super().__getitem__(index) + + def __len__(self): + # If not fully realized, we need to realize everything to know the length + if not self._realized: + self._realize_all() + return super().__len__() + + def __iter__(self): + # Yield already realized elements + for i in range(super().__len__()): + yield super().__getitem__(i) + + # Continue with unrealized elements + if self._generator is not None: + try: + while True: + item = next(self._generator) + super().append(item) + yield item + except StopIteration: + self._realized = True + self._generator = None + + def append(self, item): + self._realize_all() + super().append(item) + + def extend(self, iterable): + self._realize_all() + super().extend(iterable) + + def insert(self, index, item): + self._realize_up_to(index) + super().insert(index, item) + + def remove(self, value): + self._realize_all() + super().remove(value) + + def pop(self, index=-1): + self._realize_up_to(index) + return super().pop(index) + + def index(self, value, start=0, stop=None): + if stop is None: + self._realize_all() + else: + self._realize_up_to(stop) + return super().index(value, start, stop) + + def count(self, value): + self._realize_all() + return super().count(value) + + def sort(self, key=None, reverse=False): + self._realize_all() + super().sort(key=key, reverse=reverse) + + def reverse(self): + self._realize_all() + super().reverse() + + def clear(self): + super().clear() + self._generator = None + self._realized = True + @property def global_scores(self): return GlobalScores(self[0]["score"]["global"]) @property - def instance_scores(self) -> InstanceScores: + def instance_scores(self): return InstanceScores(self) @property @@ -775,6 +978,35 @@ def subsets_scores(self): return SubsetsScores(self[0]["score"]["subsets"]) +def extract_subset_groups(dataset): + # Column-wise access types + subset_groups = {} + if not isinstance(dataset, (HfDataset, HfIterableDataset, 
pd.DataFrame)): + dataset = { + "subset": (item.get("subset", []) for item in dataset), + "groups": (item.get("groups", []) for item in dataset), + } + + for subset, groups in zip(dataset["subset"], dataset["groups"]): + if len(subset) == 0: + subset = "" + else: + subset = tuple(subset) + + if subset not in subset_groups: + subset_groups[subset] = set() + for group in groups: + subset_groups[subset].add(group) + + return subset_groups + + +def merge_evaluation_results(results, dataset): + for result, instance in zip(results, dataset): + instance.update(result) + yield instance + + def _compute( predictions: List[Any], references: Iterable, @@ -784,19 +1016,34 @@ def _compute( ): _reset_env_local_catalogs() register_all_artifacts() - recipe = MetricRecipe(calc_confidence_intervals=calc_confidence_intervals) - with error_context(stage="Metric Processing"): - multi_stream = recipe( - predictions=predictions, references=references, split_name=split_name - ) + if not isinstance(references, (HfDataset, HfIterableDataset, pd.DataFrame, list)): + raise ValueError(f"Unsupported data type: {type(references)}") - if flatten: - operator = FlattenInstances() - multi_stream = operator(multi_stream) + subset_groups = extract_subset_groups(references) + + preprocess = PreProcessForEvaluation() - stream = multi_stream[split_name] - return EvaluationResults(stream) + preprocess_multi_stream = preprocess( + predictions=predictions, references=references, split_name=split_name + ) + + evaluate = MainEvaluationPipeline( + calc_confidence_intervals=calc_confidence_intervals, subset_groups=subset_groups + ) + + results_multi_stream = evaluate(preprocess_multi_stream) + + post_process = PostProcessAfterEvaluation() + + data_multi_stream = post_process(preprocess_multi_stream) + + return EvaluationResults( + merge_evaluation_results( + results_multi_stream[split_name], + data_multi_stream[split_name], + ) + ) """ diff --git a/src/unitxt/metrics.py b/src/unitxt/metrics.py index 7e93751d4b..552b3fb006 100644 --- a/src/unitxt/metrics.py +++ b/src/unitxt/metrics.py @@ -3469,6 +3469,37 @@ def compute( return {self.main_score: result} +class WerFast(MapReduceMetric[Tuple[float, float], float]): + """Computes mean squared error between predictions and references. + + Range: [0, ∞) (lower is better) + Measures average squared differences between predicted and true values. + """ + + main_score = "wer" + prediction_type = str + single_reference_per_prediction = True + _requirements_list = ["jiwer>=3.0.0"] # added process_words function + + def prepare(self): + super().prepare() + import jiwer + + self._metric = jiwer.process_words + + def map( + self, prediction: str, references: List[str], task_data: Dict[str, Any] + ) -> Tuple[float, float]: + measures = self._metric(references[0], prediction) + incorrect = measures.substitutions + measures.deletions + measures.insertions + total = measures.substitutions + measures.deletions + measures.hits + return incorrect, total + + def reduce(self, intermediates: List[float]) -> Dict[str, Any]: + incorrect, total = map(sum, zip(*intermediates)) + return {self.main_score: incorrect / total if total > 0 else np.nan} + + class MeanSquaredError(MapReduceMetric[float, float]): """Computes mean squared error between predictions and references. 
@@ -3537,6 +3568,16 @@ def reduce(self, intermediates: List[Tuple[float, float]]) -> Dict[str, Any]: score, p_value = self.get_correlation_func()(list_a, list_b) + try: + score = float(score) + except: + pass + + try: + p_value = float(p_value) + except: + pass + return { self.main_score: score, f"{self.main_score}_p_value": p_value, diff --git a/src/unitxt/operators.py b/src/unitxt/operators.py index 410b940df1..a6256bb176 100644 --- a/src/unitxt/operators.py +++ b/src/unitxt/operators.py @@ -366,15 +366,21 @@ class RemoveFields(InstanceOperator): Args: fields (List[str]): The fields to remove from each instance. + not_exist_ok (bool): If True, do not raise an error if a field does not exist. Defaults to False. """ fields: List[str] + not_exist_ok: bool = False def process( self, instance: Dict[str, Any], stream_name: Optional[str] = None ) -> Dict[str, Any]: for field_name in self.fields: - del instance[field_name] + try: + del instance[field_name] + except: + if not self.not_exist_ok: + raise return instance @@ -608,7 +614,12 @@ def process( if (not is_subpath(from_field, to_field)) and ( not is_subpath(to_field, from_field) ): - dict_delete(res, from_field, remove_empty_ancestors=True) + dict_delete( + res, + from_field, + remove_empty_ancestors=True, + not_exist_ok=self.not_exist_ok, + ) return res diff --git a/src/unitxt/processors.py b/src/unitxt/processors.py index 6f13e10a33..87aaeb0f2e 100644 --- a/src/unitxt/processors.py +++ b/src/unitxt/processors.py @@ -566,3 +566,35 @@ def process_value(self, text: Any) -> Any: class ExtractVerbalJudgementBadGood(ExtractVerbalJudgment): classes = ["very bad", "bad", "mediocre", "good", "very good"] + + +class NormalizeTextWithWhisper(FieldOperator): + """A processor that uses uses whisper english normalizer.""" + + _requirements_list = ["transformers"] + + def prepare(self): + super().prepare() + from transformers import WhisperTokenizer + + self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base") + self._normalize = self.tokenizer.normalize + + def process_value(self, value: str) -> str: + return self._normalize(value) + + +class NormalizeTextBasicWithWhisper(FieldOperator): + """A processor that uses uses whisper multilingual normalizer.""" + + _requirements_list = ["transformers"] + + def prepare(self): + super().prepare() + from transformers import WhisperTokenizer + + self.tokenizer = WhisperTokenizer.from_pretrained("openai/whisper-base") + self._normalize = self.tokenizer.basic_normalize + + def process_value(self, value: str) -> str: + return self._normalize(value) diff --git a/src/unitxt/schema.py b/src/unitxt/schema.py index 5926b120cc..fe72f22852 100644 --- a/src/unitxt/schema.py +++ b/src/unitxt/schema.py @@ -130,6 +130,9 @@ def _prepare_media(self, instance): if isoftype(instance["media"]["images"][i], Image): instance["media"]["images"][i] = instance["media"]["images"][i]["image"] + for i in range(len(instance["media"]["audios"])): + instance["media"]["audios"][i] = instance["media"]["audios"][i]["audio"] + return instance def _get_instance_task_data( diff --git a/src/unitxt/serializers.py b/src/unitxt/serializers.py index 84a0258ba0..0f5e0c3f05 100644 --- a/src/unitxt/serializers.py +++ b/src/unitxt/serializers.py @@ -9,6 +9,7 @@ from .settings_utils import get_constants from .type_utils import isoftype, to_type_string from .types import ( + Audio, Conversation, Dialog, Document, @@ -151,6 +152,20 @@ def serialize(self, value: Image, instance: Dict[str, Any]) -> str: return f'<{constants.image_tag} 
src="media/images/{idx}">' +class AudioSerializer(SingleTypeSerializer): + serialized_type = Audio + + def serialize(self, value: Audio, instance: Dict[str, Any]) -> str: + if "media" not in instance: + instance["media"] = {} + if "audios" not in instance["media"]: + instance["media"]["audios"] = [] + idx = len(instance["media"]["audios"]) + instance["media"]["audios"].append({"audio": value["audio"]}) + value["audio"] = f"media/audios/{idx}" + return f'<{constants.audio_tag} src="media/audios/{idx}">' + + class VideoSerializer(ImageSerializer): serialized_type = Video @@ -205,6 +220,7 @@ class MultiTypeSerializer(Serializer): ToolCallSerializer(), DialogSerializer(), MultiDocumentSerializer(), + AudioSerializer(), ImageSerializer(), VideoSerializer(), TableSerializer(), diff --git a/src/unitxt/settings_utils.py b/src/unitxt/settings_utils.py index b7e07be99d..ed4c9a996d 100644 --- a/src/unitxt/settings_utils.py +++ b/src/unitxt/settings_utils.py @@ -261,6 +261,7 @@ def __getattr__(self, key): constants.inference_stream = "__INFERENCE_STREAM__" constants.instance_stream = "__INSTANCE_STREAM__" constants.image_tag = "unitxt-img" + constants.audio_tag = "unitxt-audio" constants.demos_pool_field = "_demos_pool_" constants.demos_field = "demos" constants.instruction_field = "instruction" diff --git a/src/unitxt/stream_operators.py b/src/unitxt/stream_operators.py index b769a5d3b0..c675891cf1 100644 --- a/src/unitxt/stream_operators.py +++ b/src/unitxt/stream_operators.py @@ -127,6 +127,99 @@ def process(self, multi_stream: MultiStream) -> MultiStream: return multi_stream +class JoinStreamsFleurs(MultiStreamOperator): + """Join multiple streams into a single stream - special version for the fleurs speech translation dataset. + + left_stream contains the input speech in English, 1-3 spoken samples per unique text instance + The inference and the evaluation are done on all the spoken samples, including the repetitions, as they + represennt different speakers and different recording conditions, potentially resulting in different + output translations + right_stream contains the target translation text, 1-3 repetitions of the same target text translation + The code below removes the redundant target translations, to avoid duplications during later merging + of the two parts - the left and the right + + Args: + left_stream (str): the left stream. + right_stream (str): the right stream. + how: use "inner". + on: use "id". + new_stream_name (str): The name of the new stream resulting from the merge. 
+ """ + + left_stream: str + right_stream: str + how: Literal["inner"] + on: Optional[List[str]] = None + left_on: Optional[List[str]] = None + right_on: Optional[List[str]] = None + new_stream_name: str + + def merge(self, multi_stream) -> List: + assert self.right_stream in multi_stream and self.left_stream in multi_stream + stream_dict = dict(multi_stream.items()) + left_stream = list(stream_dict[self.left_stream]) + right_stream = list(stream_dict[self.right_stream]) + left_stream_df = pd.DataFrame(left_stream) + # right_stream_df = pd.DataFrame(right_stream) + + # remove duplications from the right stream, this is intended for the FLEURS dataset (spoken translation) + # it removes unnecessary repetitions of the target translations, before merging the input speech with the target translations + right_deduplicate = [] + seen_ids = set() + for item in right_stream: + if item["id"] not in seen_ids: + seen_ids.add(item["id"]) + text = item["transcription"].strip() + if text[-1] not in [".", "!", "?"]: + item["transcription"] = ( + text + "." + ) # add '.' at the end of the reference translations + right_deduplicate.append(item) + right_stream_df = pd.DataFrame(right_deduplicate) + + merged_df = pd.merge( + left_stream_df, + right_stream_df, + how=self.how, + on=self.on, + left_on=self.left_on, + right_on=self.right_on, + ) + + def assert_col_values_are_identical(df: pd.DataFrame, col_name): + (col_name_1, col_name_2) = (f"{col_name}_x", f"{col_name}_y") + if not df.apply( + lambda row: str(row[col_name_1]) == str(row[col_name_2]), + axis=1, + ).all(): + raise UnitxtError( + f"'{col_name}' field is not identical in both left and right instances merged in JoinStreams." + ) + + # If 2 streams / Dataframes contains column with the same names, which are not the columns the join is operated + # on they will be renamed to "[column_name]_x" and "[column_name]_y". Some of these columns are metadsta + # columns that unitxt adds, which must be kept the same. This code verify that all datasets have + # the same metadata values and rename the columns accordingly. + common_cols_to_verify = ["data_classification_policy", "recipe_metadata"] + for common_col in common_cols_to_verify: + assert_col_values_are_identical(merged_df, common_col) + merged_df[common_col] = merged_df[f"{common_col}_x"] + merged_df = merged_df.drop( + columns=[f"{common_col}_x", f"{common_col}_y"], errors="ignore" + ) + + if len(merged_df) == 0: + raise UnitxtError( + f"JoinStreams resulted in an empty stream. It means that that keys in fields '{self.on}' on the left and on right streams do not match the merge policy of '{self.how}'." + ) + return merged_df.to_dict(orient="records") + + def process(self, multi_stream: MultiStream) -> MultiStream: + merged_records = self.merge(multi_stream) + multi_stream[self.new_stream_name] = ListStream(instances_list=merged_records) + return multi_stream + + class DeleteSplits(MultiStreamOperator): """Operator which delete splits in stream. 
diff --git a/src/unitxt/string_operators.py b/src/unitxt/string_operators.py index 03a1c3b2b3..fd20e17b64 100644 --- a/src/unitxt/string_operators.py +++ b/src/unitxt/string_operators.py @@ -93,6 +93,20 @@ def process_value(self, value: str) -> str: return value.strip() +class StripQuotation(FieldOperator): + def process_value(self, value: str) -> str: + if value.startswith('"') and value.endswith('"'): + return value.strip('"') + return value + + +class AddFullStop(FieldOperator): + def process_value(self, value: str) -> str: + if value[-1] not in [".", "?", "!"]: + return value + "." + return value + + class Replace(FieldOperator): old: str new: str diff --git a/src/unitxt/templates.py b/src/unitxt/templates.py index fd2685f6a9..a072302ac8 100644 --- a/src/unitxt/templates.py +++ b/src/unitxt/templates.py @@ -11,6 +11,7 @@ from .operator import InstanceOperator, Operator from .random_utils import new_random_generator from .serializers import ( + AudioSerializer, ConversationSerializer, DialogSerializer, ImageSerializer, @@ -63,6 +64,7 @@ class Template(InstanceOperator): serializer: Serializer = NonPositionalField( default_factory=lambda: MultiTypeSerializer( serializers=[ + AudioSerializer(), ImageSerializer(), VideoSerializer(), TableSerializer(), diff --git a/tests/inference/test_inference_engine.py b/tests/inference/test_inference_engine.py index 48261b8f01..adce996e33 100644 --- a/tests/inference/test_inference_engine.py +++ b/tests/inference/test_inference_engine.py @@ -11,6 +11,7 @@ from unitxt.error_utils import UnitxtError from unitxt.inference import ( HFAutoModelInferenceEngine, + HFGraniteSpeechInferenceEngine, HFLlavaInferenceEngine, HFOptionSelectingInferenceEngine, HFPipelineBasedInferenceEngine, @@ -67,6 +68,44 @@ def get_image_dataset(format=None): ) +@lru_cache +def get_audio_dataset(format=None): + import numpy as np + + # Generate synthetic audio data (1 second of 16kHz audio) + sample_rate = 16000 + duration = 1.0 + num_samples = int(sample_rate * duration) + + # Generate synthetic audio (simple sine wave) + frequency = 440 # A4 note + time_values = np.linspace(0, duration, num_samples) + audio_data = np.sin(2 * np.pi * frequency * time_values).astype(np.float32) + + data = [ + { + "context": {"audio": {"array": audio_data, "sampling_rate": sample_rate}}, + "context_type": "audio", + "question": "What is the main topic of this audio?", + "answers": ["Music"], + }, + { + "context": {"audio": {"array": audio_data, "sampling_rate": sample_rate}}, + "context_type": "audio", + "question": "Describe the audio content", + "answers": ["Tone"], + }, + ] + + return create_dataset( + task="tasks.qa.with_context", + format=format, + test_set=data, + split="test", + data_classification_policy=["public"], + ) + + @lru_cache def get_text_dataset(format=None): instances = [ @@ -157,6 +196,36 @@ def test_llava_inference_engine(self): ["text", "logprob", "top_tokens"], ) + def test_granite_speech_inference_engine(self): + if os.environ.get("SKIP_HEAVY_LOCAL"): + return + + model = HFGraniteSpeechInferenceEngine( + model_name="ibm-granite/granite-speech-3.3-2b", + revision="granite-speech-3.3.2-2b", + max_new_tokens=10, + temperature=0.0, + ) + + # Test with chat API format + dataset = get_audio_dataset(format="formats.chat_api") + + predictions = model.infer(dataset) + + # Check that we get predictions for both instances + self.assertEqual(len(predictions), 2) + self.assertIsInstance(predictions[0], str) + self.assertIsInstance(predictions[1], str) + + # Test log probabilities 
+        prediction = model.infer_log_probs(dataset)
+
+        assert isoftype(prediction, List[List[Dict[str, Any]]])
+        self.assertListEqual(
+            list(prediction[0][0].keys()),
+            ["text", "logprob", "top_tokens"],
+        )
+
     def test_watsonx_inference(self):
         model = WMLInferenceEngineGeneration(
             model_name="google/flan-t5-xl",
diff --git a/tests/library/test_formats.py b/tests/library/test_formats.py
index 8b22dc071c..d80089dfec 100644
--- a/tests/library/test_formats.py
+++ b/tests/library/test_formats.py
@@ -147,9 +147,10 @@ def test_openai_format_with_images(self):
                 "target_prefix": "The answer is ",
                 "system_prompt": "You are a smart assistant.",
                 "media": {
+                    "audios": [],
                     "images": [
                         {"image": create_random_jpeg_image(2, 2, 1), "format": "JPEG"}
-                    ]
+                    ],
                 },
             },
         ]
@@ -259,7 +260,7 @@ def test_openai_format_with_images(self):
                     },
                 ],
                 "demos": demo_instances,
-                "media": {"images": []},
+                "media": {"audios": [], "images": []},
             },
         ]
 
diff --git a/tests/library/test_metric_utils.py b/tests/library/test_metric_utils.py
index c1bc844483..6f0a53defa 100644
--- a/tests/library/test_metric_utils.py
+++ b/tests/library/test_metric_utils.py
@@ -9,7 +9,7 @@
 
 class TestMetricUtils(UnitxtTestCase):
     def test_split_none(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(subset_groups={"": {}})
 
         ms = MultiStream.from_iterables(
             {
@@ -61,7 +61,15 @@ def test_split_none(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_groups(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(
+            subset_groups={
+                "": {
+                    '{"template":"templates.t1"}',
+                    '{"num_demos": 1}',
+                    '{"template":"templates.t2"}',
+                }
+            }
+        )
 
         ms = MultiStream.from_iterables(
             {
@@ -143,7 +151,7 @@ def test_split_groups(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_subsets(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(subset_groups={("mnli",): {}, ("squad",): {}})
 
         ms = MultiStream.from_iterables(
             {
@@ -197,7 +205,16 @@ def test_split_subsets(self):
         self.assertEqual({k: list(v) for k, v in result.items()}, target)
 
     def test_split_subset_and_groups(self):
-        operator = SplitSubsetsAndGroups()
+        operator = SplitSubsetsAndGroups(
+            subset_groups={
+                ("mnli",): {
+                    '{"template":"templates.t1"}',
+                    '{"template":"templates.t2"}',
+                    '{"num_demos": 1}',
+                },
+                ("squad",): {'{"template":"templates.t1"}', '{"num_demos": 1}'},
+            }
+        )
 
         ms = MultiStream.from_iterables(
             {
@@ -339,9 +356,6 @@ def test_join_none(self):
             list(result["test"]),
             [
                 {
-                    "subset": [],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
                     "score": {
                         "instance": {
                             "accuracy": 1.0,
@@ -358,9 +372,6 @@ def test_join_none(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
                     "score": {
                         "instance": {
                             "accuracy": 0.0,
@@ -571,8 +582,6 @@ def test_join_groups(self):
             list(result["test"]),
             [
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "accuracy": 1.0,
@@ -613,8 +622,6 @@ def test_join_groups(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t2"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "accuracy": 0.0,
@@ -655,8 +662,6 @@ def test_join_groups(self):
                     },
                 },
                 {
-                    "subset": [],
-                    "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'],
                     "score": {
                         "instance": {
                             "f1": 1.0,
@@ -776,9 +781,9 @@ def test_join_subsets(self):
             list(result["test"]),
            [
                {
-                    "subset": ["mnli"],
-                    "groups": [],
-                    "media": {"audios": [], "images": []},
+                    # "subset": ["mnli"],
"subset": ["mnli"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 1.0, @@ -810,9 +815,9 @@ def test_join_subsets(self): }, }, { - "subset": ["mnli"], - "groups": [], - "media": {"audios": [], "images": []}, + # "subset": ["mnli"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 0.0, @@ -844,9 +849,9 @@ def test_join_subsets(self): }, }, { - "subset": ["squad"], - "groups": [], - "media": {"audios": [], "images": []}, + # "subset": ["squad"], + # "groups": [], + # "media": {"audios": [], "images": []}, "score": { "instance": { "f1": 1.0, @@ -977,9 +982,6 @@ def test_join_nested_subsets(self): list(result["test"]), [ { - "subset": ["mnli", "first"], - "groups": [], - "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 1.0, @@ -1015,9 +1017,6 @@ def test_join_nested_subsets(self): }, }, { - "subset": ["mnli", "first"], - "groups": [], - "media": {"audios": [], "images": []}, "score": { "instance": { "accuracy": 0.0, @@ -1254,8 +1253,6 @@ def test_join_subsets_and_groups(self): list(result["test"]), [ { - "subset": ["mnli"], - "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'], "score": { "instance": { "accuracy": 1.0, @@ -1329,8 +1326,6 @@ def test_join_subsets_and_groups(self): }, }, { - "subset": ["mnli"], - "groups": ['{"template":"templates.t2"}', '{"num_demos": 1}'], "score": { "instance": { "accuracy": 0.0, @@ -1404,8 +1399,6 @@ def test_join_subsets_and_groups(self): }, }, { - "subset": ["squad"], - "groups": ['{"template":"templates.t1"}', '{"num_demos": 1}'], "score": { "instance": { "f1": 1.0, diff --git a/utils/install.sh b/utils/install.sh index 2e37fb1298..8f90d58d99 100644 --- a/utils/install.sh +++ b/utils/install.sh @@ -1,6 +1,8 @@ +sudo apt-get update +sudo apt-get install -y ffmpeg echo "blis==0" > constraints.txt curl -LsSf https://astral.sh/uv/install.sh | sh -uv pip install --upgrade --system torch --index-url https://download.pytorch.org/whl/cpu +uv pip install --upgrade --system torchcodec torchaudio torch --index-url https://download.pytorch.org/whl/cpu uv pip install --system -c constraints.txt -e ".[tests]" pip install --only-binary :all: spacy pip install coverage[toml]