WavWriter enforces a maximum number of samples per frame (chosen at …

…construction time). - Helps align behavior with `SampleProcessorBase`, before `WavWriter` becomes one in the near future. - Drive-by: - Add a test for trying to render with multiple codec configs that have different numbers of samples per frame (invalid in IAMF v1.1.0). - Remove obsolete doc that the `WavWriter` requires 1 or 2 channels. It has long been used with multiple channels. Add test coverage to mimic use with 9.1.6. - b/384048095: Defer by adding a note to the bug that `WavFileSplicer` needs more coverage. PiperOrigin-RevId: 715007629
AOMediaCodec · Jan 13, 2025 · 0c88d6e · 0c88d6e
1 parent abd7698
commit 0c88d6e
Show file tree

Hide file tree

Showing 10 changed files with 272 additions and 84 deletions.
diff --git a/iamf/cli/adm_to_user_metadata/adm/tests/wav_file_splicer_test.cc b/iamf/cli/adm_to_user_metadata/adm/tests/wav_file_splicer_test.cc
@@ -26,6 +26,8 @@
 #include "iamf/cli/tests/cli_test_utils.h"
 #include "iamf/obu/ia_sequence_header.h"
 
+// TODO(b/384048095): Add better tests for spliced wav files with LFE channels.
+
 namespace iamf_tools {
 namespace adm_to_user_metadata {
 namespace {

diff --git a/iamf/cli/adm_to_user_metadata/adm/wav_file_splicer.cc b/iamf/cli/adm_to_user_metadata/adm/wav_file_splicer.cc
@@ -45,6 +45,12 @@ namespace {
 
 constexpr int32_t kBitsPerByte = 8;
 constexpr size_t kSizeToFlush = 4096;
+
+// Arbitrary limit on how many samples will be written to the wav file at
+// once. Chosen to agree with `kSizeToFlush`, even if there are 16-bit
+// samples and one channel.
+constexpr size_t kMaxNumSamplesPerFrame = kSizeToFlush / 2;
+
 // Error tolerance set to the minimum precision allowed by ADM file to describe
 // timing related parameters.
 constexpr double kErrorTolerance = 1e-5;
@@ -305,7 +311,7 @@ absl::Status ConvertFromObjectsTo3OA(
   // Output channels set to 16 as objects get panned to 3OA.
   auto output_wav_writer = WavWriter::Create(
       output_file.string(), kOutputWavChannels, wav_file_fmt.samples_per_sec,
-      wav_file_fmt.bits_per_sample);
+      wav_file_fmt.bits_per_sample, kMaxNumSamplesPerFrame);
 
   // Calculate number of bytes per sample based on bits per sample.
   const int32_t bytes_per_sample =
@@ -383,7 +389,8 @@ absl::Status ConvertFromObjectsTo3OA(
     {
       auto wav_writer = WavWriter::Create(
           input_file.string(), wav_file_fmt.num_channels,
-          wav_file_fmt.samples_per_sec, wav_file_fmt.bits_per_sample);
+          wav_file_fmt.samples_per_sec, wav_file_fmt.bits_per_sample,
+          kMaxNumSamplesPerFrame);
       // Compute the length of audio samples corresponding to the current
       // segment duration. The samples excluded due to the rounding error at
       // each segment are accounted for in the next segment.
@@ -445,14 +452,15 @@ absl::Status SeparateLfeChannels(const std::filesystem::path& output_file_path,
   // the wav writer corresponding to non-LFE channels and subsequent indices
   // correspond to each LFE channel present.
   std::vector<std::unique_ptr<WavWriter>> nonlfe_lfe_wav_writer;
-  nonlfe_lfe_wav_writer.emplace_back(WavWriter::Create(
-      non_lfe_file_path, non_lfe_count, samples_per_sec, bits_per_sample));
+  nonlfe_lfe_wav_writer.emplace_back(
+      WavWriter::Create(non_lfe_file_path, non_lfe_count, samples_per_sec,
+                        bits_per_sample, kMaxNumSamplesPerFrame));
   for (int lfe_index = 1; lfe_index <= lfe_ids.size(); ++lfe_index) {
     nonlfe_lfe_wav_writer.emplace_back(WavWriter::Create(
         (output_file_path /
          absl::StrCat(file_prefix, "_converted", lfe_index + 1, ".wav"))
             .string(),
-        1, samples_per_sec, bits_per_sample));
+        1, samples_per_sec, bits_per_sample, kMaxNumSamplesPerFrame));
   }
 
   // The samples in the input wav are packed in a channel-interleaved fashion.
@@ -603,7 +611,8 @@ absl::Status SpliceWavFilesFromAdm(
                                            audio_object_index + 1, ".wav"))
               .string(),
           audio_tracks_for_audio_objects[audio_object_index].size(),
-          wav_file_fmt.samples_per_sec, wav_file_fmt.bits_per_sample));
+          wav_file_fmt.samples_per_sec, wav_file_fmt.bits_per_sample,
+          kMaxNumSamplesPerFrame));
     }
 
     // Write audio samples into the corresponding output wav file(s).

diff --git a/iamf/cli/encoder_main_lib.cc b/iamf/cli/encoder_main_lib.cc
@@ -11,6 +11,7 @@
  */
 #include "iamf/cli/encoder_main_lib.h"
 
+#include <cstddef>
 #include <cstdint>
 #include <filesystem>
 #include <limits>
@@ -57,11 +58,12 @@ using iamf_tools_cli_proto::UserMetadata;
 // Wav writer factory which creates a writer for every call. The output path
 // is `prefix` followed by the mix presentation ID, the sub-mix index, and the
 // layout index. The channel count, sample rate, bit depth, and maximum number
 // of input samples per frame are forwarded to `WavWriter::Create`. The
 // `Layout` argument is unused.
 std::unique_ptr<WavWriter> ProduceAllWavWriters(
     DecodedUleb128 mix_presentation_id, int sub_mix_index, int layout_index,
     const Layout&, const std::filesystem::path& prefix, int num_channels,
-    int sample_rate, int bit_depth) {
+    int sample_rate, int bit_depth, size_t max_input_samples_per_frame) {
   const auto wav_path = absl::StrCat(
       prefix.string(), "_rendered_id_", mix_presentation_id, "_sub_mix_",
       sub_mix_index, "_layout_", layout_index, ".wav");
-  return WavWriter::Create(wav_path, num_channels, sample_rate, bit_depth);
+  return WavWriter::Create(wav_path, num_channels, sample_rate, bit_depth,
+                           max_input_samples_per_frame);
 }
 
 absl::Status PartitionParameterMetadata(UserMetadata& user_metadata) {

diff --git a/iamf/cli/rendering_mix_presentation_finalizer.cc b/iamf/cli/rendering_mix_presentation_finalizer.cc
@@ -77,15 +77,18 @@ absl::Status CollectAudioElementsInSubMix(
   return absl::OkStatus();
 }
 
-absl::Status GetCommonSampleRateAndBitDepthFromAudioElementIds(
+absl::Status GetCommonCodecConfigPropertiesFromAudioElementIds(
     const std::vector<const AudioElementWithData*>& audio_elements_in_sub_mix,
     uint32_t& common_sample_rate, uint8_t& common_bit_depth,
-    bool& requires_resampling) {
+    uint32_t& common_num_samples_per_frame, bool& requires_resampling) {
   absl::flat_hash_set<uint32_t> sample_rates;
+  absl::flat_hash_set<uint32_t> num_samples_per_frame;
   absl::flat_hash_set<uint8_t> bit_depths;
 
   // Get the bit-depth, sample rate, and number of samples per frame from
   // each Audio Element.
   for (const auto* audio_element : audio_elements_in_sub_mix) {
+    num_samples_per_frame.insert(
+        audio_element->codec_config->GetNumSamplesPerFrame());
     sample_rates.insert(audio_element->codec_config->GetOutputSampleRate());
     bit_depths.insert(
         audio_element->codec_config->GetBitDepthToMeasureLoudness());
@@ -94,6 +97,13 @@ absl::Status GetCommonSampleRateAndBitDepthFromAudioElementIds(
   RETURN_IF_NOT_OK(GetCommonSampleRateAndBitDepth(
       sample_rates, bit_depths, common_sample_rate, common_bit_depth,
       requires_resampling));
+  if (num_samples_per_frame.size() != 1) {
+    return absl::InvalidArgumentError(
+        "Audio elements in a submix must have the same number of samples per "
+        "frame.");
+  }
+  common_num_samples_per_frame = *num_samples_per_frame.begin();
+
   return absl::OkStatus();
 }
 
@@ -526,7 +536,7 @@ absl::Status GenerateRenderingMetadataForLayouts(
     int sub_mix_index,
     std::vector<const AudioElementWithData*> audio_elements_in_sub_mix,
     uint32_t common_sample_rate, uint8_t loudness_calculator_bit_depth,
-    uint8_t wav_file_bit_depth,
+    uint8_t wav_file_bit_depth, uint32_t common_num_samples_per_frame,
     std::vector<LayoutRenderingMetadata>& output_layout_rendering_metadata) {
   output_layout_rendering_metadata.resize(sub_mix.layouts.size());
   for (int layout_index = 0; layout_index < sub_mix.layouts.size();
@@ -561,7 +571,7 @@ absl::Status GenerateRenderingMetadataForLayouts(
     layout_rendering_metadata.wav_writer = wav_writer_factory(
         mix_presentation_id, sub_mix_index, layout_index,
         layout.loudness_layout, file_path_prefix, num_channels,
-        common_sample_rate, wav_file_bit_depth);
+        common_sample_rate, wav_file_bit_depth, common_num_samples_per_frame);
   }
 
   return absl::OkStatus();
@@ -606,10 +616,11 @@ absl::Status GenerateRenderingMetadataForSubmixes(
 
     // Data common to all audio elements and layouts.
     bool requires_resampling;
-    RETURN_IF_NOT_OK(GetCommonSampleRateAndBitDepthFromAudioElementIds(
+    uint32_t common_num_samples_per_frame;
+    RETURN_IF_NOT_OK(GetCommonCodecConfigPropertiesFromAudioElementIds(
         audio_elements_in_sub_mix, submix_rendering_metadata.common_sample_rate,
         submix_rendering_metadata.loudness_calculator_bit_depth,
-        requires_resampling));
+        common_num_samples_per_frame, requires_resampling));
     if (requires_resampling) {
       // TODO(b/274689885): Convert to a common sample rate and/or bit-depth.
       return absl::UnimplementedError(
@@ -630,7 +641,7 @@ absl::Status GenerateRenderingMetadataForSubmixes(
         audio_elements_in_sub_mix, submix_rendering_metadata.common_sample_rate,
         submix_rendering_metadata.loudness_calculator_bit_depth,
         submix_rendering_metadata.wav_file_bit_depth,
-        layout_rendering_metadata));
+        common_num_samples_per_frame, layout_rendering_metadata));
   }
   return absl::OkStatus();
 }

diff --git a/iamf/cli/rendering_mix_presentation_finalizer.h b/iamf/cli/rendering_mix_presentation_finalizer.h
@@ -13,6 +13,7 @@
 #ifndef CLI_RENDERING_MIX_PRESENTATION_FINALIZER_H_
 #define CLI_RENDERING_MIX_PRESENTATION_FINALIZER_H_
 
+#include <cstddef>
 #include <cstdint>
 #include <filesystem>
 #include <list>
@@ -109,12 +110,14 @@ class RenderingMixPresentationFinalizer {
    * \param num_channels Number of channels.
    * \param sample_rate Sample rate.
    * \param bit_depth Bit depth.
+   * \param num_samples_per_frame Number of samples per frame.
    * \return Unique pointer to a wav writer or `nullptr` if none is desired.
    */
   typedef absl::AnyInvocable<std::unique_ptr<WavWriter>(
       DecodedUleb128 mix_presentation_id, int sub_mix_index, int layout_index,
       const Layout& layout, const std::filesystem::path& prefix,
-      int num_channels, int sample_rate, int bit_depth) const>
+      int num_channels, int sample_rate, int bit_depth,
+      size_t num_samples_per_frame) const>
       WavWriterFactory;
 
   /*!\brief Creates a rendering mix presentation finalizer.

diff --git a/iamf/cli/tests/BUILD b/iamf/cli/tests/BUILD
@@ -387,7 +387,10 @@ cc_test(
         "//iamf/cli:rendering_mix_presentation_finalizer",
         "//iamf/cli:wav_reader",
         "//iamf/cli:wav_writer",
+        "//iamf/cli/proto:codec_config_cc_proto",
+        "//iamf/cli/proto_to_obu:codec_config_generator",
         "//iamf/cli/renderer:audio_element_renderer_base",
+        "//iamf/cli/user_metadata_builder:codec_config_obu_metadata_builder",
         "//iamf/cli/user_metadata_builder:iamf_input_layout",
         "//iamf/obu:audio_element",
         "//iamf/obu:codec_config",
@@ -401,6 +404,7 @@ cc_test(
         "@com_google_absl//absl/strings",
         "@com_google_absl//absl/types:span",
         "@com_google_googletest//:gtest_main",
+        "@com_google_protobuf//:protobuf",
     ],
 )
 

diff --git a/iamf/cli/tests/rendering_mix_presentation_finalizer_test.cc b/iamf/cli/tests/rendering_mix_presentation_finalizer_test.cc
@@ -39,9 +39,12 @@
 #include "iamf/cli/loudness_calculator_base.h"
 #include "iamf/cli/loudness_calculator_factory_base.h"
 #include "iamf/cli/parameter_block_with_data.h"
+#include "iamf/cli/proto/codec_config.pb.h"
+#include "iamf/cli/proto_to_obu/codec_config_generator.h"
 #include "iamf/cli/renderer/audio_element_renderer_base.h"
 #include "iamf/cli/renderer_factory.h"
 #include "iamf/cli/tests/cli_test_utils.h"
+#include "iamf/cli/user_metadata_builder/codec_config_obu_metadata_builder.h"
 #include "iamf/cli/user_metadata_builder/iamf_input_layout.h"
 #include "iamf/cli/wav_reader.h"
 #include "iamf/cli/wav_writer.h"
@@ -50,6 +53,7 @@
 #include "iamf/obu/mix_presentation.h"
 #include "iamf/obu/param_definitions.h"
 #include "iamf/obu/types.h"
+#include "src/google/protobuf/repeated_ptr_field.h"
 
 namespace iamf_tools {
 namespace {
@@ -76,8 +80,15 @@ constexpr uint32_t kCommonParameterRate = kSampleRate;
 constexpr uint32_t kNumSamplesPerFrame = 8;
 constexpr uint8_t kCodecConfigBitDepth = 16;
 constexpr uint8_t kNoTrimFromEnd = 0;
+constexpr std::array<DecodedUleb128, 1> kMonoSubstreamIds = {0};
+constexpr std::array<DecodedUleb128, 1> kStereoSubstreamIds = {1};
+
 constexpr std::array<ChannelLabel::Label, 2> kStereoLabels = {kL2, kR2};
 
+typedef ::google::protobuf::RepeatedPtrField<
+    iamf_tools_cli_proto::CodecConfigObuMetadata>
+    CodecConfigObuMetadatas;
+
 class MockRenderer : public AudioElementRendererBase {
  public:
   MockRenderer(absl::Span<const ChannelLabel::Label> ordered_labels,
@@ -167,32 +178,32 @@ std::string GetFirstSubmixFirstLayoutExpectedPath() {
                       kMixPresentationId, kSuffixAfterMixPresentationId);
 }
 
-std::unique_ptr<WavWriter> ProduceNoWavWriters(DecodedUleb128, int, int,
-                                               const Layout&,
-                                               const std::filesystem::path&,
-                                               int, int, int) {
+std::unique_ptr<WavWriter> ProduceNoWavWriters(
+    DecodedUleb128 /*mix_presentation_id*/, int /*sub_mix_index*/,
+    int /*layout_index*/, const Layout& /*layout*/,
+    const std::filesystem::path& /*prefix*/, int /*num_channels*/,
+    int /*sample_rate*/, int /*bit_depth*/, size_t /*num_samples_per_frame*/) {
   // All arguments are ignored; no wav writer is ever produced.
   return nullptr;
 }
 
 // Wav writer factory which only creates a writer when both `sub_mix_index`
 // and `layout_index` are zero; returns `nullptr` for every other combination.
 // The output path is `prefix`, then "_id_", the mix presentation ID, and
 // `kSuffixAfterMixPresentationId`. The `Layout` argument is unused.
 std::unique_ptr<WavWriter> ProduceFirstSubMixFirstLayoutWavWriter(
     DecodedUleb128 mix_presentation_id, int sub_mix_index, int layout_index,
     const Layout&, const std::filesystem::path& prefix, int num_channels,
-    int sample_rate, int bit_depth) {
+    int sample_rate, int bit_depth, size_t num_samples_per_frame) {
   if (sub_mix_index != 0 || layout_index != 0) {
     return nullptr;
   }
 
   const auto wav_path =
       absl::StrCat(prefix.string(), "_id_", mix_presentation_id,
                    kSuffixAfterMixPresentationId);
-  return WavWriter::Create(wav_path, num_channels, sample_rate, bit_depth);
+  return WavWriter::Create(wav_path, num_channels, sample_rate, bit_depth,
+                           num_samples_per_frame);
 }
 
 class FinalizerTest : public ::testing::Test {
  public:
   void InitPrerequisiteObusForMonoInput(DecodedUleb128 audio_element_id) {
-    const std::vector<DecodedUleb128> kMonoSubstreamIds = {0};
-
     AddLpcmCodecConfigWithIdAndSampleRate(kCodecConfigId, kSampleRate,
                                           codec_configs_);
     AddScalableAudioElementWithSubstreamIds(
@@ -201,8 +212,6 @@ class FinalizerTest : public ::testing::Test {
   }
 
   void InitPrerequisiteObusForStereoInput(DecodedUleb128 audio_element_id) {
-    const std::vector<DecodedUleb128> kStereoSubstreamIds = {0};
-
     AddLpcmCodecConfigWithIdAndSampleRate(kCodecConfigId, kSampleRate,
                                           codec_configs_);
     AddScalableAudioElementWithSubstreamIds(
@@ -321,6 +330,40 @@ TEST_F(FinalizerTest,
   CreateFinalizerExpectOk();
 }
 
+TEST_F(FinalizerTest, CreateFailsWitMismatchingNumSamplesPerFrame) {
+  // Generate two codec configs from Opus (not LPCM) codec config metadata.
+  renderer_factory_ = std::make_unique<AlwaysNullRendererFactory>();
+  CodecConfigObuMetadatas metadata;
+  metadata.Add(CodecConfigObuMetadataBuilder::GetOpusCodecConfigObuMetadata(
+      kCodecConfigId, 960));
+  constexpr uint32_t kSecondCodecConfigId = kCodecConfigId + 1;
+  metadata.Add(CodecConfigObuMetadataBuilder::GetOpusCodecConfigObuMetadata(
+      kSecondCodecConfigId, 1920));
+  CodecConfigGenerator generator(metadata);
+  ASSERT_THAT(generator.Generate(codec_configs_), IsOk());
+
+  AddScalableAudioElementWithSubstreamIds(
+      IamfInputLayout::kMono, kAudioElementId, kCodecConfigId,
+      kMonoSubstreamIds, codec_configs_, audio_elements_);
+  // The second audio element references a codec config with a different
+  // number of samples per frame.
+  constexpr DecodedUleb128 kStereoAudioElementId = kAudioElementId + 1;
+  AddScalableAudioElementWithSubstreamIds(
+      IamfInputLayout::kStereo, kStereoAudioElementId, kSecondCodecConfigId,
+      kStereoSubstreamIds, codec_configs_, audio_elements_);
+  // Mixing these is invalid because there must be only one codec config in IAMF
+  // v1.1.0.
+  AddMixPresentationObuWithAudioElementIds(
+      kMixPresentationId, {kAudioElementId, kStereoAudioElementId},
+      /*common_parameter_id=*/999, kCommonParameterRate, obus_to_finalize_);
+
+  EXPECT_FALSE(RenderingMixPresentationFinalizer::Create(
+                   output_directory_, output_wav_file_bit_depth_override_,
+                   renderer_factory_.get(), loudness_calculator_factory_.get(),
+                   audio_elements_, wav_writer_factory_, obus_to_finalize_)
+                   .ok());
+}
+
 // =========== Tests that work is delegated to the renderer factory. ===========
 TEST_F(FinalizerTest, ForwardsAudioElementToRenderer) {
   InitPrerequisiteObusForStereoInput(kAudioElementId);