
Commit 1983b61

spkrka and claude committed
Add SMBCollection: unified fluent API for Sort-Merge Bucket operations
This introduces SMBCollection, a new fluent API that unifies and improves all SMB operations in Scio.

## Key Improvements

### 1. Unified API

Traditional SMB operations are fragmented across disjoint methods, each solving a specific sub-problem:

- `sortMergeJoin` - read and join to SCollection
- `sortMergeTransform` - read, transform, and write back to SMB
- `sortMergeGroupByKey` - read a single source to SCollection
- `sortMergeCoGroup` - read multiple sources to SCollection

SMBCollection provides a single, composable API for all SMB workflows.

### 2. Familiar SCollection-like Ergonomics

Uses familiar functional operations (`map`, `filter`, `flatMap`) instead of imperative callbacks.

**Before (Traditional API):**

```scala
sc.sortMergeTransform(classOf[Integer], usersRead)
  .to(output)
  .via { case (key, users, outputCollector) =>
    users.foreach { user =>
      val transformed = transformUser(user)
      outputCollector.accept(transformed) // ❌ Imperative callback
    }
  }
```

**After (SMBCollection):**

```scala
SMBCollection.read(classOf[Integer], usersRead)
  .flatMap(users => users.map(transformUser)) // ✅ Functional style
  .saveAsSortedBucket(output)
```

### 3. Better Interoperability

Seamlessly convert between SMB and SCollection:

```scala
val base = SMBCollection.cogroup2(classOf[Integer], usersRead, accountsRead)
  .map { case (_, (users, accounts)) => expensiveJoin(users, accounts) }

// SMB outputs (stay bucketed)
base.map(_.summary).saveAsSortedBucket(summaryOutput)
base.map(_.details).saveAsSortedBucket(detailsOutput)

// SCollection output (for non-SMB operations)
val joined = base.toDeferredSCollection().get
joined.filter(_.needsProcessing).saveAsTextFile(textOutput)

sc.run() // All outputs execute in one pass!
```

### 4. Zero-Shuffle Multi-Output (Massive Performance Gains)

Create multiple SMB outputs from the same computation with zero shuffles.

**Before (Traditional - SCollection fanout):**

```scala
// Reads once, joins once, BUT shuffles 3 times
val joined = sc.sortMergeJoin(classOf[Integer], usersRead, accountsRead)
  .map { case (userId, (user, account)) =>
    expensiveJoin(user, account) // Runs once ✓
  }

// ❌ Each saveAsSortedBucket does a GroupByKey shuffle!
joined.map(_.summary).saveAsSortedBucket(summaryOutput)          // Shuffle 1
joined.map(_.details).saveAsSortedBucket(detailsOutput)          // Shuffle 2
joined.filter(_.isHighValue).saveAsSortedBucket(highValueOutput) // Shuffle 3
```

**After (SMBCollection - zero shuffles):**

```scala
// Reads once, joins once, zero shuffles!
val base = SMBCollection.cogroup2(classOf[Integer], usersRead, accountsRead)
  .map { case (_, (users, accounts)) =>
    expensiveJoin(users, accounts) // Runs ONCE
  }

// ✅ Fan out to multiple SMB outputs - data already bucketed!
base.map(_.summary).saveAsSortedBucket(summaryOutput)
base.map(_.details).saveAsSortedBucket(detailsOutput)
base.filter(_.isHighValue).saveAsSortedBucket(highValueOutput)

sc.run() // Single pass execution
```

**Performance Impact:**

| Scenario | Traditional (SCollection fanout) | SMBCollection Multi-Output | Cost Reduction |
|----------|----------------------------------|----------------------------|----------------|
| 1TB → 3 SMB outputs | 1TB read + ~3TB shuffle | 1TB read, 0 shuffle | **~4× savings** |
| 2TB join → 5 outputs | 2TB read + ~10TB shuffle | 2TB read, 0 shuffle | **~6× savings** |
| 500GB → 10 outputs | 500GB read + ~5TB shuffle | 500GB read, 0 shuffle | **~11× savings** |

(The cost-reduction ratios follow from a simple I/O model; a worked sketch appears after this commit message.)

## Complete Example

See `SortMergeBucketMultiOutputExample` in scio-examples for a full working example that creates multiple derived datasets (summary, details, high-value users) from a single expensive user-account join with zero shuffles.

## API Design

- Type signature: `SMBCollection[K1, K2, V]` - tracks keys for type safety; methods work with `V` directly
- `read()` returns `Iterable[V]` without a key wrapper
- `cogroup2()` returns `(K, (Iterable[L], Iterable[R]))`
- Standard transformations: `map`, `filter`, `flatMap` (not `mapValues`/`flatMapValues`)
- Side inputs: clean `(SideInputContext, V)` signature
- Auto-execution: outputs execute via the `sc.onClose()` hook

## Limitations

- Currently supports up to 4-way cogroups (`cogroup2`, `cogroup3`, `cogroup4`)
- For 5- to 22-way cogroups, use the traditional `sortMergeCoGroup`
- Note: this is not a systemic limitation; it is easily extensible by adding `cogroup5` through `cogroup22` methods

## Documentation

Updated documentation includes:

- Complete fluent API guide with multi-output examples
- API comparison table (fluent vs. traditional)
- Performance impact analysis
- Migration examples
- When to use which API

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <[email protected]>
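A note on the performance table: the cost-reduction column is consistent with a rough I/O model in which the traditional fanout pays one read of the input plus roughly one input-sized shuffle per output, while the multi-output path pays only the read. The model below is an editorial assumption used to sanity-check the ratios, not something stated in the commit itself:

```scala
// Rough I/O model (assumption): traditional ≈ read + outputs × shuffle, where each
// shuffle moves roughly the input size; SMBCollection multi-output ≈ read only.
def estimatedSavings(inputTB: Double, outputs: Int): Double =
  (inputTB + outputs * inputTB) / inputTB // simplifies to 1 + outputs

println(estimatedSavings(1.0, 3))  // 4.0  -> "~4× savings"  (1TB → 3 SMB outputs)
println(estimatedSavings(2.0, 5))  // 6.0  -> "~6× savings"  (2TB join → 5 outputs)
println(estimatedSavings(0.5, 10)) // 11.0 -> "~11× savings" (500GB → 10 outputs)
```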
1 parent c234ba6 commit 1983b61

26 files changed (+6816, −19 lines)

scio-examples/src/main/scala/com/spotify/scio/examples/extra/SortMergeBucketExample.scala

Lines changed: 101 additions & 0 deletions
```diff
@@ -280,3 +280,104 @@ object SortMergeBucketTransformExample {
     ()
   }
 }
+
+object SortMergeBucketMultiOutputExample {
+  import com.spotify.scio.smb._
+
+  case class AccountProjection(id: Int, amount: Double)
+  case class UserSummary(userId: Int, age: Int, totalAmount: Double, accountCount: Int)
+  case class UserDetails(userId: Int, age: Int, accounts: Seq[AccountProjection])
+  case class HighValueUser(userId: Int, age: Int, totalAmount: Double)
+
+  def pipeline(cmdLineArgs: Array[String]): ScioContext = {
+    val (sc, args) = ContextAndArgs(cmdLineArgs)
+    pipeline(sc, args)
+    sc
+  }
+
+  def pipeline(sc: ScioContext, args: Args): Unit = {
+    implicit val coder: Coder[GenericRecord] = avroGenericRecordCoder(
+      SortMergeBucketExample.UserDataSchema
+    )
+    implicit val scImpl: ScioContext = sc
+
+    // #SortMergeBucketExample_multi_output
+    // Create base collection with expensive shared computation
+    // This cogroup and map runs ONCE, results are shared across all outputs
+    val base = SMBCollection
+      .cogroup2(
+        classOf[Integer],
+        ParquetAvroSortedBucketIO
+          .read(new TupleTag[GenericRecord]("users"), SortMergeBucketExample.UserDataSchema)
+          .withFilterPredicate(FilterApi.lt(FilterApi.intColumn("age"), Int.box(50)))
+          .from(args("users")),
+        ParquetTypeSortedBucketIO
+          .read(new TupleTag[AccountProjection]("accounts"))
+          .from(args("accounts"))
+      )
+      .map { case (_, (users, accounts)) =>
+        // Expensive computation happens ONCE per key group
+        // Results are pushed to all three outputs below
+        val accountList = accounts.toSeq
+        val totalAmount = accountList.map(_.amount).sum
+        (users.toSeq, accountList, totalAmount)
+      }
+
+    // Output 1: Summary - just the aggregated metrics
+    base
+      .map { case (users, accounts, total) =>
+        UserSummary(
+          userId = users.head.get("userId").asInstanceOf[Int],
+          age = users.head.get("age").asInstanceOf[Int],
+          totalAmount = total,
+          accountCount = accounts.size
+        )
+      }
+      .saveAsSortedBucket(
+        ParquetTypeSortedBucketIO
+          .transformOutput[Integer, UserSummary]("userId")
+          .to(args("summaryOutput"))
+      )
+
+    // Output 2: Details - full account information
+    base
+      .map { case (users, accounts, _) =>
+        UserDetails(
+          userId = users.head.get("userId").asInstanceOf[Int],
+          age = users.head.get("age").asInstanceOf[Int],
+          accounts = accounts
+        )
+      }
+      .saveAsSortedBucket(
+        ParquetTypeSortedBucketIO
+          .transformOutput[Integer, UserDetails]("userId")
+          .to(args("detailsOutput"))
+      )
+
+    // Output 3: High value users only - filtered subset
+    base
+      .filter { case (_, _, total) => total > 1000.0 }
+      .map { case (users, _, total) =>
+        HighValueUser(
+          userId = users.head.get("userId").asInstanceOf[Int],
+          age = users.head.get("age").asInstanceOf[Int],
+          totalAmount = total
+        )
+      }
+      .saveAsSortedBucket(
+        ParquetTypeSortedBucketIO
+          .transformOutput[Integer, HighValueUser]("userId")
+          .to(args("highValueOutput"))
+      )
+
+    // All outputs execute automatically when sc.run() is called
+    // SMB data is read ONCE, expensive computation runs ONCE, zero shuffles!
+    // #SortMergeBucketExample_multi_output
+  }
+
+  def main(cmdLineArgs: Array[String]): Unit = {
+    val sc = pipeline(cmdLineArgs)
+    sc.run().waitUntilDone()
+    ()
+  }
+}
```
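The example above writes only SMB outputs. If a plain-text output were also needed from the same pass, the `toDeferredSCollection()` conversion described in the commit message suggests a fourth, non-SMB output could be added alongside the bucketed ones. The following is a sketch under that assumption; the `reportOutput` argument is hypothetical and not part of the example:

```scala
// Sketch only: assumes base.toDeferredSCollection().get as described in the
// commit message; args("reportOutput") is a hypothetical extra pipeline argument.
val asSCollection = base.toDeferredSCollection().get
asSCollection
  .filter { case (_, _, total) => total > 1000.0 }
  .map { case (users, _, total) => s"${users.head.get("userId")},$total" }
  .saveAsTextFile(args("reportOutput"))
```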

scio-smb/src/main/java/org/apache/beam/sdk/extensions/smb/AvroFileOperations.java

Lines changed: 25 additions & 5 deletions
```diff
@@ -18,6 +18,8 @@
 package org.apache.beam.sdk.extensions.smb;

 import java.io.IOException;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutputStream;
 import java.io.Serializable;
 import java.nio.channels.Channels;
 import java.nio.channels.ReadableByteChannel;
@@ -44,7 +46,8 @@ public class AvroFileOperations<ValueT> extends FileOperations<ValueT> {

   private final AvroDatumFactory<ValueT> datumFactory;
   private final SerializableSchemaSupplier schemaSupplier;
-  private PatchedSerializableAvroCodecFactory codec;
+  private transient CodecFactory codec;
+  private PatchedSerializableAvroCodecFactory serializableCodec;
   private Map<String, Object> metadata;

   static CodecFactory defaultCodec() {
@@ -55,7 +58,7 @@ private AvroFileOperations(AvroDatumFactory<ValueT> datumFactory, Schema schema)
     super(Compression.UNCOMPRESSED, MimeTypes.BINARY); // Avro has its own compression via codec
     this.schemaSupplier = new SerializableSchemaSupplier(schema);
     this.datumFactory = datumFactory;
-    this.codec = new PatchedSerializableAvroCodecFactory(defaultCodec());
+    this.codec = defaultCodec();
   }

   public static <V extends IndexedRecord> AvroFileOperations<V> of(
@@ -64,7 +67,7 @@ public static <V extends IndexedRecord> AvroFileOperations<V> of(
   }

   public AvroFileOperations<ValueT> withCodec(CodecFactory codec) {
-    this.codec = new PatchedSerializableAvroCodecFactory(codec);
+    this.codec = codec;
     return this;
   }

@@ -76,7 +79,7 @@ public AvroFileOperations<ValueT> withMetadata(Map<String, Object> metadata) {
   @Override
   public void populateDisplayData(Builder builder) {
     super.populateDisplayData(builder);
-    builder.add(DisplayData.item("codecFactory", codec.getCodec().getClass()));
+    builder.add(DisplayData.item("codecFactory", codec.getClass()));
     builder.add(DisplayData.item("schema", schemaSupplier.schema.getFullName()));
   }

@@ -91,7 +94,7 @@ protected FileIO.Sink<ValueT> createSink() {
     final AvroIO.Sink<ValueT> sink =
         ((AvroIO.Sink<ValueT>) AvroIO.sink(getSchema()))
             .withDatumWriterFactory(datumFactory)
-            .withCodec(codec.getCodec());
+            .withCodec(codec);

     if (metadata != null) {
       return sink.withMetadata(metadata);
@@ -110,6 +113,23 @@ Schema getSchema() {
     return schemaSupplier.get();
   }

+  /**
+   * Custom serialization to handle non-serializable CodecFactory. Converts codec to
+   * PatchedSerializableAvroCodecFactory before serialization.
+   */
+  private void writeObject(ObjectOutputStream out) throws IOException {
+    // Convert CodecFactory to serializable form
+    serializableCodec = new PatchedSerializableAvroCodecFactory(codec);
+    out.defaultWriteObject();
+  }
+
+  /** Custom deserialization to restore CodecFactory from PatchedSerializableAvroCodecFactory. */
+  private void readObject(ObjectInputStream in) throws IOException, ClassNotFoundException {
+    in.defaultReadObject();
+    // Restore CodecFactory from serializable form
+    codec = serializableCodec.getCodec();
+  }
+
   private static class SerializableSchemaString implements Serializable {
     private final String schema;
```
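The diff above applies a standard Java serialization idiom: hold the non-serializable `CodecFactory` in a `transient` field and swap in a serializable stand-in inside `writeObject`/`readObject`. Below is a minimal, self-contained sketch of the same pattern in Scala; the `Heavy`/`Holder` names are hypothetical and this is not Scio code:

```scala
import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

// A class we cannot serialize directly; only its description is serializable.
final class Heavy(val name: String) // deliberately NOT Serializable

class Holder(@transient private var heavy: Heavy) extends Serializable {
  private var heavyName: String = _ // serializable stand-in, set just before serialization

  def get: Heavy = heavy

  private def writeObject(out: ObjectOutputStream): Unit = {
    heavyName = heavy.name // convert to serializable form
    out.defaultWriteObject()
  }

  private def readObject(in: ObjectInputStream): Unit = {
    in.defaultReadObject()
    heavy = new Heavy(heavyName) // restore from serializable form
  }
}

object SerializationPatternSketch extends App {
  // Round trip: the transient field is rebuilt from the serializable stand-in.
  val bytes = new ByteArrayOutputStream()
  val oos   = new ObjectOutputStream(bytes)
  oos.writeObject(new Holder(new Heavy("deflate")))
  oos.close()
  val restored = new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray))
    .readObject()
    .asInstanceOf[Holder]
  println(restored.get.name) // prints "deflate"
}
```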
