apache · WweiL · Feb 1, 2025 · Feb 1, 2025 · Feb 1, 2025
diff --git a/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py b/python/pyspark/sql/connect/streaming/worker/foreach_batch_worker.py
@@ -43,26 +43,29 @@
 
 def main(infile: IO, outfile: IO) -> None:
     global spark
-    check_python_version(infile)
 
-    # Enable Spark Connect Mode
-    os.environ["SPARK_CONNECT_MODE_ENABLED"] = "1"
+    log_name = "Streaming ForeachBatch worker"
 
-    connect_url = os.environ["SPARK_CONNECT_LOCAL_URL"]
-    session_id = utf8_deserializer.loads(infile)
+    def init():
+        check_python_version(infile)
 
-    print(
-        "Streaming foreachBatch worker is starting with "
-        f"url {connect_url} and sessionId {session_id}."
-    )
+        # Enable Spark Connect Mode
+        os.environ["SPARK_CONNECT_MODE_ENABLED"] = "1"
 
-    # To attach to the existing SparkSession, we're setting the session_id in the URL.
-    connect_url = connect_url + ";session_id=" + session_id
-    spark_connect_session = SparkSession.builder.remote(connect_url).getOrCreate()
-    assert spark_connect_session.session_id == session_id
-    spark = spark_connect_session
+        connect_url = os.environ["SPARK_CONNECT_LOCAL_URL"]
+        session_id = utf8_deserializer.loads(infile)
 
-    log_name = "Streaming ForeachBatch worker"
+        print(f"{log_name} is starting with " f"url {connect_url} and sessionId {session_id}.")
+
+        # To attach to the existing SparkSession, we're setting the session_id in the URL.
+        connect_url = connect_url + ";session_id=" + session_id
+        spark_connect_session = SparkSession.builder.remote(connect_url).getOrCreate()
+        assert spark_connect_session.session_id == session_id
+        spark = spark_connect_session
+
+        func = worker.read_command(pickle_ser, infile)
+        write_int(0, outfile)
+        outfile.flush()
 
     def process(df_id, batch_id):  # type: ignore[no-untyped-def]
         global spark
@@ -72,9 +75,7 @@ def process(df_id, batch_id):  # type: ignore[no-untyped-def]
         print(f"{log_name} Completed batch {batch_id} with DF id {df_id}")
 
     try:
-        func = worker.read_command(pickle_ser, infile)
-        write_int(0, outfile)
-        outfile.flush()
+        init()
 
         while True:
             df_ref_id = utf8_deserializer.loads(infile)