Commit b913888

Merge pull request #62 from badrinathpatchikolla/spark-3.3

Added Test GitHub Actions Pipeline and Resolved Issue #61

mantovani authored Oct 4, 2022
2 parents 33e2d6a + 2df0334 commit b913888
Showing 4 changed files with 61 additions and 39 deletions.
41 changes: 41 additions & 0 deletions .github/workflows/almaren-framework.yml
@@ -0,0 +1,41 @@
name: Almaren Framework
on: [push, pull_request]

jobs:
  Build:
    runs-on: ubuntu-20.04
    services:
      postgres:
        image: postgres:13.4
        env:
          POSTGRES_PASSWORD: postgres
          POSTGRES_HOST_AUTH_METHOD: trust
        ports:
          - 5432:5432
        options: >-
          --health-cmd pg_isready
          --health-interval 10s
          --health-timeout 5s
          --health-retries 5
    steps:
      - name: Check out repository code
        uses: actions/checkout@v2
      - name: Setup JDK
        uses: actions/setup-java@v3
        with:
          distribution: temurin
          java-version: 8
          cache: sbt
      - name: Build and test scala version
        run: |
          PGPASSWORD="postgres" psql -c 'create database almaren;' -U postgres -h localhost
          PGPASSWORD="postgres" psql -c "ALTER USER postgres PASSWORD 'foo' ;" -U postgres -h localhost
          PGPASSWORD="postgres" psql -c 'create role runner;' -U postgres -h localhost
          PGPASSWORD="postgres" psql -c 'ALTER ROLE "runner" WITH LOGIN SUPERUSER INHERIT CREATEDB CREATEROLE REPLICATION;' -U postgres -h localhost
          sbt ++2.12.10 test
          sbt ++2.13.9 test
          rm -rf "$HOME/.ivy2/local" || true
          find $HOME/Library/Caches/Coursier/v1 -name "ivydata-*.properties" -delete || true
          find $HOME/.ivy2/cache -name "ivydata-*.properties" -delete || true
          find $HOME/.cache/coursier/v1 -name "ivydata-*.properties" -delete || true
          find $HOME/.sbt -name "*.lock" -delete || true
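
The two sbt invocations in the final step cross-build and test against Scala 2.12.10 and 2.13.9. A minimal build.sbt sketch of the setting this typically pairs with (illustrative only; the build definition is not part of this commit):

// Hypothetical build.sbt excerpt: the versions that `sbt ++2.12.10 test`
// and `sbt ++2.13.9 test` switch between.
scalaVersion := "2.12.10"
crossScalaVersions := Seq("2.12.10", "2.13.9")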
35 changes: 0 additions & 35 deletions .travis.yml

This file was deleted.

2 changes: 1 addition & 1 deletion README.md
@@ -2,7 +2,7 @@

The Almaren Framework provides a simplified consistent minimalistic layer over Apache Spark, while still allowing you to take advantage of native Apache Spark features. You can even combine it with standard Spark code.

-[![Build Status](https://travis-ci.com/mantovani/almaren-framework.svg?branch=master)](https://travis-ci.com/mantovani/almaren-framework)
+[![Build Status](https://github.com/music-of-the-ainur/almaren-framework/actions/workflows/almaren-framework.yml/badge.svg)](https://github.com/music-of-the-ainur/almaren-framework/actions/workflows/almaren-framework.yml)
[![Gitter Community](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/music-of-the-ainur/community)

## Table of Contents
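The README's point that Almaren pipelines can be mixed with standard Spark code looks roughly like the following. This is a hedged sketch built from the builder calls used in the test changes below; the `almaren` instance and input DataFrame are assumed, and `.batch` is taken to return a plain Spark DataFrame, as the tests imply:

import org.apache.spark.sql.DataFrame

// Hypothetical sketch, not code from this commit.
// `almaren` is an Almaren instance and `df` any DataFrame with a
// CSV-formatted string column named "first_name" (both assumed here).
val parsed: DataFrame = almaren.builder
  .sourceDataFrame(df)
  .deserializer("CSV", "first_name", options = Map("header" -> "false"))
  .batch

// From here on it is ordinary Spark code.
parsed.printSchema()
println(parsed.count())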
22 changes: 19 additions & 3 deletions src/test/scala/com/github/music/of/the/ainur/almaren/Test.scala
@@ -114,6 +114,7 @@ class Test extends AnyFunSuite with BeforeAndAfter {
  deserializerXmlTest()
  deserializerAvroTest()
  deserializerCsvTest()
  deserializerCsvSampleOptionsTest()
  testInferSchemaJsonColumn()
  testInferSchemaDataframe(moviesDf)

@@ -436,6 +437,22 @@ class Test extends AnyFunSuite with BeforeAndAfter {
    test(newCsvSchemaDf, csvSchemaDf, "Deserialize CSV Schema")
  }

  def deserializerCsvSampleOptionsTest(): Unit = {
    val df = Seq(
      ("John,Chris", "Smith", "London"),
      ("David,Michael", "Jones", "India"),
      ("Joseph,Mike", "Lee", "Russia"),
      ("Chris,Tony", "Brown", "Indonesia"),
    ).toDF("first_name", "last_name", "country")
    val newCsvDF = almaren.builder
      .sourceDataFrame(df)
      .deserializer("CSV", "first_name", options = Map("header" -> "false",
        "samplingRatio" -> "0.5",
        "samplingMaxLines" -> "1"))
      .batch
    val csvDf = spark.read.parquet("src/test/resources/data/csvDeserializer.parquet")
    test(newCsvDF, csvDf, "Deserialize CSV Sample Options")
  }
  def deserializerXmlTest(): Unit = {
    val xmlStr = Seq(
      """ <json_string>
@@ -497,15 +514,14 @@ class Test extends AnyFunSuite with BeforeAndAfter {
    val jsonStr = Seq("""{"name":"John","age":21,"address":"New York"}""",
      """{"name":"Peter","age":18,"address":"Prague"}""",
      """{"name":"Tony","age":40,"address":"New York"}""").toDF("json_string").createOrReplaceTempView("sample_json_table")

    val df = spark.sql("select * from sample_json_table")
-   val jsonSchema = "`address` STRING,`age` BIGINT,`name` STRING"
+   val jsonSchema = "address STRING,age BIGINT,name STRING"
    val generatedSchema = Util.genDDLFromJsonString(df, "json_string", 0.1)
    testSchema(jsonSchema, generatedSchema, "Test infer schema for json column")
  }

  def testInferSchemaDataframe(df: DataFrame): Unit = {
-   val dfSchema = "`cast` ARRAY<STRING>,`genres` ARRAY<STRING>,`title` STRING,`year` BIGINT"
+   val dfSchema = "cast ARRAY<STRING>,genres ARRAY<STRING>,title STRING,year BIGINT"
    val generatedSchema = Util.genDDLFromDataFrame(df, 0.1)
    testSchema(dfSchema, generatedSchema, "Test infer schema for dataframe")
  }
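The reworked assertions expect the generated DDL without backquoted column names. Both spellings are valid Spark DDL schema strings and parse to the same schema; a quick sanity check along these lines (hypothetical, not part of the commit) would be:

import org.apache.spark.sql.types.StructType

// Backticks only quote the identifiers; the parsed schemas are identical.
val plain  = StructType.fromDDL("address STRING,age BIGINT,name STRING")
val quoted = StructType.fromDDL("`address` STRING,`age` BIGINT,`name` STRING")
assert(plain == quoted)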
