diff --git a/.github/workflows/almaren-framework.yml b/.github/workflows/almaren-framework.yml
new file mode 100644
index 00000000..7037615f
--- /dev/null
+++ b/.github/workflows/almaren-framework.yml
@@ -0,0 +1,50 @@
+# CI workflow: on every push and pull request, runs the sbt test suite
+# (Scala 2.12.10) with a PostgreSQL service container published on port 5432.
+name: Almaren Framework
+on: [push, pull_request]
+
+jobs:
+  Build:
+    # NOTE(review): the runner image and the action versions pinned below are
+    # old; confirm they are still supported before relying on this workflow.
+    runs-on: ubuntu-20.04
+    services:
+      postgres:
+        image: postgres:13.4
+        env:
+          POSTGRES_PASSWORD: postgres
+          # With "trust" the server accepts connections without checking the
+          # password, so the psql calls in the build step keep working after
+          # the postgres password is changed to 'foo'.
+          POSTGRES_HOST_AUTH_METHOD: trust
+        ports:
+          - 5432:5432
+        # Container health check: pg_isready every 10s, 5s timeout, 5 retries.
+        options: >-
+          --health-cmd pg_isready
+          --health-interval 10s
+          --health-timeout 5s
+          --health-retries 5
+    steps:
+      - name: Check out repository code
+        uses: actions/checkout@v2
+      - name: Setup JDK
+        uses: actions/setup-java@v3
+        with:
+          distribution: temurin
+          java-version: 8
+          cache: sbt
+      # Create the "almaren" database and a superuser "runner" role, run the
+      # tests, then delete $HOME/.ivy2/local plus ivydata-*.properties and
+      # *.lock files. "|| true" keeps the step green if a path is missing.
+      - name: Build and test scala version
+        run: |
+          PGPASSWORD="postgres" psql -c 'create database almaren;' -U postgres -h localhost
+          PGPASSWORD="postgres" psql -c "ALTER USER postgres PASSWORD 'foo' ;" -U postgres -h localhost
+          PGPASSWORD="postgres" psql -c 'create role runner;' -U postgres -h localhost
+          PGPASSWORD="postgres" psql -c 'ALTER ROLE "runner" WITH LOGIN SUPERUSER INHERIT CREATEDB CREATEROLE REPLICATION;' -U postgres -h localhost
+          sbt ++2.12.10 test
+          rm -rf "$HOME/.ivy2/local" || true
+          find "$HOME/Library/Caches/Coursier/v1" -name "ivydata-*.properties" -delete || true
+          find "$HOME/.ivy2/cache" -name "ivydata-*.properties" -delete || true
+          find "$HOME/.cache/coursier/v1" -name "ivydata-*.properties" -delete || true
+          find "$HOME/.sbt" -name "*.lock" -delete || true
diff --git a/README.md b/README.md
index e7bf485e..4b925a7e 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 
 The Almaren Framework provides a simplified consistent minimalistic layer over Apache Spark, while still allowing you to take advantage of native Apache Spark features. You can even combine it with standard Spark code.
 
-[![Build Status](https://travis-ci.com/mantovani/almaren-framework.svg?branch=master)](https://travis-ci.com/mantovani/almaren-framework)
+[![Build Status](https://github.com/music-of-the-ainur/almaren-framework/actions/workflows/almaren-framework.yml/badge.svg)](https://github.com/music-of-the-ainur/almaren-framework/actions/workflows/almaren-framework.yml)
 [![Gitter Community](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/music-of-the-ainur/community)
 
 ## Table of Contents
diff --git a/src/test/scala/com/github/music/of/the/ainur/almaren/Test.scala b/src/test/scala/com/github/music/of/the/ainur/almaren/Test.scala
index d28039d4..1ed76a20 100644
--- a/src/test/scala/com/github/music/of/the/ainur/almaren/Test.scala
+++ b/src/test/scala/com/github/music/of/the/ainur/almaren/Test.scala
@@ -113,6 +113,7 @@ class Test extends FunSuite with BeforeAndAfter {
   deserializerXmlTest()
   deserializerAvroTest()
   deserializerCsvTest()
+  deserializerCsvSampleOptionsTest()
   testInferSchemaJsonColumn()
   testInferSchemaDataframe(moviesDf)
 
@@ -435,6 +436,35 @@ class Test extends FunSuite with BeforeAndAfter {
     test(newCsvSchemaDf, csvSchemaDf, "Deserialize CSV Schema")
   }
 
+  /**
+   * Deserializes the comma-separated "first_name" column as header-less CSV
+   * with the sampling options set, and compares the result with the stored
+   * csvDeserializer.parquet fixture.
+   */
+  def deserializerCsvSampleOptionsTest(): Unit = {
+    val people = Seq(
+      ("John,Chris", "Smith", "London"),
+      ("David,Michael", "Jones", "India"),
+      ("Joseph,Mike", "Lee", "Russia"),
+      ("Chris,Tony", "Brown", "Indonesia")
+    ).toDF("first_name", "last_name", "country")
+
+    // Options passed to the CSV deserializer.
+    val csvOptions = Map(
+      "header" -> "false",
+      "samplingRatio" -> "0.5",
+      "samplingMaxLines" -> "1"
+    )
+
+    val actual = almaren.builder
+      .sourceDataFrame(people)
+      .deserializer("CSV", "first_name", options = csvOptions)
+      .batch
+    val expected = spark.read.parquet("src/test/resources/data/csvDeserializer.parquet")
+
+    test(actual, expected, "Deserialize CSV Sample Options")
+  }
+
   def deserializerXmlTest(): Unit = {
     val xmlStr = Seq(
       """