
Commit 0c18eda

Merge pull request #30 from fivetran/MagicBot/databricks-compatibility
Feature: Databricks compatibility

2 parents: 7863ee1 + f048089

14 files changed (+80 -43 lines)

.buildkite/hooks/pre-command (+2 -1)

```diff
@@ -21,4 +21,5 @@ export CI_SNOWFLAKE_DBT_USER=$(gcloud secrets versions access latest --secret="C
 export CI_SNOWFLAKE_DBT_WAREHOUSE=$(gcloud secrets versions access latest --secret="CI_SNOWFLAKE_DBT_WAREHOUSE" --project="dbt-package-testing-363917")
 export CI_DATABRICKS_DBT_HOST=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HOST" --project="dbt-package-testing-363917")
 export CI_DATABRICKS_DBT_HTTP_PATH=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_HTTP_PATH" --project="dbt-package-testing-363917")
-export CI_DATABRICKS_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_TOKEN" --project="dbt-package-testing-363917")
+export CI_DATABRICKS_DBT_TOKEN=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_TOKEN" --project="dbt-package-testing-363917")
+export CI_DATABRICKS_DBT_CATALOG=$(gcloud secrets versions access latest --secret="CI_DATABRICKS_DBT_CATALOG" --project="dbt-package-testing-363917")
```

.buildkite/pipeline.yml (+15)

```diff
@@ -57,3 +57,18 @@ steps:
             - "CI_REDSHIFT_DBT_USER"
     commands: |
       bash .buildkite/scripts/run_models.sh redshift
+
+  - label: ":databricks: Run Tests - Databricks"
+    key: "run_dbt_databricks"
+    plugins:
+      - docker#v3.13.0:
+          image: "python:3.8"
+          shell: [ "/bin/bash", "-e", "-c" ]
+          environment:
+            - "BASH_ENV=/tmp/.bashrc"
+            - "CI_DATABRICKS_DBT_HOST"
+            - "CI_DATABRICKS_DBT_HTTP_PATH"
+            - "CI_DATABRICKS_DBT_TOKEN"
+            - "CI_DATABRICKS_DBT_CATALOG"
+    commands: |
+      bash .buildkite/scripts/run_models.sh databricks
```

CHANGELOG.md (+15 -4)

```diff
@@ -1,10 +1,21 @@
-# dbt_iterable v0.UPDATE.UPDATE
+# dbt_iterable v0.8.0
+[PR #30](https://github.com/fivetran/dbt_iterable/pull/30) includes the following updates:
+## 🚨 Breaking Changes 🚨 (recommend `--full-refresh`)
+- Updated the incremental strategy for end model `iterable__events`:
+  - For BigQuery, Spark, and Databricks, the strategy has been updated to `insert_overwrite`.
+  - For Snowflake, Redshift, and PostgreSQL, the strategy has been updated to `delete+insert`.
+  - We recommend running `dbt run --full-refresh` the next time you run your project.
+## 🎉 Feature Update 🎉
+- Databricks compatibility for Runtime 12.2 or later.
+  - Note: some models may run on an earlier runtime; however, 12.2 or later is required to run all models, because of syntax changes in how earlier versions handle arrays and JSON.
+  - We also recommend the `dbt-databricks` adapter over `dbt-spark`, because the two adapters handle incremental models differently. If you must use the `dbt-spark` adapter and run into issues, please refer to [this section](https://docs.getdbt.com/reference/resource-configs/spark-configs#the-insert_overwrite-strategy) of dbt's Spark configuration documentation.
 
-## Under the Hood:
-
-- Incorporated the new `fivetran_utils.drop_schemas_automation` macro into the end of each Buildkite integration test job.
+[PR #27](https://github.com/fivetran/dbt_iterable/pull/27) includes the following updates:
+## 🚘 Under the Hood 🚘
+- Incorporated the new `fivetran_utils.drop_schemas_automation` macro into the end of each Buildkite integration test job.
 - Updated the pull request [templates](/.github).
 
+
 # dbt_iterable v0.7.0
 [PR #28](https://github.com/fivetran/dbt_iterable/pull/28) adds the following changes:
 
```
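For context on the breaking change above, this is the new target-dependent configuration, condensed from the `models/iterable__events.sql` diff later in this commit. On BigQuery, Spark, and Databricks, `insert_overwrite` rebuilds only the partitions a run touches (hence the `partition_by` on `created_on`), while the other warehouses fall back to `delete+insert` keyed on `event_id`:

```sql
{{ config(
    materialized='incremental',
    unique_key='event_id',
    incremental_strategy='insert_overwrite' if target.type in ('bigquery', 'spark', 'databricks') else 'delete+insert',
    partition_by={"field": "created_on", "data_type": "date"} if target.type not in ('spark','databricks') else ['created_on'],
    file_format='parquet',
    on_schema_change='fail'
) }}
```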
README.md (+7 -3)

````diff
@@ -42,7 +42,11 @@ The following table provides a detailed list of all models materialized within t
 To use this dbt package, you must have the following:
 
 - At least one Fivetran Iterable connector syncing data into your destination.
-- A **BigQuery**, **Snowflake**, **Redshift**, or **PostgreSQL** destination.
+- A **BigQuery**, **Snowflake**, **Redshift**, **PostgreSQL**, or **Databricks** destination.
+
+### Databricks Configuration
+- **Databricks Runtime 12.2** or later is required to run all models in this package.
+- We also recommend the `dbt-databricks` adapter over `dbt-spark`, because the two adapters handle incremental models differently. If you must use the `dbt-spark` adapter and run into issues, please refer to [this section](https://docs.getdbt.com/reference/resource-configs/spark-configs#the-insert_overwrite-strategy) of dbt's Spark configuration documentation.
 
 ## Step 2: Install the package
 Include the following Iterable package version in your `packages.yml` file.
@@ -52,7 +56,7 @@ Include the following Iterable package version in your `packages.yml` file.
 ```yaml
 packages:
   - package: fivetran/iterable
-    version: [">=0.7.0", "<0.8.0"]
+    version: [">=0.8.0", "<0.9.0"]
 ```
 ## Step 3: Define database and schema variables
 By default, this package runs using your destination and the `iterable` schema of your [target database](https://docs.getdbt.com/docs/running-a-dbt-project/using-the-command-line-interface/configure-your-profile). If this is not where your Iterable data is located (for example, if your Iterable schema is named `iterable_fivetran`), add the following configuration to your root `dbt_project.yml` file:
@@ -143,7 +147,7 @@ packages:
     version: [">=1.0.0", "<2.0.0"]
 
   - package: fivetran/iterable_source
-    version: [">=0.6.0", "<0.7.0"]
+    version: [">=0.7.0", "<0.8.0"]
 ```
 
 # 🙌 How is this package maintained and can I contribute?
````

dbt_project.yml (+1 -1)

```diff
@@ -1,5 +1,5 @@
 name: 'iterable'
-version: '0.7.0'
+version: '0.8.0'
 config-version: 2
 require-dbt-version: [">=1.3.0", "<2.0.0"]
 models:
```

docs/catalog.json (+1 -1)
docs/index.html (+4 -4)
docs/manifest.json (+1 -1)
docs/run_results.json (+1 -1)

Large diffs are not rendered by default.

integration_tests/ci/sample.profiles.yml (+7 -7)

```diff
@@ -16,13 +16,13 @@ integration_tests:
       pass: "{{ env_var('CI_REDSHIFT_DBT_PASS') }}"
       dbname: "{{ env_var('CI_REDSHIFT_DBT_DBNAME') }}"
       port: 5439
-      schema: iterable_integration_tests
+      schema: iterable_integration_tests_03
       threads: 8
     bigquery:
       type: bigquery
       method: service-account-json
       project: 'dbt-package-testing'
-      schema: iterable_integration_tests
+      schema: iterable_integration_tests_03
       threads: 8
       keyfile_json: "{{ env_var('GCLOUD_SERVICE_KEY') | as_native }}"
     snowflake:
@@ -33,7 +33,7 @@ integration_tests:
       role: "{{ env_var('CI_SNOWFLAKE_DBT_ROLE') }}"
       database: "{{ env_var('CI_SNOWFLAKE_DBT_DATABASE') }}"
       warehouse: "{{ env_var('CI_SNOWFLAKE_DBT_WAREHOUSE') }}"
-      schema: iterable_integration_tests
+      schema: iterable_integration_tests_03
       threads: 8
     postgres:
       type: postgres
@@ -42,13 +42,13 @@ integration_tests:
       pass: "{{ env_var('CI_POSTGRES_DBT_PASS') }}"
       dbname: "{{ env_var('CI_POSTGRES_DBT_DBNAME') }}"
       port: 5432
-      schema: iterable_integration_tests
+      schema: iterable_integration_tests_03
       threads: 8
     databricks:
-      catalog: null
+      catalog: "{{ env_var('CI_DATABRICKS_DBT_CATALOG') }}"
       host: "{{ env_var('CI_DATABRICKS_DBT_HOST') }}"
       http_path: "{{ env_var('CI_DATABRICKS_DBT_HTTP_PATH') }}"
-      schema: iterable_integration_tests
-      threads: 2
+      schema: iterable_integration_tests_03
+      threads: 8
       token: "{{ env_var('CI_DATABRICKS_DBT_TOKEN') }}"
       type: databricks
```
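A note on the `databricks` target above: with the `dbt-databricks` adapter, `catalog` selects the Unity Catalog the test schemas are written to. Sourcing it from `CI_DATABRICKS_DBT_CATALOG`, exported by the pre-command hook earlier in this commit, lets CI repoint the same profile at a different catalog without editing the file.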

integration_tests/dbt_project.yml (+6 -3)

```diff
@@ -1,10 +1,10 @@
 config-version: 2
 name: 'iterable_integration_tests'
-version: '0.7.0'
+version: '0.8.0'
 profile: 'integration_tests'
 vars:
   iterable_source:
-    iterable_schema: iterable_integration_tests
+    iterable_schema: iterable_integration_tests_03
     iterable_campaign_history_identifier: "campaign_history_data"
     iterable_campaign_label_history_identifier: "campaign_label_history_data"
     iterable_campaign_list_history_identifier: "campaign_list_history_data"
@@ -46,4 +46,7 @@ seeds:
       message_type_id: "{%- if target.type == 'bigquery' -%} INT64 {%- else -%} bigint {%- endif -%}"
     user_history_data:
       +column_types:
-        updated_at: timestamp
+        updated_at: timestamp
+dispatch:
+  - macro_namespace: dbt_utils
+    search_order: ['spark_utils', 'dbt_utils']
```
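The new top-level `dispatch` block changes macro resolution for the test project: any call to a `dbt_utils` macro is first looked up in `spark_utils`, which supplies Spark/Databricks-compatible shims for several `dbt_utils` macros, before falling back to `dbt_utils` itself. A minimal illustration (the macro call and column names here are illustrative, not taken from the package):

```sql
-- On a Spark/Databricks target, this resolves to a spark_utils
-- implementation when a shim exists; on every other target (or when
-- no shim exists) it falls back to the stock dbt_utils macro.
select {{ dbt_utils.generate_surrogate_key(['email', 'list_id']) }} as example_key
from {{ ref('iterable__events') }}
```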

models/intermediate/int_iterable__list_user_unnest.sql (+12 -6)

```diff
@@ -3,7 +3,7 @@
         unique_key='unique_key',
         incremental_strategy='insert_overwrite' if target.type in ('bigquery', 'spark', 'databricks') else 'delete+insert',
         partition_by={"field": "date_day", "data_type": "date"} if target.type not in ('spark','databricks') else ['date_day'],
-        file_format='delta',
+        file_format='parquet',
         on_schema_change='fail'
     )
 }}
@@ -71,9 +71,13 @@ with user_history as (
         is_current,
         email_list_ids,
         case when email_list_ids != '[]' then
-            {% if target.type == 'snowflake' %}
-            email_list_id.value
-            {% else %} email_list_id {% endif %} else null end as email_list_id
+            {% if target.type == 'snowflake' %}
+                email_list_id.value
+            {% elif target.type in ('spark','databricks') %}
+                email_list_id.col
+            {% else %} email_list_id {% endif %}
+            else null
+            end as email_list_id
 
     from user_history
 
@@ -83,8 +87,10 @@
     {% elif target.type == 'bigquery' %}
     cross join
         unnest(JSON_EXTRACT_STRING_ARRAY(email_list_ids)) as email_list_id
-    {% else %}
-    {# postgres #}
+    {% elif target.type in ('spark','databricks') %}
+    cross join
+        lateral explode_outer(from_json(email_list_ids, 'array<int>')) as email_list_id
+    {% else %} {# target is postgres #}
     cross join
         json_array_elements_text(cast((
             case when email_list_ids = '[]' then '["is_null"]' {# to not remove empty array-rows #}
```
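To see what the new Spark/Databricks branch does, here is a self-contained sketch you could run in a Databricks SQL editor on Runtime 12.2 or later (the email and list ids are made up). `from_json` parses the serialized JSON array, `explode_outer` emits one row per element while preserving a null row for empty arrays (mirroring the `'[]'` handling above), and the exploded value arrives under the default column name `col`, which is why the model selects `email_list_id.col`:

```sql
select
    users.email,
    email_list_id.col as email_list_id
from (
    select 'user@example.com' as email, '[101, 102]' as email_list_ids
) as users
cross join lateral explode_outer(from_json(users.email_list_ids, 'array<int>')) as email_list_id
```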

models/iterable__events.sql (+6 -9)

```diff
@@ -1,14 +1,11 @@
-{{
-    config(
+{{ config(
     materialized='incremental',
     unique_key='event_id',
-    partition_by={
-        "field": "created_on",
-        "data_type": "date"
-    } if target.type == 'bigquery' else none,
-    incremental_strategy = 'merge' if target.type not in ('snowflake', 'postgres', 'redshift') else 'delete+insert',
-    file_format = 'delta'
-    )
+    incremental_strategy='insert_overwrite' if target.type in ('bigquery', 'spark', 'databricks') else 'delete+insert',
+    partition_by={"field": "created_on", "data_type": "date"} if target.type not in ('spark','databricks') else ['created_on'],
+    file_format='parquet',
+    on_schema_change='fail'
+    )
 }}
 
 with events as (
```
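Two details of this rewrite are worth calling out. First, the `partition_by` shape differs by adapter: BigQuery expects a `{"field", "data_type"}` spec, while Spark and Databricks take a plain column list. Second, the file format moves from Delta to Parquet, which an existing incremental table cannot adopt in place; together with the new strategies, that is why the changelog recommends a one-time `dbt run --full-refresh`.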

packages.yml (+2 -2)

```diff
@@ -1,3 +1,3 @@
 packages:
-  - package: fivetran/iterable_source
-    version: [">=0.6.0", "<0.7.0"]
+  - package: fivetran/iterable_source
+    version: [">=0.7.0", "<0.8.0"]
```
