From f218e5c451a53a2f11baad4968bb152ab1896db9 Mon Sep 17 00:00:00 2001 From: Brad C Date: Sat, 13 Apr 2024 10:34:06 -0700 Subject: [PATCH 1/6] Fix typo in comments, and add 2 new CTEs to fct_model_directories model. --- .../marts/structure/fct_model_directories.sql | 35 ++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index 082fde7c..328fc27d 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -1,5 +1,5 @@ -- This model finds all cases where a model is NOT in the appropriate subdirectory: - -- For staging models: The files should be in nested in the staging folder in a subfolder that matches their source parent's name. + -- For staging models: The files should be nested in the staging folder in a subfolder that matches their source parent's name. -- For non-staging models: The files should be nested closest to their appropriate folder. {% set directory_pattern = get_directory_pattern() %} @@ -32,6 +32,37 @@ staging_models as ( and child_model_type = 'staging' ), +--Add this to get all the stage models with count of parent source. Next step will be to count which ones have more than one source parent +staging_by_parent_source_count as ( + select + count(distinct parent_source_name) as resource_count + ,child as resource_name + ,child_resource_type as resource_type + ,child_model_type as model_type + ,child_file_path as current_file_path + from staging_models + group by child, child_resource_type, child_model_type, child_file_path +), + +--Added this CTE to listagg() the multiple suggested paths and advise the user to split the source file into those two places. 
+multiple_sources_staging_to_split as ( + select + staging_models.child as resource_name, + staging_models.child_resource_type as resource_type, + staging_models.child_model_type as model_type, + staging_models.child_file_path as current_file_path, + 'More than one source. Split into separate staging models in: ' || + listagg('models/' || 'staging' || '/' || staging_models.parent_source_name || '/', ' AND ') + within group(order by 'models/' || 'staging' || '/' || staging_models.parent_source_name || '/') as change_file_path_to + from staging_models + join staging_by_parent_source_count on staging_models.child = staging_by_parent_source_count.resource_name and + staging_models.child_resource_type = staging_by_parent_source_count.resource_type and + staging_models.child_model_type = staging_by_parent_source_count.model_type and + staging_models.child_file_path = staging_by_parent_source_count.current_file_path + where staging_by_parent_source_count.resource_count > 1 + group by staging_models.child, staging_models.child_resource_type, staging_models.child_model_type, staging_models.child_file_path +), + -- find all staging models that are NOT in their source parent's subdirectory inappropriate_subdirectories_staging as ( select distinct -- must do distinct to avoid duplicates when staging model has multiple paths to a given source @@ -63,6 +94,8 @@ unioned as ( select * from inappropriate_subdirectories_staging union all select * from innappropriate_subdirectories_non_staging_models + union all --Added union all to append these results + select * from multiple_sources_staging_to_split ) select * from unioned From 9dd7ae1a10b02e2af90cb53fbea0f7ded924b8b3 Mon Sep 17 00:00:00 2001 From: Brad C Date: Sat, 1 Jun 2024 15:00:55 -0700 Subject: [PATCH 2/6] Added listagg() for models with more than one source. 
--- models/marts/structure/fct_model_directories.sql | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index 328fc27d..4b7c3e5c 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -1,6 +1,8 @@ -- This model finds all cases where a model is NOT in the appropriate subdirectory: -- For staging models: The files should be nested in the staging folder in a subfolder that matches their source parent's name. -- For non-staging models: The files should be nested closest to their appropriate folder. +{{ config(materialized='table') }} + {% set directory_pattern = get_directory_pattern() %} with all_graph_resources as ( @@ -40,8 +42,10 @@ staging_by_parent_source_count as ( ,child_resource_type as resource_type ,child_model_type as model_type ,child_file_path as current_file_path + ,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' as list_agg_string from staging_models group by child, child_resource_type, child_model_type, child_file_path + ,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' ), --Added this CTE to listagg() the multiple suggested paths and advise the user to split the source file into those two places. @@ -52,8 +56,9 @@ multiple_sources_staging_to_split as ( staging_models.child_model_type as model_type, staging_models.child_file_path as current_file_path, 'More than one source. 
Split into separate staging models in: ' || - listagg('models/' || 'staging' || '/' || staging_models.parent_source_name || '/', ' AND ') - within group(order by 'models/' || 'staging' || '/' || staging_models.parent_source_name || '/') as change_file_path_to + -- {# {{ dbt.listagg(measure="list_agg_string", delimiter_text="' AND '", order_by_clause="list_agg_string") }} as change_file_path_to + -- #} + listagg(list_agg_string, ' AND ') as change_file_path_to --Using listagg because list_agg_string() isn't working for Duckdb from staging_models join staging_by_parent_source_count on staging_models.child = staging_by_parent_source_count.resource_name and staging_models.child_resource_type = staging_by_parent_source_count.resource_type and From d38588b29c2389eb25d8e87982fde16fa80af687 Mon Sep 17 00:00:00 2001 From: Brad C Date: Sat, 12 Oct 2024 14:23:06 -0700 Subject: [PATCH 3/6] Used dbt.listagg() to create a cross-platform string-aggregated column. --- models/marts/structure/fct_model_directories.sql | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index 4b7c3e5c..52918c5c 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -56,9 +56,7 @@ multiple_sources_staging_to_split as ( staging_models.child_model_type as model_type, staging_models.child_file_path as current_file_path, 'More than one source. 
Split into separate staging models in: ' || - -- {# {{ dbt.listagg(measure="list_agg_string", delimiter_text="' AND '", order_by_clause="list_agg_string") }} as change_file_path_to - -- #} - listagg(list_agg_string, ' AND ') as change_file_path_to --Using listagg because list_agg_string() isn't working for Duckdb + {{ dbt.listagg(measure="list_agg_string", delimiter_text="' AND '", order_by_clause="order by current_file_path") }} as change_file_path_to from staging_models join staging_by_parent_source_count on staging_models.child = staging_by_parent_source_count.resource_name and staging_models.child_resource_type = staging_by_parent_source_count.resource_type and From 6987f6fd739d7cc0536858b02fce5c10a7caa32d Mon Sep 17 00:00:00 2001 From: Brad C Date: Sat, 12 Oct 2024 15:32:03 -0700 Subject: [PATCH 4/6] Modify path delimiter to use jinja function instead of hard-coded values. --- models/marts/structure/fct_model_directories.sql | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index 52918c5c..6b068410 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -42,10 +42,12 @@ staging_by_parent_source_count as ( ,child_resource_type as resource_type ,child_model_type as model_type ,child_file_path as current_file_path - ,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' as list_agg_string + --,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' as list_agg_string + ,'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' as list_agg_string from staging_models group by child, child_resource_type, child_model_type, child_file_path - ,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' + --,'models/' || 'staging' || '/' || 
staging_models.parent_source_name || '/' + ,'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' ), --Added this CTE to listagg() the multiple suggested paths and advise the user to split the source file into those two places. From eba70259a7554369e5b6f47ec4a837c84160b72c Mon Sep 17 00:00:00 2001 From: Brad C Date: Sat, 12 Oct 2024 15:53:03 -0700 Subject: [PATCH 5/6] Changed directory format and leading commas back to trailing commas. --- .../marts/structure/fct_model_directories.sql | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index 6b068410..c5efbff6 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -37,17 +37,15 @@ staging_models as ( --Add this to get all the stage models with count of parent source. Next step will be to count which ones have more than one source parent staging_by_parent_source_count as ( select - count(distinct parent_source_name) as resource_count - ,child as resource_name - ,child_resource_type as resource_type - ,child_model_type as model_type - ,child_file_path as current_file_path - --,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' as list_agg_string - ,'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' as list_agg_string + count(distinct parent_source_name) as resource_count, + child as resource_name, + child_resource_type as resource_type, + child_model_type as model_type, + child_file_path as current_file_path, + 'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' as list_agg_string from staging_models - group by child, child_resource_type, child_model_type, child_file_path - 
--,'models/' || 'staging' || '/' || staging_models.parent_source_name || '/' - ,'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' + group by child, child_resource_type, child_model_type, child_file_path, + 'models{{ directory_pattern }}staging{{ directory_pattern }}' || staging_models.parent_source_name || '{{ directory_pattern }}' ), --Added this CTE to listagg() the multiple suggested paths and advise the user to split the source file into those two places. From 41c18794ef90dd27c62ae36979f0bae51265f2f8 Mon Sep 17 00:00:00 2001 From: Brad C Date: Sun, 13 Oct 2024 12:18:08 -0700 Subject: [PATCH 6/6] Added if condition for listagg sort order for specific connector types. --- models/marts/structure/fct_model_directories.sql | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/models/marts/structure/fct_model_directories.sql b/models/marts/structure/fct_model_directories.sql index c5efbff6..57e9e211 100644 --- a/models/marts/structure/fct_model_directories.sql +++ b/models/marts/structure/fct_model_directories.sql @@ -56,7 +56,7 @@ multiple_sources_staging_to_split as ( staging_models.child_model_type as model_type, staging_models.child_file_path as current_file_path, 'More than one source. Split into separate staging models in: ' || - {{ dbt.listagg(measure="list_agg_string", delimiter_text="' AND '", order_by_clause="order by current_file_path") }} as change_file_path_to + {{ dbt.listagg(measure='list_agg_string', delimiter_text="' AND '", order_by_clause='order by current_file_path' if target.type in ['snowflake','redshift','duckdb','trino']) }} as change_file_path_to from staging_models join staging_by_parent_source_count on staging_models.child = staging_by_parent_source_count.resource_name and staging_models.child_resource_type = staging_by_parent_source_count.resource_type and