Skip to content

Commit

Permalink
Merge pull request #81 from co-cddo/allow-empty-string-columns
Browse files (browse the repository at this point in the history)
Fix issue with empty string columns
  • Loading branch information
MotwaniM authored Nov 7, 2024
2 parents 1ae56a9 + b670f8b commit 6e2bffb
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 8 deletions.
3 changes: 3 additions & 0 deletions api/api/application/services/dataset_validation.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -106,6 +106,9 @@ def dataset_has_correct_data_types(
data_frame,
)
for column in schema.columns:
if column.name not in column_types:
continue

actual_type = column_types[column.name]
expected_type = column.data_type

Expand Down
2 changes: 2 additions & 0 deletions api/api/domain/data_types.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -97,6 +97,8 @@ def is_date_type(type: str) -> bool:
def extract_athena_types(df: DataFrame) -> dict:
types = {}
for column in df.columns:
if df[column].dropna().size == 0:
continue
dtype = str(infer_dtype(df[column], skipna=True))
try:
types[column] = PANDAS_TO_ATHENA_CONVERTER[dtype].value
Expand Down
29 changes: 21 additions & 8 deletions api/test/api/application/services/test_dataset_validation.py
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
import re
from typing import List

import numpy as np
import pandas as pd
import pytest

Expand Down Expand Up @@ -468,6 +469,8 @@ def test_return_error_message_when_not_correct_datatypes(self):
"col3": [1, 5, True],
"col4": [1.5, 2.5, "A"],
"col5": ["2021-01-01", "2021-05-01", 1000],
"col6": [None, None, None],
"col7": [np.nan, np.nan, np.nan]
}
)
schema = Schema(
Expand Down Expand Up @@ -503,17 +506,27 @@ def test_return_error_message_when_not_correct_datatypes(self):
data_type="date",
allow_null=False,
),
Column(
name="col6",
partition_index=None,
data_type="string",
allow_null=True,
),
Column(
name="col7",
partition_index=None,
data_type="string",
allow_null=True,
),
],
)

try:
dataset_has_correct_data_types(df, schema)
except DatasetValidationError as error:
assert error.message == [
"Column [col2] has an incorrect data type. Expected boolean, received string",
"Column [col3] has an incorrect data type. Expected int, received string",
"Column [col4] has an incorrect data type. Expected double, received string",
]
data_frame, error_list = dataset_has_correct_data_types(df, schema)
assert error_list == [
"Column [col2] has an incorrect data type. Expected boolean, received string",
"Column [col3] has an incorrect data type. Expected int, received string",
"Column [col4] has an incorrect data type. Expected bigint, received string",
]

def test_return_error_message_when_dataset_has_illegal_chars_in_partition_columns(
self,
Expand Down

0 comments on commit 6e2bffb

Please sign in to comment.