Skip to content

Commit

Permalink
Merge pull request #65 from no10ds/release/v7.0.8-v0.1.6
Browse files Browse the repository at this point in the history
Release - v7.0.8 (API) v0.1.6 (SDK)
  • Loading branch information
TobyDrane authored Nov 15, 2023
2 parents 07b0f80 + f112988 commit 11c0ca1
Show file tree
Hide file tree
Showing 22 changed files with 374 additions and 270 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
rev: 1.7.5
hooks:
- id: bandit
exclude: '(tests|docs)/.*'
exclude: '(tests|docs|test)/.*'
- repo: https://github.com/psf/black
rev: 22.6.0
hooks:
Expand Down
6 changes: 4 additions & 2 deletions api/api/application/services/schema_infer_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,12 @@
)
from api.common.value_transformers import clean_column_name

from api.domain.data_types import extract_athena_types
from api.domain.data_types import extract_athena_types, is_date_type
from api.domain.schema import Schema, Column
from api.domain.schema_metadata import Owner, SchemaMetadata

DEFAULT_DATE_FORMAT = "%Y-%m-%d"


class SchemaInferService:
def infer_schema(
Expand Down Expand Up @@ -67,7 +69,7 @@ def _infer_columns(self, dataframe: pd.DataFrame) -> List[Column]:
partition_index=None,
data_type=_type,
allow_null=True,
format=None,
format=DEFAULT_DATE_FORMAT if is_date_type(_type) else None,
)
for name, _type in extract_athena_types(dataframe).items()
]
4 changes: 2 additions & 2 deletions api/api/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,9 +230,9 @@ def _set_security_headers(response) -> None:
"default-src 'self' "
f"{IDENTITY_PROVIDER_BASE_URL}; "
"script-src 'self' 'unsafe-inline' "
"cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui-bundle.js; "
"cdn.jsdelivr.net/npm/swagger-ui-dist@5.9.0/swagger-ui-bundle.js; "
"style-src 'self' "
"cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui.css; "
"cdn.jsdelivr.net/npm/swagger-ui-dist@5.9.0/swagger-ui.css; "
"img-src 'self' data: "
"fastapi.tiangolo.com/img/favicon.png;"
)
Expand Down
1 change: 1 addition & 0 deletions api/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ httpx
jinja2
pandas
psutil
pyarrow
pyjwt
pydantic[email]
python-multipart
Expand Down
41 changes: 41 additions & 0 deletions api/test/api/application/services/test_schema_infer_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
from pathlib import Path
from unittest.mock import patch

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pytest

from api.application.services.schema_infer_service import SchemaInferService
Expand Down Expand Up @@ -67,6 +70,44 @@ def test_infer_schema(self):
assert actual_schema == expected_schema
os.remove(temp_out_path)

def test_infer_schema_with_date(self):
    """Infer a schema from a parquet file that contains a datetime column.

    The string column is expected to keep ``format=None`` while the
    datetime column is expected to be inferred as a ``date`` column with
    the ``%Y-%m-%d`` format.
    """
    expected_schema = Schema(
        metadata=SchemaMetadata(
            layer="raw",
            domain="mydomain",
            dataset="mydataset",
            sensitivity="PUBLIC",
            owners=[Owner(name="change_me", email="[email protected]")],
        ),
        columns=[
            Column(
                name="colname1",
                partition_index=None,
                data_type="string",
                allow_null=True,
                format=None,
            ),
            Column(
                name="colname2",
                partition_index=None,
                data_type="date",
                allow_null=True,
                format="%Y-%m-%d",
            ),
        ],
    ).dict(exclude={"metadata": {"version"}})
    df = pd.DataFrame(data={"colname1": ["something"], "colname2": ["2021-01-01"]})
    df["colname2"] = pd.to_datetime(df["colname2"])

    # mkstemp hands back an already-open OS-level descriptor alongside the
    # path; only the path is needed, so close the descriptor immediately
    # rather than leaking it for the rest of the test session.
    file_descriptor, temp_out_path = tempfile.mkstemp(suffix=".parquet")
    os.close(file_descriptor)
    path = Path(temp_out_path)
    try:
        pq.write_table(pa.Table.from_pandas(df), path)

        actual_schema = self.infer_schema_service.infer_schema(
            "raw", "mydomain", "mydataset", "PUBLIC", path
        )
        assert actual_schema == expected_schema
    finally:
        # Clean up the temporary file even when the assertion above fails.
        os.remove(temp_out_path)

@patch("api.application.services.schema_infer_service.construct_chunked_dataframe")
def test_raises_error_when_parsing_provided_file_fails(
self, mock_construct_chunked_dataframe
Expand Down
2 changes: 2 additions & 0 deletions docs/api/schema.md
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,8 @@ the day (%d) is optional and a separator ('/' or '-') must be in place. Accepted
- %Y-%m -> 2021-01
- %m-%Y -> 01-2021

> When using the generate schema endpoint, rAPId will automatically detect a valid date column and specify a default date format of `%Y-%m-%d`. This can be changed before the schema is uploaded, if required.
### Booleans

In order to handle nullables we have introduced [pandas' boolean nullable data type](https://pandas.pydata.org/pandas-docs/stable/user_guide/boolean.html),
Expand Down
20 changes: 19 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,22 @@
# Changelog

## v7.0.8 / v0.1.6 (sdk) - _2023-11-15_

### Fixes

- Issue with date types when editing a schema in the UI: there was no option to set the column's format, which resulted in an _all fields are required_ error.
- Tweaked UI design when adding permissions to subject.
- SDK not correctly uploading a Pandas DataFrame that contains a date field.
- Updated NextJS and Zod package version.

### Features

- Data bucket now has EventBridge notifications enabled by default.

### Closes relevant GitHub issues

- https://github.com/no10ds/rapid/issues/57

## v7.0.7 / v0.1.5 (sdk) - _2023-11-07_

### Fixes
Expand Down Expand Up @@ -85,7 +102,8 @@

- See the [migration doc](migration.md) for details on how to migrate to v7 from v6.

[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.7...HEAD
[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.8...HEAD
[v7.0.8 / v0.1.6 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.7...v7.0.8
[v7.0.7 / v0.1.5 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.6...v7.0.7
[v7.0.6 / v0.1.4 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.5...v7.0.6
[v7.0.5 / v0.1.3 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.4...v7.0.5
Expand Down
1 change: 1 addition & 0 deletions infrastructure/blocks/pipeline/iam.tf
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ resource "aws_iam_policy" "pipeline_ecr_access" {

resource "aws_iam_policy" "pipeline_ecr_public_access" {
# checkov:skip=CKV_AWS_355: GetAuthorizationToken has no resource constraint
# checkov:skip=CKV_AWS_287: GetServiceBearerToken has no resource constraint
name = "pipeline_ecr_public_access"
description = "Allow pipeline to access the public ECR"
tags = var.tags
Expand Down
5 changes: 5 additions & 0 deletions infrastructure/blocks/s3/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ resource "aws_s3_bucket_public_access_block" "rapid_data_storage" {
}


# Notification configuration for the rapid_data_storage bucket with the
# EventBridge option switched on.
resource "aws_s3_bucket_notification" "rapid_data_storage" {
bucket = aws_s3_bucket.rapid_data_storage.id
eventbridge = true
}

resource "aws_s3_bucket" "logs" {
#checkov:skip=CKV_AWS_144:No need for cross region replication
#checkov:skip=CKV_AWS_145:No need for non default key
Expand Down
5 changes: 5 additions & 0 deletions infrastructure/modules/rapid/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,11 @@ resource "aws_s3_bucket" "this" {
}
}

# Notification configuration for this module's bucket (aws_s3_bucket.this)
# with the EventBridge option switched on.
resource "aws_s3_bucket_notification" "this" {
bucket = aws_s3_bucket.this.id
eventbridge = true
}

resource "aws_s3_bucket_public_access_block" "this" {
bucket = aws_s3_bucket.this.id
ignore_public_acls = true
Expand Down
4 changes: 2 additions & 2 deletions infrastructure/modules/rapid/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ variable "app-replica-count-max" {
variable "application_version" {
type = string
description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)"
default = "v7.0.7"
default = "v7.0.8"
}

variable "ui_version" {
type = string
description = "The version number for the static ui (e.g.: v1.0.0, etc.)"
default = "v7.0.7"
default = "v7.0.8"
}

variable "catalog_disabled" {
Expand Down
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,5 +64,6 @@ plugins:
markdown_extensions:
- pymdownx.highlight
- pymdownx.extra
- pymdownx.magiclink
- pymdownx.tabbed:
alternate_style: true
4 changes: 2 additions & 2 deletions sdk/rapid/rapid.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,8 +245,8 @@ def convert_dataframe_for_file_upload(self, df: DataFrame):
"""
return {
"file": (
f"rapid-sdk-{int(datetime.now().timestamp())}.csv",
df.to_csv(index=False),
f"rapid-sdk-{int(datetime.now().timestamp())}.parquet",
df.to_parquet(index=False),
)
}

Expand Down
1 change: 1 addition & 0 deletions sdk/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,4 @@ requests
requests-mock
twine
pydantic
pyarrow
4 changes: 2 additions & 2 deletions sdk/setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@

setup(
name="rapid-sdk",
version="0.1.5",
version="0.1.6",
description="A python sdk for the rAPId API",
url="https://github.com/no10ds/rapid-sdk",
author="Lewis Card",
author_email="[email protected]",
license="MIT",
packages=find_packages(include=["rapid", "rapid.*"], exclude=["tests"]),
install_requires=["pandas", "requests", "deepdiff"],
install_requires=["pandas", "requests", "deepdiff", "pyarrow", "pydantic"],
include_package_data=True,
)
23 changes: 13 additions & 10 deletions sdk/tests/test_rapid.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from mock import Mock, call
from pandas import DataFrame
import pytest
import io
import pandas as pd
from requests_mock import Mocker

from rapid import Rapid
Expand Down Expand Up @@ -154,7 +155,7 @@ def test_upload_dataframe_success_after_waiting(
domain = "test_domain"
dataset = "test_dataset"
job_id = 1234
df = DataFrame()
df = pd.DataFrame()
requests_mock.post(
f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
json={"details": {"job_id": job_id}},
Expand All @@ -176,7 +177,7 @@ def test_upload_dataframe_success_no_waiting(
domain = "test_domain"
dataset = "test_dataset"
job_id = 1234
df = DataFrame()
df = pd.DataFrame()
requests_mock.post(
f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
json={"details": {"job_id": job_id}},
Expand All @@ -194,7 +195,7 @@ def test_upload_dataframe_failure(self, requests_mock: Mocker, rapid: Rapid):
domain = "test_domain"
dataset = "test_dataset"
job_id = 1234
df = DataFrame()
df = pd.DataFrame()
requests_mock.post(
f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
json={"details": {"job_id": job_id}},
Expand Down Expand Up @@ -238,20 +239,22 @@ def test_fetch_dataset_info_failure(self, requests_mock: Mocker, rapid: Rapid):

@pytest.mark.usefixtures("rapid")
def test_convert_dataframe_for_file_upload(self, rapid: Rapid):
df = DataFrame()
df = pd.DataFrame()
res = rapid.convert_dataframe_for_file_upload(df)
filename = res["file"][0]
data = res["file"][1]
assert filename.startswith("rapid-sdk") and filename.endswith(".csv")
assert data == "\n"
data = io.BytesIO(res["file"][1])
df = pd.read_parquet(data)

assert filename.startswith("rapid-sdk") and filename.endswith(".parquet")
assert len(df) == 0

@pytest.mark.usefixtures("requests_mock", "rapid")
def test_generate_schema_success(self, requests_mock: Mocker, rapid: Rapid):
layer = "raw"
domain = "test_domain"
dataset = "test_dataset"
sensitivity = "PUBLIC"
df = DataFrame()
df = pd.DataFrame()
mocked_response = {
"metadata": {
"layer": "raw",
Expand Down Expand Up @@ -299,7 +302,7 @@ def test_generate_schema_failure(self, requests_mock: Mocker, rapid: Rapid):
domain = "test_domain"
dataset = "test_dataset"
sensitivity = "PUBLIC"
df = DataFrame()
df = pd.DataFrame()
mocked_response = {"data": "dummy"}
requests_mock.post(
f"{RAPID_URL}/schema/{layer}/{sensitivity}/{domain}/{dataset}/generate",
Expand Down
Loading

0 comments on commit 11c0ca1

Please sign in to comment.