Merge pull request #65 from no10ds/release/v7.0.8-v0.1.6

Release - v7.0.8 (API) v0.1.6 (SDK)
no10ds · Nov 15, 2023 · 11c0ca1 · 11c0ca1
2 parents 07b0f80 + f112988
commit 11c0ca1
Show file tree

Hide file tree

Showing 22 changed files with 374 additions and 270 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -12,7 +12,7 @@ repos:
     rev: 1.7.5
     hooks:
       - id: bandit
-        exclude: '(tests|docs)/.*'
+        exclude: '(tests|docs|test)/.*'
   - repo: https://github.com/psf/black
     rev: 22.6.0
     hooks:

diff --git a/api/api/application/services/schema_infer_service.py b/api/api/application/services/schema_infer_service.py
@@ -13,10 +13,12 @@
 )
 from api.common.value_transformers import clean_column_name
 
-from api.domain.data_types import extract_athena_types
+from api.domain.data_types import extract_athena_types, is_date_type
 from api.domain.schema import Schema, Column
 from api.domain.schema_metadata import Owner, SchemaMetadata
 
+DEFAULT_DATE_FORMAT = "%Y-%m-%d"
+
 
 class SchemaInferService:
     def infer_schema(
@@ -67,7 +69,7 @@ def _infer_columns(self, dataframe: pd.DataFrame) -> List[Column]:
                 partition_index=None,
                 data_type=_type,
                 allow_null=True,
-                format=None,
+                format=DEFAULT_DATE_FORMAT if is_date_type(_type) else None,
             )
             for name, _type in extract_athena_types(dataframe).items()
         ]
diff --git a/api/api/entry.py b/api/api/entry.py
@@ -230,9 +230,9 @@ def _set_security_headers(response) -> None:
         "default-src 'self' "
         f"{IDENTITY_PROVIDER_BASE_URL}; "
         "script-src 'self' 'unsafe-inline' "
-        "cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui-bundle.js; "
+        "cdn.jsdelivr.net/npm/swagger-ui-dist@5.9.0/swagger-ui-bundle.js; "
         "style-src 'self' "
-        "cdn.jsdelivr.net/npm/swagger-ui-dist@5/swagger-ui.css; "
+        "cdn.jsdelivr.net/npm/swagger-ui-dist@5.9.0/swagger-ui.css; "
         "img-src 'self' data: "
         "fastapi.tiangolo.com/img/favicon.png;"
     )

diff --git a/api/requirements.txt b/api/requirements.txt
@@ -7,6 +7,7 @@ httpx
 jinja2
 pandas
 psutil
+pyarrow
 pyjwt
 pydantic[email]
 python-multipart

diff --git a/api/test/api/application/services/test_schema_infer_service.py b/api/test/api/application/services/test_schema_infer_service.py
@@ -3,6 +3,9 @@
 from pathlib import Path
 from unittest.mock import patch
 
+import pandas as pd
+import pyarrow as pa
+import pyarrow.parquet as pq
 import pytest
 
 from api.application.services.schema_infer_service import SchemaInferService
@@ -67,6 +70,44 @@ def test_infer_schema(self):
         assert actual_schema == expected_schema
         os.remove(temp_out_path)
 
+    def test_infer_schema_with_date(self):
+        expected_schema = Schema(
+            metadata=SchemaMetadata(
+                layer="raw",
+                domain="mydomain",
+                dataset="mydataset",
+                sensitivity="PUBLIC",
+                owners=[Owner(name="change_me", email="[email protected]")],
+            ),
+            columns=[
+                Column(
+                    name="colname1",
+                    partition_index=None,
+                    data_type="string",
+                    allow_null=True,
+                    format=None,
+                ),
+                Column(
+                    name="colname2",
+                    partition_index=None,
+                    data_type="date",
+                    allow_null=True,
+                    format="%Y-%m-%d",
+                ),
+            ],
+        ).dict(exclude={"metadata": {"version"}})
+        df = pd.DataFrame(data={"colname1": ["something"], "colname2": ["2021-01-01"]})
+        df["colname2"] = pd.to_datetime(df["colname2"])
+        temp_out_path = tempfile.mkstemp(suffix=".parquet")[1]
+        path = Path(temp_out_path)
+        pq.write_table(pa.Table.from_pandas(df), path)
+
+        actual_schema = self.infer_schema_service.infer_schema(
+            "raw", "mydomain", "mydataset", "PUBLIC", path
+        )
+        assert actual_schema == expected_schema
+        os.remove(temp_out_path)
+
     @patch("api.application.services.schema_infer_service.construct_chunked_dataframe")
     def test_raises_error_when_parsing_provided_file_fails(
         self, mock_construct_chunked_dataframe

diff --git a/docs/api/schema.md b/docs/api/schema.md
@@ -109,6 +109,8 @@ the day (%d) is optional and a separator ('/' or '-') must be in place. Accepted
 - %Y-%m -> 2021-01
 - %m-%Y -> 01-2021
 
+> When using the generate schema endpoint, rAPId will automatically detect a valid date column and assign it a default date format of `%Y-%m-%d`. This can be changed before the schema is uploaded if required.
+
 ### Booleans
 
 In order to handle nullables we have introduced [pandas' boolean nullable data type](https://pandas.pydata.org/pandas-docs/stable/user_guide/boolean.html),

diff --git a/docs/changelog.md b/docs/changelog.md
@@ -1,5 +1,22 @@
 # Changelog
 
+## v7.0.8 / v0.1.6 (sdk) - _2023-11-15_
+
+### Fixes
+
+- Issue with date types when editing a schema on the UI: there was no option to set the column format, which resulted in an _all fields are required_ error.
+- Tweaked UI design when adding permissions to subject.
+- SDK not correctly uploading a Pandas DataFrame that contains a date field.
+- Updated NextJS and Zod package version.
+
+### Features
+
+- Data bucket now has EventBridge notifications enabled by default.
+
+### Closes relevant GitHub issues
+
+- https://github.com/no10ds/rapid/issues/57
+
 ## v7.0.7 / v0.1.5 (sdk) - _2023-11-07_
 
 ### Fixes
@@ -85,7 +102,8 @@
 
 - See the [migration doc](migration.md) for details on how to migrate to v7 from v6.
 
-[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.7...HEAD
+[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.8...HEAD
+[v7.0.8 / v0.1.6 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.7...v7.0.8
 [v7.0.7 / v0.1.5 (sdk)]: https://github.com/no10ds/rapid/v7.0.6...v7.0.7
 [v7.0.6 / v0.1.4 (sdk)]: https://github.com/no10ds/rapid/v7.0.5...v7.0.6
 [v7.0.5 / v0.1.3 (sdk)]: https://github.com/no10ds/rapid/v7.0.4...v7.0.5

diff --git a/infrastructure/blocks/pipeline/iam.tf b/infrastructure/blocks/pipeline/iam.tf
@@ -55,6 +55,7 @@ resource "aws_iam_policy" "pipeline_ecr_access" {
 
 resource "aws_iam_policy" "pipeline_ecr_public_access" {
   # checkov:skip=CKV_AWS_355: GetAuthorizationToken has no resource constraint
+  # checkov:skip=CKV_AWS_287: GetServiceBearerToken has no resource constraint
   name        = "pipeline_ecr_public_access"
   description = "Allow pipeline to access the public ECR"
   tags        = var.tags

diff --git a/infrastructure/blocks/s3/main.tf b/infrastructure/blocks/s3/main.tf
@@ -40,6 +40,11 @@ resource "aws_s3_bucket_public_access_block" "rapid_data_storage" {
 }
 
 
+resource "aws_s3_bucket_notification" "rapid_data_storage" {
+  bucket      = aws_s3_bucket.rapid_data_storage.id
+  eventbridge = true
+}
+
 resource "aws_s3_bucket" "logs" {
   #checkov:skip=CKV_AWS_144:No need for cross region replication
   #checkov:skip=CKV_AWS_145:No need for non default key

diff --git a/infrastructure/modules/rapid/main.tf b/infrastructure/modules/rapid/main.tf
@@ -96,6 +96,11 @@ resource "aws_s3_bucket" "this" {
   }
 }
 
+resource "aws_s3_bucket_notification" "this" {
+  bucket      = aws_s3_bucket.this.id
+  eventbridge = true
+}
+
 resource "aws_s3_bucket_public_access_block" "this" {
   bucket                  = aws_s3_bucket.this.id
   ignore_public_acls      = true

diff --git a/infrastructure/modules/rapid/variables.tf b/infrastructure/modules/rapid/variables.tf
@@ -13,13 +13,13 @@ variable "app-replica-count-max" {
 variable "application_version" {
   type        = string
   description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)"
-  default     = "v7.0.7"
+  default     = "v7.0.8"
 }
 
 variable "ui_version" {
   type        = string
   description = "The version number for the static ui (e.g.: v1.0.0, etc.)"
-  default     = "v7.0.7"
+  default     = "v7.0.8"
 }
 
 variable "catalog_disabled" {

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -64,5 +64,6 @@ plugins:
 markdown_extensions:
   - pymdownx.highlight
   - pymdownx.extra
+  - pymdownx.magiclink
   - pymdownx.tabbed:
       alternate_style: true
diff --git a/sdk/rapid/rapid.py b/sdk/rapid/rapid.py
@@ -245,8 +245,8 @@ def convert_dataframe_for_file_upload(self, df: DataFrame):
         """
         return {
             "file": (
-                f"rapid-sdk-{int(datetime.now().timestamp())}.csv",
-                df.to_csv(index=False),
+                f"rapid-sdk-{int(datetime.now().timestamp())}.parquet",
+                df.to_parquet(index=False),
             )
         }
 

diff --git a/sdk/requirements.txt b/sdk/requirements.txt
@@ -7,3 +7,4 @@ requests
 requests-mock
 twine
 pydantic
+pyarrow
diff --git a/sdk/setup.py b/sdk/setup.py
@@ -2,13 +2,13 @@
 
 setup(
     name="rapid-sdk",
-    version="0.1.5",
+    version="0.1.6",
     description="A python sdk for the rAPId API",
     url="https://github.com/no10ds/rapid-sdk",
     author="Lewis Card",
     author_email="[email protected]",
     license="MIT",
     packages=find_packages(include=["rapid", "rapid.*"], exclude=["tests"]),
-    install_requires=["pandas", "requests", "deepdiff"],
+    install_requires=["pandas", "requests", "deepdiff", "pyarrow", "pydantic"],
     include_package_data=True,
 )
diff --git a/sdk/tests/test_rapid.py b/sdk/tests/test_rapid.py
@@ -1,6 +1,7 @@
 from mock import Mock, call
-from pandas import DataFrame
 import pytest
+import io
+import pandas as pd
 from requests_mock import Mocker
 
 from rapid import Rapid
@@ -154,7 +155,7 @@ def test_upload_dataframe_success_after_waiting(
         domain = "test_domain"
         dataset = "test_dataset"
         job_id = 1234
-        df = DataFrame()
+        df = pd.DataFrame()
         requests_mock.post(
             f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
             json={"details": {"job_id": job_id}},
@@ -176,7 +177,7 @@ def test_upload_dataframe_success_no_waiting(
         domain = "test_domain"
         dataset = "test_dataset"
         job_id = 1234
-        df = DataFrame()
+        df = pd.DataFrame()
         requests_mock.post(
             f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
             json={"details": {"job_id": job_id}},
@@ -194,7 +195,7 @@ def test_upload_dataframe_failure(self, requests_mock: Mocker, rapid: Rapid):
         domain = "test_domain"
         dataset = "test_dataset"
         job_id = 1234
-        df = DataFrame()
+        df = pd.DataFrame()
         requests_mock.post(
             f"{RAPID_URL}/datasets/{layer}/{domain}/{dataset}",
             json={"details": {"job_id": job_id}},
@@ -238,20 +239,22 @@ def test_fetch_dataset_info_failure(self, requests_mock: Mocker, rapid: Rapid):
 
     @pytest.mark.usefixtures("rapid")
     def test_convert_dataframe_for_file_upload(self, rapid: Rapid):
-        df = DataFrame()
+        df = pd.DataFrame()
         res = rapid.convert_dataframe_for_file_upload(df)
         filename = res["file"][0]
-        data = res["file"][1]
-        assert filename.startswith("rapid-sdk") and filename.endswith(".csv")
-        assert data == "\n"
+        data = io.BytesIO(res["file"][1])
+        df = pd.read_parquet(data)
+
+        assert filename.startswith("rapid-sdk") and filename.endswith(".parquet")
+        assert len(df) == 0
 
     @pytest.mark.usefixtures("requests_mock", "rapid")
     def test_generate_schema_success(self, requests_mock: Mocker, rapid: Rapid):
         layer = "raw"
         domain = "test_domain"
         dataset = "test_dataset"
         sensitivity = "PUBLIC"
-        df = DataFrame()
+        df = pd.DataFrame()
         mocked_response = {
             "metadata": {
                 "layer": "raw",
@@ -299,7 +302,7 @@ def test_generate_schema_failure(self, requests_mock: Mocker, rapid: Rapid):
         domain = "test_domain"
         dataset = "test_dataset"
         sensitivity = "PUBLIC"
-        df = DataFrame()
+        df = pd.DataFrame()
         mocked_response = {"data": "dummy"}
         requests_mock.post(
             f"{RAPID_URL}/schema/{layer}/{sensitivity}/{domain}/{dataset}/generate",
-Original file line number
+Diff line change
@@ Expand Up / @@ -7,6 +7,7 @@ httpx @@
     jinja2
     pandas
     psutil
+    pyarrow
     pyjwt
     pydantic[email]
     python-multipart
@@ Expand Down @@