Skip to content

Commit

Permalink
Merge pull request #35 from no10ds/fix/v7-migration
Browse files Browse the repository at this point in the history
Add fixes to the migration script and docs
  • Loading branch information
lcardno10 authored Sep 20, 2023
2 parents 1ba1c90 + 340221b commit 925c6f9
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 104 deletions.
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,17 @@ ui-zip-and-release: ui-zip-contents ui-release ## Zip and release prod static ui

##
release:
@python release.py --operation check
@git checkout ${commit}
@git tag -a "${version}" -m "Release tag for version ${version}"
@git checkout -
@git push origin ${version}
@python get_latest_release_changelog.py
@python release.py --operation create-changelog
@gh release create ${version} -F latest_release_changelog.md
@rm -rf latest_release_changelog.md


# Migration --------------------
##
migrate-v7: ## Run the migration
@cd api/; ./batect migrate-v7 -- "--layer ${layer} --all-layers ${all-layers}"
@cd api/; ./batect migrate-v7 -- --layer ${layer} --all-layers ${all-layers}
4 changes: 4 additions & 0 deletions api/batect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,8 @@ tasks:
description: Run the rAPId migration script
run:
container: service-image
environment:
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN:-}
command: 'python migrations/scripts/v7_layer_migration.py'
74 changes: 16 additions & 58 deletions api/migrations/scripts/v7_layer_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
Please ensure that none of the crawlers are running when you start this script.
"""
import sys
import os
import argparse
from copy import deepcopy
import json
import os
from typing import List
import re
from pprint import pprint
Expand All @@ -26,10 +27,15 @@

dotenv.load_dotenv()

# Add api to PYTHONPATH
current_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.abspath(os.path.join(current_dir, "..", ".."))
sys.path.append(target_dir)

from api.domain.schema import Schema # noqa: E402
from api.application.services.schema_service import SchemaService # noqa: E402
from api.adapter.athena_adapter import AthenaAdapter # noqa: E402
from api.common.custom_exceptions import ConflictError # noqa: E402


AWS_REGION = os.environ["AWS_REGION"]
Expand All @@ -54,10 +60,10 @@ def main(
athena_adapter,
):
migrate_files(layer, s3_client)
migrate_permissions(layer, all_layers, dynamodb_client)
schema_errors = migrate_schemas(layer, schema_service, glue_client)
migrate_tables(layer, glue_client, athena_adapter)
migrate_crawlers(glue_client, resource_client)
migrate_permissions(layer, all_layers, dynamodb_client)

if schema_errors:
print("- These were the schema errors that need to be addressed manually")
Expand Down Expand Up @@ -204,54 +210,10 @@ def migrate_tables(layer: str, glue_client, athena_adapter: AthenaAdapter):
DatabaseName=GLUE_DB,
TableName=table["Name"],
)

glue_client.create_table(
DatabaseName=GLUE_DB,
TableInput={
"Name": f"{layer}_{table['Name']}",
"Owner": "hadoop",
"StorageDescriptor": {
"Columns": table["StorageDescriptor"]["Columns"],
"Location": table["StorageDescriptor"]["Location"].replace(
"/data/", f"/data/{layer}/"
),
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"Compressed": False,
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {"serialization.format": "1"},
},
"NumberOfBuckets": -1,
"StoredAsSubDirectories": False,
},
"PartitionKeys": table["PartitionKeys"],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"classification": "parquet",
"typeOfData": "file",
"compressionType": "none",
"EXTERNAL": "TRUE",
},
},
PartitionIndexes=[
{
"Keys": [
key["Name"]
for key in partition_indexes["PartitionIndexDescriptorList"][0][
"Keys"
]
],
"IndexName": partition_indexes["PartitionIndexDescriptorList"][0][
"IndexName"
],
}
]
if partition_indexes["PartitionIndexDescriptorList"]
else [],
)

athena_adapter.query_sql_async(f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;")
if partition_indexes:
athena_adapter.query_sql_async(
f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;"
)
glue_client.delete_table(Name=table["Name"], DatabaseName=GLUE_DB)


Expand Down Expand Up @@ -325,14 +287,10 @@ def migrate_schemas(layer, schema_service: SchemaService, glue_client):
for column in schema.columns:
column.data_type = glue_column_types[column.name]

schema_service.store_schema(schema)
latest_version = schema_service.get_latest_schema(schema.metadata)

if (
latest_version.dataset_identifier()
!= schema.metadata.dataset_identifier()
):
schema_service.deprecate_schema(schema.metadata)
try:
schema_service.upload_schema(schema)
except ConflictError:
schema_service.update_schema(schema)

s3_client.delete_object(Bucket=DATA_BUCKET, Key=file["Key"])

Expand Down
16 changes: 15 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
# Changelog

## v7.0.4 / v0.1.2 (sdk) - _2023-09-20_

### Features

- Improved release process
- Added Athena workgroup and database as outputs of the rAPId module.

### Fixes

- Updated terraform default `application_version` and `ui_version` variables.
- Migration script and documentation.

## v7.0.3 / v0.1.2 (sdk) - _2023-09-15_

### Fixes

- Fixes issue where permissions were not being correctly read and causing api functionality to fail


## v7.0.2 / v0.1.2 (sdk) - _2023-09-14_

### Fixes
Expand Down Expand Up @@ -39,7 +52,8 @@

- See the [migration doc](migration.md) for details on how to migrate to v7 from v6.

[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.3...HEAD
[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.4...HEAD
[v7.0.4 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.3...v7.0.4
[v7.0.3 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.2...v7.0.3
[v7.0.2 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.1...v7.0.2
[v7.0.1 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.0...v7.0.1
Expand Down
31 changes: 18 additions & 13 deletions docs/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,43 @@ To execute it, you'll need to decide:

### Prerequisites

#### Infrastructure changes

The v7.0.0 infrastructure changes need to be applied to your rAPId instance.

Update the version of the rAPId terraform module that you are using and apply the terraform.

#### Local requirements

You will need the ability to run `Batect`, the requirements for which are listed [here](https://batect.dev/docs/getting-started/requirements/).

### Steps:

#### Make the infrastructure changes

The v7 infrastructure changes need to be applied to your rAPId instance.

1. Add the `layers` variable to the rAPId cluster module. `layers` will be a list of the layers you wish to use in your rAPId instance. You can omit this if you just want to use the `default` layer.
2. Change the rAPId module source to:
`[email protected]:no10ds/rapid.git//infrastructure/modules/rapid`
3. Update both the application_version and ui_version variables to `v7.0.4`

Apply these changes.

#### Clone the repo

To do this, run:

`git clone -b v7.0.0 [email protected]:no10ds/rapid.git`
`git clone -b v7.0.4 [email protected]:no10ds/rapid.git`

#### Set your environment variables

Within the rAPId repo, set the following variables in the `.env` file to match those of your rAPId instance and AWS account:

```
# rAPId instance variables
- AWS_REGION=
- DATA_BUCKET=
- RESOURCE_PREFIX=
AWS_REGION=
DATA_BUCKET=
RESOURCE_PREFIX=
# AWS environment variables
- AWS_ACCESS_KEY_ID=
- AWS_SECRET_ACCESS_KEY=
- AWS_SESSION_TOKEN=
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_SESSION_TOKEN=
```

#### Run the migration script
Expand Down
26 changes: 0 additions & 26 deletions get_latest_release_changelog.py

This file was deleted.

7 changes: 6 additions & 1 deletion infrastructure/blocks/data-workflow/output.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ output "athena_query_output_bucket_arn" {

output "athena_workgroup_arn" {
value = module.data_workflow.athena_workgroup_arn
description = "Query workgroup for Athena"
description = "The ARN of the Query workgroup for Athena"
}

output "athena_workgroup_name" {
value = module.data_workflow.athena_workgroup_name
description = "The name of the Query workgroup for Athena"
}

output "schema_table_arn" {
Expand Down
7 changes: 6 additions & 1 deletion infrastructure/modules/data-workflow/output.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ output "athena_query_result_output_bucket_arn" {

output "athena_workgroup_arn" {
value = aws_athena_workgroup.rapid_athena_workgroup.arn
description = "Query workgroup for Athena"
description = "The arn of the Query workgroup for Athena"
}

output "athena_workgroup_name" {
value = aws_athena_workgroup.rapid_athena_workgroup.name
description = "The name of the Query workgroup for Athena"
}

output "catalogue_db_name" {
Expand Down
9 changes: 9 additions & 0 deletions infrastructure/modules/rapid/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
output "athena_workgroup_name" {
value = module.data_workflow.athena_workgroup_name
description = "The name of the Query workgroup for Athena"
}

output "catalogue_db_name" {
value = module.data_workflow.catalogue_db_name
description = "The name of the Glue Catalogue database"
}
4 changes: 2 additions & 2 deletions infrastructure/modules/rapid/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ variable "app-replica-count-max" {
variable "application_version" {
type = string
description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)"
default = "v6.2.1"
default = "v7.0.4"
}

variable "ui_version" {
type = string
description = "The version number for the static ui (e.g.: v1.0.0, etc.)"
default = "v6.0.1"
default = "v7.0.4"
}

variable "catalog_disabled" {
Expand Down
63 changes: 63 additions & 0 deletions release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import re
import argparse


def create_changelog():
    """Extract the latest release's section from ``docs/changelog.md``.

    Keeps every line from the first version heading (e.g. ``## v7.0.0``)
    up to, but not including, the next version heading, and writes the
    result to ``latest_release_changelog.md`` for use as GitHub release
    notes.

    Raises:
        Exception: If no version heading is found in the changelog.
    """
    with open("./docs/changelog.md", "r") as changelog_file:
        changelog_lines = changelog_file.readlines()

    # Raw string avoids the invalid escape sequences (W605) that the
    # original placeholder-free f-string (F541) produced.
    version_heading = re.compile(r"##\s+v\d+\.\d+\.\d+")

    parsed_lines = []
    adding = False

    for line in changelog_lines:
        if version_heading.match(line):
            if adding:
                # Second version heading: the latest release section is
                # complete. Stop here — toggling instead of breaking would
                # wrongly capture every *other* release section as well.
                break
            adding = True
        if adding:
            parsed_lines.append(line)

    if not parsed_lines:
        raise Exception(
            "It looks like there is no release information in the changelog. Please check it."
        )
    with open("latest_release_changelog.md", "w+") as latest_changelog:
        latest_changelog.writelines(parsed_lines)


def ask_yes_no_question(question):
    """Prompt the user with *question* until they answer yes or no.

    Args:
        question: The question text to display; " (yes/no): " is appended.

    Returns:
        True once the user answers "yes" (or "y").

    Raises:
        SystemExit: With status 1 if the user answers "no" (or "n").
    """
    while True:
        answer = input(f"{question} (yes/no): ").strip().lower()
        if answer in ("yes", "y"):
            return True
        if answer in ("no", "n"):
            print("Please fix this before continuing.")
            # raise SystemExit rather than the site-module exit() helper,
            # which is not guaranteed to exist when Python runs without
            # the site module (e.g. `python -S`).
            raise SystemExit(1)
        print("Please answer yes or no.")


def check():
    """Interactively confirm the manual pre-release checklist is done.

    Each unanswered or negative response causes ask_yes_no_question to
    exit the process, so reaching the end means all checks passed.
    """
    checklist = (
        "Have you updated the application and ui version in Terraform?",
        "Have you updated the changelog for this release?",
    )
    for item in checklist:
        ask_yes_no_question(item)


if __name__ == "__main__":
    # CLI entry point: parse the requested operation and dispatch it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--operation",
        help="The operation to carry out",
        required=True,
        choices=["check", "create-changelog"],
    )
    cli_args = parser.parse_args()

    # Dispatch table keyed by operation name; argparse `choices` already
    # guarantees the key is present, so a plain lookup is safe.
    operations = {
        "create-changelog": create_changelog,
        "check": check,
    }
    operations[cli_args.operation]()

0 comments on commit 925c6f9

Please sign in to comment.