From 25c822c1df25dac1c5f2ef08798e40db1a2a89f8 Mon Sep 17 00:00:00 2001 From: lcard Date: Thu, 14 Sep 2023 22:15:31 +0100 Subject: [PATCH 1/3] Add fixes to the migration script and docs --- Makefile | 2 +- api/batect.yml | 4 ++ api/migrations/scripts/v7_layer_migration.py | 74 +++++--------------- docs/changelog.md | 6 ++ docs/migration.md | 31 ++++---- 5 files changed, 45 insertions(+), 72 deletions(-) diff --git a/Makefile b/Makefile index 5769073..6c0220a 100644 --- a/Makefile +++ b/Makefile @@ -201,4 +201,4 @@ release: # Migration -------------------- ## migrate-v7: ## Run the migration - @cd api/; ./batect migrate-v7 -- "--layer ${layer} --all-layers ${all-layers}" + @cd api/; ./batect migrate-v7 -- --layer ${layer} --all-layers ${all-layers} diff --git a/api/batect.yml b/api/batect.yml index e04c94d..de84076 100755 --- a/api/batect.yml +++ b/api/batect.yml @@ -130,4 +130,8 @@ tasks: description: Run the rAPId migration script run: container: service-image + environment: + AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-} + AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-} + AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN:-} command: 'python migrations/scripts/v7_layer_migration.py' diff --git a/api/migrations/scripts/v7_layer_migration.py b/api/migrations/scripts/v7_layer_migration.py index 9101481..5b68e9c 100644 --- a/api/migrations/scripts/v7_layer_migration.py +++ b/api/migrations/scripts/v7_layer_migration.py @@ -13,10 +13,11 @@ Please ensure that none of the crawlers are running when you start this script. 
""" +import sys +import os import argparse from copy import deepcopy import json -import os from typing import List import re from pprint import pprint @@ -26,10 +27,15 @@ dotenv.load_dotenv() +# Add api to PYTHONPATH +current_dir = os.path.dirname(os.path.abspath(__file__)) +target_dir = os.path.abspath(os.path.join(current_dir, "..", "..")) +sys.path.append(target_dir) from api.domain.schema import Schema # noqa: E402 from api.application.services.schema_service import SchemaService # noqa: E402 from api.adapter.athena_adapter import AthenaAdapter # noqa: E402 +from api.common.custom_exceptions import ConflictError # noqa: E402 AWS_REGION = os.environ["AWS_REGION"] @@ -54,10 +60,10 @@ def main( athena_adapter, ): migrate_files(layer, s3_client) + migrate_permissions(layer, all_layers, dynamodb_client) schema_errors = migrate_schemas(layer, schema_service, glue_client) migrate_tables(layer, glue_client, athena_adapter) migrate_crawlers(glue_client, resource_client) - migrate_permissions(layer, all_layers, dynamodb_client) if schema_errors: print("- These were the schema errors that need to be addressed manually") @@ -204,54 +210,10 @@ def migrate_tables(layer: str, glue_client, athena_adapter: AthenaAdapter): DatabaseName=GLUE_DB, TableName=table["Name"], ) - - glue_client.create_table( - DatabaseName=GLUE_DB, - TableInput={ - "Name": f"{layer}_{table['Name']}", - "Owner": "hadoop", - "StorageDescriptor": { - "Columns": table["StorageDescriptor"]["Columns"], - "Location": table["StorageDescriptor"]["Location"].replace( - "/data/", f"/data/{layer}/" - ), - "InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat", - "OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat", - "Compressed": False, - "SerdeInfo": { - "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe", - "Parameters": {"serialization.format": "1"}, - }, - "NumberOfBuckets": -1, - "StoredAsSubDirectories": False, - }, - 
"PartitionKeys": table["PartitionKeys"], - "TableType": "EXTERNAL_TABLE", - "Parameters": { - "classification": "parquet", - "typeOfData": "file", - "compressionType": "none", - "EXTERNAL": "TRUE", - }, - }, - PartitionIndexes=[ - { - "Keys": [ - key["Name"] - for key in partition_indexes["PartitionIndexDescriptorList"][0][ - "Keys" - ] - ], - "IndexName": partition_indexes["PartitionIndexDescriptorList"][0][ - "IndexName" - ], - } - ] - if partition_indexes["PartitionIndexDescriptorList"] - else [], - ) - - athena_adapter.query_sql_async(f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;") + if partition_indexes: + athena_adapter.query_sql_async( + f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;" + ) glue_client.delete_table(Name=table["Name"], DatabaseName=GLUE_DB) @@ -325,14 +287,10 @@ def migrate_schemas(layer, schema_service: SchemaService, glue_client): for column in schema.columns: column.data_type = glue_column_types[column.name] - schema_service.store_schema(schema) - latest_version = schema_service.get_latest_schema(schema.metadata) - - if ( - latest_version.dataset_identifier() - != schema.metadata.dataset_identifier() - ): - schema_service.deprecate_schema(schema.metadata) + try: + schema_service.upload_schema(schema) + except ConflictError: + schema_service.update_schema(schema) s3_client.delete_object(Bucket=DATA_BUCKET, Key=file["Key"]) diff --git a/docs/changelog.md b/docs/changelog.md index 3372639..89c7632 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,11 @@ # Changelog +## v7.0.3 / v0.1.2 (sdk) - _2023-09-14_ + +### Fixes + +- Migration script and docs tweaks. + ## v7.0.2 / v0.1.2 (sdk) - _2023-09-14_ ### Fixes diff --git a/docs/migration.md b/docs/migration.md index 9b8b5f6..02ea107 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -13,23 +13,28 @@ To execute it, you'll need to decide: ### Prerequisites -#### Infrastructure changes - -The v7.0.0 infrastructure changes need to be applied to your rAPId instance. 
- -Update the version of the rAPId terraform module that you are using and apply the terraform. - #### Local requirements You will need the ability to run `Batect`, the requirements for which are listed [here](https://batect.dev/docs/getting-started/requirements/). ### Steps: +#### Make the infrastructure changes + +The v7.0.0 infrastructure changes need to be applied to your rAPId instance. + +1. Add the `layers` variable to the rAPId cluster module. `layers` will be a list of the layers you wish to use in your rAPId instance. You can omit this if you just want to use the `default` layer. +2. Change the rAPId module source to: + `git@github.com:no10ds/rapid.git//infrastructure/modules/rapid` +3. Update both the application_version and ui_version variables to `v7.0.3` + +Apply these changes. + #### Clone the repo To do this, run: -`git clone -b v7.0.0 git@github.com:no10ds/rapid.git` +`git clone -b v7.0.3 git@github.com:no10ds/rapid.git` #### Set your environment variables @@ -37,14 +42,14 @@ Within the rAPId repo, set the following variables in the `.env` file to match t ``` # rAPId instance variables -- AWS_REGION= -- DATA_BUCKET= -- RESOURCE_PREFIX= +AWS_REGION= +DATA_BUCKET= +RESOURCE_PREFIX= # AWS environment variables -- AWS_ACCESS_KEY_ID= -- AWS_SECRET_ACCESS_KEY= -- AWS_SESSION_TOKEN= +AWS_ACCESS_KEY_ID= +AWS_SECRET_ACCESS_KEY= +AWS_SESSION_TOKEN= ``` #### Run the migration script From a49b39aef1e9996bc4b91d19b7d58fc25e35710e Mon Sep 17 00:00:00 2001 From: lcard Date: Wed, 20 Sep 2023 11:28:12 +0100 Subject: [PATCH 2/3] Improve the release process, update docs, add terraform outputs --- Makefile | 3 +- docs/changelog.md | 11 ++++ docs/migration.md | 13 +++- get_latest_release_changelog.py | 26 -------- infrastructure/blocks/data-workflow/output.tf | 7 ++- .../modules/data-workflow/output.tf | 7 ++- infrastructure/modules/rapid/outputs.tf | 9 +++ infrastructure/modules/rapid/variables.tf | 4 +- release.py | 63 +++++++++++++++++++ 9 files changed, 111 
insertions(+), 32 deletions(-) delete mode 100644 get_latest_release_changelog.py create mode 100644 infrastructure/modules/rapid/outputs.tf create mode 100644 release.py diff --git a/Makefile b/Makefile index 5769073..f65b9ad 100644 --- a/Makefile +++ b/Makefile @@ -189,11 +189,12 @@ ui-zip-and-release: ui-zip-contents ui-release ## Zip and release prod static ui ## release: + @python release.py --operation check @git checkout ${commit} @git tag -a "${version}" -m "Release tag for version ${version}" @git checkout - @git push origin ${version} - @python get_latest_release_changelog.py + @python release.py --operation create-changelog @gh release create ${version} -F latest_release_changelog.md @rm -rf latest_release_changelog.md diff --git a/docs/changelog.md b/docs/changelog.md index ec4cca9..c5fc936 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,5 +1,16 @@ # Changelog +## v7.0.4 / v0.1.2 (sdk) - _2023-09-14_ + +### Features + +- Improved release process +- Added Athena workgroup and database as outputs of the rAPId module. + +### Fixes + +- Updated terraform default `application_version` and `ui_version` variables. + ## v7.0.3 / v0.1.2 (sdk) - _2023-09-15_ ### Fixes diff --git a/docs/migration.md b/docs/migration.md index 9b8b5f6..4ce04ef 100644 --- a/docs/migration.md +++ b/docs/migration.md @@ -25,11 +25,22 @@ You will need the ability to run `Batect`, the requirements for which are listed ### Steps: +#### Make the infrastructure changes + +The v7 infrastructure changes need to be applied to your rAPId instance. + +1. Add the `layers` variable to the rAPId cluster module. `layers` will be a list of the layers you wish to use in your rAPId instance. You can omit this if you just want to use the `default` layer. +2. Change the rAPId module source to: + `git@github.com:no10ds/rapid.git//infrastructure/modules/rapid` +3. Update both the application_version and ui_version variables to `v7.0.4` + +Apply these changes. 
+ #### Clone the repo To do this, run: -`git clone -b v7.0.0 git@github.com:no10ds/rapid.git` +`git clone -b v7.0.4 git@github.com:no10ds/rapid.git` #### Set your environment variables diff --git a/get_latest_release_changelog.py b/get_latest_release_changelog.py deleted file mode 100644 index 555cc45..0000000 --- a/get_latest_release_changelog.py +++ /dev/null @@ -1,26 +0,0 @@ -import re - -with open("./docs/changelog.md", "r") as changelog_file: - changelog_lines = changelog_file.readlines() - -parsed_lines = [] - -adding = False - -for line in changelog_lines: - if re.match( - # Match for the release version number e.g. ## v7.0.0 - f"##\s+v\d+\.\d+\.\d+", # noqa: F541 - line, - ): - adding = not adding - if adding: - parsed_lines.append(line) - -if not parsed_lines: - raise Exception( - "It looks like there is no release information in the changelog. Please check it." - ) -else: - with open("latest_release_changelog.md", "w+") as latest_changelog: - latest_changelog.writelines(parsed_lines) diff --git a/infrastructure/blocks/data-workflow/output.tf b/infrastructure/blocks/data-workflow/output.tf index e9750f5..3f2c682 100644 --- a/infrastructure/blocks/data-workflow/output.tf +++ b/infrastructure/blocks/data-workflow/output.tf @@ -5,7 +5,12 @@ output "athena_query_output_bucket_arn" { output "athena_workgroup_arn" { value = module.data_workflow.athena_workgroup_arn - description = "Query workgroup for Athena" + description = "The ARN of the Query workgroup for Athena" +} + +output "athena_workgroup_name" { + value = module.data_workflow.athena_workgroup_name + description = "The name of the Query workgroup for Athena" } output "schema_table_arn" { diff --git a/infrastructure/modules/data-workflow/output.tf b/infrastructure/modules/data-workflow/output.tf index 6408864..4c9f956 100644 --- a/infrastructure/modules/data-workflow/output.tf +++ b/infrastructure/modules/data-workflow/output.tf @@ -5,7 +5,12 @@ output "athena_query_result_output_bucket_arn" { output 
"athena_workgroup_arn" { value = aws_athena_workgroup.rapid_athena_workgroup.arn - description = "Query workgroup for Athena" + description = "The arn of the Query workgroup for Athena" +} + +output "athena_workgroup_name" { + value = aws_athena_workgroup.rapid_athena_workgroup.name + description = "The name of the Query workgroup for Athena" } output "catalogue_db_name" { diff --git a/infrastructure/modules/rapid/outputs.tf b/infrastructure/modules/rapid/outputs.tf new file mode 100644 index 0000000..7bdd27e --- /dev/null +++ b/infrastructure/modules/rapid/outputs.tf @@ -0,0 +1,9 @@ +output "athena_workgroup_name" { + value = module.data_workflow.athena_workgroup_name + description = "The name of the Query workgroup for Athena" +} + +output "catalogue_db_name" { + value = module.data_workflow.catalogue_db_name + description = "The name of the Glue Catalogue database" +} diff --git a/infrastructure/modules/rapid/variables.tf b/infrastructure/modules/rapid/variables.tf index 2e1f6fe..41f889d 100644 --- a/infrastructure/modules/rapid/variables.tf +++ b/infrastructure/modules/rapid/variables.tf @@ -13,13 +13,13 @@ variable "app-replica-count-max" { variable "application_version" { type = string description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)" - default = "v6.2.1" + default = "v7.0.4" } variable "ui_version" { type = string description = "The version number for the static ui (e.g.: v1.0.0, etc.)" - default = "v6.0.1" + default = "v7.0.4" } variable "catalog_disabled" { diff --git a/release.py b/release.py new file mode 100644 index 0000000..c2a410d --- /dev/null +++ b/release.py @@ -0,0 +1,63 @@ +import re +import argparse + + +def create_changelog(): + with open("./docs/changelog.md", "r") as changelog_file: + changelog_lines = changelog_file.readlines() + + parsed_lines = [] + + adding = False + + for line in changelog_lines: + if re.match( + # Match for the release version number e.g. 
## v7.0.0 + f"##\s+v\d+\.\d+\.\d+", # noqa: F541, W605 + line, + ): + adding = not adding + if adding: + parsed_lines.append(line) + + if not parsed_lines: + raise Exception( + "It looks like there is no release information in the changelog. Please check it." + ) + else: + with open("latest_release_changelog.md", "w+") as latest_changelog: + latest_changelog.writelines(parsed_lines) + + +def ask_yes_no_question(question): + while True: + answer = input(f"{question} (yes/no): ").strip().lower() + if answer == "yes" or answer == "y": + return True + elif answer == "no" or answer == "n": + print("Please fix this before continuing.") + exit(1) + else: + print("Please answer yes or no.") + + +def check(): + ask_yes_no_question("Have you updated the application and ui version in Terraform?") + ask_yes_no_question("Have you updated the changelog for this release?") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--operation", + help="The operation to carry out", + required=True, + choices=["check", "create-changelog"], + ) + args = parser.parse_args() + match args.operation: + case "create-changelog": + create_changelog() + case "check": + check() From a6fac4a982266c02a8876061451da4d3acc8f1ab Mon Sep 17 00:00:00 2001 From: lcard Date: Wed, 20 Sep 2023 11:31:55 +0100 Subject: [PATCH 3/3] Update changelog --- docs/changelog.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/changelog.md b/docs/changelog.md index c5fc936..1bfb2d1 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -1,6 +1,6 @@ # Changelog -## v7.0.4 / v0.1.2 (sdk) - _2023-09-14_ +## v7.0.4 / v0.1.2 (sdk) - _2023-09-20_ ### Features @@ -50,7 +50,8 @@ - See the [migration doc](migration.md) for details on how to migrate to v7 from v6. 
-[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.3...HEAD +[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.4...HEAD +[v7.0.4 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.3...v7.0.4 [v7.0.3 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/v7.0.2...v7.0.3 [v7.0.2 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/v7.0.1...v7.0.2 [v7.0.1 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/v7.0.0...v7.0.1