Skip to content

Commit

Permalink
Merge pull request #35 from no10ds/fix/v7-migration
Browse files Browse the repository at this point in the history
Add fixes to the migration script and docs
  • Loading branch information
lcardno10 authored Sep 20, 2023
2 parents 1ba1c90 + 340221b commit 925c6f9
Show file tree
Hide file tree
Showing 11 changed files with 142 additions and 104 deletions.
5 changes: 3 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -189,16 +189,17 @@ ui-zip-and-release: ui-zip-contents ui-release ## Zip and release prod static ui

##
release:
@python release.py --operation check
@git checkout ${commit}
@git tag -a "${version}" -m "Release tag for version ${version}"
@git checkout -
@git push origin ${version}
@python get_latest_release_changelog.py
@python release.py --operation create-changelog
@gh release create ${version} -F latest_release_changelog.md
@rm -rf latest_release_changelog.md


# Migration --------------------
##
migrate-v7: ## Run the migration
@cd api/; ./batect migrate-v7 -- "--layer ${layer} --all-layers ${all-layers}"
@cd api/; ./batect migrate-v7 -- --layer ${layer} --all-layers ${all-layers}
4 changes: 4 additions & 0 deletions api/batect.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,4 +130,8 @@ tasks:
description: Run the rAPId migration script
run:
container: service-image
environment:
AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY_ID:-}
AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_ACCESS_KEY:-}
AWS_SESSION_TOKEN: ${AWS_SESSION_TOKEN:-}
command: 'python migrations/scripts/v7_layer_migration.py'
74 changes: 16 additions & 58 deletions api/migrations/scripts/v7_layer_migration.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@
Please ensure that none of the crawlers are running when you start this script.
"""
import sys
import os
import argparse
from copy import deepcopy
import json
import os
from typing import List
import re
from pprint import pprint
Expand All @@ -26,10 +27,15 @@

dotenv.load_dotenv()

# Add api to PYTHONPATH
current_dir = os.path.dirname(os.path.abspath(__file__))
target_dir = os.path.abspath(os.path.join(current_dir, "..", ".."))
sys.path.append(target_dir)

from api.domain.schema import Schema # noqa: E402
from api.application.services.schema_service import SchemaService # noqa: E402
from api.adapter.athena_adapter import AthenaAdapter # noqa: E402
from api.common.custom_exceptions import ConflictError # noqa: E402


AWS_REGION = os.environ["AWS_REGION"]
Expand All @@ -54,10 +60,10 @@ def main(
athena_adapter,
):
migrate_files(layer, s3_client)
migrate_permissions(layer, all_layers, dynamodb_client)
schema_errors = migrate_schemas(layer, schema_service, glue_client)
migrate_tables(layer, glue_client, athena_adapter)
migrate_crawlers(glue_client, resource_client)
migrate_permissions(layer, all_layers, dynamodb_client)

if schema_errors:
print("- These were the schema errors that need to be addressed manually")
Expand Down Expand Up @@ -204,54 +210,10 @@ def migrate_tables(layer: str, glue_client, athena_adapter: AthenaAdapter):
DatabaseName=GLUE_DB,
TableName=table["Name"],
)

glue_client.create_table(
DatabaseName=GLUE_DB,
TableInput={
"Name": f"{layer}_{table['Name']}",
"Owner": "hadoop",
"StorageDescriptor": {
"Columns": table["StorageDescriptor"]["Columns"],
"Location": table["StorageDescriptor"]["Location"].replace(
"/data/", f"/data/{layer}/"
),
"InputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat",
"OutputFormat": "org.apache.hadoop.hive.ql.io.parquet.MapredParquetOutputFormat",
"Compressed": False,
"SerdeInfo": {
"SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
"Parameters": {"serialization.format": "1"},
},
"NumberOfBuckets": -1,
"StoredAsSubDirectories": False,
},
"PartitionKeys": table["PartitionKeys"],
"TableType": "EXTERNAL_TABLE",
"Parameters": {
"classification": "parquet",
"typeOfData": "file",
"compressionType": "none",
"EXTERNAL": "TRUE",
},
},
PartitionIndexes=[
{
"Keys": [
key["Name"]
for key in partition_indexes["PartitionIndexDescriptorList"][0][
"Keys"
]
],
"IndexName": partition_indexes["PartitionIndexDescriptorList"][0][
"IndexName"
],
}
]
if partition_indexes["PartitionIndexDescriptorList"]
else [],
)

athena_adapter.query_sql_async(f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;")
if partition_indexes:
athena_adapter.query_sql_async(
f"MSCK REPAIR TABLE `f{layer}_{table['Name']}`;"
)
glue_client.delete_table(Name=table["Name"], DatabaseName=GLUE_DB)


Expand Down Expand Up @@ -325,14 +287,10 @@ def migrate_schemas(layer, schema_service: SchemaService, glue_client):
for column in schema.columns:
column.data_type = glue_column_types[column.name]

schema_service.store_schema(schema)
latest_version = schema_service.get_latest_schema(schema.metadata)

if (
latest_version.dataset_identifier()
!= schema.metadata.dataset_identifier()
):
schema_service.deprecate_schema(schema.metadata)
try:
schema_service.upload_schema(schema)
except ConflictError:
schema_service.update_schema(schema)

s3_client.delete_object(Bucket=DATA_BUCKET, Key=file["Key"])

Expand Down
16 changes: 15 additions & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
@@ -1,11 +1,24 @@
# Changelog

## v7.0.4 / v0.1.2 (sdk) - _2023-09-20_

### Features

- Improved release process
- Added Athena workgroup and database as outputs of the rAPId module.

### Fixes

- Updated terraform default `application_version` and `ui_version` variables.
- Migration script and documentation.

## v7.0.3 / v0.1.2 (sdk) - _2023-09-15_

### Fixes

- Fixes issue where permissions were not being correctly read and causing api functionality to fail


## v7.0.2 / v0.1.2 (sdk) - _2023-09-14_

### Fixes
Expand Down Expand Up @@ -39,7 +52,8 @@

- See the [migration doc](migration.md) for details on how to migrate to v7 from v6.

[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.3...HEAD
[Unreleased changes]: https://github.com/no10ds/rapid/compare/v7.0.4...HEAD
[v7.0.4 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.3...v7.0.4
[v7.0.3 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.2...v7.0.3
[v7.0.2 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.1...v7.0.2
[v7.0.1 / v0.1.2 (sdk)]: https://github.com/no10ds/rapid/compare/v7.0.0...v7.0.1
Expand Down
31 changes: 18 additions & 13 deletions docs/migration.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,38 +13,43 @@ To execute it, you'll need to decide:

### Prerequisites

#### Infrastructure changes

The v7.0.0 infrastructure changes need to be applied to your rAPId instance.

Update the version of the rAPId terraform module that you are using and apply the terraform.

#### Local requirements

You will need the ability to run `Batect`, the requirements for which are listed [here](https://batect.dev/docs/getting-started/requirements/).

### Steps:

#### Make the infrastructure changes

The v7 infrastructure changes need to be applied to your rAPId instance.

1. Add the `layers` variable to the rAPId cluster module. `layers` will be a list of the layers you wish to use in your rAPId instance. You can omit this if you just want to use the `default` layer.
2. Change the rAPId module source to:
`[email protected]:no10ds/rapid.git//infrastructure/modules/rapid`
3. Update both the application_version and ui_version variables to `v7.0.4`

Apply these changes.

#### Clone the repo

To do this, run:

`git clone -b v7.0.0 [email protected]:no10ds/rapid.git`
`git clone -b v7.0.4 [email protected]:no10ds/rapid.git`

#### Set your environment variables

Within the rAPId repo, set the following variables in the `.env` file to match those of your rAPId instance and AWS account:

```
# rAPId instance variables
- AWS_REGION=
- DATA_BUCKET=
- RESOURCE_PREFIX=
AWS_REGION=
DATA_BUCKET=
RESOURCE_PREFIX=
# AWS environment variables
- AWS_ACCESS_KEY_ID=
- AWS_SECRET_ACCESS_KEY=
- AWS_SESSION_TOKEN=
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
AWS_SESSION_TOKEN=
```

#### Run the migration script
Expand Down
26 changes: 0 additions & 26 deletions get_latest_release_changelog.py

This file was deleted.

7 changes: 6 additions & 1 deletion infrastructure/blocks/data-workflow/output.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ output "athena_query_output_bucket_arn" {

output "athena_workgroup_arn" {
value = module.data_workflow.athena_workgroup_arn
description = "Query workgroup for Athena"
description = "The ARN of the Query workgroup for Athena"
}

output "athena_workgroup_name" {
value = module.data_workflow.athena_workgroup_name
description = "The name of the Query workgroup for Athena"
}

output "schema_table_arn" {
Expand Down
7 changes: 6 additions & 1 deletion infrastructure/modules/data-workflow/output.tf
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,12 @@ output "athena_query_result_output_bucket_arn" {

output "athena_workgroup_arn" {
value = aws_athena_workgroup.rapid_athena_workgroup.arn
description = "Query workgroup for Athena"
description = "The arn of the Query workgroup for Athena"
}

output "athena_workgroup_name" {
value = aws_athena_workgroup.rapid_athena_workgroup.name
description = "The name of the Query workgroup for Athena"
}

output "catalogue_db_name" {
Expand Down
9 changes: 9 additions & 0 deletions infrastructure/modules/rapid/outputs.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
output "athena_workgroup_name" {
value = module.data_workflow.athena_workgroup_name
description = "The name of the Query workgroup for Athena"
}

output "catalogue_db_name" {
value = module.data_workflow.catalogue_db_name
description = "The name of the Glue Catalogue database"
}
4 changes: 2 additions & 2 deletions infrastructure/modules/rapid/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,13 @@ variable "app-replica-count-max" {
variable "application_version" {
type = string
description = "The version number for the application image (e.g.: v1.0.4, v1.0.x-latest, etc.)"
default = "v6.2.1"
default = "v7.0.4"
}

variable "ui_version" {
type = string
description = "The version number for the static ui (e.g.: v1.0.0, etc.)"
default = "v6.0.1"
default = "v7.0.4"
}

variable "catalog_disabled" {
Expand Down
63 changes: 63 additions & 0 deletions release.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
import re
import argparse


def create_changelog():
    """Extract the latest release's section from ``docs/changelog.md``.

    Keeps every line from the first version heading (e.g. ``## v7.0.0``)
    up to, but not including, the next version heading, and writes the
    result to ``latest_release_changelog.md`` for use as GitHub release
    notes.

    Raises:
        Exception: If no version heading is found in the changelog.
    """
    with open("./docs/changelog.md", "r") as changelog_file:
        changelog_lines = changelog_file.readlines()

    # Raw string avoids the invalid escape sequences (W605) that the
    # original placeholder-free f-string (F541) produced.
    version_heading = re.compile(r"##\s+v\d+\.\d+\.\d+")

    parsed_lines = []
    adding = False

    for line in changelog_lines:
        if version_heading.match(line):
            if adding:
                # Second version heading: the latest release section is
                # complete. Stop here — toggling instead of breaking would
                # wrongly capture every *other* release section as well.
                break
            adding = True
        if adding:
            parsed_lines.append(line)

    if not parsed_lines:
        raise Exception(
            "It looks like there is no release information in the changelog. Please check it."
        )
    with open("latest_release_changelog.md", "w+") as latest_changelog:
        latest_changelog.writelines(parsed_lines)


def ask_yes_no_question(question):
    """Prompt the user with *question* until they answer yes or no.

    Args:
        question: The question text to display; " (yes/no): " is appended.

    Returns:
        True once the user answers "yes" (or "y").

    Raises:
        SystemExit: With status 1 if the user answers "no" (or "n").
    """
    while True:
        answer = input(f"{question} (yes/no): ").strip().lower()
        if answer in ("yes", "y"):
            return True
        if answer in ("no", "n"):
            print("Please fix this before continuing.")
            # raise SystemExit rather than the site-module exit() helper,
            # which is not guaranteed to exist when Python runs without
            # the site module (e.g. `python -S`).
            raise SystemExit(1)
        print("Please answer yes or no.")


def check():
    """Interactively confirm the manual pre-release checklist is done.

    Each unanswered or negative response causes ask_yes_no_question to
    exit the process, so reaching the end means all checks passed.
    """
    checklist = (
        "Have you updated the application and ui version in Terraform?",
        "Have you updated the changelog for this release?",
    )
    for item in checklist:
        ask_yes_no_question(item)


if __name__ == "__main__":
    # CLI entry point: parse the requested operation and dispatch it.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--operation",
        help="The operation to carry out",
        required=True,
        choices=["check", "create-changelog"],
    )
    cli_args = parser.parse_args()

    # Dispatch table keyed by operation name; argparse `choices` already
    # guarantees the key is present, so a plain lookup is safe.
    operations = {
        "create-changelog": create_changelog,
        "check": check,
    }
    operations[cli_args.operation]()

0 comments on commit 925c6f9

Please sign in to comment.