From 695dfbd54fc84d211a16d1f536780caec2e9a766 Mon Sep 17 00:00:00 2001 From: rogup Date: Mon, 14 Oct 2024 15:22:41 +0100 Subject: [PATCH 1/2] [back-end] Add back-end architecture docs --- .github/workflows/docs.yml | 2 + docs/architecture.md | 111 +++++++ docs/images/architecture-back-end.drawio.svg | 295 ++++++++++++++++++ docs/images/architecture-front-end.drawio.svg | 4 +- docs/monorepo.md | 15 +- 5 files changed, 419 insertions(+), 8 deletions(-) create mode 100644 docs/images/architecture-back-end.drawio.svg diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml index 75deab23..3a2f5cb8 100644 --- a/.github/workflows/docs.yml +++ b/.github/workflows/docs.yml @@ -4,6 +4,8 @@ name: docs on: push: + paths: + - "docs/**" branches: - main permissions: diff --git a/docs/architecture.md b/docs/architecture.md index a33beced..d19e69fd 100644 --- a/docs/architecture.md +++ b/docs/architecture.md @@ -18,3 +18,114 @@ subscribe to the Redux store without React, but this would be more complicated. ## Back-end architecture + +![Diagram](images/architecture-back-end.drawio.svg) + +### Dataset files + +All persistent data is stored on the back-end server as JSON files, in the following folder structure +as seen from the SERVER_DATA_ROOT location: + +``` +├── datasets +│ ├── some-dataset +│ │ ├── locations.json (array of lng-lat coordinates for each initiative) +│ │ ├── searchable.json (array of the property values and searchable strings for each initiative) +│ │ ├── initiatives +│ │ | ├── 0.json (full info of first initiative in the above aggregate JSONs) +│ │ | ├── 1.json +│ │ | ├── ... +│ ├── other-dataset +│ │ ├── ... +│ ├── ... +``` + +Additionally, for each dataset there's a `config.json`. This contains config for displaying the map +in the UI, including the vocabs (translations of data IDs), default sidebar panel, and popup +appearance. 
This config is not generated into the above folder structure, but kept in source control +in the `@mykomap/config` library. + +### Example file contents + +`locations.json`: + +``` +[ [1.21419, 50.45254], [0.21002, 49.33954], … ] +``` + +`searchable.json`: + +``` +{ + "fields": ["coun", "sz", "searchString"], + "values": [ + ["GB", "Small", "some co-op 2 green lane london n4 9qr"], + ["GB", "Large", "another co-op 15 brown street sheffield s7 0hg"], + ... + ] +} +``` + +#### Potential optimisation: + +Since there will be one row per item, with 100k items, every 10 characters adds a new megabyte. The really bulky bit is the text searchString part, so maybe it could be kept in its own plain text file, with one line per item. Searching it could be done by streaming it from disk, which avoids loading the entire file permanently into memory (for each dataset). + +For instance, this [SO thread](https://stackoverflow.com/questions/20187145/how-to-search-stream-for-string-in-node-js) has some sample stream-searching code, and a reference to a module which performs the streaming by what appears to be a fast non-buffering algorithm. + +`0.json`: + +``` +{ "name": "Some Co-op", "desc": "A co-op that sells stuff", "lng": 1.21419, "lat": 50.45254, "coun": "GB", "sz": "Small", ... } +``` + +`config.json`: + +``` +{ + "prefixes": { + "https://example.com/sizes/1.1/": "sz", + ... + }, + "vocabs": { + "sz": { + "EN": { + "title": "Sizes", + "terms": { + "large": "Large", + "medium": "Medium", + "small": "Small" + } + } + }, + ... + }, + "popupFields": { + "sz": "text", + "websites": "clickable-list", + ... + }, + "ui": { ... }, + ... +} + +``` + +### Data generation + +These directories of JSONs, including the searchable strings in the `searchable.json` files, need to be pre-generated by a script. This script will be written in JS/TS and live in the monorepo, to be run on the back-end server. 
+ +The script will take the full data CSV for a map (generated by the data factory) as inputs, and write the full data into the required JSON files in the directory structure specified above. + +#### Note: + +We will need to manually copy the `standard.csv` from the data factory server to the back-end. Maybe in the future, the data factory pipeline can be enhanced to write the JSON files to the back-end server so that no manual duplication is necessary (and maybe we can eventually get rid of the separate data server altogether). Or, the back-end server could be given a URL to the appropriate `standard.csv` file(s) as published by the data factory and download it from there as part of a `build-data` script (possibly when notified by a webhook, or possibly polling and checking the file modification date). + +### Dataset instances + +- For each dataset available in the `datasets` directory on server start, a dataset instance is created + by the Dataset service. Each Dataset instance has a: + - `searchable` property, which is just the `searchable.json` loaded as an in-memory object + - `getItem` method + - `getConfig` method, which includes the vocabs + - `getLocations` method, which returns a stream of the data + - `search` method, which involves iterating through `searchable` to find matching initiatives diff --git a/docs/images/architecture-back-end.drawio.svg b/docs/images/architecture-back-end.drawio.svg new file mode 100644 index 00000000..d08450df --- /dev/null +++ b/docs/images/architecture-back-end.drawio.svg @@ -0,0 +1,295 @@ + + + + + + + + +
+
+
+ Back-end application +
+
+
+
+ + Back-end application + +
+
+ + + + +
+
+
+ 'datasets' directory +
+
+
+
+ + 'datasets' directory + +
+
+ + + + +
+
+
+ + Mykomap +
+ front-end +
+
+
+
+
+
+ + Mykomap... + +
+
+ + + + + +
+
+
+ + + 'some-dataset' +
+ JSON files +
+
+
+
+
+
+
+ + 'some-dataset'... + +
+
+ + + + +
+
+
+ + Dataset service + +
+ contains logic for searching and serving data files +
+
+
+
+ + Dataset service... + +
+
+ + + + + +
+
+
+
+ Request a search +
+
+ or specific data +
+
+ from a dataset +
+
+
+
+
+ + Request a... + +
+
+ + + + + +
+
+
+ API +
+
+
+
+ + API + +
+
+ + + + +
+
+
+ + Fastify / ts-rest +
+ server +
+
+
+
+
+
+ + Fastify / ts-res... + +
+
+ + + + +
+
+
+ + 'some-dataset' +
+ instance +
+
+
+
+
+
+ + 'some-dataset'... + +
+
+ + + + +
+
+
+ + 'other-dataset' +
+ instance +
+
+
+
+
+
+ + 'other-dataset'... + +
+
+ + + + + +
+
+
+ + + 'other-dataset' +
+ JSON files +
+
+
+
+
+
+
+ + 'other-dataset'... + +
+
+ + + + + +
+
+
+
+ reads +
+
+
+
+
+ + reads + +
+
+ + + + + +
+
+
+
+ reads +
+
+
+
+
+ + reads + +
+
+ + + + +
+ + + + + Text is not SVG - cannot display + + + +
diff --git a/docs/images/architecture-front-end.drawio.svg b/docs/images/architecture-front-end.drawio.svg index bf8f19b5..960cc7c6 100644 --- a/docs/images/architecture-front-end.drawio.svg +++ b/docs/images/architecture-front-end.drawio.svg @@ -1,4 +1,4 @@ - + @@ -105,7 +105,7 @@ - + diff --git a/docs/monorepo.md b/docs/monorepo.md index 096c8331..9e92ae6a 100644 --- a/docs/monorepo.md +++ b/docs/monorepo.md @@ -1,19 +1,18 @@ -# Monorepo thinking +# Monorepo structure ``` monorepo |- apps |- @mykomap/front-end |- @mykomap/back-end - |- Map class instance for each config that needs a deployed backend - |- Single Fastify server shared across all maps + |- Dataset class instance for each dataset that needs a deployed backend + |- Single ts-rest/Fastify server shared across all datasets |- libs |- @mykomap/common |- search/filter code (imported in BE and some FE builds) |- API ts-rest contract + OpenAPI spec - |- @mykomap/variants, a folder for each map containing: - |- config.ts ... includes flag for whether to use search/filter in FE or BE - |- popup.ts (with an aim to commonalise this code and configure within config.ts) + |- @mykomap/config, a folder for each dataset containing: + |- config.json ... includes vocabs, UI config, and flag for whether to use search/filter in FE or BE ``` ## Ideas @@ -22,3 +21,7 @@ monorepo - Use git subtree, so we can publish sub-folders of the monorepo as independent repos that others can use. We can set this up later if required, but for now try to ensure repositories are not tightly coupled unecessarily. +- Differentiate between maps and datasets: currently a map's data and config is associated with a + single dataset on the backend. We may want to unlink these so that 2 maps can share the same + dataset with different config, or a map can show multiple datasets. This could be done in the + future. 
From b25c7b794838749fc7ce85229bef1e04c45ff9bc Mon Sep 17 00:00:00 2001 From: rogup Date: Mon, 14 Oct 2024 17:36:04 +0100 Subject: [PATCH 2/2] [back-end] Add datasetService and a new Dataset class --- apps/back-end/src/routes.ts | 4 ++ apps/back-end/src/services/Dataset.ts | 54 +++++++++++++++++ apps/back-end/src/services/datasetService.ts | 61 ++++++++++++++++++++ 3 files changed, 119 insertions(+) create mode 100644 apps/back-end/src/services/Dataset.ts create mode 100644 apps/back-end/src/services/datasetService.ts diff --git a/apps/back-end/src/routes.ts b/apps/back-end/src/routes.ts index afed75c1..6b3b1be0 100644 --- a/apps/back-end/src/routes.ts +++ b/apps/back-end/src/routes.ts @@ -4,6 +4,7 @@ import { contract } from "@mykomap/common"; import { FastifyPluginOptions, FastifyReply, FastifyRequest } from "fastify"; import fs from "node:fs"; import path from "node:path"; +import { initDatasets } from "./services/datasetService.js"; /** Provides the shared configuration options for the Mykomap router implementation. */ export interface MykomapRouterConfig extends FastifyPluginOptions { @@ -77,6 +78,9 @@ export function MykomapRouter( `'${opts.mykomap.dataRoot}'.`, ); + // TODO: uncomment this when the test/data has been created with the updated structure + // initDatasets(opts.mykomap.dataRoot); + // Concatenates the path components into an absolute file path const filePath = (...components: string[]): string => { const p = path.join(opts.mykomap.dataRoot ?? 
"", ...components) + ".json"; diff --git a/apps/back-end/src/services/Dataset.ts b/apps/back-end/src/services/Dataset.ts new file mode 100644 index 00000000..f854f368 --- /dev/null +++ b/apps/back-end/src/services/Dataset.ts @@ -0,0 +1,54 @@ +import fs from "node:fs"; +import path from "node:path"; +import { TsRestResponseError } from "@ts-rest/core"; + +import { contract } from "@mykomap/common"; + +export class Dataset { + id: string; + folderPath: string; + searchable: ({ [prop: string]: string } & { searchString: string })[]; + + constructor(id: string, dataRoot: string) { + this.id = id; + this.folderPath = path.join(dataRoot, "datasets", id); + this.searchable = JSON.parse( + fs.readFileSync(path.join(this.folderPath, "searchable.json"), "utf8"), + ); + } + + getItem = (itemId: number) => { + if ( + !fs.existsSync( + path.join(this.folderPath, "initiatives", `${itemId}.json`), + ) + ) { + throw new TsRestResponseError(contract.getDatasetItem, { + status: 404, + body: { + message: `can't retrieve data for dataset ${this.id} item ${itemId}`, + }, + }); + } + + return JSON.parse( + fs.readFileSync( + path.join(this.folderPath, "initiatives", `${itemId}.json`), + "utf8", + ), + ); + }; + + getConfig = () => { + // TODO: implementation + return {}; + }; + + getLocations = (): fs.ReadStream => + fs.createReadStream(path.join(this.folderPath, "locations.json"), "utf8"); + + search = (filter?: string[], text?: string): number[] => { + // TODO: implementation + return []; + }; +} diff --git a/apps/back-end/src/services/datasetService.ts b/apps/back-end/src/services/datasetService.ts new file mode 100644 index 00000000..a97b3c7a --- /dev/null +++ b/apps/back-end/src/services/datasetService.ts @@ -0,0 +1,61 @@ +import fs from "node:fs"; +import path from "node:path"; +import { TsRestResponseError } from "@ts-rest/core"; + +import { contract } from "@mykomap/common"; +import { Dataset } from "./Dataset.js"; + +const datasets: { [id: string]: Dataset } = {}; + +/** + * 
This method instantiates a Dataset object for each of the datasets in the dataRoot/datasets + * directory in the filesystem. + */ +export const initDatasets = (dataRoot: string) => { + const datasetIds = fs + .readdirSync(path.join(dataRoot, "datasets"), { withFileTypes: true }) + .filter((f) => f.isDirectory()) + .map((f) => f.name); + + console.log("Found datasets:", datasetIds); + + for (const datasetId of datasetIds) { + datasets[datasetId] = new Dataset(datasetId, dataRoot); + } +}; + +const getDatasetOrThrow404 = (datasetId: string): Dataset => { + const dataset = datasets[datasetId]; + + if (!dataset) + throw new TsRestResponseError(contract.searchDataset, { + status: 404, + body: { message: `dataset ${datasetId} doesn't exist` }, + }); + + return dataset; +}; + +export const getDatasetItem = (datasetId: string, datasetItemId: number) => { + const dataset = getDatasetOrThrow404(datasetId); + return dataset.getItem(datasetItemId); +}; + +export const getDatasetConfig = (datasetId: string) => { + const dataset = getDatasetOrThrow404(datasetId); + return dataset.getConfig(); +}; + +export const getDatasetLocations = (datasetId: string): fs.ReadStream => { + const dataset = getDatasetOrThrow404(datasetId); + return dataset.getLocations(); +}; + +export const searchDataset = ( + datasetId: string, + filter?: string[], + text?: string, +): number[] => { + const dataset = getDatasetOrThrow404(datasetId); + return dataset.search(filter, text); +};