DigitalCommons
diff --git a/‎.github/workflows/docs.yml
+2 b/‎.github/workflows/docs.yml
+2
diff --git a/‎apps/back-end/src/routes.ts
+4 b/‎apps/back-end/src/routes.ts
+4
diff --git a/‎apps/back-end/src/services/Dataset.ts
+54 b/‎apps/back-end/src/services/Dataset.ts
+54
diff --git a/‎apps/back-end/src/services/datasetService.ts
+61 b/‎apps/back-end/src/services/datasetService.ts
+61
diff --git a/‎docs/architecture.md
+111 b/‎docs/architecture.md
+111
@@ -4,6 +4,8 @@
 name: docs
 on:
   push:
+    paths:
+      - "docs/**"
     branches:
       - main
 permissions:
 
@@ -4,6 +4,7 @@ import { contract } from "@mykomap/common";
 import { FastifyPluginOptions, FastifyReply, FastifyRequest } from "fastify";
 import fs from "node:fs";
 import path from "node:path";
+import { initDatasets } from "./services/datasetService.js";
 
 /** Provides the shared configuration options for the Mykomap router implementation. */
 export interface MykomapRouterConfig extends FastifyPluginOptions {
@@ -77,6 +78,9 @@ export function MykomapRouter(
         `'${opts.mykomap.dataRoot}'.`,
     );
 
+  // TODO: uncomment this when the test/data has been created with the updated structure
+  // initDatasets(opts.mykomap.dataRoot);
+
   // Concatenates the path components into an absolute file path
   const filePath = (...components: string[]): string => {
     const p = path.join(opts.mykomap.dataRoot ?? "", ...components) + ".json";
 
@@ -0,0 +1,54 @@
+import fs from "node:fs";
+import path from "node:path";
+import { TsRestResponseError } from "@ts-rest/core";
+
+import { contract } from "@mykomap/common";
+
+/**
+ * In-memory handle on a single dataset stored under `<dataRoot>/datasets/<id>`.
+ *
+ * The constructor synchronously reads and parses `searchable.json` from the dataset folder, so
+ * construction throws if that file is missing or does not contain valid JSON.
+ */
+export class Dataset {
+  id: string;
+  folderPath: string;
+  // NOTE(review): docs/architecture.md shows `searchable.json` as a `{ fields, values }` object,
+  // whereas this type describes an array of records. The parsed JSON is not validated here, so
+  // confirm which shape the data generation script will actually emit.
+  searchable: ({ [prop: string]: string } & { searchString: string })[];
+
+  constructor(id: string, dataRoot: string) {
+    this.id = id;
+    this.folderPath = path.join(dataRoot, "datasets", id);
+    this.searchable = JSON.parse(
+      fs.readFileSync(path.join(this.folderPath, "searchable.json"), "utf8"),
+    );
+  }
+
+  /**
+   * Reads and parses `initiatives/<itemId>.json` from this dataset's folder.
+   *
+   * @param itemId - numeric ID of the item; used verbatim as the file's base name.
+   * @returns the parsed JSON content of the item file.
+   * @throws TsRestResponseError with status 404 when the item file does not exist.
+   */
+  getItem = (itemId: number) => {
+    const itemPath = path.join(
+      this.folderPath,
+      "initiatives",
+      `${itemId}.json`,
+    );
+
+    let raw: string;
+    try {
+      // Read directly rather than checking existence first: a separate check-then-read leaves
+      // a window in which the file can vanish and surface as a raw error instead of a 404.
+      raw = fs.readFileSync(itemPath, "utf8");
+    } catch (error: unknown) {
+      const code = (error as NodeJS.ErrnoException | null)?.code;
+      // Only "no such file" conditions map to 404; anything else is a genuine server error.
+      if (code !== "ENOENT" && code !== "ENOTDIR") throw error;
+
+      throw new TsRestResponseError(contract.getDatasetItem, {
+        status: 404,
+        body: {
+          message: `can't retrieve data for dataset ${this.id} item ${itemId}`,
+        },
+      });
+    }
+
+    return JSON.parse(raw);
+  };
+
+  /** Placeholder: currently always returns an empty object. */
+  getConfig = () => {
+    // TODO: implementation
+    return {};
+  };
+
+  /** Opens a UTF-8 read stream over this dataset's `locations.json`. */
+  getLocations = (): fs.ReadStream =>
+    fs.createReadStream(path.join(this.folderPath, "locations.json"), "utf8");
+
+  /** Placeholder: currently ignores both arguments and returns an empty array. */
+  search = (filter?: string[], text?: string): number[] => {
+    // TODO: implementation
+    return [];
+  };
+}
@@ -0,0 +1,61 @@
+import fs from "node:fs";
+import path from "node:path";
+import { TsRestResponseError } from "@ts-rest/core";
+
+import { contract } from "@mykomap/common";
+import { Dataset } from "./Dataset.js";
+
+const datasets: { [id: string]: Dataset } = {};
+
+/**
+ * Scans `<dataRoot>/datasets` and registers one Dataset instance per sub-directory found there,
+ * keyed by the directory name. Non-directory entries are ignored.
+ */
+export const initDatasets = (dataRoot: string) => {
+  const datasetsDir = path.join(dataRoot, "datasets");
+  const datasetIds: string[] = [];
+
+  for (const entry of fs.readdirSync(datasetsDir, { withFileTypes: true })) {
+    if (entry.isDirectory()) datasetIds.push(entry.name);
+  }
+
+  console.log("Found datasets:", datasetIds);
+
+  datasetIds.forEach((datasetId) => {
+    datasets[datasetId] = new Dataset(datasetId, dataRoot);
+  });
+};
+
+/**
+ * Looks up a registered dataset by ID.
+ *
+ * @throws TsRestResponseError with status 404 when no dataset is registered under that ID.
+ */
+const getDatasetOrThrow404 = (datasetId: string): Dataset => {
+  // `datasets` is a plain object, so IDs such as "constructor" or "__proto__" would otherwise
+  // resolve to inherited, truthy non-Dataset values and fail later instead of yielding a 404.
+  const dataset = Object.prototype.hasOwnProperty.call(datasets, datasetId)
+    ? datasets[datasetId]
+    : undefined;
+
+  // NOTE(review): every caller gets an error tied to `contract.searchDataset`, including the
+  // item/config/locations lookups - confirm that is intended.
+  if (!dataset)
+    throw new TsRestResponseError(contract.searchDataset, {
+      status: 404,
+      body: { message: `dataset ${datasetId} doesn't exist` },
+    });
+
+  return dataset;
+};
+
+/** Resolves the dataset (404 if unknown) and returns the requested item from it. */
+export const getDatasetItem = (datasetId: string, datasetItemId: number) =>
+  getDatasetOrThrow404(datasetId).getItem(datasetItemId);
+
+/** Resolves the dataset (404 if unknown) and returns its config. */
+export const getDatasetConfig = (datasetId: string) =>
+  getDatasetOrThrow404(datasetId).getConfig();
+
+/** Resolves the dataset (404 if unknown) and returns a read stream of its locations. */
+export const getDatasetLocations = (datasetId: string): fs.ReadStream =>
+  getDatasetOrThrow404(datasetId).getLocations();
+
+/** Resolves the dataset (404 if unknown) and delegates the search to it. */
+export const searchDataset = (
+  datasetId: string,
+  filter?: string[],
+  text?: string,
+): number[] => getDatasetOrThrow404(datasetId).search(filter, text);
@@ -18,3 +18,114 @@
   subscribe to the Redux store without React, but this would be more complicated.
 
 ## Back-end architecture
+
+![Diagram](images/architecture-back-end.drawio.svg)
+
+### Dataset files
+
+All persistent data is stored on the back-end server as JSON files, in the following folder structure
+as seen from the SERVER_DATA_ROOT location:
+
+```
+├── datasets
+│   ├── some-dataset
+│   │   ├── locations.json (array of lng-lat coordinates for each initiative)
+│   │   ├── searchable.json (array of the property values and searchable strings for each initiative)
+│   │   ├── initiatives
+│   │   │   ├── 0.json (full info of first initiative in the above aggregate JSONs)
+│   │   │   ├── 1.json
+│   │   │   ├── ...
+│   ├── other-dataset
+│   │   ├── ...
+│   ├── ...
+```
+
+Additionally, for each dataset there's a `config.json`. This contains config for displaying the map
+in the UI, including the vocabs (translations of data IDs), default sidebar panel, and popup
+appearance. This config is not generated into the above folder structure, but kept in source control
+in the `@mykomap/config` library.
+
+### Example file contents
+
+`locations.json`:
+
+```
+[ [1.21419, 50.45254], [0.21002, 49.33954], … ]
+```
+
+`searchable.json`:
+
+```
+{
+  "fields": ["coun", "sz", "searchString"],
+  "values": [
+    ["GB", "Small", "some co-op 2 green lane london n4 9qr"],
+    ["GB", "Large", "another co-op 15 brown street sheffield s7 0hg"],
+    ...
+  ]
+}
+```
+
+#### Potential optimisation:
+
+Since there will be one row per item, with 100k items, every 10 characters adds a new megabyte. The really bulky bit is the text searchString part, so maybe it could be kept in its own plain text file, with one line per item. Searching it could be done by streaming it from disk, which avoids loading the entire file permanently into memory (for each dataset).
+
+For instance, this [SO thread](https://stackoverflow.com/questions/20187145/how-to-search-stream-for-string-in-node-js) has some sample stream-searching code, and a reference to a module which performs the streaming by what appears to be a fast non-buffering algorithm.
+
+`0.json`:
+
+```
+{ "name": "Some Co-op", "desc": "A co-op that sells stuff", "lng": 1.21419, "lat": 50.45254, "coun": "GB", "sz": "Small", ... }
+```
+
+`config.json`:
+
+```
+{
+  "prefixes": {
+    "https://example.com/sizes/1.1/": "sz",
+    ...
+  },
+  "vocabs": {
+    "sz": {
+      "EN": {
+        "title": "Sizes",
+        "terms": {
+          "large": "Large",
+          "medium": "Medium",
+          "small": "Small"
+        }
+      }
+    },
+    ...
+  },
+  "popupFields": {
+    "sz": "text",
+    "websites": "clickable-list",
+    ...
+  },
+  "ui": { ... },
+  ...
+}
+
+```
+
+### Data generation
+
+These directories of JSONs, including the searchable strings in the `searchable.json` files, need to be pre-generated by a script. This script will be written in JS/TS and live in the monorepo, to be run on the back-end server.
+
+The script will take the full data CSV for a map (generated by the data factory) as input, and write the full data into the required JSON files in the directory structure specified above.
+
+#### Note:
+
+We will need to manually copy the `standard.csv` from the data factory server to the back-end. Maybe in the future, the data factory pipeline can be enhanced to write the JSON files to the back-end server so that no manual duplication is necessary (and maybe we can eventually get rid of the separate data server altogether). Or, the back-end server could be given a URL to the appropriate `standard.csv` file(s) as published by the data factory and download it from there as part of a `build-data` script (possibly when notified by a webhook, or possibly polling and checking the file modification date).
+
+### Dataset instances
+
+- For each dataset available in the `datasets` directory on server start, a dataset instance is created
+  by the Dataset service. Each Dataset instance has a:
+  - `searchable` property, which is just the `searchable.json` loaded as an in-memory object
+  - `getItem` method
+  - `getConfig` method, which includes the vocabs
+  - `getLocations` method, which returns a stream of the data
+  - `search` method, which involves iterating through `searchable` to find matching initiatives