Skip to content

Commit

Permalink
(DOCSP-31304): Implement 'all' command (#40)
Browse files Browse the repository at this point in the history
* Implement 'all' command

* Add tests

* Fix slight bug and add more logging

* Fix bug betterlike

* Address comments
  • Loading branch information
cbush authored Jul 13, 2023
1 parent 39c0f97 commit f0c4cf2
Show file tree
Hide file tree
Showing 8 changed files with 427 additions and 89 deletions.
1 change: 1 addition & 0 deletions chat-core/src/DatabaseConnection.ts
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ export const makeDatabaseConnection = async ({
},

async updateEmbeddedContent({ page, embeddedContent }) {
assert(embeddedContent.length !== 0);
embeddedContent.forEach((embeddedContent) => {
assert(
embeddedContent.sourceName === page.sourceName &&
Expand Down
3 changes: 3 additions & 0 deletions chat-core/src/Page.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,8 @@ export type PageStore = {
sources?: string[];
}): Promise<PersistedPage[]>;

/**
Updates or adds the given pages in the store.
*/
updatePages(pages: PersistedPage[]): Promise<void>;
};
87 changes: 87 additions & 0 deletions ingest/src/IngestMetaStore.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
import { MongoClient } from "mongodb";

/**
Persistence interface for ingest run metadata.

The ingest meta has information about ingest runs so that the script can
resume from a known successful run date.

Why the date matters:
- If the 'since' date given to the embed command is too late, pages that were
updated during a failed run will not be picked up.
- If it is too early, more pages and embeddings will be checked than
necessary. The embed command will not unnecessarily create new embeddings for
page updates that it has already created embeddings for, but it would still be
wasteful to have to check potentially all pages and embeddings when the date
is early enough.
*/
export type IngestMetaStore = {
/**
The ID of the specific metadata document this store is associated with.
Generally there should be only one document per ingest_meta collection per
database.
*/
readonly entryId: string;

/**
Returns the last successful run date for the store's entry, or null when no
date is available (for example, no successful run has been recorded yet).
*/
loadLastSuccessfulRunDate(): Promise<Date | null>;

/**
Sets the last successful run date of the store's entry to the current date.
Intended to be called only after a run has completed successfully.
*/
updateLastSuccessfulRunDate(): Promise<void>;

/**
Closes the connection. Must be called when done.
*/
close(): Promise<void>;
};

/**
Shape of a document in the "ingest_meta" collection.
*/
export type IngestMetaEntry = {
// Document ID; the store reads and upserts the document whose _id equals its
// entryId.
_id: string;
// Date written by updateLastSuccessfulRunDate() (the current date at the time
// of the call) and returned by loadLastSuccessfulRunDate().
lastIngestDate: Date;
};

/**
  Connects to MongoDB and returns an IngestMetaStore backed by the
  "ingest_meta" collection of the given database.

  The returned store reads and writes the single document whose _id equals
  `entryId`. The caller owns the connection and must call close() when done.
 */
export const makeIngestMetaStore = async ({
  connectionUri,
  databaseName,
  entryId,
}: {
  connectionUri: string;
  databaseName: string;
  entryId: string;
}): Promise<IngestMetaStore> => {
  const mongoClient = await MongoClient.connect(connectionUri);
  const database = mongoClient.db(databaseName);
  const metaCollection = database.collection<IngestMetaEntry>("ingest_meta");

  const store: IngestMetaStore = {
    entryId,

    // Releases the underlying MongoDB client.
    async close() {
      await mongoClient.close();
    },

    // Resolves to null when the entry document (or its date) does not exist.
    async loadLastSuccessfulRunDate() {
      const entry = await metaCollection.findOne({ _id: entryId });
      return entry?.lastIngestDate ?? null;
    },

    // Upsert so the very first successful run creates the entry document.
    async updateLastSuccessfulRunDate() {
      const byEntryId = { _id: entryId };
      await metaCollection.updateOne(
        byEntryId,
        {
          $set: {
            _id: entryId,
            lastIngestDate: new Date(),
          },
        },
        { upsert: true }
      );
    },
  };

  return store;
};
116 changes: 116 additions & 0 deletions ingest/src/commands/all.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import { PageStore, EmbeddedContentStore, assertEnvVars } from "chat-core";
import { MongoClient } from "mongodb";
import { INGEST_ENV_VARS } from "../IngestEnvVars";
import { doAllCommand } from "./all";
import { makeIngestMetaStore } from "../IngestMetaStore";

import "dotenv/config";

// These tests talk to a real MongoDB instance, so allow a generous timeout.
jest.setTimeout(1000000);

/**
  Integration tests for doAllCommand.

  Each test runs against a uniquely named throwaway database (dropped in
  afterEach) and uses no-op page/embedded-content stores, so only the ingest
  metadata bookkeeping is exercised.
 */
describe("allCommand", () => {
  const { MONGODB_CONNECTION_URI: connectionUri } =
    assertEnvVars(INGEST_ENV_VARS);

  // No-op store: the embed step finds nothing to do.
  const mockEmbeddedContentStore: EmbeddedContentStore = {
    async deleteEmbeddedContent() {
      return;
    },
    async findNearestNeighbors() {
      return [];
    },
    async loadEmbeddedContent() {
      return [];
    },
    async updateEmbeddedContent() {
      return;
    },
  };

  // No-op store: there are never any pages to load or update.
  const mockPageStore: PageStore = {
    async loadPages() {
      return [];
    },
    async updatePages() {
      return;
    },
  };

  let databaseName: string;

  beforeEach(async () => {
    // Timestamp + random suffix keeps test runs from colliding.
    databaseName = `test-all-command-${Date.now()}-${Math.floor(
      Math.random() * 10000000
    )}`;
  });

  afterEach(async () => {
    const client = await MongoClient.connect(connectionUri);
    try {
      const db = client.db(databaseName);
      await db.dropDatabase();
    } finally {
      await client.close();
    }
  });

  it("updates the metadata with the last successful timestamp", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      await doAllCommand({
        pageStore: mockPageStore,
        embeddedContentStore: mockEmbeddedContentStore,
        connectionUri,
        databaseName,
        async doPagesCommand() {
          return;
        },
      });
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      // The recorded date must be "just now": within the last 5 seconds and
      // not in the future.
      expect(lastSuccessfulRunDate?.getTime()).toBeGreaterThan(
        Date.now() - 5000
      );
      expect(lastSuccessfulRunDate?.getTime()).toBeLessThanOrEqual(Date.now());
    } finally {
      await ingestMetaStore.close();
    }
  });

  it("does not update the metadata with the last successful timestamp on failure", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      // Assert the rejection explicitly: a try/catch with the expectation
      // inside `catch` would silently pass if doAllCommand did not throw.
      await expect(
        doAllCommand({
          pageStore: mockPageStore,
          embeddedContentStore: mockEmbeddedContentStore,
          connectionUri,
          databaseName,
          async doPagesCommand() {
            // Sudden failure!
            throw new Error("Fail!");
          },
        })
      ).rejects.toThrow("Fail!");
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      // Not updated because run failed
      expect(lastSuccessfulRunDate).toBeNull();
    } finally {
      await ingestMetaStore.close();
    }
  });
});
80 changes: 76 additions & 4 deletions ingest/src/commands/all.ts
Original file line number Diff line number Diff line change
@@ -1,12 +1,84 @@
import { CommandModule } from "yargs";
import { doPagesCommand as officialDoPages } from "./pages";
import { doEmbedCommand } from "./embed";
import {
makeDatabaseConnection,
assertEnvVars,
EmbeddedContentStore,
PageStore,
logger,
} from "chat-core";
import { INGEST_ENV_VARS } from "../IngestEnvVars";
import { makeIngestMetaStore } from "../IngestMetaStore";

/**
  yargs command module for `all`.

  Reads the MongoDB connection settings from the environment, opens a database
  connection that serves as both the page store and the embedded content
  store, and delegates to doAllCommand. The connection is always closed,
  whether or not the run succeeds.
 */
const commandModule: CommandModule<unknown, unknown> = {
  command: "all",
  async handler() {
    const { MONGODB_CONNECTION_URI, MONGODB_DATABASE_NAME } =
      assertEnvVars(INGEST_ENV_VARS);

    const store = await makeDatabaseConnection({
      connectionUri: MONGODB_CONNECTION_URI,
      databaseName: MONGODB_DATABASE_NAME,
    });

    try {
      await doAllCommand({
        pageStore: store,
        embeddedContentStore: store,
        connectionUri: MONGODB_CONNECTION_URI,
        databaseName: MONGODB_DATABASE_NAME,
      });
    } finally {
      // Release the connection even if the run threw.
      await store.close();
    }
  },
  describe: "Run 'pages' and 'embed' since last successful run",
};

export default commandModule;

/**
  Runs the 'pages' command followed by the 'embed' command, then records the
  current date as the last successful run.

  The embed step only considers changes since the previously recorded
  successful run (or since 2023-01-01 when none has been recorded). If either
  step throws, the recorded date is left untouched and the error propagates.
  The ingest meta store connection is closed in all cases.
 */
export const doAllCommand = async ({
  pageStore,
  embeddedContentStore,
  connectionUri,
  databaseName,
  doPagesCommand = officialDoPages,
}: {
  pageStore: PageStore;
  embeddedContentStore: EmbeddedContentStore;
  connectionUri: string;
  databaseName: string;

  // Injection point for unit tests; the real implementation loads pages from
  // every source, which is slow.
  doPagesCommand?: typeof officialDoPages;
}) => {
  const metaStore = await makeIngestMetaStore({
    connectionUri,
    databaseName,
    entryId: "all",
  });

  try {
    const previousRunDate = await metaStore.loadLastSuccessfulRunDate();
    logger.info(`Last successful run date: ${previousRunDate}`);

    await doPagesCommand({ store: pageStore });

    // Fall back to a fixed early date when no successful run is on record.
    const since = previousRunDate ?? new Date("2023-01-01");
    await doEmbedCommand({ since, pageStore, embeddedContentStore });

    // Only reached when both steps above completed without throwing.
    logger.info(`Updating last successful run date`);
    await metaStore.updateLastSuccessfulRunDate();
  } finally {
    await metaStore.close();
  }
};
Loading

0 comments on commit f0c4cf2

Please sign in to comment.