-
Notifications
You must be signed in to change notification settings - Fork 65
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
(DOCSP-31304): Implement 'all' command (#40)
* Implement 'all' command * Add tests * Fix slight bug and add more logging * Fix bug betterlike * Address comments
- Loading branch information
Showing
8 changed files
with
427 additions
and
89 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,87 @@ | ||
import { MongoClient } from "mongodb"; | ||
|
||
/** | ||
The ingest meta store holds information about ingest runs so that the script
can resume from the date of a known successful run.
If the 'since' date given to the embed command is too late, pages that were
updated during a failed run will not be picked up.
If it is too early, more pages and embeddings will be checked than necessary.
The embed command will not unnecessarily create new embeddings for page
updates that it has already created embeddings for, but it would still be
wasteful to check potentially all pages and embeddings when the date is early
enough.
*/ | ||
export type IngestMetaStore = { | ||
/** | ||
The ID of the specific metadata document this store is associated with.
Generally there should be only one document per ingest_meta collection per
database.
*/ | ||
readonly entryId: string; | ||
|
||
/** | ||
Resolves to the last successful run date recorded for the store's entry, or
null when no date is available.
*/ | ||
loadLastSuccessfulRunDate(): Promise<Date | null>; | ||
|
||
/** | ||
Records the current date as the last successful run date for the store's
entry.
*/ | ||
updateLastSuccessfulRunDate(): Promise<void>; | ||
|
||
/** | ||
Closes the connection. Must be called when the store is no longer needed.
*/ | ||
close(): Promise<void>; | ||
}; | ||
|
||
/**
The shape of a document in the "ingest_meta" collection.
*/
export type IngestMetaEntry = { | ||
// Document ID; the store looks its entry up by matching this against entryId.
_id: string; | ||
// Date written by updateLastSuccessfulRunDate() and read back by
// loadLastSuccessfulRunDate().
lastIngestDate: Date; | ||
}; | ||
|
||
/**
  Creates an IngestMetaStore backed by the "ingest_meta" collection of the
  given database.

  A new MongoClient connection is opened for the store. The caller owns it and
  must call `close()` on the returned store when done.

  @param connectionUri - Connection string passed to MongoClient.connect().
  @param databaseName - Database that holds the "ingest_meta" collection.
  @param entryId - `_id` of the metadata document this store reads and writes.
  @returns A store bound to the document identified by `entryId`.
 */
export const makeIngestMetaStore = async ({
  connectionUri,
  databaseName,
  entryId,
}: {
  connectionUri: string;
  databaseName: string;
  entryId: string;
}): Promise<IngestMetaStore> => {
  const client = await MongoClient.connect(connectionUri);
  const collection = client
    .db(databaseName)
    .collection<IngestMetaEntry>("ingest_meta");

  return {
    entryId,

    async close() {
      await client.close();
    },

    async loadLastSuccessfulRunDate() {
      const entry = await collection.findOne({ _id: entryId });
      // No document (or no date on it) means no successful run is recorded.
      return entry?.lastIngestDate ?? null;
    },

    async updateLastSuccessfulRunDate() {
      // On upsert, the equality match on _id supplies the new document's _id,
      // so the immutable _id field does not need to appear in $set.
      await collection.updateOne(
        { _id: entryId },
        { $set: { lastIngestDate: new Date() } },
        { upsert: true }
      );
    },
  };
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
import { PageStore, EmbeddedContentStore, assertEnvVars } from "chat-core"; | ||
import { MongoClient } from "mongodb"; | ||
import { INGEST_ENV_VARS } from "../IngestEnvVars"; | ||
import { doAllCommand } from "./all"; | ||
import { makeIngestMetaStore } from "../IngestMetaStore"; | ||
|
||
import "dotenv/config"; | ||
|
||
jest.setTimeout(1000000); | ||
|
||
/**
  Integration tests for doAllCommand's run-date bookkeeping.

  Each test runs against a uniquely named throwaway database on the MongoDB
  deployment given by MONGODB_CONNECTION_URI; the database is dropped after
  each test.
 */
describe("allCommand", () => {
  const { MONGODB_CONNECTION_URI: connectionUri } =
    assertEnvVars(INGEST_ENV_VARS);

  // No-op stores: these tests only care about the ingest metadata, not about
  // pages or embeddings.
  const mockEmbeddedContentStore: EmbeddedContentStore = {
    async deleteEmbeddedContent() {
      return;
    },
    async findNearestNeighbors() {
      return [];
    },
    async loadEmbeddedContent() {
      return [];
    },
    async updateEmbeddedContent() {
      return;
    },
  };
  const mockPageStore: PageStore = {
    async loadPages() {
      return [];
    },
    async updatePages() {
      return;
    },
  };

  let databaseName: string;

  beforeEach(async () => {
    // Timestamp plus random suffix keeps database names distinct between tests.
    databaseName = `test-all-command-${Date.now()}-${Math.floor(
      Math.random() * 10000000
    )}`;
  });

  afterEach(async () => {
    const client = await MongoClient.connect(connectionUri);
    try {
      const db = client.db(databaseName);
      await db.dropDatabase();
    } finally {
      await client.close();
    }
  });

  it("updates the metadata with the last successful timestamp", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      await doAllCommand({
        pageStore: mockPageStore,
        embeddedContentStore: mockEmbeddedContentStore,
        connectionUri,
        databaseName,
        async doPagesCommand() {
          return;
        },
      });
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate?.getTime()).toBeGreaterThan(
        Date.now() - 5000
      );
      expect(lastSuccessfulRunDate?.getTime()).toBeLessThanOrEqual(Date.now());
    } finally {
      await ingestMetaStore.close();
    }
  });

  it("does not update the metadata with the last successful timestamp on failure", async () => {
    const ingestMetaStore = await makeIngestMetaStore({
      connectionUri,
      databaseName,
      entryId: "all",
    });
    try {
      let lastSuccessfulRunDate =
        await ingestMetaStore.loadLastSuccessfulRunDate();
      expect(lastSuccessfulRunDate).toBeNull();
      // Assert the rejection directly: an expectation placed only inside a
      // catch block would be skipped if the command stopped throwing.
      await expect(
        doAllCommand({
          pageStore: mockPageStore,
          embeddedContentStore: mockEmbeddedContentStore,
          connectionUri,
          databaseName,
          async doPagesCommand() {
            // Sudden failure!
            throw new Error("Fail!");
          },
        })
      ).rejects.toThrow("Fail!");
      lastSuccessfulRunDate = await ingestMetaStore.loadLastSuccessfulRunDate();
      // Not updated because run failed
      expect(lastSuccessfulRunDate).toBeNull();
    } finally {
      await ingestMetaStore.close();
    }
  });
});
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,12 +1,84 @@ | ||
import { CommandModule } from "yargs"; | ||
import { doPagesCommand as officialDoPages } from "./pages"; | ||
import { doEmbedCommand } from "./embed"; | ||
import { | ||
makeDatabaseConnection, | ||
assertEnvVars, | ||
EmbeddedContentStore, | ||
PageStore, | ||
logger, | ||
} from "chat-core"; | ||
import { INGEST_ENV_VARS } from "../IngestEnvVars"; | ||
import { makeIngestMetaStore } from "../IngestMetaStore"; | ||
|
||
/**
  yargs command module for the 'all' command.

  The handler reads the MongoDB connection settings from the environment,
  opens a database connection that serves as both the page store and the
  embedded content store, runs doAllCommand, and always closes the connection
  afterwards, even if the command throws.
 */
const commandModule: CommandModule<unknown, unknown> = {
  command: "all",
  async handler() {
    const { MONGODB_CONNECTION_URI, MONGODB_DATABASE_NAME } =
      assertEnvVars(INGEST_ENV_VARS);

    const store = await makeDatabaseConnection({
      connectionUri: MONGODB_CONNECTION_URI,
      databaseName: MONGODB_DATABASE_NAME,
    });

    try {
      await doAllCommand({
        pageStore: store,
        embeddedContentStore: store,
        connectionUri: MONGODB_CONNECTION_URI,
        databaseName: MONGODB_DATABASE_NAME,
      });
    } finally {
      await store.close();
    }
  },
  describe: "Run 'pages' and 'embed' since last successful run",
};
|
||
export default commandModule; | ||
|
||
/**
  Runs the 'pages' step followed by the 'embed' step, then records the run as
  successful in the ingest meta store.

  The embed step is given the last successful run date as its 'since' value,
  falling back to 2023-01-01 when no run has been recorded. The run date is
  only updated if both steps complete without throwing; the meta store
  connection is closed either way.

  @param pageStore - Store passed to both the pages and embed steps.
  @param embeddedContentStore - Store passed to the embed step.
  @param connectionUri - Connection string used to open the ingest meta store.
  @param databaseName - Database that holds the ingest meta collection.
  @param doPagesCommand - Replacement for the real pages step; defaults to it.
 */
export const doAllCommand = async ({
  pageStore,
  embeddedContentStore,
  connectionUri,
  databaseName,
  doPagesCommand = officialDoPages,
}: {
  pageStore: PageStore;
  embeddedContentStore: EmbeddedContentStore;
  connectionUri: string;
  databaseName: string;

  // Injectable so unit tests can skip the real pages step, which would load
  // pages from every source and waste time.
  doPagesCommand?: typeof officialDoPages;
}) => {
  const metaStore = await makeIngestMetaStore({
    connectionUri,
    databaseName,
    entryId: "all",
  });

  try {
    const previousRunDate = await metaStore.loadLastSuccessfulRunDate();
    logger.info(`Last successful run date: ${previousRunDate}`);

    await doPagesCommand({ store: pageStore });

    // With no recorded run, start from a fixed early date.
    const since = previousRunDate ?? new Date("2023-01-01");
    await doEmbedCommand({ since, pageStore, embeddedContentStore });

    logger.info(`Updating last successful run date`);
    await metaStore.updateLastSuccessfulRunDate();
  } finally {
    await metaStore.close();
  }
};
Oops, something went wrong.