From 8d6bea8771fb2913865c09910c94560800aa4f90 Mon Sep 17 00:00:00 2001
From: ogzhanolguncu
Date: Wed, 7 Aug 2024 14:34:47 +0300
Subject: [PATCH] feat: add metadata to pdf

---
 src/file-loader.ts | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/file-loader.ts b/src/file-loader.ts
index 5778c3f..3cbcdf5 100644
--- a/src/file-loader.ts
+++ b/src/file-loader.ts
@@ -1,3 +1,5 @@
+/* eslint-disable @typescript-eslint/no-unsafe-member-access */
+/* eslint-disable @typescript-eslint/no-unsafe-assignment */
 /* eslint-disable @typescript-eslint/no-explicit-any */
 /* eslint-disable @typescript-eslint/no-unsafe-argument */
 import { CSVLoader } from "@langchain/community/document_loaders/fs/csv";
@@ -67,7 +69,16 @@ export class FileDataLoader {
         const splitter = new RecursiveCharacterTextSplitter(args);
         const splittedDocuments = await splitter.splitDocuments(documents);
 
-        return mapDocumentsIntoInsertPayload(splittedDocuments);
+        return mapDocumentsIntoInsertPayload(splittedDocuments, (metadata: any, index: number) => ({
+          source: metadata.source,
+          timestamp: new Date().toISOString(),
+          paragraphNumber: index + 1,
+          pageNumber: metadata.loc?.pageNumber || undefined,
+          author: metadata.pdf?.info?.Author || undefined,
+          title: metadata.pdf?.info?.Title || undefined,
+          totalPages: metadata.pdf?.totalPages || undefined,
+          language: metadata.pdf?.metadata?._metadata?.["dc:language"] || undefined,
+        }));
       }
 
       case "csv": {
@@ -98,10 +109,14 @@ export class FileDataLoader {
       }
     }
 
-    function mapDocumentsIntoInsertPayload(splittedDocuments: Document[]) {
-      return splittedDocuments.map((document) => ({
+    function mapDocumentsIntoInsertPayload(
+      splittedDocuments: Document[],
+      metadataMapper?: (metadata: any, index: number) => Record<string, any>
+    ) {
+      return splittedDocuments.map((document, index) => ({
         data: document.pageContent,
         id: nanoid(),
+        ...(metadataMapper ? { metadata: metadataMapper(document.metadata, index) } : {}),
       }));
     }
   }