diff --git a/.env.example b/.env.example index 8be694e..6d7c44e 100644 --- a/.env.example +++ b/.env.example @@ -15,6 +15,3 @@ MILVUS_URL=http://localhost:19530 # Cohere API Key - Reranking COHERE_API_KEY= - -# Jina.ai reader https://jina.ai/reader/#apiform -NUXT_JINA_READER=false diff --git a/.vscode/settings.json b/.vscode/settings.json index 9f9f1c5..a3f240c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,6 +10,7 @@ "cSpell.words": [ "dexie", "groq", + "jina", "knowledgebase", "nuxt" ], diff --git a/components/KnowledgeBaseForm.vue b/components/KnowledgeBaseForm.vue index 4124804..9c4efbd 100644 --- a/components/KnowledgeBaseForm.vue +++ b/components/KnowledgeBaseForm.vue @@ -51,10 +51,15 @@ const state = reactive({ name: props.data?.name || '', embedding: props.data?.embedding || undefined, description: props.data?.description || '', - urls: '' + urls: '', + pageParser: 'cheerio' as 'cheerio' | 'jinaReader' }) const loading = ref(false) const isModify = computed(() => props.type === 'update') +const parserList = [ + { label: 'Cheerio', value: 'cheerio' }, + { label: 'Jina Reader', value: 'jinaReader' }, +] async function onSubmit() { loading.value = true @@ -73,6 +78,7 @@ async function onSubmit() { formData.append("name", state.name) formData.append("description", state.description) formData.append("embedding", state.embedding) + formData.append("pageParser", state.pageParser) if (isModify.value) { formData.append('knowledgeBaseId', String(props.data!.id)) @@ -163,6 +169,10 @@ async function submit(formData: FormData) { + + + +
Cancel diff --git a/nuxt.config.ts b/nuxt.config.ts index aaf3423..4bc07a7 100644 --- a/nuxt.config.ts +++ b/nuxt.config.ts @@ -43,9 +43,4 @@ export default defineNuxtConfig({ ] } }, - runtimeConfig: { - jina: { - reader: false - }, // can be overridden by NUXT_JINA_READER environment variable - } }) diff --git a/server/types/index.ts b/server/types/index.ts index 644b78b..466a54b 100644 --- a/server/types/index.ts +++ b/server/types/index.ts @@ -1,5 +1,7 @@ import { MultiPartData } from 'h3' +export type PageParser = 'cheerio' | 'jinaReader' + export type KnowledgeBaseFormData = { name: string description: string @@ -7,4 +9,5 @@ export type KnowledgeBaseFormData = { knowledgeBaseId: number | null uploadedFiles: MultiPartData[] urls: string[] + pageParser: PageParser } diff --git a/server/utils/http.ts b/server/utils/http.ts index ed5af94..0cbccf5 100644 --- a/server/utils/http.ts +++ b/server/utils/http.ts @@ -1,5 +1,5 @@ import { MultiPartData, type H3Event } from 'h3' -import { KnowledgeBaseFormData } from '@/server/types' +import type { KnowledgeBaseFormData, PageParser } from '@/server/types' export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise => { const items = await readMultipartFormData(event) @@ -10,6 +10,7 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise { @@ -32,6 +33,9 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise { const Loaders = { @@ -27,16 +28,19 @@ export const loadDocuments = async (file: MultiPartData) => { return new Loaders[ext](blob).load() } -export const loadURL = async (url: string, jinaReader: boolean) => { +export const loadURL = async (url: string, pageParser: PageParser) => { console.log("URL: ", url) - if (jinaReader) { + if (pageParser === 'jinaReader') { console.log("Using Jina reader to load URL") const jinaUrl = `https://r.jina.ai/${url}` const response = await fetch(jinaUrl) const data = await response.text() - return [new Document({ - pageContent: data - })] + return [ + new Document({ + pageContent: data, + metadata: { source: url } + }) + ] } else { /*console.log("Using CheerioWebBaseLoader to load URL") const loader = new CheerioWebBaseLoader(url) @@ -84,10 +88,10 @@ export const ingestURLs = async ( event: H3Event ) => { const docs = [] - const config = useRuntimeConfig(event) + const { pageParser } = await parseKnowledgeBaseFormRequest(event) for (const url of urls) { - const loadedDocs = await loadURL(url, config?.jina?.reader) + const loadedDocs = await loadURL(url, pageParser) docs.push(...loadedDocs) }