From 69222e6c8d76a8c3fa5e03d3c116ad106664d799 Mon Sep 17 00:00:00 2001 From: satrong Date: Wed, 17 Apr 2024 12:22:39 +0800 Subject: [PATCH 1/2] feat: user selects page parser instead of configuring through env vars --- .vscode/settings.json | 1 + components/KnowledgeBaseForm.vue | 12 +++++++++++- nuxt.config.ts | 5 ----- server/types/index.ts | 3 +++ server/utils/http.ts | 9 +++++++-- server/utils/rag.ts | 22 ++++++++++++++-------- 6 files changed, 36 insertions(+), 16 deletions(-) diff --git a/.vscode/settings.json b/.vscode/settings.json index 9f9f1c5..a3f240c 100644 --- a/.vscode/settings.json +++ b/.vscode/settings.json @@ -10,6 +10,7 @@ "cSpell.words": [ "dexie", "groq", + "jina", "knowledgebase", "nuxt" ], diff --git a/components/KnowledgeBaseForm.vue b/components/KnowledgeBaseForm.vue index bf4209d..6262b84 100644 --- a/components/KnowledgeBaseForm.vue +++ b/components/KnowledgeBaseForm.vue @@ -17,10 +17,15 @@ const state = reactive({ name: props.data?.name || '', embedding: props.data?.embedding || '', description: props.data?.description || '', - urls: '' + urls: '', + pageParser: 'cheerio' as 'cheerio' | 'jinaReader' }) const loading = ref(false) const isModify = computed(() => props.type === 'update') +const parserList = [ + { label: 'Cheerio', value: 'cheerio' }, + { label: 'Jina Reader', value: 'jinaReader' }, +] async function onSubmit() { loading.value = true @@ -39,6 +44,7 @@ async function onSubmit() { formData.append("name", state.name) formData.append("description", state.description) formData.append("embedding", state.embedding) + formData.append("pageParser", state.pageParser) if (isModify.value) { formData.append('knowledgeBaseId', String(props.data!.id)) @@ -110,6 +116,10 @@ async function submit(formData: FormData) { + + + +
Cancel diff --git a/nuxt.config.ts b/nuxt.config.ts index aaf3423..4bc07a7 100644 --- a/nuxt.config.ts +++ b/nuxt.config.ts @@ -43,9 +43,4 @@ export default defineNuxtConfig({ ] } }, - runtimeConfig: { - jina: { - reader: false - }, // can be overridden by NUXT_JINA_READER environment variable - } }) diff --git a/server/types/index.ts b/server/types/index.ts index 644b78b..466a54b 100644 --- a/server/types/index.ts +++ b/server/types/index.ts @@ -1,5 +1,7 @@ import { MultiPartData } from 'h3' +export type PageParser = 'cheerio' | 'jinaReader' + export type KnowledgeBaseFormData = { name: string description: string @@ -7,4 +9,5 @@ export type KnowledgeBaseFormData = { knowledgeBaseId: number | null uploadedFiles: MultiPartData[] urls: string[] + pageParser: PageParser } diff --git a/server/utils/http.ts b/server/utils/http.ts index ed5af94..0cbccf5 100644 --- a/server/utils/http.ts +++ b/server/utils/http.ts @@ -1,5 +1,5 @@ import { MultiPartData, type H3Event } from 'h3' -import { KnowledgeBaseFormData } from '@/server/types' +import type { KnowledgeBaseFormData, PageParser } from '@/server/types' export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise => { const items = await readMultipartFormData(event) @@ -10,6 +10,7 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise { @@ -32,6 +33,9 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise { const Loaders = { @@ -25,17 +26,22 @@ export const loadDocuments = async (file: MultiPartData) => { return new Loaders[ext](blob).load() } -export const loadURL = async (url: string, jinaReader: boolean) => { +export const loadURL = async (url: string, pageParser: PageParser) => { console.log("URL: ", url) - if (jinaReader) { + if (pageParser === 'jinaReader') { console.log("Using Jina reader to load URL") const jinaUrl = `https://r.jina.ai/${url}` const response = await fetch(jinaUrl) const data = await response.text() - return [new Document({ - pageContent: data - })] - } else { + return [ + new Document({ + pageContent: data, + metadata: { source: url } + }) + ] + } + // default `cheerio` + else { console.log("Using CheerioWebBaseLoader to load URL") const loader = new CheerioWebBaseLoader(url) const docs = await loader.load() @@ -73,10 +79,10 @@ export const ingestURLs = async ( event: H3Event ) => { const docs = [] - const config = useRuntimeConfig(event) + const { pageParser } = await parseKnowledgeBaseFormRequest(event) for (const url of urls) { - const loadedDocs = await loadURL(url, config?.jina?.reader) + const loadedDocs = await loadURL(url, pageParser) docs.push(...loadedDocs) } From 9b5f700a3cb6a12c87f7a6151917bf55de261e4a Mon Sep 17 00:00:00 2001 From: satrong Date: Wed, 17 Apr 2024 12:26:05 +0800 Subject: [PATCH 2/2] chore: remove env `NUXT_JINA_READER` --- .env.example | 3 --- 1 file changed, 3 deletions(-) diff --git a/.env.example b/.env.example index 8be694e..6d7c44e 100644 --- a/.env.example +++ b/.env.example @@ -15,6 +15,3 @@ MILVUS_URL=http://localhost:19530 # Cohere API Key - Reranking COHERE_API_KEY= - -# Jina.ai reader https://jina.ai/reader/#apiform -NUXT_JINA_READER=false