diff --git a/.env.example b/.env.example
index 8be694e..6d7c44e 100644
--- a/.env.example
+++ b/.env.example
@@ -15,6 +15,3 @@ MILVUS_URL=http://localhost:19530
# Cohere API Key - Reranking
COHERE_API_KEY=
-
-# Jina.ai reader https://jina.ai/reader/#apiform
-NUXT_JINA_READER=false
diff --git a/.vscode/settings.json b/.vscode/settings.json
index 9f9f1c5..a3f240c 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -10,6 +10,7 @@
"cSpell.words": [
"dexie",
"groq",
+ "jina",
"knowledgebase",
"nuxt"
],
diff --git a/components/KnowledgeBaseForm.vue b/components/KnowledgeBaseForm.vue
index 4124804..9c4efbd 100644
--- a/components/KnowledgeBaseForm.vue
+++ b/components/KnowledgeBaseForm.vue
@@ -51,10 +51,15 @@ const state = reactive({
name: props.data?.name || '',
embedding: props.data?.embedding || undefined,
description: props.data?.description || '',
- urls: ''
+ urls: '',
+ pageParser: 'cheerio' as 'cheerio' | 'jinaReader'
})
const loading = ref(false)
const isModify = computed(() => props.type === 'update')
+const parserList = [ // label/value pairs for the page-parser choice; presumably bound to a select in the template (not visible here — confirm)
+ { label: 'Cheerio', value: 'cheerio' }, // same value as the initial state.pageParser
+ { label: 'Jina Reader', value: 'jinaReader' }, // value is sent to the server as the "pageParser" form field by onSubmit
+]
async function onSubmit() {
loading.value = true
@@ -73,6 +78,7 @@ async function onSubmit() {
formData.append("name", state.name)
formData.append("description", state.description)
formData.append("embedding", state.embedding)
+ formData.append("pageParser", state.pageParser)
if (isModify.value) {
formData.append('knowledgeBaseId', String(props.data!.id))
@@ -163,6 +169,10 @@ async function submit(formData: FormData) {
+
+
+
+
Cancel
diff --git a/nuxt.config.ts b/nuxt.config.ts
index aaf3423..4bc07a7 100644
--- a/nuxt.config.ts
+++ b/nuxt.config.ts
@@ -43,9 +43,4 @@ export default defineNuxtConfig({
]
}
},
- runtimeConfig: {
- jina: {
- reader: false
- }, // can be overridden by NUXT_JINA_READER environment variable
- }
})
diff --git a/server/types/index.ts b/server/types/index.ts
index 644b78b..466a54b 100644
--- a/server/types/index.ts
+++ b/server/types/index.ts
@@ -1,5 +1,7 @@
import { MultiPartData } from 'h3'
+export type PageParser = 'cheerio' | 'jinaReader' // chooses how loadURL fetches a page: 'jinaReader' goes through https://r.jina.ai/<url>, any other value takes loadURL's else branch
+
export type KnowledgeBaseFormData = {
name: string
description: string
@@ -7,4 +9,5 @@ export type KnowledgeBaseFormData = {
knowledgeBaseId: number | null
uploadedFiles: MultiPartData[]
urls: string[]
+ pageParser: PageParser
}
diff --git a/server/utils/http.ts b/server/utils/http.ts
index ed5af94..0cbccf5 100644
--- a/server/utils/http.ts
+++ b/server/utils/http.ts
@@ -1,5 +1,5 @@
import { MultiPartData, type H3Event } from 'h3'
-import { KnowledgeBaseFormData } from '@/server/types'
+import type { KnowledgeBaseFormData, PageParser } from '@/server/types'
export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise => {
const items = await readMultipartFormData(event)
@@ -10,6 +10,7 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise {
@@ -32,6 +33,9 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise {
const Loaders = {
@@ -27,16 +28,19 @@ export const loadDocuments = async (file: MultiPartData) => {
return new Loaders[ext](blob).load()
}
-export const loadURL = async (url: string, jinaReader: boolean) => {
+export const loadURL = async (url: string, pageParser: PageParser) => {
console.log("URL: ", url)
- if (jinaReader) {
+ if (pageParser === 'jinaReader') {
console.log("Using Jina reader to load URL")
const jinaUrl = `https://r.jina.ai/${url}`
const response = await fetch(jinaUrl)
const data = await response.text()
- return [new Document({
- pageContent: data
- })]
+ return [
+ new Document({
+ pageContent: data,
+ metadata: { source: url }
+ })
+ ]
} else {
/*console.log("Using CheerioWebBaseLoader to load URL")
const loader = new CheerioWebBaseLoader(url)
@@ -84,10 +88,10 @@ export const ingestURLs = async (
event: H3Event
) => {
const docs = []
- const config = useRuntimeConfig(event)
+ const { pageParser } = await parseKnowledgeBaseFormRequest(event)
for (const url of urls) {
- const loadedDocs = await loadURL(url, config?.jina?.reader)
+ const loadedDocs = await loadURL(url, pageParser)
docs.push(...loadedDocs)
}