Skip to content

Commit

Permalink
Merge pull request #282 from satrong/improve-041712
Browse files Browse the repository at this point in the history
feat: user selects page parser instead of configuring through env vars
  • Loading branch information
sugarforever authored Apr 17, 2024
2 parents 5107de1 + c7b5a12 commit cfadf08
Show file tree
Hide file tree
Showing 7 changed files with 33 additions and 18 deletions.
3 changes: 0 additions & 3 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,3 @@ MILVUS_URL=http://localhost:19530

# Cohere API Key - Reranking
COHERE_API_KEY=

# Jina.ai reader https://jina.ai/reader/#apiform
NUXT_JINA_READER=false
1 change: 1 addition & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
"cSpell.words": [
"dexie",
"groq",
"jina",
"knowledgebase",
"nuxt"
],
Expand Down
12 changes: 11 additions & 1 deletion components/KnowledgeBaseForm.vue
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,15 @@ const state = reactive({
name: props.data?.name || '',
embedding: props.data?.embedding || undefined,
description: props.data?.description || '',
urls: ''
urls: '',
pageParser: 'cheerio' as 'cheerio' | 'jinaReader'
})
const loading = ref(false)
const isModify = computed(() => props.type === 'update')
// Options shown by the "URL page parser" select menu. Each `value` is one of
// the literals allowed for `state.pageParser` ('cheerio' | 'jinaReader').
const parserList = [
{ label: 'Cheerio', value: 'cheerio' },
{ label: 'Jina Reader', value: 'jinaReader' },
]
async function onSubmit() {
loading.value = true
Expand All @@ -73,6 +78,7 @@ async function onSubmit() {
formData.append("name", state.name)
formData.append("description", state.description)
formData.append("embedding", state.embedding)
formData.append("pageParser", state.pageParser)
if (isModify.value) {
formData.append('knowledgeBaseId', String(props.data!.id))
Expand Down Expand Up @@ -163,6 +169,10 @@ async function submit(formData: FormData) {
<UTextarea v-model="state.urls" autoresize :maxrows="6" placeholder="One per line" />
</UFormGroup>

<UFormGroup label="URL page parser" name="pageParser" class="mb-4">
<USelectMenu v-model="state.pageParser" :options="parserList" value-attribute="value" />
</UFormGroup>

<div class="flex justify-end">
<UButton color="gray" class="mr-2" @click="onClose()">Cancel</UButton>
<UButton type="submit" :loading="loading">
Expand Down
5 changes: 0 additions & 5 deletions nuxt.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,4 @@ export default defineNuxtConfig({
]
}
},
runtimeConfig: {
jina: {
reader: false
}, // can be overridden by NUXT_JINA_READER environment variable
}
})
3 changes: 3 additions & 0 deletions server/types/index.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import { MultiPartData } from 'h3'

/**
 * Identifier of the parser used when loading a URL's page.
 * `loadURL` fetches through the Jina reader for 'jinaReader' and takes its
 * other branch for any other value.
 */
export type PageParser = 'cheerio' | 'jinaReader'

/** Fields of a knowledge base form submission, as produced by `parseKnowledgeBaseFormRequest`. */
export type KnowledgeBaseFormData = {
name: string
description: string
embedding: string
// Parsed with parseInt from `event.context.params.id`; null when that param is absent.
knowledgeBaseId: number | null
uploadedFiles: MultiPartData[]
urls: string[]
// Page parser submitted with the form; the request parser starts from 'cheerio' when the field is missing.
pageParser: PageParser
}
9 changes: 7 additions & 2 deletions server/utils/http.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { MultiPartData, type H3Event } from 'h3'
import { KnowledgeBaseFormData } from '@/server/types'
import type { KnowledgeBaseFormData, PageParser } from '@/server/types'

export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise<KnowledgeBaseFormData> => {
const items = await readMultipartFormData(event)
Expand All @@ -10,6 +10,7 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise<Kno
let _name = ''
let _description = ''
let _embedding = ''
let _pageParser: PageParser = 'cheerio'
const urls: string[] = []
const _knowledgeBaseId = event?.context?.params?.id
items?.forEach((item) => {
Expand All @@ -32,6 +33,9 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise<Kno
if (key === 'embedding') {
_embedding = decodedData
}
if (key === 'pageParser') {
// The field comes from the client, so narrow it with a runtime check instead
// of asserting the type. Unknown values leave the 'cheerio' default in place.
if (decodedData === 'cheerio' || decodedData === 'jinaReader') {
_pageParser = decodedData
}
}
})

const formData: KnowledgeBaseFormData = {
Expand All @@ -40,7 +44,8 @@ export const parseKnowledgeBaseFormRequest = async (event: H3Event): Promise<Kno
embedding: _embedding,
knowledgeBaseId: _knowledgeBaseId ? parseInt(_knowledgeBaseId) : null,
uploadedFiles,
urls: urls
urls: urls,
pageParser: _pageParser,
}

return formData
Expand Down
18 changes: 11 additions & 7 deletions server/utils/rag.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { RecursiveUrlLoader } from "langchain/document_loaders/web/recursive_url
import { compile } from "html-to-text"
import { MultiPartData, H3Event } from 'h3'
import { createRetriever } from '@/server/retriever'
import type { PageParser } from '@/server/types'

export const loadDocuments = async (file: MultiPartData) => {
const Loaders = {
Expand All @@ -27,16 +28,19 @@ export const loadDocuments = async (file: MultiPartData) => {
return new Loaders[ext](blob).load()
}

export const loadURL = async (url: string, jinaReader: boolean) => {
export const loadURL = async (url: string, pageParser: PageParser) => {
console.log("URL: ", url)
if (jinaReader) {
if (pageParser === 'jinaReader') {
console.log("Using Jina reader to load URL")
const jinaUrl = `https://r.jina.ai/${url}`
const response = await fetch(jinaUrl)
const data = await response.text()
return [new Document({
pageContent: data
})]
return [
new Document({
pageContent: data,
metadata: { source: url }
})
]
} else {
/*console.log("Using CheerioWebBaseLoader to load URL")
const loader = new CheerioWebBaseLoader(url)
Expand Down Expand Up @@ -84,10 +88,10 @@ export const ingestURLs = async (
event: H3Event
) => {
const docs = []
const config = useRuntimeConfig(event)
const { pageParser } = await parseKnowledgeBaseFormRequest(event)

for (const url of urls) {
const loadedDocs = await loadURL(url, config?.jina?.reader)
const loadedDocs = await loadURL(url, pageParser)
docs.push(...loadedDocs)
}

Expand Down

0 comments on commit cfadf08

Please sign in to comment.