Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: Deep Search PDF to MD file conversion #33

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,9 @@ IL_GRANITE_API=<GRANITE_HOST>
IL_GRANITE_MODEL_NAME=<GRANITE_MODEL_NAME>
IL_MERLINITE_API=<MERLINITE_HOST>
IL_MERLINITE_MODEL_NAME=<MERLINITE_MODEL_NAME>

DS_USERNAME=<DEEP_SEARCH_USER>
DS_API_KEY=<DEEP_SEARCH_API_KEY>
DS_HOST=<DEEP_SEARCH_HOST>
DS_PROJ_KEY=<DEEP_PROJECT_KEY>
DS_PROJ_NAME=<DEEP_PROJ_NAME>
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ npm-debug.log
.env
*.env
coverage
lib
taxonomy
config.yaml
generated
Expand Down
144 changes: 144 additions & 0 deletions src/app/api/conversion/route.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,144 @@
'use server';

import { NextResponse, NextRequest } from 'next/server';
import fetch from 'node-fetch';

/** Shape this route reads from the JSON body of the token (authentication) response. */
interface AuthData {
// Token sent as the `Authorization` header on the conversion and task-status requests.
access_token: string;
}

/** Shape this route reads from the JSON body of the convert response. */
interface ConvertData {
// Identifier interpolated into the `convert_tasks/<task_id>` URL when polling for status.
task_id: string;
}

/** Shape this route reads from the JSON body of a conversion task-status response. */
interface TaskStatus {
// Compared against 'SUCCESS' and 'FAILURE' by the polling loop; other values mean "keep polling".
task_status: string;
// Optional payload; its three fields are returned to the caller once the task succeeds.
result?: {
json_file_url: string;
md_file_url: string;
document_hash: string;
};
}

/**
 * Converts a PDF stored in a GitHub repository through the Deep Search conversion API.
 *
 * Expects a JSON body with `repoUrl` (a `https://github.com/<owner>/<repo>` URL) and
 * `documentNames` (an array of file names); the first name ending in `.pdf` is converted.
 * The handler authenticates, submits the raw GitHub URL of the PDF for conversion, and then
 * polls the conversion task until it reaches a terminal state or the polling budget runs out.
 *
 * @param req - Incoming request carrying the JSON body described above.
 * @returns A JSON response with `json_file_url`, `md_file_url` and `document_hash` on success.
 * Otherwise a JSON `{ error }` response: 400 for an invalid body, the upstream status when an
 * upstream call is not OK, 502 for an unusable upstream payload, 504 when the task does not
 * finish in time, and 500 for missing configuration, a failed task or an unexpected error.
 */
export async function POST(req: NextRequest) {
  const USERNAME = process.env.DS_USERNAME;
  const API_KEY = process.env.DS_API_KEY;
  const HOST = process.env.DS_HOST;
  const PROJ_KEY = process.env.DS_PROJ_KEY;
  const BRANCH = 'main';
  // Pause between polls, and an upper bound on polls so a stuck task cannot hang the request forever.
  const POLL_INTERVAL_MS = 10000;
  const MAX_POLL_ATTEMPTS = 30;

  // Parse the body defensively: malformed JSON or a non-object body is a client error, not a crash.
  let repoUrl: unknown;
  let documentNames: unknown;
  try {
    const body: unknown = await req.json();
    if (typeof body === 'object' && body !== null) {
      ({ repoUrl, documentNames } = body as { repoUrl?: unknown; documentNames?: unknown });
    }
  } catch (parseError: unknown) {
    console.error('Invalid JSON request body:', parseError);
    return NextResponse.json({ error: 'Invalid JSON request body' }, { status: 400 });
  }

  if (!USERNAME || !API_KEY || !HOST || !PROJ_KEY) {
    console.error('Missing environment variables');
    return NextResponse.json({ error: 'Missing environment variables' }, { status: 500 });
  }

  if (typeof repoUrl !== 'string' || !Array.isArray(documentNames)) {
    console.error('Invalid request: repoUrl must be a string and documentNames must be an array');
    return NextResponse.json({ error: 'repoUrl must be a string and documentNames must be an array' }, { status: 400 });
  }

  const pdfFileName = documentNames.find((name: unknown): name is string => typeof name === 'string' && name.endsWith('.pdf'));
  if (!pdfFileName) {
    console.error('No PDF file found for conversion');
    return NextResponse.json({ error: 'No PDF file found for conversion' }, { status: 400 });
  }

  const [repoOwner, repoName] = repoUrl.replace('https://github.com/', '').split('/');
  if (!repoOwner || !repoName) {
    // Without both segments the raw URL would contain the literal text "undefined".
    console.error('Invalid repository URL:', repoUrl);
    return NextResponse.json({ error: 'Invalid repository URL' }, { status: 400 });
  }

  const PDF_URL = `https://raw.githubusercontent.com/${repoOwner}/${repoName}/${BRANCH}/${pdfFileName}`;
  console.log(`PDF URL for conversion: ${PDF_URL}`);

  try {
    console.log('Starting authentication...');
    const authResponse = await fetch(`${HOST}/api/cps/user/v1/user/token`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: `Basic ${Buffer.from(`${USERNAME}:${API_KEY}`).toString('base64')}`
      },
      body: JSON.stringify({})
    });

    if (!authResponse.ok) {
      const error = await authResponse.text();
      console.error('Error during authentication:', error);
      return NextResponse.json({ error }, { status: authResponse.status });
    }

    const authData = (await authResponse.json()) as Partial<AuthData> | null;
    const token = authData?.access_token;
    if (!token) {
      console.error('Authentication response did not contain an access token');
      return NextResponse.json({ error: 'Authentication response did not contain an access token' }, { status: 502 });
    }
    console.log('Authentication successful. Token obtained.');

    console.log('Starting PDF conversion...');
    const convertResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert`, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/json',
        Authorization: token
      },
      body: JSON.stringify({
        http_source: { url: PDF_URL, headers: {} }
      })
    });

    if (!convertResponse.ok) {
      const error = await convertResponse.text();
      console.error('Error during PDF conversion:', error);
      return NextResponse.json({ error }, { status: convertResponse.status });
    }

    const convertData = (await convertResponse.json()) as Partial<ConvertData> | null;
    const taskId = convertData?.task_id;
    if (!taskId) {
      console.error('Conversion response did not contain a task id');
      return NextResponse.json({ error: 'Conversion response did not contain a task id' }, { status: 502 });
    }
    console.log(`PDF conversion started. Task ID: ${taskId}`);

    console.log('Checking conversion task status...');
    let taskStatus: TaskStatus = { task_status: '', result: undefined };
    let isTaskComplete = false;
    for (let attempt = 1; attempt <= MAX_POLL_ATTEMPTS && !isTaskComplete; attempt++) {
      const taskResponse = await fetch(`${HOST}/api/cps/public/v2/project/${PROJ_KEY}/convert_tasks/${taskId}?wait=10`, {
        method: 'GET',
        headers: {
          Authorization: token
        }
      });

      if (!taskResponse.ok) {
        const error = await taskResponse.text();
        console.error('Error during task status check:', error);
        return NextResponse.json({ error }, { status: taskResponse.status });
      }

      const taskText = await taskResponse.text();
      let parsedStatus: unknown;
      try {
        parsedStatus = JSON.parse(taskText);
      } catch (parseError: unknown) {
        console.error('Error parsing task status response:', taskText);
        return NextResponse.json({ error: 'Failed to parse task status response' }, { status: 500 });
      }

      // Valid JSON can still be the wrong shape (e.g. `null`); reject it instead of dereferencing it.
      if (typeof parsedStatus !== 'object' || parsedStatus === null || typeof (parsedStatus as TaskStatus).task_status !== 'string') {
        console.error('Error parsing task status response:', taskText);
        return NextResponse.json({ error: 'Failed to parse task status response' }, { status: 500 });
      }
      taskStatus = parsedStatus as TaskStatus;

      console.log(`Task status: ${taskStatus.task_status}`);

      // FAILURE is terminal even when no result payload accompanies it; SUCCESS is only usable
      // once the result is present, so keep polling (within the budget) until it appears.
      const hasFailed = taskStatus.task_status === 'FAILURE';
      const hasSucceeded = taskStatus.task_status === 'SUCCESS' && Boolean(taskStatus.result);
      if (hasFailed || hasSucceeded) {
        isTaskComplete = true;
      } else if (attempt < MAX_POLL_ATTEMPTS) {
        await new Promise((resolve) => setTimeout(resolve, POLL_INTERVAL_MS)); // Wait before polling again
      }
    }

    if (!isTaskComplete) {
      console.error('PDF Conversion Task did not complete in time.');
      return NextResponse.json({ error: 'PDF Conversion Task did not complete in time' }, { status: 504 });
    }

    if (taskStatus.task_status === 'FAILURE') {
      console.error('PDF Conversion Task failed.');
      return NextResponse.json({ error: 'PDF Conversion Task failed' }, { status: 500 });
    }

    const conversionResult = taskStatus.result;
    if (!conversionResult) {
      // Unreachable given the loop's exit condition, but keeps the access below free of non-null assertions.
      console.error('PDF Conversion Task finished without a result.');
      return NextResponse.json({ error: 'PDF Conversion Task finished without a result' }, { status: 502 });
    }

    const result = {
      json_file_url: conversionResult.json_file_url,
      md_file_url: conversionResult.md_file_url,
      document_hash: conversionResult.document_hash
    };

    console.log('Task completed successfully.');
    console.log(`JSON file URL: ${result.json_file_url}`);
    console.log(`Markdown file URL: ${result.md_file_url}`);
    console.log(`Document hash: ${result.document_hash}`);

    return NextResponse.json(result);
  } catch (error: unknown) {
    console.error('Unexpected error:', error);
    // Thrown values are not guaranteed to be Error instances, so narrow before reading `.message`.
    const message = error instanceof Error ? error.message : String(error);
    return NextResponse.json({ error: message }, { status: 500 });
  }
}
2 changes: 1 addition & 1 deletion src/app/api/pr/knowledge/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
2 changes: 1 addition & 1 deletion src/app/api/pr/skill/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down
16 changes: 7 additions & 9 deletions src/app/api/upload/route.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@ import { getToken } from 'next-auth/jwt';
import { NextRequest } from 'next/server';

const GITHUB_API_URL = 'https://api.github.com';
const TAXONOMY_DOCUMENTS_REPO = process.env.TAXONOMY_DOCUMENTS_REPO!;
const TAXONOMY_DOCUMENTS_REPO = process.env.NEXT_PUBLIC_TAXONOMY_DOCUMENTS_REPO!;
const BASE_BRANCH = 'main';

export async function POST(req: NextRequest) {
const token = await getToken({ req, secret: process.env.NEXTAUTH_SECRET! });
console.log('GitHub Token:', token);
// console.log('GitHub Token:', token);

if (!token || !token.accessToken) {
console.error('Unauthorized: Missing or invalid access token');
Expand Down Expand Up @@ -64,7 +64,8 @@ export async function POST(req: NextRequest) {
const [name, extension] = file.fileName.split(/\.(?=[^.]+$)/);
return {
fileName: `${name}-${timestamp}.${extension}`,
fileContent: file.fileContent
fileContent: file.fileContent,
encoding: extension === 'pdf' ? 'base64' : 'utf-8'
};
});

Expand Down Expand Up @@ -160,7 +161,7 @@ async function createFilesCommit(
owner: string,
repo: string,
branchName: string,
files: { fileName: string; fileContent: string }[],
files: { fileName: string; fileContent: string; encoding: string }[],
userEmail: string,
baseSha: string
): Promise<string> {
Expand All @@ -173,7 +174,7 @@ async function createFilesCommit(
headers,
body: JSON.stringify({
content: file.fileContent,
encoding: 'utf-8'
encoding: file.encoding
})
}).then((response) => response.json())
)
Expand Down Expand Up @@ -202,12 +203,9 @@ async function createFilesCommit(
}

const treeData = await createTreeResponse.json();
console.log('Tree created:', treeData);
// console.log('Tree created:', treeData);

// Create commit with DCO sign-off
// TODO: if the user's github does not have an associated github email, we need to specify one in the upload section
// or reuse the one from the form. If we use the email field from the form, it needs to be null checked when
// the user clicks the upload documents button.
const createCommitResponse = await fetch(`${GITHUB_API_URL}/repos/${owner}/${repo}/git/commits`, {
method: 'POST',
headers,
Expand Down
4 changes: 2 additions & 2 deletions src/app/edit-submission/knowledge/[id]/page.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -503,7 +503,7 @@ Creator names: ${updatedAttributionData.creator_names}
className={useFileUpload ? 'button-active' : 'button-secondary'}
onClick={() => setUseFileUpload(true)}
>
Automatically Upload Documents
Upload Documents
</Button>
</div>
</FormGroup>
Expand Down Expand Up @@ -537,7 +537,7 @@ Creator names: ${updatedAttributionData.creator_names}
</FormGroup>
) : (
<>
<UploadFile onFilesChange={handleFilesChange} />
<UploadFile onFilesChange={handleFilesChange} files={uploadedFiles} isConverting={false} conversionMessage="" />
<Button variant="primary" onClick={handleDocumentUpload}>
Submit Files
</Button>
Expand Down
Loading