Skip to content

Commit

Permalink
Merge pull request #59 from sshivaditya2019/scraper
Browse files Browse the repository at this point in the history
  • Loading branch information
0x4007 authored Dec 20, 2024
2 parents 89b021f + ebf1821 commit dec3497
Show file tree
Hide file tree
Showing 7 changed files with 364 additions and 7 deletions.
9 changes: 8 additions & 1 deletion .github/knip.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,14 @@ import type { KnipConfig } from "knip";
const config: KnipConfig = {
entry: ["src/main.ts"],
project: ["src/**/*.ts"],
ignore: ["src/types/config.ts", "**/__mocks__/**", "**/__fixtures__/**", "src/types/database.ts"],
ignore: [
"src/types/config.ts",
"**/__mocks__/**",
"**/__fixtures__/**",
"src/types/database.ts",
"src/handlers/user-issue-scraper.ts",
"src/handlers/issue-scraper.ts",
],
ignoreExportsUsedInFile: true,
// eslint can also be safely ignored as per the docs: https://knip.dev/guides/handling-issues#eslint--jest
ignoreDependencies: ["eslint-config-prettier", "eslint-plugin-prettier", "ts-node"],
Expand Down
2 changes: 1 addition & 1 deletion .husky/commit-msg
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

yarn commitlint --edit "$1"
bun commitlint --edit "$1"
2 changes: 1 addition & 1 deletion .husky/pre-commit
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
#!/usr/bin/env sh
. "$(dirname -- "$0")/_/husky.sh"

yarn lint-staged
bun lint-staged
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ This is a plugin for [Ubiquibot](https://github.com/ubiquity-os/ubiquity-os-kern
## Testing Locally
- Run `yarn install` to install the dependencies.
- Run `yarn worker` to start the server.
- Run `bun install` to install the dependencies.
- Run `bun worker` to start the server.
- Make HTTP requests to the server to test the plugin with content type `Application/JSON`

```
Expand Down Expand Up @@ -62,4 +62,4 @@ This is a plugin for [Ubiquibot](https://github.com/ubiquity-os/ubiquity-os-kern

## Testing

- Run `yarn test` to run the tests.
- Run `bun test` to run the tests.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@
},
"lint-staged": {
"*.ts": [
"yarn prettier --write",
"bun prettier --write",
"eslint --fix"
],
"src/**.{ts,json}": [
Expand Down
304 changes: 304 additions & 0 deletions src/handlers/issue-scraper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,304 @@
import { createClient } from "@supabase/supabase-js";
import { VoyageAIClient } from "voyageai";
import { customOctokit as Octokit } from "@ubiquity-os/plugin-sdk/octokit";
import markdownit from "markdown-it";
import plainTextPlugin from "markdown-it-plain-text";
import "dotenv/config";
import { createAdapters } from "../adapters";
import { Context } from "../types/context";

interface MarkdownItWithPlainText extends markdownit {
plainText: string;
}

function markdownToPlainText(markdown: string | null): string | null {
if (!markdown) return markdown;
const md = markdownit() as MarkdownItWithPlainText;
md.use(plainTextPlugin);
md.render(markdown);
return md.plainText;
}

interface IssueMetadata {
nodeId: string;
number: number;
title: string;
body: string;
state: string;
repositoryName: string;
repositoryId: number;
assignees: string[];
authorId: number;
createdAt: string;
closedAt: string | null;
stateReason: string | null;
updatedAt: string;
}

interface IssueNode {
id: string;
number: number;
title: string;
body: string;
state: string;
stateReason: string | null;
createdAt: string;
updatedAt: string;
closedAt: string | null;
author: {
login: string;
} | null;
assignees: {
nodes: Array<{
login: string;
}>;
};
repository: {
id: string;
name: string;
owner: {
login: string;
};
};
}

interface GraphQlSearchResponse {
search: {
pageInfo: {
hasNextPage: boolean;
endCursor: string | null;
};
nodes: Array<IssueNode>;
};
}

const SEARCH_ISSUES_QUERY = `
query SearchIssues($searchText: String!, $after: String) {
search(
query: $searchText,
type: ISSUE,
first: 100,
after: $after
) {
pageInfo {
hasNextPage
endCursor
}
nodes {
... on Issue {
id
number
title
body
state
stateReason
createdAt
updatedAt
closedAt
author {
login
}
assignees(first: 10) {
nodes {
login
}
}
repository {
id
name
owner {
login
}
}
}
}
}
}
`;

async function fetchAuthorId(octokit: InstanceType<typeof Octokit>, login: string): Promise<number> {
try {
const response = await octokit.rest.users.getByUsername({ username: login });
return response.data.id;
} catch (error) {
console.error(`Error fetching author ID for ${login}:`, error);
return -1;
}
}

async function fetchUserIssues(octokit: InstanceType<typeof Octokit>, username: string): Promise<IssueNode[]> {
const allIssues: IssueNode[] = [];
let hasNextPage = true;
let cursor: string | null = null;

const searchText = `assignee:${username} is:issue is:closed`;

while (hasNextPage) {
const variables: { searchText: string; after?: string } = {
searchText,
};
if (cursor) {
variables.after = cursor;
}

const response: GraphQlSearchResponse = await octokit.graphql<GraphQlSearchResponse>(SEARCH_ISSUES_QUERY, variables);

const completedIssues = response.search.nodes.filter((issue) => issue.stateReason === "COMPLETED");
allIssues.push(...completedIssues);

hasNextPage = response.search.pageInfo.hasNextPage;
cursor = response.search.pageInfo.endCursor;

if (!cursor) break;
}

return allIssues;
}

// Pulls issues from GitHub and stores them in Supabase
export async function issueScraper(username: string, token?: string): Promise<string> {
try {
if (!username) {
throw new Error("Username is required");
}

const required = ["GITHUB_TOKEN", "SUPABASE_URL", "SUPABASE_KEY", "VOYAGEAI_API_KEY"];
const missing = required.filter((key) => !process.env[key]);
if (missing.length > 0) {
throw new Error(`Missing required environment variables: ${missing.join(", ")}`);
}

const context = {
adapters: {},
logger: {
info: (message: string, data: Record<string, unknown>) => console.log("INFO:", message + ":", data),
error: (message: string, data: Record<string, unknown>) => console.error("ERROR:", message + ":", data),
},
octokit: new Octokit({ auth: token || process.env.GITHUB_TOKEN }),
} as unknown as Context;

const supabaseUrl = process.env.SUPABASE_URL;
const supabaseKey = process.env.SUPABASE_KEY;
const voyageApiKey = process.env.VOYAGEAI_API_KEY;

if (!supabaseUrl || !supabaseKey || !voyageApiKey) {
throw new Error("Required environment variables are missing");
}

const supabase = createClient(supabaseUrl, supabaseKey);
const voyageClient = new VoyageAIClient({ apiKey: voyageApiKey });
const adapters = createAdapters(supabase, voyageClient, context);

const issues = await fetchUserIssues(context.octokit, username);
const processedIssues: Array<{ issue: IssueMetadata; error?: string }> = [];

for (const issue of issues) {
try {
const authorId = issue.author?.login ? await fetchAuthorId(context.octokit, issue.author.login) : -1;
const repoOwner = issue.repository.owner.login;

const metadata: IssueMetadata = {
nodeId: issue.id,
number: issue.number,
title: issue.title || "",
body: issue.body || "",
state: issue.state,
stateReason: issue.stateReason,
repositoryName: issue.repository.name,
repositoryId: parseInt(issue.repository.id),
assignees: (issue.assignees?.nodes || []).map((assignee) => assignee.login),
authorId,
createdAt: issue.createdAt,
closedAt: issue.closedAt,
updatedAt: issue.updatedAt,
};

const markdown = metadata.body + " " + metadata.title;
const plaintext = markdownToPlainText(markdown);
const embedding = await adapters.voyage.embedding.createEmbedding(plaintext);

const payload = {
issue: metadata,
action: "created",
sender: {
login: username,
},
repository: {
id: parseInt(issue.repository.id),
node_id: issue.repository.id,
name: issue.repository.name,
full_name: `${repoOwner}/${issue.repository.name}`,
owner: {
login: repoOwner,
id: authorId,
type: "User",
site_admin: false,
},
},
};

const { error } = await supabase.from("issues").upsert({
id: metadata.nodeId,
markdown,
plaintext,
embedding: JSON.stringify(embedding),
author_id: metadata.authorId,
modified_at: metadata.updatedAt,
payload: payload,
});

processedIssues.push({
issue: metadata,
error: error ? `Error storing issue: ${error.message}` : undefined,
});
} catch (error) {
processedIssues.push({
issue: {
nodeId: issue.id,
number: issue.number,
title: issue.title || "",
body: issue.body || "",
state: issue.state,
stateReason: issue.stateReason,
repositoryName: issue.repository.name,
repositoryId: parseInt(issue.repository.id),
assignees: [],
authorId: -1,
createdAt: issue.createdAt,
closedAt: issue.closedAt,
updatedAt: issue.updatedAt,
},
error: `Error processing issue: ${error instanceof Error ? error.message : "Unknown error"}`,
});
}
}

return JSON.stringify(
{
success: true,
stats: {
storageSuccessful: processedIssues.filter((p) => !p.error).length,
storageFailed: processedIssues.filter((p) => p.error).length,
},
errors: processedIssues
.filter((p) => p.error)
.map((p) => ({
type: "storage",
name: `${p.issue.repositoryName}#${p.issue.number}`,
error: p.error,
})),
issues: processedIssues.map((p) => ({
number: p.issue.number,
title: p.issue.title,
repo: p.issue.repositoryName,
error: p.error,
})),
},
null,
2
);
} catch (error) {
console.error("Error in issueScraper:", error);
throw error;
}
}
Loading

0 comments on commit dec3497

Please sign in to comment.