Skip to content

Commit

Permalink
Merge pull request #42 from ubiquity-os-marketplace/development
Browse files Browse the repository at this point in the history
Merge development into main
  • Loading branch information
gentlementlegen authored Nov 3, 2024
2 parents e4a8408 + 45306e3 commit 3f142fd
Show file tree
Hide file tree
Showing 7 changed files with 164 additions and 53 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/compute.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ on:
description: "Auth Token"
ref:
description: "Ref"
signature:
description: "Signature to identify the Kernel"

jobs:
compute:
Expand Down
2 changes: 1 addition & 1 deletion src/adapters/supabase/helpers/issues.ts
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ export class Issues extends SuperSupabase {
}

async findSimilarIssues(markdown: string, threshold: number, currentId: string): Promise<IssueSimilaritySearchResult[] | null> {
const embedding = await this.context.adapters.voyage.embedding.createEmbedding(markdown);
const embedding = await this.context.adapters.voyage.embedding.createEmbedding(markdown, "query");
const { data, error } = await this.supabase.rpc("find_similar_issues", {
current_id: currentId,
query_embedding: embedding,
Expand Down
4 changes: 3 additions & 1 deletion src/adapters/voyage/helpers/embedding.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { VoyageAIClient } from "voyageai";
import { Context } from "../../../types";
import { SuperVoyage } from "./voyage";
import { EmbedRequestInputType } from "voyageai/api/types/EmbedRequestInputType";
const VECTOR_SIZE = 1024;

export class Embedding extends SuperVoyage {
Expand All @@ -11,13 +12,14 @@ export class Embedding extends SuperVoyage {
this.context = context;
}

async createEmbedding(text: string | null): Promise<number[]> {
async createEmbedding(text: string | null, inputType: EmbedRequestInputType = "document"): Promise<number[]> {
if (text === null) {
return new Array(VECTOR_SIZE).fill(0);
} else {
const response = await this.client.embed({
input: text,
model: "voyage-large-2-instruct",
inputType,
});
return (response.data && response.data[0]?.embedding) || [];
}
Expand Down
141 changes: 90 additions & 51 deletions src/handlers/issue-deduplication.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,34 +39,41 @@ export async function issueChecker(context: Context): Promise<boolean> {
return false;
}
issueBody = removeFootnotes(issueBody);
const similarIssues = await supabase.issue.findSimilarIssues(issue.title + removeFootnotes(issueBody), context.config.warningThreshold, issue.node_id);
const similarIssues = await supabase.issue.findSimilarIssues(issue.title + removeFootnotes(issueBody), 0.7, issue.node_id);
if (similarIssues && similarIssues.length > 0) {
const matchIssues = similarIssues.filter((issue) => issue.similarity >= context.config.matchThreshold);
const processedIssues = await processSimilarIssues(similarIssues, context, issueBody);
if (matchIssues.length > 0) {
logger.info(`Similar issue which matches more than ${context.config.matchThreshold} already exists`);
//To the issue body, add a footnote with the link to the similar issue
const updatedBody = await handleMatchIssuesComment(context, payload, issueBody, processedIssues);
issueBody = updatedBody || issueBody;
await octokit.issues.update({
owner: payload.repository.owner.login,
repo: payload.repository.name,
issue_number: issue.number,
body: issueBody,
state: "closed",
state_reason: "not_planned",
});
return true;
}

if (similarIssues.length > 0) {
logger.info(`Similar issue which matches more than ${context.config.warningThreshold} already exists`);
await handleSimilarIssuesComment(context, payload, issueBody, issue.number, similarIssues);
await handleSimilarIssuesComment(context, payload, issueBody, issue.number, processedIssues);
return true;
}
} else {
//Use the IssueBody (Without footnotes) to update the issue when no similar issues are found
await octokit.issues.update({
owner: payload.repository.owner.login,
repo: payload.repository.name,
issue_number: issue.number,
body: issueBody,
});
//Only if the issue has "possible duplicate" footnotes, update the issue
if (checkIfDuplicateFootNoteExists(issue.body || "")) {
await octokit.issues.update({
owner: payload.repository.owner.login,
repo: payload.repository.name,
issue_number: issue.number,
body: issueBody,
});
}
}
context.logger.info("No similar issues found");
return false;
Expand All @@ -85,7 +92,6 @@ function matchRepoOrgToSimilarIssueRepoOrg(repoOrg: string, similarIssueRepoOrg:
function findMostSimilarSentence(issueContent: string, similarIssueContent: string): { sentence: string; similarity: number; index: number } {
// Regex to match sentences while preserving URLs
const sentenceRegex = /([^.!?\s][^.!?]*(?:[.!?](?!['"]?\s|$)[^.!?]*)*[.!?]?['"]?(?=\s|$))/g;

// Function to split text into sentences while preserving URLs
const splitIntoSentences = (text: string): string[] => {
const sentences: string[] = [];
Expand Down Expand Up @@ -124,40 +130,7 @@ function findMostSimilarSentence(issueContent: string, similarIssueContent: stri
return { sentence: mostSimilarSentence, similarity: maxSimilarity, index: mostSimilarIndex };
}

async function handleSimilarIssuesComment(
context: Context,
payload: IssuePayload,
issueBody: string,
issueNumber: number,
similarIssues: IssueSimilaritySearchResult[]
) {
const issueList: IssueGraphqlResponse[] = await Promise.all(
similarIssues.map(async (issue: IssueSimilaritySearchResult) => {
const issueUrl: IssueGraphqlResponse = await context.octokit.graphql(
`query($issueNodeId: ID!) {
node(id: $issueNodeId) {
... on Issue {
title
url
number
body
repository {
name
owner {
login
}
}
}
}
}`,
{ issueNodeId: issue.issue_id }
);
issueUrl.similarity = Math.round(issue.similarity * 100).toString();
issueUrl.mostSimilarSentence = findMostSimilarSentence(issueBody, issueUrl.node.body);
return issueUrl;
})
);

async function handleSimilarIssuesComment(context: Context, payload: IssuePayload, issueBody: string, issueNumber: number, issueList: IssueGraphqlResponse[]) {
const relevantIssues = issueList.filter((issue) =>
matchRepoOrgToSimilarIssueRepoOrg(payload.repository.owner.login, issue.node.repository.owner.login, payload.repository.name, issue.node.repository.name)
);
Expand All @@ -177,31 +150,25 @@ async function handleSimilarIssuesComment(
let footnotes: string[] | undefined;
// Sort relevant issues by similarity in ascending order
relevantIssues.sort((a, b) => parseFloat(a.similarity) - parseFloat(b.similarity));

relevantIssues.forEach((issue, index) => {
const footnoteIndex = highestFootnoteIndex + index + 1; // Continue numbering from the highest existing footnote number
const footnoteRef = `[^0${footnoteIndex}^]`;
const modifiedUrl = issue.node.url.replace("https://github.com", "https://www.github.com");
const { sentence } = issue.mostSimilarSentence;

// Insert footnote reference in the body
const sentencePattern = new RegExp(`${sentence.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`, "g");
updatedBody = updatedBody.replace(sentencePattern, `${sentence}${footnoteRef}`);

// Initialize footnotes array if not already done
if (!footnotes) {
footnotes = [];
}

// Add new footnote to the array
footnotes.push(`${footnoteRef}: ⚠ ${issue.similarity}% possible duplicate - [${issue.node.title}](${modifiedUrl}#${issue.node.number})\n\n`);
});

// Append new footnotes to the body, keeping the previous ones
if (footnotes) {
updatedBody += "\n\n" + footnotes.join("");
}

// Update the issue with the modified body
await context.octokit.issues.update({
owner: payload.repository.owner.login,
Expand All @@ -211,6 +178,72 @@ async function handleSimilarIssuesComment(
});
}

/**
 * Builds an issue body carrying a `[!CAUTION]` alert that lists the issues this one
 * closely matches (similarity above the match threshold).
 *
 * Only issues accepted by `matchRepoOrgToSimilarIssueRepoOrg` for the payload's
 * repository are listed, sorted by similarity in descending order. The alert is
 * inserted right before the first `[^N^]` footnote marker found in the body, or
 * appended at the end when the body contains none.
 *
 * This function does not update the issue itself; the caller persists the returned body.
 *
 * @param context - Plugin context; only its logger is used here.
 * @param payload - Issue event payload supplying the repository owner and name.
 * @param issueBody - Current issue body to annotate.
 * @param issueList - Similar issues already resolved to their GitHub details.
 * @returns The annotated body, or `undefined` when there is nothing to annotate
 *          (empty body, or no relevant issue to list).
 */
async function handleMatchIssuesComment(
  context: Context,
  payload: IssuePayload,
  issueBody: string,
  issueList: IssueGraphqlResponse[]
): Promise<string | undefined> {
  const relevantIssues = issueList.filter((issue) =>
    matchRepoOrgToSimilarIssueRepoOrg(payload.repository.owner.login, issue.node.repository.owner.login, payload.repository.name, issue.node.repository.name)
  );

  if (relevantIssues.length === 0) {
    context.logger.info("No relevant issues found with the same repository and organization");
    // Bail out: continuing would insert a caution header followed by an empty list.
    return;
  }

  if (!issueBody) {
    return;
  }

  // Position of the first footnote marker, so the alert lands before any footnotes.
  const firstFootnoteIndex = issueBody.search(/\[\^\d+\^\]/);
  const insertAt = firstFootnoteIndex === -1 ? issueBody.length : firstFootnoteIndex;

  // `filter` returned a fresh array, so sorting in place does not touch the caller's list.
  relevantIssues.sort((a, b) => parseFloat(b.similarity) - parseFloat(a.similarity));

  const listItems = relevantIssues
    .map((issue) => {
      const modifiedUrl = issue.node.url.replace("https://github.com", "https://www.github.com");
      return `> - [${issue.node.title}](${modifiedUrl}#${issue.node.number})\n`;
    })
    .join("");
  const cautionBlock = "\n\n>[!CAUTION]\n> This issue may be a duplicate of the following issues:\n" + listItems;

  return issueBody.slice(0, insertAt) + cautionBlock + issueBody.slice(insertAt);
}

/**
 * Resolves every similarity-search hit into its GitHub issue details.
 *
 * For each hit the issue node is fetched through the GraphQL API, then annotated with:
 * - `similarity`: the hit's score as a rounded percentage, stored as a string;
 * - `mostSimilarSentence`: the result of `findMostSimilarSentence` for `issueBody`
 *   against the fetched issue's body.
 *
 * All lookups run concurrently; if any of them rejects, the whole call rejects.
 *
 * @param similarIssues - Hits returned by the similarity search.
 * @param context - Plugin context providing the authenticated Octokit client.
 * @param issueBody - Body of the issue being checked.
 * @returns The annotated GraphQL responses, in the same order as `similarIssues`.
 */
async function processSimilarIssues(similarIssues: IssueSimilaritySearchResult[], context: Context, issueBody: string): Promise<IssueGraphqlResponse[]> {
  const issueDetailsQuery = `query($issueNodeId: ID!) {
node(id: $issueNodeId) {
... on Issue {
title
url
number
body
repository {
name
owner {
login
}
}
}
}
}`;

  const lookups = similarIssues.map(async (match: IssueSimilaritySearchResult) => {
    const details: IssueGraphqlResponse = await context.octokit.graphql(issueDetailsQuery, { issueNodeId: match.issue_id });
    details.similarity = Math.round(match.similarity * 100).toString();
    details.mostSimilarSentence = findMostSimilarSentence(issueBody, details.node.body);
    return details;
  });

  const resolved = await Promise.all(lookups);
  return resolved;
}

/**
* Finds the edit distance between two strings using dynamic programming.
* The edit distance is a way of quantifying how dissimilar two strings are to one another by
Expand Down Expand Up @@ -259,5 +292,11 @@ export function removeFootnotes(content: string): string {
contentWithoutFootnotes = contentWithoutFootnotes.replace(new RegExp(`\\[\\^${footnoteNumber}\\^\\]`, "g"), "");
});
}
return contentWithoutFootnotes.replace(/\n{2,}/g, "\n").trim();
return contentWithoutFootnotes;
}

/**
 * Tells whether the content holds at least one "possible duplicate" footnote definition,
 * i.e. a line shaped like `[^N^]: ⚠ NN% possible duplicate - <text>`.
 *
 * @param content - Issue body to inspect.
 * @returns `true` when such a footnote definition is present, `false` otherwise.
 */
function checkIfDuplicateFootNoteExists(content: string): boolean {
  const duplicateFootnotePattern = /\[\^(\d+)\^\]: ⚠ \d+% possible duplicate - [^\n]+(\n|$)/g;
  // `search` reports the index of the first occurrence, or -1 when there is none.
  return content.search(duplicateFootnotePattern) !== -1;
}
2 changes: 2 additions & 0 deletions supabase/migrations/20241002004403_issue_comments.sql
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
DROP FUNCTION IF EXISTS find_similar_issues;

CREATE OR REPLACE FUNCTION find_similar_issues(current_id VARCHAR, query_embedding vector(1024), threshold float8, top_k INT)
RETURNS TABLE(issue_id VARCHAR, issue_plaintext TEXT, similarity float8) AS $$
DECLARE
Expand Down
33 changes: 33 additions & 0 deletions supabase/migrations/20241016000744_function_issue.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Recreate find_similar_issues from scratch: drop any existing definition first.
DROP FUNCTION IF EXISTS find_similar_issues;

-- find_similar_issues: returns up to top_k rows of `issues` whose score against
-- query_embedding is strictly greater than threshold, best score first.
--   current_id      : id of the issue being checked; it is excluded from the results
--                     and its stored payload supplies the repo/org used for filtering.
--   query_embedding : 1024-dimension embedding to compare against issues.embedding.
--   threshold       : exclusive lower bound on the score.
--   top_k           : maximum number of rows returned.
-- Score = 0.5 * inner_product + 0.5 * (1 / (1 + l2_distance)).
-- NOTE(review): inner_product / l2_distance are presumably the pgvector functions - confirm.
CREATE OR REPLACE FUNCTION find_similar_issues(current_id VARCHAR, query_embedding vector(1024), threshold float8, top_k INT)
RETURNS TABLE(issue_id VARCHAR, issue_plaintext TEXT, similarity float8) AS $$
DECLARE
current_quantized vector(1024);
current_repo TEXT;
current_org TEXT;
BEGIN
-- Copy the query embedding into a local vector(1024) variable
current_quantized := query_embedding;

-- Extract the current issue's repo and org from the payload
-- (both stay NULL when no row matches current_id)
SELECT
payload->'repository'->>'name'::text,
payload->'repository'->'owner'->>'login'::text
INTO current_repo, current_org
FROM issues
WHERE id = current_id;

-- Keep only other issues from the same repo and org whose score exceeds the threshold.
-- COALESCE maps a missing repo/org to '' on both sides, so rows without repository
-- data only match a current issue that also lacks it.
RETURN QUERY
SELECT id AS issue_id,
plaintext AS issue_plaintext,
((0.5 * inner_product(current_quantized, embedding)) + 0.5 * (1 / (1 + l2_distance(current_quantized, embedding)))) as similarity
FROM issues
WHERE id <> current_id
AND COALESCE(payload->'repository'->>'name', '') = COALESCE(current_repo, '') -- To handle Private Issues
AND COALESCE(payload->'repository'->'owner'->>'login', '') = COALESCE(current_org, '') -- To handle Private Issues
AND ((0.5 * inner_product(current_quantized, embedding)) + 0.5 * (1 / (1 + l2_distance(current_quantized, embedding)))) > threshold
ORDER BY similarity DESC
LIMIT top_k;
END;
$$ LANGUAGE plpgsql;
33 changes: 33 additions & 0 deletions supabase/migrations/20241026185200_function_issue.sql
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
-- Recreate find_similar_issues from scratch: drop any existing definition first.
DROP FUNCTION IF EXISTS find_similar_issues;

-- find_similar_issues: returns up to top_k rows of `issues` whose score against
-- query_embedding is strictly greater than threshold, best score first.
--   current_id      : id of the issue being checked; it is excluded from the results
--                     and its stored payload supplies the repo/org used for filtering.
--   query_embedding : 1024-dimension embedding to compare against issues.embedding.
--   threshold       : exclusive lower bound on the score.
--   top_k           : maximum number of rows returned.
-- Score = 0.8 * (1 - cosine_distance) + 0.8 * (1 / (1 + l2_distance)).
-- cosine_distance grows as vectors diverge, so it is converted to a similarity
-- (1 - distance) before being summed; adding the raw distance would rank
-- dissimilar issues higher.
-- NOTE(review): the two weights sum to 1.6, so the score is not bounded by 1 -
-- confirm the thresholds passed by callers are calibrated for this range.
CREATE OR REPLACE FUNCTION find_similar_issues(current_id VARCHAR, query_embedding vector(1024), threshold float8, top_k INT)
RETURNS TABLE(issue_id VARCHAR, issue_plaintext TEXT, similarity float8) AS $$
DECLARE
current_quantized vector(1024);
current_repo TEXT;
current_org TEXT;
BEGIN
-- Copy the query embedding into a local vector(1024) variable
current_quantized := query_embedding;

-- Extract the current issue's repo and org from the payload
-- (both stay NULL when no row matches current_id)
SELECT
payload->'repository'->>'name'::text,
payload->'repository'->'owner'->>'login'::text
INTO current_repo, current_org
FROM issues
WHERE id = current_id;

-- Keep only other issues from the same repo and org whose score exceeds the threshold.
RETURN QUERY
SELECT id AS issue_id,
plaintext AS issue_plaintext,
((0.8 * (1 - cosine_distance(current_quantized, embedding))) + 0.8 * (1 / (1 + l2_distance(current_quantized, embedding)))) as similarity
FROM issues
WHERE id <> current_id
AND COALESCE(payload->'repository'->>'name', '') = COALESCE(current_repo, '') -- To handle Private Issues
AND COALESCE(payload->'repository'->'owner'->>'login', '') = COALESCE(current_org, '') -- To handle Private Issues
AND ((0.8 * (1 - cosine_distance(current_quantized, embedding))) + 0.8 * (1 / (1 + l2_distance(current_quantized, embedding)))) > threshold
ORDER BY similarity DESC
LIMIT top_k;
END;
$$ LANGUAGE plpgsql;

0 comments on commit 3f142fd

Please sign in to comment.