diff --git a/src/adapters/supabase/helpers/issues.ts b/src/adapters/supabase/helpers/issues.ts index 059ee75..4e4f7e5 100644 --- a/src/adapters/supabase/helpers/issues.ts +++ b/src/adapters/supabase/helpers/issues.ts @@ -94,7 +94,7 @@ export class Issues extends SuperSupabase { } async findSimilarIssues(markdown: string, threshold: number, currentId: string): Promise { - const embedding = await this.context.adapters.voyage.embedding.createEmbedding(markdown); + const embedding = await this.context.adapters.voyage.embedding.createEmbedding(markdown, "query"); const { data, error } = await this.supabase.rpc("find_similar_issues", { current_id: currentId, query_embedding: embedding, diff --git a/src/adapters/voyage/helpers/embedding.ts b/src/adapters/voyage/helpers/embedding.ts index 575543e..a891792 100644 --- a/src/adapters/voyage/helpers/embedding.ts +++ b/src/adapters/voyage/helpers/embedding.ts @@ -1,6 +1,7 @@ import { VoyageAIClient } from "voyageai"; import { Context } from "../../../types"; import { SuperVoyage } from "./voyage"; +import { EmbedRequestInputType } from "voyageai/api/types/EmbedRequestInputType"; const VECTOR_SIZE = 1024; export class Embedding extends SuperVoyage { @@ -11,13 +12,14 @@ export class Embedding extends SuperVoyage { this.context = context; } - async createEmbedding(text: string | null): Promise { + async createEmbedding(text: string | null, inputType: EmbedRequestInputType = "document"): Promise { if (text === null) { return new Array(VECTOR_SIZE).fill(0); } else { const response = await this.client.embed({ input: text, model: "voyage-large-2-instruct", + inputType, }); return (response.data && response.data[0]?.embedding) || []; } diff --git a/src/handlers/issue-deduplication.ts b/src/handlers/issue-deduplication.ts index ca9ebb5..e7a68c2 100644 --- a/src/handlers/issue-deduplication.ts +++ b/src/handlers/issue-deduplication.ts @@ -39,16 +39,20 @@ export async function issueChecker(context: Context): Promise { return false; } issueBody = removeFootnotes(issueBody); - const similarIssues = await supabase.issue.findSimilarIssues(issue.title + removeFootnotes(issueBody), context.config.warningThreshold, issue.node_id); - + const similarIssues = await supabase.issue.findSimilarIssues(issue.title + removeFootnotes(issueBody), 0.7, issue.node_id); if (similarIssues && similarIssues.length > 0) { const matchIssues = similarIssues.filter((issue) => issue.similarity >= context.config.matchThreshold); + const processedIssues = await processSimilarIssues(similarIssues, context, issueBody); if (matchIssues.length > 0) { logger.info(`Similar issue which matches more than ${context.config.matchThreshold} already exists`); + //To the issue body, add a footnote with the link to the similar issue + const updatedBody = await handleMatchIssuesComment(context, payload, issueBody, processedIssues); + issueBody = updatedBody || issueBody; await octokit.issues.update({ owner: payload.repository.owner.login, repo: payload.repository.name, issue_number: issue.number, + body: issueBody, state: "closed", state_reason: "not_planned", }); @@ -56,7 +60,7 @@ export async function issueChecker(context: Context): Promise { } if (similarIssues.length > 0) { logger.info(`Similar issue which matches more than ${context.config.warningThreshold} already exists`); - await handleSimilarIssuesComment(context, payload, issueBody, issue.number, similarIssues); + await handleSimilarIssuesComment(context, payload, issueBody, issue.number, processedIssues); return true; } } else { @@ -85,7 +89,6 @@ function matchRepoOrgToSimilarIssueRepoOrg(repoOrg: string, similarIssueRepoOrg: function findMostSimilarSentence(issueContent: string, similarIssueContent: string): { sentence: string; similarity: number; index: number } { // Regex to match sentences while preserving URLs const sentenceRegex = /([^.!?\s][^.!?]*(?:[.!?](?!['"]?\s|$)[^.!?]*)*[.!?]?['"]?(?=\s|$))/g; - // Function to split text into sentences while preserving URLs const splitIntoSentences = (text: string): string[] => { const sentences: string[] = []; @@ -124,40 +127,7 @@ function findMostSimilarSentence(issueContent: string, similarIssueContent: stri return { sentence: mostSimilarSentence, similarity: maxSimilarity, index: mostSimilarIndex }; } -async function handleSimilarIssuesComment( - context: Context, - payload: IssuePayload, - issueBody: string, - issueNumber: number, - similarIssues: IssueSimilaritySearchResult[] -) { - const issueList: IssueGraphqlResponse[] = await Promise.all( - similarIssues.map(async (issue: IssueSimilaritySearchResult) => { - const issueUrl: IssueGraphqlResponse = await context.octokit.graphql( - `query($issueNodeId: ID!) { - node(id: $issueNodeId) { - ... on Issue { - title - url - number - body - repository { - name - owner { - login - } - } - } - } - }`, - { issueNodeId: issue.issue_id } - ); - issueUrl.similarity = Math.round(issue.similarity * 100).toString(); - issueUrl.mostSimilarSentence = findMostSimilarSentence(issueBody, issueUrl.node.body); - return issueUrl; - }) - ); - +async function handleSimilarIssuesComment(context: Context, payload: IssuePayload, issueBody: string, issueNumber: number, issueList: IssueGraphqlResponse[]) { const relevantIssues = issueList.filter((issue) => matchRepoOrgToSimilarIssueRepoOrg(payload.repository.owner.login, issue.node.repository.owner.login, payload.repository.name, issue.node.repository.name) ); @@ -177,31 +147,25 @@ async function handleSimilarIssuesComment( let footnotes: string[] | undefined; // Sort relevant issues by similarity in ascending order relevantIssues.sort((a, b) => parseFloat(a.similarity) - parseFloat(b.similarity)); - relevantIssues.forEach((issue, index) => { const footnoteIndex = highestFootnoteIndex + index + 1; // Continue numbering from the highest existing footnote number const footnoteRef = `[^0${footnoteIndex}^]`; const modifiedUrl = issue.node.url.replace("https://github.com", "https://www.github.com"); const { sentence } = issue.mostSimilarSentence; - // Insert footnote reference in the body const sentencePattern = new RegExp(`${sentence.replace(/[.*+?^${}()|[\]\\]/g, "\\$&")}`, "g"); updatedBody = updatedBody.replace(sentencePattern, `${sentence}${footnoteRef}`); - // Initialize footnotes array if not already done if (!footnotes) { footnotes = []; } - // Add new footnote to the array footnotes.push(`${footnoteRef}: ⚠ ${issue.similarity}% possible duplicate - [${issue.node.title}](${modifiedUrl}#${issue.node.number})\n\n`); }); - // Append new footnotes to the body, keeping the previous ones if (footnotes) { updatedBody += "\n\n" + footnotes.join(""); } - // Update the issue with the modified body await context.octokit.issues.update({ owner: payload.repository.owner.login, @@ -211,6 +175,72 @@ async function handleSimilarIssuesComment( }); } +//When similarity is greater than match threshold, Add Caution mentioning the issues to which its is very much similar +async function handleMatchIssuesComment( + context: Context, + payload: IssuePayload, + issueBody: string, + issueList: IssueGraphqlResponse[] +): Promise { + const relevantIssues = issueList.filter((issue) => + matchRepoOrgToSimilarIssueRepoOrg(payload.repository.owner.login, issue.node.repository.owner.login, payload.repository.name, issue.node.repository.name) + ); + + if (relevantIssues.length === 0) { + context.logger.info("No relevant issues found with the same repository and organization"); + } + + if (!issueBody) { + return; + } + // Find existing footnotes in the body + const footnoteRegex = /\[\^(\d+)\^\]/g; + const existingFootnotes = issueBody.match(footnoteRegex) || []; + // Find the index with respect to the issue body string where the footnotes start if they exist + const footnoteIndex = existingFootnotes[0] ? issueBody.indexOf(existingFootnotes[0]) : issueBody.length; + let resultBuilder = "\n\n>[!CAUTION]\n> This issue is very similar to the following issues:\n"; + // Sort relevant issues by similarity in descending order + relevantIssues.sort((a, b) => parseFloat(b.similarity) - parseFloat(a.similarity)); + // Append the similar issues to the resultBuilder + relevantIssues.forEach((issue) => { + const modifiedUrl = issue.node.url.replace("https://github.com", "https://www.github.com"); + resultBuilder += `> - [${issue.node.title}](${modifiedUrl}#${issue.node.number})\n`; + }); + // Insert the resultBuilder into the issue body + // Update the issue with the modified body + return issueBody.slice(0, footnoteIndex) + resultBuilder + issueBody.slice(footnoteIndex); +} + +// Process similar issues and return the list of similar issues with their similarity scores +async function processSimilarIssues(similarIssues: IssueSimilaritySearchResult[], context: Context, issueBody: string): Promise { + return await Promise.all( + similarIssues.map(async (issue: IssueSimilaritySearchResult) => { + const issueUrl: IssueGraphqlResponse = await context.octokit.graphql( + `query($issueNodeId: ID!) { + node(id: $issueNodeId) { + ... on Issue { + title + url + number + body + repository { + name + owner { + login + } + } + } + } + }`, + { issueNodeId: issue.issue_id } + ); + issueUrl.similarity = Math.round(issue.similarity * 100).toString(); + issueUrl.mostSimilarSentence = findMostSimilarSentence(issueBody, issueUrl.node.body); + return issueUrl; + }) + ); +} + /** * Finds the edit distance between two strings using dynamic programming. * The edit distance is a way of quantifying how dissimilar two strings are to one another by diff --git a/supabase/migrations/20241026185200_function_issue.sql b/supabase/migrations/20241026185200_function_issue.sql new file mode 100644 index 0000000..dbfa4c5 --- /dev/null +++ b/supabase/migrations/20241026185200_function_issue.sql @@ -0,0 +1,33 @@ +DROP FUNCTION IF EXISTS find_similar_issues; + +CREATE OR REPLACE FUNCTION find_similar_issues(current_id VARCHAR, query_embedding vector(1024), threshold float8, top_k INT) +RETURNS TABLE(issue_id VARCHAR, issue_plaintext TEXT, similarity float8) AS $$ +DECLARE + current_quantized vector(1024); + current_repo TEXT; + current_org TEXT; +BEGIN + -- Ensure the query_embedding is in the correct format + current_quantized := query_embedding; + + -- Extract the current issue's repo and org from the payload + SELECT + payload->'repository'->>'name'::text, + payload->'repository'->'owner'->>'login'::text + INTO current_repo, current_org + FROM issues + WHERE id = current_id; + + RETURN QUERY + SELECT id AS issue_id, + plaintext AS issue_plaintext, + ((0.8 * cosine_distance(current_quantized, embedding)) + 0.8 * (1 / (1 + l2_distance(current_quantized, embedding)))) as similarity + FROM issues + WHERE id <> current_id + AND COALESCE(payload->'repository'->>'name', '') = COALESCE(current_repo, '') -- To handle Private Issues + AND COALESCE(payload->'repository'->'owner'->>'login', '') = COALESCE(current_org, '') -- To handle Private Issues + AND ((0.8 * cosine_distance(current_quantized, embedding)) + 0.8 * (1 / (1 + l2_distance(current_quantized, embedding)))) > threshold + ORDER BY similarity DESC + LIMIT top_k; +END; +$$ LANGUAGE plpgsql; \ No newline at end of file