diff --git a/src/adapters/openai/helpers/completions.ts b/src/adapters/openai/helpers/completions.ts index dfd2bcb..42b61db 100644 --- a/src/adapters/openai/helpers/completions.ts +++ b/src/adapters/openai/helpers/completions.ts @@ -83,7 +83,7 @@ export class Completions extends SuperOpenAi { additionalContext.join("\n"), ].join("\n"); - // logger.info(`System message: ${sysMsg}`); + logger.info(`System message: ${sysMsg}`); logger.info(`Query: ${query}`); const res: OpenAI.Chat.Completions.ChatCompletion = await this.client.chat.completions.create({ @@ -165,6 +165,7 @@ export class Completions extends SuperOpenAi { } async findTokenLength(prompt: string, additionalContext: string[] = [], localContext: string[] = [], groundTruths: string[] = []): Promise { + // disallowedSpecial: new Set() because we pass the entire diff as the prompt we should account for all special characters return encode(prompt + additionalContext.join("\n") + localContext.join("\n") + groundTruths.join("\n"), { disallowedSpecial: new Set() }).length; } } diff --git a/src/handlers/ask-llm.ts b/src/handlers/ask-llm.ts index 0a48f46..8f6790b 100644 --- a/src/handlers/ask-llm.ts +++ b/src/handlers/ask-llm.ts @@ -8,34 +8,23 @@ import { fetchRepoDependencies, fetchRepoLanguageStats } from "./ground-truths/c import { findGroundTruths } from "./ground-truths/find-ground-truths"; import { bubbleUpErrorComment, logger } from "../helpers/errors"; -/** - * Asks a question to GPT and returns the response - * @param context - The context object containing environment and configuration details - * @param question - The question to ask GPT - * @returns The response from GPT - * @throws If no question is provided - */ export async function askQuestion(context: Context, question: string) { if (!question) { - throw context.logger.error("No question provided"); + throw logger.error("No question provided"); } + // using any links in comments or issue/pr bodies to fetch more context const { specAndBodies, streamlinedComments } = await recursivelyFetchLinkedIssues({ context, owner: context.payload.repository.owner.login, repo: context.payload.repository.name, }); + // build a nicely structure system message containing a streamlined chat history + // includes the current issue, any linked issues, and any linked PRs const formattedChat = await formatChatHistory(context, streamlinedComments, specAndBodies); - // logger.info(`${formattedChat.join("")}`); + logger.info(`${formattedChat.join("")}`); return await askLlm(context, question, formattedChat); } -/** - * Asks GPT a question and returns the completions - * @param context - The context object containing environment and configuration details - * @param question - The question to ask GPT - * @param formattedChat - The formatted chat history to provide context to GPT - * @returns completions - The completions generated by GPT - **/ export async function askLlm(context: Context, question: string, formattedChat: string[]): Promise { const { env: { UBIQUITY_OS_APP_NAME }, @@ -48,19 +37,24 @@ export async function askLlm(context: Context, question: string, formattedChat: } = context; try { + // using db functions to find similar comments and issues const [similarComments, similarIssues] = await Promise.all([ comment.findSimilarComments(question, 1 - similarityThreshold, ""), issue.findSimilarIssues(question, 1 - similarityThreshold, ""), ]); + // combine the similar comments and issues into a single array const similarText = [ ...(similarComments?.map((comment: CommentSimilaritySearchResult) => comment.comment_plaintext) || []), ...(similarIssues?.map((issue: IssueSimilaritySearchResult) => issue.issue_plaintext) || []), ]; + // filter out any empty strings formattedChat = formattedChat.filter((text) => text); + // rerank the similar text using voyageai const rerankedText = similarText.length > 0 ? await reranker.reRankResults(similarText, question) : []; + // gather structural data about the payload repository const [languages, { dependencies, devDependencies }] = await Promise.all([fetchRepoLanguageStats(context), fetchRepoDependencies(context)]); let groundTruths: string[] = []; diff --git a/src/handlers/comment-created-callback.ts b/src/handlers/comment-created-callback.ts index b11c36c..a45770d 100644 --- a/src/handlers/comment-created-callback.ts +++ b/src/handlers/comment-created-callback.ts @@ -35,6 +35,7 @@ export async function issueCommentCreatedCallback( } const metadataString = createStructuredMetadata( + // don't change this header, it's used for tracking "ubiquity-os-llm-response", logger.info(`Answer: ${answer}`, { metadata: { diff --git a/src/helpers/format-chat-history.ts b/src/helpers/format-chat-history.ts index 84855c4..dc38288 100644 --- a/src/helpers/format-chat-history.ts +++ b/src/helpers/format-chat-history.ts @@ -10,6 +10,7 @@ export async function formatChatHistory( streamlined: Record, specAndBodies: Record ): Promise { + // At this point really we should have all the context we can obtain but we try again just in case const keys = new Set([...Object.keys(streamlined), ...Object.keys(specAndBodies), createKey(context.payload.issue.html_url)]); const tokenLimits: TokenLimits = { modelMaxTokenLimit: context.adapters.openai.completions.getModelMaxTokenLimit(context.config.model), @@ -18,11 +19,14 @@ export async function formatChatHistory( tokensRemaining: 0, }; - // minus the output tokens we have this many tokens to use + // what we start out with tokenLimits.tokensRemaining = tokenLimits.modelMaxTokenLimit - tokenLimits.maxCompletionTokens; + // careful adding any more API calls here as it's likely to hit the secondary rate limit const chatHistory = await Promise.all( + // keys are owner/repo/issueNum; so for each issue, we want to create a block Array.from(keys).map(async (key, i) => { + // if we run out of tokens, we should stop if (tokenLimits.tokensRemaining < 0) { logger.error(`Ran out of tokens at block ${i}`); return ""; @@ -35,6 +39,7 @@ export async function formatChatHistory( isCurrentIssue: key === createKey(context.payload.issue.html_url), tokenLimits, }); + // update the token count tokenLimits.runningTokenCount = currentTokenCount; tokenLimits.tokensRemaining = tokenLimits.modelMaxTokenLimit - tokenLimits.maxCompletionTokens - currentTokenCount; return result; @@ -44,6 +49,7 @@ export async function formatChatHistory( return Array.from(new Set(chatHistory)).filter((x): x is string => !!x); } +// These give structure and provide the distinction between the different sections of the chat history function getCorrectHeaderString(prDiff: string | null, isCurrentIssue: boolean, isConvo: boolean) { const strings = { convo: { @@ -90,7 +96,8 @@ async function createContextBlockSection({ tokenLimits: TokenLimits; }): Promise<[number, string]> { let comments = streamlined[key]; - if (!comments || comments.length === 0) { + // just in case we try again but we should already have the comments + if (!comments || !comments.length) { const [owner, repo, number] = splitKey(key); const { comments: fetchedComments } = await fetchIssueComments({ context, @@ -98,7 +105,6 @@ async function createContextBlockSection({ repo, issueNum: parseInt(number), }); - comments = streamlineComments(fetchedComments)[key]; } @@ -108,8 +114,11 @@ async function createContextBlockSection({ throw context.logger.error("Issue number is not valid"); } + // Fetch our diff if we have one; this excludes the largest of files to keep within token limits const { diff } = await fetchPullRequestDiff(context, org, repo, issueNumber, tokenLimits); + // specification or pull request body let specOrBody = specAndBodies[key]; + // we should have it already but just in case if (!specOrBody) { specOrBody = ( @@ -122,23 +131,27 @@ async function createContextBlockSection({ )?.body || "No specification or body available"; } - const specHeader = getCorrectHeaderString(diff, isCurrentIssue, false); - const blockHeader = getCorrectHeaderString(diff, isCurrentIssue, true); + const specHeader = getCorrectHeaderString(diff, isCurrentIssue, false); //E.g: === Current Task Specification === + const blockHeader = getCorrectHeaderString(diff, isCurrentIssue, true); //E.g: === Linked Task Conversation === + // contains the actual spec or body const specBlock = [createHeader(specHeader, key), createSpecOrBody(specOrBody), createFooter(specHeader, key)]; + // contains the conversation const commentSection = createComment({ issueNumber, repo, org, comments }, specOrBody); let block; + // if we have a conversation, we should include it if (commentSection) { block = [specBlock.join("\n"), createHeader(blockHeader, key), commentSection, createFooter(blockHeader, key)]; } else { - // in this scenario we have no task/PR conversation, just the spec + // No need for empty sections in the chat history block = [specBlock.join("\n")]; } // only inject the README if this is the current issue as that's likely most relevant if (isCurrentIssue) { const readme = await pullReadmeFromRepoForIssue({ context, owner: org, repo }); + // give the readme it's own clear section if (readme) { const readmeBlock = readme ? [createHeader("README", key), createSpecOrBody(readme), createFooter("README", key)] : []; block = block.concat(readmeBlock); @@ -146,25 +159,15 @@ async function createContextBlockSection({ } if (!diff) { + // the diff was already encoded etc but we have added more to the block so we need to re-encode return [await context.adapters.openai.completions.findTokenLength(block.join("")), block.join("\n")]; } + // Build the block with the diff in it's own section const blockWithDiff = [block.join("\n"), createHeader(`Pull Request Diff`, key), diff, createFooter(`Pull Request Diff`, key)]; return [await context.adapters.openai.completions.findTokenLength(blockWithDiff.join("")), blockWithDiff.join("\n")]; } -/** - * Might not need to splice from the formatted window -function removeSections(fullText: string, header: string, footer: string): string { - const regex = new RegExp(`${escapeRegExp(header)}[\\s\\S]*?${escapeRegExp(footer)}`, 'g'); - return fullText.replace(regex, '').trim(); -} - -function escapeRegExp(text: string): string { - return text.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); -} - */ - function createHeader(content: string, repoString: string) { return `=== ${content} === ${repoString} ===\n`; } @@ -184,6 +187,7 @@ function createComment(comment: StreamlinedComments, specOrBody: string) { const seen = new Set(); comment.comments = comment.comments.filter((c) => { + // Do not include the same comment twice or the spec/body if (seen.has(c.id) || c.body === specOrBody) { return false; } diff --git a/src/helpers/issue-fetching.ts b/src/helpers/issue-fetching.ts index 101ddcd..4432067 100644 --- a/src/helpers/issue-fetching.ts +++ b/src/helpers/issue-fetching.ts @@ -7,28 +7,21 @@ import { dedupeStreamlinedComments, fetchCodeLinkedFromIssue, idIssueFromComment import { handleIssue, handleSpec, handleSpecAndBodyKeys, throttlePromises } from "./issue-handling"; import { processPullRequestDiff } from "./pull-request-parsing"; -/** - * Recursively fetches linked issues and processes them, including fetching comments and specifications. - * - * @param params - The parameters required to fetch the linked issues, including context and other details. - * @returns A promise that resolves to an object containing linked issues, specifications, streamlined comments, and seen issue keys. - */ export async function recursivelyFetchLinkedIssues(params: FetchParams) { + // take a first run at gathering everything we need and package it up const { linkedIssues, seen, specAndBodies, streamlinedComments } = await fetchLinkedIssues(params); + // build promises and throttle them; this calls handleSpec which is a recursive function potentially to great depth const fetchPromises = linkedIssues.map(async (linkedIssue) => await mergeCommentsAndFetchSpec(params, linkedIssue, streamlinedComments, specAndBodies, seen)); await throttlePromises(fetchPromises, 10); + // handle the keys that have been gathered const linkedIssuesKeys = linkedIssues.map((issue) => createKey(`${issue.owner}/${issue.repo}/${issue.issueNumber}`)); + // exhaustive list of unique keys from the first full pass const specAndBodyKeys = Array.from(new Set([...Object.keys(specAndBodies), ...Object.keys(streamlinedComments), ...linkedIssuesKeys])); + // this fn throttles from within but again, be weary of the rate limit await handleSpecAndBodyKeys(specAndBodyKeys, params, dedupeStreamlinedComments(streamlinedComments), seen); return { linkedIssues, specAndBodies, streamlinedComments }; } -/** - * Fetches linked issues recursively and processes them. - * - * @param params - The parameters required to fetch the linked issues, including context and other details. - * @returns A promise that resolves to an object containing linked issues, specifications, streamlined comments, and seen issue keys. - */ export async function fetchLinkedIssues(params: FetchParams) { const { comments, issue } = await fetchIssueComments(params); if (!issue) { @@ -100,15 +93,6 @@ export async function fetchLinkedIssues(params: FetchParams) { return { streamlinedComments, linkedIssues, specAndBodies, seen }; } -/** - * Merges comments and fetches the specification for a linked issue. - * - * @param params - The parameters required to fetch the linked issue, including context and other details. - * @param linkedIssue - The linked issue for which comments and specifications need to be fetched. - * @param streamlinedComments - A record of streamlined comments associated with issues. - * @param specOrBodies - A record of specifications or bodies associated with issues. - * @param seen - A set of issue keys that have already been processed to avoid duplication. - */ export async function mergeCommentsAndFetchSpec( params: FetchParams, linkedIssue: LinkedIssues, @@ -148,11 +132,6 @@ export async function fetchPullRequestDiff(context: Context, org: string, repo: return await processPullRequestDiff(diff, tokenLimits); } -/** - * Fetches an issue from the GitHub API. - * @param params - Context - * @returns A promise that resolves to an issue object or null if an error occurs. - */ export async function fetchIssue(params: FetchParams): Promise { const { octokit, payload, logger } = params.context; const { issueNum, owner, repo } = params; @@ -227,15 +206,6 @@ export async function fetchIssueComments(params: FetchParams) { }; } -/** - * Fetches and handles an issue based on the provided key and parameters. - * - * @param key - The unique key representing the issue in the format "owner/repo/issueNumber". - * @param params - The parameters required to fetch the issue, including context and other details. - * @param streamlinedComments - A record of streamlined comments associated with issues. - * @param seen - A set of issue keys that have already been processed to avoid duplication. - * @returns A promise that resolves to an array of streamlined comments for the specified issue. - */ export async function fetchAndHandleIssue( key: string, params: FetchParams, diff --git a/src/helpers/issue-handling.ts b/src/helpers/issue-handling.ts index 055f1dd..779cb26 100644 --- a/src/helpers/issue-handling.ts +++ b/src/helpers/issue-handling.ts @@ -4,14 +4,6 @@ import { StreamlinedComment } from "../types/llm"; import { idIssueFromComment, mergeStreamlinedComments, splitKey } from "./issue"; import { fetchLinkedIssues, fetchIssue, fetchAndHandleIssue, mergeCommentsAndFetchSpec } from "./issue-fetching"; -/** - * Handles the processing of an issue. - * - * @param params - The parameters required to fetch and handle issues. - * @param streamlinedComments - A record of streamlined comments indexed by keys. - * @param alreadySeen - A set of keys that have already been processed to avoid duplication. - * @returns A promise that resolves when the issue has been handled. - */ export async function handleIssue(params: FetchParams, streamlinedComments: Record, alreadySeen: Set) { if (alreadySeen.has(createKey(`${params.owner}/${params.repo}/${params.issueNum}`))) { return; @@ -22,17 +14,6 @@ export async function handleIssue(params: FetchParams, streamlinedComments: Reco return mergeStreamlinedComments(streamlinedComments, streamlined); } -/** - * Handles the processing of a specification or body text. - * - * @param params - The parameters required to fetch and handle issues. - * @param specOrBody - The specification or body text to be processed. - * @param specAndBodies - A record of specifications and bodies indexed by keys. - * @param key - The key associated with the current specification or body. - * @param seen - A set of keys that have already been processed to avoid duplication. - * @param streamlinedComments - A record of streamlined comments indexed by keys. - * @returns A promise that resolves to the updated record of specifications and bodies. - */ export async function handleSpec( params: FetchParams, specOrBody: string, @@ -73,14 +54,6 @@ export async function handleSpec( return specAndBodies; } -/** - * Handles the processing of a comment. - * - * @param params - The parameters required to fetch and handle issues. - * @param comment - The comment to be processed. - * @param streamlinedComments - A record of streamlined comments indexed by keys. - * @param seen - A set of keys that have already been processed to avoid duplication. - */ export async function handleComment( params: FetchParams, comment: StreamlinedComment, @@ -100,15 +73,8 @@ export async function handleComment( } } -/** - * Handles the processing of specification and body keys. - * - * @param keys - An array of keys representing issues or comments to be processed. - * @param params - The parameters required to fetch and handle issues. - * @param streamlinedComments - A record of streamlined comments indexed by keys. - * @param seen - A set of keys that have already been processed to avoid duplication. - */ export async function handleSpecAndBodyKeys(keys: string[], params: FetchParams, streamlinedComments: Record, seen: Set) { + // Make one last sweep just to be sure we have everything const commentProcessingPromises = keys.map(async (key) => { let comments = streamlinedComments[key]; if (!comments || comments.length === 0) { @@ -122,12 +88,6 @@ export async function handleSpecAndBodyKeys(keys: string[], params: FetchParams, await throttlePromises(commentProcessingPromises, 10); } -/** - * Throttles the execution of promises to ensure that no more than the specified limit are running concurrently. - * - * @param promises - An array of promises to be executed. - * @param limit - The maximum number of promises to run concurrently. - */ export async function throttlePromises(promises: Promise[], limit: number) { const executing: Promise[] = []; for (const promise of promises) { diff --git a/src/helpers/pull-request-parsing.ts b/src/helpers/pull-request-parsing.ts index fc61bbf..efa307b 100644 --- a/src/helpers/pull-request-parsing.ts +++ b/src/helpers/pull-request-parsing.ts @@ -6,12 +6,15 @@ import { EncodeOptions } from "gpt-tokenizer/esm/GptEncoding"; export async function processPullRequestDiff(diff: string, tokenLimits: TokenLimits) { const { runningTokenCount, tokensRemaining } = tokenLimits; + // parse the diff into per-file diffs for quicker processing const perFileDiffs = parsePerFileDiffs(diff); + // filter out obviously non-essential files; .png, .jpg, .pdf, etc. const essentialFileDiffs = perFileDiffs.filter(({ filename }) => { return isEssentialFile(filename); }); + // quick estimate using a simple heuristic; 3.5 characters per token const estimatedFileDiffStats = essentialFileDiffs.map(({ filename, diffContent }) => { const estimatedTokenCount = Math.ceil(diffContent.length / 3.5); return { filename, estimatedTokenCount, diffContent }; @@ -22,6 +25,7 @@ export async function processPullRequestDiff(diff: string, tokenLimits: TokenLim let currentTokenCount = runningTokenCount; const includedFileDiffs = []; + // Using the quick estimate, include as many files as possible without exceeding token limits for (const file of estimatedFileDiffStats) { if (currentTokenCount + file.estimatedTokenCount > tokensRemaining) { logger.info(`Skipping ${file.filename} to stay within token limits.`); @@ -31,11 +35,13 @@ export async function processPullRequestDiff(diff: string, tokenLimits: TokenLim currentTokenCount += file.estimatedTokenCount; } + // If no files can be included, return null if (includedFileDiffs.length === 0) { logger.error(`Cannot include any files from diff without exceeding token limits.`); return { diff: null }; } + // Accurately calculate token count for included files we have approximated to be under the limit const accurateFileDiffStats = await Promise.all( includedFileDiffs.map(async (file) => { const tokenCountArray = await encodeAsync(file.diffContent, { disallowedSpecial: new Set() }); @@ -44,8 +50,10 @@ export async function processPullRequestDiff(diff: string, tokenLimits: TokenLim }) ); + // Take an accurate reading of our current collection of files within the diff currentTokenCount = accurateFileDiffStats.reduce((sum, file) => sum + file.tokenCount, runningTokenCount); + // Remove files from the end of the list until we are within token limits while (currentTokenCount > tokensRemaining && accurateFileDiffStats.length > 0) { const removedFile = accurateFileDiffStats.pop(); currentTokenCount -= removedFile?.tokenCount || 0; @@ -57,11 +65,13 @@ export async function processPullRequestDiff(diff: string, tokenLimits: TokenLim return { diff: null }; } + // Build the diff with the included files const currentDiff = accurateFileDiffStats.map((file) => file.diffContent).join("\n"); return { diff: currentDiff }; } +// Helper to speed up tokenization export async function encodeAsync(text: string, options: EncodeOptions): Promise { return new Promise((resolve) => { const result = encode(text, options); @@ -69,22 +79,28 @@ export async function encodeAsync(text: string, options: EncodeOptions): Promise }); } +// Helper to parse a diff into per-file diffs export function parsePerFileDiffs(diff: string): { filename: string; diffContent: string }[] { + // regex to capture diff sections, including the last file const diffPattern = /^diff --git a\/(.*?) b\/.*$/gm; let match: RegExpExecArray | null; const perFileDiffs = []; let lastIndex = 0; + // iterate over each file in the diff while ((match = diffPattern.exec(diff)) !== null) { const filename = match[1]; const startIndex = match.index; + // if we have pushed a file into the array, "append" the diff content if (perFileDiffs.length > 0) { perFileDiffs[perFileDiffs.length - 1].diffContent = diff.substring(lastIndex, startIndex).trim(); } + perFileDiffs.push({ filename, diffContent: "" }); lastIndex = startIndex; } + // append the last file's diff content if (perFileDiffs.length > 0 && lastIndex < diff.length) { perFileDiffs[perFileDiffs.length - 1].diffContent = diff.substring(lastIndex).trim(); } @@ -92,6 +108,7 @@ export function parsePerFileDiffs(diff: string): { filename: string; diffContent return perFileDiffs; } +// This speeds things up considerably by skipping non-readable/non-relevant files function isEssentialFile(filename: string): boolean { const nonEssentialExtensions = [ // Image files @@ -241,4 +258,4 @@ function isEssentialFile(filename: string): boolean { ]; return !nonEssentialExtensions.some((ext) => filename.toLowerCase().endsWith(ext)); -} \ No newline at end of file +}