Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make evals handle failures more gracefully #18082

Merged
merged 3 commits into from
Sep 20, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
315 changes: 195 additions & 120 deletions crates/evals/src/eval.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,16 @@ use language::LanguageRegistry;
use node_runtime::FakeNodeRuntime;
use open_ai::OpenAiEmbeddingModel;
use project::Project;
use semantic_index::{OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status};
use semantic_index::{
EmbeddingProvider, OpenAiEmbeddingProvider, ProjectIndex, SemanticDb, Status,
};
use serde::{Deserialize, Serialize};
use settings::SettingsStore;
use smol::channel::bounded;
use smol::io::AsyncReadExt;
use smol::Timer;
use std::ops::RangeInclusive;
use std::path::PathBuf;
use std::time::Duration;
use std::{
fs,
Expand Down Expand Up @@ -237,6 +240,14 @@ async fn fetch_code_search_net_resources(http_client: &dyn HttpClient) -> Result
Ok(())
}

/// Aggregate tallies accumulated across every evaluated project.
///
/// Each field counts search outcomes summed over all queries:
/// results fully covered, results merely overlapped, files captured,
/// and the total number of expected results seen.
#[derive(Default, Debug)]
struct Counts {
    /// Expected results whose line range was fully contained in a search hit.
    covered_results: usize,
    /// Expected results whose line range partially intersected a search hit.
    overlapped_results: usize,
    /// Expected results whose file appeared among the search hits.
    covered_files: usize,
    /// Total number of expected results examined.
    total_results: usize,
}

async fn run_evaluation(
only_repo: Option<String>,
executor: &BackgroundExecutor,
Expand Down Expand Up @@ -297,12 +308,11 @@ async fn run_evaluation(
cx.update(|cx| languages::init(language_registry.clone(), node_runtime.clone(), cx))
.unwrap();

let mut covered_result_count = 0;
let mut overlapped_result_count = 0;
let mut covered_file_count = 0;
let mut total_result_count = 0;
let mut counts = Counts::default();
eprint!("Running evals.");

let mut failures = Vec::new();

for evaluation_project in evaluations {
if only_repo
.as_ref()
Expand All @@ -314,27 +324,24 @@ async fn run_evaluation(
eprint!("\r\x1B[2K");
eprint!(
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. Project: {}...",
covered_result_count,
total_result_count,
overlapped_result_count,
total_result_count,
covered_file_count,
total_result_count,
counts.covered_results,
counts.total_results,
counts.overlapped_results,
counts.total_results,
counts.covered_files,
counts.total_results,
evaluation_project.repo
);

let repo_db_path =
db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));
let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider.clone(), cx)
.await
.unwrap();

let repo_dir = repos_dir.join(&evaluation_project.repo);
if !repo_dir.exists() || repo_dir.join(SKIP_EVAL_PATH).exists() {
eprintln!("Skipping {}: directory not found", evaluation_project.repo);
continue;
}

let repo_db_path =
db_path.join(format!("{}.db", evaluation_project.repo.replace('/', "_")));

let project = cx
.update(|cx| {
Project::local(
Expand All @@ -349,125 +356,193 @@ async fn run_evaluation(
})
.unwrap();

let (worktree, _) = project
.update(cx, |project, cx| {
project.find_or_create_worktree(repo_dir, true, cx)
})?
.await?;
let repo = evaluation_project.repo.clone();
if let Err(err) = run_eval_project(
evaluation_project,
&user_store,
repo_db_path,
&repo_dir,
&mut counts,
project,
embedding_provider.clone(),
fs.clone(),
cx,
)
.await
{
eprintln!("{repo} eval failed with error: {:?}", err);

failures.push((repo, err));
}
}

worktree
.update(cx, |worktree, _| {
worktree.as_local().unwrap().scan_complete()
})
.unwrap()
.await;
eprintln!(
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured. {} failed.",
counts.covered_results,
counts.total_results,
counts.overlapped_results,
counts.total_results,
counts.covered_files,
counts.total_results,
failures.len(),
);

let project_index = cx
.update(|cx| semantic_index.create_project_index(project.clone(), cx))
.unwrap();
wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await;
if failures.is_empty() {
Ok(())
} else {
eprintln!("Failures:\n");

for query in evaluation_project.queries {
let results = cx
.update(|cx| {
for (index, (repo, failure)) in failures.iter().enumerate() {
eprintln!("Failure #{} - {repo}\n{:?}", index + 1, failure);
}

Err(anyhow::anyhow!("Some evals failed."))
}
}

#[allow(clippy::too_many_arguments)]
async fn run_eval_project(
evaluation_project: EvaluationProject,
user_store: &Model<UserStore>,
repo_db_path: PathBuf,
repo_dir: &Path,
counts: &mut Counts,
project: Model<Project>,
embedding_provider: Arc<dyn EmbeddingProvider>,
fs: Arc<dyn Fs>,
cx: &mut AsyncAppContext,
) -> Result<(), anyhow::Error> {
let mut semantic_index = SemanticDb::new(repo_db_path, embedding_provider, cx).await?;

let (worktree, _) = project
.update(cx, |project, cx| {
project.find_or_create_worktree(repo_dir, true, cx)
})?
.await?;

worktree
.update(cx, |worktree, _| {
worktree.as_local().unwrap().scan_complete()
})?
.await;

let project_index = cx.update(|cx| semantic_index.create_project_index(project.clone(), cx))?;
wait_for_indexing_complete(&project_index, cx, Some(Duration::from_secs(120))).await;

for query in evaluation_project.queries {
let results = {
// Retry search up to 3 times in case of timeout, network failure, etc.
let mut retries_remaining = 3;
let mut result;

loop {
match cx.update(|cx| {
let project_index = project_index.read(cx);
project_index.search(query.query.clone(), SEARCH_RESULT_LIMIT, cx)
})
.unwrap()
.await
.unwrap();

let results = SemanticDb::load_results(results, &fs.clone(), &cx)
.await
.unwrap();

let mut project_covered_result_count = 0;
let mut project_overlapped_result_count = 0;
let mut project_covered_file_count = 0;
let mut covered_result_indices = Vec::new();
for expected_result in &query.expected_results {
let mut file_matched = false;
let mut range_overlapped = false;
let mut range_covered = false;

for (ix, result) in results.iter().enumerate() {
if result.path.as_ref() == Path::new(&expected_result.file) {
file_matched = true;
let start_matched =
result.row_range.contains(&expected_result.lines.start());
let end_matched = result.row_range.contains(&expected_result.lines.end());

if start_matched || end_matched {
range_overlapped = true;
}

if start_matched && end_matched {
range_covered = true;
covered_result_indices.push(ix);
}) {
Ok(task) => match task.await {
Ok(answer) => {
result = Ok(answer);
break;
}
Err(err) => {
result = Err(err);
}
},
Err(err) => {
result = Err(err);
}
}

if range_covered {
project_covered_result_count += 1
};
if range_overlapped {
project_overlapped_result_count += 1
};
if file_matched {
project_covered_file_count += 1
};
if retries_remaining > 0 {
eprintln!(
"Retrying search after it failed on query {:?} with {:?}",
query, result
);
retries_remaining -= 1;
} else {
eprintln!(
"Ran out of retries; giving up on search which failed on query {:?} with {:?}",
query, result
);
break;
}
}
let outcome_repo = evaluation_project.repo.clone();

let query_results = EvaluationQueryOutcome {
repo: outcome_repo,
query: query.query,
total_result_count: query.expected_results.len(),
covered_result_count: project_covered_result_count,
overlapped_result_count: project_overlapped_result_count,
covered_file_count: project_covered_file_count,
expected_results: query.expected_results,
actual_results: results
.iter()
.map(|result| EvaluationSearchResult {
file: result.path.to_string_lossy().to_string(),
lines: result.row_range.clone(),
})
.collect(),
covered_result_indices,
};

overlapped_result_count += query_results.overlapped_result_count;
covered_result_count += query_results.covered_result_count;
covered_file_count += query_results.covered_file_count;
total_result_count += query_results.total_result_count;
SemanticDb::load_results(result?, &fs.clone(), &cx).await?
};

println!("{}", serde_json::to_string(&query_results).unwrap());
let mut project_covered_result_count = 0;
let mut project_overlapped_result_count = 0;
let mut project_covered_file_count = 0;
let mut covered_result_indices = Vec::new();
for expected_result in &query.expected_results {
let mut file_matched = false;
let mut range_overlapped = false;
let mut range_covered = false;

for (ix, result) in results.iter().enumerate() {
if result.path.as_ref() == Path::new(&expected_result.file) {
file_matched = true;
let start_matched = result.row_range.contains(&expected_result.lines.start());
let end_matched = result.row_range.contains(&expected_result.lines.end());

if start_matched || end_matched {
range_overlapped = true;
}

if start_matched && end_matched {
range_covered = true;
covered_result_indices.push(ix);
break;
}
}
}

if range_covered {
project_covered_result_count += 1
};
if range_overlapped {
project_overlapped_result_count += 1
};
if file_matched {
project_covered_file_count += 1
};
}
let outcome_repo = evaluation_project.repo.clone();

let query_results = EvaluationQueryOutcome {
repo: outcome_repo,
query: query.query,
total_result_count: query.expected_results.len(),
covered_result_count: project_covered_result_count,
overlapped_result_count: project_overlapped_result_count,
covered_file_count: project_covered_file_count,
expected_results: query.expected_results,
actual_results: results
.iter()
.map(|result| EvaluationSearchResult {
file: result.path.to_string_lossy().to_string(),
lines: result.row_range.clone(),
})
.collect(),
covered_result_indices,
};

user_store
.update(cx, |_, _| {
drop(semantic_index);
drop(project);
drop(worktree);
drop(project_index);
})
.unwrap();
}
counts.overlapped_results += query_results.overlapped_result_count;
counts.covered_results += query_results.covered_result_count;
counts.covered_files += query_results.covered_file_count;
counts.total_results += query_results.total_result_count;

eprint!(
"Running evals. {}/{} covered. {}/{} overlapped. {}/{} files captured.",
covered_result_count,
total_result_count,
overlapped_result_count,
total_result_count,
covered_file_count,
total_result_count,
);
println!("{}", serde_json::to_string(&query_results)?);
}

Ok(())
user_store.update(cx, |_, _| {
drop(semantic_index);
drop(project);
drop(worktree);
drop(project_index);
})
}

async fn wait_for_indexing_complete(
Expand Down Expand Up @@ -524,7 +599,7 @@ async fn fetch_eval_repos(
let evaluations = fs::read(&evaluations_path).expect("failed to read evaluations.json");
let evaluations: Vec<EvaluationProject> = serde_json::from_slice(&evaluations).unwrap();

eprint!("Fetching evaluation repositories...");
eprintln!("Fetching evaluation repositories...");

executor
.scoped(move |scope| {
Expand Down
Loading