From ec6df04f2702fe6c4da4caf9fd8dfb4d666f9899 Mon Sep 17 00:00:00 2001 From: Gowind Date: Sat, 23 Nov 2024 01:12:32 +0100 Subject: [PATCH 01/10] feat: Add benchmark for tfidf Add a benchmark for calculating tfidf scores and calculating the top 10 similar documents We use a cosine similarity for calculating relevancy of documents for a given query --- scripts/bench_tfidf.rs | 163 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 163 insertions(+) create mode 100644 scripts/bench_tfidf.rs diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs new file mode 100644 index 00000000..af2c9c56 --- /dev/null +++ b/scripts/bench_tfidf.rs @@ -0,0 +1,163 @@ +use std::collections::HashMap; +use std::fs::File; +use std::io::{self, BufRead, BufReader}; +use std::path::Path; + +const LINES_PER_DOCUMENT: usize = 1000; + +#[derive(Debug)] +struct TfidfCalculator { + document_frequency: HashMap, + term_frequencies: Vec>, + documents: Vec, // Store original documents + n_documents: usize, +} + +impl TfidfCalculator { + fn new() -> Self { + TfidfCalculator { + document_frequency: HashMap::new(), + term_frequencies: Vec::new(), + documents: Vec::new(), + n_documents: 0, + } + } + + fn tokenize(text: &str) -> Vec { + text.split_whitespace() + .map(|word| word.to_lowercase() + .chars() + .filter(|c| c.is_alphanumeric()) + .collect::()) + .filter(|word| !word.is_empty()) + .collect() + } + + fn process_document(&mut self, text: &str) { + let tokens = Self::tokenize(text); + let mut term_freq = HashMap::new(); + let mut seen_terms = HashMap::new(); + + for token in tokens { + *term_freq.entry(token.clone()).or_insert(0) += 1; + seen_terms.insert(token, true); + } + + for term in seen_terms.keys() { + *self.document_frequency.entry(term.clone()).or_insert(0) += 1; + } + + self.term_frequencies.push(term_freq); + self.documents.push(text.to_string()); + self.n_documents += 1; + } + + fn calculate_tfidf(&self, term_freq: &HashMap) -> HashMap { + let mut tfidf = HashMap::new(); + + for (term, freq) in term_freq { + if let Some(&df) = self.document_frequency.get(term) { + let tf = *freq as f64; + let idf = (self.n_documents as f64 / df as f64).ln(); + tfidf.insert(term.clone(), tf * idf); + } + } + + tfidf + } + + fn get_document_tfidf_vectors(&self) -> Vec> { + self.term_frequencies + .iter() + .map(|tf| self.calculate_tfidf(tf)) + .collect() + } + + fn calculate_query_tfidf(&self, query: &str) -> HashMap { + let tokens = Self::tokenize(query); + let mut term_freq = HashMap::new(); + + for token in tokens { + *term_freq.entry(token).or_insert(0) += 1; + } + + self.calculate_tfidf(&term_freq) + } + + fn cosine_similarity(v1: &HashMap, v2: &HashMap) -> f64 { + let mut dot_product = 0.0; + let mut norm1 = 0.0; + let mut norm2 = 0.0; + + // Calculate dot product and norms + for (term, score1) in v1 { + norm1 += score1 * score1; + if let Some(score2) = v2.get(term) { + dot_product += score1 * score2; + } + } + + for (_term, score2) in v2 { + norm2 += score2 * score2; + } + + // Return cosine similarity + if norm1 == 0.0 || norm2 == 0.0 { + 0.0 + } else { + dot_product / (norm1.sqrt() * norm2.sqrt()) + } + } + + fn find_matching_documents(&self, query: &str, top_n: usize) -> Vec<(usize, f64)> { + let query_tfidf = self.calculate_query_tfidf(query); + let doc_tfidfs = self.get_document_tfidf_vectors(); + + let mut similarities: Vec<(usize, f64)> = doc_tfidfs + .iter() + .enumerate() + .map(|(idx, doc_tfidf)| { + (idx, Self::cosine_similarity(&query_tfidf, doc_tfidf)) + }) + .collect(); + + 
similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); + + // Return top N results + similarities.into_iter().take(top_n).collect() + } +} + +fn main() -> io::Result<()> { + let args: Vec = std::env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let path = Path::new(&args[1]); + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut calculator = TfidfCalculator::new(); + + // Process the file in chunks of LINES_PER_DOCUMENT lines + let mut current_document = String::new(); + let mut line_count = 0; + + for line in reader.lines() { + let line = line?; + current_document.push_str(&line); + current_document.push('\n'); + line_count += 1; + + if line_count == LINES_PER_DOCUMENT { + calculator.process_document(¤t_document); + current_document.clear(); + line_count = 0; + } + } + + if !current_document.is_empty() { + calculator.process_document(¤t_document); + } +} \ No newline at end of file From 52cf048bcbcebe2bedaecf155c574bfe67554b3d Mon Sep 17 00:00:00 2001 From: Gowind Date: Wed, 27 Nov 2024 02:18:10 +0100 Subject: [PATCH 02/10] Get vanialla version of TfIdf matrix calculation working --- .vscode/launch.json | 2 +- Cargo.lock | 13 +-- Cargo.toml | 12 +++ scripts/bench_tfidf.rs | 201 ++++++++++++++++++++--------------------- 4 files changed, 120 insertions(+), 108 deletions(-) diff --git a/.vscode/launch.json b/.vscode/launch.json index 0a5246fc..6ac02dab 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -74,7 +74,7 @@ "-k", "inner_bf16" ], - "justMyCode": true + "justMyCode": false } ] } \ No newline at end of file diff --git a/Cargo.lock b/Cargo.lock index 7613205f..989888a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -408,9 +408,9 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.3" +version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b62dbe01f0b06f9d8dc7d49e05a0785f153b00b2c227856282f671e0318c9b15" +checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191" dependencies = [ "aho-corasick", "memchr", @@ -420,9 +420,9 @@ dependencies = [ [[package]] name = "regex-automata" -version = "0.4.5" +version = "0.4.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb987efffd3c6d0d8f5f89510bb458559eab11e4f869acb20bf845e016259cd" +checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908" dependencies = [ "aho-corasick", "memchr", @@ -431,9 +431,9 @@ dependencies = [ [[package]] name = "regex-syntax" -version = "0.8.2" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c08c74e62047bb2de4ff487b251e4a92e24f48745648451635cec7d591162d9f" +checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c" [[package]] name = "rustix" @@ -502,6 +502,7 @@ dependencies = [ "criterion", "half", "rand", + "regex", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 32656a84..5630680a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,6 +23,10 @@ include = ["/rust/**", "/c/**", "/include/**", "/build.rs"] name = "simsimd" path = "rust/lib.rs" +[[bin]] +name = "tfidf" +path = "scripts/bench_tfidf.rs" + [build-dependencies] cc = "1.0.83" @@ -37,6 +41,11 @@ name = "sqeuclidean" harness = false path = "scripts/bench_sqeuclidean.rs" +[[bench]] +name = "tfidf" +harness = false +path = "scripts/bench_tfidf.rs" + [profile.bench] opt-level = 3 # Corresponds to -O3 lto = true # Enables Link Time Optimization for further optimizations 
@@ -47,3 +56,6 @@ rpath = false # On some systems, setting this to false can help with optimiz criterion = { version = "0.5.1" } rand = { version = "0.8.5" } half = { version = "2.4.0" } + +[dependencies] +regex = "1.11.1" diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index af2c9c56..c72d3409 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -1,134 +1,124 @@ use std::collections::HashMap; +use std::collections::HashSet; use std::fs::File; -use std::io::{self, BufRead, BufReader}; +use std::io::BufRead; +use std::io::BufReader; +use std::io::{self}; use std::path::Path; +use regex::Regex; const LINES_PER_DOCUMENT: usize = 1000; + +fn l2_normalize(v: &mut Vec) { + let mut sum = 0.0; + for i in v.iter() { + sum += i.powi(2); + } + sum = sum.sqrt(); + for element in v.iter_mut() { + *element = *element / sum; + } +} + #[derive(Debug)] struct TfidfCalculator { - document_frequency: HashMap, - term_frequencies: Vec>, - documents: Vec, // Store original documents - n_documents: usize, + terms: Vec, + row_norms: Vec>, + documents: Vec>, // Store original documents + idfs: HashMap, } impl TfidfCalculator { fn new() -> Self { TfidfCalculator { - document_frequency: HashMap::new(), - term_frequencies: Vec::new(), + terms: Vec::new(), + row_norms: Vec::new(), documents: Vec::new(), - n_documents: 0, + idfs: HashMap::new(), } } fn tokenize(text: &str) -> Vec { - text.split_whitespace() - .map(|word| word.to_lowercase() - .chars() - .filter(|c| c.is_alphanumeric()) - .collect::()) - .filter(|word| !word.is_empty()) - .collect() + let re = Regex::new(r"(?U)\b\w{2,}\b").unwrap(); + let matches: Vec = re.find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches } - fn process_document(&mut self, text: &str) { - let tokens = Self::tokenize(text); - let mut term_freq = HashMap::new(); - let mut seen_terms = HashMap::new(); - - for token in tokens { - *term_freq.entry(token.clone()).or_insert(0) += 1; - seen_terms.insert(token, true); + fn process_documents(&mut self, documents: Vec) { + let mut unique_terms = HashSet::new(); + for document in &documents { + let tokens = Self::tokenize(document); + for token in tokens { + if !unique_terms.contains(&token) { + unique_terms.insert(token); + } } - - for term in seen_terms.keys() { - *self.document_frequency.entry(term.clone()).or_insert(0) += 1; + } + self.terms = unique_terms.into_iter().collect(); + self.terms.sort(); + for document in &documents { + let tokens = Self::tokenize(document); + let mut term_document_freq = HashMap::new(); + for token in tokens.iter() { + let count = term_document_freq.entry(token).or_insert(0); + *count += 1; } - - self.term_frequencies.push(term_freq); - self.documents.push(text.to_string()); - self.n_documents += 1; - } - - fn calculate_tfidf(&self, term_freq: &HashMap) -> HashMap { - let mut tfidf = HashMap::new(); - - for (term, freq) in term_freq { - if let Some(&df) = self.document_frequency.get(term) { - let tf = *freq as f64; - let idf = (self.n_documents as f64 / df as f64).ln(); - tfidf.insert(term.clone(), tf * idf); - } + let mut row_counts = vec![0; self.terms.len()]; + for (idx, term) in self.terms.iter().enumerate() { + let count = term_document_freq.get(term).unwrap_or(&0); + row_counts[idx] = *count; } - - tfidf - } - - fn get_document_tfidf_vectors(&self) -> Vec> { - self.term_frequencies - .iter() - .map(|tf| self.calculate_tfidf(tf)) - .collect() + self.documents.push(row_counts); + } } - fn calculate_query_tfidf(&self, query: &str) 
-> HashMap { - let tokens = Self::tokenize(query); - let mut term_freq = HashMap::new(); - - for token in tokens { - *term_freq.entry(token).or_insert(0) += 1; + fn calculate_idf(&mut self) { + for term_id in 0..self.terms.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0 { + df += 1; + } } - - self.calculate_tfidf(&term_freq) + let n_documents = self.documents.len(); + let present = df as f64; + let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); + } } - fn cosine_similarity(v1: &HashMap, v2: &HashMap) -> f64 { - let mut dot_product = 0.0; - let mut norm1 = 0.0; - let mut norm2 = 0.0; - - // Calculate dot product and norms - for (term, score1) in v1 { - norm1 += score1 * score1; - if let Some(score2) = v2.get(term) { - dot_product += score1 * score2; - } - } - - for (_term, score2) in v2 { - norm2 += score2 * score2; - } - - // Return cosine similarity - if norm1 == 0.0 || norm2 == 0.0 { - 0.0 - } else { - dot_product / (norm1.sqrt() * norm2.sqrt()) + fn calculate_tfidf(&mut self) { + let document = &self.documents[0]; + let mut tfidf_vector = vec![0.0; self.terms.len()]; + for i in 0..self.terms.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f64; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0 { + tfidf_vector[term_id] = 0.0; + } else { + tfidf_vector[term_id] = tf * idf; + } } + self.row_norms.push(tfidf_vector); } - fn find_matching_documents(&self, query: &str, top_n: usize) -> Vec<(usize, f64)> { - let query_tfidf = self.calculate_query_tfidf(query); - let doc_tfidfs = self.get_document_tfidf_vectors(); - - let mut similarities: Vec<(usize, f64)> = doc_tfidfs - .iter() - .enumerate() - .map(|(idx, doc_tfidf)| { - (idx, Self::cosine_similarity(&query_tfidf, doc_tfidf)) - }) - .collect(); - - similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap()); - - // Return top N results - similarities.into_iter().take(top_n).collect() + fn normalize(&mut self) { + for row in self.row_norms.iter_mut() { + l2_normalize(row); + } } + + } + fn main() -> io::Result<()> { + let args: Vec = std::env::args().collect(); if args.len() != 2 { eprintln!("Usage: {} ", args[0]); @@ -140,7 +130,7 @@ fn main() -> io::Result<()> { let reader = BufReader::new(file); let mut calculator = TfidfCalculator::new(); - // Process the file in chunks of LINES_PER_DOCUMENT lines + let mut documents = Vec::new(); let mut current_document = String::new(); let mut line_count = 0; @@ -149,15 +139,24 @@ fn main() -> io::Result<()> { current_document.push_str(&line); current_document.push('\n'); line_count += 1; - if line_count == LINES_PER_DOCUMENT { - calculator.process_document(¤t_document); - current_document.clear(); + documents.push(std::mem::take(&mut current_document)); line_count = 0; + println!("current document size after take {}", current_document.len()); + } } if !current_document.is_empty() { - calculator.process_document(¤t_document); + documents.push(std::mem::take(&mut current_document)); } + calculator.process_documents(documents); + calculator.calculate_idf(); + calculator.calculate_tfidf(); + calculator.normalize(); + + + Ok(()) + + } \ No newline at end of file From 19850f6ac6750a9a163c4d3479d916bf1de989a8 Mon Sep 17 00:00:00 2001 From: Gowind Date: Wed, 27 Nov 2024 12:54:02 +0100 Subject: [PATCH 03/10] calculate cosine similarity using simsimd --- scripts/bench_tfidf.rs | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 
insertions(+), 6 deletions(-) diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index c72d3409..8a9a8e61 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -6,7 +6,7 @@ use std::io::BufReader; use std::io::{self}; use std::path::Path; use regex::Regex; - +use simsimd::SpatialSimilarity; const LINES_PER_DOCUMENT: usize = 1000; @@ -24,6 +24,7 @@ fn l2_normalize(v: &mut Vec) { #[derive(Debug)] struct TfidfCalculator { terms: Vec, + term_index: HashMap, row_norms: Vec>, documents: Vec>, // Store original documents idfs: HashMap, @@ -33,6 +34,7 @@ impl TfidfCalculator { fn new() -> Self { TfidfCalculator { terms: Vec::new(), + term_index: HashMap::new(), row_norms: Vec::new(), documents: Vec::new(), idfs: HashMap::new(), @@ -59,6 +61,9 @@ impl TfidfCalculator { } self.terms = unique_terms.into_iter().collect(); self.terms.sort(); + for (idx, term) in self.terms.iter().enumerate() { + self.term_index.insert(term.clone(), idx); + } for document in &documents { let tokens = Self::tokenize(document); let mut term_document_freq = HashMap::new(); @@ -91,8 +96,8 @@ impl TfidfCalculator { } fn calculate_tfidf(&mut self) { - let document = &self.documents[0]; - let mut tfidf_vector = vec![0.0; self.terms.len()]; + for document in self.documents.iter() { + let mut tfidf_vector = vec![0.0; self.terms.len()]; for i in 0..self.terms.len() { let term_id = i; let term_frequency = document[term_id]; @@ -105,6 +110,7 @@ impl TfidfCalculator { } } self.row_norms.push(tfidf_vector); + } } fn normalize(&mut self) { @@ -112,6 +118,18 @@ impl TfidfCalculator { l2_normalize(row); } } + + fn tfidf_representation(&self, row: &str) -> Vec { + let mut row_vector = vec![0.0; self.terms.len()]; + let tokens = Self::tokenize(row); + for token in tokens.iter() { + let term_id = self.term_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; + } + l2_normalize(& mut row_vector); + return row_vector; + } } @@ -142,8 +160,6 @@ fn main() -> io::Result<()> { if line_count == LINES_PER_DOCUMENT { documents.push(std::mem::take(&mut current_document)); line_count = 0; - println!("current document size after take {}", current_document.len()); - } } @@ -154,7 +170,13 @@ fn main() -> io::Result<()> { calculator.calculate_idf(); calculator.calculate_tfidf(); calculator.normalize(); - + + let query = "we went over to the whole world"; + let query_vector = calculator.tfidf_representation(query); + for (idx, row_vector) in calculator.row_norms.iter().enumerate() { + let similarity = f64::cosine(query_vector.as_ref(), row_vector.as_ref()); + println!("Similarity for document {}: {:?}", idx, similarity); + } Ok(()) From 4cf2e0cdd79e7ff18779d7aa61614d34366944f9 Mon Sep 17 00:00:00 2001 From: Gowind Date: Wed, 27 Nov 2024 12:57:02 +0100 Subject: [PATCH 04/10] add plain implementation of cosine similarity --- scripts/bench_tfidf.rs | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index 8a9a8e61..68953717 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -30,6 +30,38 @@ struct TfidfCalculator { idfs: HashMap, } +pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + + // Calculate dot product + let dot_product: f64 = vec1.iter() + .zip(vec2.iter()) + .map(|(a, b)| a * b) + .sum(); + + // Calculate magnitudes + let magnitude1: f64 = 
vec1.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + let magnitude2: f64 = vec2.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) +} + impl TfidfCalculator { fn new() -> Self { TfidfCalculator { @@ -175,7 +207,9 @@ fn main() -> io::Result<()> { let query_vector = calculator.tfidf_representation(query); for (idx, row_vector) in calculator.row_norms.iter().enumerate() { let similarity = f64::cosine(query_vector.as_ref(), row_vector.as_ref()); - println!("Similarity for document {}: {:?}", idx, similarity); + println!("Similarity for document via simsimd {}: {:?}", idx, similarity); + let plain_similarity = plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()); + println!("Similarity for document via plain cosine similarity {}: {:?}", idx, plain_similarity); } Ok(()) From 92eda8d3f78e2c04746e76bd01b487c48e56a4ef Mon Sep 17 00:00:00 2001 From: Gowind Date: Thu, 28 Nov 2024 00:54:55 +0100 Subject: [PATCH 05/10] Speed up stuff by removing Regex out of the hot loop --- Cargo.toml | 2 +- rust/lib.rs | 2 + scripts/bench_tfidf.rs | 280 ++++++++++++++++------------------------- scripts/tfidf.rs | 228 +++++++++++++++++++++++++++++++++ 4 files changed, 337 insertions(+), 175 deletions(-) create mode 100644 scripts/tfidf.rs diff --git a/Cargo.toml b/Cargo.toml index 5630680a..7ea7faa5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -25,7 +25,7 @@ path = "rust/lib.rs" [[bin]] name = "tfidf" -path = "scripts/bench_tfidf.rs" +path = "scripts/tfidf.rs" [build-dependencies] cc = "1.0.83" diff --git a/rust/lib.rs b/rust/lib.rs index 1445167d..0e4ff7f6 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -256,7 +256,9 @@ where /// The cosine similarity is a measure of similarity between two non-zero vectors /// of an dot product space that measures the cosine of the angle between them. 
fn cosine(a: &[Self], b: &[Self]) -> Option { + SpatialSimilarity::cos(a, b) + } } diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index 68953717..601b4690 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -1,218 +1,150 @@ -use std::collections::HashMap; -use std::collections::HashSet; -use std::fs::File; -use std::io::BufRead; -use std::io::BufReader; -use std::io::{self}; -use std::path::Path; +use std::collections::{HashMap, HashSet}; + +use criterion::{criterion_group, criterion_main, Criterion}; use regex::Regex; -use simsimd::SpatialSimilarity; -const LINES_PER_DOCUMENT: usize = 1000; +use simsimd::SpatialSimilarity as SimSIMD; +// mod native; fn l2_normalize(v: &mut Vec) { - let mut sum = 0.0; - for i in v.iter() { - sum += i.powi(2); - } - sum = sum.sqrt(); - for element in v.iter_mut() { - *element = *element / sum; - } + let mut sum = 0.0; + for i in v.iter() { + sum += i.powi(2); + } + sum = sum.sqrt(); + for element in v.iter_mut() { + *element = *element / sum; + } } #[derive(Debug)] struct TfidfCalculator { - terms: Vec, - term_index: HashMap, - row_norms: Vec>, - documents: Vec>, // Store original documents - idfs: HashMap, + tokens: Vec, + tokens_index: HashMap, + row_norms: Vec>, + documents: Vec>, + idfs: HashMap, + re: Regex } -pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { - // Check if vectors have the same length - if vec1.len() != vec2.len() { - return None; +impl TfidfCalculator { + fn new() -> Self { + TfidfCalculator { + tokens: Vec::new(), + tokens_index: HashMap::new(), + row_norms: Vec::new(), + documents: Vec::new(), + idfs: HashMap::new(), + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap() + } } - // Calculate dot product - let dot_product: f64 = vec1.iter() - .zip(vec2.iter()) - .map(|(a, b)| a * b) - .sum(); - - // Calculate magnitudes - let magnitude1: f64 = vec1.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - let magnitude2: f64 = vec2.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - // Prevent division by zero - if magnitude1 == 0.0 || magnitude2 == 0.0 { - return None; + fn tokenize(&self, text: &str) -> Vec { + let matches: Vec = self.re.find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches } - // Calculate cosine similarity - Some(dot_product / (magnitude1 * magnitude2)) -} + fn process_documents(&mut self, documents: Vec) { -impl TfidfCalculator { - fn new() -> Self { - TfidfCalculator { - terms: Vec::new(), - term_index: HashMap::new(), - row_norms: Vec::new(), - documents: Vec::new(), - idfs: HashMap::new(), - } + let mut unique_tokens = HashSet::new(); + let mut document_tokens: Vec> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_tokens.push(tokens); } + self.tokens = unique_tokens.into_iter().collect(); + self.tokens.sort(); - fn tokenize(text: &str) -> Vec { - let re = Regex::new(r"(?U)\b\w{2,}\b").unwrap(); - let matches: Vec = re.find_iter(text.to_lowercase().as_str()) - .map(|mat| mat.as_str().to_string()) - .collect(); - matches + for (idx, term) in self.tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); } - - fn process_documents(&mut self, documents: Vec) { - let mut unique_terms = HashSet::new(); - for document in &documents { - let tokens = Self::tokenize(document); - for token in tokens { - if !unique_terms.contains(&token) { - unique_terms.insert(token); - } - } - } - self.terms = 
unique_terms.into_iter().collect(); - self.terms.sort(); - for (idx, term) in self.terms.iter().enumerate() { - self.term_index.insert(term.clone(), idx); - } - for document in &documents { - let tokens = Self::tokenize(document); - let mut term_document_freq = HashMap::new(); - for token in tokens.iter() { - let count = term_document_freq.entry(token).or_insert(0); - *count += 1; - } - let mut row_counts = vec![0; self.terms.len()]; - for (idx, term) in self.terms.iter().enumerate() { - let count = term_document_freq.get(term).unwrap_or(&0); - row_counts[idx] = *count; - } - self.documents.push(row_counts); + + let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + matrix[row_idx][*term_id] += 1.0; } } + self.documents = std::mem::take(&mut matrix); - fn calculate_idf(&mut self) { - for term_id in 0..self.terms.len() { - let mut df = 0; - for document in self.documents.iter() { - if document[term_id] > 0 { - df += 1; - } + + } + + fn calculate_idf(&mut self) { + for term_id in 0..self.tokens.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0.0 { + df += 1; } - let n_documents = self.documents.len(); - let present = df as f64; - let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; - self.idfs.insert(term_id, idf); } + let n_documents = self.documents.len(); + let present = df as f64; + let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); } + } + + fn calculate_tfidf(&mut self) { + let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; - fn calculate_tfidf(&mut self) { - for document in self.documents.iter() { - let mut tfidf_vector = vec![0.0; self.terms.len()]; - for i in 0..self.terms.len() { + for (row_idx, document) in self.documents.iter().enumerate() { + for i in 0..self.tokens.len() { let term_id = i; let term_frequency = document[term_id]; let tf = term_frequency as f64; let idf = self.idfs.get(&term_id).unwrap(); - if term_frequency == 0 { - tfidf_vector[term_id] = 0.0; + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; } else { - tfidf_vector[term_id] = tf * idf; + tfidf_matrix[row_idx][term_id] = tf * idf; } } - self.row_norms.push(tfidf_vector); - } - } - - fn normalize(&mut self) { - for row in self.row_norms.iter_mut() { - l2_normalize(row); } + self.row_norms = std::mem::take(&mut tfidf_matrix); + } + + fn normalize(&mut self) { + for row in self.row_norms.iter_mut() { + l2_normalize(row); } + } - fn tfidf_representation(&self, row: &str) -> Vec { - let mut row_vector = vec![0.0; self.terms.len()]; - let tokens = Self::tokenize(row); - for token in tokens.iter() { - let term_id = self.term_index.get(token).unwrap(); - let term_idf = self.idfs.get(term_id).unwrap(); - row_vector[*term_id] += *term_idf; - } - l2_normalize(& mut row_vector); - return row_vector; + fn tfidf_representation(&self, row: &str) -> Vec { + let mut row_vector = vec![0.0; self.tokens.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; } - + l2_normalize(& mut row_vector); + return row_vector; + } + } -fn main() -> io::Result<()> { - - let args: Vec = 
std::env::args().collect(); - if args.len() != 2 { - eprintln!("Usage: {} ", args[0]); - std::process::exit(1); - } - - let path = Path::new(&args[1]); - let file = File::open(path)?; - let reader = BufReader::new(file); - let mut calculator = TfidfCalculator::new(); - - let mut documents = Vec::new(); - let mut current_document = String::new(); - let mut line_count = 0; - - for line in reader.lines() { - let line = line?; - current_document.push_str(&line); - current_document.push('\n'); - line_count += 1; - if line_count == LINES_PER_DOCUMENT { - documents.push(std::mem::take(&mut current_document)); - line_count = 0; - } - } +pub fn tfidf_benchmark(c: &mut Criterion) { - if !current_document.is_empty() { - documents.push(std::mem::take(&mut current_document)); - } - calculator.process_documents(documents); - calculator.calculate_idf(); - calculator.calculate_tfidf(); - calculator.normalize(); - - let query = "we went over to the whole world"; - let query_vector = calculator.tfidf_representation(query); - for (idx, row_vector) in calculator.row_norms.iter().enumerate() { - let similarity = f64::cosine(query_vector.as_ref(), row_vector.as_ref()); - println!("Similarity for document via simsimd {}: {:?}", idx, similarity); - let plain_similarity = plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()); - println!("Similarity for document via plain cosine similarity {}: {:?}", idx, plain_similarity); - } + let mut tfidf = TfidfCalculator::new(); - Ok(()) + c.bench_function("Tf-idf benchmark", |b| { + b.iter(|| { + + }); + }); +} -} \ No newline at end of file + +criterion_group!(benches, tfidf_benchmark); +criterion_main!(benches); diff --git a/scripts/tfidf.rs b/scripts/tfidf.rs new file mode 100644 index 00000000..60179712 --- /dev/null +++ b/scripts/tfidf.rs @@ -0,0 +1,228 @@ +use std::collections::HashMap; +use std::collections::HashSet; +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +use std::io::{self}; +use std::path::Path; +use regex::Regex; +use simsimd::SpatialSimilarity; +const LINES_PER_DOCUMENT: usize = 1000; + + +fn l2_normalize(v: &mut Vec) { + let mut sum = 0.0; + for i in v.iter() { + sum += i.powi(2); + } + sum = sum.sqrt(); + for element in v.iter_mut() { + *element = *element / sum; + } +} + +#[derive(Debug)] +struct TfidfCalculator { + tokens: Vec, + tokens_index: HashMap, + row_norms: Vec>, + documents: Vec>, + idfs: HashMap, + re: Regex +} + +pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + + // Calculate dot product + let dot_product: f64 = vec1.iter() + .zip(vec2.iter()) + .map(|(a, b)| a * b) + .sum(); + + // Calculate magnitudes + let magnitude1: f64 = vec1.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + let magnitude2: f64 = vec2.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) +} + + + + +impl TfidfCalculator { + fn new() -> Self { + TfidfCalculator { + tokens: Vec::new(), + tokens_index: HashMap::new(), + row_norms: Vec::new(), + documents: Vec::new(), + idfs: HashMap::new(), + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap() + } + } + + fn tokenize(&self, text: &str) -> Vec { + let matches: Vec = self.re.find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches + } + + fn 
process_documents(&mut self, documents: Vec) { + + let mut unique_tokens = HashSet::new(); + let mut document_tokens: Vec> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_tokens.push(tokens); + } + self.tokens = unique_tokens.into_iter().collect(); + self.tokens.sort(); + + for (idx, term) in self.tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); + } + + let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + matrix[row_idx][*term_id] += 1.0; + } + } + self.documents = std::mem::take(&mut matrix); + + + } + + fn calculate_idf(&mut self) { + for term_id in 0..self.tokens.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0.0 { + df += 1; + } + } + let n_documents = self.documents.len(); + let present = df as f64; + let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); + } + } + + fn calculate_tfidf(&mut self) { + let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; + + for (row_idx, document) in self.documents.iter().enumerate() { + for i in 0..self.tokens.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f64; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; + } else { + tfidf_matrix[row_idx][term_id] = tf * idf; + } + } + } + self.row_norms = std::mem::take(&mut tfidf_matrix); + } + + fn normalize(&mut self) { + for row in self.row_norms.iter_mut() { + l2_normalize(row); + } + } + + fn tfidf_representation(&self, row: &str) -> Vec { + let mut row_vector = vec![0.0; self.tokens.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; + } + l2_normalize(& mut row_vector); + return row_vector; + } + + +} + + +fn main() -> io::Result<()> { + + let args: Vec = std::env::args().collect(); + if args.len() != 2 { + eprintln!("Usage: {} ", args[0]); + std::process::exit(1); + } + + let path = Path::new(&args[1]); + let file = File::open(path)?; + let reader = BufReader::new(file); + let mut calculator = TfidfCalculator::new(); + + let mut documents = Vec::new(); + /* + let mut current_document = String::new(); + let mut line_count = 0; + */ + for line in reader.lines() { + documents.push(line.unwrap()); + } + /* + let line = line?; + current_document.push_str(&line); + current_document.push('\n'); + line_count += 1; + if line_count == LINES_PER_DOCUMENT { + documents.push(std::mem::take(&mut current_document)); + line_count = 0; + } + } + + if !current_document.is_empty() { + documents.push(std::mem::take(&mut current_document)); + } + */ + calculator.process_documents(documents); + calculator.calculate_idf(); + calculator.calculate_tfidf(); + calculator.normalize(); + + let query = "we went over to the whole world"; + let query_vector = calculator.tfidf_representation(query); + for (idx, row_vector) in calculator.row_norms.iter().enumerate() { + let similarity = SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()).unwrap(); + 
println!("Similarity for document via simsimd {}: {:?}", idx, 1.0 - similarity); + let plain_similarity = plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()).unwrap(); + println!("Similarity for document via plain cosine similarity {}: {:?}", idx, plain_similarity); + } + + Ok(()) + + +} \ No newline at end of file From 0a5849aa52c258778bb6fca3dff203bc90b1d81d Mon Sep 17 00:00:00 2001 From: Gowind Date: Sun, 8 Dec 2024 01:18:59 +0100 Subject: [PATCH 06/10] Attempt to turn our code into a proper benchmark --- Cargo.toml | 6 +-- scripts/bench_tfidf.rs | 114 ++++++++++++++++++++++++++++++++++++----- scripts/tfidf.rs | 1 - 3 files changed, 102 insertions(+), 19 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 7ea7faa5..97e9ad32 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -31,10 +31,6 @@ path = "scripts/tfidf.rs" cc = "1.0.83" -[[bench]] -name = "cosine" -harness = false -path = "scripts/bench_cosine.rs" [[bench]] name = "sqeuclidean" @@ -43,7 +39,7 @@ path = "scripts/bench_sqeuclidean.rs" [[bench]] name = "tfidf" -harness = false +harness = true path = "scripts/bench_tfidf.rs" [profile.bench] diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index 601b4690..7daa55b1 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -1,8 +1,15 @@ -use std::collections::{HashMap, HashSet}; -use criterion::{criterion_group, criterion_main, Criterion}; +use criterion::{criterion_group, criterion_main, Criterion, black_box}; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +// use std::io::{self}; +use std::path::Path; use regex::Regex; -use simsimd::SpatialSimilarity as SimSIMD; + +use simsimd::SpatialSimilarity; // mod native; @@ -17,6 +24,39 @@ fn l2_normalize(v: &mut Vec) { } } + +pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + + // Calculate dot product + let dot_product: f64 = vec1.iter() + .zip(vec2.iter()) + .map(|(a, b)| a * b) + .sum(); + + // Calculate magnitudes + let magnitude1: f64 = vec1.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + let magnitude2: f64 = vec2.iter() + .map(|x| x.powi(2)) + .sum::() + .sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) +} + #[derive(Debug)] struct TfidfCalculator { tokens: Vec, @@ -133,18 +173,66 @@ impl TfidfCalculator { } -pub fn tfidf_benchmark(c: &mut Criterion) { - let mut tfidf = TfidfCalculator::new(); +fn tfidfsimilarity_benchmark(c: &mut Criterion) { + // Prepare your data and calculator here - c.bench_function("Tf-idf benchmark", |b| { - b.iter(|| { - - }); + let path = Path::new("leipzig10000.txt"); + let file = File::open(path).unwrap(); + let reader = BufReader::new(file); + + + let mut calculator = TfidfCalculator::new(); + + let mut documents = Vec::new(); + for line in reader.lines() { + documents.push(line.unwrap()); + } + + calculator.process_documents(documents); + calculator.calculate_idf(); + calculator.calculate_tfidf(); + calculator.normalize(); + + let mut group = c.benchmark_group("TF-IDF Similarity"); + + let query = "we went over to the whole world"; + let query_vector = calculator.tfidf_representation(query); + + // Ensure the work cannot be optimized away + group.bench_function("SimSIMD Similarity", |b| { + b.iter(|| { + let total_similarity: f64 
= calculator.row_norms.iter() + .map(|row_vector| { + black_box( + SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0) + ) + }) + .sum(); + + // Force use of total_similarity to prevent optimization + black_box(total_similarity); + }) + }); + + group.bench_function("Rust Procedural Similarity", |b| { + b.iter(|| { + let total_similarity: f64 = calculator.row_norms.iter() + .map(|row_vector| { + black_box( + plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0) + ) + }) + .sum(); + + // Force use of total_similarity to prevent optimization + black_box(total_similarity); + }) }); -} - +} -criterion_group!(benches, tfidf_benchmark); -criterion_main!(benches); +criterion_group!(benches, tfidfsimilarity_benchmark); +criterion_main!(benches); \ No newline at end of file diff --git a/scripts/tfidf.rs b/scripts/tfidf.rs index 60179712..26e2dff0 100644 --- a/scripts/tfidf.rs +++ b/scripts/tfidf.rs @@ -7,7 +7,6 @@ use std::io::{self}; use std::path::Path; use regex::Regex; use simsimd::SpatialSimilarity; -const LINES_PER_DOCUMENT: usize = 1000; fn l2_normalize(v: &mut Vec) { From f9ce74029161e00277b3904f10fef82284641dcc Mon Sep 17 00:00:00 2001 From: Gowind Date: Sun, 8 Dec 2024 02:00:37 +0100 Subject: [PATCH 07/10] Make tfidf a proper benchmark --- Cargo.toml | 4 +- rust/lib.rs | 56 +++---- scripts/bench_tfidf.rs | 324 +++++++++++++++++++---------------------- scripts/tfidf.rs | 217 +++++++++++++-------------- 4 files changed, 282 insertions(+), 319 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 97e9ad32..47a9bfd7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -30,8 +30,6 @@ path = "scripts/tfidf.rs" [build-dependencies] cc = "1.0.83" - - [[bench]] name = "sqeuclidean" harness = false @@ -39,7 +37,7 @@ path = "scripts/bench_sqeuclidean.rs" [[bench]] name = "tfidf" -harness = true +harness = false path = "scripts/bench_tfidf.rs" [profile.bench] diff --git a/rust/lib.rs b/rust/lib.rs index 0e4ff7f6..f9be01d0 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -99,8 +99,20 @@ extern "C" { fn simsimd_kl_f32(a: *const f32, b: *const f32, c: usize, d: *mut Distance); fn simsimd_kl_f64(a: *const f64, b: *const f64, c: usize, d: *mut Distance); - fn simsimd_intersect_u16(a: *const u16, b: *const u16, a_length: usize, b_length: usize, d: *mut Distance); - fn simsimd_intersect_u32(a: *const u32, b: *const u32, a_length: usize, b_length: usize, d: *mut Distance); + fn simsimd_intersect_u16( + a: *const u16, + b: *const u16, + a_length: usize, + b_length: usize, + d: *mut Distance, + ); + fn simsimd_intersect_u32( + a: *const u32, + b: *const u32, + a_length: usize, + b_length: usize, + d: *mut Distance, + ); fn simsimd_uses_neon() -> i32; fn simsimd_uses_neon_f16() -> i32; @@ -225,8 +237,8 @@ where fn l2sq(a: &[Self], b: &[Self]) -> Option; /// Computes the Euclidean distance between two slices. - /// The Euclidean distance is the square root of - // sum of the squared differences between corresponding + /// The Euclidean distance is the square root of + // sum of the squared differences between corresponding /// elements of the two slices. fn l2(a: &[Self], b: &[Self]) -> Option; @@ -238,8 +250,8 @@ where } /// Computes the Euclidean distance between two slices. - /// The Euclidean distance is the square root of the - /// sum of the squared differences between corresponding + /// The Euclidean distance is the square root of the + /// sum of the squared differences between corresponding /// elements of the two slices. 
fn euclidean(a: &[Self], b: &[Self]) -> Option { SpatialSimilarity::l2(a, b) @@ -256,9 +268,7 @@ where /// The cosine similarity is a measure of similarity between two non-zero vectors /// of an dot product space that measures the cosine of the angle between them. fn cosine(a: &[Self], b: &[Self]) -> Option { - SpatialSimilarity::cos(a, b) - } } @@ -395,25 +405,21 @@ impl SpatialSimilarity for i8 { } impl Sparse for u16 { - fn intersect(a: &[Self], b: &[Self]) -> Option { let mut distance_value: Distance = 0.0; let distance_ptr: *mut Distance = &mut distance_value as *mut Distance; unsafe { simsimd_intersect_u16(a.as_ptr(), b.as_ptr(), a.len(), b.len(), distance_ptr) }; Some(distance_value) } - } impl Sparse for u32 { - fn intersect(a: &[Self], b: &[Self]) -> Option { let mut distance_value: Distance = 0.0; let distance_ptr: *mut Distance = &mut distance_value as *mut Distance; unsafe { simsimd_intersect_u32(a.as_ptr(), b.as_ptr(), a.len(), b.len(), distance_ptr) }; Some(distance_value) } - } impl SpatialSimilarity for f16 { @@ -460,7 +466,6 @@ impl SpatialSimilarity for f16 { } fn l2(a: &[Self], b: &[Self]) -> Option { - if a.len() != b.len() { return None; } @@ -995,7 +1000,6 @@ mod tests { .map(|&x| HalfF16::from_f32(x)) .collect(); - let a_simsimd: &[f16] = unsafe { std::slice::from_raw_parts(a_half.as_ptr() as *const f16, a_half.len()) }; let b_simsimd: &[f16] = @@ -1005,7 +1009,6 @@ mod tests { println!("The result of l2_f16 is {:.8}", result); assert_almost_equal(5.2, result, 0.01); } - } #[test] @@ -1162,8 +1165,8 @@ mod tests { #[test] fn test_intersect_u16() { { - let a_u16: &[u16] = &[153, 16384, 17408]; - let b_u16: &[u16] = &[15360, 16384, 7408]; + let a_u16: &[u16] = &[153, 16384, 17408]; + let b_u16: &[u16] = &[15360, 16384, 7408]; if let Some(result) = Sparse::intersect(a_u16, b_u16) { println!("The result of intersect_u16 is {:.8}", result); @@ -1172,37 +1175,36 @@ mod tests { } { - let a_u16: &[u16] = &[153, 11638, 08]; - let b_u16: &[u16] = &[15360, 16384, 7408]; + let a_u16: &[u16] = &[153, 11638, 08]; + let b_u16: &[u16] = &[15360, 16384, 7408]; if let Some(result) = Sparse::intersect(a_u16, b_u16) { println!("The result of intersect_u16 is {:.8}", result); assert_almost_equal(0.0, result, 0.0001); - } + } } } #[test] fn test_intersect_u32() { { - let a_u32: &[u32] = &[11, 153]; - let b_u32: &[u32] = &[11, 153, 7408, 16384]; + let a_u32: &[u32] = &[11, 153]; + let b_u32: &[u32] = &[11, 153, 7408, 16384]; if let Some(result) = Sparse::intersect(a_u32, b_u32) { println!("The result of intersect_u32 is {:.8}", result); assert_almost_equal(2.0, result, 0.0001); } } - + { - let a_u32: &[u32] = &[153, 7408, 11638]; - let b_u32: &[u32] = &[153, 7408, 11638]; + let a_u32: &[u32] = &[153, 7408, 11638]; + let b_u32: &[u32] = &[153, 7408, 11638]; if let Some(result) = Sparse::intersect(a_u32, b_u32) { println!("The result of intersect_u32 is {:.8}", result); assert_almost_equal(3.0, result, 0.0001); - } + } } - } } diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index 7daa55b1..12b3a87b 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -1,238 +1,212 @@ - -use criterion::{criterion_group, criterion_main, Criterion, black_box}; +use criterion::BenchmarkId; +use criterion::{black_box, criterion_group, criterion_main, Criterion}; use std::collections::HashMap; use std::collections::HashSet; use std::fs::File; use std::io::BufRead; use std::io::BufReader; // use std::io::{self}; -use std::path::Path; use regex::Regex; +use std::path::Path; use 
simsimd::SpatialSimilarity; -// mod native; - fn l2_normalize(v: &mut Vec) { - let mut sum = 0.0; - for i in v.iter() { - sum += i.powi(2); - } - sum = sum.sqrt(); - for element in v.iter_mut() { - *element = *element / sum; - } + let mut sum = 0.0; + for i in v.iter() { + sum += i.powi(2); + } + sum = sum.sqrt(); + for element in v.iter_mut() { + *element = *element / sum; + } } - pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { - // Check if vectors have the same length - if vec1.len() != vec2.len() { - return None; - } - - // Calculate dot product - let dot_product: f64 = vec1.iter() - .zip(vec2.iter()) - .map(|(a, b)| a * b) - .sum(); - - // Calculate magnitudes - let magnitude1: f64 = vec1.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - let magnitude2: f64 = vec2.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - // Prevent division by zero - if magnitude1 == 0.0 || magnitude2 == 0.0 { - return None; - } - - // Calculate cosine similarity - Some(dot_product / (magnitude1 * magnitude2)) + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + + // Calculate dot product + let dot_product: f64 = vec1.iter().zip(vec2.iter()).map(|(a, b)| a * b).sum(); + + // Calculate magnitudes + let magnitude1: f64 = vec1.iter().map(|x| x.powi(2)).sum::().sqrt(); + + let magnitude2: f64 = vec2.iter().map(|x| x.powi(2)).sum::().sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) } #[derive(Debug)] struct TfidfCalculator { - tokens: Vec, - tokens_index: HashMap, - row_norms: Vec>, - documents: Vec>, - idfs: HashMap, - re: Regex + tokens: Vec, + tokens_index: HashMap, + row_norms: Vec>, + documents: Vec>, + idfs: HashMap, + re: Regex, } impl TfidfCalculator { - fn new() -> Self { - TfidfCalculator { - tokens: Vec::new(), - tokens_index: HashMap::new(), - row_norms: Vec::new(), - documents: Vec::new(), - idfs: HashMap::new(), - re: Regex::new(r"(?U)\b\w{2,}\b").unwrap() - } - } - - fn tokenize(&self, text: &str) -> Vec { - let matches: Vec = self.re.find_iter(text.to_lowercase().as_str()) - .map(|mat| mat.as_str().to_string()) - .collect(); - matches - } - - fn process_documents(&mut self, documents: Vec) { - - let mut unique_tokens = HashSet::new(); - let mut document_tokens: Vec> = Vec::new(); - for document in &documents { - let tokens = self.tokenize(document); - for token in tokens.iter() { - unique_tokens.insert(token.clone()); - } - document_tokens.push(tokens); + fn new() -> Self { + TfidfCalculator { + tokens: Vec::new(), + tokens_index: HashMap::new(), + row_norms: Vec::new(), + documents: Vec::new(), + idfs: HashMap::new(), + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), + } } - self.tokens = unique_tokens.into_iter().collect(); - self.tokens.sort(); - for (idx, term) in self.tokens.iter().enumerate() { - self.tokens_index.insert(term.clone(), idx); - } - - let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; - // I have count of terms in each document - for (row_idx, document) in document_tokens.iter().enumerate() { - for token in document.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - matrix[row_idx][*term_id] += 1.0; - } + fn tokenize(&self, text: &str) -> Vec { + let matches: Vec = self + .re + .find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches } - self.documents = std::mem::take(&mut matrix); - - } - - 
fn calculate_idf(&mut self) { - for term_id in 0..self.tokens.len() { - let mut df = 0; - for document in self.documents.iter() { - if document[term_id] > 0.0 { - df += 1; + fn process_documents(&mut self, documents: Vec) { + let mut unique_tokens = HashSet::new(); + let mut document_tokens: Vec> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_tokens.push(tokens); } - } - let n_documents = self.documents.len(); - let present = df as f64; - let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; - self.idfs.insert(term_id, idf); - } - } - - fn calculate_tfidf(&mut self) { - let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; - - for (row_idx, document) in self.documents.iter().enumerate() { - for i in 0..self.tokens.len() { - let term_id = i; - let term_frequency = document[term_id]; - let tf = term_frequency as f64; - let idf = self.idfs.get(&term_id).unwrap(); - if term_frequency == 0.0 { - tfidf_matrix[row_idx][term_id] = 0.0; - } else { - tfidf_matrix[row_idx][term_id] = tf * idf; - } + self.tokens = unique_tokens.into_iter().collect(); + self.tokens.sort(); + + for (idx, term) in self.tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); } - } - self.row_norms = std::mem::take(&mut tfidf_matrix); - } - fn normalize(&mut self) { - for row in self.row_norms.iter_mut() { - l2_normalize(row); + let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + matrix[row_idx][*term_id] += 1.0; + } + } + self.documents = std::mem::take(&mut matrix); } - } - - fn tfidf_representation(&self, row: &str) -> Vec { - let mut row_vector = vec![0.0; self.tokens.len()]; - let tokens = self.tokenize(row); - for token in tokens.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - let term_idf = self.idfs.get(term_id).unwrap(); - row_vector[*term_id] += *term_idf; + + fn calculate_idf(&mut self) { + for term_id in 0..self.tokens.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0.0 { + df += 1; + } + } + let n_documents = self.documents.len(); + let present = df as f64; + let idf = (((n_documents + 1) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); + } } - l2_normalize(& mut row_vector); - return row_vector; - } - -} + fn calculate_tfidf(&mut self) { + let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; + + for (row_idx, document) in self.documents.iter().enumerate() { + for i in 0..self.tokens.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f64; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; + } else { + tfidf_matrix[row_idx][term_id] = tf * idf; + } + } + } + self.row_norms = std::mem::take(&mut tfidf_matrix); + } + fn normalize(&mut self) { + for row in self.row_norms.iter_mut() { + l2_normalize(row); + } + } + fn tfidf_representation(&self, row: &str) -> Vec { + let mut row_vector = vec![0.0; self.tokens.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + 
row_vector[*term_id] += *term_idf; + } + l2_normalize(&mut row_vector); + return row_vector; + } +} -fn tfidfsimilarity_benchmark(c: &mut Criterion) { +pub fn tfidfsimilarity_benchmark(c: &mut Criterion) { // Prepare your data and calculator here - + let path = Path::new("leipzig10000.txt"); let file = File::open(path).unwrap(); let reader = BufReader::new(file); - let mut calculator = TfidfCalculator::new(); - + let mut documents = Vec::new(); for line in reader.lines() { documents.push(line.unwrap()); } - + calculator.process_documents(documents); calculator.calculate_idf(); calculator.calculate_tfidf(); calculator.normalize(); let mut group = c.benchmark_group("TF-IDF Similarity"); - + let query = "we went over to the whole world"; let query_vector = calculator.tfidf_representation(query); - // Ensure the work cannot be optimized away - group.bench_function("SimSIMD Similarity", |b| { + group.sample_size(10); + + group.bench_function("SimSIMD Cosine Similarity", |b| { b.iter(|| { - let total_similarity: f64 = calculator.row_norms.iter() - .map(|row_vector| { - black_box( - SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) - .unwrap_or(0.0) - ) - }) - .sum(); - - // Force use of total_similarity to prevent optimization - black_box(total_similarity); + let total_similarity: f64 = calculator.row_norms.iter().map(|row_vector| { + black_box( + SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }).sum(); + black_box(total_similarity) }) }); - group.bench_function("Rust Procedural Similarity", |b| { + group.bench_function("Rust plain Cosine similarity", |b| { b.iter(|| { - let total_similarity: f64 = calculator.row_norms.iter() - .map(|row_vector| { - black_box( - plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) - .unwrap_or(0.0) - ) - }) - .sum(); - - // Force use of total_similarity to prevent optimization - black_box(total_similarity); + let total_similarity: f64 = calculator.row_norms.iter().map(|row_vector| { + black_box( + plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }).sum(); + black_box(total_similarity) }) }); - } criterion_group!(benches, tfidfsimilarity_benchmark); -criterion_main!(benches); \ No newline at end of file +criterion_main!(benches); diff --git a/scripts/tfidf.rs b/scripts/tfidf.rs index 26e2dff0..53d7eacf 100644 --- a/scripts/tfidf.rs +++ b/scripts/tfidf.rs @@ -1,3 +1,5 @@ +use regex::Regex; +use simsimd::SpatialSimilarity; use std::collections::HashMap; use std::collections::HashSet; use std::fs::File; @@ -5,9 +7,6 @@ use std::io::BufRead; use std::io::BufReader; use std::io::{self}; use std::path::Path; -use regex::Regex; -use simsimd::SpatialSimilarity; - fn l2_normalize(v: &mut Vec) { let mut sum = 0.0; @@ -27,43 +26,31 @@ struct TfidfCalculator { row_norms: Vec>, documents: Vec>, idfs: HashMap, - re: Regex + re: Regex, } pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option { - // Check if vectors have the same length - if vec1.len() != vec2.len() { - return None; - } - - // Calculate dot product - let dot_product: f64 = vec1.iter() - .zip(vec2.iter()) - .map(|(a, b)| a * b) - .sum(); - - // Calculate magnitudes - let magnitude1: f64 = vec1.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - let magnitude2: f64 = vec2.iter() - .map(|x| x.powi(2)) - .sum::() - .sqrt(); - - // Prevent division by zero - if magnitude1 == 0.0 || magnitude2 == 0.0 { - return None; - } - - // Calculate cosine similarity - Some(dot_product / 
(magnitude1 * magnitude2)) -} + // Check if vectors have the same length + if vec1.len() != vec2.len() { + return None; + } + // Calculate dot product + let dot_product: f64 = vec1.iter().zip(vec2.iter()).map(|(a, b)| a * b).sum(); + // Calculate magnitudes + let magnitude1: f64 = vec1.iter().map(|x| x.powi(2)).sum::().sqrt(); + let magnitude2: f64 = vec2.iter().map(|x| x.powi(2)).sum::().sqrt(); + + // Prevent division by zero + if magnitude1 == 0.0 || magnitude2 == 0.0 { + return None; + } + + // Calculate cosine similarity + Some(dot_product / (magnitude1 * magnitude2)) +} impl TfidfCalculator { fn new() -> Self { @@ -73,106 +60,101 @@ impl TfidfCalculator { row_norms: Vec::new(), documents: Vec::new(), idfs: HashMap::new(), - re: Regex::new(r"(?U)\b\w{2,}\b").unwrap() + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), } } fn tokenize(&self, text: &str) -> Vec { - let matches: Vec = self.re.find_iter(text.to_lowercase().as_str()) - .map(|mat| mat.as_str().to_string()) - .collect(); - matches + let matches: Vec = self + .re + .find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches } fn process_documents(&mut self, documents: Vec) { - - let mut unique_tokens = HashSet::new(); - let mut document_tokens: Vec> = Vec::new(); - for document in &documents { - let tokens = self.tokenize(document); - for token in tokens.iter() { - unique_tokens.insert(token.clone()); + let mut unique_tokens = HashSet::new(); + let mut document_tokens: Vec> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_tokens.push(tokens); } - document_tokens.push(tokens); - } - self.tokens = unique_tokens.into_iter().collect(); - self.tokens.sort(); - - for (idx, term) in self.tokens.iter().enumerate() { - self.tokens_index.insert(term.clone(), idx); - } - - let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; - // I have count of terms in each document - for (row_idx, document) in document_tokens.iter().enumerate() { - for token in document.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - matrix[row_idx][*term_id] += 1.0; + self.tokens = unique_tokens.into_iter().collect(); + self.tokens.sort(); + + for (idx, term) in self.tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); } - } - self.documents = std::mem::take(&mut matrix); - + let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + matrix[row_idx][*term_id] += 1.0; + } + } + self.documents = std::mem::take(&mut matrix); } fn calculate_idf(&mut self) { - for term_id in 0..self.tokens.len() { - let mut df = 0; - for document in self.documents.iter() { - if document[term_id] > 0.0 { - df += 1; - } + for term_id in 0..self.tokens.len() { + let mut df = 0; + for document in self.documents.iter() { + if document[term_id] > 0.0 { + df += 1; + } + } + let n_documents = self.documents.len(); + let present = df as f64; + let idf = (((n_documents + 1) as f64) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); } - let n_documents = self.documents.len(); - let present = df as f64; - let idf = ((( n_documents + 1 ) as f64) / (present + 1.0)).ln() + 1.0; - self.idfs.insert(term_id, idf); - } } fn calculate_tfidf(&mut self) { let mut tfidf_matrix 
= vec![vec![0.0; self.tokens.len()]; self.documents.len()]; for (row_idx, document) in self.documents.iter().enumerate() { - for i in 0..self.tokens.len() { - let term_id = i; - let term_frequency = document[term_id]; - let tf = term_frequency as f64; - let idf = self.idfs.get(&term_id).unwrap(); - if term_frequency == 0.0 { - tfidf_matrix[row_idx][term_id] = 0.0; - } else { - tfidf_matrix[row_idx][term_id] = tf * idf; + for i in 0..self.tokens.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f64; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; + } else { + tfidf_matrix[row_idx][term_id] = tf * idf; + } } - } } self.row_norms = std::mem::take(&mut tfidf_matrix); } fn normalize(&mut self) { - for row in self.row_norms.iter_mut() { - l2_normalize(row); - } + for row in self.row_norms.iter_mut() { + l2_normalize(row); + } } fn tfidf_representation(&self, row: &str) -> Vec<f64> { - let mut row_vector = vec![0.0; self.tokens.len()]; - let tokens = self.tokenize(row); - for token in tokens.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - let term_idf = self.idfs.get(term_id).unwrap(); - row_vector[*term_id] += *term_idf; - } - l2_normalize(& mut row_vector); - return row_vector; - } - - + let mut row_vector = vec![0.0; self.tokens.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; + } + l2_normalize(&mut row_vector); + return row_vector; + } } - fn main() -> io::Result<()> { - let args: Vec<String> = std::env::args().collect(); if args.len() != 2 { eprintln!("Usage: {} <file_path>", args[0]); @@ -185,14 +167,14 @@ fn main() -> io::Result<()> { let mut calculator = TfidfCalculator::new(); let mut documents = Vec::new(); - /* + /* let mut current_document = String::new(); let mut line_count = 0; */ for line in reader.lines() { - documents.push(line.unwrap()); + documents.push(line.unwrap()); } - /* + /* let line = line?; current_document.push_str(&line); current_document.push('\n'); @@ -211,17 +193,24 @@ fn main() -> io::Result<()> { calculator.calculate_idf(); calculator.calculate_tfidf(); calculator.normalize(); - + let query = "we went over to the whole world"; let query_vector = calculator.tfidf_representation(query); for (idx, row_vector) in calculator.row_norms.iter().enumerate() { - let similarity = SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()).unwrap(); - println!("Similarity for document via simsimd {}: {:?}", idx, 1.0 - similarity); - let plain_similarity = plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()).unwrap(); - println!("Similarity for document via plain cosine similarity {}: {:?}", idx, plain_similarity); + let similarity = + SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()).unwrap(); + println!( + "Similarity for document via simsimd {}: {:?}", + idx, + 1.0 - similarity + ); + let plain_similarity = + plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()).unwrap(); + println!( + "Similarity for document via plain cosine similarity {}: {:?}", + idx, plain_similarity + ); } - - Ok(()) - -} \ No newline at end of file + Ok(()) +} From a487d02cbaf6334a54ab8c9419b7db01e9dfa65a Mon Sep 17 00:00:00 2001 From: Gowind Date: Thu, 2 Jan 2025 16:43:31 +0100 Subject: [PATCH 08/10] Add NEON implementation for SPDOT --- Cargo.toml | 5 ++ build.rs | 1
+ c/lib.c | 41 ++++++++++ include/simsimd/simsimd.h | 12 +++ include/simsimd/sparse.h | 161 ++++++++++++++++++++++++++++++++++++-- rust/lib.rs | 39 ++++++++- scripts/bench_tfidf.rs | 34 +++++--- 7 files changed, 271 insertions(+), 22 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 47a9bfd7..0edff2e1 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -40,6 +40,11 @@ name = "tfidf" harness = false path = "scripts/bench_tfidf.rs" +[[bench]] +name = "spdot" +harness = false +path = "scripts/bench_spdot.rs" + [profile.bench] opt-level = 3 # Corresponds to -O3 lto = true # Enables Link Time Optimization for further optimizations diff --git a/build.rs b/build.rs index 0a65c28c..be763957 100644 --- a/build.rs +++ b/build.rs @@ -59,6 +59,7 @@ fn main() { println!("cargo:rerun-if-changed=include/simsimd/dot.h"); println!("cargo:rerun-if-changed=include/simsimd/spatial.h"); + println!("cargo:rerun-if-changed=include/simsimd/sparse.h"); println!("cargo:rerun-if-changed=include/simsimd/probability.h"); println!("cargo:rerun-if-changed=include/simsimd/binary.h"); println!("cargo:rerun-if-changed=include/simsimd/types.h"); diff --git a/c/lib.c b/c/lib.c index 42872632..cadfe460 100644 --- a/c/lib.c +++ b/c/lib.c @@ -96,6 +96,45 @@ extern "C" { metric(a, b, a_length, b_length, result); \ } +#define SIMSIMD_DECLARATION_SPARSE(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_size_t a_length, simsimd_size_t b_length, \ + simsimd_distance_t *result) { \ + static simsimd_metric_sparse_punned_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \ + simsimd_capabilities(), simsimd_cap_any_k, \ + (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, a_length, b_length, result); \ + } + +#define SIMSIMD_DECLARATION_SPARSE_DOT(name, extension, type) \ + SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ + simsimd_##extension##_t const *a_weights, \ + simsimd_##extension##_t const *b_weights, \ + simsimd_size_t a_length, simsimd_size_t b_length, \ + simsimd_distance_t *result) { \ + static simsimd_metric_sparse_weight_punned_t metric = 0; \ + if (metric == 0) { \ + simsimd_capability_t used_capability; \ + simsimd_find_kernel_punned(simsimd_metric_##name##_k, simsimd_datatype_##extension##_k, \ + simsimd_capabilities(), simsimd_cap_any_k, \ + (simsimd_kernel_punned_t *)(&metric), &used_capability); \ + if (!metric) { \ + *(simsimd_u64_t *)result = 0x7FF0000000000001ull; \ + return; \ + } \ + } \ + metric(a, b, a_weights, b_weights, a_length, b_length, result); \ + } + + #define SIMSIMD_DECLARATION_CURVED(name, extension, type) \ SIMSIMD_DYNAMIC void simsimd_##name##_##extension(simsimd_##type##_t const *a, simsimd_##type##_t const *b, \ simsimd_##type##_t const *c, simsimd_size_t n, \ @@ -195,6 +234,8 @@ SIMSIMD_DECLARATION_DENSE(js, f64, f64) // Sparse sets SIMSIMD_DECLARATION_SPARSE(intersect, u16, u16) SIMSIMD_DECLARATION_SPARSE(intersect, u32, u32) +// Sparse dot +SIMSIMD_DECLARATION_SPARSE_DOT(spdot_weights, f32, u16) // Curved spaces SIMSIMD_DECLARATION_CURVED(bilinear, f64, f64) diff --git a/include/simsimd/simsimd.h b/include/simsimd/simsimd.h index 2eb4f805..cc4c746c 100644 --- a/include/simsimd/simsimd.h +++ 
b/include/simsimd/simsimd.h @@ -259,6 +259,12 @@ typedef void (*simsimd_metric_sparse_punned_t)(void const *a, void const *b, simsimd_size_t a_length, simsimd_size_t b_length, // simsimd_distance_t *d); + +typedef void (*simsimd_metric_sparse_weight_punned_t)(void const *a, void const *b, // + void const *a_weights, void const *b_weights, // + simsimd_size_t a_length, simsimd_size_t b_length, // + simsimd_distance_t *d); + /** * @brief Type-punned function pointer for curved vector spaces and similarity measures. * @@ -616,6 +622,8 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, si case simsimd_metric_kl_k: *m = (m_t)&simsimd_kl_f32_neon, *c = simsimd_cap_neon_k; return; case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_neon, *c = simsimd_cap_neon_k; return; case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_neon, *c = simsimd_cap_neon_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_f32_neon, *c = simsimd_cap_serial_k; return; + default: break; } #endif @@ -658,6 +666,7 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_f32(simsimd_capability_t v, si case simsimd_metric_mahalanobis_k: *m = (m_t)&simsimd_mahalanobis_f32_serial, *c = simsimd_cap_serial_k; return; case simsimd_metric_fma_k: *m = (m_t)&simsimd_fma_f32_serial, *c = simsimd_cap_serial_k; return; case simsimd_metric_wsum_k: *m = (m_t)&simsimd_wsum_f32_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_serial, *c = simsimd_cap_serial_k; return; default: break; } } @@ -1122,6 +1131,7 @@ SIMSIMD_INTERNAL void _simsimd_find_kernel_punned_u16(simsimd_capability_t v, si #endif if (v & simsimd_cap_serial_k) switch (k) { case simsimd_metric_intersect_k: *m = (m_t)&simsimd_intersect_u16_serial, *c = simsimd_cap_serial_k; return; + case simsimd_metric_spdot_weights_k: *m = (m_t)&simsimd_spdot_weights_u16_serial, *c = simsimd_cap_serial_k; return; default: break; } } @@ -2087,6 +2097,8 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16(simsimd_u16_t const *a, simsimd_u1 simsimd_spdot_weights_u16_sve2(a, b, a_weights, b_weights, a_length, b_length, d); #elif SIMSIMD_TARGET_TURIN simsimd_spdot_weights_u16_turin(a, b, a_weights, b_weights, a_length, b_length, d); +// #elif SIMSIMD_TARGET_NEON +// simsimd_spdot_weights_u16_neon(a, b, a_weights, b_weights, a_length, b_length, d); #else simsimd_spdot_weights_u16_serial(a, b, a_weights, b_weights, a_length, b_length, d); #endif diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h index 493828bb..f3a5e8e8 100644 --- a/include/simsimd/sparse.h +++ b/include/simsimd/sparse.h @@ -49,6 +49,8 @@ #define SIMSIMD_SPARSE_H #include "types.h" +#include <stdio.h> +#include <arm_neon.h> #ifdef __cplusplus extern "C" { @@ -72,7 +74,7 @@ SIMSIMD_PUBLIC void simsimd_spdot_counts_u16_serial( // simsimd_distance_t *results); SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_serial( // simsimd_u16_t const *a, simsimd_u16_t const *b, // - simsimd_bf16_t const *a_weights, simsimd_bf16_t const *b_weights, // + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, // simsimd_size_t a_length, simsimd_size_t b_length, // simsimd_distance_t *results); @@ -182,18 +184,18 @@ SIMSIMD_MAKE_INTERSECT_LINEAR(accurate, u32, size) // simsimd_intersect_u32_accu simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *results) { \ simsimd_##counter_type##_t intersection_size = 0; \ simsimd_##accumulator_type##_t weights_product = 0; \ - simsimd_size_t i = 0, j = 0; \
simsimd_size_t i = 0, j = 0; \ while (i != a_length && j != b_length) { \ simsimd_##input_type##_t ai = a[i]; \ simsimd_##input_type##_t bj = b[j]; \ int matches = ai == bj; \ simsimd_##counter_type##_t awi = load_and_convert(a_weights + i); \ - simsimd_##counter_type##_t bwi = load_and_convert(b_weights + i); \ - weights_product += matches * awi * bwi; \ + simsimd_##counter_type##_t bwi = load_and_convert(b_weights + j); \ + weights_product += matches * awi * bwi; \ intersection_size += matches; \ i += ai < bj; \ j += ai >= bj; \ - } \ + } \ results[0] = intersection_size; \ results[1] = weights_product; \ } @@ -255,8 +257,9 @@ SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u16, size) // simsimd_intersect_u16_ser SIMSIMD_MAKE_INTERSECT_GALLOPING(serial, u32, size) // simsimd_intersect_u32_serial SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_counts, u16, size, i16, i32, SIMSIMD_DEREFERENCE) // simsimd_spdot_counts_u16_serial -SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, size, bf16, f32, - SIMSIMD_BF16_TO_F32) // simsimd_spdot_weights_u16_serial +SIMSIMD_MAKE_INTERSECT_WEIGHTED(serial, spdot_weights, u16, f64, f32, f32, + SIMSIMD_DEREFERENCE) // simsimd_spdot_weights_u16_serial + /* The AVX-512 implementations are inspired by the "Faster-Than-Native Alternatives * for x86 VP2INTERSECT Instructions" paper by Guille Diez-Canas, 2022. @@ -1059,6 +1062,150 @@ SIMSIMD_PUBLIC void simsimd_intersect_u32_neon( // *results += vaddvq_u32(c_counts_vec.u32x4); } +inline uint8_t skips(uint16x8_t indexes, uint16_t max) { + uint16x8_t smaller_elements = vcleq_u16(indexes, vdupq_n_u16(max)); + uint8x8_t high_bits = vshrn_n_u16(smaller_elements, 8); + uint64_t results = vreinterpret_u64_u8(high_bits); + int leading_zeroes = __builtin_clzl(results); + return leading_zeroes / 8; +} + +void process_remaining( + simsimd_u16_t const *a, simsimd_u16_t const *b, + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, + simsimd_size_t a_length, simsimd_size_t b_length, + simsimd_distance_t *results) { + + simsimd_u16_t const * a_end = a + a_length; + simsimd_u16_t const * b_end = b + b_length; + simsimd_distance_t intersection_size = results[0]; + simsimd_distance_t total_product = results[1]; + while(a < a_end && b < b_end) { + if(*a == *b) { + total_product += (*a_weights * *b_weights); + a += 1; b += 1; a_weights += 1; b_weights += 1; + intersection_size += 1; + } + if(*a < *b) { a += 1; a_weights += 1;} + if(*b < *a) { b += 1; b_weights += 1;} + } + results[0] = intersection_size; + results[1] = total_product; +} + +#define EXTRACT_LANE_U16(vec, lane) \ + ((lane) == 0 ? vgetq_lane_u16(vec, 0) : \ + (lane) == 1 ? vgetq_lane_u16(vec, 1) : \ + (lane) == 2 ? vgetq_lane_u16(vec, 2) : \ + (lane) == 3 ? vgetq_lane_u16(vec, 3) : \ + (lane) == 4 ? vgetq_lane_u16(vec, 4) : \ + (lane) == 5 ? vgetq_lane_u16(vec, 5) : \ + (lane) == 6 ? vgetq_lane_u16(vec, 6) : \ + vgetq_lane_u16(vec, 7)) + + + +#define EXTRACT_LANE_F32(vec, lane) \ + ((lane) == 0 ? vgetq_lane_f32(vec, 0) : \ + (lane) == 1 ? vgetq_lane_f32(vec, 1) : \ + (lane) == 2 ? vgetq_lane_f32(vec, 2) : \ + vgetq_lane_f32(vec, 3)) + +#define EXT_LANE_U16(vec, lane) \ + ((lane) == 0 ? vgetq_lane_u16(vec, 0) : \ + (lane) == 1 ? vgetq_lane_u16(vec, 1) : \ + (lane) == 2 ? vgetq_lane_u16(vec, 2) : \ + (lane) == 3 ? vgetq_lane_u16(vec, 3) : \ + (lane) == 4 ? vgetq_lane_u16(vec, 4) : \ + (lane) == 5 ? vgetq_lane_u16(vec, 5) : \ + (lane) == 6 ? 
vgetq_lane_u16(vec, 6) : \ + vgetq_lane_u16(vec, 7)) + +SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( + simsimd_u16_t const *a, simsimd_u16_t const *b, + simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, + simsimd_size_t a_length, simsimd_size_t b_length, + simsimd_distance_t *results) { + + + const simsimd_size_t register_size = 8; + simsimd_size_t a_idx = 0, b_idx = 0; + simsimd_size_t intersection_size = 0; + float total_product = 0.0f; + + #ifndef EXT_LANE + #define EXT_LANE(vec, lane) vget_lane_u16(vec, lane) + #endif + #ifndef EXT_F_LANE + #define EXT_F_LANE(vec, lane) vgetq_lane_f32(vec, lane) + #endif + simsimd_u16_t const * a_end = a + a_length; + simsimd_u16_t const * b_end = b + b_length; + // printf("conditions %d %d and ends %p %p %p \n", (a + a_idx + 8 < a_end), (b + b_idx + 8 < b_end), a_end, b, b_end); + while((a + a_idx + 8) < a_end && ( b + b_idx + 8) < b_end) { + + uint16x8_t a_vec, b_vec; + a_vec = vld1q_u16(a + a_idx) ; + b_vec = vld1q_u16(b + b_idx) ; + + uint16_t a_min = vgetq_lane_u16(a_vec, 0); + uint16_t a_max = vgetq_lane_u16(a_vec, 7); + uint16_t b_min = vgetq_lane_u16(b_vec, 0); + uint16_t b_max = vgetq_lane_u16(b_vec, 7); + while(a_max < b_min && a + 16 < a_end) { + a += 8; + a_vec = vld1q_u16(a); + a_max = vgetq_lane_u16(a_vec, 7); + } + a_min = vgetq_lane_u16(a_vec, 0); + while(b_max < a_min && b + 16 < a_end) { + b += 8; + b_vec = vld1q_u16(b); + b_max = vgetq_lane_u16(b_vec, 7); + } + a_min = vgetq_lane_u16(b_vec, 0); + + float32x4_t a_weights_vec_low = vld1q_f32(a_weights + a_idx); + float32x4_t a_weights_vec_high = vld1q_f32(a_weights + a_idx + 4); + + float32x4_t b_weights_vec_low = vld1q_f32(b_weights + b_idx); + float32x4_t b_weights_vec_high = vld1q_f32(b_weights + b_idx + 4); + float32x4_t zeroes = vdupq_n_f32(0.0); + for (int lane = 0; lane < register_size; lane++) { + uint16_t elem = EXT_LANE_U16(b_vec, lane); + uint16x8_t compare_vec = vceqq_u16(a_vec, vdupq_n_u16(elem)); + uint16_t has_matches = vaddvq_u16(compare_vec); + if(has_matches == 0) { + continue; + } + uint16x4_t matches_low = vget_low_u16(compare_vec); + uint16x4_t matches_high = vget_high_u16(compare_vec); + uint32x4_t masks_low = vmovl_s16(matches_low); + uint32x4_t masks_high = vmovl_s16(matches_high); + float b_weight = lane < 4 ? 
+ EXTRACT_LANE_F32(b_weights_vec_low, lane) : + EXTRACT_LANE_F32(b_weights_vec_high, lane - 4); + float32x4_t b_weights = vdupq_n_f32(b_weight); + float32x4_t bl = vbslq_f32(masks_low, b_weights, zeroes); + float32x4_t bh = vbslq_f32(masks_high, b_weights, zeroes); + float32x4_t weights_product_low = vmulq_f32(bl, a_weights_vec_low); + float32x4_t weights_product_high = vmulq_f32(bh, a_weights_vec_high); + total_product += vaddvq_f32(weights_product_low); + total_product += vaddvq_f32(weights_product_high); + uint16_t upper_intersections = vaddv_u16(vshr_n_u16(matches_low, 15)); + uint16_t lower_intersections = vaddv_u16(vshr_n_u16(matches_high, 15)); + intersection_size += upper_intersections; + intersection_size += lower_intersections; + } + uint8_t askips = skips(a_vec, b_max); + a_idx += 8 - askips; + uint8_t bskips = skips(b_vec, a_max); + b_idx += 8 - bskips; + } + + process_remaining(a, b,a_weights,b_weights, a_length, b_length, results); + +} #pragma clang attribute pop #pragma GCC pop_options #endif // SIMSIMD_TARGET_NEON diff --git a/rust/lib.rs b/rust/lib.rs index f9be01d0..f0695dc4 100644 --- a/rust/lib.rs +++ b/rust/lib.rs @@ -82,7 +82,7 @@ extern "C" { fn simsimd_l2_i8(a: *const i8, b: *const i8, c: usize, d: *mut Distance); fn simsimd_l2_f16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); - fn simsimd_l2_bf16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); + // fn simsimd_l2_bf16(a: *const u16, b: *const u16, c: usize, d: *mut Distance); fn simsimd_l2_f32(a: *const f32, b: *const f32, c: usize, d: *mut Distance); fn simsimd_l2_f64(a: *const f64, b: *const f64, c: usize, d: *mut Distance); @@ -114,6 +114,16 @@ extern "C" { d: *mut Distance, ); + fn simsimd_spdot_weights_f32( + a: *const u16, + b: *const u16, + a_weights: *const f32, + b_weights: *const f32, + a_length: usize, + b_length: usize, + intersection: *mut f64, + ); + fn simsimd_uses_neon() -> i32; fn simsimd_uses_neon_f16() -> i32; fn simsimd_uses_neon_bf16() -> i32; @@ -139,7 +149,8 @@ impl f16 {} /// A half-precision floating point number, called brain float. 
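/// (In this binding it is a transparent wrapper over the raw `u16` bit pattern, so slices of it can be handed straight to the C kernels that take `*const u16`.)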
#[repr(transparent)] -pub struct bf16(u16); +#[derive(Debug, Clone, Copy)] +pub struct bf16(pub u16); impl bf16 {} @@ -340,6 +351,30 @@ where fn intersect(a: &[Self], b: &[Self]) -> Option<Distance>; } +pub fn sparse_dot_product( + a: &[u16], + b: &[u16], + a_weights: &[f32], + b_weights: &[f32], +) -> (Distance, Distance) { + let mut intersection = [0.0; 2]; + let af32_ptr = a_weights.as_ptr() as *const f32; + let bf32_ptr = b_weights.as_ptr() as *const f32; + unsafe { + crate::simsimd_spdot_weights_f32( + a.as_ptr(), + b.as_ptr(), + af32_ptr, + bf32_ptr, + a.len(), + b.len(), + intersection.as_mut_ptr() + ); + } + (intersection[0], intersection[1]) + +} + impl BinarySimilarity for u8 { fn hamming(a: &[Self], b: &[Self]) -> Option<Distance> { if a.len() != b.len() { diff --git a/scripts/bench_tfidf.rs b/scripts/bench_tfidf.rs index 12b3a87b..7cdbe03c 100644 --- a/scripts/bench_tfidf.rs +++ b/scripts/bench_tfidf.rs @@ -182,27 +182,35 @@ pub fn tfidfsimilarity_benchmark(c: &mut Criterion) { let query_vector = calculator.tfidf_representation(query); group.sample_size(10); - + group.bench_function("SimSIMD Cosine Similarity", |b| { b.iter(|| { - let total_similarity: f64 = calculator.row_norms.iter().map(|row_vector| { - black_box( - SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) - .unwrap_or(0.0), - ) - }).sum(); + let total_similarity: f64 = calculator + .row_norms + .iter() + .map(|row_vector| { + black_box( + SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }) + .sum(); black_box(total_similarity) }) }); group.bench_function("Rust plain Cosine similarity", |b| { b.iter(|| { - let total_similarity: f64 = calculator.row_norms.iter().map(|row_vector| { - black_box( - plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) - .unwrap_or(0.0), - ) - }).sum(); + let total_similarity: f64 = calculator + .row_norms + .iter() + .map(|row_vector| { + black_box( + plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()) + .unwrap_or(0.0), + ) + }) + .sum(); black_box(total_similarity) }) }); From deaea659bd5573ffa3b82df1b43b6af3429b076a Mon Sep 17 00:00:00 2001 From: Gowind Date: Thu, 2 Jan 2025 16:57:18 +0100 Subject: [PATCH 09/10] Add benchmark file for spdot --- scripts/bench_spdot.rs | 229 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 229 insertions(+) create mode 100644 scripts/bench_spdot.rs diff --git a/scripts/bench_spdot.rs b/scripts/bench_spdot.rs new file mode 100644 index 00000000..1618fe0f --- /dev/null +++ b/scripts/bench_spdot.rs @@ -0,0 +1,229 @@ +use criterion::{black_box, criterion_group, criterion_main, Criterion}; +use std::collections::HashMap; +use std::collections::HashSet; +use std::fmt::Display; +use std::fmt::Formatter; +use std::fs::File; +use std::io::BufRead; +use std::io::BufReader; +use std::path::Path; +use regex::Regex; + +use simsimd::sparse_dot_product; +//use half::bf16 as hbf16; +#[derive(Clone, Debug)] +struct SparseVector { + indices: Vec<u16>, + values: Vec<f32>, +} +impl SparseVector { + fn from_dense(dense_vec: &[f32]) -> Self { + if dense_vec.len() >= u16::MAX as usize { + panic!("Dense vector is too large to convert to sparse vector"); + } + let mut indices: Vec<u16> = Vec::new(); + let mut values = Vec::new(); + + for (idx, &value) in dense_vec.iter().enumerate() { + if value != 0.0 { + indices.push(idx.try_into().unwrap()); + values.push(value); + } + } + + SparseVector { indices, values } + } + + fn sparse_dot_product(&self, other: &SparseVector) -> (u16, f64) { + let mut result = 0.0;
+ let mut i = 0; + let mut j = 0; + let mut matches: u16 = 0; + while i < self.indices.len() && j < other.indices.len() { + if self.indices[i] == other.indices[j] { + matches += 1; + result += f64::from( self.values[i] * other.values[j]); + i += 1; + j += 1; + } else if self.indices[i] < other.indices[j] { + i += 1; + } else { + j += 1; + } + } + + (matches, result) + } +} + + +impl Display for SparseVector { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SparseVector {{ indices: {:?}, values: {:?} }}", self.indices, self.values) + } +} + +#[derive(Debug)] +struct TfidfCalculator { + tokens_index: HashMap<String, usize>, + row_norms: Vec<SparseVector>, + idfs: HashMap<usize, f32>, + re: Regex, +} + +impl TfidfCalculator { + pub fn new() -> Self { + TfidfCalculator { + tokens_index: HashMap::new(), + row_norms: Vec::new(), + idfs: HashMap::new(), + re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), + } + } + + fn tokenize(&self, text: &str) -> Vec<String> { + let matches: Vec<String> = self + .re + .find_iter(text.to_lowercase().as_str()) + .map(|mat| mat.as_str().to_string()) + .collect(); + matches + } + + pub fn calculate_document_tfidf_vectors(&mut self, documents: Vec<String>) { + let mut unique_tokens = HashSet::new(); + let mut document_as_tokens: Vec<Vec<String>> = Vec::new(); + for document in &documents { + let tokens = self.tokenize(document); + for token in tokens.iter() { + unique_tokens.insert(token.clone()); + } + document_as_tokens.push(tokens); + } + let mut tokens: Vec<String> = unique_tokens.into_iter().collect(); + tokens.sort(); + + for (idx, term) in tokens.iter().enumerate() { + self.tokens_index.insert(term.clone(), idx); + } + + let mut document_term_frequencies: Vec<Vec<f32>> = + vec![vec![0.0; tokens.len()]; documents.len()]; + // I have count of terms in each document + for (row_idx, document) in document_as_tokens.iter().enumerate() { + for token in document.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + document_term_frequencies[row_idx][*term_id] += 1.0; + } + } + // calculate idf of terms + for term_id in 0..tokens.len() { + let mut df = 0; + for document in document_term_frequencies.iter() { + if document[term_id] > 0.0 { + df += 1; + } + } + let n_documents = document_term_frequencies.len(); + let present = df as f32; + let idf = (((n_documents + 1) as f32) / (present + 1.0)).ln() + 1.0; + self.idfs.insert(term_id, idf); + } + + let mut tfidf_matrix: Vec<Vec<f32>> = + vec![vec![0.0; tokens.len()]; document_term_frequencies.len()]; + + for (row_idx, document) in document_term_frequencies.iter().enumerate() { + for i in 0..tokens.len() { + let term_id = i; + let term_frequency = document[term_id]; + let tf = term_frequency as f32; + let idf = self.idfs.get(&term_id).unwrap(); + if term_frequency == 0.0 { + tfidf_matrix[row_idx][term_id] = 0.0; + } else { + tfidf_matrix[row_idx][term_id] = tf * idf; + } + } + } + self.row_norms = tfidf_matrix.iter().map(|row| { + SparseVector::from_dense(row.as_slice()) + }).collect(); + + } + + fn sparse_tfidf_representation(&self, row: &str) -> SparseVector { + let mut row_vector: Vec<f32> = vec![0.0; self.tokens_index.len()]; + let tokens = self.tokenize(row); + for token in tokens.iter() { + let term_id = self.tokens_index.get(token).unwrap(); + let term_idf = self.idfs.get(term_id).unwrap(); + row_vector[*term_id] += *term_idf; + } + return SparseVector::from_dense(row_vector.as_slice()); + } + +} + +pub fn spdot_benchmark(c: &mut Criterion) { + // Prepare your data and calculator here + + let path = Path::new("leipzig10000.txt"); + let file = File::open(path).unwrap(); + let reader =
BufReader::new(file); + + let mut calculator = TfidfCalculator::new(); + + let mut documents = Vec::new(); + for line in reader.lines() { + documents.push(line.unwrap()); + } + + calculator.calculate_document_tfidf_vectors(documents); + + let sparse_vectors = &calculator.row_norms; + + let mut group = c.benchmark_group("Sparse Dot Product"); + + // let query = "we went over to the whole world"; + // let query_vector = calculator.sparse_tfidf_representation(query); + let mut largest = &sparse_vectors[0]; + for row_vector in sparse_vectors.iter() { + if row_vector.indices.len() > largest.indices.len() { + largest = row_vector; + } + } + let query_vector = largest.clone(); + println!("Query Vector: {}", query_vector.indices.len()); + group.bench_function("SimSIMD spdot", |b| { + b.iter(|| { + let total_similarity: f64 = sparse_vectors + .iter() + .map(|row_vector| { + sparse_dot_product( + row_vector.indices.as_slice(), + query_vector.indices.as_slice(), + query_vector.values.as_slice(), + row_vector.values.as_slice(), + ) + } + ).map(|(_similar_items, similarity_dot_product)| similarity_dot_product) + .sum(); + black_box(total_similarity); + }) + }); + + group.bench_function("Rust plain spdot", |b| { + b.iter(|| { + let total_similarity: f64 = sparse_vectors + .iter() + .map(|row_vector| black_box(row_vector.sparse_dot_product(&query_vector))) + .map(|(_similar_items, similarity_dot_product)| similarity_dot_product) + .sum(); + black_box(total_similarity) + }) + }); +} + +criterion_group!(benches, spdot_benchmark); +criterion_main!(benches); From 8b82d9f21a90dc945efc3af5207ee6634eb0e070 Mon Sep 17 00:00:00 2001 From: Gowind Date: Thu, 9 Jan 2025 16:02:33 +0100 Subject: [PATCH 10/10] fix: Fix bug in SIMD accelerated algorithm The SIMD version was reporting the number of matched items between the sparse vectors incorrectly. Fix the bug and remove unnecessary variables from the functions. Add a bin file with a custom loop for calculating benchmarks.
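The bin generates the same kind of random sparse vectors as the Criterion benchmark and times the plain and NEON kernels directly.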
This is faster to run and does averaging of timing runs for various vector sizes --- Cargo.lock | 42 ++++++- Cargo.toml | 7 +- include/simsimd/sparse.h | 41 +++---- scripts/bench_spdot.rs | 253 ++++++++++++++------------------------- scripts/spdot.rs | 142 ++++++++++++++++++++++ scripts/tfidf.rs | 216 --------------------------------- 6 files changed, 296 insertions(+), 405 deletions(-) create mode 100644 scripts/spdot.rs delete mode 100644 scripts/tfidf.rs diff --git a/Cargo.lock b/Cargo.lock index 989888a7..c9cae64a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -41,6 +41,12 @@ version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + [[package]] name = "cast" version = "0.3.0" @@ -199,9 +205,9 @@ dependencies = [ [[package]] name = "getrandom" -version = "0.2.12" +version = "0.2.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "190092ea657667030ac6a35e305e62fc4dd69fd98ac98631e5d3a2b1575a12b5" +checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7" dependencies = [ "cfg-if", "libc", @@ -261,9 +267,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.152" +version = "0.2.169" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "13e3bf6590cbc649f4d1a3eefc9d5d6eb746f5200ffb04e5e142700b8faa56e7" +checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a" [[package]] name = "linux-raw-sys" @@ -334,9 +340,12 @@ dependencies = [ [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "proc-macro2" @@ -708,3 +717,24 @@ name = "windows_x86_64_msvc" version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" + +[[package]] +name = "zerocopy" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" +dependencies = [ + "byteorder", + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.7.35" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] diff --git a/Cargo.toml b/Cargo.toml index 0edff2e1..8b2552d7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,8 +24,8 @@ name = "simsimd" path = "rust/lib.rs" [[bin]] -name = "tfidf" -path = "scripts/tfidf.rs" +name = "spdot" +path = "scripts/spdot.rs" [build-dependencies] cc = "1.0.83" @@ -53,8 +53,9 @@ rpath = false # On some systems, setting this to false can help with optimiz [dev-dependencies] criterion = { version = "0.5.1" } -rand = { version = "0.8.5" } half = { version = "2.4.0" } [dependencies] regex = "1.11.1" +rand = { version = "0.8.5" } + diff --git a/include/simsimd/sparse.h b/include/simsimd/sparse.h index f3a5e8e8..457e27fe 100644 --- a/include/simsimd/sparse.h +++ 
b/include/simsimd/sparse.h @@ -1073,12 +1073,10 @@ inline uint8_t skips(uint16x8_t indexes, uint16_t max) { void process_remaining( simsimd_u16_t const *a, simsimd_u16_t const *b, simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, - simsimd_size_t a_length, simsimd_size_t b_length, + simsimd_u16_t const *a_end, simsimd_u16_t const *b_end, simsimd_distance_t *results) { - simsimd_u16_t const * a_end = a + a_length; - simsimd_u16_t const * b_end = b + b_length; - simsimd_distance_t intersection_size = results[0]; + simsimd_distance_t intersection_size = results[0]; simsimd_distance_t total_product = results[1]; while(a < a_end && b < b_end) { if(*a == *b) { @@ -1126,10 +1124,8 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( simsimd_f32_t const *a_weights, simsimd_f32_t const *b_weights, simsimd_size_t a_length, simsimd_size_t b_length, simsimd_distance_t *results) { - const simsimd_size_t register_size = 8; - simsimd_size_t a_idx = 0, b_idx = 0; simsimd_size_t intersection_size = 0; float total_product = 0.0f; @@ -1139,14 +1135,15 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( #ifndef EXT_F_LANE #define EXT_F_LANE(vec, lane) vgetq_lane_f32(vec, lane) #endif + simsimd_u16_t const * a_start = a; + simsimd_u16_t const * b_start = b; simsimd_u16_t const * a_end = a + a_length; simsimd_u16_t const * b_end = b + b_length; - // printf("conditions %d %d and ends %p %p %p \n", (a + a_idx + 8 < a_end), (b + b_idx + 8 < b_end), a_end, b, b_end); - while((a + a_idx + 8) < a_end && ( b + b_idx + 8) < b_end) { + while((a + 8) < a_end && ( b + 8) < b_end) { uint16x8_t a_vec, b_vec; - a_vec = vld1q_u16(a + a_idx) ; - b_vec = vld1q_u16(b + b_idx) ; + a_vec = vld1q_u16(a) ; + b_vec = vld1q_u16(b) ; uint16_t a_min = vgetq_lane_u16(a_vec, 0); uint16_t a_max = vgetq_lane_u16(a_vec, 7); @@ -1154,22 +1151,24 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( uint16_t b_max = vgetq_lane_u16(b_vec, 7); while(a_max < b_min && a + 16 < a_end) { a += 8; + a_weights += 8; a_vec = vld1q_u16(a); a_max = vgetq_lane_u16(a_vec, 7); } a_min = vgetq_lane_u16(a_vec, 0); - while(b_max < a_min && b + 16 < a_end) { + while(b_max < a_min && b + 16 < b_end) { b += 8; + b_weights += 8; b_vec = vld1q_u16(b); b_max = vgetq_lane_u16(b_vec, 7); } - a_min = vgetq_lane_u16(b_vec, 0); + b_min = vgetq_lane_u16(b_vec, 0); - float32x4_t a_weights_vec_low = vld1q_f32(a_weights + a_idx); - float32x4_t a_weights_vec_high = vld1q_f32(a_weights + a_idx + 4); + float32x4_t a_weights_vec_low = vld1q_f32(a_weights); + float32x4_t a_weights_vec_high = vld1q_f32(a_weights + 4); - float32x4_t b_weights_vec_low = vld1q_f32(b_weights + b_idx); - float32x4_t b_weights_vec_high = vld1q_f32(b_weights + b_idx + 4); + float32x4_t b_weights_vec_low = vld1q_f32(b_weights); + float32x4_t b_weights_vec_high = vld1q_f32(b_weights + 4); float32x4_t zeroes = vdupq_n_f32(0.0); for (int lane = 0; lane < register_size; lane++) { uint16_t elem = EXT_LANE_U16(b_vec, lane); @@ -1198,14 +1197,16 @@ SIMSIMD_PUBLIC void simsimd_spdot_weights_u16_f32_neon( intersection_size += lower_intersections; } uint8_t askips = skips(a_vec, b_max); - a_idx += 8 - askips; + a += 8 - askips; uint8_t bskips = skips(b_vec, a_max); - b_idx += 8 - bskips; + b += 8 - bskips; } - - process_remaining(a, b,a_weights,b_weights, a_length, b_length, results); + results[0] += intersection_size; + results[1] += total_product; + process_remaining(a, b,a_weights,b_weights, a_start + a_length, b_start + b_length, results); } + #pragma clang attribute pop #pragma 
GCC pop_options #endif // SIMSIMD_TARGET_NEON diff --git a/scripts/bench_spdot.rs b/scripts/bench_spdot.rs index 1618fe0f..25be69e5 100644 --- a/scripts/bench_spdot.rs +++ b/scripts/bench_spdot.rs @@ -1,14 +1,9 @@ +use criterion::BenchmarkId; use criterion::{black_box, criterion_group, criterion_main, Criterion}; -use std::collections::HashMap; -use std::collections::HashSet; +use rand::Rng; use std::fmt::Display; use std::fmt::Formatter; -use std::fs::File; -use std::io::BufRead; -use std::io::BufReader; -use std::path::Path; -use regex::Regex; - +use rand::seq::index::sample; use simsimd::sparse_dot_product; //use half::bf16 as hbf16; #[derive(Clone, Debug)] struct SparseVector { indices: Vec<u16>, values: Vec<f32>, } @@ -63,167 +58,105 @@ impl Display for SparseVector { } } -#[derive(Debug)] -struct TfidfCalculator { - tokens_index: HashMap<String, usize>, - row_norms: Vec<SparseVector>, - idfs: HashMap<usize, f32>, - re: Regex, -} - -impl TfidfCalculator { - pub fn new() -> Self { - TfidfCalculator { - tokens_index: HashMap::new(), - row_norms: Vec::new(), - idfs: HashMap::new(), - re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), - } +fn generate_intersecting_vectors(first_size: usize, second_size: usize, intersection_size: usize) -> (SparseVector, SparseVector) +{ + let mut rng = rand::thread_rng(); + let mut first_vector_indices: Vec<u16> = Vec::with_capacity(first_size); + let mut second_vector_indices: Vec<u16> = Vec::with_capacity(second_size); + let mut first_vector: Vec<f32> = Vec::with_capacity(first_size); + let mut second_vector: Vec<f32> = Vec::with_capacity(second_size); + let unique_first = first_size - intersection_size; + let unique_second = second_size - intersection_size; + assert!(intersection_size + unique_first + unique_second <= 65535, "Too many elements in the vectors"); + let total = intersection_size + (first_size - intersection_size) + (second_size - intersection_size); + + let unique_indices: Vec<u16> = sample(&mut rng, 65535, total).into_iter().map(|x| x as u16).collect(); + // assert!( unique_indices.len() == total, "unique_indices length is not correct: {}, expected {}", unique_indices.len(), total); + + first_vector_indices.extend(unique_indices.iter().take(intersection_size)); + second_vector_indices.extend(unique_indices.iter().take(intersection_size)); + first_vector_indices.extend(unique_indices.iter().skip(intersection_size).take(first_size - intersection_size)); + second_vector_indices.extend(unique_indices.iter().skip(intersection_size).skip(first_size - intersection_size).take(second_size - intersection_size)); + first_vector_indices.sort(); + second_vector_indices.sort(); + + for _i in 0..first_size { + let value: f32 = rng.gen(); + first_vector.push(value); } - - fn tokenize(&self, text: &str) -> Vec<String> { - let matches: Vec<String> = self - .re - .find_iter(text.to_lowercase().as_str()) - .map(|mat| mat.as_str().to_string()) - .collect(); - matches + for _i in 0..second_size { + let value: f32 = rng.gen(); + second_vector.push(value); } - pub fn calculate_document_tfidf_vectors(&mut self, documents: Vec<String>) { - let mut unique_tokens = HashSet::new(); - let mut document_as_tokens: Vec<Vec<String>> = Vec::new(); - for document in &documents { - let tokens = self.tokenize(document); - for token in tokens.iter() { - unique_tokens.insert(token.clone()); - } - document_as_tokens.push(tokens); - } - let mut tokens: Vec<String> = unique_tokens.into_iter().collect(); - tokens.sort(); - - for (idx, term) in tokens.iter().enumerate() { - self.tokens_index.insert(term.clone(), idx); - } (SparseVector{indices: first_vector_indices, values: first_vector}, SparseVector{indices: second_vector_indices, values:
second_vector}) + +} - let mut document_term_frequencies: Vec<Vec<f32>> = - vec![vec![0.0; tokens.len()]; documents.len()]; - // I have count of terms in each document - for (row_idx, document) in document_as_tokens.iter().enumerate() { - for token in document.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - document_term_frequencies[row_idx][*term_id] += 1.0; - } - } - // calculate idf of terms - for term_id in 0..tokens.len() { - let mut df = 0; - for document in document_term_frequencies.iter() { - if document[term_id] > 0.0 { - df += 1; - } - } - let n_documents = document_term_frequencies.len(); - let present = df as f32; - let idf = (((n_documents + 1) as f32) / (present + 1.0)).ln() + 1.0; - self.idfs.insert(term_id, idf); - } - let mut tfidf_matrix: Vec<Vec<f32>> = - vec![vec![0.0; tokens.len()]; document_term_frequencies.len()]; - - for (row_idx, document) in document_term_frequencies.iter().enumerate() { - for i in 0..tokens.len() { - let term_id = i; - let term_frequency = document[term_id]; - let tf = term_frequency as f32; - let idf = self.idfs.get(&term_id).unwrap(); - if term_frequency == 0.0 { - tfidf_matrix[row_idx][term_id] = 0.0; - } else { - tfidf_matrix[row_idx][term_id] = tf * idf; - } +fn bench_dot_products(c: &mut Criterion) { + // Define test parameters + let first_lens = [66, 129, 513, 1025, 2049]; + let second_lens = [9, 17, 33]; + let intersection_ratios = [0.1, 0.5, 0.9]; + + // Create benchmark group for plain implementation + let mut plain_group = c.benchmark_group("plain_dot_product"); + for &first_len in first_lens.iter() { + for &second_len in second_lens.iter() { + for &ratio in intersection_ratios.iter() { + let intersection_size = (ratio * second_len as f32).ceil() as usize; + let params = format!("{}x{}@{}", first_len, second_len, ratio); + + plain_group.bench_with_input( + BenchmarkId::new("plain", params), + &(first_len, second_len, intersection_size), + |b, &(f_len, s_len, i_size)| { + b.iter_with_setup( + || generate_intersecting_vectors(f_len, s_len, i_size), + |(first_vector, second_vector)| { + let (similar_items, _dot_product) = first_vector.sparse_dot_product(&second_vector); + black_box(similar_items); + } + ); + } + ); } } - self.row_norms = tfidf_matrix.iter().map(|row| { - SparseVector::from_dense(row.as_slice()) - }).collect(); - - } - - fn sparse_tfidf_representation(&self, row: &str) -> SparseVector { - let mut row_vector: Vec<f32> = vec![0.0; self.tokens_index.len()]; - let tokens = self.tokenize(row); - for token in tokens.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - let term_idf = self.idfs.get(term_id).unwrap(); - row_vector[*term_id] += *term_idf; - } - return SparseVector::from_dense(row_vector.as_slice()); - } - -} - -pub fn spdot_benchmark(c: &mut Criterion) { - // Prepare your data and calculator here - - let path = Path::new("leipzig10000.txt"); - let file = File::open(path).unwrap(); - let reader = BufReader::new(file); - - let mut calculator = TfidfCalculator::new(); - - let mut documents = Vec::new(); - for line in reader.lines() { - documents.push(line.unwrap()); - } - - calculator.calculate_document_tfidf_vectors(documents); - - let sparse_vectors = &calculator.row_norms; - - let mut group = c.benchmark_group("Sparse Dot Product"); - - // let query = "we went over to the whole world"; - // let query_vector = calculator.sparse_tfidf_representation(query); - let mut largest = &sparse_vectors[0]; - for row_vector in sparse_vectors.iter() { - if row_vector.indices.len() > largest.indices.len() { - largest =
row_vector; + plain_group.finish(); + + // Create benchmark group for NEON implementation + let mut neon_group = c.benchmark_group("neon_dot_product"); + for &first_len in first_lens.iter() { + for &second_len in second_lens.iter() { + for &ratio in intersection_ratios.iter() { + let intersection_size = (ratio * second_len as f32).ceil() as usize; + let params = format!("{}x{}@{}", first_len, second_len, ratio); + + neon_group.bench_with_input( + BenchmarkId::new("neon", params), + &(first_len, second_len, intersection_size), + |b, &(f_len, s_len, i_size)| { + b.iter_with_setup( + || generate_intersecting_vectors(f_len, s_len, i_size), + |(first_vector, second_vector)| { + let (similar_items, _dot_product) = sparse_dot_product( + first_vector.indices.as_slice(), + second_vector.indices.as_slice(), + first_vector.values.as_slice(), + second_vector.values.as_slice(), + ); + black_box(similar_items) + } + ); + } + ); + } } } - let query_vector = largest.clone(); - println!("Query Vector: {}", query_vector.indices.len()); - group.bench_function("SimSIMD spdot", |b| { - b.iter(|| { - let total_similarity: f64 = sparse_vectors - .iter() - .map(|row_vector| { - sparse_dot_product( - row_vector.indices.as_slice(), - query_vector.indices.as_slice(), - query_vector.values.as_slice(), - row_vector.values.as_slice(), - ) - } - ).map(|(_similar_items, similarity_dot_product)| similarity_dot_product) - .sum(); - black_box(total_similarity); - }) - }); - - group.bench_function("Rust plain spdot", |b| { - b.iter(|| { - let total_similarity: f64 = sparse_vectors - .iter() - .map(|row_vector| black_box(row_vector.sparse_dot_product(&query_vector))) - .map(|(_similar_items, similarity_dot_product)| similarity_dot_product) - .sum(); - black_box(total_similarity) - }) - }); + neon_group.finish(); } -criterion_group!(benches, spdot_benchmark); -criterion_main!(benches); +criterion_group!(benches, bench_dot_products); +criterion_main!(benches); \ No newline at end of file diff --git a/scripts/spdot.rs b/scripts/spdot.rs new file mode 100644 index 00000000..948c1a00 --- /dev/null +++ b/scripts/spdot.rs @@ -0,0 +1,142 @@ +use rand::Rng; +use std::fmt::{Formatter, Display}; +use rand::seq::index::sample; +use simsimd::sparse_dot_product; +//use half::bf16 as hbf16; +#[derive(Clone, Debug)] +struct SparseVector { + indices: Vec<u16>, + values: Vec<f32>, +} +impl SparseVector { + fn from_dense(dense_vec: &[f32]) -> Self { + if dense_vec.len() >= u16::MAX as usize { + panic!("Dense vector is too large to convert to sparse vector"); + } + let mut indices: Vec<u16> = Vec::new(); + let mut values = Vec::new(); + + for (idx, &value) in dense_vec.iter().enumerate() { + if value != 0.0 { + indices.push(idx.try_into().unwrap()); + values.push(value); + } + } + + SparseVector { indices, values } + } + + fn sparse_dot_product(&self, other: &SparseVector) -> (u16, f64) { + let mut result = 0.0; + let mut i = 0; + let mut j = 0; + let mut matches: u16 = 0; + while i < self.indices.len() && j < other.indices.len() { + if self.indices[i] == other.indices[j] { + matches += 1; + result += f64::from( self.values[i] * other.values[j]); + i += 1; + j += 1; + } else if self.indices[i] < other.indices[j] { + i += 1; + } else { + j += 1; + } + } + + (matches, result) + } +} + + +impl Display for SparseVector { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + write!(f, "SparseVector {{ indices: {:?}, values: {:?} }}", self.indices, self.values) + } +} + +fn generate_intersecting_vectors(first_size: usize, second_size: usize,
intersection_size: usize) -> (SparseVector, SparseVector) +{ + let mut rng = rand::thread_rng(); + let mut first_vector_indices: Vec<u16> = Vec::with_capacity(first_size); + let mut second_vector_indices: Vec<u16> = Vec::with_capacity(second_size); + let mut first_vector: Vec<f32> = Vec::with_capacity(first_size); + let mut second_vector: Vec<f32> = Vec::with_capacity(second_size); + let unique_first = first_size - intersection_size; + let unique_second = second_size - intersection_size; + assert!(intersection_size + unique_first + unique_second <= 65535, "Too many elements in the vectors"); + let total = intersection_size + (first_size - intersection_size) + (second_size - intersection_size); + + let unique_indices: Vec<u16> = sample(&mut rng, 65535, total).into_iter().map(|x| x as u16).collect(); + // assert!( unique_indices.len() == total, "unique_indices length is not correct: {}, expected {}", unique_indices.len(), total); + + first_vector_indices.extend(unique_indices.iter().take(intersection_size)); + second_vector_indices.extend(unique_indices.iter().take(intersection_size)); + first_vector_indices.extend(unique_indices.iter().skip(intersection_size).take(first_size - intersection_size)); + second_vector_indices.extend(unique_indices.iter().skip(intersection_size).skip(first_size - intersection_size).take(second_size - intersection_size)); + first_vector_indices.sort(); + second_vector_indices.sort(); + + for _i in 0..first_size { + let value: f32 = rng.gen(); + first_vector.push(value); + } + for _i in 0..second_size { + let value: f32 = rng.gen(); + second_vector.push(value); + } + + (SparseVector{indices: first_vector_indices, values: first_vector}, SparseVector{indices: second_vector_indices, values: second_vector}) + +} + +pub fn main() { + for first_len in [64, 128, 512, 1024, 2048] { + for second_len in [8, 16, 32] { + for intersection_ratio in [0.1, 0.5, 0.9] { + let intersection_size = (intersection_ratio * second_len as f32).ceil() as usize; + let (first_vector, second_vector) = generate_intersecting_vectors(first_len, second_len, intersection_size); + let mut total_ns: u128 = 0; + for _j in 0..10 { + let start = std::time::Instant::now(); + let (_similar_items, _dot_product) = first_vector.sparse_dot_product(&second_vector); + let elapsed = start.elapsed(); + assert!(_similar_items == intersection_size as u16, "similar items: {}, intersection_size: {}", _similar_items, intersection_size); + total_ns += elapsed.as_nanos(); + } + println!("plain dot product: {} against {}, avg elapsed_time ns: {}", first_len, second_len, total_ns/10); + + + + } + } + } + + for first_len in [64, 128, 512, 1024, 2048] { + for second_len in [8, 16, 32] { + for intersection_ratio in [0.1, 0.5, 0.9] { + let intersection_size = (intersection_ratio * second_len as f32).ceil() as usize; + let (first_vector, second_vector) = generate_intersecting_vectors(first_len, second_len, intersection_size); + let mut total_ns: u128 = 0; + for _j in 0..10 { + let start = std::time::Instant::now(); + let (neon_similar_items, _dot_product) = sparse_dot_product( + first_vector.indices.as_slice(), + second_vector.indices.as_slice(), + first_vector.values.as_slice(), + second_vector.values.as_slice(), + ); + let elapsed = start.elapsed(); + total_ns += elapsed.as_nanos(); + assert!(neon_similar_items == intersection_size as f64, "similar items: {}, intersection_size: {}\n {}, \n second {}", neon_similar_items, intersection_size, first_vector, second_vector); + } + println!("NEON: {} vs {} avg elapsed_time ns: {}",first_len, second_len,
total_ns/10); + + } } } + +} \ No newline at end of file diff --git a/scripts/tfidf.rs b/scripts/tfidf.rs deleted file mode 100644 index 53d7eacf..00000000 --- a/scripts/tfidf.rs +++ /dev/null @@ -1,216 +0,0 @@ -use regex::Regex; -use simsimd::SpatialSimilarity; -use std::collections::HashMap; -use std::collections::HashSet; -use std::fs::File; -use std::io::BufRead; -use std::io::BufReader; -use std::io::{self}; -use std::path::Path; - -fn l2_normalize(v: &mut Vec<f64>) { - let mut sum = 0.0; - for i in v.iter() { - sum += i.powi(2); - } - sum = sum.sqrt(); - for element in v.iter_mut() { - *element = *element / sum; - } -} - -#[derive(Debug)] -struct TfidfCalculator { - tokens: Vec<String>, - tokens_index: HashMap<String, usize>, - row_norms: Vec<Vec<f64>>, - documents: Vec<Vec<f64>>, - idfs: HashMap<usize, f64>, - re: Regex, -} - -pub fn plain_cosine_similarity(vec1: &[f64], vec2: &[f64]) -> Option<f64> { - // Check if vectors have the same length - if vec1.len() != vec2.len() { - return None; - } - - // Calculate dot product - let dot_product: f64 = vec1.iter().zip(vec2.iter()).map(|(a, b)| a * b).sum(); - - // Calculate magnitudes - let magnitude1: f64 = vec1.iter().map(|x| x.powi(2)).sum::<f64>().sqrt(); - - let magnitude2: f64 = vec2.iter().map(|x| x.powi(2)).sum::<f64>().sqrt(); - - // Prevent division by zero - if magnitude1 == 0.0 || magnitude2 == 0.0 { - return None; - } - - // Calculate cosine similarity - Some(dot_product / (magnitude1 * magnitude2)) -} - -impl TfidfCalculator { - fn new() -> Self { - TfidfCalculator { - tokens: Vec::new(), - tokens_index: HashMap::new(), - row_norms: Vec::new(), - documents: Vec::new(), - idfs: HashMap::new(), - re: Regex::new(r"(?U)\b\w{2,}\b").unwrap(), - } - } - - fn tokenize(&self, text: &str) -> Vec<String> { - let matches: Vec<String> = self - .re - .find_iter(text.to_lowercase().as_str()) - .map(|mat| mat.as_str().to_string()) - .collect(); - matches - } - - fn process_documents(&mut self, documents: Vec<String>) { - let mut unique_tokens = HashSet::new(); - let mut document_tokens: Vec<Vec<String>> = Vec::new(); - for document in &documents { - let tokens = self.tokenize(document); - for token in tokens.iter() { - unique_tokens.insert(token.clone()); - } - document_tokens.push(tokens); - } - self.tokens = unique_tokens.into_iter().collect(); - self.tokens.sort(); - - for (idx, term) in self.tokens.iter().enumerate() { - self.tokens_index.insert(term.clone(), idx); - } - - let mut matrix = vec![vec![0.0; self.tokens.len()]; documents.len()]; - // I have count of terms in each document - for (row_idx, document) in document_tokens.iter().enumerate() { - for token in document.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - matrix[row_idx][*term_id] += 1.0; - } - } - self.documents = std::mem::take(&mut matrix); - } - - fn calculate_idf(&mut self) { - for term_id in 0..self.tokens.len() { - let mut df = 0; - for document in self.documents.iter() { - if document[term_id] > 0.0 { - df += 1; - } - } - let n_documents = self.documents.len(); - let present = df as f64; - let idf = (((n_documents + 1) as f64) / (present + 1.0)).ln() + 1.0; - self.idfs.insert(term_id, idf); - } - } - - fn calculate_tfidf(&mut self) { - let mut tfidf_matrix = vec![vec![0.0; self.tokens.len()]; self.documents.len()]; - - for (row_idx, document) in self.documents.iter().enumerate() { - for i in 0..self.tokens.len() { - let term_id = i; - let term_frequency = document[term_id]; - let tf = term_frequency as f64; - let idf = self.idfs.get(&term_id).unwrap(); - if term_frequency == 0.0 { - tfidf_matrix[row_idx][term_id] = 0.0; - } else { -
tfidf_matrix[row_idx][term_id] = tf * idf; - } - } - } - self.row_norms = std::mem::take(&mut tfidf_matrix); - } - - fn normalize(&mut self) { - for row in self.row_norms.iter_mut() { - l2_normalize(row); - } - } - - fn tfidf_representation(&self, row: &str) -> Vec<f64> { - let mut row_vector = vec![0.0; self.tokens.len()]; - let tokens = self.tokenize(row); - for token in tokens.iter() { - let term_id = self.tokens_index.get(token).unwrap(); - let term_idf = self.idfs.get(term_id).unwrap(); - row_vector[*term_id] += *term_idf; - } - l2_normalize(&mut row_vector); - return row_vector; - } -} - -fn main() -> io::Result<()> { - let args: Vec<String> = std::env::args().collect(); - if args.len() != 2 { - eprintln!("Usage: {} <file_path>", args[0]); - std::process::exit(1); - } - - let path = Path::new(&args[1]); - let file = File::open(path)?; - let reader = BufReader::new(file); - let mut calculator = TfidfCalculator::new(); - - let mut documents = Vec::new(); - /* - let mut current_document = String::new(); - let mut line_count = 0; - */ - for line in reader.lines() { - documents.push(line.unwrap()); - } - /* - let line = line?; - current_document.push_str(&line); - current_document.push('\n'); - line_count += 1; - if line_count == LINES_PER_DOCUMENT { - documents.push(std::mem::take(&mut current_document)); - line_count = 0; - } - } - - if !current_document.is_empty() { - documents.push(std::mem::take(&mut current_document)); - } - */ - calculator.process_documents(documents); - calculator.calculate_idf(); - calculator.calculate_tfidf(); - calculator.normalize(); - - let query = "we went over to the whole world"; - let query_vector = calculator.tfidf_representation(query); - for (idx, row_vector) in calculator.row_norms.iter().enumerate() { - let similarity = - SpatialSimilarity::cosine(query_vector.as_ref(), row_vector.as_ref()).unwrap(); - println!( - "Similarity for document via simsimd {}: {:?}", - idx, - 1.0 - similarity - ); - let plain_similarity = - plain_cosine_similarity(query_vector.as_ref(), row_vector.as_ref()).unwrap(); - println!( - "Similarity for document via plain cosine similarity {}: {:?}", - idx, plain_similarity - ); - } - - Ok(()) -}
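For readers following the series, a quick way to sanity-check the NEON-backed path against the scalar one is to reuse the items defined in scripts/spdot.rs above. A minimal sketch (the free function `sparse_dot_product` is the simsimd binding from rust/lib.rs, `SparseVector` and `generate_intersecting_vectors` come from that script, and the size list and equality tolerance are assumptions for illustration, not part of the patches):

// Sketch: assert that the NEON kernel and the plain two-pointer merge agree
// on both the match count and the accumulated dot product.
fn check_kernels_agree() {
    for &(first_len, second_len, intersection_size) in &[(64, 8, 4), (512, 32, 16)] {
        let (first, second) = generate_intersecting_vectors(first_len, second_len, intersection_size);
        // Scalar reference: merge the sorted index lists element by element.
        let (plain_matches, plain_dot) = first.sparse_dot_product(&second);
        // SIMD path: the simsimd FFI wrapper returning (matches, dot) as f64s.
        let (neon_matches, neon_dot) = sparse_dot_product(
            first.indices.as_slice(),
            second.indices.as_slice(),
            first.values.as_slice(),
            second.values.as_slice(),
        );
        assert_eq!(f64::from(plain_matches), neon_matches);
        assert!((plain_dot - neon_dot).abs() < 1e-6, "dot products diverge");
    }
}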