
Commit

Search second part.

redji committed Apr 6, 2024
1 parent 87906a2 commit de3170d
Showing 6 changed files with 619 additions and 591 deletions.
23 changes: 22 additions & 1 deletion README.md
@@ -1,4 +1,25 @@

 npm i
 rename .env-example to .env, provide your API_KEY and OPEN_API_KEY (sample .env below)
 node prompt.js
-node prompt.js
+docker pull qdrant/qdrant
+docker run -p 6333:6333 qdrant/qdrant
+Comment out the tests you don't want to run and have fun.
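For reference, a minimal .env along the lines of the rename step above. The variable names come from this README; the values are placeholders to substitute with your own keys:

```
API_KEY=<your-api-key>
OPEN_API_KEY=<your-openai-key>
```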


+Search task.
+Install and run Qdrant.
+Create a collection with 512-dimensional vectors (a minimal sketch follows this list).
+Edit the collection name in the client calls.
+Run node createEmbeddings.js; it will populate your collection with vector data passed through the model set in ./embeddings/index.js (you can edit it).
+Comment out line 145 with processInBatchesAndSaveToJSON();
+Uncomment line 146 and run it again.
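Collection creation itself isn't shown in this commit. A minimal sketch with @qdrant/js-client-rest, assuming the test_collection4 name used in createEmbeddings.js and cosine distance (the metric is an assumption; pick whatever matches your search needs):

```js
import { QdrantClient } from '@qdrant/js-client-rest';

const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });

// 512 must match the `dimensions` passed to the embedding model in ./embeddings/index.js.
await client.createCollection('test_collection4', {
  vectors: { size: 512, distance: 'Cosine' },
});
```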

+In the first step, the generated embeddings are stored in a JSON file, giving you room to tweak the DB collection or the script without burning OpenAI credits each time you run it.
+Don't open the embeddings.json file: it's large and will likely exhaust your RAM and hang your editor.
+To inspect results, you can console.log in the next step while streaming from the file.
+
+The second step uploads your embeddings to the vector DB, so if you uncomment and rerun the script it will add the vectors.
+Inspect data consistency in the DB by visualising it and going through the imported data; if everything looks right, run:
+node prompt.js and it will execute a similarity search on the dataset (a sketch of what that step might look like follows).
+Have fun!
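The prompt.js diff isn't visible on this page, so here is only a plausible sketch of the query step it might perform, assuming it embeds the prompt with the same helper and searches the collection. All names are borrowed from the files below, and the response shape assumes the helper returns the raw OpenAI response:

```js
import { QdrantClient } from '@qdrant/js-client-rest';
import { generateEmbeddings } from './embeddings/index.js';

const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });

// Embed the question with the same 512-dimension model used for the dataset...
const response = await generateEmbeddings('your question here');

// ...then ask Qdrant for the closest stored vectors and print their payloads.
const results = await client.search('test_collection4', {
  vector: response.data[0].embedding,
  limit: 5,
});
console.log(results.map(r => r.payload));
```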
157 changes: 143 additions & 14 deletions createEmbeddings.js
@@ -1,17 +1,146 @@
-import { saveEmbeddingsToFile, generateEmbeddings } from './embeddings/index.js';
-import { fetchJSONData } from './io/index.js';
+import axios from 'axios';
+import { config } from 'dotenv';
+import { generateEmbeddings } from './embeddings/index.js';
+import fs from 'fs';
+import JSONStream from 'JSONStream';
+import { QdrantClient } from '@qdrant/js-client-rest';
+const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });
 
 const jsonDataUrl = 'https://unknow.news/archiwum_aidevs.json';
-try {
-  // Step 1: Fetch JSON data from the provided URL
-  const jsonData = await fetchJSONData(jsonDataUrl);
+const embeddingsFilePath = 'embeddings.json';
 
+async function fetchJSONData(url) {
+  const response = await axios.get(url);
+  return response.data;
+}
+/*
+  Generate embeddings with OpenAI and save them to a file for future use,
+  so the AI isn't called on every run while working on generation and processing.
+*/
+async function processInBatchesAndSaveToJSON() {
+  try {
+    const jsonData = await fetchJSONData(jsonDataUrl);
+    for (let i = 0; i < jsonData.length; i++) {
+      let entry = jsonData[i];
+      await generateEmbeddings(entry.title + ' ' + entry.info).then(async (response) => {
+        fs.appendFileSync(embeddingsFilePath, JSON.stringify({ ...response }) + '\n');
+        console.log('Processed: ' + i + '.');
+      })
+    }
+
+    // const totalEntries = jsonData.length;
+    // const batchSize = 10;
+    // let k = 0;
+
+    // for (let i = 0; i < totalEntries; i += batchSize) {
+    //   if (k === 10) {
+    //     k = 0;
+    //   }
+    //   const startIndex = i;
+    //   const endIndex = Math.min(i + batchSize, totalEntries);
+    //   const contentData = jsonData.slice(startIndex, endIndex).map(entry => entry.title + ' ' + entry.info);
+    //   await generateEmbeddingsForBatch(contentData).then(async (response) => {
+    //     response.forEach(async (embeddedElement, index) => {
+    //       fs.appendFileSync(embeddingsFilePath, JSON.stringify({ id: 'i = ' + i + ';k = ' + k + ';index = ' + index, response }) + '\n');
+    //     })
+    //   });
+    //   k++;
+    //   console.log(`Batch ${startIndex / batchSize + 1} saved to ${embeddingsFilePath}`);
+    // }
+    console.log('Embeddings generation completed.');
+  } catch (error) {
+    console.error('An error occurred:', error);
+  }
+}
+async function addToDBFromJSON() {
+  try {
+    const jsonData = await fetchJSONData(jsonDataUrl);
+    const stream = fs.createReadStream(embeddingsFilePath, { encoding: 'utf8' });
+    const jsonStream = JSONStream.parse('*');
+    stream.pipe(jsonStream);
+    let id = 0;
+    jsonStream.on('data', async (data) => {
+      if (typeof data === 'object' && Array.isArray(data)) {
+        const upsertObject = {
+          wait: true,
+          points: [{
+            id,
+            vector: data[0].embedding,
+            payload: jsonData[id]
+          }]
+        }
+        id++;
+        const operation_info = await client.upsert('test_collection4', upsertObject)
+        console.log(operation_info);
+      }
+    });
+
+    // Handle errors
+    stream.on('error', (err) => {
+      console.error('Error reading file:', err);
+    });
+  } catch (error) {
+    console.error('An error occurred:', error);
+  }
+}
+// element.data.map((embedding, index2) => {
+//   console.log((i * 100) * (index * 10) + index2);
+// });
+// const operation_info = await client.upsert('AIDevsDB00001', {
+//   wait: true,
+//   points: embeddedElement.data.map((data, indexOfEmbedding) => {
+//     const id = i + index
+//     return {
+//       id,
+//       vector: data.embedding,
+//       payload: jsonData[parseInt(i.toString() + index.toString())]
+//     }
+//   })
+// });
+// console.log(operation_info)
+
+// for (const element in response) {
+//   const operation_info = await client.upsert('ai_devs_embeddings_search', {
+//     wait: true,
+//     points: response.map((embedding, index) => {
+//       console.log(embedding)
+//       const id = index + (i * batchSize);
+//       return {
+//         id,
+//         vector: embedding.data[0].embedding,
+//         payload: jsonData[id]
+//       }
+//     })
+//   });
+//   console.log(operation_info)
+// }
+// const operation_info = await client.upsert('ai_devs_embeddings_search', {
+//   wait: true,
+//   points: response.map((embedding, index) => {
+//     console.log(embedding)
+//     const id = index + (i * batchSize);
+//     return {
+//       id,
+//       vector: embedding.data[0].embedding,
+//       payload: jsonData[id]
+//     }
+//   })
+// });
+// console.log(operation_info)
+// const json_to_save = response.map((embedding, index) => {
+//   console.log(embedding)
+//   const id = index + (i * batchSize);
+//   return {
+//     id,
+//     vector: embedding.data[0].embedding,
+//     payload: jsonData[id]
+//   }
+// });
+// fs.appendFileSync(embeddingsFilePath, JSON.stringify(json_to_save) + '\n');
+
+
+
 
-  // Step 2: Extract content from JSON data
-  const contentData = jsonData.map(entry => entry.info);
-
-  // Step 3: Generate embeddings for the content using a pre-trained model
-  const embeddings = await generateEmbeddings(contentData);
-  const embeddingsFilePath = 'embeddings.json';
-  saveEmbeddingsToFile(embeddings, embeddingsFilePath);
-} catch (error) {
-  console.error('An error occurred:', error);
-}
+processInBatchesAndSaveToJSON();
+// addToDBFromJSON();
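How the two steps fit together: each iteration of processInBatchesAndSaveToJSON appends one stringified response per line, and JSONStream.parse('*') later emits every top-level value of those objects; only the data array passes the Array.isArray check, which is why the upsert reads data[0].embedding. Assuming the helper returns the raw OpenAI response, one appended line looks roughly like this (vector truncated):

```json
{"object":"list","data":[{"object":"embedding","index":0,"embedding":[0.0123,-0.0456]}],"model":"text-embedding-3-large","usage":{"prompt_tokens":12,"total_tokens":12}}
```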
92 changes: 7 additions & 85 deletions embeddings/index.js
@@ -1,86 +1,8 @@
-import fs from 'fs';
-import tf from '@tensorflow/tfjs-node';
-import * as use from '@tensorflow-models/universal-sentence-encoder';
-
-export async function saveEmbeddingsToFile(embeddings, filePath) {
-  // Convert TensorFlow tensors to arrays
-  const embeddingsArrays = await Promise.all(embeddings.map(embedding => embedding.array()));
-
-  // Serialize embeddings to JSON
-  const serializedEmbeddings = JSON.stringify(embeddingsArrays);
-
-  // Write serialized embeddings to file
-  fs.writeFileSync(filePath, serializedEmbeddings);
+import { embedding } from '../openAPI/index.js';
+export async function generateEmbeddings(contentData, i) {
+  return await embedding({
+    model: 'text-embedding-3-large',
+    input: contentData,
+    dimensions: 512
+  })
 }
-export async function loadEmbeddingsFromFile(filePath) {
-  // Read serialized embeddings from file
-  const serializedEmbeddings = fs.readFileSync(filePath, 'utf8');
-
-  // Parse serialized embeddings from JSON
-  const embeddingsArrays = JSON.parse(serializedEmbeddings);
-
-  // Convert arrays to TensorFlow tensors
-  const embeddings = embeddingsArrays.map(array => tf.tensor(array));
-
-  return embeddings;
-}
-export async function generateEmbeddings(contentData) {
-  // Example: Dummy function to generate embeddings
-  const embeddings = [];
-  for (const content of contentData) {
-    const embedding = await embedTextData(content);
-    embeddings.push(embedding);
-  }
-  return embeddings;
-}
-export async function embedTextData(text) {
-  const model = await use.load();
-  // Assume text is a string
-  const embeddings = await model.embed(text);
-  return embeddings;
-}
-export function findSimilar(embeddingsTensor, queryEmbedding, contentData, k) {
-  const cosineSimilarities = [];
-  // Compute cosine similarity between query embedding and each content embedding
-  for (let i = 0; i < contentData.length; i++) {
-    const contentEmbedding = embeddingsTensor.gather([i]); // Gather the i-th embedding
-
-    // Ensure query embedding has at least 2 dimensions
-    const queryExpanded = tf.expandDims(queryEmbedding, 0);
-
-    // Ensure content embedding has at least 2 dimensions
-    const contentExpanded = tf.expandDims(contentEmbedding, 0);
-
-    // Log shapes for debugging
-    console.log('Query embedding shape:', queryExpanded.shape);
-    console.log('Content embedding shape:', contentExpanded.shape);
-
-    // Calculate cosine similarity
-    const similarity = tf.tidy(() => {
-      const dotProduct = tf.matMul(queryExpanded, contentExpanded, true, false);
-      console.log('Dot product:', dotProduct.dataSync());
-
-      const queryMagnitude = tf.norm(queryExpanded);
-      console.log('Query magnitude:', queryMagnitude.dataSync());
-
-      const contentMagnitude = tf.norm(contentExpanded);
-      console.log('Content magnitude:', contentMagnitude.dataSync());
-
-      return dotProduct.div(queryMagnitude.mul(contentMagnitude)).dataSync()[0];
-    });
-
-    // Store the similarity score along with the index
-    cosineSimilarities.push({ index: i, similarity });
-
-    // Log computed similarity for debugging
-    console.log(`Computed similarity for index ${i}: ${similarity}`);
-  }
-
-  // Sort similarities in descending order
-  cosineSimilarities.sort((a, b) => b.similarity - a.similarity);
-
-  // Return top k most similar indices
-  const topIndices = cosineSimilarities.slice(0, k).map(item => item.index);
-  console.log('Top indices:', topIndices);
-  return topIndices;
-}
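The embedding helper imported from ../openAPI/index.js is not visible on this page. A minimal sketch of what it might look like, assuming it wraps OpenAI's REST embeddings endpoint with axios and returns the parsed response body (apart from the import path, everything here is an assumption, not the repository's actual code):

```js
// Hypothetical sketch of ../openAPI/index.js, not part of this commit.
import axios from 'axios';
import { config } from 'dotenv';
config();

export async function embedding({ model, input, dimensions }) {
  // OpenAI's embeddings endpoint returns a body whose `data` array holds
  // one { embedding: number[] } entry per input string.
  const response = await axios.post(
    'https://api.openai.com/v1/embeddings',
    { model, input, dimensions },
    { headers: { Authorization: `Bearer ${process.env.OPEN_API_KEY}` } }
  );
  return response.data;
}
```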
1 change: 0 additions & 1 deletion io/index.js
@@ -5,7 +5,6 @@ export async function fetchJSONData(url) {
     const response = await axios.get(url);
     return response.data;
   } catch (error) {
-    console.error('Error fetching JSON data:', error);
     throw error;
   }
 }
1 change: 1 addition & 0 deletions package.json
@@ -10,6 +10,7 @@
"author": "Michał Furmaniak",
"license": "ISC",
"dependencies": {
"@qdrant/js-client-rest": "^1.8.1",
"@tensorflow-models/universal-sentence-encoder": "^1.3.3",
"@tensorflow/tfjs": "^3.8.0",
"@tensorflow/tfjs-converter": "^3.8.0",