Showing 6 changed files with 619 additions and 591 deletions.
@@ -1,4 +1,25 @@
npm i
rename .env-example to .env, provide your API_KEY and OPEN_API_KEY
node prompt.js
docker pull qdrant/qdrant
docker run -p 6333:6333 qdrant/qdrant
Comment out the tests that you don't want to run and have fun.
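Your .env then needs just those two keys; the values below are placeholders:

API_KEY=your-aidevs-api-key
OPEN_API_KEY=your-openai-api-key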
Search task:
Install and run Qdrant.
Create a collection with 512-dimensional vectors (see the sketch below).
Edit the collection name in the client calls.
Run node createEmbeddings.js; it will populate your DB with vector data passed through the model set in ./embeddings/index.js (you can edit it).
Comment out line 145 with processInBatchesAndSaveToJSON();
Uncomment line 146 and run it again.
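If you haven't created the collection yet, a minimal sketch with the Qdrant JS client could look like this; the name test_collection4 matches the one hardcoded in createEmbeddings.js, and cosine distance is an assumption:

import { QdrantClient } from '@qdrant/js-client-rest';

const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });

// Create a collection sized for the 512-dimensional embeddings
// produced in ./embeddings/index.js; 'Cosine' is an assumed choice.
await client.createCollection('test_collection4', {
  vectors: { size: 512, distance: 'Cosine' },
});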
In the first step, the generated embeddings are stored in a JSON file, which gives you room to tweak the DB collection or the script without burning OpenAI credits on every run.
Don't open embeddings.json directly; it's large and will likely exhaust your RAM and hang your editor.
To inspect the results, console.log them in the next step while streaming from the file, as sketched below.
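A minimal sketch for peeking at the file without loading it whole, assuming one JSON object per line as createEmbeddings.js writes it:

import fs from 'fs';
import readline from 'readline';

// Stream embeddings.json line by line and inspect only the first record,
// so the whole file never sits in memory.
const rl = readline.createInterface({ input: fs.createReadStream('embeddings.json') });
for await (const line of rl) {
  console.log(Object.keys(JSON.parse(line)));
  break;
}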
The second step uploads your embeddings to the vector DB, so if you do the uncommenting and rerun the script, it will add the vectors.
Inspect the data consistency in the DB by visualising it and going through the imported data. If everything looks right, run:
node prompt.js and it will execute a similarity search on the dataset.
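prompt.js itself is not part of this diff, but the query it runs presumably boils down to something like this sketch; the collection name and result shape are assumptions based on createEmbeddings.js:

import { QdrantClient } from '@qdrant/js-client-rest';
import { generateEmbeddings } from './embeddings/index.js';

const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });

// Embed the query with the same model as the stored data,
// then ask Qdrant for the closest vectors.
const query = await generateEmbeddings('your search query');
const results = await client.search('test_collection4', {
  vector: query[0].embedding,
  limit: 5,
});
console.log(results.map(hit => hit.payload));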
Have fun!
@@ -1,17 +1,146 @@
import { saveEmbeddingsToFile, generateEmbeddings } from './embeddings/index.js';
import { fetchJSONData } from './io/index.js';
import axios from 'axios';
import { config } from 'dotenv';
import { generateEmbeddings } from './embeddings/index.js';
import fs from 'fs';
import JSONStream from 'JSONStream';
import { QdrantClient } from '@qdrant/js-client-rest';
const client = new QdrantClient({ url: 'http://127.0.0.1:6333' });

const jsonDataUrl = 'https://unknow.news/archiwum_aidevs.json';
try {
  // Step 1: Fetch JSON data from the provided URL
  const jsonData = await fetchJSONData(jsonDataUrl);
const embeddingsFilePath = 'embeddings.json';

async function fetchJSONData(url) {
  const response = await axios.get(url);
  return response.data;
}

/*
  Generate embeddings with OpenAI and save them to a file for future use,
  so the AI is not called on every run while working on generation and processing.
*/
async function processInBatchesAndSaveToJSON() {
  try {
    const jsonData = await fetchJSONData(jsonDataUrl);
    for (let i = 0; i < jsonData.length; i++) {
      let entry = jsonData[i];
      await generateEmbeddings(entry.title + ' ' + entry.info).then(async (response) => {
        fs.appendFileSync(embeddingsFilePath, JSON.stringify({ ...response }) + '\n');
        console.log('Processed: ' + i + '.');
      });
    }

    // const totalEntries = jsonData.length;
    // const batchSize = 10;
    // let k = 0;

    // for (let i = 0; i < totalEntries; i += batchSize) {
    //   if (k === 10) {
    //     k = 0;
    //   }
    //   const startIndex = i;
    //   const endIndex = Math.min(i + batchSize, totalEntries);
    //   const contentData = jsonData.slice(startIndex, endIndex).map(entry => entry.title + ' ' + entry.info);
    //   await generateEmbeddingsForBatch(contentData).then(async (response) => {
    //     response.forEach(async (embeddedElement, index) => {
    //       fs.appendFileSync(embeddingsFilePath, JSON.stringify({ id: 'i = ' + i + ';k = ' + k + ';index = ' + index, response }) + '\n');
    //     })
    //   });
    //   k++;
    //   console.log(`Batch ${startIndex / batchSize + 1} saved to ${embeddingsFilePath}`);
    // }
    console.log('Embeddings generation completed.');
  } catch (error) {
    console.error('An error occurred:', error);
  }
}
async function addToDBFromJSON() {
  try {
    const jsonData = await fetchJSONData(jsonDataUrl);
    const stream = fs.createReadStream(embeddingsFilePath, { encoding: 'utf8' });
    const jsonStream = JSONStream.parse('*');
    stream.pipe(jsonStream);
    let id = 0;
    jsonStream.on('data', async (data) => {
      if (typeof data === 'object' && Array.isArray(data)) {
        const upsertObject = {
          wait: true,
          points: [{
            id,
            vector: data[0].embedding,
            payload: jsonData[id]
          }]
        };
        id++;
        const operation_info = await client.upsert('test_collection4', upsertObject);
        console.log(operation_info);
      }
    });

    // Handle errors
    stream.on('error', (err) => {
      console.error('Error reading file:', err);
    });
  } catch (error) {
    console.error('An error occurred:', error);
  }
}
// element.data.map((embedding, index2) => {
//   console.log((i * 100) * (index * 10) + index2);
// });
// const operation_info = await client.upsert('AIDevsDB00001', {
//   wait: true,
//   points: embeddedElement.data.map((data, indexOfEmbedding) => {
//     const id = i + index
//     return {
//       id,
//       vector: data.embedding,
//       payload: jsonData[parseInt(i.toString() + index.toString())]
//     }
//   })
// });
// console.log(operation_info)

// for (const element in response) {
//   const operation_info = await client.upsert('ai_devs_embeddings_search', {
//     wait: true,
//     points: response.map((embedding, index) => {
//       console.log(embedding)
//       const id = index + (i * batchSize);
//       return {
//         id,
//         vector: embedding.data[0].embedding,
//         payload: jsonData[id]
//       }
//     })
//   });
//   console.log(operation_info)
// }
// const operation_info = await client.upsert('ai_devs_embeddings_search', {
//   wait: true,
//   points: response.map((embedding, index) => {
//     console.log(embedding)
//     const id = index + (i * batchSize);
//     return {
//       id,
//       vector: embedding.data[0].embedding,
//       payload: jsonData[id]
//     }
//   })
// });
// console.log(operation_info)
// const json_to_save = response.map((embedding, index) => {
//   console.log(embedding)
//   const id = index + (i * batchSize);
//   return {
//     id,
//     vector: embedding.data[0].embedding,
//     payload: jsonData[id]
//   }
// });
// fs.appendFileSync(embeddingsFilePath, JSON.stringify(json_to_save) + '\n');
  // Step 2: Extract content from JSON data
  const contentData = jsonData.map(entry => entry.info);

  // Step 3: Generate embeddings for the content using a pre-trained model
  const embeddings = await generateEmbeddings(contentData);
  const embeddingsFilePath = 'embeddings.json';
  saveEmbeddingsToFile(embeddings, embeddingsFilePath);
} catch (error) {
  console.error('An error occurred:', error);
}
processInBatchesAndSaveToJSON();
// addToDBFromJSON();
@@ -1,86 +1,8 @@
import fs from 'fs';
import tf from '@tensorflow/tfjs-node';
import * as use from '@tensorflow-models/universal-sentence-encoder';

export async function saveEmbeddingsToFile(embeddings, filePath) {
  // Convert TensorFlow tensors to arrays
  const embeddingsArrays = await Promise.all(embeddings.map(embedding => embedding.array()));

  // Serialize embeddings to JSON
  const serializedEmbeddings = JSON.stringify(embeddingsArrays);

  // Write serialized embeddings to file
  fs.writeFileSync(filePath, serializedEmbeddings);
import { embedding } from '../openAPI/index.js';
export async function generateEmbeddings(contentData, i) {
  return await embedding({
    model: 'text-embedding-3-large',
    input: contentData,
    dimensions: 512
  });
}
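The embedding helper imported from ../openAPI/index.js is not shown in this diff; a minimal sketch of what it plausibly looks like, assuming the official openai package and returning response.data so that callers can read result[0].embedding:

import OpenAI from 'openai';
import { config } from 'dotenv';

config();
// OPEN_API_KEY is the variable name the README asks for in .env.
const openai = new OpenAI({ apiKey: process.env.OPEN_API_KEY });

// Thin wrapper over the OpenAI embeddings endpoint; forwards model,
// input, and dimensions, and returns the array of embedding objects.
export async function embedding(params) {
  const response = await openai.embeddings.create(params);
  return response.data;
}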
export async function loadEmbeddingsFromFile(filePath) {
  // Read serialized embeddings from file
  const serializedEmbeddings = fs.readFileSync(filePath, 'utf8');

  // Parse serialized embeddings from JSON
  const embeddingsArrays = JSON.parse(serializedEmbeddings);

  // Convert arrays to TensorFlow tensors
  const embeddings = embeddingsArrays.map(array => tf.tensor(array));

  return embeddings;
}
export async function generateEmbeddings(contentData) {
  // Example: Dummy function to generate embeddings
  const embeddings = [];
  for (const content of contentData) {
    const embedding = await embedTextData(content);
    embeddings.push(embedding);
  }
  return embeddings;
}
export async function embedTextData(text) {
  const model = await use.load();
  // Assume text is a string
  const embeddings = await model.embed(text);
  return embeddings;
}
export function findSimilar(embeddingsTensor, queryEmbedding, contentData, k) {
  const cosineSimilarities = [];
  // Compute cosine similarity between query embedding and each content embedding
  for (let i = 0; i < contentData.length; i++) {
    const contentEmbedding = embeddingsTensor.gather([i]); // Gather the i-th embedding

    // Ensure query embedding has at least 2 dimensions
    const queryExpanded = tf.expandDims(queryEmbedding, 0);

    // Ensure content embedding has at least 2 dimensions
    const contentExpanded = tf.expandDims(contentEmbedding, 0);

    // Log shapes for debugging
    console.log('Query embedding shape:', queryExpanded.shape);
    console.log('Content embedding shape:', contentExpanded.shape);

    // Calculate cosine similarity
    const similarity = tf.tidy(() => {
      const dotProduct = tf.matMul(queryExpanded, contentExpanded, true, false);
      console.log('Dot product:', dotProduct.dataSync());

      const queryMagnitude = tf.norm(queryExpanded);
      console.log('Query magnitude:', queryMagnitude.dataSync());

      const contentMagnitude = tf.norm(contentExpanded);
      console.log('Content magnitude:', contentMagnitude.dataSync());

      return dotProduct.div(queryMagnitude.mul(contentMagnitude)).dataSync()[0];
    });

    // Store the similarity score along with the index
    cosineSimilarities.push({ index: i, similarity });

    // Log computed similarity for debugging
    console.log(`Computed similarity for index ${i}: ${similarity}`);
  }

  // Sort similarities in descending order
  cosineSimilarities.sort((a, b) => b.similarity - a.similarity);

  // Return top k most similar indices
  const topIndices = cosineSimilarities.slice(0, k).map(item => item.index);
  console.log('Top indices:', topIndices);
  return topIndices;
}