-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
203 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
import { saveEmbeddingsToFile, generateEmbeddings } from './embeddings/index.js'; | ||
import { fetchJSONData } from './io/index.js'; | ||
// URL of the JSON document whose entries are turned into embeddings.
const jsonDataUrl = 'https://unknow.news/archiwum_aidevs.json';

try {
  // Step 1: Fetch JSON data from the provided URL.
  const jsonData = await fetchJSONData(jsonDataUrl);

  // A non-array payload (e.g. a raw string from an unparseable response) would
  // otherwise surface as an opaque "map is not a function" error.
  if (!Array.isArray(jsonData)) {
    throw new TypeError(
      `Expected a JSON array from ${jsonDataUrl}, received ${typeof jsonData}`,
    );
  }

  // Step 2: Extract the `info` field of every entry; this is the text to embed.
  const contentData = jsonData.map((entry) => entry.info);

  // Step 3: Generate one embedding per extracted text.
  const embeddings = await generateEmbeddings(contentData);

  // Step 4: Persist the embeddings. The call is awaited so that a failed write
  // is reported by the catch below instead of becoming an unhandled rejection.
  const embeddingsFilePath = 'embeddings.json';
  await saveEmbeddingsToFile(embeddings, embeddingsFilePath);
} catch (error) {
  console.error('An error occurred:', error);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
import fs from 'fs'; | ||
import tf from '@tensorflow/tfjs-node'; | ||
import * as use from '@tensorflow-models/universal-sentence-encoder'; | ||
|
||
/**
 * Serializes a list of embeddings to a JSON file.
 *
 * Every element of `embeddings` must expose an `array()` method whose result
 * (awaited) is JSON-serializable; the results are written as one JSON array in
 * the same order as the input.
 *
 * @param {Array<{array: function(): (Promise<*>|*)}>} embeddings - Embeddings to persist.
 * @param {string} filePath - Destination file; it is created or overwritten.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws Rejects if any `array()` call fails or the file cannot be written.
 */
export async function saveEmbeddingsToFile(embeddings, filePath) {
  // Convert every embedding to plain nested arrays, all conversions in parallel.
  const embeddingsArrays = await Promise.all(
    embeddings.map((embedding) => embedding.array()),
  );

  // Non-blocking write: this function is async, so it should not stall the
  // event loop with a synchronous write.
  await fs.promises.writeFile(filePath, JSON.stringify(embeddingsArrays));
}
/**
 * Loads embeddings previously written as a JSON array and turns every element
 * back into a tensor via `tf.tensor`.
 *
 * @param {string} filePath - Path of the JSON file to read (UTF-8).
 * @returns {Promise<Array>} One tensor per element of the stored JSON array,
 *   in file order.
 * @throws Rejects if the file cannot be read or does not contain valid JSON.
 */
export async function loadEmbeddingsFromFile(filePath) {
  // Non-blocking read: this function is async, so it should not stall the
  // event loop with a synchronous read.
  const serializedEmbeddings = await fs.promises.readFile(filePath, 'utf8');

  // Parse the stored nested arrays.
  const embeddingsArrays = JSON.parse(serializedEmbeddings);

  // Rebuild one tensor per stored array.
  return embeddingsArrays.map((array) => tf.tensor(array));
}
/**
 * Generates one embedding per item of `contentData`.
 *
 * The encoder model is loaded at most once per call (and not at all for empty
 * input) instead of once per item; items are then embedded sequentially, in
 * input order.
 *
 * @param {Iterable<string>} contentData - Texts to embed.
 * @returns {Promise<Array>} The results of `model.embed(item)`, one per input
 *   item, in the same order.
 * @throws Rejects if loading the model or embedding any item fails.
 */
export async function generateEmbeddings(contentData) {
  const embeddings = [];
  let model = null;

  for (const content of contentData) {
    // Lazy load: skipped entirely when there is nothing to embed.
    if (model === null) {
      model = await use.load();
    }
    embeddings.push(await model.embed(content));
  }

  return embeddings;
}
// Pending or resolved model load shared by all embedTextData calls, so the
// encoder is loaded once rather than on every call.
let cachedModelPromise = null;

/**
 * Embeds `text` with the universal sentence encoder model.
 *
 * The model is loaded on first use and reused afterwards. If loading fails the
 * cache is cleared, so a later call can retry.
 *
 * @param {string} text - Text to embed (passed unchanged to `model.embed`).
 * @returns {Promise<*>} Whatever `model.embed(text)` resolves to.
 * @throws Rejects if the model cannot be loaded or embedding fails.
 */
export async function embedTextData(text) {
  if (cachedModelPromise === null) {
    cachedModelPromise = use.load().catch((error) => {
      // Do not keep a rejected promise around; allow the next call to retry.
      cachedModelPromise = null;
      throw error;
    });
  }

  const model = await cachedModelPromise;
  return model.embed(text);
}
/**
 * Ranks stored embeddings by cosine similarity to a query embedding and
 * returns the indices of the `k` best matches.
 *
 * Row `i` of `embeddingsTensor` is selected with `gather([i])` for every
 * `i < contentData.length`; both the row and the query are flattened to 1-D
 * before comparison, so leading singleton dimensions do not matter.
 * NOTE(review): assumes `embeddingsTensor` holds one embedding per index along
 * its first axis and that every row has as many elements as the query — confirm
 * against the caller.
 *
 * A pair in which either vector has zero norm gets similarity 0 instead of NaN,
 * so the ordering stays well defined.
 *
 * @param {object} embeddingsTensor - Tensor holding the content embeddings.
 * @param {object} queryEmbedding - Tensor holding the query embedding.
 * @param {Array} contentData - Only its `length` is used: the number of rows to score.
 * @param {number} k - Maximum number of indices to return.
 * @returns {number[]} Indices into `contentData`, most similar first.
 */
export function findSimilar(embeddingsTensor, queryEmbedding, contentData, k) {
  const cosineSimilarities = [];

  // Everything tensor-related happens inside tidy so no intermediate tensor leaks.
  tf.tidy(() => {
    // Loop-invariant: flatten the query and measure it once.
    const queryVector = tf.reshape(queryEmbedding, [-1]);
    const queryMagnitude = tf.norm(queryVector).dataSync()[0];

    for (let i = 0; i < contentData.length; i++) {
      // Inner tidy releases the per-row tensors as soon as the score is known.
      const similarity = tf.tidy(() => {
        const contentVector = tf.reshape(embeddingsTensor.gather([i]), [-1]);
        const denominator = queryMagnitude * tf.norm(contentVector).dataSync()[0];
        if (denominator === 0) {
          return 0;
        }
        // True dot product: element-wise product summed over all components.
        const dotProduct = tf.sum(tf.mul(queryVector, contentVector)).dataSync()[0];
        return dotProduct / denominator;
      });

      cosineSimilarities.push({ index: i, similarity });
    }
  });

  // Sort similarities in descending order (stable: ties keep index order).
  cosineSimilarities.sort((a, b) => b.similarity - a.similarity);

  // Return the indices of the top k matches.
  return cosineSimilarities.slice(0, k).map((item) => item.index);
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
import axios from 'axios' | ||
|
||
/**
 * Performs a GET request for `url` and hands back the response payload.
 *
 * Any failure is written to the error console and then rethrown unchanged, so
 * the caller still sees the original error.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The `data` property of the response.
 * @throws Rethrows whatever the request rejected with.
 */
export async function fetchJSONData(url) {
  let payload;

  try {
    const reply = await axios.get(url);
    payload = reply.data;
  } catch (requestError) {
    console.error('Error fetching JSON data:', requestError);
    throw requestError;
  }

  return payload;
}

// Default export bundling the module's helpers.
const ioHelpers = { fetchJSONData };

export default ioHelpers;
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters