-
Notifications
You must be signed in to change notification settings - Fork 0
/
createEmbeddings.js
147 lines (136 loc) · 4.67 KB
/
createEmbeddings.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import axios from 'axios';
import { config } from 'dotenv';
import { generateEmbeddings } from './embeddings/index.js'
import fs from 'fs';
import JSONStream from 'JSONStream';
import {QdrantClient} from '@qdrant/js-client-rest';
// Qdrant REST client. The endpoint defaults to a local instance; set the
// QDRANT_URL environment variable to target another deployment without
// editing this file. `||` (not `??`) so an empty variable also falls back.
const client = new QdrantClient({
  url: process.env.QDRANT_URL || 'http://127.0.0.1:6333',
});

// Source dataset. Its entries are embedded and later stored as point payloads.
// Alternative dataset used previously:
// const jsonDataUrl = 'https://unknow.news/archiwum_aidevs.json';
const jsonDataUrl = 'https://tasks.aidevs.pl/data/people.json';

// Local cache of generated embeddings, written as one JSON document per line.
const embeddingsFilePath = 'embeddings.json';
/**
 * Issue a GET request for `url` with axios and return the `data` property of
 * the response.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<*>} The response's `data` value.
 */
async function fetchJSONData(url) {
  const { data } = await axios.get(url);
  return data;
}
/**
 * Generate embeddings with OpenAI for every entry of the source dataset and
 * save them to `embeddingsFilePath` for future use, so the AI does not have to
 * be called again each time the later processing steps are worked on.
 *
 * Each entry is embedded as "<imie> <nazwisko>" and the response is appended
 * to the file as one JSON document per line, in dataset order.
 *
 * The file is emptied before the first record is written. Records are matched
 * to dataset entries purely by position when they are loaded, so leftovers
 * from an earlier or partially failed run would shift every later record onto
 * the wrong entry.
 *
 * Errors are logged, not rethrown; a failure part-way through leaves the
 * records written so far in the file.
 *
 * @returns {Promise<void>}
 */
async function processInBatchesAndSaveToJSON() {
  try {
    const jsonData = await fetchJSONData(jsonDataUrl);

    // Truncate only after the dataset was fetched successfully, so a failed
    // download does not wipe an existing cache.
    fs.writeFileSync(embeddingsFilePath, '');

    for (let i = 0; i < jsonData.length; i += 1) {
      const entry = jsonData[i];
      const response = await generateEmbeddings(`${entry.imie} ${entry.nazwisko}`);
      fs.appendFileSync(embeddingsFilePath, `${JSON.stringify({ ...response })}\n`);
      console.log(`Processed: ${i}.`);
    }

    console.log('Embeddings generation completed.');
  } catch (error) {
    console.error('An error occurred:', error);
  }
}
/**
 * Read the cached embeddings from `embeddingsFilePath` and upsert them into
 * the `knowledgeDB` Qdrant collection.
 *
 * The file is streamed through `JSONStream.parse('*')`. Only array values
 * emitted by the parser are treated as embedding records; anything else is
 * skipped. The n-th record becomes the point with id `n`, using the first
 * element's `embedding` as vector and the n-th dataset entry as payload.
 *
 * The returned promise settles only after the stream has ended and every
 * started upsert has finished. A failing upsert is logged and does not stop
 * the remaining ones. Read and parse errors are logged; nothing is rethrown.
 *
 * @returns {Promise<void>}
 */
async function addToDBFromJSON() {
  try {
    const jsonData = await fetchJSONData(jsonDataUrl);
    const stream = fs.createReadStream(embeddingsFilePath, { encoding: 'utf8' });
    const jsonStream = JSONStream.parse('*');
    const pendingUpserts = [];
    let nextId = 0;

    // Upsert a single point. Errors are handled here so that a failed request
    // never surfaces as an unhandled promise rejection from the event handler.
    const upsertPoint = async (id, data) => {
      try {
        const operationInfo = await client.upsert('knowledgeDB', {
          wait: true,
          points: [
            {
              id,
              vector: data[0].embedding,
              payload: jsonData[id],
            },
          ],
        });
        console.log(operationInfo);
      } catch (error) {
        console.error('An error occurred:', error);
      }
    };

    await new Promise((resolve, reject) => {
      jsonStream.on('data', (data) => {
        if (!Array.isArray(data)) {
          return;
        }
        // The id is fixed synchronously, before any await, so points keep the
        // order in which the records appear in the file.
        pendingUpserts.push(upsertPoint(nextId, data));
        nextId += 1;
      });
      jsonStream.on('end', resolve);
      jsonStream.on('error', reject);

      // Handle read errors. The parser never sees 'end' in that case, so
      // finish here and let the already started upserts complete.
      stream.on('error', (err) => {
        console.error('Error reading file:', err);
        resolve();
      });

      stream.pipe(jsonStream);
    });

    await Promise.all(pendingUpserts);
  } catch (error) {
    console.error('An error occurred:', error);
  }
}
// element.data.map((embedding, index2) => {
// console.log((i * 100) * (index * 10) + index2);
// });
// const operation_info = await client.upsert('AIDevsDB00001', {
// wait: true,
// points: embeddedElement.data.map((data, indexOfEmbedding) => {
// const id = i + index
// return {
// id,
// vector: data.embedding,
// payload: jsonData[parseInt(i.toString() + index.toString())]
// }
// })
// });
// console.log(operation_info)
// for (const element in response) {
// const operation_info = await client.upsert('ai_devs_embeddings_search', {
// wait: true,
// points: response.map((embedding, index) => {
// console.log(embedding)
// const id = index + (i * batchSize);
// return {
// id,
// vector: embedding.data[0].embedding,
// payload: jsonData[id]
// }
// })
// });
// console.log(operation_info)
// }
// const operation_info = await client.upsert('ai_devs_embeddings_search', {
// wait: true,
// points: response.map((embedding, index) => {
// console.log(embedding)
// const id = index + (i * batchSize);
// return {
// id,
// vector: embedding.data[0].embedding,
// payload: jsonData[id]
// }
// })
// });
// console.log(operation_info)
// const json_to_save = response.map((embedding, index) => {
// console.log(embedding)
// const id = index + (i * batchSize);
// return {
// id,
// vector: embedding.data[0].embedding,
// payload: jsonData[id]
// }
// });
// fs.appendFileSync(embeddingsFilePath, JSON.stringify(json_to_save) + '\n');
// Script entry point.
// Step 1 (generate the embeddings and cache them in the file) is switched off;
// uncomment it when the cache has to be rebuilt:
// processInBatchesAndSaveToJSON();

// Step 2: load the cached embeddings into Qdrant. Fire-and-forget: the
// returned promise is not awaited.
void addToDBFromJSON();