Skip to content

Commit 9909e1a

Browse files
authored
Create extract-quiz-vocabulary.ts
1 parent 8fe6f5f commit 9909e1a

File tree

1 file changed

+278
-0
lines changed

1 file changed

+278
-0
lines changed

scripts/extract-quiz-vocabulary.ts

Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
import * as fs from "fs"
2+
import * as path from "path"
3+
4+
/** Difficulty labels a vocabulary entry can carry. */
type ExtractedDifficulty = "Beginner" | "Intermediate" | "Advanced"

/** Start/end offsets locating a word inside an example's `arabicText`. */
interface ExtractedWordLocation {
  startIndex: number
  endIndex: number
}

/** A single usage example attached to an extracted vocabulary entry. */
interface ExtractedWordExample {
  id: string
  surahNumber: number
  surahName: string
  ayahNumber: number
  arabicText: string
  translationText: string
  wordLocation: ExtractedWordLocation
  hasAudio: boolean
}

/**
 * Shape of one vocabulary entry produced by this script and serialized into
 * the generated vocabulary data file.
 */
interface ExtractedWord {
  id: string
  arabic: string
  transliteration: string
  meanings: string[]
  difficulty: ExtractedDifficulty
  category: string
  tags: string[]
  examples: ExtractedWordExample[]
  hasAudio: boolean
  // Optional metadata; may be left out of an entry entirely.
  frequency?: number
  rootLetters?: string
}
29+
30+
// Surah number / display name pairs known to this script:
// Al-Fatihah (1) plus surahs 92 through 114.
const KNOWN_SURAHS: ReadonlyArray<readonly [number, string]> = [
  [1, "Al-Fatihah"],
  [92, "Al-Layl"],
  [93, "Ad-Duha"],
  [94, "Ash-Sharh"],
  [95, "At-Tin"],
  [96, "Al-Alaq"],
  [97, "Al-Qadr"],
  [98, "Al-Bayyinah"],
  [99, "Az-Zalzalah"],
  [100, "Al-Adiyat"],
  [101, "Al-Qari'ah"],
  [102, "At-Takathur"],
  [103, "Al-Asr"],
  [104, "Al-Humazah"],
  [105, "Al-Fil"],
  [106, "Quraysh"],
  [107, "Al-Ma'un"],
  [108, "Al-Kawthar"],
  [109, "Al-Kafirun"],
  [110, "An-Nasr"],
  [111, "Al-Masad"],
  [112, "Al-Ikhlas"],
  [113, "Al-Falaq"],
  [114, "An-Nas"],
]

/**
 * Lookup table from surah number to its display name.
 * Numbers without an entry yield `undefined`.
 */
const SURAH_NAMES: { [key: number]: string } = {}
for (const [knownNumber, knownName] of KNOWN_SURAHS) {
  SURAH_NAMES[knownNumber] = knownName
}
56+
57+
/**
 * Reduces a snippet of text to bare Arabic letters and whitespace.
 *
 * Two passes are applied, then the result is trimmed:
 *  1. Vowel/annotation marks and the tatweel (U+0640) are removed.
 *  2. Every character outside the Arabic Unicode blocks (and whitespace) is removed.
 *
 * The mark class covers the standard harakat (U+064B–U+0652), the maddah
 * (U+0653), the extended marks U+0656–U+065F, the superscript alef (U+0670),
 * the honorific signs U+0610–U+061A and the Quranic annotation marks
 * U+06D6–U+06DC, U+06DF–U+06E4, U+06E7–U+06E8 and U+06EA–U+06ED.
 * Without the Quranic marks, text written with them (for example the
 * U+06E1 sukun) would keep its marks and never compare equal to the same
 * word written with standard diacritics.
 * Hamza above/below (U+0654, U+0655) are left in place on purpose because
 * they can stand for a consonant rather than a vowel mark.
 *
 * @param text Raw text that may mix Arabic with other characters.
 * @returns The cleaned text; an empty string when nothing Arabic remains.
 */
function cleanArabicText(text: string): string {
  const withoutMarks = text.replace(
    /[\u0610-\u061A\u064B-\u0653\u0656-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED\u0640]/g,
    "",
  )

  // Keep only characters from the Arabic blocks, plus whitespace.
  return withoutMarks
    .replace(/[^\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF\s]/g, "")
    .trim()
}
64+
65+
/**
 * Letter-by-letter Arabic → Latin mapping used by `generateTransliteration`.
 * Built once at module load instead of on every call.
 */
const ARABIC_TO_LATIN: ReadonlyMap<string, string> = new Map<string, string>([
  ["ا", "a"],
  ["ب", "b"],
  ["ت", "t"],
  ["ث", "th"],
  ["ج", "j"],
  ["ح", "h"],
  ["خ", "kh"],
  ["د", "d"],
  ["ذ", "dh"],
  ["ر", "r"],
  ["ز", "z"],
  ["س", "s"],
  ["ش", "sh"],
  ["ص", "s"],
  ["ض", "d"],
  ["ط", "t"],
  ["ظ", "z"],
  ["ع", "'"],
  ["غ", "gh"],
  ["ف", "f"],
  ["ق", "q"],
  ["ك", "k"],
  ["ل", "l"],
  ["م", "m"],
  ["ن", "n"],
  ["ه", "h"],
  ["و", "w"],
  ["ي", "y"],
  ["ة", "h"],
  ["ى", "a"],
  ["ء", "'"],
  // Hamza-carrier letters and alef wasla. Without these entries the letters
  // would pass through untouched and leave raw Arabic in the Latin output.
  ["\u0623", "a"], // alef with hamza above
  ["\u0625", "i"], // alef with hamza below
  ["\u0622", "aa"], // alef with madda
  ["\u0671", "a"], // alef wasla
  ["\u0624", "'"], // waw with hamza
  ["\u0626", "'"], // yeh with hamza
])

/**
 * Produces a rough Latin transliteration of Arabic text by replacing each
 * letter independently. Short vowels are not reconstructed, so the output
 * is a consonant skeleton (for example "allh"), not a pronunciation guide.
 *
 * @param arabic Arabic text, ideally already stripped of diacritics.
 * @returns The text with every mapped letter replaced; characters that have
 *          no mapping (spaces, punctuation, unmapped letters) are kept as-is.
 */
function generateTransliteration(arabic: string): string {
  return Array.from(arabic, (letter) => ARABIC_TO_LATIN.get(letter) ?? letter).join("")
}
106+
107+
/**
 * Small built-in glossary used by `generateBasicTranslation`.
 * A `Map` is used rather than a plain object so that lookups never reach
 * inherited properties such as `constructor` or `toString`.
 */
const BASIC_TRANSLATIONS: ReadonlyMap<string, string> = new Map<string, string>([
  ["الله", "Allah"],
  ["رب", "Lord"],
  ["الرحمن", "The Most Merciful"],
  ["الرحيم", "The Most Compassionate"],
  ["الحمد", "Praise"],
  ["العالمين", "of the worlds"],
  ["يوم", "Day"],
  ["الدين", "of Judgment"],
  ["إياك", "You alone"],
  ["نعبد", "we worship"],
  ["نستعين", "we seek help"],
  ["اهدنا", "Guide us"],
  ["الصراط", "the path"],
  ["المستقيم", "the straight"],
])

/**
 * Looks up a very rough English gloss for an Arabic word.
 * The glossary is tiny; results are meant to be reviewed by hand.
 *
 * @param arabic The exact word to look up (must match a glossary key character for character).
 * @returns The gloss when the word is known, otherwise the placeholder
 *          `[Translation needed for: <word>]`.
 */
function generateBasicTranslation(arabic: string): string {
  return BASIC_TRANSLATIONS.get(arabic) ?? `[Translation needed for: ${arabic}]`
}
128+
129+
/**
 * Buckets a word into a difficulty level using nothing but its length.
 *
 * @param arabic The word to grade; its `length` (UTF-16 code units) is the only input.
 * @returns "Beginner" for up to 3 units, "Intermediate" for 4–6, "Advanced" beyond that.
 */
function assignDifficulty(arabic: string): "Beginner" | "Intermediate" | "Advanced" {
  const unitCount = arabic.length

  if (unitCount > 6) {
    return "Advanced"
  }
  return unitCount > 3 ? "Intermediate" : "Beginner"
}
135+
136+
/**
 * Picks a coarse category for a word.
 *
 * Substring rules on the word are checked first, in order; only when none
 * matches does the surah number decide. Surah-based categories apply to
 * real surah numbers only (1–114), so an out-of-range number such as 0
 * falls through to the generic category.
 *
 * @param arabic The (cleaned) Arabic word or phrase.
 * @param surahNumber Number of the surah the word was found in.
 * @returns One of "Divine Names", "Prayer & Worship", "Afterlife",
 *          "Opening Chapters", "Short Chapters" or "Quranic Vocabulary".
 */
function assignCategory(arabic: string, surahNumber: number): string {
  const containsAny = (...fragments: string[]): boolean =>
    fragments.some((fragment) => arabic.includes(fragment))

  if (containsAny("الله")) return "Divine Names"
  if (containsAny("صل", "دع")) return "Prayer & Worship"
  if (containsAny("يوم", "آخر")) return "Afterlife"

  // Bounded on both sides so 0, negatives and numbers above 114 are not
  // given a surah-based label.
  if (surahNumber >= 1 && surahNumber <= 5) return "Opening Chapters"
  if (surahNumber >= 92 && surahNumber <= 114) return "Short Chapters"

  return "Quranic Vocabulary"
}
145+
146+
/**
 * Builds a vocabulary data file from the Surah quiz data files.
 *
 * Steps:
 *  1. List `<cwd>/data` and keep files named `surah-*-quiz-data.ts`, ordered
 *     by the number in the file name.
 *  2. For each file, collect every run of Arabic text, clean it, and skip
 *     anything shorter than two characters or already seen in this run.
 *  3. Turn each remaining text into an `ExtractedWord` with a generated
 *     transliteration, placeholder translation, difficulty and category.
 *     `ayahNumber` and `frequency` are fixed placeholders (1) and need
 *     manual correction.
 *  4. Write everything to `<cwd>/data/vocabulary-data-quiz-extracted.ts`
 *     and print statistics.
 *
 * A file that cannot be read or processed is logged and skipped; the run
 * continues and the final summary lists the files that failed instead of
 * reporting unconditional success.
 *
 * @returns A promise that resolves once the output file has been written.
 * @throws If the data directory cannot be listed or the output file cannot
 *         be written (surfaced as a rejected promise).
 */
async function extractVocabularyFromQuizzes(): Promise<void> {
  console.log("🚀 Starting Surah Quiz Vocabulary Extraction\n")

  const dataDir = path.join(process.cwd(), "data")
  const files = fs.readdirSync(dataDir)

  // Number embedded in a quiz file name, used only for ordering; 0 when absent.
  const sortKey = (fileName: string): number =>
    Number.parseInt(fileName.match(/surah-(\d+)-/)?.[1] || "0", 10)

  const surahQuizFiles = files
    .filter((file) => file.startsWith("surah-") && file.endsWith("-quiz-data.ts"))
    .sort((a, b) => sortKey(a) - sortKey(b))

  console.log(`Processing ${surahQuizFiles.length} quiz files...`)

  const extractedWords: ExtractedWord[] = []
  const seenWords = new Set<string>()
  const failedFiles: string[] = []

  let totalWordsFound = 0
  let beginnerCount = 0
  let intermediateCount = 0
  let advancedCount = 0
  const categoryCount: { [key: string]: number } = {}

  for (const file of surahQuizFiles) {
    try {
      const filePath = path.join(dataDir, file)
      const content = fs.readFileSync(filePath, "utf-8")

      // Extract Surah number from filename (0 when the name does not match exactly)
      const surahMatch = file.match(/surah-(\d+)-quiz-data\.ts/)
      const surahNumber = surahMatch ? Number.parseInt(surahMatch[1], 10) : 0
      const surahName = SURAH_NAMES[surahNumber] || `Surah ${surahNumber}`

      console.log(`Processing ${file} (Surah ${surahNumber}: ${surahName})...`)

      // One match per run of Arabic tokens separated only by whitespace,
      // so a match may be a single word or a multi-word phrase.
      const arabicMatches =
        content.match(
          /[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+(?:\s+[\u0600-\u06FF\u0750-\u077F\u08A0-\u08FF\uFB50-\uFDFF\uFE70-\uFEFF]+)*/g,
        ) || []

      console.log(` Found ${arabicMatches.length} Arabic texts`)

      for (const arabicText of arabicMatches) {
        const cleanedArabic = cleanArabicText(arabicText)

        if (cleanedArabic.length < 2 || seenWords.has(cleanedArabic)) {
          continue // Skip very short words or duplicates
        }

        seenWords.add(cleanedArabic)

        const transliteration = generateTransliteration(cleanedArabic)
        const translation = generateBasicTranslation(cleanedArabic)
        const difficulty = assignDifficulty(cleanedArabic)
        const category = assignCategory(cleanedArabic, surahNumber)

        // Count by difficulty
        if (difficulty === "Beginner") beginnerCount++
        else if (difficulty === "Intermediate") intermediateCount++
        else advancedCount++

        // Count by category
        categoryCount[category] = (categoryCount[category] || 0) + 1

        // Ids are 1-based and follow extraction order.
        const sequence = totalWordsFound + 1

        const word: ExtractedWord = {
          id: `quiz-extracted-${sequence}`,
          arabic: cleanedArabic,
          transliteration,
          meanings: [translation],
          difficulty,
          category,
          tags: ["quiz-extracted", `surah-${surahNumber}`, category.toLowerCase().replace(/\s+/g, "-")],
          examples: [
            {
              id: `example-${sequence}`,
              surahNumber,
              surahName,
              ayahNumber: 1, // Default, should be manually corrected
              arabicText: cleanedArabic,
              translationText: translation,
              wordLocation: {
                startIndex: 0,
                endIndex: cleanedArabic.length,
              },
              hasAudio: false,
            },
          ],
          hasAudio: false,
          frequency: 1,
        }

        extractedWords.push(word)
        totalWordsFound++
      }
    } catch (error) {
      // Best effort: remember the failure, keep going with the other files.
      failedFiles.push(file)
      console.error(`Error processing ${file}:`, error)
    }
  }

  // Generate the vocabulary data file
  const outputPath = path.join(dataDir, "vocabulary-data-quiz-extracted.ts")
  const fileContent = `// Auto-generated vocabulary from Surah quiz data
// Generated on: ${new Date().toISOString()}
// Total words: ${totalWordsFound}

import { VocabularyWord, Difficulty } from "../types/vocabulary"

export const quizExtractedVocabulary: VocabularyWord[] = ${JSON.stringify(extractedWords, null, 2)}
`

  fs.writeFileSync(outputPath, fileContent, "utf-8")

  console.log(`\n📁 Generated vocabulary file: ${outputPath}`)
  console.log(`📊 Statistics:`)
  console.log(` - Total words extracted: ${totalWordsFound}`)
  console.log(` - Beginner: ${beginnerCount}`)
  console.log(` - Intermediate: ${intermediateCount}`)
  console.log(` - Advanced: ${advancedCount}`)

  console.log(`\n📚 Categories:`)
  for (const [category, count] of Object.entries(categoryCount)) {
    console.log(` - ${category}: ${count}`)
  }

  if (failedFiles.length > 0) {
    // Do not claim success when some inputs were skipped.
    console.warn(
      `\n⚠️ Vocabulary extraction finished, but ${failedFiles.length} file(s) could not be processed: ${failedFiles.join(", ")}`,
    )
  } else {
    console.log(`\n✅ Vocabulary extraction completed successfully!`)
  }
  console.log(` The new words will be automatically included in your dictionary.`)
  console.log(` You may want to review and improve the translations manually.`)
}
277+
278+
// Script entry point: run the extraction. On failure, log the error and mark
// the process as failed so shells and CI can detect it (a bare
// `.catch(console.error)` would still exit with status 0).
extractVocabularyFromQuizzes().catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})

0 commit comments

Comments
 (0)