Skip to content

Commit

Permalink
fix(contentlayer): Improve Topics Processing and Generation (#61)
Browse files Browse the repository at this point in the history
* fix(contentlayer): topics counts generation

- fix issues with generation of topics
- streamline the code by grouping the calculations and generation into a single method
- more meaningful names for json outputs

* refactor(contentlayer): simplify topics processing and clarify terminology

- change terminology to refer to "topics" instead of "categories"
- simplify data processing to do a single pass and use a single structure
- consolidate JSON generation into cleaner functions

* refactor(contentlayer): use our own topics index

- start using our own topics index instead of Optech's
- add a different display option for miscellaneous topics within categories
  • Loading branch information
kouloumos authored Dec 4, 2024
1 parent 6bbe171 commit 5907312
Show file tree
Hide file tree
Showing 7 changed files with 100 additions and 113 deletions.
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
/build
/public/categories.json
/public/types-data.json
/public/tag-data.json
/public/topics-by-category-counts.json
/public/aliases.json
/public/topics-data.json
/public/topics-counts.json
/public/speaker-data.json
/public/source-count-data.json
/public/sources-data.json
Expand Down
190 changes: 89 additions & 101 deletions contentlayer.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const Resources = defineNestedType(() => ({
url: { type: "string" },
},
}));
export interface CategoryInfo {
export interface Topic {
title: string;
slug: string;
optech_url: string;
Expand All @@ -28,6 +28,14 @@ export interface CategoryInfo {
excerpt: string;
}

/**
 * The full processed topic we use internally.
 *
 * `buildTopicsMap` stores one of these per distinct tag slug found in the
 * transcripts; `generateAlphabeticalList` and `generateCategorizedList` read
 * them to produce the JSON outputs.
 */
interface ProcessedTopic {
name: string; // Display name (from topic.title or original tag)
slug: string; // Slugified identifier
count: number; // Number of occurrences
categories: string[]; // List of categories it belongs to (["Miscellaneous"] when none are known for the tag)
}

interface TagInfo {
name: string;
slug: string;
Expand All @@ -38,25 +46,6 @@ interface ContentTree {
[key: string]: ContentTree | ContentTranscriptType[];
}

/**
 * Tally how often each tag appears across the given transcripts.
 *
 * Tags are keyed by the slug returned from `createSlug`, so every spelling
 * that maps to the same slug shares one counter. Transcripts without tags
 * are ignored. The tally is only returned; nothing is written to disk here.
 *
 * @param allTranscripts - transcripts whose `tags` should be counted
 * @returns an object whose `tagCounts` maps each tag slug to its number of occurrences
 */
function createTagCount(allTranscripts: ContentTranscriptType[]): {
  tagCounts: Record<string, number>;
} {
  const tagCounts: Record<string, number> = {};

  allTranscripts.forEach((transcript) => {
    const tags = transcript.tags;
    if (!tags) return;

    for (const rawTag of tags) {
      const key = createSlug(rawTag);
      const seenSoFar = tagCounts[key] || 0;
      tagCounts[key] = seenSoFar + 1;
    }
  });

  return { tagCounts };
}

const getTranscriptAliases = (allTranscripts: ContentTranscriptType[]) => {
const aliases: Record<string, string> = {};
Expand All @@ -74,105 +63,106 @@ const getTranscriptAliases = (allTranscripts: ContentTranscriptType[]) => {
fs.writeFileSync("./public/aliases.json", JSON.stringify(aliases));
};

const getCategories = () => {
const filePath = path.join(process.cwd(), "public", "categories.json");
const getTopics = () => {
const filePath = path.join(process.cwd(), "public", "topics.json");
const fileContents = fs.readFileSync(filePath, "utf8");
return JSON.parse(fileContents);
};

function organizeTags(transcripts: ContentTranscriptType[]) {
const categories: CategoryInfo[] = getCategories();
const { tagCounts } = createTagCount(transcripts);
function buildTopicsMap(transcripts: ContentTranscriptType[], topics: Topic[]): Map<string, ProcessedTopic> {
// Create topics lookup map (includes aliases)
const topicsLookup = new Map<string, Topic>();
topics.forEach(topic => {
topicsLookup.set(topic.slug, topic);
topic.aliases?.forEach(alias => topicsLookup.set(alias, topic));
});

const tagsByCategory: { [category: string]: TagInfo[] } = {};
const tagsWithoutCategory = new Set<string>();
const categorizedTags = new Set<string>();
// Build the main topics map
const processedTopics = new Map<string, ProcessedTopic>();

// Create a map for faster category lookup
const categoryMap = new Map<string, CategoryInfo>();
// Process all transcripts
transcripts.forEach(transcript => {
transcript.tags?.forEach(tag => {
const slug = createSlug(tag);
const topic = topicsLookup.get(slug);

categories.forEach((cat) => {
cat.categories.forEach((category) => {
if (!tagsByCategory[category]) {
tagsByCategory[category] = [];
if (!processedTopics.has(slug)) {
processedTopics.set(slug, {
name: topic?.title || tag,
slug,
count: 1,
categories: topic?.categories || ["Miscellaneous"],
});
} else {
const processed = processedTopics.get(slug)!;
processed.count += 1;
}
});
categoryMap.set(createSlug(cat.slug), cat);
cat.aliases?.forEach((alias) => categoryMap.set(alias, cat));
});

// Process all tags at once
const allTags = new Set(
transcripts.flatMap(
(transcript) => transcript.tags?.map((tag) => tag) || []
)
);

allTags.forEach((tag) => {
const catInfo = categoryMap.get(tag);
if (catInfo) {
catInfo.categories.forEach((category) => {
if (!tagsByCategory[category].some((t) => t.slug === tag)) {
tagsByCategory[category].push({
name: catInfo.title,
slug: tag,
count: tagCounts[tag] || 0,
});
}
});
categorizedTags.add(tag);
} else {
tagsWithoutCategory.add(tag);
}
});
return processedTopics;
}

// Add "Miscellaneous" category with remaining uncategorized tags
if (tagsWithoutCategory.size > 0) {
tagsByCategory["Miscellaneous"] = Array.from(tagsWithoutCategory).map(
(tag) => ({
name: tag,
slug: tag,
count: tagCounts[tag] || 0,
})
);
/**
 * Flatten the processed topics into a single list ordered alphabetically by
 * display name.
 *
 * @param processedTopics - topics keyed by slug
 * @returns one `{ name, slug, count }` entry per topic, sorted by `name`
 *          using `localeCompare`
 */
function generateAlphabeticalList(processedTopics: Map<string, ProcessedTopic>): TopicsData[] {
  // `categories` is irrelevant for the flat listing, so only name/slug/count are copied.
  const flatList: TopicsData[] = Array.from(processedTopics.values(), (topic) => ({
    name: topic.name,
    slug: topic.slug,
    count: topic.count,
  }));

  flatList.sort((left, right) => left.name.localeCompare(right.name));
  return flatList;
}

// Sort tags alphabetically within each category
Object.keys(tagsByCategory).forEach((category) => {
tagsByCategory[category].sort((a, b) => a.name.localeCompare(b.name));
/**
 * Group the processed topics by category.
 *
 * A topic is listed once under every category in its `categories` array.
 * Within a category, a topic whose name contains the category name and ends
 * with "(Miscellaneous)" is displayed simply as "Miscellaneous".
 *
 * Each category's list is sorted alphabetically by display name, except that
 * entries displayed as "Miscellaneous" are always placed last.
 *
 * @param processedTopics - topics keyed by slug
 * @returns a record mapping each category name to its sorted
 *          `{ name, slug, count }` entries
 */
function generateCategorizedList(processedTopics: Map<string, ProcessedTopic>): Record<string, TopicsData[]> {
  const categorizedTopics: Record<string, TopicsData[]> = {};

  for (const { name, slug, count, categories } of processedTopics.values()) {
    for (const category of categories) {
      if (!categorizedTopics[category]) {
        categorizedTopics[category] = [];
      }

      // Catch-all topics such as "<Category> (Miscellaneous)" are shown as
      // plain "Miscellaneous" inside their own category.
      const displayName =
        name.includes(category) && name.endsWith("(Miscellaneous)")
          ? "Miscellaneous"
          : name;

      categorizedTopics[category].push({ name: displayName, slug, count });
    }
  }

  // Sort topics within each category, keeping "Miscellaneous" at the end.
  for (const topics of Object.values(categorizedTopics)) {
    topics.sort((a, b) => {
      const aIsMisc = a.name === "Miscellaneous";
      const bIsMisc = b.name === "Miscellaneous";
      // Only force an order when exactly one side is "Miscellaneous"; when
      // both are, fall through so the comparator returns 0 and stays consistent.
      if (aIsMisc !== bIsMisc) {
        return aIsMisc ? 1 : -1;
      }
      return a.name.localeCompare(b.name);
    });
  }

  return categorizedTopics;
}

function organizeTopics(transcripts: ContentTranscriptType[]) {
const slugTopics: any = {};
const topicsArray: TopicsData[] = [];
function generateTopicsCounts(transcripts: ContentTranscriptType[]) {
// Get topics
const topics = getTopics();

transcripts.forEach((transcript) => {
const slugTags = transcript.tags?.map((tag) => ({
slug: createSlug(tag),
name: tag,
}));
// Build the primary data structure
const processedTopics = buildTopicsMap(transcripts, topics);

slugTags?.forEach(({ slug, name }) => {
if (slugTopics[slug] !== undefined) {
const index = slugTopics[slug];
topicsArray[index].count += 1;
} else {
const topicsLength = topicsArray.length;
slugTopics[slug] = topicsLength;
topicsArray[topicsLength] = {
slug,
name,
count: 1,
};
}
});
});
// Generate both output formats
const alphabeticalList = generateAlphabeticalList(processedTopics);
const categorizedList = generateCategorizedList(processedTopics);

fs.writeFileSync("./public/topics-data.json", JSON.stringify(topicsArray));
// Write output files
fs.writeFileSync(
"./public/topics-counts.json",
JSON.stringify(alphabeticalList, null, 2)
);

fs.writeFileSync(
"./public/topics-by-category-counts.json",
JSON.stringify(categorizedList, null, 2)
);
}

function createSpeakers(transcripts: ContentTranscriptType[]) {
Expand Down Expand Up @@ -468,13 +458,11 @@ export default makeSource({
"STYLE.md",
"twitter_handles.json",
".json",
"2018-08-17-richard-bondi-bitcoin-cli-regtest.es.md",
],
onSuccess: async (importData) => {
const { allTranscripts, allSources } = await importData();
organizeTags(allTranscripts);
generateTopicsCounts(allTranscripts);
createTypesCount(allTranscripts, allSources);
organizeTopics(allTranscripts);
getTranscriptAliases(allTranscripts);
createSpeakers(allTranscripts);
generateSourcesCount(allTranscripts, allSources);
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"private": true,
"scripts": {
"dev": "next dev",
"fetch-categories": "node scripts/fetchCategories.js",
"fetch-topics": "node scripts/fetchTopics.js",
"submodules:update": "git submodule update --init && git submodule update --remote",
"build": "npm run submodules:update && npm run fetch-categories && next build",
"build": "npm run submodules:update && npm run fetch-topics && next build",
"start": "next start",
"lint": "next lint"
},
Expand Down
8 changes: 4 additions & 4 deletions scripts/fetchCategories.js → scripts/fetchTopics.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ const fs = require("fs");
const path = require("path");
const https = require("https");

const url = "https://bitcoinops.org/topics.json";
const outputPath = path.join(__dirname, "..", "public", "categories.json");
const url = "https://raw.githubusercontent.com/bitcoinsearch/topics-index/refs/heads/main/topics.json";
const outputPath = path.join(__dirname, "..", "public", "topics.json");

https
.get(url, (res) => {
Expand All @@ -15,9 +15,9 @@ https

res.on("end", () => {
fs.writeFileSync(outputPath, data);
console.log("Categories data has been fetched and saved to public folder.");
console.log("Topics data has been fetched and saved to public folder.");
});
})
.on("error", (err) => {
console.error("Error fetching categories:", err.message);
console.error("Error fetching topics:", err.message);
});
2 changes: 1 addition & 1 deletion src/app/(explore)/categories/page.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import React from "react";
import TranscriptContentPage from "@/components/explore/TranscriptContentPage";
import allCategoriesTopic from "@/public/tag-data.json";
import allCategoriesTopic from "@/public/topics-by-category-counts.json";

const CategoriesPage = () => {

Expand Down
3 changes: 1 addition & 2 deletions src/app/(explore)/topics/page.tsx
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import React from "react";
import TranscriptContentPage from "@/components/explore/TranscriptContentPage";
import allTopics from "@/public/topics-data.json";
import allTopics from "@/public/topics-counts.json";

const TopicsPage = () => {

return (
<div className="flex flex-col text-black">
<TranscriptContentPage header="Topics" data={allTopics} description="Bitcoin is made up of an endless amount of topics, and there’s no shortage of rabbit holes to go down. " type="alphabet" linkName="tags"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import Wrapper from "@/components/layout/Wrapper";
import ExploreTranscriptClient from "./ExploreTranscriptClient";

function getTags() {
const filePath = path.join(process.cwd(), "public", "tag-data.json");
const filePath = path.join(process.cwd(), "public", "topics-by-category-counts.json");
const fileContents = fs.readFileSync(filePath, "utf8");
return JSON.parse(fileContents);
}
Expand Down

0 comments on commit 5907312

Please sign in to comment.