Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(contentlayer): Improve Topics Processing and Generation #61

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@
/build
/public/categories.json
/public/types-data.json
/public/tag-data.json
/public/topics-by-category-counts.json
/public/aliases.json
/public/topics-data.json
/public/topics-counts.json
/public/speaker-data.json
/public/source-count-data.json
/public/sources-data.json
Expand Down
190 changes: 89 additions & 101 deletions contentlayer.config.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ const Resources = defineNestedType(() => ({
url: { type: "string" },
},
}));
export interface CategoryInfo {
export interface Topic {
title: string;
slug: string;
optech_url: string;
Expand All @@ -28,6 +28,14 @@ export interface CategoryInfo {
excerpt: string;
}

// The full processed topic we use internally.
// buildTopicsMap keeps one of these records per unique tag slug while it
// scans the transcripts.
interface ProcessedTopic {
name: string; // Display name: the matched topic's title, falling back to the original tag text
slug: string; // Slugified identifier (createSlug applied to the tag); also the map key
count: number; // Number of occurrences of this slug across all transcript tags
categories: string[]; // Categories of the matched topic; ["Miscellaneous"] when no topic matches
}

interface TagInfo {
name: string;
slug: string;
Expand All @@ -38,25 +46,6 @@ interface ContentTree {
[key: string]: ContentTree | ContentTranscriptType[];
}

/**
 * Count how many times each tag occurs across all transcripts.
 *
 * Tags are normalised with `createSlug` before counting, so spellings that
 * slugify to the same value share one counter. This function only computes
 * the counts; it does not write anything to disk.
 *
 * @param allTranscripts - Transcripts to scan; entries without `tags` are skipped.
 * @returns An object whose `tagCounts` maps each tag slug to its number of occurrences.
 */
function createTagCount(allTranscripts: ContentTranscriptType[]): {
  tagCounts: Record<string, number>;
} {
  // Accumulate in a Map rather than a plain `{}`: an object literal inherits
  // keys such as "constructor" or "toString", so the first lookup for a slug
  // with one of those names would be truthy and turn the counter into a string.
  const counts = new Map<string, number>();

  for (const file of allTranscripts) {
    if (!file.tags) continue;

    for (const tag of file.tags) {
      const slug = createSlug(tag);
      counts.set(slug, (counts.get(slug) ?? 0) + 1);
    }
  }

  // Object.fromEntries defines own data properties, so every slug ends up as
  // a regular, serialisable key of the returned record.
  return { tagCounts: Object.fromEntries(counts) };
}

const getTranscriptAliases = (allTranscripts: ContentTranscriptType[]) => {
const aliases: Record<string, string> = {};
Expand All @@ -74,105 +63,106 @@ const getTranscriptAliases = (allTranscripts: ContentTranscriptType[]) => {
fs.writeFileSync("./public/aliases.json", JSON.stringify(aliases));
};

const getCategories = () => {
const filePath = path.join(process.cwd(), "public", "categories.json");
const getTopics = () => {
const filePath = path.join(process.cwd(), "public", "topics.json");
const fileContents = fs.readFileSync(filePath, "utf8");
return JSON.parse(fileContents);
};

function organizeTags(transcripts: ContentTranscriptType[]) {
const categories: CategoryInfo[] = getCategories();
const { tagCounts } = createTagCount(transcripts);
function buildTopicsMap(transcripts: ContentTranscriptType[], topics: Topic[]): Map<string, ProcessedTopic> {
// Create topics lookup map (includes aliases)
const topicsLookup = new Map<string, Topic>();
topics.forEach(topic => {
topicsLookup.set(topic.slug, topic);
topic.aliases?.forEach(alias => topicsLookup.set(alias, topic));
});

const tagsByCategory: { [category: string]: TagInfo[] } = {};
const tagsWithoutCategory = new Set<string>();
const categorizedTags = new Set<string>();
// Build the main topics map
const processedTopics = new Map<string, ProcessedTopic>();

// Create a map for faster category lookup
const categoryMap = new Map<string, CategoryInfo>();
// Process all transcripts
transcripts.forEach(transcript => {
transcript.tags?.forEach(tag => {
const slug = createSlug(tag);
const topic = topicsLookup.get(slug);

categories.forEach((cat) => {
cat.categories.forEach((category) => {
if (!tagsByCategory[category]) {
tagsByCategory[category] = [];
if (!processedTopics.has(slug)) {
processedTopics.set(slug, {
name: topic?.title || tag,
slug,
count: 1,
categories: topic?.categories || ["Miscellaneous"],
});
} else {
const processed = processedTopics.get(slug)!;
processed.count += 1;
}
});
categoryMap.set(createSlug(cat.slug), cat);
cat.aliases?.forEach((alias) => categoryMap.set(alias, cat));
});

// Process all tags at once
const allTags = new Set(
transcripts.flatMap(
(transcript) => transcript.tags?.map((tag) => tag) || []
)
);

allTags.forEach((tag) => {
const catInfo = categoryMap.get(tag);
if (catInfo) {
catInfo.categories.forEach((category) => {
if (!tagsByCategory[category].some((t) => t.slug === tag)) {
tagsByCategory[category].push({
name: catInfo.title,
slug: tag,
count: tagCounts[tag] || 0,
});
}
});
categorizedTags.add(tag);
} else {
tagsWithoutCategory.add(tag);
}
});
return processedTopics;
}

// Add "Miscellaneous" category with remaining uncategorized tags
if (tagsWithoutCategory.size > 0) {
tagsByCategory["Miscellaneous"] = Array.from(tagsWithoutCategory).map(
(tag) => ({
name: tag,
slug: tag,
count: tagCounts[tag] || 0,
})
);
/**
 * Flatten the processed-topics map into a list ordered by display name.
 *
 * @param processedTopics - Map of processed topics; it is read, never modified.
 * @returns A new array of `{ name, slug, count }` entries sorted with
 *   `localeCompare` on `name`.
 */
function generateAlphabeticalList(processedTopics: Map<string, ProcessedTopic>): TopicsData[] {
  // Project every entry onto the three fields this list exposes;
  // `categories` is deliberately left out.
  const entries = Array.from(
    processedTopics.values(),
    (topic): TopicsData => ({ name: topic.name, slug: topic.slug, count: topic.count })
  );

  entries.sort((left, right) => left.name.localeCompare(right.name));
  return entries;
}

// Sort tags alphabetically within each category
Object.keys(tagsByCategory).forEach((category) => {
tagsByCategory[category].sort((a, b) => a.name.localeCompare(b.name));
function generateCategorizedList(processedTopics: Map<string, ProcessedTopic>): Record<string, TopicsData[]> {
const categorizedTopics: Record<string, TopicsData[]> = {};

Array.from(processedTopics.values()).forEach(({ name, slug, count, categories }) => {
categories.forEach(category => {
if (!categorizedTopics[category]) {
categorizedTopics[category] = [];
}

// Check if topic name contains category name and ends with "(Miscellaneous)"
const modifiedName = name.includes(category) && name.endsWith("(Miscellaneous)")
? "Miscellaneous"
: name;

categorizedTopics[category].push({ name: modifiedName, slug, count });
});
});

// Sort topics within each category
Object.values(categorizedTopics).forEach(topics => {
topics.sort((a, b) => {
if (a.name == "Miscellaneous") return 1;
if (b.name == "Miscellaneous") return -1;
return a.name.localeCompare(b.name)
});
});

fs.writeFileSync("./public/tag-data.json", JSON.stringify(tagsByCategory));
return { tagsByCategory, tagsWithoutCategory };
return categorizedTopics;
}

function organizeTopics(transcripts: ContentTranscriptType[]) {
const slugTopics: any = {};
const topicsArray: TopicsData[] = [];
function generateTopicsCounts(transcripts: ContentTranscriptType[]) {
// Get topics
const topics = getTopics();

transcripts.forEach((transcript) => {
const slugTags = transcript.tags?.map((tag) => ({
slug: createSlug(tag),
name: tag,
}));
// Build the primary data structure
const processedTopics = buildTopicsMap(transcripts, topics);

slugTags?.forEach(({ slug, name }) => {
if (slugTopics[slug] !== undefined) {
const index = slugTopics[slug];
topicsArray[index].count += 1;
} else {
const topicsLength = topicsArray.length;
slugTopics[slug] = topicsLength;
topicsArray[topicsLength] = {
slug,
name,
count: 1,
};
}
});
});
// Generate both output formats
const alphabeticalList = generateAlphabeticalList(processedTopics);
const categorizedList = generateCategorizedList(processedTopics);

fs.writeFileSync("./public/topics-data.json", JSON.stringify(topicsArray));
// Write output files
fs.writeFileSync(
"./public/topics-counts.json",
JSON.stringify(alphabeticalList, null, 2)
);

fs.writeFileSync(
"./public/topics-by-category-counts.json",
JSON.stringify(categorizedList, null, 2)
);
}

function createSpeakers(transcripts: ContentTranscriptType[]) {
Expand Down Expand Up @@ -468,13 +458,11 @@ export default makeSource({
"STYLE.md",
"twitter_handles.json",
".json",
"2018-08-17-richard-bondi-bitcoin-cli-regtest.es.md",
],
onSuccess: async (importData) => {
const { allTranscripts, allSources } = await importData();
organizeTags(allTranscripts);
generateTopicsCounts(allTranscripts);
createTypesCount(allTranscripts, allSources);
organizeTopics(allTranscripts);
getTranscriptAliases(allTranscripts);
createSpeakers(allTranscripts);
generateSourcesCount(allTranscripts, allSources);
Expand Down
4 changes: 2 additions & 2 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,9 @@
"private": true,
"scripts": {
"dev": "next dev",
"fetch-categories": "node scripts/fetchCategories.js",
"fetch-topics": "node scripts/fetchTopics.js",
"submodules:update": "git submodule update --init && git submodule update --remote",
"build": "npm run submodules:update && npm run fetch-categories && next build",
"build": "npm run submodules:update && npm run fetch-topics && next build",
"start": "next start",
"lint": "next lint"
},
Expand Down
8 changes: 4 additions & 4 deletions scripts/fetchCategories.js → scripts/fetchTopics.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ const fs = require("fs");
const path = require("path");
const https = require("https");

const url = "https://bitcoinops.org/topics.json";
const outputPath = path.join(__dirname, "..", "public", "categories.json");
const url = "https://raw.githubusercontent.com/bitcoinsearch/topics-index/refs/heads/main/topics.json";
const outputPath = path.join(__dirname, "..", "public", "topics.json");

https
.get(url, (res) => {
Expand All @@ -15,9 +15,9 @@ https

res.on("end", () => {
fs.writeFileSync(outputPath, data);
console.log("Categories data has been fetched and saved to public folder.");
console.log("Topics data has been fetched and saved to public folder.");
});
})
.on("error", (err) => {
console.error("Error fetching categories:", err.message);
console.error("Error fetching topics:", err.message);
});
2 changes: 1 addition & 1 deletion src/app/(explore)/categories/page.tsx
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import React from "react";
import TranscriptContentPage from "@/components/explore/TranscriptContentPage";
import allCategoriesTopic from "@/public/tag-data.json";
import allCategoriesTopic from "@/public/topics-by-category-counts.json";

const CategoriesPage = () => {

Expand Down
3 changes: 1 addition & 2 deletions src/app/(explore)/topics/page.tsx
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
import React from "react";
import TranscriptContentPage from "@/components/explore/TranscriptContentPage";
import allTopics from "@/public/topics-data.json";
import allTopics from "@/public/topics-counts.json";

const TopicsPage = () => {

return (
<div className="flex flex-col text-black">
<TranscriptContentPage header="Topics" data={allTopics} description="Bitcoin is made up of an endless amount of topics, and there’s no shortage of rabbit holes to go down. " type="alphabet" linkName="tags"/>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import Wrapper from "@/components/layout/Wrapper";
import ExploreTranscriptClient from "./ExploreTranscriptClient";

function getTags() {
const filePath = path.join(process.cwd(), "public", "tag-data.json");
const filePath = path.join(process.cwd(), "public", "topics-by-category-counts.json");
const fileContents = fs.readFileSync(filePath, "utf8");
return JSON.parse(fileContents);
}
Expand Down