Skip to content

Commit

Permalink
Generate islands of related variant characters, solve issues #5 and #6
Browse files Browse the repository at this point in the history
  • Loading branch information
Transfusion committed Jan 30, 2021
1 parent 3427c5a commit 19d0d1b
Show file tree
Hide file tree
Showing 6 changed files with 405 additions and 4 deletions.
3 changes: 2 additions & 1 deletion etl/constants.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
export const UNIHAN_ZIP_URL = 'https://unicode.org/Public/UNIDATA/Unihan.zip';
export const UNIHAN_ZIP_URL = "https://unicode.org/Public/UNIDATA/Unihan.zip";

// export const UNIHAN_ZIP_URL = "http://localhost:8000/Unihan.zip";

Expand Down Expand Up @@ -30,3 +30,4 @@ export const UNICODE_IDS_RESOLVED_PREFIXES = [
];

export const CJKVI_VARIANTS_SUBDIR_NAME = "cjkvi-variants";
export const MANUAL_CJKVI_VARIANTS_SUBDIR_NAME = "manual-cjkvi-variants";
80 changes: 79 additions & 1 deletion etl/genVariants.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,9 @@
import { addJapaneseShinKyu } from "./genVariants";
import {
addJapaneseShinKyu,
generateVariantIslands,
islandsToObject,
expandVariantIslands,
} from "./genVariants";
const IVS = require("ivs");

test("all ivs sequences stripped", (done) => {
Expand All @@ -19,3 +24,76 @@ test("all ivs sequences stripped", (done) => {
done();
});
});

// generateVariantIslands should emit one island per connected group of
// characters, listing members in depth-first discovery order.
test("generate variants islands", () => {
  const adjacency = {
    // first connected group: D - A - B - C - E
    A: new Set(["B", "D"]),
    D: new Set(["A"]),
    B: new Set(["A", "C"]),
    C: new Set(["B", "E"]),
    E: new Set(["C"]),
    // second connected group: F - G - {H, I - J}
    F: new Set(["G"]),
    G: new Set(["F", "H", "I"]),
    H: new Set(["G"]),
    I: new Set(["G", "J"]),
    J: new Set(["I"]),
  };

  const expectedIslands = [
    ["A", "B", "C", "E", "D"],
    ["F", "G", "H", "I", "J"],
  ];

  expect(generateVariantIslands(adjacency)).toEqual(expectedIslands);
});

// islandsToObject should keep the islands array and add a reverse index
// mapping each character to the indexes of the islands that contain it.
test("generate variants islands lookup", () => {
  const islands = [
    ["A", "B", "C", "E", "D"],
    ["F", "G", "H", "I", "J"],
  ];

  // Every member of the first island maps to [0], of the second to [1].
  const expectedChars = {
    A: [0],
    B: [0],
    C: [0],
    D: [0],
    E: [0],
    F: [1],
    G: [1],
    H: [1],
    I: [1],
    J: [1],
  };

  const lookup = islandsToObject(islands);
  expect(lookup).toEqual({ islands, chars: expectedChars });
});

// expandVariantIslands should append to each island the direct neighbours
// of its members that are not already part of that island.
test("expand variants islands", () => {
  const variantsByChar = {
    // members of the first island; D additionally points at Y
    A: new Set(["B", "D"]),
    D: new Set(["A", "Y"]),
    B: new Set(["A", "C"]),
    C: new Set(["B", "E"]),
    E: new Set(["C"]),
    // members of the second island; J additionally points at Z
    F: new Set(["G"]),
    G: new Set(["F", "H", "I"]),
    H: new Set(["G"]),
    I: new Set(["G", "J"]),
    J: new Set(["I", "Z"]),
  };

  const baseIslands = [
    ["A", "B", "C", "E", "D"],
    ["F", "G", "H", "I", "J"],
  ];

  const expanded = expandVariantIslands(variantsByChar, baseIslands);

  // Y and Z are new to their islands, so they land at the end.
  expect(expanded).toEqual([
    ["A", "B", "C", "E", "D", "Y"],
    ["F", "G", "H", "I", "J", "Z"],
  ]);
});
97 changes: 97 additions & 0 deletions etl/genVariants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import {
getJPOldStyleData,
getCommonTraditionalCharacters,
getCommonSimplifiedCharacters,
getAllVariantsPerCharacter,
getOrthographicVariantsPerCharacter,
} from "./variants-fetcher";

import { Hanja as everydayHanja1800 } from "./hanja-for-everyday-use-1800.json";
Expand Down Expand Up @@ -199,12 +201,107 @@ const addRadicals = (map: VariantsMap) => {
}
};

/**
 * Partitions the characters of an adjacency map into "islands": groups of
 * characters reachable from one another by following variant links.
 *
 * Keys are processed in `Object.keys` order and each island lists its members
 * in depth-first pre-order, with neighbours visited in Set iteration order.
 *
 * The traversal uses an explicit stack instead of recursion so that a very
 * long chain of variants cannot overflow the call stack. A neighbour that has
 * no entry of its own in `inputMap` is kept as a member of the island and
 * treated as having no further neighbours (previously this threw a TypeError).
 *
 * NOTE(review): links are followed only in the direction they are stored, so
 * the result presumably relies on `inputMap` being symmetric — confirm
 * against the code that produces the map.
 *
 * @param inputMap adjacency map: character -> set of directly related characters
 * @returns one array of characters per island; every key of `inputMap`
 *          appears in exactly one island
 */
export const generateVariantIslands = (inputMap: {
  [key: string]: Set<string>;
}): string[][] => {
  const visitedChars = new Set<string>();
  const res: string[][] = [];

  for (const start of Object.keys(inputMap)) {
    if (visitedChars.has(start)) continue;

    const island: string[] = [];
    const pending: string[] = [start];

    while (pending.length > 0) {
      const char = pending.pop();
      // A character can be queued several times before it is first visited.
      if (char === undefined || visitedChars.has(char)) continue;
      visitedChars.add(char);
      island.push(char);

      // Characters that are only ever mentioned as a neighbour are leaves.
      if (!Object.prototype.hasOwnProperty.call(inputMap, char)) continue;

      // Push in reverse so the first neighbour is popped (visited) first,
      // which reproduces the order of a recursive depth-first search.
      const neighbors = Array.from(inputMap[char]);
      for (let i = neighbors.length - 1; i >= 0; i--) {
        if (!visitedChars.has(neighbors[i])) pending.push(neighbors[i]);
      }
    }

    res.push(island);
  }

  return res;
};

/**
 * Widens each island by one hop: every direct neighbour (according to
 * `allVariants`) of an island member is added to that island.
 *
 * Original members keep their order; newly added neighbours follow in the
 * order they are first encountered, and duplicates are dropped. Only the
 * original members are expanded, so neighbours of the newly added
 * characters are not pulled in. The input arrays are not modified.
 *
 * A member without an entry in `allVariants` contributes no neighbours
 * (previously this threw a TypeError).
 *
 * @param allVariants adjacency map: character -> set of related characters
 * @param islands     islands to expand
 * @returns a new array with one expanded island per input island, same order
 */
export const expandVariantIslands = (
  allVariants: { [key: string]: Set<string> },
  islands: string[][]
): string[][] =>
  islands.map((island) => {
    // A Set preserves insertion order and removes duplicates.
    const members = new Set(island);
    for (const character of island) {
      if (!Object.prototype.hasOwnProperty.call(allVariants, character)) {
        continue;
      }
      for (const neighbor of allVariants[character]) {
        members.add(neighbor);
      }
    }
    return Array.from(members);
  });

/**
 * Wraps a list of islands in a lookup structure: the islands themselves
 * plus a reverse index (`chars`) from each character to the indexes of the
 * islands that contain it.
 *
 * A character present in several islands gets one index per occurrence,
 * in ascending island order.
 *
 * @param islands list of islands, each an array of characters
 * @returns object holding the same `islands` array and the `chars` index
 */
export const islandsToObject = (islands: string[][]): VariantsIslandsLookup => {
  const lookup = { islands, chars: {} } as VariantsIslandsLookup;

  for (const [islandIndex, members] of islands.entries()) {
    for (const member of members) {
      if (member in lookup.chars) {
        lookup.chars[member].push(islandIndex);
      } else {
        // First sighting of this character: start its index list.
        lookup.chars[member] = [islandIndex];
      }
    }
  }

  return lookup;
};

const main = async () => {
/* generate a list of known variants first, e.g. whether a
character is a known radical, joyo kanji, simplified character, gukja, etc.*/
// const IRGSourcesString = (await getRawIRGSources()).split("\n");
// probably more efficient to assign a list to each character instead of doing an O(number of variants) lookup in the frontend for every single character...

const getOrthographicVariantsPerCharacterPromise = () =>
new Promise<{
[key: string]: Set<string>;
}>((resolve, reject) => {
const ivs = new IVS(() => {
resolve(getOrthographicVariantsPerCharacter(ivs));
});
});

const getAllVariantsPerCharacterPromise = () =>
new Promise<{
[key: string]: Set<string>;
}>((resolve, reject) => {
const ivs = new IVS(() => {
resolve(getAllVariantsPerCharacter(ivs));
});
});

const allVariants = await getAllVariantsPerCharacterPromise();
const orthoVariants = await getOrthographicVariantsPerCharacterPromise();

const islands = expandVariantIslands(
allVariants,
generateVariantIslands(orthoVariants)
);

const islandsLookup = islandsToObject(islands);
writeJSON(
JSON.stringify(islandsLookup),
JSON_FILE_NAMES.variantsIslandsLookup
);

const addJapaneseShinKyuPromise = (map: VariantsMap) =>
new Promise<void>((resolve, reject) => {
const ivs = new IVS(() => {
Expand Down
Loading

0 comments on commit 19d0d1b

Please sign in to comment.