
Commit

Add Babelstone Han PUA to the ETL process, add new CharacterVariant
Transfusion committed Feb 24, 2021
1 parent abe6911 commit a296945
Showing 5 changed files with 48 additions and 0 deletions.
5 changes: 5 additions & 0 deletions etl/constants.ts
@@ -33,3 +33,8 @@ export const CJKVI_VARIANTS_SUBDIR_NAME = "cjkvi-variants";
export const MANUAL_CJKVI_VARIANTS_SUBDIR_NAME = "manual-cjkvi-variants";

export const CJKVI_TABLES_SUBDIR_NAME = "cjkvi-tables";

export const BABELSTONE_PUA_JSON_NAME = "pua-latest.json";
export const BABELSTONE_PUA_JSON_URL =
  "https://babelstone-pua.transfusion.eu.org/pua-latest.json";
export const BABELSTONE_SUBDIR_NAME = "babel";
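
Note: the new constants name the BabelStone PUA JSON dump, its download URL, and the subdirectory used to cache it locally. The ./babelstone-pua-fetcher module imported by the three ETL scripts below is not among the changed files; as a rough sketch only (not the committed implementation), a fetcher that follows the same download-then-cache pattern as the other ETL fetchers could look like this, assuming an HTTP client such as axios is available:

// etl/babelstone-pua-fetcher.ts -- hypothetical sketch; the real module is not part of this diff.
// Downloads pua-latest.json once and caches it under the "babel" subdirectory.
import fs from "fs";
import path from "path";
import axios from "axios";
import {
  BABELSTONE_PUA_JSON_NAME,
  BABELSTONE_PUA_JSON_URL,
  BABELSTONE_SUBDIR_NAME,
} from "./constants";

export const getPUAData = async (): Promise<{ data: any[] }> => {
  const cachePath = path.join(__dirname, BABELSTONE_SUBDIR_NAME, BABELSTONE_PUA_JSON_NAME);
  if (!fs.existsSync(cachePath)) {
    const res = await axios.get(BABELSTONE_PUA_JSON_URL);
    fs.mkdirSync(path.dirname(cachePath), { recursive: true });
    fs.writeFileSync(cachePath, JSON.stringify(res.data));
  }
  return JSON.parse(fs.readFileSync(cachePath, "utf-8"));
};

The caching layout and error handling of the actual module may differ.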
8 changes: 8 additions & 0 deletions etl/genBaseForwardReverse.ts
@@ -24,6 +24,7 @@ import {
  writeJSON,
  // writeData, writeReverseMapProtobuf
} from "./writer";
import { getPUAData } from "./babelstone-pua-fetcher";
// import protobuf from "protobufjs";
// import { getRawIRGSources } from "./unihan-fetcher";

@@ -364,6 +365,13 @@ const main = async () => {
    resolvedIDSData = resolvedIDSData.concat(getAllResolvedIDSData(sourceFile));
  }

  const babelstonePUAData = await getPUAData();

  for (let pua of babelstonePUAData.data) {
    const entry = [pua.cp, pua.char, pua.ids];
    resolvedIDSData.push(entry);
  }

  const {
    baseRadicals,
    forwardMap,
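
Note: each PUA record is pushed onto resolvedIDSData as a [code point, character, IDS] triple, the same row shape produced by getAllResolvedIDSData for the regular IDS source files. The schema of pua-latest.json is not shown in this diff; judging only from the fields the commit reads (cp, char, ids here; note, src, src_refs, enc_stat in genReadings.ts), each element of babelstonePUAData.data presumably looks roughly like the assumed interface below:

// Assumed shape of one element of babelstonePUAData.data, inferred from field usage in this commit.
// Field names come from the diff; the types and comments are guesses.
interface BabelstonePUARecord {
  cp: string;       // code point, e.g. "U+E000"
  char: string;     // the PUA character itself
  ids: string;      // Ideographic Description Sequence for the glyph
  note: string;     // free-form annotation (may be an empty string)
  src: string;      // source identifier (may be an empty string)
  src_refs: string; // source references (may be an empty string)
  enc_stat: string; // encoding status (may be an empty string)
}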
17 changes: 17 additions & 0 deletions etl/genReadings.ts
@@ -2,6 +2,7 @@ import { getRawReadings } from "./unihan-fetcher";
import { writeJSON } from "./writer";

import { JSON_FILE_NAMES } from "../src/constants";
import { getPUAData } from "./babelstone-pua-fetcher";

const main = async () => {
const RawReadingsString = (await getRawReadings()).split("\n");
@@ -19,6 +20,22 @@ const main = async () => {
    map[char][field] = entry[2];
  }

  const babelstonePUAData = await getPUAData();
  for (let { char, note, src, src_refs, enc_stat } of babelstonePUAData.data) {
    map[char] = {
      note,
      src,
      src_refs,
      enc_stat,
    };

    for (let key of Object.keys(map[char])) {
      if (map[char][key].length === 0) {
        delete map[char][key];
      }
    }
  }

  // write to file
  writeJSON(JSON.stringify(map), JSON_FILE_NAMES.readings);
};
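
Note: every PUA character receives a small metadata object in the readings map, and any field that arrives as an empty string is deleted so the generated readings JSON stays compact. A minimal illustration of that pruning step, with invented field values:

// Illustration only (values invented): what the pruning loop above does to one record.
const map: { [char: string]: { [key: string]: string } } = {};
const record = { char: "\uE000", note: "example note", src: "", src_refs: "", enc_stat: "" };
map[record.char] = { note: record.note, src: record.src, src_refs: record.src_refs, enc_stat: record.enc_stat };
for (const key of Object.keys(map[record.char])) {
  if (map[record.char][key].length === 0) {
    delete map[record.char][key];
  }
}
// map["\uE000"] is now { note: "example note" }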
16 changes: 16 additions & 0 deletions etl/genVariants.ts
@@ -18,6 +18,7 @@ import {
} from "./variants-fetcher";

import { Hanja as everydayHanja1800 } from "./hanja-for-everyday-use-1800.json";
import { getPUAData } from "./babelstone-pua-fetcher";

const IVS = require("ivs");
const utfstring = require("utfstring");
@@ -221,6 +222,20 @@ const addRadicals = (map: VariantsSet) => {
}
};

const addBabelstoneHanPUA = async (map: VariantsSet) => {
  const createMapEntry = (char: string) => {
    if (!(char in map)) {
      map[char] = new Set<number>();
    }
  };

  const babelstonePUAData = await getPUAData();
  for (let pua of babelstonePUAData.data) {
    createMapEntry(pua.char);
    map[pua.char].add(CharacterVariant.unicode_pua);
  }
};

export const generateVariantIslands = (inputMap: {
  [key: string]: Set<string>;
}) => {
@@ -400,6 +415,7 @@ const main = async () => {
  addChinese(map);
  // jp-old-style.txt makes use of IVS characters which I strip
  await addJapaneseShinKyuPromise(map);
  await addBabelstoneHanPUA(map);

  const mapArr = {} as { [key: string]: number[] };
  for (let char in map) {
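
Note: addBabelstoneHanPUA reuses the map-of-Sets pattern of the other add* helpers: it ensures a Set exists for the character, then tags it with the new CharacterVariant.unicode_pua value. main() later flattens each Set into a plain array (mapArr) before writing JSON. The VariantsSet type itself is not part of this diff; from how the helpers use it, it is presumably something along the lines of:

// Assumed definition, inferred from usage in genVariants.ts; the committed type may differ.
type VariantsSet = { [char: string]: Set<number> };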
2 changes: 2 additions & 0 deletions src/types/common.ts
@@ -15,6 +15,8 @@ export enum CharacterVariant {
  sawndip_simplified = 12, // TODO
  sawndip = 13,
  radical = 14,

  unicode_pua = 15,
}

// nothing to do with any existing standards, self-defined
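
Note: the new unicode_pua = 15 member extends the self-defined CharacterVariant enum so that characters which exist only in the BabelStone Han PUA dump can be told apart from the other variant classes. A hypothetical consumer-side check (the helper name and import path are illustrative, not part of this commit):

// Hypothetical helper; "variants" would be the number[] stored for a character
// in the generated variants JSON.
import { CharacterVariant } from "./types/common"; // illustrative import path

const hasBabelstonePUAVariant = (variants: number[]): boolean =>
  variants.includes(CharacterVariant.unicode_pua);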
