diff --git a/extension/character_info.ts b/extension/character_info.ts new file mode 100644 index 000000000..40b48e23b --- /dev/null +++ b/extension/character_info.ts @@ -0,0 +1,186 @@ +export const kanaToHiraganaNormalizationMap: Record = { + ァ: 'ぁ', + ア: 'あ', + ィ: 'ぃ', + イ: 'い', + ゥ: 'ぅ', + ウ: 'う', + ェ: 'ぇ', + エ: 'え', + ォ: 'ぉ', + オ: 'お', + カ: 'か', + ガ: 'が', + キ: 'き', + ギ: 'ぎ', + ク: 'く', + グ: 'ぐ', + ケ: 'け', + ゲ: 'げ', + コ: 'こ', + ゴ: 'ご', + サ: 'さ', + ザ: 'ざ', + シ: 'し', + ジ: 'じ', + ス: 'す', + ズ: 'ず', + セ: 'せ', + ゼ: 'ぜ', + ソ: 'そ', + ゾ: 'ぞ', + タ: 'た', + ダ: 'だ', + チ: 'ち', + ヂ: 'ぢ', + ッ: 'っ', + ツ: 'つ', + ヅ: 'づ', + テ: 'て', + デ: 'で', + ト: 'と', + ド: 'ど', + ナ: 'な', + ニ: 'に', + ヌ: 'ぬ', + ネ: 'ね', + ノ: 'の', + ハ: 'は', + バ: 'ば', + パ: 'ぱ', + ヒ: 'ひ', + ビ: 'び', + ピ: 'ぴ', + フ: 'ふ', + ブ: 'ぶ', + プ: 'ぷ', + ヘ: 'へ', + ベ: 'べ', + ペ: 'ぺ', + ホ: 'ほ', + ボ: 'ぼ', + ポ: 'ぽ', + マ: 'ま', + ミ: 'み', + ム: 'む', + メ: 'め', + モ: 'も', + ャ: 'ゃ', + ヤ: 'や', + ュ: 'ゅ', + ユ: 'ゆ', + ョ: 'ょ', + ヨ: 'よ', + ラ: 'ら', + リ: 'り', + ル: 'る', + レ: 'れ', + ロ: 'ろ', + ヮ: 'ゎ', + ワ: 'わ', + ヲ: 'を', + ン: 'ん', + ヴ: 'ゔ', + ァ: 'ぁ', + ア: 'あ', + ィ: 'ぃ', + イ: 'い', + ゥ: 'ぅ', + ウ: 'う', + ェ: 'ぇ', + エ: 'え', + ォ: 'ぉ', + オ: 'お', + カ: 'か', + ガ: 'が', + キ: 'き', + ギ: 'ぎ', + ク: 'く', + グ: 'ぐ', + ケ: 'け', + ゲ: 'げ', + コ: 'こ', + ゴ: 'ご', + サ: 'さ', + ザ: 'ざ', + シ: 'し', + ジ: 'じ', + ス: 'す', + ズ: 'ず', + セ: 'せ', + ゼ: 'ぜ', + ソ: 'そ', + ゾ: 'ぞ', + タ: 'た', + ダ: 'だ', + チ: 'ち', + ヂ: 'ぢ', + ッ: 'っ', + ツ: 'つ', + ヅ: 'づ', + テ: 'て', + デ: 'で', + ト: 'と', + ド: 'ど', + ナ: 'な', + ニ: 'に', + ヌ: 'ぬ', + ネ: 'ね', + ノ: 'の', + ハ: 'は', + バ: 'ば', + パ: 'ぱ', + ヒ: 'ひ', + ビ: 'び', + ピ: 'ぴ', + フ: 'ふ', + ブ: 'ぶ', + プ: 'ぷ', + ヘ: 'へ', + ベ: 'べ', + ペ: 'ぺ', + ホ: 'ほ', + ボ: 'ぼ', + ポ: 'ぽ', + マ: 'ま', + ミ: 'み', + ム: 'む', + メ: 'め', + モ: 'も', + ャ: 'ゃ', + ヤ: 'や', + ュ: 'ゅ', + ユ: 'ゆ', + ョ: 'ょ', + ヨ: 'よ', + ラ: 'ら', + リ: 'り', + ル: 'る', + レ: 'れ', + ロ: 'ろ', + ワ: 'わ', + ヲ: 'を', + ン: 'ん', + // ヴ: 'ゔ', +}; + +export enum KANA { + HIRAGANA_START = 0x3041, + HIRAGANA_END = 0x309f, + KATAKANA_START = 0x30a0, + KATAKANA_END = 0x30ff, + HW_KATAKANA_START = 0xff66, + HW_KATAKANA_END = 0xff9d, +} + +export enum PUNCTUATION { + //characters below are in order for easy copy + //Note:3rd character is invisible but copyable + //Note:4th and 5th look the same but are actually different. + // ゙, ゚, ‌, ~, 〜 + VOICED_MARK = 0xff9e, + SEMI_VOICED_MARK = 0xff9f, + ZERO_WIDTH_JOINER = 0x200c, + J_TILDE = 0xff5e, + HW_J_TILDE = 0x301c, +} diff --git a/extension/data.ts b/extension/data.ts index 8460c97ec..b1d8e8cd6 100644 --- a/extension/data.ts +++ b/extension/data.ts @@ -42,29 +42,36 @@ /** Exposes abstraction over dictionary files allowing searches and lookups. */ import { Config } from './configuration'; - -// Be careful of using directly due to object keys. -const defaultDictEntryData = { - kanji: '', - onkun: '', - nanori: '', - bushumei: '', - misc: {} as Record, - eigo: '', - hasNames: false, - data: [] as { entry: string; reason: string | undefined }[], - hasMore: false, - title: '', - index: 0, - matchLen: 0, -}; -type DictEntryData = typeof defaultDictEntryData; +import { + KANA, + PUNCTUATION, + kanaToHiraganaNormalizationMap, +} from './character_info'; interface Deinflection { word: string; type: number; reason: string; } +type DictData = { + entry: string; + reason?: string; +}; + +type DictEntryData = { + kanji: string; + onkun: string; + nanori: string; + bushumei: string; + misc: Record; + eigo: string; + hasNames: boolean; + data: DictData[]; + hasMore: boolean; + title: string; + index: number; + matchLen: number; +}; interface DeinflectionRule { /** The conjugated ending which we are deinflecting from. */ @@ -86,6 +93,22 @@ interface DeinflectionRuleGroup { rules: DeinflectionRule[]; } +// Be careful of using directly due to object keys. +const defaultDictEntryData: DictEntryData = { + kanji: '', + onkun: '', + nanori: '', + bushumei: '', + misc: {}, + eigo: '', + hasNames: false, + data: [], + hasMore: false, + title: '', + index: 0, + matchLen: 0, +}; + class RcxDict { private static instance: RcxDict; @@ -143,8 +166,8 @@ class RcxDict { async fileReadAsyncAsArray(url: string): Promise { const file = await this.fileReadAsync(url); - return file.split('\n').filter((o) => { - return o && o.length > 0; + return file.split('\n').filter((line) => { + return line && line.length > 0; }); } @@ -156,14 +179,17 @@ class RcxDict { } fileReadArray(name: string) { - const a = this.fileRead(name).split('\n'); + const fileLines = this.fileRead(name).split('\n'); // Is this just in case there is blank shit in the file. It was written // by Jon though. // I suppose this is more robust - while (a.length > 0 && a[a.length - 1].length === 0) { - a.pop(); + while ( + fileLines.length > 0 && + fileLines[fileLines.length - 1].length === 0 + ) { + fileLines.pop(); } - return a; + return fileLines; } loadNames() { @@ -199,69 +225,61 @@ class RcxDict { // i = 1: skip header for (let i = 1; i < buffer.length; ++i) { const ruleOrReason = buffer[i].split('\t'); - if (ruleOrReason.length === 1) { this.difReasons.push(ruleOrReason[0]); } else if (ruleOrReason.length === 4) { - const o: DeinflectionRule = { + const deinflectionRule: DeinflectionRule = { from: ruleOrReason[0], to: ruleOrReason[1], typeMask: parseInt(ruleOrReason[2]), reasonIndex: parseInt(ruleOrReason[3]), }; - - if (currentLength !== o.from.length) { - currentLength = o.from.length; + if (currentLength !== deinflectionRule.from.length) { + currentLength = deinflectionRule.from.length; group = { fromLength: currentLength, rules: [] }; this.difRules.push(group); } - group.rules.push(o); + group.rules.push(deinflectionRule); } } } find(data: string, text: string): string | null { - const tlen = text.length; - let beg = 0; + const textLength = text.length; + let beginning = 0; let end = data.length - 1; - let i; - let mi; - let mis; - - while (beg < end) { - mi = (beg + end) >> 1; - i = data.lastIndexOf('\n', mi) + 1; - - mis = data.substr(i, tlen); + while (beginning < end) { + const middle = (beginning + end) / 2; + const i = data.lastIndexOf('\n', middle) + 1; + const mis = data.substr(i, textLength); if (text < mis) { end = i - 1; } else if (text > mis) { - beg = data.indexOf('\n', mi + 1) + 1; + beginning = data.indexOf('\n', middle + 1) + 1; } else { - return data.substring(i, data.indexOf('\n', mi + 1)); + return data.substring(i, data.indexOf('\n', middle + 1)); } } return null; } - deinflect(word: string) { - const r = []; - const have: { [key: string]: number } = {}; - let o; - - o = { word: word, type: 0xff, reason: '' } as Deinflection; - r.push(o); + deinflect(word: string): Deinflection[] { + const possibleDeinflections: Deinflection[] = [ + { word, type: 0xff, reason: '' }, + ]; + const have: Record = {}; have[word] = 0; let i; let j; let k; + let o; i = 0; do { - word = r[i].word; + word = possibleDeinflections[i].word; const wordLen = word.length; - const type = r[i].type; + const type = possibleDeinflections[i].type; for (j = 0; j < this.difRules.length; ++j) { const g = this.difRules[j]; @@ -277,112 +295,96 @@ class RcxDict { } o = { word: word, type: 0xff, reason: '' } as Deinflection; if (have[newWord] !== undefined) { - o = r[have[newWord]]; + o = possibleDeinflections[have[newWord]]; o.type |= rule.typeMask >> 8; continue; } - have[newWord] = r.length; - if (r[i].reason.length) { + have[newWord] = possibleDeinflections.length; + if (possibleDeinflections[i].reason.length) { o.reason = - this.difReasons[rule.reasonIndex] + ' < ' + r[i].reason; + this.difReasons[rule.reasonIndex] + + ' < ' + + possibleDeinflections[i].reason; } else { o.reason = this.difReasons[rule.reasonIndex]; } o.type = rule.typeMask >> 8; o.word = newWord; - r.push(o); + possibleDeinflections.push(o); } } } } - } while (++i < r.length); + } while (++i < possibleDeinflections.length); - return r; + return possibleDeinflections; } - // katakana -> hiragana conversion tables - ch: number[] = [ - 0x3092, 0x3041, 0x3043, 0x3045, 0x3047, 0x3049, 0x3083, 0x3085, 0x3087, - 0x3063, 0x30fc, 0x3042, 0x3044, 0x3046, 0x3048, 0x304a, 0x304b, 0x304d, - 0x304f, 0x3051, 0x3053, 0x3055, 0x3057, 0x3059, 0x305b, 0x305d, 0x305f, - 0x3061, 0x3064, 0x3066, 0x3068, 0x306a, 0x306b, 0x306c, 0x306d, 0x306e, - 0x306f, 0x3072, 0x3075, 0x3078, 0x307b, 0x307e, 0x307f, 0x3080, 0x3081, - 0x3082, 0x3084, 0x3086, 0x3088, 0x3089, 0x308a, 0x308b, 0x308c, 0x308d, - 0x308f, 0x3093, - ]; + isKana(charCode: number): boolean { + return ( + (charCode >= KANA.HIRAGANA_START && charCode <= KANA.HIRAGANA_END) || + (charCode >= KANA.KATAKANA_START && charCode <= KANA.KATAKANA_END) || + (charCode >= KANA.HW_KATAKANA_START && charCode <= KANA.HW_KATAKANA_END) + ); + } + /** + * Returns the input string converted into hiragana. If any characters are not + * found in the [NormalizationMap](./character_info.ts) then the character + * will be returned as is. + * + * @param kanaWord - A string of kana characters. + * @returns The conversion of kanaWord into hiragana. + */ + convertToHiragana(kanaWord: string): string { + let result = ''; + + for (let i = 0; i < kanaWord.length; i++) { + const currentChar = kanaWord.charAt(i); + const currentCharCode = currentChar.charCodeAt(0); + let isSemiVoiced = false; + let isVoiced = false; + const isHalfWidthKatakana = + currentCharCode >= KANA.HW_KATAKANA_START && + currentCharCode <= KANA.HW_KATAKANA_END; + let key = ''; + if (currentCharCode < 0x3000) { + break; + } - cv: number[] = [ - 0x30f4, 0xff74, 0xff75, 0x304c, 0x304e, 0x3050, 0x3052, 0x3054, 0x3056, - 0x3058, 0x305a, 0x305c, 0x305e, 0x3060, 0x3062, 0x3065, 0x3067, 0x3069, - 0xff85, 0xff86, 0xff87, 0xff88, 0xff89, 0x3070, 0x3073, 0x3076, 0x3079, - 0x307c, - ]; - cs: number[] = [0x3071, 0x3074, 0x3077, 0x307a, 0x307d]; + if (isHalfWidthKatakana) { + const nextChar = kanaWord.charAt(i + 1); + const nextCharCode = nextChar?.charCodeAt(0); + isSemiVoiced = nextCharCode === PUNCTUATION.SEMI_VOICED_MARK; + isVoiced = nextCharCode === PUNCTUATION.VOICED_MARK; + key = isSemiVoiced || isVoiced ? currentChar + nextChar : currentChar; + } else { + key = currentChar; + } + + const hiragana = kanaToHiraganaNormalizationMap[key]; + result += hiragana !== undefined ? hiragana : currentChar; + + if (isSemiVoiced || isVoiced) { + i++; + } + } + return result; + } + + normalize(str: string): string { + return str.replace(/[\u200C\u301C\uFF5E]+/g, ''); + } wordSearch( word: string, doNames: boolean, max?: number ): DictEntryData | null { - let i; - let u; - let v; - let reason: string; - let p; - const trueLen = [0]; const entry = RcxDict.createDefaultDictEntry(); - - // half & full-width katakana to hiragana conversion - // note: katakana vu is never converted to hiragana - - p = 0; - reason = ''; - for (i = 0; i < word.length; ++i) { - u = v = word.charCodeAt(i); - - // Skip Zero-width non-joiner used in Google Docs between every - // character. - if (u === 8204) { - p = 0; - continue; - } - - if (u <= 0x3000) { - break; - } - - // full-width katakana to hiragana - if (u >= 0x30a1 && u <= 0x30f3) { - u -= 0x60; - } else if (u >= 0xff66 && u <= 0xff9d) { - // half-width katakana to hiragana - u = this.ch[u - 0xff66]; - } else if (u === 0xff9e) { - // voiced (used in half-width katakana) to hiragana - if (p >= 0xff73 && p <= 0xff8e) { - reason = reason.substr(0, reason.length - 1); - u = this.cv[p - 0xff73]; - } - } else if (u === 0xff9f) { - // semi-voiced (used in half-width katakana) to hiragana - if (p >= 0xff8a && p <= 0xff8e) { - reason = reason.substr(0, reason.length - 1); - u = this.cs[p - 0xff8a]; - } - } else if (u === 0xff5e) { - // ignore J~ - p = 0; - continue; - } - - reason += String.fromCharCode(u); - // need to keep real length because of the half-width semi/voiced - // conversion - trueLen[reason.length] = i + 1; - p = v; - } - word = reason; + const normalizedWord = this.normalize(word); + const newConvertedWord = this.convertToHiragana(normalizedWord); + word = newConvertedWord; let dict: string; let index; @@ -402,7 +404,6 @@ class RcxDict { index = this.nameIndex as string; maxTrim = 20; // this.config.namax; entry.hasNames = true; - console.log('doNames'); } else { dict = this.wordDict; index = this.wordIndex; @@ -417,22 +418,22 @@ class RcxDict { while (word.length > 0) { const showInf = count !== 0; - let trys; + let possibleDeinflections: Deinflection[]; if (doNames) { - trys = [{ word: word, type: 0xff, reason: null }]; + possibleDeinflections = [{ word: word, type: 0xff, reason: '' }]; } else { - trys = this.deinflect(word); + possibleDeinflections = this.deinflect(word); } - for (i = 0; i < trys.length; i++) { - u = trys[i]; + for (let i = 0; i < possibleDeinflections.length; i++) { + const currentDeinflection = possibleDeinflections[i]; - let ix = cache[u.word]; + let ix = cache[currentDeinflection.word]; if (!ix) { - const result = this.find(index, u.word + ','); + const result = this.find(index, currentDeinflection.word + ','); if (!result) { - cache[u.word] = []; + cache[currentDeinflection.word] = []; continue; } // The first value in result is the word itself so skip it @@ -441,7 +442,7 @@ class RcxDict { .split(',') .slice(1) .map((offset) => parseInt(offset)); - cache[u.word] = ix; + cache[currentDeinflection.word] = ix; } for (let j = 0; j < ix.length; ++j) { @@ -451,7 +452,6 @@ class RcxDict { } const dentry = dict.substring(ofs, dict.indexOf('\n', ofs)); - let ok = true; if (i > 0) { // > 0 a de-inflected word @@ -465,7 +465,7 @@ class RcxDict { let w; const x = dentry.split(/[,()]/); - const y = u.type; + const y = currentDeinflection.type; let z = x.length - 1; if (z > 10) { z = 10; @@ -501,15 +501,16 @@ class RcxDict { have[ofs] = 1; ++count; if (maxLen === 0) { - maxLen = trueLen[word.length]; + maxLen = word.length; } let reason: string | undefined; - if (trys[i].reason) { + if (possibleDeinflections[i].reason) { if (showInf) { - reason = '< ' + trys[i].reason + ' < ' + word; + reason = + '< ' + possibleDeinflections[i].reason + ' < ' + word; } else { - reason = '< ' + trys[i].reason; + reason = '< ' + possibleDeinflections[i].reason; } } @@ -529,7 +530,6 @@ class RcxDict { if (entry.data.length === 0) { return null; } - entry.matchLen = maxLen; return entry; } @@ -566,10 +566,9 @@ class RcxDict { kanjiSearch(kanji: string): DictEntryData | null { const hex = '0123456789ABCDEF'; - let i; - i = kanji.charCodeAt(0); - if (i < 0x3000) { + let kanjiCharCode = kanji.charCodeAt(0); + if (kanjiCharCode < 0x3000) { return null; } @@ -588,14 +587,14 @@ class RcxDict { entry.misc = {}; entry.misc.U = - hex[(i >>> 12) & 15] + - hex[(i >>> 8) & 15] + - hex[(i >>> 4) & 15] + - hex[i & 15]; + hex[(kanjiCharCode >>> 12) & 15] + + hex[(kanjiCharCode >>> 8) & 15] + + hex[(kanjiCharCode >>> 4) & 15] + + hex[kanjiCharCode & 15]; const b = a[1].split(' '); - for (i = 0; i < b.length; ++i) { - if (b[i].match(/^([A-Z]+)(.*)/)) { + for (kanjiCharCode = 0; kanjiCharCode < b.length; ++kanjiCharCode) { + if (b[kanjiCharCode].match(/^([A-Z]+)(.*)/)) { if (!entry.misc[RegExp.$1]) { entry.misc[RegExp.$1] = RegExp.$2; } else { diff --git a/extension/test/data_test.ts b/extension/test/data_test.ts index 4d512b786..6ef28f444 100644 --- a/extension/test/data_test.ts +++ b/extension/test/data_test.ts @@ -1,6 +1,7 @@ import { Config } from '../configuration'; -import { RcxDict } from '../data'; +import { DictEntryData, RcxDict } from '../data'; import { expect, use } from '@esm-bundle/chai'; +import Sinon from 'sinon'; import chaiLike from 'chai-like'; import chaiThings from 'chai-things'; import sinonChrome from 'sinon-chrome'; @@ -37,6 +38,8 @@ describe('data.ts', function () { it('should include deinflections of length one or more', function () { expect(rcxDict.deinflect('です')).to.include.something.like({ word: 'だ', + reason: 'polite', + type: 32, }); }); @@ -63,4 +66,250 @@ describe('data.ts', function () { ).to.not.include.something.like({ entry: /^凡打 .*/ }); }); }); + + describe('kanjiSearch', function () { + it('should return null for kanji with char code < 0x3000', function () { + const result = rcxDict.kanjiSearch('A'); + + expect(result).to.be.null; + }); + + it('should return null if kanjiData entry is not properly formatted', function () { + const result = rcxDict.kanjiSearch('子9'); + + expect(result).to.be.null; + }); + + it('should return a DictEntryData object for valid kanji', function () { + const result: DictEntryData | null = rcxDict.kanjiSearch('日'); + + expect(result).to.deep.include({ + kanji: '日', + eigo: 'day, sun, Japan, counter for days', + }); + }); + + it('should set kanji property of DictEntryData object', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.kanji).to.equal('日'); + }); + + it('should set misc -> U to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + U: '65E5', + }); + }); + it('should set misc -> B to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + B: '72', + }); + }); + it('should set misc -> G to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + G: '1', + }); + }); + it('should set misc -> S to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + S: '4', + }); + }); + it('should set misc -> F to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + F: '1', + }); + }); + it('should set misc -> N to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + N: '2097', + }); + }); + it('should set misc -> V to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + V: '2410', + }); + }); + it('should set misc -> H to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + H: '3027', + }); + }); + it('should set misc -> DK to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + DK: '1915', + }); + }); + it('should set misc -> L to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + L: '12 day', + }); + }); + it('should set misc -> DN to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + DN: '12 day', + }); + }); + it('should set misc -> E to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + E: '62', + }); + }); + it('should set misc -> IN to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + IN: '5', + }); + }); + it('should set misc -> P to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + P: '3-3-1', + }); + }); + it('should set misc -> I to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + I: '4c0.1', + }); + }); + it('should set misc -> Y to correct value', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.misc).to.deep.include({ + Y: 'ri4', + }); + }); + + it('should set onkun property of DictEntryData object', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.onkun).to.contain('ニチ、 ジツ、 ひ、 -び、 -か'); + }); + + it('should set nanori property of DictEntryData object', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.nanori).to.contain( + 'あ、 あき、 いる、 く、 くさ、 こう、 す、 たち、 に、 にっ、 につ、 へ' + ); + }); + + it('should set bushumei property of DictEntryData object to empty string if null', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.bushumei).to.equal(''); + }); + + it('should set eigo property of DictEntryData object', function () { + const result = rcxDict.kanjiSearch('日'); + + expect(result?.eigo).to.contain('day, sun, Japan'); + }); + }); + + describe('(fileRead)', function () { + it('should take a valid file url and return a response', function () { + const url = 'data/dict.dat'; + + const result = rcxDict.fileRead(url); + + expect(result).to.be.a('string'); + }); + }); + + describe('(fileReadArray)', function () { + it('should call fileRead and return an array of data', function () { + const fileName = 'data/dict.dat'; + + const result = rcxDict.fileReadArray(fileName); + + expect(result).to.be.an('array'); + expect(result).to.have.lengthOf.at.least(1); + }); + it('should remove any whitespace or newlines from end of array', function () { + const stub = Sinon.stub(rcxDict, 'fileReadArray').returns([ + 'testing', + 'is', + 'fun \n', + ]); + + const results: string[] = rcxDict.fileReadArray('someFile/here'); + + expect(results).to.be.an('array'); + expect(results[results.length - 1].endsWith(' ')).to.be.false; + Sinon.assert.calledOnce(stub); + }); + }); + + describe('(loadNames)', function () { + it('should load name dictionary information if not loaded already', function () { + rcxDict.loadNames(); + expect(rcxDict.nameDict).to.exist; + expect(rcxDict.nameIndex).to.exist; + }); + }); + + describe('isKana', function () { + it('should return true if charCode is a kana character', function () { + const kanaCode = 'て'.charCodeAt(0); + expect(rcxDict.isKana(kanaCode)).to.be.true; + }); + it('should return false if non kana character is searched', function () { + const nonKanaCode = 'Test'.charCodeAt(0); + expect(rcxDict.isKana(nonKanaCode)).to.be.false; + }); + }); + + describe('convertToHiragana', function () { + it('should return hiragana character when passed half-width katakana', function () { + const kanaCode = 'テスト'.charCodeAt(0); + expect(rcxDict.isKana(kanaCode)).to.be.true; + }); + it('should return hiragana character when passed full-width katakana', function () { + const nonKanaCode = 'Test'.charCodeAt(0); + expect(rcxDict.isKana(nonKanaCode)).to.be.false; + }); + it('should do correct hiragana lookup when parsing voiced and semi-voiced half-width katakana', function () { + // const expected = 'デッスクトップ' + const hwInput = 'ポジティブ'; + expect(rcxDict.convertToHiragana(hwInput)).to.equal('ぽじてぃぶ'); + }); + }); + describe('normalize', function () { + it('should strip any spaces, tabs, new lines or zero-width-joiner', function () { + const expected = 'てすと'; + + const result = rcxDict.normalize('てす‌‌と'); // also contains ZWJ + + expect(result).to.equal(expected); + }); + }); });