diff --git a/scratch.js b/scratch.js index 9611930ef..a5264000a 100644 --- a/scratch.js +++ b/scratch.js @@ -5,7 +5,7 @@ var nlp = require('./src/index'); // nlp.verbose('tagger'); // const corpus = require('nlp-corpus'); // let sotu = corpus.sotu.parsed()[23]; -const fresh = require('./test/unit/lib/freshPrince.js'); +// const fresh = require('./test/unit/lib/freshPrince.js'); // bug.1 // .? vs * @@ -31,5 +31,6 @@ const fresh = require('./test/unit/lib/freshPrince.js'); // r.tag('#Person'); // console.timeEnd('tag'); -let r = nlp('work with F.B.I.').nouns(); -r.debug(); +let r = nlp('the F.B.I.'); +// console.log(r.list[0].terms[1].normal); +console.log(r.out('normal')); diff --git a/src/term/index.js b/src/term/index.js index ba3e85c7b..45dbf301e 100644 --- a/src/term/index.js +++ b/src/term/index.js @@ -2,6 +2,9 @@ const fns = require('./paths').fns; const build_whitespace = require('./whitespace'); const makeUID = require('./makeUID'); +//normalization +const addNormal = require('./methods/normalize/normalize').addNormal; +const addRoot = require('./methods/normalize/root'); const Term = function(str) { this._text = fns.ensureString(str); @@ -10,12 +13,13 @@ const Term = function(str) { let parsed = build_whitespace(this._text); this.whitespace = parsed.whitespace; this._text = parsed.text; - // console.log(this.whitespace, this._text); this.parent = null; this.silent_term = ''; + //normalize the _text + addNormal(this); + addRoot(this); //has this term been modified this.dirty = false; - this.normalize(); //make a unique id for this term this.uid = makeUID(this.normal); @@ -42,6 +46,12 @@ const Term = function(str) { }); }; +//run each time a new text is set +Term.prototype.normalize = function() { + addNormal(this); + addRoot(this); + return this; +}; /** where in the sentence is it? zero-based. */ Term.prototype.index = function() { @@ -60,7 +70,7 @@ Term.prototype.clone = function() { return term; }; -require('./methods/normalize')(Term); +// require('./methods/normalize')(Term); require('./methods/misc')(Term); require('./methods/out')(Term); require('./methods/tag')(Term); diff --git a/src/term/methods/misc.js b/src/term/methods/misc.js index 482f6eea4..5196debdb 100644 --- a/src/term/methods/misc.js +++ b/src/term/methods/misc.js @@ -1,10 +1,9 @@ 'use strict'; const bestTag = require('./bestTag'); +const isAcronym = require('./normalize/isAcronym'); + //regs- -const periodAcronym = /([A-Z]\.)+[A-Z]?$/; -const oneLetterAcronym = /^[A-Z]\.$/; -const noPeriodAcronym = /[A-Z]{3}$/; const hasVowel = /[aeiouy]/i; const hasLetter = /[a-z]/; const hasNumber = /[0-9]/; @@ -17,24 +16,10 @@ const addMethods = (Term) => { bestTag: function () { return bestTag(this); }, - - /** does it appear to be an acronym, like FBI or M.L.B. */ + /** is this term like F.B.I. or NBA */ isAcronym: function () { - //like N.D.A - if (periodAcronym.test(this.text) === true) { - return true; - } - //like 'F.' - if (oneLetterAcronym.test(this.text) === true) { - return true; - } - //like NDA - if (noPeriodAcronym.test(this.text) === true) { - return true; - } - return false; + return isAcronym(this._text); }, - /** check if it is word-like in english */ isWord: function () { let t = this; diff --git a/src/term/methods/normalize/index.js b/src/term/methods/normalize/index.js deleted file mode 100644 index f0f1344a6..000000000 --- a/src/term/methods/normalize/index.js +++ /dev/null @@ -1,21 +0,0 @@ -'use strict'; -const addNormal = require('./normalize').addNormal; -const addRoot = require('./root'); - -const addMethods = (Term) => { - - const methods = { - normalize: function () { - addNormal(this); - addRoot(this); - return this; - }, - }; - //hook them into result.proto - Object.keys(methods).forEach((k) => { - Term.prototype[k] = methods[k]; - }); - return Term; -}; - -module.exports = addMethods; diff --git a/src/term/methods/normalize/isAcronym.js b/src/term/methods/normalize/isAcronym.js new file mode 100644 index 000000000..8d4c412af --- /dev/null +++ b/src/term/methods/normalize/isAcronym.js @@ -0,0 +1,23 @@ +'use strict'; +//regs - +const periodAcronym = /([A-Z]\.)+[A-Z]?$/; +const oneLetterAcronym = /^[A-Z]\.$/; +const noPeriodAcronym = /[A-Z]{3}$/; + +/** does it appear to be an acronym, like FBI or M.L.B. */ +const isAcronym = function (str) { + //like N.D.A + if (periodAcronym.test(str) === true) { + return true; + } + //like 'F.' + if (oneLetterAcronym.test(str) === true) { + return true; + } + //like NDA + if (noPeriodAcronym.test(str) === true) { + return true; + } + return false; +}; +module.exports = isAcronym; diff --git a/src/term/methods/normalize/normalize.js b/src/term/methods/normalize/normalize.js index f18340914..a861113df 100644 --- a/src/term/methods/normalize/normalize.js +++ b/src/term/methods/normalize/normalize.js @@ -1,5 +1,7 @@ 'use strict'; const killUnicode = require('./unicode'); +const isAcronym = require('./isAcronym'); + //some basic operations on a string to reduce noise exports.normalize = function(str) { @@ -32,7 +34,7 @@ exports.addNormal = function (term) { let str = term._text || ''; str = exports.normalize(str); //compact acronyms - if (term.isAcronym()) { + if (isAcronym(term._text)) { str = str.replace(/\./g, ''); } //nice-numbers