Skip to content

Commit

Permalink
Merge pull request #775 from JaredReisinger/fix-doc-offset-start
Browse files: browse the repository at this point in the history
Fix doc/result offset start
  • Loading branch information
spencermountain authored Aug 10, 2020
2 parents 3fffa74 + 1df7d92 commit 2b90ad5
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 30 deletions.
19 changes: 10 additions & 9 deletions src/Doc/methods/output/_offset.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,16 @@ const calcOffset = function(doc, result, options) {
// return n
// }, 0)

// offset information for the entire doc starts at the first term, and
// is as long as the whole text (note that there may be an issue where
// leading punctuation is counted in the doc text length, but is
// *excluded* from the term[0] start position)
o.offset = Object.assign(
{},
o.terms[0].offset,
{ length: o.text.length }
)
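// The offset information for the entire doc starts at (or just before)
// the first term, and is as long as the whole text. `start` is moved back
// by the number of characters that precede the first term's text within
// o.text (for example, a leading quote mark), so that `start` and `length`
// describe the same span. The code originally copied the entire offset
// value from terms[0], but since we're now overriding two of the three
// fields, it's cleaner to just create an all-new object and not pretend
// it's "just" the same as terms[0].
// NOTE(review): indexOf returns -1 when terms[0].text does not occur in
// o.text, which would push `start` forward by one; confirm that cannot
// happen here.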
o.offset = {
index: o.terms[0].offset.index,
start: o.terms[0].offset.start - o.text.indexOf(o.terms[0].text),
length: o.text.length
}
})
}
}
Expand Down
40 changes: 19 additions & 21 deletions tests/output/offset.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ test('offset-punctuation', function(t) {
let doc = nlp(`one (two two) more `)
let m = doc.match('two two')
let obj = m.json({ offset: true, terms: false })[0]
t.equal(obj.offset.start, 5, '4 two-start')
t.equal(obj.offset.start, 4, '4 two-start')
t.equal(obj.offset.length, 9, '4 two-length')

doc = nlp(`0123, 678`)
Expand All @@ -43,14 +43,14 @@ test('offset-terms', function(t) {
let doc = nlp(`hello world`)
let obj = doc.json({ offset: true, terms: true })[0]

t.equal(obj.offset.start, 0, 'doc-start')
t.equal(obj.offset.length, 11, 'doc-length')
t.equal(obj.offset.start, 0, '6 doc-start')
t.equal(obj.offset.length, 11, '6 doc-length')

t.equal(obj.terms[0].offset.start, 0, 'term 0-start')
t.equal(obj.terms[0].offset.length, 5, 'term 0-length')
t.equal(obj.terms[0].offset.start, 0, '6 term 0-start')
t.equal(obj.terms[0].offset.length, 5, '6 term 0-length')

t.equal(obj.terms[1].offset.start, 6, 'term 0-start')
t.equal(obj.terms[1].offset.length, 5, 'term 0-length')
t.equal(obj.terms[1].offset.start, 6, '6 term 1-start')
t.equal(obj.terms[1].offset.length, 5, '6 term 1-length')

t.end()
})
Expand All @@ -59,14 +59,14 @@ test('offset-terms-whitespace', function(t) {
let doc = nlp(` hello world`)
let obj = doc.json({ offset: true, terms: true })[0]

t.equal(obj.offset.start, 1, 'doc-start')
t.equal(obj.offset.length, 11, 'doc-length')
t.equal(obj.offset.start, 1, '7 doc-start')
t.equal(obj.offset.length, 11, '7 doc-length')

t.equal(obj.terms[0].offset.start, 1, 'term 0-start')
t.equal(obj.terms[0].offset.length, 5, 'term 0-length')
t.equal(obj.terms[0].offset.start, 1, '7 term 0-start')
t.equal(obj.terms[0].offset.length, 5, '7 term 0-length')

t.equal(obj.terms[1].offset.start, 7, 'term 0-start')
t.equal(obj.terms[1].offset.length, 5, 'term 0-length')
t.equal(obj.terms[1].offset.start, 7, '7 term 1-start')
t.equal(obj.terms[1].offset.length, 5, '7 term 1-length')

t.end()
})
Expand All @@ -75,16 +75,14 @@ test('offset-terms-punctuation', function(t) {
let doc = nlp(`"hello world`)
let obj = doc.json({ offset: true, terms: true })[0]

// The doc-level offset should perhaps be 0->12 or 1->11... but 1->12 is not
// really sane. This test will need to change if/when that gets figured out.
t.equal(obj.offset.start, 1, 'doc-start') // <==== arguably wrong!
t.equal(obj.offset.length, 12, 'doc-length') // <==== arguably wrong!
t.equal(obj.offset.start, 0, '8 doc-start')
t.equal(obj.offset.length, 12, '8 doc-length')

t.equal(obj.terms[0].offset.start, 1, 'term 0-start')
t.equal(obj.terms[0].offset.length, 5, 'term 0-length')
t.equal(obj.terms[0].offset.start, 1, '8 term 0-start')
t.equal(obj.terms[0].offset.length, 5, '8 term 0-length')

t.equal(obj.terms[1].offset.start, 7, 'term 0-start')
t.equal(obj.terms[1].offset.length, 5, 'term 0-length')
t.equal(obj.terms[1].offset.start, 7, '8 term 1-start')
t.equal(obj.terms[1].offset.length, 5, '8 term 1-length')

t.end()
})

0 comments on commit 2b90ad5

Please sign in to comment.