Improve tagger: Return iterators over WordData, remove groups, parallelize deserialization (#70)

* use vec instead of hashmap internally in tagger

* parallelize tagger deserialization

* return iterators in tagger

* chore: improve benches a bit (#71)

* remove redundant vectors in chunker

* cleanup wordidmap

* update changelog

Co-authored-by: Bernhard Schuster <[email protected]>
bminixhofer and drahnr authored Apr 24, 2021
1 parent 26e1983 commit 602aaf1
Showing 10 changed files with 417 additions and 241 deletions.
18 changes: 18 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,21 @@
+# 0.6.4
+
+## Internal improvements
+
+- Decrease time it takes to load the `Tokenizer` by ~ 40% (#70).
+- Tag lookup is backed by a vector instead of a hashmap now.
+
+## Breaking changes
+
+- The tagger now returns iterators over tags instead of allocating a vector.
+- Remove `get_group_members` function.
+
+# 0.6.3
+
+## Fixes
+
+- Fix a bug where calling `Rule::suggest` in parallel across threads would cause a panic (#68, thanks @drahnr!)
+
 # 0.6.2
 
 ## Internal improvements
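For downstream users, the iterator change is mostly mechanical. A minimal migration sketch (not part of this commit), assuming `Tokenizer::tagger()` exposes the tagger and `Tagger::get_tags` is the entry point; the binary path and the POS-prefix check are illustrative:

```rust
use nlprule::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error>> {
    let tokenizer = Tokenizer::new("storage/en_tokenizer.bin")?;
    let tagger = tokenizer.tagger();

    // Before (0.6.3): `get_tags` allocated a `Vec`, so callers went through `.iter()`:
    //     let tags = tagger.get_tags("walked");
    //     let is_verb = tags.iter().any(|t| t.pos().as_str().starts_with("VB"));

    // After (0.6.4): `get_tags` returns an iterator, so adapters like `any`
    // apply directly; `.collect::<Vec<_>>()` recovers the old owned form.
    let is_verb = tagger
        .get_tags("walked")
        .any(|t| t.pos().as_str().starts_with("VB"));
    println!("'walked' has a verb reading: {}", is_verb);

    Ok(())
}
```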
22 changes: 19 additions & 3 deletions nlprule/benches/load.rs
@@ -1,15 +1,31 @@
 use criterion::{black_box, criterion_group, criterion_main, Criterion};
 use nlprule::{Rules, Tokenizer};
+use std::time::Duration;
 
-fn criterion_benchmark(c: &mut Criterion) {
+fn parse_tokenizer(c: &mut Criterion) {
     c.bench_function("load tokenizer", |b| {
         b.iter(|| Tokenizer::new(black_box("../storage/en_tokenizer.bin")).unwrap())
     });
+}
 
+fn parse_rules(c: &mut Criterion) {
     c.bench_function("load rules", |b| {
         b.iter(|| Rules::new(black_box("../storage/en_rules.bin")).unwrap())
     });
 }
 
-criterion_group!(benches, criterion_benchmark);
-criterion_main!(benches);
+fn no_warmup_criterion() -> Criterion {
+    Criterion::default()
+        .sample_size(20)
+        .warm_up_time(Duration::from_nanos(1))
+}
+
+criterion_group!(
+    name = parse;
+    config = no_warmup_criterion();
+    targets =
+        parse_rules,
+        parse_tokenizer,
+);
+
+criterion_main!(parse);
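A note on the bench config: each iteration deserializes a full binary from disk, so warming up adds little information here; the near-zero `warm_up_time` and the reduced sample size of 20 presumably just keep total bench time manageable for these expensive load benchmarks.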
60 changes: 33 additions & 27 deletions nlprule/src/compile/impls.rs
@@ -23,7 +23,7 @@ use crate::{
     tokenizer::{
         chunk,
         multiword::{MultiwordTagger, MultiwordTaggerFields},
-        tag::{Tagger, TaggerLangOptions},
+        tag::{Tagger, TaggerLangOptions, WordIdMap},
         Tokenizer, TokenizerLangOptions,
     },
     types::*,
@@ -94,9 +94,6 @@ impl Tagger {
         common_words: &HashSet<String>,
         lang_options: TaggerLangOptions,
     ) -> std::io::Result<Self> {
-        let mut tags = DefaultHashMap::default();
-        let mut groups = DefaultHashMap::default();
-
         let mut tag_store = HashSet::new();
         let mut word_store = HashSet::new();
 
@@ -148,24 +145,25 @@
             .map(|(i, x)| (x.to_string(), PosIdInt::from_value_unchecked(i as u16)))
             .collect();
 
+        let mut tags: Vec<Option<Vec<(WordIdInt, PosIdInt)>>> = vec![None; word_store.len()];
+
         for (word, inflection, tag) in lines.iter() {
            let word_id = word_store.get_by_left(word).unwrap();
            let lemma_id = word_store.get_by_left(inflection).unwrap();
            let pos_id = tag_store.get_by_left(tag).unwrap();
 
-            let group = groups.entry(*lemma_id).or_insert_with(Vec::new);
-            if !group.contains(word_id) {
-                group.push(*word_id);
+            match &mut tags[word_id.value() as usize] {
+                Some(vec) => {
+                    vec.push((*lemma_id, *pos_id));
+                }
+                None => {
+                    tags[word_id.value() as usize] = Some(vec![(*lemma_id, *pos_id)]);
+                }
             }
-
-            tags.entry(*word_id)
-                .or_insert_with(Vec::new)
-                .push((*lemma_id, *pos_id));
         }
 
         Ok(Tagger {
-            tags,
-            groups,
+            tags: WordIdMap(tags),
            word_store,
            tag_store,
            lang_options,
@@ -453,24 +451,32 @@ pub(in crate::compile) struct ContextData {
     outcomes: Vec<usize>,
 }
 
-impl From<ContextData> for chunk::Context {
-    fn from(data: ContextData) -> Self {
-        chunk::Context {
-            parameters: data.parameters,
-            outcomes: data.outcomes,
-        }
-    }
-}
-
 impl From<ModelData> for chunk::Model {
     fn from(data: ModelData) -> Self {
+        let mut outcomes: Vec<usize> = Vec::new();
+        let mut parameters: Vec<f32> = Vec::new();
+
+        let pmap = data
+            .pmap
+            .into_iter()
+            .map(|(key, value)| {
+                assert_eq!(value.outcomes.len(), value.parameters.len());
+
+                let offset = outcomes.len();
+                let length = value.outcomes.len();
+
+                outcomes.extend(value.outcomes);
+                parameters.extend(value.parameters);
+
+                (chunk::hash::hash_str(&key), (offset, length))
+            })
+            .collect::<DefaultHashMap<_, _>>();
+
         chunk::Model {
             outcome_labels: data.outcome_labels,
-            pmap: data
-                .pmap
-                .into_iter()
-                .map(|(key, value)| (chunk::hash::hash_str(&key), value.into()))
-                .collect::<DefaultHashMap<_, _>>(),
+            outcomes,
+            parameters,
+            pmap,
         }
     }
 }
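The layout change behind the new `tags`, in isolation: word ids are dense integers in `0..word_store.len()`, so a `Vec<Option<…>>` indexed by id can replace the hash map. A standalone sketch with simplified types (`u32`/`u16` standing in for nlprule's `WordIdInt`/`PosIdInt`):

```rust
use std::collections::HashMap;

fn main() {
    let word_store_len = 4;
    // (word_id, (lemma_id, pos_id)) entries, as produced from the dictionary lines.
    let entries: &[(u32, (u32, u16))] = &[(1, (1, 7)), (1, (3, 2)), (3, (3, 0))];

    // Old layout: one hash per insert and per lookup.
    let mut map: HashMap<u32, Vec<(u32, u16)>> = HashMap::new();
    for &(word, data) in entries {
        map.entry(word).or_insert_with(Vec::new).push(data);
    }

    // New layout: direct indexing; `None` marks words without tags.
    let mut table: Vec<Option<Vec<(u32, u16)>>> = vec![None; word_store_len];
    for &(word, data) in entries {
        match &mut table[word as usize] {
            Some(vec) => vec.push(data),
            None => table[word as usize] = Some(vec![data]),
        }
    }

    // Same answers, no hashing on the lookup path.
    assert_eq!(map.get(&1), table[1].as_ref());
    assert!(map.get(&2).is_none() && table[2].is_none());
}
```

The trade-off is a `None` slot for every word without tags, which is cheap next to per-entry hash-map overhead when the ids are dense.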
1 change: 1 addition & 0 deletions nlprule/src/compile/parse_structure.rs
@@ -380,6 +380,7 @@ fn parse_match(m: structure::Match, engine: &Engine, info: &mut BuildInfo) -> Re
         || m.postag_replace.is_some()
         || m.text.is_some()
     {
+        // this would need a fully functional PosReplacer to work
         return Err(Error::Unimplemented(
             "postag, postag_regex, postag_replace and text in `match` are not implemented.".into(),
         ));
5 changes: 2 additions & 3 deletions nlprule/src/filter/mod.rs
@@ -27,12 +27,11 @@ impl Filterable for NoDisambiguationEnglishPartialPosTagFilter {
     fn keep(&self, sentence: &MatchSentence, graph: &MatchGraph) -> bool {
         graph.by_id(self.id).tokens(sentence).all(|token| {
             if let Some(captures) = self.regexp.captures(&token.word().as_str()) {
-                let tags = sentence
+                let mut tags = sentence
                     .tagger()
                     .get_tags(&captures.get(1).unwrap().as_str());
 
-                tags.iter()
-                    .any(|x| self.postag_regexp.is_match(x.pos().as_str()))
+                tags.any(|x| self.postag_regexp.is_match(x.pos().as_str()))
             } else {
                 false
             }
32 changes: 3 additions & 29 deletions nlprule/src/rule/grammar.rs
@@ -51,35 +51,9 @@ pub struct PosReplacer {
 }
 
 impl PosReplacer {
-    fn apply(&self, text: &str, sentence: &MatchSentence) -> Option<String> {
-        let mut candidates: Vec<_> = sentence
-            .tagger()
-            .get_tags(text)
-            .iter()
-            .map(|x| {
-                let group_words = sentence.tagger().get_group_members(&x.lemma().as_str());
-                let mut data = Vec::new();
-                for word in group_words {
-                    if let Some(i) = sentence
-                        .tagger()
-                        .get_tags(word)
-                        .iter()
-                        .position(|x| self.matcher.is_match(x.pos()))
-                    {
-                        data.push((word.to_string(), i));
-                    }
-                }
-                data
-            })
-            .rev()
-            .flatten()
-            .collect();
-        candidates.sort_by(|(_, a), (_, b)| a.cmp(b));
-        if candidates.is_empty() {
-            None
-        } else {
-            Some(candidates.remove(0).0)
-        }
+    fn apply(&self, _text: &str, _sentence: &MatchSentence) -> Option<String> {
+        // TODO: needs to be implemented with correct ordering, currently rules which would need this are disabled
+        unimplemented!()
     }
 }
 
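Context for the stub: as far as this diff shows, the deleted body was the consumer of `get_group_members`, which this PR removes; per the TODO, rules that would exercise `PosReplacer::apply` remain disabled until it is reimplemented with correct candidate ordering.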
12 changes: 7 additions & 5 deletions nlprule/src/tokenizer.rs
@@ -324,11 +324,13 @@ impl Tokenizer {
                 IncompleteToken::new(
                     Word::new(
                         self.tagger.id_word(token_text.into()),
-                        self.tagger.get_tags_with_options(
-                            token_text,
-                            if is_sentence_start { Some(true) } else { None },
-                            None,
-                        ),
+                        self.tagger
+                            .get_tags_with_options(
+                                token_text,
+                                if is_sentence_start { Some(true) } else { None },
+                                None,
+                            )
+                            .collect(),
                     ),
                     Span::new(
                         byte_start..byte_start + token_text.len(),
108 changes: 50 additions & 58 deletions nlprule/src/tokenizer/chunk.rs
@@ -18,12 +18,6 @@ fn softmax(vec: &mut Vec<f32>) {
     }
 }
 
-#[derive(Debug, Serialize, Deserialize, Clone)]
-pub(crate) struct Context {
-    pub(crate) parameters: Vec<f32>,
-    pub(crate) outcomes: Vec<usize>,
-}
-
 #[derive(Debug, Clone)]
 struct Sequence<'a> {
     outcomes: Vec<&'a str>,
@@ -122,77 +116,70 @@ pub(crate) mod hash {
 #[derive(Serialize, Deserialize)]
 struct ModelFields {
     outcome_labels: Vec<String>,
-    // stores each hash and the length of the context of the hash
-    // this is kind of close to gzip compression already and it is difficult
-    // where to draw the line. The chunker model should have a custom
-    // serialization implementation anyway for bf16 compression so this is OK here.
-    cols: Vec<(u64, u8)>,
-    // stores the context outcome labels
-    rows: Vec<u8>,
-    // stores the context parameter values
-    values: Vec<bf16>,
+    pmap: Vec<(u64, u8)>,
+    outcomes: Vec<u8>,
+    parameters: Vec<bf16>,
 }
 
 impl From<Model> for ModelFields {
     fn from(model: Model) -> Self {
-        let mut cols = Vec::new();
-        let mut rows = Vec::new();
-        let mut values = Vec::new();
+        let mut pmap: Vec<_> = model.pmap.into_iter().collect();
+        pmap.sort_by_key(|(_, (offset, _))| *offset);
 
-        for (key, context) in model.pmap.iter() {
-            assert_eq!(context.outcomes.len(), context.parameters.len());
-            assert!(context.outcomes.len() <= std::u8::MAX as usize);
-            cols.push((*key, context.outcomes.len() as u8));
-
-            for (label, value) in context.outcomes.iter().zip(context.parameters.iter()) {
-                assert!(*label <= std::u8::MAX as usize);
+        let pmap = pmap
+            .into_iter()
+            .map(|(key, (_, length))| {
+                assert!(length <= u8::MAX as usize);
+                (key, length as u8)
+            })
+            .collect();
 
-                rows.push(*label as u8);
-                values.push(bf16::from_f32(*value));
-            }
-        }
+        let outcomes = model
+            .outcomes
+            .into_iter()
+            .map(|outcome| {
+                assert!(outcome <= u8::MAX as usize);
+                outcome as u8
+            })
+            .collect();
+        let parameters = model.parameters.into_iter().map(bf16::from_f32).collect();
 
         ModelFields {
             outcome_labels: model.outcome_labels,
-            cols,
-            rows,
-            values,
+            pmap,
+            outcomes,
+            parameters,
         }
     }
 }
 
 impl From<ModelFields> for Model {
     fn from(data: ModelFields) -> Self {
         let mut pmap = DefaultHashMap::new();
+        let mut offset = 0;
 
+        for (key, length) in data.pmap {
+            pmap.insert(key, (offset, length as usize));
 
-        let mut row_iter = data.rows.iter();
-        let mut value_iter = data.values.iter();
-
-        for (key, n) in data.cols.iter() {
-            let outcomes: Vec<_> = (0..*n as usize)
-                .map(|_| *row_iter.next().expect("checked in From<Model> impl") as usize)
-                .collect();
-            let parameters: Vec<_> = (0..*n as usize)
-                .map(|_| {
-                    value_iter
-                        .next()
-                        .expect("checked in From<Model> impl")
-                        .to_f32()
-                })
-                .collect();
-
-            pmap.insert(
-                *key,
-                Context {
-                    outcomes,
-                    parameters,
-                },
-            );
+            offset += length as usize;
         }
 
+        let outcomes = data
+            .outcomes
+            .into_iter()
+            .map(|outcome| outcome as usize)
+            .collect();
+        let parameters = data
+            .parameters
+            .into_iter()
+            .map(|parameter| parameter.to_f32())
+            .collect();
+
         Model {
             outcome_labels: data.outcome_labels,
-            pmap,
+            outcomes,
+            parameters,
+            pmap,
         }
     }
 }
@@ -201,16 +188,21 @@ impl From<ModelFields> for Model {
 #[serde(from = "ModelFields", into = "ModelFields")]
 pub(crate) struct Model {
     pub(crate) outcome_labels: Vec<String>,
-    pub(crate) pmap: DefaultHashMap<u64, Context>,
+    pub(crate) outcomes: Vec<usize>,
+    pub(crate) parameters: Vec<f32>,
+    pub(crate) pmap: DefaultHashMap<u64, (usize, usize)>,
 }
 
 impl Model {
     fn eval(&self, context: &[u64]) -> Vec<f32> {
         let mut prior =
             vec![(1. / (self.outcome_labels.len() as f32)).ln(); self.outcome_labels.len()];
 
-        for context in context.iter().filter_map(|x| self.pmap.get(&x)) {
-            for (idx, param) in context.outcomes.iter().zip(context.parameters.iter()) {
+        for (offset, length) in context.iter().filter_map(|x| self.pmap.get(&x)) {
+            let outcomes = &self.outcomes[*offset..*offset + length];
+            let parameters = &self.parameters[*offset..*offset + length];
+
+            for (idx, param) in outcomes.iter().zip(parameters.iter()) {
                 prior[*idx] += param;
             }
         }
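The serialization change in miniature: instead of one `Context` (two `Vec`s) per hash key, all contexts are concatenated into two flat arrays and the map stores `(offset, length)` slices into them. A self-contained sketch with invented values:

```rust
use std::collections::HashMap;

fn main() {
    // Old shape: each key owns two parallel Vecs (separate allocations per entry).
    let nested: Vec<(u64, (Vec<usize>, Vec<f32>))> = vec![
        (10, (vec![0, 2], vec![0.5, -1.0])),
        (11, (vec![1], vec![3.25])),
    ];

    // New shape: two shared arrays plus an (offset, length) pair per key.
    let mut outcomes: Vec<usize> = Vec::new();
    let mut parameters: Vec<f32> = Vec::new();
    let mut pmap: HashMap<u64, (usize, usize)> = HashMap::new();

    for (key, (os, ps)) in nested {
        assert_eq!(os.len(), ps.len());
        let (offset, length) = (outcomes.len(), os.len());
        outcomes.extend(os);
        parameters.extend(ps);
        pmap.insert(key, (offset, length));
    }

    // Evaluation slices the flat arrays, mirroring what `Model::eval` now does.
    let (offset, length) = pmap[&10];
    assert_eq!(&outcomes[offset..offset + length], &[0, 2]);
    assert_eq!(&parameters[offset..offset + length], &[0.5, -1.0]);
}
```

Flattening drops the per-`Context` allocations and, together with the `bf16` parameters, keeps the serialized `ModelFields` compact.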
(The remaining 2 changed files are not shown.)
