From ee0b50234931f534f068071712958da8a571b3f3 Mon Sep 17 00:00:00 2001 From: Bastian Rihm Date: Fri, 17 Nov 2023 08:45:28 +0100 Subject: [PATCH] Improve search results (#63) * Generate wildcard question and add clean index mapping * JSON field support * Do not add keyword mappings to _all field * Do not add fields to index that are not searchable * Add motion change recos to search * Add mediafile directory to additional fields --- pkg/meta/collections.go | 21 ++++++---- pkg/meta/filters.go | 35 +++++++++++----- pkg/meta/member.go | 1 + pkg/search/textindex.go | 89 ++++++++++++++++++++++++++++++++--------- search.yml | 40 ++++++++++++++++++ 5 files changed, 149 insertions(+), 37 deletions(-) diff --git a/pkg/meta/collections.go b/pkg/meta/collections.go index 9495446..39dc003 100644 --- a/pkg/meta/collections.go +++ b/pkg/meta/collections.go @@ -16,17 +16,24 @@ type Collection struct { // CollectionRelation describes a related collection type CollectionRelation struct { - Type string `json:"type"` - Collection *string `json:"collection,omitempty"` - Fields map[string]*CollectionRelation `json:"fields"` + Type string `json:"type" yaml:"type"` + Collection *string `json:"collection,omitempty" yaml:"collection,omitempty"` + Fields map[string]*CollectionRelation `json:"fields" yaml:"fields"` +} + +// CollectionSearchableConfig contains per field config of a collection +type CollectionSearchableConfig struct { + Type *string `yaml:"type,omitempty"` + Analyzer *string `yaml:"analyzer,omitempty"` } // CollectionDescription is the collection format for search filters type CollectionDescription struct { - Searchable []string `yaml:"searchable"` - Additional []string `yaml:"additional"` - Contains []string `yaml:"contains,omitempty"` - Relations map[string]*CollectionRelation `yaml:"relations,omitempty"` + Searchable []string `yaml:"searchable"` + SearchableConfig map[string]*CollectionSearchableConfig `yaml:"searchable_config,omitempty"` + Additional []string `yaml:"additional"` + Contains []string `yaml:"contains,omitempty"` + Relations map[string]*CollectionRelation `yaml:"relations,omitempty"` } // Collections is part of the meta model. diff --git a/pkg/meta/filters.go b/pkg/meta/filters.go index b035acb..2136e18 100644 --- a/pkg/meta/filters.go +++ b/pkg/meta/filters.go @@ -9,11 +9,12 @@ import ( // Filter is part of the meta model. type Filter struct { - Name string - Items []string - Additional []string - Contains map[string]struct{} - Relations map[string]*CollectionRelation + Name string + Items []string + ItemsConfig map[string]*CollectionSearchableConfig + Additional []string + Contains map[string]struct{} + Relations map[string]*CollectionRelation } // FilterKey is part of the meta model. @@ -65,11 +66,12 @@ func (fs *Filters) UnmarshalYAML(value *yaml.Node) error { } *fs = append(*fs, Filter{ - Name: s.Name, - Items: fsm[s].Searchable, - Additional: fsm[s].Additional, - Relations: relations, - Contains: contains, + Name: s.Name, + Items: fsm[s].Searchable, + ItemsConfig: fsm[s].SearchableConfig, + Additional: fsm[s].Additional, + Relations: relations, + Contains: contains, }) } return nil @@ -101,11 +103,16 @@ func (fs Filters) Retain(verbose bool) func(string, string, *Member) bool { keep := map[key]struct{}{} additional := map[key]struct{}{} relations := map[key]*CollectionRelation{} + config := map[key]*CollectionSearchableConfig{} for _, m := range fs { for _, f := range m.Items { keep[key{rel: m.Name, field: f}] = struct{}{} } + for f, data := range m.ItemsConfig { + config[key{rel: m.Name, field: f}] = data + } + for _, f := range m.Additional { additional[key{rel: m.Name, field: f}] = struct{}{} } @@ -119,6 +126,14 @@ func (fs Filters) Retain(verbose bool) func(string, string, *Member) bool { m.Relation = relations[key{rel: rk, field: fk}] } + if c, ok := config[key{rel: rk, field: fk}]; ok { + if c.Type != nil { + m.Type = *c.Type + } + + m.Analyzer = c.Analyzer + } + if _, ok := additional[key{rel: rk, field: fk}]; ok { m.Searchable = false return true diff --git a/pkg/meta/member.go b/pkg/meta/member.go index 2fa5c75..f073a03 100644 --- a/pkg/meta/member.go +++ b/pkg/meta/member.go @@ -24,6 +24,7 @@ type Member struct { RestrictionMode string `yaml:"restriction_mode"` Required bool `yaml:"required"` Searchable bool `yaml:"-"` + Analyzer *string `yaml:"-"` Relation *CollectionRelation `yaml:"-"` Order int32 `yaml:"-"` } diff --git a/pkg/search/textindex.go b/pkg/search/textindex.go index bb7f0e7..dbf53f7 100644 --- a/pkg/search/textindex.go +++ b/pkg/search/textindex.go @@ -5,10 +5,12 @@ package search import ( + "bytes" "fmt" "html" "os" "strconv" + "strings" "time" log "github.com/sirupsen/logrus" @@ -19,6 +21,7 @@ import ( "github.com/blevesearch/bleve/v2" "github.com/blevesearch/bleve/v2/analysis" "github.com/blevesearch/bleve/v2/analysis/analyzer/keyword" + "github.com/blevesearch/bleve/v2/analysis/analyzer/simple" bleveHtml "github.com/blevesearch/bleve/v2/analysis/char/html" "github.com/blevesearch/bleve/v2/analysis/lang/de" "github.com/blevesearch/bleve/v2/analysis/token/lowercase" @@ -143,38 +146,57 @@ func (bt bleveType) BleveType() string { } func buildIndexMapping(collections meta.Collections) mapping.IndexMapping { - numberFieldMapping := bleve.NewNumericFieldMapping() + + numberedRelationFieldMapping := bleve.NewNumericFieldMapping() + numberedRelationFieldMapping.IncludeInAll = false + textFieldMapping := bleve.NewTextFieldMapping() textFieldMapping.Analyzer = de.AnalyzerName htmlFieldMapping := bleve.NewTextFieldMapping() htmlFieldMapping.Analyzer = deHTML - keywordFieldMapping := bleve.NewTextFieldMapping() - keywordFieldMapping.Analyzer = keyword.Name + collectionInfoFieldMapping := bleve.NewTextFieldMapping() + collectionInfoFieldMapping.Analyzer = keyword.Name + collectionInfoFieldMapping.IncludeInAll = false + + simpleFieldMapping := bleve.NewTextFieldMapping() + simpleFieldMapping.Analyzer = simple.Name indexMapping := mapping.NewIndexMapping() indexMapping.TypeField = "_bleve_type" for name, col := range collections { docMapping := bleve.NewDocumentMapping() - docMapping.AddFieldMappingsAt("_bleve_type", keywordFieldMapping) + docMapping.AddFieldMappingsAt("_bleve_type", collectionInfoFieldMapping) for fname, cf := range col.Fields { if cf.Searchable { - switch cf.Type { - case "HTMLStrict", "HTMLPermissive": - docMapping.AddFieldMappingsAt(fname, htmlFieldMapping) - case "string", "text": - docMapping.AddFieldMappingsAt(fname, textFieldMapping) - case "generic-relation": - docMapping.AddFieldMappingsAt(fname, keywordFieldMapping) - case "relation", "number": - docMapping.AddFieldMappingsAt(fname, numberFieldMapping) - case "number[]": - docMapping.AddFieldMappingsAt(fname, numberFieldMapping) - default: - log.Errorf("unsupport type %q on field %s\n", cf.Type, fname) + if cf.Analyzer == nil { + switch cf.Type { + case "HTMLStrict", "HTMLPermissive": + docMapping.AddFieldMappingsAt(fname, htmlFieldMapping) + case "string", "text": + docMapping.AddFieldMappingsAt(fname, textFieldMapping) + docMapping.AddFieldMappingsAt("_"+fname+"_original", simpleFieldMapping) + case "generic-relation": + docMapping.AddFieldMappingsAt(fname, collectionInfoFieldMapping) + case "relation", "relation-list": + docMapping.AddFieldMappingsAt(fname, numberedRelationFieldMapping) + case "number", "number[]": + docMapping.AddFieldMappingsAt(fname, numberFieldMapping) + default: + log.Errorf("unsupport type %q on field %s\n", cf.Type, fname) + } + } else { + switch *cf.Analyzer { + case "html": + docMapping.AddFieldMappingsAt(fname, htmlFieldMapping) + case "simple": + docMapping.AddFieldMappingsAt(fname, simpleFieldMapping) + default: + log.Errorf("unsupported analyzer %q on field %s\n", *cf.Analyzer, fname) + } } } } @@ -187,9 +209,19 @@ func buildIndexMapping(collections meta.Collections) mapping.IndexMapping { } func (bt bleveType) fill(fields map[string]*meta.Member, data []byte) { - for fname := range fields { + for fname, field := range fields { + if !field.Searchable { + continue + } + switch fields[fname].Type { - case "HTMLStrict", "HTMLPermissive", "string", "text", "generic-relation": + case "string", "text": + if v, err := jsonparser.GetString(data, fname); err == nil { + bt[fname] = v + bt["_"+fname+"_original"] = v + continue + } + case "HTMLStrict", "HTMLPermissive", "generic-relation": if v, err := jsonparser.GetString(data, fname); err == nil { bt[fname] = v continue @@ -207,6 +239,13 @@ func (bt bleveType) fill(fields map[string]*meta.Member, data []byte) { } }, fname) continue + case "json-int-string-map": + bt[fname] = []string{} + jsonparser.ObjectEach(data, func(key []byte, value []byte, dataType jsonparser.ValueType, offset int) error { + bt[fname] = append(bt[fname].([]string), string(value)) + return nil + }, fname) + continue default: if v, _, _, err := jsonparser.Get(data, fname); err == nil { bt[fname] = v @@ -350,8 +389,18 @@ func (ti *TextIndex) Search(question string, collections []string, meetingID int log.Debugf("searching for %q took %v\n", question, time.Since(start)) }() + var wildcardQuestion bytes.Buffer + for _, w := range strings.Split(question, " ") { + if w[0] != byte('*') && w[len(w)-1] != byte('*') { + wildcardQuestion.WriteString("*" + strings.ToLower(w) + "* ") + } + } + wildcardQuery := bleve.NewQueryStringQuery(wildcardQuestion.String()) + var q query.Query - matchQuery := bleve.NewQueryStringQuery(question) + matchQueryOriginal := bleve.NewQueryStringQuery(question) + matchQueryOriginal.SetBoost(5) + matchQuery := bleve.NewDisjunctionQuery(matchQueryOriginal, wildcardQuery) if meetingID > 0 { fmid := float64(meetingID) diff --git a/search.yml b/search.yml index 342f437..bc259ed 100644 --- a/search.yml +++ b/search.yml @@ -53,6 +53,7 @@ mediafile: - owner_id additional: - id + - is_directory relations: owner_id: type: generic-relation @@ -84,6 +85,11 @@ motion: - text - reason - meeting_id + - amendment_paragraphs + searchable_config: + amendment_paragraphs: + type: json-int-string-map + analyzer: html additional: - id - sequential_number @@ -121,6 +127,33 @@ motion: pronoun: null username: null gender: null +motion_change_recommendation: + contains: + - motion + searchable: + - other_description + - text + - meeting_id + additional: + - id + - motion_id + relations: + motion_id: + type: relation + collection: motion + fields: + id: null + title: null + number: null + sequential_number: null + meeting_id: null + meeting_id: + type: relation + collection: meeting + fields: + id: null + name: null + motions_show_sequential_number: null poll: contains: - topic @@ -219,6 +252,13 @@ user: - organization_management_level - meeting_ids - owner_id + searchable_config: + first_name: + analyzer: simple + last_name: + analyzer: simple + email: + analyzer: simple additional: - id relations: