Skip to content

Commit b49f7c2

Browse files
committed
don't remove non-latin characters during tokenization
1 parent 2c8dbd1 commit b49f7c2

File tree

2 files changed

+10
-2
lines changed

2 files changed

+10
-2
lines changed

ext/xregexp/nonLetterRegex.js

Lines changed: 7 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

js/places/fullTextSearch.js

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,8 @@
 /* global db Dexie */
 
+importScripts('../../ext/xregexp/nonLetterRegex.js')
+
 const whitespaceRegex = /\s+/g
-const notWordOrWhitespaceRegex = /[^\w\s]/g
 
 // stop words list from https://github.com/weixsong/elasticlunr.js/blob/master/lib/stop_word_filter.js
 const stopWords = {
@@ -129,7 +130,7 @@ const stopWords = {
 
 /* this is used in placesWorker.js when a history item is created */
 function tokenize (string) {
-  return string.trim().toLowerCase().replace(notWordOrWhitespaceRegex, ' ').split(whitespaceRegex).filter(function (token) {
+  return string.trim().toLowerCase().replace(nonLetterRegex, ' ').split(whitespaceRegex).filter(function (token) {
     return !stopWords[token]
   }).slice(0, 20000)
 }

0 commit comments

Comments
 (0)