Skip to content

Commit

Permalink
大文字小文字の正規化に対応した
Browse files Browse the repository at this point in the history
  • Loading branch information
Chanmoro committed Feb 12, 2021
1 parent 90bba41 commit 71fe521
Show file tree
Hide file tree
Showing 6 changed files with 37 additions and 9 deletions.
13 changes: 13 additions & 0 deletions examples/example.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,16 @@
print('`京都`で検索', index['京都'])
print()
print('`京都府`で検索', index['京都府'])
print()

# Second demo: look up the fruit list with queries that differ in letter case.
index_2 = naivesearch('./examples/fruits.txt')
queries = (
    'apple',   # expected hit: Apple
    'APPLE',   # expected hit: Apple
    'Orange',  # expected hit: Orange
    'ORANGE',  # expected hit: Orange
    '京都',    # expected: no hits
)
for query in queries:
    # Each result is followed by a blank line to separate the outputs.
    print(f'`{query}`で検索', index_2[query])
    print()
5 changes: 3 additions & 2 deletions examples/fruits.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@ Grape
Mango
Blueberry
Pomegranate
Carambola(U.K) – starfruit (U.S)
Carambola
starfruit
Plum
Banana
Raspberry
Expand All @@ -25,4 +26,4 @@ Grapefruit
Melon
Coconut
Avocado
Peachモン
Peach
8 changes: 7 additions & 1 deletion naivesearch/indexer/formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,12 @@

Formatter = Callable[[str], str]


class UnicodeNormalizer:
    """Formatter that applies Unicode NFKC normalization to a string.

    NFKC replaces compatibility characters (for example full-width Latin
    letters or Kangxi radicals) with their canonical equivalents, so that
    visually equivalent spellings compare equal after formatting.
    """

    def __call__(self, x: str) -> str:
        """Return *x* converted to Unicode normalization form NFKC."""
        return unicodedata.normalize('NFKC', x)


class LowerCaseNormalizer:
    """Formatter that converts a string to lower case."""

    def __call__(self, x: str):
        """Return the result of ``x.lower()``."""
        lowered = x.lower()
        return lowered
4 changes: 2 additions & 2 deletions naivesearch/indexer/inverted_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,12 @@
class Reader(Iterable[str]):
    """Iterable of strings; the type accepted as ``reader`` by ``InvertedIndex``."""


class InvertedIndex:
index: Dict[str, List[str]] = defaultdict(list)
chunkers: List[Chunker]

def __init__(self, reader: Reader, chunkers: List[Chunker]):
self.index: Dict[str, List[str]] = defaultdict(list)
self.chunkers = chunkers

logger.info('Start indexing.')
Expand All @@ -40,4 +41,3 @@ def __getitem__(self, q):
result = result & chain

return list(result)

9 changes: 8 additions & 1 deletion naivesearch/indexer/test_formatter.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from naivesearch import InvertedIndex
from typing import Callable, List, Optional
from .formatter import UnicodeNormalizer
from .formatter import UnicodeNormalizer, LowerCaseNormalizer


class TestFormatter:
def test_unicode_normalizer(self):
Expand All @@ -10,3 +11,9 @@ def test_unicode_normalizer(self):
rhs = '⼈⼝'
assert lhs != rhs
assert formatter(lhs) == formatter(rhs)

def test_lower_case_normalizer(self):
    """Strings differing only in letter case format to the same lower-case text."""
    formatter = LowerCaseNormalizer()
    lhs = 'UPPER'
    rhs = 'upper'
    # Guard against a vacuous pass, mirroring test_unicode_normalizer.
    assert lhs != rhs
    assert formatter(lhs) == formatter(rhs)
    # Pin the direction of the conversion: an upper-casing formatter would
    # satisfy the equality above but must fail here.
    assert formatter(lhs) == 'upper'
7 changes: 4 additions & 3 deletions naivesearch/main.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
from naivesearch.indexer import InvertedIndex
from naivesearch.indexer.formatter import UnicodeNormalizer
from naivesearch.indexer.formatter import UnicodeNormalizer, LowerCaseNormalizer
from naivesearch.indexer.converter import BigramConverter
from naivesearch.indexer.chunker import CharacterChunker


def naivesearch(filepath: str):

def file_reader(filepath):
Expand All @@ -13,8 +14,8 @@ def file_reader(filepath):
index = InvertedIndex(
file_reader(filepath),
[
BigramConverter(CharacterChunker(UnicodeNormalizer()))
# BigramConverter(CharacterChunker(UnicodeNormalizer())),
BigramConverter(CharacterChunker(LowerCaseNormalizer()))
]
)
return index

0 comments on commit 71fe521

Please sign in to comment.