This repository has been archived by the owner on Nov 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindexing.lisp
110 lines (92 loc) · 3.73 KB
/
indexing.lisp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
(in-package #:cache-cache)
(ql:quickload "trivial-benchmark")
(trivial-benchmark:with-timing (100)
(find-issues "gitlab"))
; - SAMPLES TOTAL MINIMUM MAXIMUM MEDIAN AVERAGE DEVIATION
; REAL-TIME 100 0.584927 0.004651 0.096082 0.004797 0.005849 0.009075
; RUN-TIME 100 0.609375 0 0.09375 0 0.006094 0.01146
; USER-RUN-TIME 100 0.5625 0 0.0625 0 0.005625 0.009249
; SYSTEM-RUN-TIME 100 0.03125 0 0.03125 0 0.000312 0.003109
; PAGE-FAULTS 100 0 0 0 0 0 0.0
; GC-RUN-TIME 100 93.75 0 93.75 0 0.9375 9.328008
; BYTES-CONSED 100 819744656 8191504 8257136 8191536 8197446.5 18767.557
; EVAL-CALLS 100 0 0 0 0 0 0.0
; => NIL
(defun ngrams (n string)
"Generate all N-grams of STRING, the list is not deduplicated."
(loop :for i :below (- (length string) 3)
:for ngram = (make-array
n
:element-type (array-element-type string)
:displaced-to string
:displaced-index-offset i)
:collect ngram))
(defun normalized-ngrams (n string)
(remove-duplicates
(mapcar #'string-downcase
(ngrams n string))
:test #'string=))
#++
(normalized-ngrams 3 "abcdefgAbcd")
(defun build-index (&optional (issues *issues*))
(let ((index (make-hash-table :test #'equal)))
(when issues
(maphash
#'(lambda (id issue)
(loop :for ngram :in (normalized-ngrams
3
(issue-title issue))
:do (pushnew id (gethash ngram index))))
issues))
index))
(defparameter *index* (build-index))
#++
(progn
(time (length
(jzon:stringify *index*))))
(defun narrow-down-using-index (index query &optional candidates)
(loop
:for ngram :in (normalized-ngrams 3 query)
:do
(setf candidates
(if candidates
;; Update candidates
(intersection
candidates
(gethash ngram index))
;; Initialize candidates
(gethash ngram index)))
;; No candidates left, early exit
:while candidates
:finally (return candidates)))
#++
(length
(narrow-down-using-index *index* "gitlab"))
(trivial-benchmark:with-timing (100)
(let ((query "gitlab"))
(search-in-list
query
(mapcar
#'issue-by-id
(narrow-down-using-index *index* query))
:key #'issue-title)))
; - SAMPLES TOTAL MINIMUM MAXIMUM MEDIAN AVERAGE DEVIATION
; REAL-TIME 100 0.023952 0.000224 0.000632 0.00023 0.00024 0.000042
; RUN-TIME 100 0.03125 0 0.015625 0 0.000312 0.002188
; USER-RUN-TIME 100 0 0 0 0 0 0.0
; SYSTEM-RUN-TIME 100 0.015625 0 0.015625 0 0.000156 0.001555
; PAGE-FAULTS 100 0 0 0 0 0 0.0
; GC-RUN-TIME 100 0 0 0 0 0 0.0
; BYTES-CONSED 100 7208112 65520 131072 65536 72081.12 19658.295
; EVAL-CALLS 100 0 0 0 0 0 0.0
; => NIL
(/
0.00024
0.005849)
;; It took 4% of the (real) time
(defun find-issues* (query)
"Return the issues that contains all the parts of QUERY in their KEY."
(search-in-list/and
(split-sequence:split-sequence #\Space query :remove-empty-subseqs t)
(a:hash-table-values *issues*)
:key #'issue-title))