-
Notifications
You must be signed in to change notification settings - Fork 10
/
ac_scan.cpp
90 lines (83 loc) · 2.39 KB
/
ac_scan.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#define _CRT_SECURE_NO_WARNINGS
#define _SCL_SECURE_NO_WARNINGS
#include <terark/fsa/fsa.hpp>
#include <terark/fsa/base_ac.hpp>
#include <terark/util/autoclose.hpp>
#include <terark/util/linebuf.hpp>
#include <getopt.h>
using namespace terark;
/// Print the command-line help text to stderr.
///
/// @param prog  program name substituted into the first line of the help text
void usage(const char* prog) {
    // Adjacent literals are concatenated by the compiler into one format string.
    fprintf(stderr,
        "usage: %s Options\n"
        "Options:\n"
        "-i AC-Automata-File\n"
        "AC-Automata-File is a file built by `ac_build.exe` from a set of patterns\n"
        "-f TXT-File\n"
        "a text file to be scaned, if omitted, read from stdin\n",
        prog);
}
/// Match callback passed to `ac_scan`: prints one line to stdout per matched word.
///
/// This is a plain aggregate; `ac` and `text` are filled in by the caller.
/// `text` must point at the start of the buffer currently being scanned, because
/// matched words are sliced out of it using offsets relative to that start.
struct OnHit {
    /// Report every word that matched at one position.
    ///
    /// @param endpos  offset in `text` just past the last byte of the match
    ///                (the match start is computed as `endpos - word length`)
    /// @param words   array of matched word ids
    /// @param cnt     number of entries in `words`
    /// @param state   automaton state of the hit; only used by the debug-build
    ///                consistency check below
    void operator()(size_t endpos, const uint32_t* words, size_t cnt, size_t state) const {
        (void)state; // unused when NDEBUG is defined or word lengths are unavailable

        // Whether word lengths are available does not change during one callback.
        const bool has_length = ac->has_word_length();
        // Print the full offset: narrowing size_t to int would misreport large positions.
        const unsigned long long end_offset = static_cast<unsigned long long>(endpos);

        for (size_t i = 0; i < cnt; ++i) {
            // Word ids are unsigned, so they are printed with an unsigned conversion.
            const unsigned word_id = static_cast<unsigned>(words[i]);
            if (has_length) {
                int wlen = ac->wlen(words[i]);
                size_t pos = endpos - wlen;
#ifndef NDEBUG
                try {
                    // Cross-check the text slice against the word stored in the automaton.
                    std::string acWord = ac->restore_word(state, words[i]);
                    assert(fstring(text + pos, wlen) == acWord);
                }
                catch (const std::invalid_argument&) {
                    // not a DoubleArray AC automaton
                }
#endif
                printf("hit_endpos=%04llu : word_id=%06u : %.*s\n", end_offset, word_id, wlen, text + pos);
            }
            else {
                // Without a word length the match start is unknown; report end position and id only.
                printf("hit_endpos=%04llu : word_id=%06u\n", end_offset, word_id);
            }
        }
    }

    const BaseAC* ac;   ///< automaton queried for word lengths (and word text in debug builds)
    const char* text;   ///< start of the buffer being scanned
};
int main(int argc, char* argv[]) {
const char* dfa_file = NULL;
const char* txt_file = NULL;
for (int opt=0; (opt = getopt(argc, argv, "i:f:")) != -1; ) {
switch (opt) {
case '?': usage(argv[0]); return 3;
case 'i': dfa_file = optarg; break;
case 'f': txt_file = optarg; break;
}
}
if (NULL == dfa_file) {
fprintf(stderr, "usage: %s -i input_ac_dfa_file [-f text_file_to_be_matched]\n", argv[0]);
return 1;
}
std::unique_ptr<BaseDFA> dfa(BaseDFA::load_from(dfa_file));
if (dfa->get_ac() == NULL) {
fprintf(stderr, "Fail: file: %s is not a AC DFA\n", dfa_file);
return 1;
}
terark::Auto_fclose fp;
if (txt_file) {
fp = fopen(txt_file, "r");
if (NULL == fp) {
fprintf(stderr, "Fail: fopen(%s, r) = %s\n", txt_file, strerror(errno));
return 1;
}
}
OnHit on_hit = { dfa->get_ac(), NULL };
if (!on_hit.ac->has_word_length()) {
fprintf(stderr, "Pattern length was not saved in AC Automata,\n"
" -- Only match endpos and pattern_id will be reported!\n");
}
terark::LineBuf line;
while (line.getline(fp.self_or(stdin)) > 0) {
line.chomp();
on_hit.text = line.p;
on_hit.ac->ac_scan(line, ref(on_hit));
}
return 0;
}