Skip to content
Snippets Groups Projects
Commit a6251d32 authored by Bruno Freitas Tissei's avatar Bruno Freitas Tissei
Browse files

Add Aho Corasick

parent 8fa1490a
No related branches found
No related tags found
No related merge requests found
/// Aho-Corasick
///
/// Multi-pattern string matching: a trie is built over a dictionary of
/// patterns, every node receives a failure link (longest proper suffix that
/// is also a path in the trie) and an output link (nearest node on the
/// failure chain that ends a pattern), and a text is then scanned once.
///
/// Time (n = total length of the patterns, p = length of the text,
///       x = number of reported occurrences; every trie step carries an
///       extra logarithmic factor because children live in a std::map):
/// - build: O(n)
/// - preprocess: O(n)
/// - match: O(p + n), plus sorting the answer
/// - match_all: O(p + x)
/// Space: O(n) for the automaton
///
/// Notes:
/// - Repeated dictionary strings are supported: every index is reported.
/// - An empty dictionary string is reported by match() whenever the text is
///   non-empty, and is never reported by match_all().
struct AhoCorasick {
  struct Node {
    std::map<char, int> next;  // outgoing trie edges: character -> node id
    std::vector<int> words;    // indices of all patterns ending exactly here
    int idx;   // index of the last pattern ending here, -1 if none
    int fail;  // failure link (0 = root)
    int out;   // output link: closest pattern-ending node reachable through
               // failure links, 0 if there is none
    int cnt;   // scratch counter used by match()
    int hei;   // depth of the node, i.e. length of the string it spells

    Node() : idx(-1), fail(0), out(0), cnt(0), hei(0) {}

    // Returns 1 if an edge labelled c leaves this node, 0 otherwise.
    int has(char c) const { return static_cast<int>(next.count(c)); }

    // Child reached through c; creates the map entry if it is missing.
    int &operator[](char c) { return next[c]; }
  };

  std::vector<int> top;    // node ids in BFS order (root first), set by build()
  std::vector<Node> trie;  // trie[0] is the root

  // Builds the automaton for dictionary v; v[i] is reported as index i.
  AhoCorasick(const std::vector<std::string> &v) {
    trie.push_back(Node());
    build(v);  // build() also computes the links and fills `top`
  }

  // Adds s to the trie and returns the id of the node where it ends.
  // Links are not refreshed here; build()/preprocess() take care of that.
  int insert(const std::string &s) {
    int n = 0;
    for (std::size_t i = 0; i < s.size(); ++i) {
      const char c = s[i];
      const auto it = trie[n].next.find(c);
      if (it != trie[n].next.end()) {
        n = it->second;
        continue;
      }
      const int child = static_cast<int>(trie.size());
      // Register the edge before push_back: growing the vector may move the
      // nodes, so no reference into trie[n] may be held across it.
      trie[n].next[c] = child;
      trie.push_back(Node());
      trie[child].hei = static_cast<int>(i) + 1;  // depth of the NEW node
      n = child;
    }
    return n;
  }

  // Inserts every string of v (index = position in v), then recomputes the
  // failure/output links and stores the BFS order in `top`.
  // The constructor already calls this once.
  void build(const std::vector<std::string> &v) {
    for (std::size_t i = 0; i < v.size(); ++i) {
      const int n = insert(v[i]);
      trie[n].idx = static_cast<int>(i);
      trie[n].words.push_back(static_cast<int>(i));
    }
    top = preprocess();
  }

  // Automaton transition: starting at node v, follows failure links until a
  // node with an edge labelled c is found and returns that child; returns
  // the root (0) if no such node exists.
  int suffix(int v, char c) const {
    for (;;) {
      const auto it = trie[v].next.find(c);
      if (it != trie[v].next.end()) return it->second;
      if (v == 0) return 0;
      v = trie[v].fail;
    }
  }

  // Computes failure and output links with a BFS from the root and returns
  // the visiting order. Links are recomputed from scratch, so calling this
  // more than once is harmless.
  std::vector<int> preprocess() {
    std::vector<int> order(1, 0);
    for (std::size_t head = 0; head < order.size(); ++head) {
      const int u = order[head];
      for (const auto &edge : trie[u].next) {
        const int child = edge.second;
        // Children of the root fall back to the root itself.
        const int f = u ? suffix(trie[u].fail, edge.first) : 0;
        trie[child].fail = f;
        // f is shallower than child, hence already finalized by the BFS.
        trie[child].out = trie[f].words.empty() ? trie[f].out : f;
        order.push_back(child);
      }
    }
    return order;
  }

  // Returns the sorted indices of the dictionary strings occurring at least
  // once in text p.
  std::vector<int> match(const std::string &p) {
    // Drop the counters left behind by a previous call.
    for (Node &node : trie) node.cnt = 0;

    int u = 0;
    for (const char c : p) {
      u = suffix(u, c);
      ++trie[u].cnt;
    }

    // Reverse BFS order: every node is folded into its failure target before
    // that target is folded further. The root (top[0]) has nowhere to go.
    for (std::size_t k = top.size(); k-- > 1;) {
      const Node &node = trie[top[k]];
      trie[node.fail].cnt += node.cnt;
    }

    std::vector<int> ans;
    for (const Node &node : trie)
      if (node.cnt)
        ans.insert(ans.end(), node.words.begin(), node.words.end());
    std::sort(ans.begin(), ans.end());
    return ans;
  }

  // Returns every occurrence of a dictionary string in text p as a pair
  // (index of the string, position in p where the occurrence starts).
  // Occurrences are listed by increasing end position; for a fixed end
  // position longer strings come first.
  // NOTE(review): the return type is spelled std::pair<int,int>, assuming the
  // project alias `ii` names exactly that type - confirm.
  std::vector<std::pair<int, int> > match_all(const std::string &p) const {
    std::vector<std::pair<int, int> > ans;
    int u = 0;
    for (std::size_t i = 0; i < p.size(); ++i) {
      u = suffix(u, p[i]);
      const int last = static_cast<int>(i);
      // Walk only the nodes that end a pattern; a node's depth is the length
      // of the patterns ending there, which gives the start position.
      for (int w = trie[u].words.empty() ? trie[u].out : u; w != 0;
           w = trie[w].out)
        for (const int id : trie[w].words)
          ans.emplace_back(id, last - trie[w].hei + 1);
    }
    return ans;
  }
};
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment