// Provenance: algorithms/string/aho_corasick.cpp, introduced by commit
// a6251d329572eb8b34b3283fa975108a86bd58ed ("Add Aho Corasick",
// Bruno Freitas Tissei <bft15@inf.ufpr.br>, 2019-11-22).

/// Aho-Corasick
///
/// Multi-pattern matching automaton: a trie of the patterns augmented with
/// failure links (longest proper suffix that is also a trie path) and
/// output links (nearest state on the failure chain where a pattern ends).
///
/// Notation: n = total length of the patterns, p = length of the text,
/// x = number of reported occurrences. Edges live in a std::map, so every
/// transition carries an extra log(alphabet) factor.
///
/// Time:
///   - build:      O(n)
///   - preprocess: O(n)
///   - match:      O(p + n)
///   - match_all:  O(p + x)
/// Space: O(n)
///
/// Notes:
///   - Repeated patterns are supported: every index is reported.
///   - An empty pattern is reported by "match" for any non-empty text and is
///     ignored by "match_all".

#include <algorithm>
#include <cstddef>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct AhoCorasick {
  struct Node {
    std::vector<int> words;    // indices of the patterns ending exactly here
    std::map<char, int> next;  // outgoing trie edges: character -> node id
    int idx;   // index of the last pattern ending here, -1 if none
    int fail;  // failure link (node id); the root points to itself
    int cnt;   // scratch counter, only meaningful inside match()
    int hei;   // depth of the node == length of the prefix it spells
    int out;   // output link: nearest non-root node on the failure chain
               // where some pattern ends, -1 if there is none

    Node() : idx(-1), fail(0), cnt(0), hei(0), out(-1) {}
    int has(char i) const { return static_cast<int>(next.count(i)); }
    int &operator[](char i) { return next[i]; }
  };

  std::vector<int> top;    // node ids in BFS order (root first)
  std::vector<Node> trie;  // trie[0] is the root

  /// Builds the automaton for the patterns in v; pattern i is reported by
  /// its position i in v.
  AhoCorasick(const std::vector<std::string> &v) {
    trie.push_back(Node());
    build(v);
  }

  /// Inserts s into the trie, creating missing nodes, and returns the id of
  /// the node spelling s (0 for the empty string). Links are NOT updated;
  /// call preprocess() afterwards.
  int insert(const std::string &s) {
    int n = 0;
    for (std::size_t i = 0; i < s.size(); ++i) {
      const char c = s[i];
      const auto it = trie[n].next.find(c);
      if (it != trie[n].next.end()) {
        n = it->second;
        continue;
      }
      const int created = static_cast<int>(trie.size());
      trie[n].next.emplace(c, created);
      trie.push_back(Node());  // may reallocate: do not reuse `it` below
      // The depth belongs to the new child, not to its parent.
      trie.back().hei = static_cast<int>(i) + 1;
      n = created;
    }
    return n;
  }

  /// Inserts every pattern of v and (re)computes the links and the BFS
  /// order stored in `top`.
  void build(const std::vector<std::string> &v) {
    for (std::size_t i = 0; i < v.size(); ++i) {
      const int n = insert(v[i]);
      trie[n].idx = static_cast<int>(i);
      trie[n].words.push_back(static_cast<int>(i));  // keeps duplicates
    }
    top = preprocess();  // a single pass is enough
  }

  /// Returns the state reached from state v after reading c: follows
  /// failure links until a state with an edge on c is found (or the root is
  /// reached), then takes that edge if it exists.
  inline int suffix(int v, char c) {
    while (v != 0 && !trie[v].has(c)) v = trie[v].fail;
    const auto it = trie[v].next.find(c);
    return it != trie[v].next.end() ? it->second : v;
  }

  /// Computes failure and output links with a BFS over the trie and returns
  /// the BFS order. Safe to call repeatedly: links are rebuilt from scratch.
  std::vector<int> preprocess() {
    std::vector<int> Q = {0};
    trie[0].fail = 0;
    trie[0].out = -1;
    for (std::size_t i = 0; i != Q.size(); ++i) {
      const int u = Q[i];
      for (const auto &edge : trie[u].next) {
        const char c = edge.first;
        const int v = edge.second;
        // Children of the root fail to the root; deeper nodes extend the
        // failure state of their parent. All states consulted here are
        // shallower than v, hence already processed.
        const int f = u ? suffix(trie[u].fail, c) : 0;
        trie[v].fail = f;
        trie[v].out = (f != 0 && !trie[f].words.empty()) ? f : trie[f].out;
        Q.push_back(v);
      }
    }
    return Q;
  }

  /// Returns the sorted indices of the patterns occurring at least once in
  /// the text p. Every call is independent of previous calls.
  std::vector<int> match(const std::string &p) {
    // Counters from a previous call must not leak into this one.
    for (auto &node : trie) node.cnt = 0;

    int u = 0;
    for (const char c : p) {
      u = suffix(u, c);
      trie[u].cnt++;
    }

    // Push visit counts down the failure links, deepest states first.
    // Index 0 (the root) is skipped: its failure link is itself.
    for (std::size_t i = top.size(); i-- > 1;)
      trie[trie[top[i]].fail].cnt += trie[top[i]].cnt;

    std::vector<int> ans;
    for (const auto &node : trie)
      if (node.cnt)
        ans.insert(ans.end(), node.words.begin(), node.words.end());

    std::sort(ans.begin(), ans.end());
    return ans;
  }

  /// Returns every occurrence of every pattern in the text p as pairs
  /// (pattern index, start position in p), ordered by end position.
  /// The return type is spelled out as std::pair<int, int>; this assumes the
  /// project's `ii` alias is pair<int, int> -- confirm against the template.
  std::vector<std::pair<int, int>> match_all(const std::string &p) {
    std::vector<std::pair<int, int>> ans;
    int u = 0;
    for (std::size_t i = 0; i < p.size(); ++i) {
      u = suffix(u, p[i]);
      // Walk u (if a pattern ends there) and then its output links; every
      // visited node contributes at least one occurrence.
      int v = (u != 0 && !trie[u].words.empty()) ? u : trie[u].out;
      for (; v != -1; v = trie[v].out) {
        // The pattern length equals the depth of the node where it ends.
        const int start = static_cast<int>(i) - trie[v].hei + 1;
        for (const int w : trie[v].words) ans.push_back({w, start});
      }
    }
    return ans;
  }
};