diff options
author | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 06:35:14 +0100 |
---|---|---|
committer | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 06:47:40 +0100 |
commit | 26e4da9f87a7330d96bbeea0d47d875d3a96b1f8 (patch) | |
tree | cbce4f003f3f877968d70e8bf42dcb2bf755221e | |
parent | 45188449a04a1455ade01caf470e43a926d752aa (diff) | |
download | coursera-26e4da9f87a7330d96bbeea0d47d875d3a96b1f8.zip coursera-26e4da9f87a7330d96bbeea0d47d875d3a96b1f8.tar.gz |
Algorithms : complete 04-algorithms_on_strings 01-suffix_trees
5 files changed, 481 insertions, 41 deletions
diff --git a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp index 08cfe9b..d5f9483 100644 --- a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp @@ -12,7 +12,27 @@ typedef vector<edges> trie; trie build_trie(vector<string> & patterns) { trie t; - // write your code here + + size_t node; + size_t n = 1; + t.push_back(edges()); + + for (size_t i = 0; i < patterns.size(); i++) { + node = 0; + const string& pattern = patterns[i]; + for (size_t j = 0; j < pattern.size(); j++) { + char c = pattern[j]; + int next = t[node][c]; + if (next != 0) { + node = next; + } else { + t[node][c] = n; + node = n; + t.push_back(edges()); + n += 1; + } + } + } return t; } @@ -34,4 +54,4 @@ int main() { } return 0; -}
\ No newline at end of file +} diff --git a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp index a59a0db..2517d81 100644 --- a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp @@ -23,27 +23,96 @@ struct Node { return (next[0] == NA && next[1] == NA && next[2] == NA && next[3] == NA); } + + void print() + { + if (isLeaf()) + cout << "---- " << endl; + else + cout << "A:" << next[0] << " C:" << next[1] + << " G:" << next[2] << " T:" << next[3] << endl; + } }; -int letterToIndex (char letter) +struct Trie { - switch (letter) - { - case 'A': return 0; break; - case 'C': return 1; break; - case 'G': return 2; break; - case 'T': return 3; break; - default: assert (false); return -1; - } -} + vector<Node> nodes; + + Trie(const vector <string>& patterns) + { + int v = 1; + nodes.push_back(Node()); + + for (int i = 0; i < patterns.size(); i++) { + int n = 0; + const string& pattern = patterns[i]; + for (int j = 0; j < pattern.size(); j++) { + int c = letterToIndex(pattern[j]); + int next = nodes[n].next[c]; + if (next != NA) { + n = next; + } else { + nodes[n].next[c] = v; + n = v; + nodes.push_back(Node()); + v += 1; + } + } + } + } + + int letterToIndex (char letter) + { + switch (letter) + { + case 'A': return 0; break; + case 'C': return 1; break; + case 'G': return 2; break; + case 'T': return 3; break; + default: assert (false); return -1; + } + } + + void print() + { + for (int i = 0; i < nodes.size(); i++) { + cout << i << " " ; + nodes[i].print(); + } + } + + bool prefixMatch(const string& text) + { + int n = 0; + + for( int i = 0; i < text.size(); i++) { + int next = nodes[n].next[letterToIndex(text[i])]; + if (next == NA) + break; + n = next; + } + + return nodes[n].isLeaf(); + } + + void match(const string& text, vector<int>& result) + { + for (int i = 0; i < text.size(); i++) { + if (prefixMatch(&text[i])) + result.push_back(i); + } + } +}; vector <int> solve (const string& text, int n, const vector <string>& patterns) { - vector <int> result; + vector <int> result; - // write your code here + Trie t(patterns); + /* t.print(); */ + t.match(text, result); - return result; + return result; } int main (void) diff --git a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp index c68c2f6..c31456e 100644 --- a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp @@ -20,27 +20,96 @@ struct Node fill (next, next + Letters, NA); patternEnd = false; } + + void print() + { + cout << patternEnd << " A:" << next[0] << " C:" << next[1] + << " G:" << next[2] << " T:" << next[3] << endl; + } }; -int letterToIndex (char letter) +struct Trie { - switch (letter) - { - case 'A': return 0; break; - case 'C': return 1; break; - case 'G': return 2; break; - case 'T': return 3; break; - default: assert (false); return -1; - } -} + vector<Node> nodes; + + Trie(const vector <string>& patterns) + { + int v = 1; + nodes.push_back(Node()); + + for (int i = 0; i < patterns.size(); i++) { + int n = 0; + const string& pattern = patterns[i]; + for (int j = 0; j < pattern.size(); j++) { + int c = letterToIndex(pattern[j]); + int next = nodes[n].next[c]; + if (next == NA) { + nodes[n].next[c] = v; + n = v; + nodes.push_back(Node()); + v += 1; + } else + n = next; + } + nodes[n].patternEnd = true; + } + } + + int letterToIndex (char letter) + { + switch (letter) + { + case 'A': return 0; break; + case 'C': return 1; break; + case 'G': return 2; break; + case 'T': return 3; break; + default: assert (false); return -1; + } + } + + void print() + { + for (int i = 0; i < nodes.size(); i++) { + cout << i << " " ; + nodes[i].print(); + } + } + + bool prefixMatch(const string& text) + { + int n = 0; + + for( int i = 0; i < text.size(); i++) { + Node &node = nodes[n]; + if (node.patternEnd) + break; + int next = node.next[letterToIndex(text[i])]; + if (next == NA) + break; + n = next; + } + + return nodes[n].patternEnd; + } + + void match(const string& text, vector<int>& result) + { + for (int i = 0; i < text.size(); i++) { + if (prefixMatch(&text[i])) + result.push_back(i); + } + } +}; -vector <int> solve (string text, int n, vector <string> patterns) +vector <int> solve (const string& text, int n, const vector <string>& patterns) { - vector <int> result; + vector <int> result; - // write your code here + Trie t(patterns); + /* t.print(); */ + t.match(text, result); - return result; + return result; } int main (void) diff --git a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp index 5d9e2f1..96fdf83 100644 --- a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp @@ -1,7 +1,10 @@ #include <iostream> +#include <algorithm> +#include <cassert> #include <map> #include <string> #include <vector> +#include <stack> using std::cin; using std::cout; @@ -9,14 +12,140 @@ using std::endl; using std::map; using std::string; using std::vector; +using std::stack; -// Build a suffix tree of the string text and return a vector -// with all of the labels of its edges (the corresponding -// substrings of the text) in any order. -vector<string> ComputeSuffixTreeEdges(const string& text) { - vector<string> result; - // Implement this function yourself - return result; +int const Letters = 5; +int const NA = -1; + +struct Node +{ + int from; + int len; + int childs [Letters]; + + Node () : from(0), len(0) + { + std::fill(childs, childs + Letters, NA); + } + + Node(int f, int l) : from(f), len(l) + { + std::fill(childs, childs + Letters, NA); + } + + void print(const string& text) + { + std::cout << from << ";" << len << " "; + for(int i = from; i < (from + len); i++) + std::cout << text[i]; + std::cout << " - A:" << childs[0] << " C:" << childs[1] + << " G:" << childs[2] << " T:" << childs[3] << " $:" << childs[4]<< std::endl; + } +}; + +struct SuffixTree +{ + const string& text; + vector<Node> nodes; + + SuffixTree(const string& t) : text(t) + { + nodes.push_back(Node()); + for (int i = 0; i < t.size(); i++) + add(i, text.size() - i, 0); + } + + void add(int from, int len, int into) + { + int i = toInt(text[from]); + int n = nodes[into].childs[i]; + if (n == NA) { + nodes.push_back(Node(from, len)); + nodes[into].childs[i] = nodes.size() - 1; + } else { + insert(from, len, n, into); + } + } + + void insert(int from, int len, int n, int parent) + { + Node &node = nodes[n]; + + int i = from; + int j = node.from; + int to = from + std::min(len, node.len); + + for( ; (text[i] == text[j] && i < to); i++, j++) { } + + if (i < to) { + int matchLen = j - node.from; + int s = nodes.size(); + // new match node + nodes.push_back(Node(node.from, matchLen)); + // shrink n to mismatch part + nodes[n].from = j; + nodes[n].len = nodes[n].len - matchLen; + // parent[from] -> match[j] -> n + nodes[parent].childs[toInt(text[from])] = s; + nodes[s].childs[toInt(text[j])] = n; + // new mismatch node + nodes.push_back(Node(i, len - matchLen)); + // match[i] -> mismatch + nodes[s].childs[toInt(text[i])] = s + 1; + } else if (i == to) { + add(i, len - i + from, n); + } else if (i == from) { + std::cout << "missmatch at 0 !!!!!!!!!!!!!!!!!!" << std::endl; + nodes[n].print(text); + std::cout << &text[from]<< std::endl; + std::cout << &text[i]<< std::endl; + } + } + + void travers(vector<string> &r) + { + stack<int> s; + s.push(0); + while (!s.empty()) { + int n = s.top(); + s.pop(); + Node &node = nodes[n]; + + if (n != 0) + r.push_back(text.substr(node.from, node.len)); + + for(int i = 0; i < Letters; i++) { + int nn = node.childs[i]; + if (nn != NA) + s.push(nn); + } + } + } + + int toInt(char c) + { + switch (c) + { + case 'A': return 0; break; + case 'C': return 1; break; + case 'G': return 2; break; + case 'T': return 3; break; + case '$': return 4; break; + default: assert (false); return -1; + } + } + + +}; + +vector<string> ComputeSuffixTreeEdges(const string& text) +{ + vector<string> result; + + SuffixTree t(text); + t.travers(result); + + return result; } int main() { diff --git a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp index 2f2d98f..4c33d05 100644 --- a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp @@ -1,13 +1,168 @@ #include <iostream> #include <string> +#include <vector> +#include <stack> +#include <queue> +#include <limits> +#include <cassert> using namespace std; -string solve (string p, string q) +int const Letters = 6; +int const NA = -1; + +struct Node +{ + int from; + int len; + int childs [Letters]; + bool marked; + int realFrom; + + Node () : from(0), len(0), marked(false), realFrom (-1) + { + fill(childs, childs + Letters, NA); + } + + Node(int f, int l, bool t, int r) : from(f), len(l), marked(t), realFrom(r) + { + fill(childs, childs + Letters, NA); + } +}; + +struct SuffixTree +{ + const string& text; + const int separator; + vector<Node> nodes; + + SuffixTree(const string& t, int s) : text(t), separator(s) + { + nodes.push_back(Node()); + + for (int i = 0; i < separator; i++) + add(i, separator + 1 - i, 0, false, i); + + add(s, 1, 0, true, s); + + for (int i = s; i < t.size(); i++) + add(i, text.size() - i, 0, true, i); + } + + void add(int from, int len, int parent, bool marked, int realFrom) + { + int i = toInt(text[from]); + int n = nodes[parent].childs[i]; + + if (n == NA) { + nodes.push_back(Node(from, len, marked, realFrom)); + nodes[parent].childs[i] = nodes.size() - 1; + } else { + Node &node = nodes[n]; + + int i = from; + int j = node.from; + int to = from + min(len, node.len); + + for( ; (text[i] == text[j] && i < to); i++, j++) { } + + if (i < to) { + int matchLen = j - node.from; + int s = nodes.size(); + // new match node + nodes.push_back(Node(node.from, matchLen, marked, node.realFrom)); + // shrink n to mismatch part + nodes[n].from = j; + nodes[n].len = nodes[n].len - matchLen; + // parent[from] -> match[j] -> n + nodes[parent].childs[toInt(text[from])] = s; + nodes[s].childs[toInt(text[j])] = n; + // new mismatch node + int l = (len - matchLen); + nodes.push_back(Node(i, l, marked, realFrom)); + // match[i] -> mismatch + nodes[s].childs[toInt(text[i])] = s + 1; + } else if (i == to) { + node.marked = marked; + add(i, len - i + from, n, marked, realFrom); + } else { + assert (false); + } + } + } + + int toInt(char c) + { + switch (c) + { + case 'A': return 0; break; + case 'C': return 1; break; + case 'G': return 2; break; + case 'T': return 3; break; + case '#': return 4; break; + case '$': return 5; break; + default: assert (false); return -1; + } + } + + void graph() + { + cout << "digraph G {\nlabel=\"" << text << endl; + for (int i = 0; i < text.size(); i++) + cout << i << " "; + cout << "\"" << endl; + for (int i = 0; i < nodes.size(); i++) { + Node &n = nodes[i]; + cout << i << " [ label=\" " << i << endl; + for(int j = n.from; j < (n.from + n.len); j++) + cout << text[j]; + cout << endl << n.from - n.realFrom << "\""; + if (!n.marked) cout << " color=red"; + cout << " ]" << endl; + + for (int j = 0; j < Letters; j++) { + if (n.childs[j] != NA) + cout << i << " -> " << n.childs[j] << endl; + } + } + cout << "}" << endl; + } + + void solve(string &r) + { + int x = 0; + int d = numeric_limits<int>::max(); + + for (int i = 0; i < nodes.size(); i++) { + Node &n = nodes[i]; + if (!n.marked && n.len > 0 && text[n.from] != '#') { + int len = n.from - n.realFrom + 1; + if (len < d) { + d = len; + x = i; + } + } + } + + if (x == 0) + cout << r; + else + cout << text.substr(nodes[x].realFrom, d) << '\n'; + } + +}; + +void solve (string p, string q) { + string s = p + "#" + q + "$"; string result = p; - return result; + SuffixTree t(s, p.size()); +#if 0 + t.graph(); +#else + t.solve(p); +#endif } int main (void) @@ -17,9 +172,7 @@ int main (void) string q; cin >> q; - string ans = solve (p, q); - - cout << ans << endl; + solve (p, q); return 0; } |