summaryrefslogtreecommitdiffstats
path: root/04-algorithms_on_strings
diff options
context:
space:
mode:
authorJérémy Zurcher <jeremy@asynk.ch>2016-11-14 06:35:14 +0100
committerJérémy Zurcher <jeremy@asynk.ch>2016-11-14 06:47:40 +0100
commit26e4da9f87a7330d96bbeea0d47d875d3a96b1f8 (patch)
treecbce4f003f3f877968d70e8bf42dcb2bf755221e /04-algorithms_on_strings
parent45188449a04a1455ade01caf470e43a926d752aa (diff)
downloadcoursera-26e4da9f87a7330d96bbeea0d47d875d3a96b1f8.zip
coursera-26e4da9f87a7330d96bbeea0d47d875d3a96b1f8.tar.gz
Algorithms : complete 04-algorithms_on_strings 01-suffix_trees
Diffstat (limited to '04-algorithms_on_strings')
-rw-r--r--04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp24
-rw-r--r--04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp95
-rw-r--r--04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp97
-rw-r--r--04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp143
-rw-r--r--04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp163
5 files changed, 481 insertions, 41 deletions
diff --git a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp
index 08cfe9b..d5f9483 100644
--- a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp
+++ b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp
@@ -12,7 +12,27 @@ typedef vector<edges> trie;
trie build_trie(vector<string> & patterns) {
trie t;
- // write your code here
+
+ size_t node;
+ size_t n = 1;
+ t.push_back(edges());
+
+ for (size_t i = 0; i < patterns.size(); i++) {
+ node = 0;
+ const string& pattern = patterns[i];
+ for (size_t j = 0; j < pattern.size(); j++) {
+ char c = pattern[j];
+ int next = t[node][c];
+ if (next != 0) {
+ node = next;
+ } else {
+ t[node][c] = n;
+ node = n;
+ t.push_back(edges());
+ n += 1;
+ }
+ }
+ }
return t;
}
@@ -34,4 +54,4 @@ int main() {
}
return 0;
-} \ No newline at end of file
+}
diff --git a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp
index a59a0db..2517d81 100644
--- a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp
+++ b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp
@@ -23,27 +23,96 @@ struct Node
{
return (next[0] == NA && next[1] == NA && next[2] == NA && next[3] == NA);
}
+
+ void print()
+ {
+ if (isLeaf())
+ cout << "---- " << endl;
+ else
+ cout << "A:" << next[0] << " C:" << next[1]
+ << " G:" << next[2] << " T:" << next[3] << endl;
+ }
};
-int letterToIndex (char letter)
+struct Trie
{
- switch (letter)
- {
- case 'A': return 0; break;
- case 'C': return 1; break;
- case 'G': return 2; break;
- case 'T': return 3; break;
- default: assert (false); return -1;
- }
-}
+ vector<Node> nodes;
+
+ Trie(const vector <string>& patterns)
+ {
+ int v = 1;
+ nodes.push_back(Node());
+
+ for (int i = 0; i < patterns.size(); i++) {
+ int n = 0;
+ const string& pattern = patterns[i];
+ for (int j = 0; j < pattern.size(); j++) {
+ int c = letterToIndex(pattern[j]);
+ int next = nodes[n].next[c];
+ if (next != NA) {
+ n = next;
+ } else {
+ nodes[n].next[c] = v;
+ n = v;
+ nodes.push_back(Node());
+ v += 1;
+ }
+ }
+ }
+ }
+
+ int letterToIndex (char letter)
+ {
+ switch (letter)
+ {
+ case 'A': return 0; break;
+ case 'C': return 1; break;
+ case 'G': return 2; break;
+ case 'T': return 3; break;
+ default: assert (false); return -1;
+ }
+ }
+
+ void print()
+ {
+ for (int i = 0; i < nodes.size(); i++) {
+ cout << i << " " ;
+ nodes[i].print();
+ }
+ }
+
+ bool prefixMatch(const string& text)
+ {
+ int n = 0;
+
+ for( int i = 0; i < text.size(); i++) {
+ int next = nodes[n].next[letterToIndex(text[i])];
+ if (next == NA)
+ break;
+ n = next;
+ }
+
+ return nodes[n].isLeaf();
+ }
+
+ void match(const string& text, vector<int>& result)
+ {
+ for (int i = 0; i < text.size(); i++) {
+ if (prefixMatch(&text[i]))
+ result.push_back(i);
+ }
+ }
+};
vector <int> solve (const string& text, int n, const vector <string>& patterns)
{
- vector <int> result;
+ vector <int> result;
- // write your code here
+ Trie t(patterns);
+ /* t.print(); */
+ t.match(text, result);
- return result;
+ return result;
}
int main (void)
diff --git a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp
index c68c2f6..c31456e 100644
--- a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp
+++ b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp
@@ -20,27 +20,96 @@ struct Node
fill (next, next + Letters, NA);
patternEnd = false;
}
+
+ void print()
+ {
+ cout << patternEnd << " A:" << next[0] << " C:" << next[1]
+ << " G:" << next[2] << " T:" << next[3] << endl;
+ }
};
-int letterToIndex (char letter)
+struct Trie
{
- switch (letter)
- {
- case 'A': return 0; break;
- case 'C': return 1; break;
- case 'G': return 2; break;
- case 'T': return 3; break;
- default: assert (false); return -1;
- }
-}
+ vector<Node> nodes;
+
+ Trie(const vector <string>& patterns)
+ {
+ int v = 1;
+ nodes.push_back(Node());
+
+ for (int i = 0; i < patterns.size(); i++) {
+ int n = 0;
+ const string& pattern = patterns[i];
+ for (int j = 0; j < pattern.size(); j++) {
+ int c = letterToIndex(pattern[j]);
+ int next = nodes[n].next[c];
+ if (next == NA) {
+ nodes[n].next[c] = v;
+ n = v;
+ nodes.push_back(Node());
+ v += 1;
+ } else
+ n = next;
+ }
+ nodes[n].patternEnd = true;
+ }
+ }
+
+ int letterToIndex (char letter)
+ {
+ switch (letter)
+ {
+ case 'A': return 0; break;
+ case 'C': return 1; break;
+ case 'G': return 2; break;
+ case 'T': return 3; break;
+ default: assert (false); return -1;
+ }
+ }
+
+ void print()
+ {
+ for (int i = 0; i < nodes.size(); i++) {
+ cout << i << " " ;
+ nodes[i].print();
+ }
+ }
+
+ bool prefixMatch(const string& text)
+ {
+ int n = 0;
+
+ for( int i = 0; i < text.size(); i++) {
+ Node &node = nodes[n];
+ if (node.patternEnd)
+ break;
+ int next = node.next[letterToIndex(text[i])];
+ if (next == NA)
+ break;
+ n = next;
+ }
+
+ return nodes[n].patternEnd;
+ }
+
+ void match(const string& text, vector<int>& result)
+ {
+ for (int i = 0; i < text.size(); i++) {
+ if (prefixMatch(&text[i]))
+ result.push_back(i);
+ }
+ }
+};
-vector <int> solve (string text, int n, vector <string> patterns)
+vector <int> solve (const string& text, int n, const vector <string>& patterns)
{
- vector <int> result;
+ vector <int> result;
- // write your code here
+ Trie t(patterns);
+ /* t.print(); */
+ t.match(text, result);
- return result;
+ return result;
}
int main (void)
diff --git a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp
index 5d9e2f1..96fdf83 100644
--- a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp
+++ b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp
@@ -1,7 +1,10 @@
#include <iostream>
+#include <algorithm>
+#include <cassert>
#include <map>
#include <string>
#include <vector>
+#include <stack>
using std::cin;
using std::cout;
@@ -9,14 +12,140 @@ using std::endl;
using std::map;
using std::string;
using std::vector;
+using std::stack;
-// Build a suffix tree of the string text and return a vector
-// with all of the labels of its edges (the corresponding
-// substrings of the text) in any order.
-vector<string> ComputeSuffixTreeEdges(const string& text) {
- vector<string> result;
- // Implement this function yourself
- return result;
+int const Letters = 5;
+int const NA = -1;
+
+struct Node
+{
+ int from;
+ int len;
+ int childs [Letters];
+
+ Node () : from(0), len(0)
+ {
+ std::fill(childs, childs + Letters, NA);
+ }
+
+ Node(int f, int l) : from(f), len(l)
+ {
+ std::fill(childs, childs + Letters, NA);
+ }
+
+ void print(const string& text)
+ {
+ std::cout << from << ";" << len << " ";
+ for(int i = from; i < (from + len); i++)
+ std::cout << text[i];
+ std::cout << " - A:" << childs[0] << " C:" << childs[1]
+ << " G:" << childs[2] << " T:" << childs[3] << " $:" << childs[4]<< std::endl;
+ }
+};
+
+struct SuffixTree
+{
+ const string& text;
+ vector<Node> nodes;
+
+ SuffixTree(const string& t) : text(t)
+ {
+ nodes.push_back(Node());
+ for (int i = 0; i < t.size(); i++)
+ add(i, text.size() - i, 0);
+ }
+
+ void add(int from, int len, int into)
+ {
+ int i = toInt(text[from]);
+ int n = nodes[into].childs[i];
+ if (n == NA) {
+ nodes.push_back(Node(from, len));
+ nodes[into].childs[i] = nodes.size() - 1;
+ } else {
+ insert(from, len, n, into);
+ }
+ }
+
+ void insert(int from, int len, int n, int parent)
+ {
+ Node &node = nodes[n];
+
+ int i = from;
+ int j = node.from;
+ int to = from + std::min(len, node.len);
+
+ for( ; (text[i] == text[j] && i < to); i++, j++) { }
+
+ if (i < to) {
+ int matchLen = j - node.from;
+ int s = nodes.size();
+ // new match node
+ nodes.push_back(Node(node.from, matchLen));
+ // shrink n to mismatch part
+ nodes[n].from = j;
+ nodes[n].len = nodes[n].len - matchLen;
+ // parent[from] -> match[j] -> n
+ nodes[parent].childs[toInt(text[from])] = s;
+ nodes[s].childs[toInt(text[j])] = n;
+ // new mismatch node
+ nodes.push_back(Node(i, len - matchLen));
+ // match[i] -> mismatch
+ nodes[s].childs[toInt(text[i])] = s + 1;
+ } else if (i == to) {
+ add(i, len - i + from, n);
+ } else if (i == from) {
+ std::cout << "missmatch at 0 !!!!!!!!!!!!!!!!!!" << std::endl;
+ nodes[n].print(text);
+ std::cout << &text[from]<< std::endl;
+ std::cout << &text[i]<< std::endl;
+ }
+ }
+
+ void travers(vector<string> &r)
+ {
+ stack<int> s;
+ s.push(0);
+ while (!s.empty()) {
+ int n = s.top();
+ s.pop();
+ Node &node = nodes[n];
+
+ if (n != 0)
+ r.push_back(text.substr(node.from, node.len));
+
+ for(int i = 0; i < Letters; i++) {
+ int nn = node.childs[i];
+ if (nn != NA)
+ s.push(nn);
+ }
+ }
+ }
+
+ int toInt(char c)
+ {
+ switch (c)
+ {
+ case 'A': return 0; break;
+ case 'C': return 1; break;
+ case 'G': return 2; break;
+ case 'T': return 3; break;
+ case '$': return 4; break;
+ default: assert (false); return -1;
+ }
+ }
+
+
+};
+
+vector<string> ComputeSuffixTreeEdges(const string& text)
+{
+ vector<string> result;
+
+ SuffixTree t(text);
+ t.travers(result);
+
+ return result;
}
int main() {
diff --git a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp
index 2f2d98f..4c33d05 100644
--- a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp
+++ b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp
@@ -1,13 +1,168 @@
#include <iostream>
#include <string>
+#include <vector>
+#include <stack>
+#include <queue>
+#include <limits>
+#include <cassert>
using namespace std;
-string solve (string p, string q)
+int const Letters = 6;
+int const NA = -1;
+
+struct Node
+{
+ int from;
+ int len;
+ int childs [Letters];
+ bool marked;
+ int realFrom;
+
+ Node () : from(0), len(0), marked(false), realFrom (-1)
+ {
+ fill(childs, childs + Letters, NA);
+ }
+
+ Node(int f, int l, bool t, int r) : from(f), len(l), marked(t), realFrom(r)
+ {
+ fill(childs, childs + Letters, NA);
+ }
+};
+
+struct SuffixTree
+{
+ const string& text;
+ const int separator;
+ vector<Node> nodes;
+
+ SuffixTree(const string& t, int s) : text(t), separator(s)
+ {
+ nodes.push_back(Node());
+
+ for (int i = 0; i < separator; i++)
+ add(i, separator + 1 - i, 0, false, i);
+
+ add(s, 1, 0, true, s);
+
+ for (int i = s; i < t.size(); i++)
+ add(i, text.size() - i, 0, true, i);
+ }
+
+ void add(int from, int len, int parent, bool marked, int realFrom)
+ {
+ int i = toInt(text[from]);
+ int n = nodes[parent].childs[i];
+
+ if (n == NA) {
+ nodes.push_back(Node(from, len, marked, realFrom));
+ nodes[parent].childs[i] = nodes.size() - 1;
+ } else {
+ Node &node = nodes[n];
+
+ int i = from;
+ int j = node.from;
+ int to = from + min(len, node.len);
+
+ for( ; (text[i] == text[j] && i < to); i++, j++) { }
+
+ if (i < to) {
+ int matchLen = j - node.from;
+ int s = nodes.size();
+ // new match node
+ nodes.push_back(Node(node.from, matchLen, marked, node.realFrom));
+ // shrink n to mismatch part
+ nodes[n].from = j;
+ nodes[n].len = nodes[n].len - matchLen;
+ // parent[from] -> match[j] -> n
+ nodes[parent].childs[toInt(text[from])] = s;
+ nodes[s].childs[toInt(text[j])] = n;
+ // new mismatch node
+ int l = (len - matchLen);
+ nodes.push_back(Node(i, l, marked, realFrom));
+ // match[i] -> mismatch
+ nodes[s].childs[toInt(text[i])] = s + 1;
+ } else if (i == to) {
+ node.marked = marked;
+ add(i, len - i + from, n, marked, realFrom);
+ } else {
+ assert (false);
+ }
+ }
+ }
+
+ int toInt(char c)
+ {
+ switch (c)
+ {
+ case 'A': return 0; break;
+ case 'C': return 1; break;
+ case 'G': return 2; break;
+ case 'T': return 3; break;
+ case '#': return 4; break;
+ case '$': return 5; break;
+ default: assert (false); return -1;
+ }
+ }
+
+ void graph()
+ {
+ cout << "digraph G {\nlabel=\"" << text << endl;
+ for (int i = 0; i < text.size(); i++)
+ cout << i << " ";
+ cout << "\"" << endl;
+ for (int i = 0; i < nodes.size(); i++) {
+ Node &n = nodes[i];
+ cout << i << " [ label=\" " << i << endl;
+ for(int j = n.from; j < (n.from + n.len); j++)
+ cout << text[j];
+ cout << endl << n.from - n.realFrom << "\"";
+ if (!n.marked) cout << " color=red";
+ cout << " ]" << endl;
+
+ for (int j = 0; j < Letters; j++) {
+ if (n.childs[j] != NA)
+ cout << i << " -> " << n.childs[j] << endl;
+ }
+ }
+ cout << "}" << endl;
+ }
+
+ void solve(string &r)
+ {
+ int x = 0;
+ int d = numeric_limits<int>::max();
+
+ for (int i = 0; i < nodes.size(); i++) {
+ Node &n = nodes[i];
+ if (!n.marked && n.len > 0 && text[n.from] != '#') {
+ int len = n.from - n.realFrom + 1;
+ if (len < d) {
+ d = len;
+ x = i;
+ }
+ }
+ }
+
+ if (x == 0)
+ cout << r;
+ else
+ cout << text.substr(nodes[x].realFrom, d) << '\n';
+ }
+
+};
+
+void solve (string p, string q)
{
+ string s = p + "#" + q + "$";
string result = p;
- return result;
+ SuffixTree t(s, p.size());
+#if 0
+ t.graph();
+#else
+ t.solve(p);
+#endif
}
int main (void)
@@ -17,9 +172,7 @@ int main (void)
string q;
cin >> q;
- string ans = solve (p, q);
-
- cout << ans << endl;
+ solve (p, q);
return 0;
}