summaryrefslogtreecommitdiffstats
path: root/04-algorithms_on_strings/04-suffix_array
diff options
context:
space:
mode:
authorJérémy Zurcher <jeremy@asynk.ch>2016-11-14 07:01:13 +0100
committerJérémy Zurcher <jeremy@asynk.ch>2016-11-14 07:01:13 +0100
commit52041be8af3d321f07feb27fbba70b364d72c10a (patch)
treea9ad5d0cc7a3a836fcf130a428ba152e5d070fe2 /04-algorithms_on_strings/04-suffix_array
parent7471f538c2203f266cb8002fb001183a9720a969 (diff)
downloadcoursera-52041be8af3d321f07feb27fbba70b364d72c10a.zip
coursera-52041be8af3d321f07feb27fbba70b364d72c10a.tar.gz
Algorithms : complete 04-algorithms_on_strings 04-suffix_array
Diffstat (limited to '04-algorithms_on_strings/04-suffix_array')
-rw-r--r--04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp24
-rw-r--r--04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp124
-rw-r--r--04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp182
-rw-r--r--04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp161
4 files changed, 455 insertions, 36 deletions
diff --git a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
index 13121ae..4ac7afd 100644
--- a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
@@ -7,13 +7,23 @@ using std::cin;
using std::string;
using std::vector;
-// Find all occurrences of the pattern in the text and return a
-// vector with all positions in the text (starting from 0) where
-// the pattern starts in the text.
-vector<int> find_pattern(const string& pattern, const string& text) {
- vector<int> result;
- // Implement this function yourself
- return result;
+vector<int> find_pattern(const string& pattern, const string& text)
+{
+ vector<int> result;
+ string s = pattern + "$" + text;
+ vector<int> prefixes(s.size());
+ int border = 0;
+ int ps = pattern.size();
+
+ for (int i = 1; i < s.size(); i++) {
+ while ((border > 0) && (s[i] != s[border]))
+ border = prefixes[border - 1];
+ prefixes[i] = border = ((s[i] == s[border]) ? (border + 1) : 0);
+ if (border == ps)
+ result.push_back(i - 2 * ps);
+ }
+
+ return result;
}
int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
index 1dbbcc3..7235bbe 100644
--- a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
@@ -12,15 +12,121 @@ using std::pair;
using std::string;
using std::vector;
-// Build suffix array of the string text and
-// return a vector result of the same length as the text
-// such that the value result[i] is the index (0-based)
-// in text where the i-th lexicographically smallest
-// suffix of text starts.
-vector<int> BuildSuffixArray(const string& text) {
- vector<int> result;
- // Implement this function yourself
- return result;
+static int $ = 0;
+static int A = 1;
+static int C = 2;
+static int G = 3;
+static int T = 4;
+
+static inline int toInt(char c)
+{
+ int ret = 0;
+ switch(c) {
+ case 'A': ret = A; break;
+ case 'C': ret = C; break;
+ case 'G': ret = G; break;
+ case 'T': ret = T; break;
+ }
+ return ret;
+}
+
+vector<int> BuildSuffixArray(const string& text)
+{
+ int sz = text.size();
+ vector<int> v0(sz);
+ vector<int> v1(sz);
+ vector<int> v2(sz);
+ vector<int> v3(sz);
+ vector<int> v4(sz);
+
+ // count sort text[i]
+ vector<int> &count = v0;
+ for (int i = 0; i < sz; i++)
+ count[toInt(text[i])] += 1;
+ for(int i = 1; i < 5; i++)
+ count[i] += count[i - 1];
+
+ vector<int> &order = v1;
+ for (int i = sz - 1; i >= 0; i--) {
+ int x = toInt(text[i]);
+ count[x] -= 1;
+ order[count[x]] = i;
+ }
+
+ // compute classes
+ int cl = 0;
+ vector<int> &classes = v2;
+ classes[order[0]] = cl;
+ for (int i = 1; i < sz; i++) {
+ if (text[order[i]] != text[order[i - 1]])
+ cl += 1;
+ classes[order[i]] = cl;
+ }
+
+ /* cout << text << endl; */
+ /* cout << "$ : " << count[$] << endl; */
+ /* cout << "A : " << count[A] << endl; */
+ /* cout << "C : " << count[C] << endl; */
+ /* cout << "G : " << count[G] << endl; */
+ /* cout << "T : " << count[T] << endl; */
+ /* cout << "order" << endl; */
+ /* for (int i = 0; i < sz; i++) */
+ /* cout << order[i] << " "; */
+ /* cout << endl; */
+ /* cout << "classes" << endl; */
+ /* for (int i = 0; i < sz; i++) */
+ /* cout << classes[i] << " "; */
+ /* cout << endl; */
+
+ int l = 1;
+ vector<int> &nOrder = v3;
+ vector<int> &nClasses = v4;
+ while (l < sz) {
+ // count sort classes
+ for (int i = 0; i < sz; i++)
+ count[i] = 0;
+ for (int i = 0; i < sz; i++)
+ count[classes[i]] += 1;
+ for(int i = 1; i < sz; i++)
+ count[i] += count[i - 1];
+ for (int i = sz - 1; i >= 0; i--) {
+ int x = (order[i] - l + sz) % sz;
+ int cl = classes[x];
+ count[cl] -= 1;
+ nOrder[count[cl]] = x;
+ }
+ if(order == v1) {
+ order = v3;
+ nOrder = v1;
+ } else {
+ order = v1;
+ nOrder = v3;
+ }
+
+ // update classes
+ nClasses[order[0]] = 0;
+ for (int i = 1; i < sz; i++) {
+ int cur = order[i];
+ int prev = order[i - 1];
+ int mid = (cur + l) % sz;
+ int midPrev = (prev + l) % sz;
+ if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev])
+ nClasses[cur] = nClasses[prev] + 1;
+ else
+ nClasses[cur] = nClasses[prev];
+ }
+ if(classes == v2) {
+ classes = v4;
+ nClasses = v2;
+ } else {
+ classes = v2;
+ nOrder = v4;
+ }
+
+ l *= 2;
+ }
+
+ return order;
}
int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
index 204218c..74fa0ee 100644
--- a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
@@ -7,21 +7,187 @@ using std::cin;
using std::string;
using std::vector;
-vector<int> BuildSuffixArray(const string& text) {
- vector<int> order;
+static int $ = 0;
+static int A = 1;
+static int C = 2;
+static int G = 3;
+static int T = 4;
- // write your code here
+static inline int toInt(char c)
+{
+ int ret = 0;
+ switch(c) {
+ case 'A': ret = A; break;
+ case 'C': ret = C; break;
+ case 'G': ret = G; break;
+ case 'T': ret = T; break;
+ }
+ return ret;
+}
+
+vector<int> BuildSuffixArray(const string& text)
+{
+ int sz = text.size();
+ vector<int> v0(sz);
+ vector<int> v1(sz);
+ vector<int> v2(sz);
+ vector<int> v3(sz);
+ vector<int> v4(sz);
+
+ // count sort text[i]
+ vector<int> &count = v0;
+ for (int i = 0; i < sz; i++)
+ count[toInt(text[i])] += 1;
+ for(int i = 1; i < 5; i++)
+ count[i] += count[i - 1];
+
+ vector<int> &order = v1;
+ for (int i = sz - 1; i >= 0; i--) {
+ int x = toInt(text[i]);
+ count[x] -= 1;
+ order[count[x]] = i;
+ }
+
+ // compute classes
+ int cl = 0;
+ vector<int> &classes = v2;
+ classes[order[0]] = cl;
+ for (int i = 1; i < sz; i++) {
+ if (text[order[i]] != text[order[i - 1]])
+ cl += 1;
+ classes[order[i]] = cl;
+ }
+
+ int l = 1;
+ vector<int> &nOrder = v3;
+ vector<int> &nClasses = v4;
+ while (l < sz) {
+ // count sort classes
+ for (int i = 0; i < sz; i++)
+ count[i] = 0;
+ for (int i = 0; i < sz; i++)
+ count[classes[i]] += 1;
+ for(int i = 1; i < sz; i++)
+ count[i] += count[i - 1];
+ for (int i = sz - 1; i >= 0; i--) {
+ int x = (order[i] - l + sz) % sz;
+ int cl = classes[x];
+ count[cl] -= 1;
+ nOrder[count[cl]] = x;
+ }
+ if(order == v1) {
+ order = v3;
+ nOrder = v1;
+ } else {
+ order = v1;
+ nOrder = v3;
+ }
+
+ // update classes
+ nClasses[order[0]] = 0;
+ for (int i = 1; i < sz; i++) {
+ int cur = order[i];
+ int prev = order[i - 1];
+ int mid = (cur + l) % sz;
+ int midPrev = (prev + l) % sz;
+ if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev])
+ nClasses[cur] = nClasses[prev] + 1;
+ else
+ nClasses[cur] = nClasses[prev];
+ }
+ if(classes == v2) {
+ classes = v4;
+ nClasses = v2;
+ } else {
+ classes = v2;
+ nOrder = v4;
+ }
- return order;
+ l *= 2;
+ }
+
+ return order;
+}
+
+vector<int> _lcp(const string& s, const vector<int>& order)
+{
+ int sz = order.size();
+ vector<int> v(sz - 1);
+ vector<int> r(sz); // order from i
+ for (int i = 0; i < sz; i++)
+ r[order[i]] = i;
+
+ int lcp = 0;
+ int suffix = order[0];
+ for (int i = 0; i < sz; i++) {
+ int x = r[suffix];
+ if (x == sz - 1) {
+ lcp = 0;
+ } else {
+ int nextSuffix = order[x + 1];
+ if (lcp > 0) lcp -= 1;
+ while ((suffix + lcp < sz) && (nextSuffix + lcp < sz)) {
+ if (s[suffix + lcp] == s[nextSuffix+ lcp])
+ lcp += 1;
+ else
+ break;
+ }
+ v[x] = lcp;
+ }
+ suffix = (suffix + 1) % sz;
+ }
+
+ return v;
}
+vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array)
+{
+ vector<int> result;
-vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array) {
- vector<int> result;
+ int sz = text.size();
+ int pz = pattern.size();
+ int l = 0;
+ int r = sz;
+ while (l < r) {
+ int mid = (l + r) / 2;
+ int k = suffix_array[mid];
+ bool lower = false;
+ for (int j = 0; j < pz; j++) {
+ if (pattern[j] == text[k + j])
+ continue;
+ if (pattern[j] > text[k + j])
+ lower = true;
+ break;
+ }
+ if (lower)
+ l = mid + 1;
+ else
+ r = mid;
+ }
+
+ int s = l;
+ r = sz;
+ while (l < r) {
+ int mid = (l + r) / 2;
+ int k = suffix_array[mid];
+ bool lower = false;
+ for (int j = 0; j < pz; j++) {
+ if (pattern[j] == text[k + j])
+ continue;
+ if (pattern[j] < text[k + j])
+ lower = true;
+ break;
+ }
+ if (lower)
+ r = mid;
+ else
+ l = mid + 1;
+ }
- // write your code here
+ for (int i = s; i < r; i++)
+ result.push_back(suffix_array[i]);
- return result;
+ return result;
}
int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
index ca9fdb5..4ac4658 100644
--- a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
@@ -22,8 +22,9 @@ struct Edge {
// corresponding to the label of this edge.
int end;
- Edge(int node_, int start_, int end_) : node(node_), start(start_), end(end_) {}
- Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {}
+ Edge(int node_, int start_, int end_) :
+ node(node_), start(start_), end(end_) {}
+ /* Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {} */
};
// Build suffix tree of the string text given its suffix array suffix_array
@@ -37,15 +38,112 @@ struct Edge {
// must be represented by Edge(1, 6, 7). This edge must be present in the vector tree[0]
// (corresponding to the root node), and it should be the first edge in the vector
// (because it has the smallest first character of all edges outgoing from the root).
-map<int, vector<Edge> > SuffixTreeFromSuffixArray(
- const vector<int>& suffix_array,
- const vector<int>& lcp_array,
- const string& text) {
- map<int, vector<Edge> > tree;
- // Implement this function yourself
- return tree;
-}
+map<int, vector<Edge>> SuffixTreeFromSuffixArray(
+ const vector<int>& suffix_array,
+ const vector<int>& lcp_array,
+ const string& text)
+{
+ map<int, vector<Edge>> tree;
+ vector<int> hierarchy;
+ int sz = suffix_array.size();
+ int n = 0;
+ int parent = 0;
+ int idx = 0;
+ int prevLcp = 0;
+ hierarchy.push_back(0);
+
+ // add '$'
+ tree[parent].push_back(Edge(++n, suffix_array[0], sz));
+ // add following suffixes
+ /* for (int i = 1; i < 6; i++) { */
+ for (int i = 1; i < suffix_array.size(); i++) {
+ int start = suffix_array[i];
+ int lcp = lcp_array[i - 1];
+
+ if (lcp == 0) {
+ // add to root
+ parent = 0;
+ vector<Edge>& v = tree[parent];
+ idx = v.size();
+ v.push_back(Edge(++n, start, sz));
+ hierarchy.resize(1);
+ } else if (lcp == prevLcp) {
+ // add to parent
+ tree[parent].push_back(Edge(++n, start + lcp, sz));
+ idx++;
+ } else {
+ int s;
+ int l = (lcp - prevLcp);
+ bool push = true;
+ Edge& last = tree[parent][idx];
+ if (l > 0) {
+ s = start + lcp;
+ // push 2 new edges under last
+ parent = last.node;
+ vector<Edge>& v = tree[parent];
+ v.push_back(Edge(++n, last.start + l, sz));
+ v.push_back(Edge(++n, s, sz));
+ idx = 1;
+ // update split edge
+ last.end = s;
+ last.start = s - l;
+ // set branch update values
+ /* l = prevLcp; */
+ /* s = last.start; */
+ } else {
+ // find good parent
+ while (l < 0) {
+ hierarchy.resize(hierarchy.size() - 1);
+ Edge& e = tree[hierarchy.back()].back();
+ l += (e.end - e.start);
+ }
+ parent = hierarchy.back();
+ idx = tree[parent].size() - 1;
+ prevLcp = (lcp - l);
+ if (l == 0) {
+ // add to parent
+ tree[parent].push_back(Edge(++n, start + lcp, sz));
+ idx++;
+ // set branch update values
+ /* l = lcp; */
+ /* s = start + lcp; */
+ push = false;
+ } else {
+ // set new split edge
+ vector<Edge>& v = tree[parent];
+ Edge old = v.back();
+ v.pop_back();
+ v.push_back(Edge(++n, start + prevLcp, start + lcp));
+ // feed new split edge
+ parent = n;
+ vector<Edge>& w = tree[parent];
+ old.start += l;
+ w.push_back(old);
+ w.push_back(Edge(++n, start + lcp, sz));
+ idx = 1;
+ // set branch update values
+ /* l = prevLcp; */
+ /* s = start + prevLcp; */
+ }
+ }
+ // update branch
+ /* for (int i = hierarchy.size() - 2; (l > 0 && i >= 0); i--) { */
+ /* Edge& e = tree[hierarchy[i]].back(); */
+ /* int d = e.end - e.start; */
+ /* e.end = s; */
+ /* s -= d; */
+ /* e.start = s; */
+ /* l -= d; */
+ /* } */
+ if (push)
+ hierarchy.push_back(parent);
+ }
+ prevLcp = lcp;
+ }
+
+ return tree;
+}
int main() {
char buffer[200001];
@@ -62,7 +160,6 @@ int main() {
// Build the suffix tree and get a mapping from
// suffix tree node ID to the list of outgoing Edges.
map<int, vector<Edge> > tree = SuffixTreeFromSuffixArray(suffix_array, lcp_array, text);
- printf("%s\n", buffer);
// Output the edges of the suffix tree in the required order.
// Note that we use here the contract that the root of the tree
// will have node ID = 0 and that each vector of outgoing edges
@@ -85,6 +182,27 @@ int main() {
// }
// }
//
+/* #define GRAPH */
+#ifdef GRAPH
+ fprintf(stderr, "suffixes :\n");
+ for (int i = 0; i < suffix_array.size(); i++) {
+ if (i == 0) {
+ fprintf(stderr, "%2d : ", suffix_array[i]);
+ } else {
+ fprintf(stderr, "%2d (%2d) : ", suffix_array[i], lcp_array[i - 1]);
+ }
+ for (int j = suffix_array[i]; j < text.size(); j++)
+ fprintf(stderr, "%c", text[j]);
+ fprintf(stderr, "\n");
+ }
+ printf("digraph G {\nlabel=\"%s\n", text.c_str());
+ for (int i = 0; i < text.size(); i++)
+ printf("%d ", i);
+ printf("\"\n");
+#else
+ printf("%s\n", buffer);
+#endif
+
vector<pair<int, int> > stack(1, make_pair(0, 0));
while (!stack.empty()) {
pair<int, int> p = stack.back();
@@ -98,8 +216,27 @@ int main() {
if (edge_index + 1 < edges.size()) {
stack.push_back(make_pair(node, edge_index + 1));
}
- printf("%d %d\n", edges[edge_index].start, edges[edge_index].end);
stack.push_back(make_pair(edges[edge_index].node, 0));
+#ifdef GRAPH
+ if (edge_index == 0) {
+ for (int i = 0; i < edges.size(); i++) {
+ const Edge& e = edges[i];
+ printf(" %d [ label=\"%d : %d,%d\n", e.node, e.node, e.start, e.end);
+ if (e.start >= 0) {
+ for (int j = e.start; j < e.end; j++)
+ printf("%c", text[j]);
+ }
+ printf("\" ]\n");
+ printf("%d -> %d\n", node, e.node);
+ }
+ }
+#else
+ printf("%d %d\n", edges[edge_index].start, edges[edge_index].end);
+#endif
}
+#ifdef GRAPH
+ printf("}\n");
+#endif
+
return 0;
}