Algorithms : complete 04-algorithms_on_strings 04-suffix_array

author: Jérémy Zurcher <jeremy@asynk.ch> 2016-11-14 07:01:13 +0100
committer: Jérémy Zurcher <jeremy@asynk.ch> 2016-11-14 07:01:13 +0100
commit: 52041be8af3d321f07feb27fbba70b364d72c10a (patch)
tree: a9ad5d0cc7a3a836fcf130a428ba152e5d070fe2 /04-algorithms_on_strings/04-suffix_array
parent: 7471f538c2203f266cb8002fb001183a9720a969 (diff)
download: coursera-52041be8af3d321f07feb27fbba70b364d72c10a.zip
coursera-52041be8af3d321f07feb27fbba70b364d72c10a.tar.gz
4 files changed, 455 insertions, 36 deletions
diff --git a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
index 13121ae..4ac7afd 100644
--- a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp
@@ -7,13 +7,23 @@ using std::cin;
 using std::string;
 using std::vector;
 
-// Find all occurrences of the pattern in the text and return a
-// vector with all positions in the text (starting from 0) where 
-// the pattern starts in the text.
-vector<int> find_pattern(const string& pattern, const string& text) {
-  vector<int> result;
-  // Implement this function yourself
-  return result;
+vector<int> find_pattern(const string& pattern, const string& text)
+{
+    vector<int> result;
+    string s = pattern + "$" + text;
+    vector<int> prefixes(s.size());
+    int border = 0;
+    int ps = pattern.size();
+
+    for (int i = 1; i < s.size(); i++) {
+        while ((border > 0) && (s[i] != s[border]))
+            border = prefixes[border - 1];
+        prefixes[i] = border = ((s[i] == s[border]) ? (border + 1) : 0);
+        if (border == ps)
+            result.push_back(i - 2 * ps);
+    }
+
+    return result;
 }
 
 int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
index 1dbbcc3..7235bbe 100644
--- a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp
@@ -12,15 +12,121 @@ using std::pair;
 using std::string;
 using std::vector;
 
-// Build suffix array of the string text and
-// return a vector result of the same length as the text
-// such that the value result[i] is the index (0-based)
-// in text where the i-th lexicographically smallest
-// suffix of text starts.
-vector<int> BuildSuffixArray(const string& text) {
-  vector<int> result;
-  // Implement this function yourself
-  return result;
+static int $ = 0;
+static int A = 1;
+static int C = 2;
+static int G = 3;
+static int T = 4;
+
+static inline int toInt(char c)
+{
+    int ret = 0;
+    switch(c) {
+        case 'A': ret = A; break;
+        case 'C': ret = C; break;
+        case 'G': ret = G; break;
+        case 'T': ret = T; break;
+    }
+    return ret;
+}
+
+vector<int> BuildSuffixArray(const string& text)
+{
+    int sz = text.size();
+    vector<int> v0(sz);
+    vector<int> v1(sz);
+    vector<int> v2(sz);
+    vector<int> v3(sz);
+    vector<int> v4(sz);
+
+    // count sort text[i]
+    vector<int> &count = v0;
+    for (int i = 0; i < sz; i++)
+        count[toInt(text[i])] += 1;
+    for(int i = 1; i < 5; i++)
+        count[i] += count[i - 1];
+
+    vector<int> &order = v1;
+    for (int i = sz - 1; i >= 0; i--) {
+        int x = toInt(text[i]);
+        count[x] -= 1;
+        order[count[x]] = i;
+    }
+
+    // compute classes
+    int cl = 0;
+    vector<int> &classes = v2;
+    classes[order[0]] = cl;
+    for (int i = 1; i < sz; i++) {
+        if (text[order[i]] != text[order[i - 1]])
+            cl += 1;
+        classes[order[i]] = cl;
+    }
+
+    /* cout << text << endl; */
+    /* cout << "$ : " << count[$] << endl; */
+    /* cout << "A : " << count[A] << endl; */
+    /* cout << "C : " << count[C] << endl; */
+    /* cout << "G : " << count[G] << endl; */
+    /* cout << "T : " << count[T] << endl; */
+    /* cout << "order" << endl; */
+    /* for (int i = 0; i < sz; i++) */
+    /*     cout << order[i] << " "; */
+    /* cout << endl; */
+    /* cout << "classes" << endl; */
+    /* for (int i = 0; i < sz; i++) */
+    /*     cout << classes[i] << " "; */
+    /* cout << endl; */
+
+    int l = 1;
+    vector<int> &nOrder = v3;
+    vector<int> &nClasses = v4;
+    while (l < sz) {
+        // count sort classes
+        for (int i = 0; i < sz; i++)
+            count[i] = 0;
+        for (int i = 0; i < sz; i++)
+            count[classes[i]] += 1;
+        for(int i = 1; i < sz; i++)
+            count[i] += count[i - 1];
+        for (int i = sz - 1; i >= 0; i--) {
+            int x = (order[i] - l + sz) % sz;
+            int cl = classes[x];
+            count[cl] -= 1;
+            nOrder[count[cl]] = x;
+        }
+        if(order == v1) {
+            order = v3;
+            nOrder = v1;
+        } else {
+            order = v1;
+            nOrder = v3;
+        }
+
+        // update classes
+        nClasses[order[0]] = 0;
+        for (int i = 1; i < sz; i++) {
+            int cur = order[i];
+            int prev = order[i - 1];
+            int mid = (cur + l) % sz;
+            int midPrev = (prev + l) % sz;
+            if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev])
+                nClasses[cur] = nClasses[prev] + 1;
+            else
+                nClasses[cur] = nClasses[prev];
+        }
+        if(classes == v2) {
+            classes = v4;
+            nClasses = v2;
+        } else {
+            classes = v2;
+            nOrder = v4;
+        }
+
+        l *= 2;
+    }
+
+    return order;
 }
 
 int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
index 204218c..74fa0ee 100644
--- a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp
@@ -7,21 +7,187 @@ using std::cin;
 using std::string;
 using std::vector;
 
-vector<int> BuildSuffixArray(const string& text) {
-  vector<int> order;
+static int $ = 0;
+static int A = 1;
+static int C = 2;
+static int G = 3;
+static int T = 4;
 
-  // write your code here
+static inline int toInt(char c)
+{
+    int ret = 0;
+    switch(c) {
+        case 'A': ret = A; break;
+        case 'C': ret = C; break;
+        case 'G': ret = G; break;
+        case 'T': ret = T; break;
+    }
+    return ret;
+}
+
+vector<int> BuildSuffixArray(const string& text)
+{
+    int sz = text.size();
+    vector<int> v0(sz);
+    vector<int> v1(sz);
+    vector<int> v2(sz);
+    vector<int> v3(sz);
+    vector<int> v4(sz);
+
+    // count sort text[i]
+    vector<int> &count = v0;
+    for (int i = 0; i < sz; i++)
+        count[toInt(text[i])] += 1;
+    for(int i = 1; i < 5; i++)
+        count[i] += count[i - 1];
+
+    vector<int> &order = v1;
+    for (int i = sz - 1; i >= 0; i--) {
+        int x = toInt(text[i]);
+        count[x] -= 1;
+        order[count[x]] = i;
+    }
+
+    // compute classes
+    int cl = 0;
+    vector<int> &classes = v2;
+    classes[order[0]] = cl;
+    for (int i = 1; i < sz; i++) {
+        if (text[order[i]] != text[order[i - 1]])
+            cl += 1;
+        classes[order[i]] = cl;
+    }
+
+    int l = 1;
+    vector<int> &nOrder = v3;
+    vector<int> &nClasses = v4;
+    while (l < sz) {
+        // count sort classes
+        for (int i = 0; i < sz; i++)
+            count[i] = 0;
+        for (int i = 0; i < sz; i++)
+            count[classes[i]] += 1;
+        for(int i = 1; i < sz; i++)
+            count[i] += count[i - 1];
+        for (int i = sz - 1; i >= 0; i--) {
+            int x = (order[i] - l + sz) % sz;
+            int cl = classes[x];
+            count[cl] -= 1;
+            nOrder[count[cl]] = x;
+        }
+        if(order == v1) {
+            order = v3;
+            nOrder = v1;
+        } else {
+            order = v1;
+            nOrder = v3;
+        }
+
+        // update classes
+        nClasses[order[0]] = 0;
+        for (int i = 1; i < sz; i++) {
+            int cur = order[i];
+            int prev = order[i - 1];
+            int mid = (cur + l) % sz;
+            int midPrev = (prev + l) % sz;
+            if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev])
+                nClasses[cur] = nClasses[prev] + 1;
+            else
+                nClasses[cur] = nClasses[prev];
+        }
+        if(classes == v2) {
+            classes = v4;
+            nClasses = v2;
+        } else {
+            classes = v2;
+            nOrder = v4;
+        }
 
-  return order;
+        l *= 2;
+    }
+
+    return order;
+}
+
+vector<int> _lcp(const string& s, const vector<int>& order)
+{
+    int sz = order.size();
+    vector<int> v(sz - 1);
+    vector<int> r(sz); // order from i
+    for (int i = 0; i < sz; i++)
+        r[order[i]] = i;
+
+    int lcp = 0;
+    int suffix = order[0];
+    for (int i = 0; i < sz; i++) {
+        int x = r[suffix];
+        if (x == sz - 1) {
+            lcp = 0;
+        } else {
+            int nextSuffix = order[x + 1];
+            if (lcp > 0) lcp -= 1;
+            while ((suffix + lcp < sz) && (nextSuffix + lcp < sz)) {
+                if (s[suffix + lcp] == s[nextSuffix+ lcp])
+                    lcp += 1;
+                else
+                    break;
+            }
+            v[x] = lcp;
+        }
+        suffix = (suffix + 1) % sz;
+    }
+
+    return v;
 }
 
+vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array)
+{
+    vector<int> result;
 
-vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array) {
-  vector<int> result;
+    int sz = text.size();
+    int pz = pattern.size();
+    int l = 0;
+    int r = sz;
+    while (l < r) {
+        int mid = (l + r) / 2;
+        int k = suffix_array[mid];
+        bool lower = false;
+        for (int j = 0; j < pz; j++) {
+            if (pattern[j] == text[k + j])
+                continue;
+            if (pattern[j] > text[k + j])
+                lower = true;
+            break;
+        }
+        if (lower)
+            l = mid + 1;
+        else
+            r = mid;
+    }
+
+    int s = l;
+    r = sz;
+    while (l < r) {
+        int mid = (l + r) / 2;
+        int k = suffix_array[mid];
+        bool lower = false;
+        for (int j = 0; j < pz; j++) {
+            if (pattern[j] == text[k + j])
+                continue;
+            if (pattern[j] < text[k + j])
+                lower = true;
+            break;
+        }
+        if (lower)
+            r = mid;
+        else
+            l = mid + 1;
+    }
 
-  // write your code here
+    for (int i = s; i < r; i++)
+        result.push_back(suffix_array[i]);
 
-  return result;
+    return result;
 }
 
 int main() {
diff --git a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
index ca9fdb5..4ac4658 100644
--- a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
+++ b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp
@@ -22,8 +22,9 @@ struct Edge {
   // corresponding to the label of this edge.
   int end;
 
-  Edge(int node_, int start_, int end_) : node(node_), start(start_), end(end_) {}
-  Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {}
+  Edge(int node_, int start_, int end_) :
+      node(node_), start(start_), end(end_) {}
+  /* Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {} */
 };
 
 // Build suffix tree of the string text given its suffix array suffix_array
@@ -37,15 +38,112 @@ struct Edge {
 // must be represented by Edge(1, 6, 7). This edge must be present in the vector tree[0]
 // (corresponding to the root node), and it should be the first edge in the vector 
 // (because it has the smallest first character of all edges outgoing from the root).
-map<int, vector<Edge> > SuffixTreeFromSuffixArray(
-    const vector<int>& suffix_array,
-    const vector<int>& lcp_array,
-    const string& text) {
-  map<int, vector<Edge> > tree;
-  // Implement this function yourself
-  return tree;
-}
+map<int, vector<Edge>> SuffixTreeFromSuffixArray(
+        const vector<int>& suffix_array,
+        const vector<int>& lcp_array,
+        const string& text)
+{
+    map<int, vector<Edge>> tree;
+    vector<int> hierarchy;
+    int sz = suffix_array.size();
+    int n = 0;
+    int parent = 0;
+    int idx = 0;
+    int prevLcp = 0;
+    hierarchy.push_back(0);
+
+    // add '$'
+    tree[parent].push_back(Edge(++n, suffix_array[0], sz));
 
+    // add following suffixes
+    /* for (int i = 1; i < 6; i++) { */
+    for (int i = 1; i < suffix_array.size(); i++) {
+        int start = suffix_array[i];
+        int lcp = lcp_array[i - 1];
+
+        if (lcp == 0) {
+            // add to root
+            parent = 0;
+            vector<Edge>& v = tree[parent];
+            idx = v.size();
+            v.push_back(Edge(++n, start, sz));
+            hierarchy.resize(1);
+        } else if (lcp == prevLcp) {
+            // add to parent
+            tree[parent].push_back(Edge(++n, start + lcp, sz));
+            idx++;
+        } else {
+            int s;
+            int l = (lcp - prevLcp);
+            bool push = true;
+            Edge& last = tree[parent][idx];
+            if (l > 0) {
+                s = start + lcp;
+                // push 2 new edges under last
+                parent = last.node;
+                vector<Edge>& v = tree[parent];
+                v.push_back(Edge(++n, last.start + l, sz));
+                v.push_back(Edge(++n, s, sz));
+                idx = 1;
+                // update split edge
+                last.end = s;
+                last.start = s - l;
+                // set branch update values
+                /* l = prevLcp; */
+                /* s = last.start; */
+            } else {
+                // find good parent
+                while (l < 0) {
+                    hierarchy.resize(hierarchy.size() - 1);
+                    Edge& e = tree[hierarchy.back()].back();
+                    l += (e.end - e.start);
+                }
+                parent = hierarchy.back();
+                idx = tree[parent].size() - 1;
+                prevLcp = (lcp - l);
+                if (l == 0) {
+                    // add to parent
+                    tree[parent].push_back(Edge(++n, start + lcp, sz));
+                    idx++;
+                    // set branch update values
+                    /* l = lcp; */
+                    /* s = start + lcp; */
+                    push = false;
+                } else {
+                    // set new split edge
+                    vector<Edge>& v = tree[parent];
+                    Edge old = v.back();
+                    v.pop_back();
+                    v.push_back(Edge(++n, start + prevLcp, start + lcp));
+                    // feed new split edge
+                    parent = n;
+                    vector<Edge>& w = tree[parent];
+                    old.start += l;
+                    w.push_back(old);
+                    w.push_back(Edge(++n, start + lcp, sz));
+                    idx = 1;
+                    // set branch update values
+                    /* l = prevLcp; */
+                    /* s = start + prevLcp; */
+                }
+            }
+            // update branch
+            /* for (int i = hierarchy.size() - 2; (l > 0 && i >= 0); i--) { */
+            /*     Edge& e = tree[hierarchy[i]].back(); */
+            /*     int d = e.end - e.start; */
+            /*     e.end = s; */
+            /*     s -= d; */
+            /*     e.start = s; */
+            /*     l -= d; */
+            /* } */
+            if (push)
+                hierarchy.push_back(parent);
+        }
+        prevLcp = lcp;
+    }
+
+    return tree;
+}
 
 int main() {
   char buffer[200001];
@@ -62,7 +160,6 @@ int main() {
   // Build the suffix tree and get a mapping from 
   // suffix tree node ID to the list of outgoing Edges.
   map<int, vector<Edge> > tree = SuffixTreeFromSuffixArray(suffix_array, lcp_array, text);
-  printf("%s\n", buffer);
   // Output the edges of the suffix tree in the required order.
   // Note that we use here the contract that the root of the tree
   // will have node ID = 0 and that each vector of outgoing edges
@@ -85,6 +182,27 @@ int main() {
   //   }
   // }
   //
+/* #define GRAPH */
+#ifdef GRAPH
+  fprintf(stderr, "suffixes :\n");
+  for (int i = 0; i < suffix_array.size(); i++) {
+      if (i == 0) {
+          fprintf(stderr, "%2d      : ", suffix_array[i]);
+      } else {
+          fprintf(stderr, "%2d (%2d) : ", suffix_array[i], lcp_array[i - 1]);
+      }
+      for (int j = suffix_array[i]; j < text.size(); j++)
+          fprintf(stderr, "%c", text[j]);
+      fprintf(stderr, "\n");
+  }
+  printf("digraph G {\nlabel=\"%s\n", text.c_str());
+  for (int i = 0; i < text.size(); i++)
+      printf("%d ", i);
+  printf("\"\n");
+#else
+  printf("%s\n", buffer);
+#endif
+
   vector<pair<int, int> > stack(1, make_pair(0, 0));    
   while (!stack.empty()) {
     pair<int, int> p = stack.back();
@@ -98,8 +216,27 @@ int main() {
     if (edge_index + 1 < edges.size()) {
       stack.push_back(make_pair(node, edge_index + 1));
     }
-    printf("%d %d\n", edges[edge_index].start, edges[edge_index].end);
     stack.push_back(make_pair(edges[edge_index].node, 0));
+#ifdef GRAPH
+    if (edge_index == 0) {
+        for (int i = 0; i < edges.size(); i++) {
+            const Edge& e = edges[i];
+            printf(" %d [ label=\"%d : %d,%d\n", e.node, e.node, e.start, e.end);
+            if (e.start >= 0) {
+                for (int j = e.start; j < e.end; j++)
+                    printf("%c", text[j]);
+            }
+            printf("\" ]\n");
+            printf("%d -> %d\n", node, e.node);
+        }
+    }
+#else
+    printf("%d %d\n", edges[edge_index].start, edges[edge_index].end);
+#endif
   }
+#ifdef GRAPH
+  printf("}\n");
+#endif
+
   return 0;
 }
author	Jérémy Zurcher <jeremy@asynk.ch>	2016-11-14 07:01:13 +0100
committer	Jérémy Zurcher <jeremy@asynk.ch>	2016-11-14 07:01:13 +0100
commit	52041be8af3d321f07feb27fbba70b364d72c10a (patch)
tree	a9ad5d0cc7a3a836fcf130a428ba152e5d070fe2 /04-algorithms_on_strings/04-suffix_array
parent	7471f538c2203f266cb8002fb001183a9720a969 (diff)
download	coursera-52041be8af3d321f07feb27fbba70b364d72c10a.zip coursera-52041be8af3d321f07feb27fbba70b364d72c10a.tar.gz