diff options
author | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 07:01:13 +0100 |
---|---|---|
committer | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 07:01:13 +0100 |
commit | 52041be8af3d321f07feb27fbba70b364d72c10a (patch) | |
tree | a9ad5d0cc7a3a836fcf130a428ba152e5d070fe2 /04-algorithms_on_strings/04-suffix_array | |
parent | 7471f538c2203f266cb8002fb001183a9720a969 (diff) | |
download | coursera-52041be8af3d321f07feb27fbba70b364d72c10a.zip coursera-52041be8af3d321f07feb27fbba70b364d72c10a.tar.gz |
Algorithms : complete 04-algorithms_on_strings 04-suffix_array
Diffstat (limited to '04-algorithms_on_strings/04-suffix_array')
4 files changed, 455 insertions, 36 deletions
diff --git a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp index 13121ae..4ac7afd 100644 --- a/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp +++ b/04-algorithms_on_strings/04-suffix_array/01-kmp/kmp.cpp @@ -7,13 +7,23 @@ using std::cin; using std::string; using std::vector; -// Find all occurrences of the pattern in the text and return a -// vector with all positions in the text (starting from 0) where -// the pattern starts in the text. -vector<int> find_pattern(const string& pattern, const string& text) { - vector<int> result; - // Implement this function yourself - return result; +vector<int> find_pattern(const string& pattern, const string& text) +{ + vector<int> result; + string s = pattern + "$" + text; + vector<int> prefixes(s.size()); + int border = 0; + int ps = pattern.size(); + + for (int i = 1; i < s.size(); i++) { + while ((border > 0) && (s[i] != s[border])) + border = prefixes[border - 1]; + prefixes[i] = border = ((s[i] == s[border]) ? (border + 1) : 0); + if (border == ps) + result.push_back(i - 2 * ps); + } + + return result; } int main() { diff --git a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp index 1dbbcc3..7235bbe 100644 --- a/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp +++ b/04-algorithms_on_strings/04-suffix_array/02-suffix_array_long/suffix_array_long.cpp @@ -12,15 +12,121 @@ using std::pair; using std::string; using std::vector; -// Build suffix array of the string text and -// return a vector result of the same length as the text -// such that the value result[i] is the index (0-based) -// in text where the i-th lexicographically smallest -// suffix of text starts. -vector<int> BuildSuffixArray(const string& text) { - vector<int> result; - // Implement this function yourself - return result; +static int $ = 0; +static int A = 1; +static int C = 2; +static int G = 3; +static int T = 4; + +static inline int toInt(char c) +{ + int ret = 0; + switch(c) { + case 'A': ret = A; break; + case 'C': ret = C; break; + case 'G': ret = G; break; + case 'T': ret = T; break; + } + return ret; +} + +vector<int> BuildSuffixArray(const string& text) +{ + int sz = text.size(); + vector<int> v0(sz); + vector<int> v1(sz); + vector<int> v2(sz); + vector<int> v3(sz); + vector<int> v4(sz); + + // count sort text[i] + vector<int> &count = v0; + for (int i = 0; i < sz; i++) + count[toInt(text[i])] += 1; + for(int i = 1; i < 5; i++) + count[i] += count[i - 1]; + + vector<int> &order = v1; + for (int i = sz - 1; i >= 0; i--) { + int x = toInt(text[i]); + count[x] -= 1; + order[count[x]] = i; + } + + // compute classes + int cl = 0; + vector<int> &classes = v2; + classes[order[0]] = cl; + for (int i = 1; i < sz; i++) { + if (text[order[i]] != text[order[i - 1]]) + cl += 1; + classes[order[i]] = cl; + } + + /* cout << text << endl; */ + /* cout << "$ : " << count[$] << endl; */ + /* cout << "A : " << count[A] << endl; */ + /* cout << "C : " << count[C] << endl; */ + /* cout << "G : " << count[G] << endl; */ + /* cout << "T : " << count[T] << endl; */ + /* cout << "order" << endl; */ + /* for (int i = 0; i < sz; i++) */ + /* cout << order[i] << " "; */ + /* cout << endl; */ + /* cout << "classes" << endl; */ + /* for (int i = 0; i < sz; i++) */ + /* cout << classes[i] << " "; */ + /* cout << endl; */ + + int l = 1; + vector<int> &nOrder = v3; + vector<int> &nClasses = v4; + while (l < sz) { + // count sort classes + for (int i = 0; i < sz; i++) + count[i] = 0; + for (int i = 0; i < sz; i++) + count[classes[i]] += 1; + for(int i = 1; i < sz; i++) + count[i] += count[i - 1]; + for (int i = sz - 1; i >= 0; i--) { + int x = (order[i] - l + sz) % sz; + int cl = classes[x]; + count[cl] -= 1; + nOrder[count[cl]] = x; + } + if(order == v1) { + order = v3; + nOrder = v1; + } else { + order = v1; + nOrder = v3; + } + + // update classes + nClasses[order[0]] = 0; + for (int i = 1; i < sz; i++) { + int cur = order[i]; + int prev = order[i - 1]; + int mid = (cur + l) % sz; + int midPrev = (prev + l) % sz; + if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev]) + nClasses[cur] = nClasses[prev] + 1; + else + nClasses[cur] = nClasses[prev]; + } + if(classes == v2) { + classes = v4; + nClasses = v2; + } else { + classes = v2; + nOrder = v4; + } + + l *= 2; + } + + return order; } int main() { diff --git a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp index 204218c..74fa0ee 100644 --- a/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp +++ b/04-algorithms_on_strings/04-suffix_array/03-suffix_array_matching/suffix_array_matching.cpp @@ -7,21 +7,187 @@ using std::cin; using std::string; using std::vector; -vector<int> BuildSuffixArray(const string& text) { - vector<int> order; +static int $ = 0; +static int A = 1; +static int C = 2; +static int G = 3; +static int T = 4; - // write your code here +static inline int toInt(char c) +{ + int ret = 0; + switch(c) { + case 'A': ret = A; break; + case 'C': ret = C; break; + case 'G': ret = G; break; + case 'T': ret = T; break; + } + return ret; +} + +vector<int> BuildSuffixArray(const string& text) +{ + int sz = text.size(); + vector<int> v0(sz); + vector<int> v1(sz); + vector<int> v2(sz); + vector<int> v3(sz); + vector<int> v4(sz); + + // count sort text[i] + vector<int> &count = v0; + for (int i = 0; i < sz; i++) + count[toInt(text[i])] += 1; + for(int i = 1; i < 5; i++) + count[i] += count[i - 1]; + + vector<int> &order = v1; + for (int i = sz - 1; i >= 0; i--) { + int x = toInt(text[i]); + count[x] -= 1; + order[count[x]] = i; + } + + // compute classes + int cl = 0; + vector<int> &classes = v2; + classes[order[0]] = cl; + for (int i = 1; i < sz; i++) { + if (text[order[i]] != text[order[i - 1]]) + cl += 1; + classes[order[i]] = cl; + } + + int l = 1; + vector<int> &nOrder = v3; + vector<int> &nClasses = v4; + while (l < sz) { + // count sort classes + for (int i = 0; i < sz; i++) + count[i] = 0; + for (int i = 0; i < sz; i++) + count[classes[i]] += 1; + for(int i = 1; i < sz; i++) + count[i] += count[i - 1]; + for (int i = sz - 1; i >= 0; i--) { + int x = (order[i] - l + sz) % sz; + int cl = classes[x]; + count[cl] -= 1; + nOrder[count[cl]] = x; + } + if(order == v1) { + order = v3; + nOrder = v1; + } else { + order = v1; + nOrder = v3; + } + + // update classes + nClasses[order[0]] = 0; + for (int i = 1; i < sz; i++) { + int cur = order[i]; + int prev = order[i - 1]; + int mid = (cur + l) % sz; + int midPrev = (prev + l) % sz; + if (classes[cur] != classes[prev] || classes[mid] != classes[midPrev]) + nClasses[cur] = nClasses[prev] + 1; + else + nClasses[cur] = nClasses[prev]; + } + if(classes == v2) { + classes = v4; + nClasses = v2; + } else { + classes = v2; + nOrder = v4; + } - return order; + l *= 2; + } + + return order; +} + +vector<int> _lcp(const string& s, const vector<int>& order) +{ + int sz = order.size(); + vector<int> v(sz - 1); + vector<int> r(sz); // order from i + for (int i = 0; i < sz; i++) + r[order[i]] = i; + + int lcp = 0; + int suffix = order[0]; + for (int i = 0; i < sz; i++) { + int x = r[suffix]; + if (x == sz - 1) { + lcp = 0; + } else { + int nextSuffix = order[x + 1]; + if (lcp > 0) lcp -= 1; + while ((suffix + lcp < sz) && (nextSuffix + lcp < sz)) { + if (s[suffix + lcp] == s[nextSuffix+ lcp]) + lcp += 1; + else + break; + } + v[x] = lcp; + } + suffix = (suffix + 1) % sz; + } + + return v; } +vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array) +{ + vector<int> result; -vector<int> FindOccurrences(const string& pattern, const string& text, const vector<int>& suffix_array) { - vector<int> result; + int sz = text.size(); + int pz = pattern.size(); + int l = 0; + int r = sz; + while (l < r) { + int mid = (l + r) / 2; + int k = suffix_array[mid]; + bool lower = false; + for (int j = 0; j < pz; j++) { + if (pattern[j] == text[k + j]) + continue; + if (pattern[j] > text[k + j]) + lower = true; + break; + } + if (lower) + l = mid + 1; + else + r = mid; + } + + int s = l; + r = sz; + while (l < r) { + int mid = (l + r) / 2; + int k = suffix_array[mid]; + bool lower = false; + for (int j = 0; j < pz; j++) { + if (pattern[j] == text[k + j]) + continue; + if (pattern[j] < text[k + j]) + lower = true; + break; + } + if (lower) + r = mid; + else + l = mid + 1; + } - // write your code here + for (int i = s; i < r; i++) + result.push_back(suffix_array[i]); - return result; + return result; } int main() { diff --git a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp index ca9fdb5..4ac4658 100644 --- a/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp +++ b/04-algorithms_on_strings/04-suffix_array/04-suffix_tree_from_array/suffix_tree_from_array.cpp @@ -22,8 +22,9 @@ struct Edge { // corresponding to the label of this edge. int end; - Edge(int node_, int start_, int end_) : node(node_), start(start_), end(end_) {} - Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {} + Edge(int node_, int start_, int end_) : + node(node_), start(start_), end(end_) {} + /* Edge(const Edge& e) : node(e.node), start(e.start), end(e.end) {} */ }; // Build suffix tree of the string text given its suffix array suffix_array @@ -37,15 +38,112 @@ struct Edge { // must be represented by Edge(1, 6, 7). This edge must be present in the vector tree[0] // (corresponding to the root node), and it should be the first edge in the vector // (because it has the smallest first character of all edges outgoing from the root). -map<int, vector<Edge> > SuffixTreeFromSuffixArray( - const vector<int>& suffix_array, - const vector<int>& lcp_array, - const string& text) { - map<int, vector<Edge> > tree; - // Implement this function yourself - return tree; -} +map<int, vector<Edge>> SuffixTreeFromSuffixArray( + const vector<int>& suffix_array, + const vector<int>& lcp_array, + const string& text) +{ + map<int, vector<Edge>> tree; + vector<int> hierarchy; + int sz = suffix_array.size(); + int n = 0; + int parent = 0; + int idx = 0; + int prevLcp = 0; + hierarchy.push_back(0); + + // add '$' + tree[parent].push_back(Edge(++n, suffix_array[0], sz)); + // add following suffixes + /* for (int i = 1; i < 6; i++) { */ + for (int i = 1; i < suffix_array.size(); i++) { + int start = suffix_array[i]; + int lcp = lcp_array[i - 1]; + + if (lcp == 0) { + // add to root + parent = 0; + vector<Edge>& v = tree[parent]; + idx = v.size(); + v.push_back(Edge(++n, start, sz)); + hierarchy.resize(1); + } else if (lcp == prevLcp) { + // add to parent + tree[parent].push_back(Edge(++n, start + lcp, sz)); + idx++; + } else { + int s; + int l = (lcp - prevLcp); + bool push = true; + Edge& last = tree[parent][idx]; + if (l > 0) { + s = start + lcp; + // push 2 new edges under last + parent = last.node; + vector<Edge>& v = tree[parent]; + v.push_back(Edge(++n, last.start + l, sz)); + v.push_back(Edge(++n, s, sz)); + idx = 1; + // update split edge + last.end = s; + last.start = s - l; + // set branch update values + /* l = prevLcp; */ + /* s = last.start; */ + } else { + // find good parent + while (l < 0) { + hierarchy.resize(hierarchy.size() - 1); + Edge& e = tree[hierarchy.back()].back(); + l += (e.end - e.start); + } + parent = hierarchy.back(); + idx = tree[parent].size() - 1; + prevLcp = (lcp - l); + if (l == 0) { + // add to parent + tree[parent].push_back(Edge(++n, start + lcp, sz)); + idx++; + // set branch update values + /* l = lcp; */ + /* s = start + lcp; */ + push = false; + } else { + // set new split edge + vector<Edge>& v = tree[parent]; + Edge old = v.back(); + v.pop_back(); + v.push_back(Edge(++n, start + prevLcp, start + lcp)); + // feed new split edge + parent = n; + vector<Edge>& w = tree[parent]; + old.start += l; + w.push_back(old); + w.push_back(Edge(++n, start + lcp, sz)); + idx = 1; + // set branch update values + /* l = prevLcp; */ + /* s = start + prevLcp; */ + } + } + // update branch + /* for (int i = hierarchy.size() - 2; (l > 0 && i >= 0); i--) { */ + /* Edge& e = tree[hierarchy[i]].back(); */ + /* int d = e.end - e.start; */ + /* e.end = s; */ + /* s -= d; */ + /* e.start = s; */ + /* l -= d; */ + /* } */ + if (push) + hierarchy.push_back(parent); + } + prevLcp = lcp; + } + + return tree; +} int main() { char buffer[200001]; @@ -62,7 +160,6 @@ int main() { // Build the suffix tree and get a mapping from // suffix tree node ID to the list of outgoing Edges. map<int, vector<Edge> > tree = SuffixTreeFromSuffixArray(suffix_array, lcp_array, text); - printf("%s\n", buffer); // Output the edges of the suffix tree in the required order. // Note that we use here the contract that the root of the tree // will have node ID = 0 and that each vector of outgoing edges @@ -85,6 +182,27 @@ int main() { // } // } // +/* #define GRAPH */ +#ifdef GRAPH + fprintf(stderr, "suffixes :\n"); + for (int i = 0; i < suffix_array.size(); i++) { + if (i == 0) { + fprintf(stderr, "%2d : ", suffix_array[i]); + } else { + fprintf(stderr, "%2d (%2d) : ", suffix_array[i], lcp_array[i - 1]); + } + for (int j = suffix_array[i]; j < text.size(); j++) + fprintf(stderr, "%c", text[j]); + fprintf(stderr, "\n"); + } + printf("digraph G {\nlabel=\"%s\n", text.c_str()); + for (int i = 0; i < text.size(); i++) + printf("%d ", i); + printf("\"\n"); +#else + printf("%s\n", buffer); +#endif + vector<pair<int, int> > stack(1, make_pair(0, 0)); while (!stack.empty()) { pair<int, int> p = stack.back(); @@ -98,8 +216,27 @@ int main() { if (edge_index + 1 < edges.size()) { stack.push_back(make_pair(node, edge_index + 1)); } - printf("%d %d\n", edges[edge_index].start, edges[edge_index].end); stack.push_back(make_pair(edges[edge_index].node, 0)); +#ifdef GRAPH + if (edge_index == 0) { + for (int i = 0; i < edges.size(); i++) { + const Edge& e = edges[i]; + printf(" %d [ label=\"%d : %d,%d\n", e.node, e.node, e.start, e.end); + if (e.start >= 0) { + for (int j = e.start; j < e.end; j++) + printf("%c", text[j]); + } + printf("\" ]\n"); + printf("%d -> %d\n", node, e.node); + } + } +#else + printf("%d %d\n", edges[edge_index].start, edges[edge_index].end); +#endif } +#ifdef GRAPH + printf("}\n"); +#endif + return 0; } |