summaryrefslogtreecommitdiffstats
path: root/04-algorithms_on_strings/02-burrows_wheeler
diff options
context:
space:
mode:
authorJérémy Zurcher <jeremy@asynk.ch>2016-11-14 06:45:12 +0100
committerJérémy Zurcher <jeremy@asynk.ch>2016-11-14 06:47:52 +0100
commiteb0a213d040fb3b116ec030a6755216ce5e61ee9 (patch)
tree855c040d76e7653bdf90bcd47c571e2c8c119275 /04-algorithms_on_strings/02-burrows_wheeler
parent0bc44a8bcf77a81677d8257eb5c4190ff5d5dd73 (diff)
downloadcoursera-eb0a213d040fb3b116ec030a6755216ce5e61ee9.zip
coursera-eb0a213d040fb3b116ec030a6755216ce5e61ee9.tar.gz
Algorithms : complete 04-algorithms_on_strings 02-burrows_wheeler
Diffstat (limited to '04-algorithms_on_strings/02-burrows_wheeler')
-rw-r--r--04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp105
-rw-r--r--04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp146
-rw-r--r--04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp108
-rw-r--r--04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp66
4 files changed, 393 insertions, 32 deletions
diff --git a/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp b/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp
index f3b0497..6b1c94b 100644
--- a/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp
+++ b/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp
@@ -8,18 +8,109 @@ using std::cout;
using std::endl;
using std::string;
using std::vector;
+using std::swap;
-string BWT(const string& text) {
- string result = "";
+static bool debug = false;
- // write your code here
+class Bwt
+{
+private:
+ vector<int> v;
+ const string &s;
+ int sz;
- return result;
-}
+ void print(int n)
+ {
+ cout << "print : " << n << endl;
+ for (int i = 0; i < v.size(); i++)
+ cout << v[i] << ";" << s[(v[i] + n) % sz] << endl;
+ }
+
+ void print()
+ {
+ cout << "print : " << endl;
+ for (int i = 0; i < v.size(); i++) {
+ for (int j = 0; j < sz; j++)
+ cout << s[(v[i] + j) % sz];
+ cout << endl;
+ }
+ }
+
+public:
+
+ Bwt(const string& text) : s(text), sz(text.size()) {
+ v.reserve(s.size());
+ for (int i = 0; i < s.size(); i++)
+ v.push_back(i);
+ if (debug) print();
+ }
+
+ string solve() {
+ sort(0, 0, v.size() - 1);
+ if (debug) print();
+
+ string ret = "";
+ for (int i = 0; i < v.size(); i++)
+ ret += s[(v[i] + sz - 1) % sz];
+ return ret;
+ }
+
+private:
+ void sort(int x, int l, int r)
+ {
+ if (debug) cout << " sort: " << x << ";" << l << ";" << r << endl;
+ if (l >= r)
+ return;
+
+ quicksort(x, l, r);
+ if (debug) print();
+
+ char c = s[(v[l] + x) % sz];
+ int ll = l;
+ for (int i = l + 1; i <= r; i++) {
+ char C = s[(v[i] + x) % sz];
+ if (C != c) {
+ sort(x + 1, ll, i - 1);
+ ll = i;
+ c = C;
+ }
+ }
+ sort(x + 1, ll, r);
+ }
+
+ void quicksort(int x, int l, int r) {
+ if (l >= r)
+ return;
+
+ int p = l + rand() % (r - l + 1);
+ swap(v[l], v[p]);
+
+ char c = s[(v[l] + x) % sz];
+ int i = l + 1;
+ int j = r;
+ int k = r;
+ while (i <= j) {
+ int C = s[(v[j] + x ) % sz];
+ if (C < c) {
+ swap(v[i], v[j]);
+ i++;
+ } else if (C > c) {
+ swap(v[j], v[k]);
+ k--;
+ j--;
+ }
+ else
+ j--;
+ }
+
+ quicksort(x, l, i - 1);
+ quicksort(x, k + 1, r);
+ }
+};
int main() {
string text;
cin >> text;
- cout << BWT(text) << endl;
+ cout << Bwt(text).solve() << endl;
return 0;
-} \ No newline at end of file
+}
diff --git a/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp b/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp
index 3ec7c54..7c6ec56 100644
--- a/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp
+++ b/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp
@@ -8,18 +8,148 @@ using std::cout;
using std::endl;
using std::string;
using std::vector;
+using std::swap;
-string InverseBWT(const string& bwt) {
- string text = "";
+static bool debug = false;
- // write your code here
+class InverseBwt
+{
+private:
+ vector<int> indexes;
+ const string &s;
+ int sz;
+ int starts[4] = {0, 0, 0, 0};
- return text;
-}
+public:
+
+ InverseBwt(const string& text) : s(text), sz(text.size()) {
+ string ss = s;
+ quicksort(ss, 0, sz - 1);
+
+ indexes.reserve(sz);
+ int count[] = {0, 0, 0, 0};
+
+ for (int i = 0; i < sz; i++) {
+ int c = 0;
+ switch(s[i]) {
+ case 'A':
+ c = count[0];
+ count[0] += 1;
+ break;
+ case 'C':
+ c = count[1];
+ count[1] += 1;
+ break;
+ case 'G':
+ c = count[2];
+ count[2] += 1;
+ break;
+ case 'T':
+ c = count[3];
+ count[3] += 1;
+ break;
+ default:
+ break;
+ }
+ indexes.push_back(c);
+
+ switch(ss[i]) {
+ case 'A':
+ if (starts[0] == 0 )
+ starts[0] = i;
+ break;
+ case 'C':
+ if (starts[1] == 0 )
+ starts[1] = i;
+ break;
+ case 'G':
+ if (starts[2] == 0 )
+ starts[2] = i;
+ break;
+ case 'T':
+ if (starts[3] == 0 )
+ starts[3] = i;
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (debug) {
+ for (int i = 0; i < sz; i++) {
+ cout << ss[i] << ";" << s[i] << ";" << indexes[i] << endl;
+ }
+ cout << starts[0] << endl;
+ cout << starts[1] << endl;
+ cout << starts[2] << endl;
+ cout << starts[3] << endl;
+ }
+ }
+
+ string solve() {
+ string ret = s;
+
+ ret[sz - 1] = '$';
+ for (int i = 0, j = sz - 2; j >= 0; j--) {
+ ret[j] = s[i];
+ char ch = s[i];
+ i = indexes[i];
+ switch(ch) {
+ case 'A':
+ i += starts[0];
+ break;
+ case 'C':
+ i += starts[1];
+ break;
+ case 'G':
+ i += starts[2];
+ break;
+ case 'T':
+ i += starts[3];
+ break;
+ default:
+ break;
+ }
+ }
+
+ return ret;
+ }
+
+private:
+
+ void quicksort(string &s, int l, int r) {
+ if (l >= r)
+ return;
+
+ int p = l + rand() % (r - l + 1);
+ swap(s[l], s[p]);
+
+ char c = s[l];
+ int i = l + 1;
+ int j = r;
+ int k = r;
+ while (i <= j) {
+ int C = s[j];
+ if (C < c) {
+ swap(s[i], s[j]);
+ i++;
+ } else if (C > c) {
+ swap(s[j], s[k]);
+ k--;
+ j--;
+ }
+ else
+ j--;
+ }
+
+ quicksort(s, l, i - 1);
+ quicksort(s, k + 1, r);
+ }
+};
int main() {
- string bwt;
- cin >> bwt;
- cout << InverseBWT(bwt) << endl;
+ string text;
+ cin >> text;
+ cout << InverseBwt(text).solve() << endl;
return 0;
}
diff --git a/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp b/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp
index 9353389..10551df 100644
--- a/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp
+++ b/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp
@@ -12,31 +12,113 @@ using std::map;
using std::string;
using std::vector;
+static bool debug = false;
+
// Preprocess the Burrows-Wheeler Transform bwt of some text
// and compute as a result:
-// * starts - for each character C in bwt, starts[C] is the first position
-// of this character in the sorted array of
+// * starts - for each character C in bwt, starts[C] is the first position
+// of this character in the sorted array of
// all characters of the text.
// * occ_count_before - for each character C in bwt and each position P in bwt,
// occ_count_before[C][P] is the number of occurrences of character C in bwt
// from position 0 to position P inclusive.
-void PreprocessBWT(const string& bwt,
- map<char, int>& starts,
- map<char, vector<int> >& occ_count_before) {
- // Implement this function yourself
+void PreprocessBWT(const string& bwt,
+ map<char, int>& starts,
+ map<char, vector<int> >& occ_count_before)
+{
+ starts.insert(std::pair<char,int>('A', -1));
+ starts.insert(std::pair<char,int>('C', -1));
+ starts.insert(std::pair<char,int>('G', -1));
+ starts.insert(std::pair<char,int>('T', -1));
+ occ_count_before.insert(std::pair<char,vector<int>>('A', vector<int>(bwt.size() + 1)));
+ occ_count_before.insert(std::pair<char,vector<int>>('C', vector<int>(bwt.size() + 1)));
+ occ_count_before.insert(std::pair<char,vector<int>>('G', vector<int>(bwt.size() + 1)));
+ occ_count_before.insert(std::pair<char,vector<int>>('T', vector<int>(bwt.size() + 1)));
+
+ int count[] = {0, 0, 0, 0, 0};
+ for (int i = 0; i < bwt.size(); i ++) {
+ occ_count_before['A'][i] = count[0];
+ occ_count_before['C'][i] = count[1];
+ occ_count_before['G'][i] = count[2];
+ occ_count_before['T'][i] = count[3];
+ switch(bwt[i]) {
+ case 'A':
+ count[0] += 1;
+ break;
+ case 'C':
+ count[1] += 1;
+ break;
+ case 'G':
+ count[2] += 1;
+ break;
+ case 'T':
+ count[3] += 1;
+ break;
+ default:
+ break;
+ }
+ }
+ occ_count_before['A'][bwt.size()] = count[0];
+ occ_count_before['C'][bwt.size()] = count[1];
+ occ_count_before['G'][bwt.size()] = count[2];
+ occ_count_before['T'][bwt.size()] = count[3];
+
+ int x = 1;
+ if (count[0] != 0) {
+ starts['A'] = x;
+ x += count[0];
+ }
+ if (count[1] != 0) {
+ starts['C'] = x;
+ x += count[1];
+ }
+ if (count[2] != 0) {
+ starts['G'] = x;
+ x += count[2];
+ }
+ if (count[3] != 0) {
+ starts['T'] = x;
+ x += count[3];
+ }
+
+ if (debug) {
+ std::cout << bwt << std::endl;
+ std::cout << "starts : " << std::endl;
+ for (std::map<char,int>::iterator it = starts.begin(); it != starts.end(); it++)
+ std::cout << it->first << " => " << it->second << std::endl;
+ for (std::map<char,vector<int>>::iterator it = occ_count_before.begin(); it != occ_count_before.end(); it++) {
+ std::cout << it->first << " => ";
+ for (int i = 0; i <= bwt.size(); i++)
+ std::cout << it->second[i] << " ";
+ std::cout << std::endl;
+ }
+ std::cout << std::endl;
+ }
}
// Compute the number of occurrences of string pattern in the text
// given only Burrows-Wheeler Transform bwt of the text and additional
// information we get from the preprocessing stage - starts and occ_counts_before.
-int CountOccurrences(const string& pattern,
- const string& bwt,
- const map<char, int>& starts,
- const map<char, vector<int> >& occ_count_before) {
- // Implement this function yourself
- return 0;
+int CountOccurrences(const string& pattern,
+ const string& bwt,
+ const map<char, int>& starts,
+ const map<char, vector<int> >& occ_count_before)
+{
+ for (int top = 0, bottom = bwt.size() - 1, i = pattern.size() - 1; top <= bottom; ) {
+ if (i >= 0) {
+ char c = pattern[i];
+ i--;
+ int s = starts.at(c);
+ const vector<int> &v = occ_count_before.at(c);
+ top = s + v[top];
+ bottom = s + v[bottom + 1] - 1;
+ } else
+ return bottom - top + 1;
+ }
+
+ return 0;
}
-
+
int main() {
string bwt;
diff --git a/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp b/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp
index 1dbbcc3..2e0ec6d 100644
--- a/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp
+++ b/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp
@@ -11,16 +11,74 @@ using std::make_pair;
using std::pair;
using std::string;
using std::vector;
+using std::swap;
+
+static void quicksort(const string &s, vector<int> &v, int x, int l, int r)
+{
+ if (l >= r)
+ return;
+
+ int p = l + rand() % (r - l + 1);
+ swap(v[l], v[p]);
+
+ char c = s[v[l] + x];
+ int i = l + 1;
+ int j = r;
+ int k = r;
+ while (i <= j) {
+ int C = s[v[j] + x];
+ if (C < c) {
+ swap(v[i], v[j]);
+ i++;
+ } else if (C > c) {
+ swap(v[j], v[k]);
+ k--;
+ j--;
+ }
+ else
+ j--;
+ }
+
+ quicksort(s, v, x, l, i - 1);
+ quicksort(s, v, x, k + 1, r);
+}
+
+static void sort(const string &s, vector<int> &v, int x, int l, int r)
+{
+ if (l >= r)
+ return;
+
+ quicksort(s, v, x, l, r);
+
+ char c = s[v[l] + x];
+ int ll = l;
+ for (int i = l + 1; i <= r; i++) {
+ char C = s[v[i] + x];
+ if (C != c) {
+ sort(s, v, x + 1, ll, i - 1);
+ ll = i;
+ c = C;
+ }
+ }
+ sort(s, v, x + 1, ll, r);
+}
// Build suffix array of the string text and
// return a vector result of the same length as the text
// such that the value result[i] is the index (0-based)
// in text where the i-th lexicographically smallest
// suffix of text starts.
-vector<int> BuildSuffixArray(const string& text) {
- vector<int> result;
- // Implement this function yourself
- return result;
+vector<int> BuildSuffixArray(const string& text)
+{
+ int sz = text.size();
+ vector<int> v;
+ v.reserve(text.size());
+ for (int i = 0; i < sz; i++)
+ v.push_back(i);
+
+ sort(text, v, 0, 0, sz - 1);
+
+ return v;
}
int main() {