diff options
author | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 06:45:12 +0100 |
---|---|---|
committer | Jérémy Zurcher <jeremy@asynk.ch> | 2016-11-14 06:47:52 +0100 |
commit | eb0a213d040fb3b116ec030a6755216ce5e61ee9 (patch) | |
tree | 855c040d76e7653bdf90bcd47c571e2c8c119275 /04-algorithms_on_strings | |
parent | 0bc44a8bcf77a81677d8257eb5c4190ff5d5dd73 (diff) | |
download | coursera-eb0a213d040fb3b116ec030a6755216ce5e61ee9.zip coursera-eb0a213d040fb3b116ec030a6755216ce5e61ee9.tar.gz |
Algorithms : complete 04-algorithms_on_strings 02-burrows_wheeler
Diffstat (limited to '04-algorithms_on_strings')
4 files changed, 393 insertions, 32 deletions
diff --git a/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp b/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp index f3b0497..6b1c94b 100644 --- a/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp +++ b/04-algorithms_on_strings/02-burrows_wheeler/01-bwt/bwt.cpp @@ -8,18 +8,109 @@ using std::cout; using std::endl; using std::string; using std::vector; +using std::swap; -string BWT(const string& text) { - string result = ""; +static bool debug = false; - // write your code here +class Bwt +{ +private: + vector<int> v; + const string &s; + int sz; - return result; -} + void print(int n) + { + cout << "print : " << n << endl; + for (int i = 0; i < v.size(); i++) + cout << v[i] << ";" << s[(v[i] + n) % sz] << endl; + } + + void print() + { + cout << "print : " << endl; + for (int i = 0; i < v.size(); i++) { + for (int j = 0; j < sz; j++) + cout << s[(v[i] + j) % sz]; + cout << endl; + } + } + +public: + + Bwt(const string& text) : s(text), sz(text.size()) { + v.reserve(s.size()); + for (int i = 0; i < s.size(); i++) + v.push_back(i); + if (debug) print(); + } + + string solve() { + sort(0, 0, v.size() - 1); + if (debug) print(); + + string ret = ""; + for (int i = 0; i < v.size(); i++) + ret += s[(v[i] + sz - 1) % sz]; + return ret; + } + +private: + void sort(int x, int l, int r) + { + if (debug) cout << " sort: " << x << ";" << l << ";" << r << endl; + if (l >= r) + return; + + quicksort(x, l, r); + if (debug) print(); + + char c = s[(v[l] + x) % sz]; + int ll = l; + for (int i = l + 1; i <= r; i++) { + char C = s[(v[i] + x) % sz]; + if (C != c) { + sort(x + 1, ll, i - 1); + ll = i; + c = C; + } + } + sort(x + 1, ll, r); + } + + void quicksort(int x, int l, int r) { + if (l >= r) + return; + + int p = l + rand() % (r - l + 1); + swap(v[l], v[p]); + + char c = s[(v[l] + x) % sz]; + int i = l + 1; + int j = r; + int k = r; + while (i <= j) { + int C = s[(v[j] + x ) % sz]; + if (C < c) { + swap(v[i], v[j]); + i++; + } else if (C > c) { + swap(v[j], v[k]); + k--; + j--; + } + else + j--; + } + + quicksort(x, l, i - 1); + quicksort(x, k + 1, r); + } +}; int main() { string text; cin >> text; - cout << BWT(text) << endl; + cout << Bwt(text).solve() << endl; return 0; -}
\ No newline at end of file +} diff --git a/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp b/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp index 3ec7c54..7c6ec56 100644 --- a/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp +++ b/04-algorithms_on_strings/02-burrows_wheeler/02-bwtinverse/bwtinverse.cpp @@ -8,18 +8,148 @@ using std::cout; using std::endl; using std::string; using std::vector; +using std::swap; -string InverseBWT(const string& bwt) { - string text = ""; +static bool debug = false; - // write your code here +class InverseBwt +{ +private: + vector<int> indexes; + const string &s; + int sz; + int starts[4] = {0, 0, 0, 0}; - return text; -} +public: + + InverseBwt(const string& text) : s(text), sz(text.size()) { + string ss = s; + quicksort(ss, 0, sz - 1); + + indexes.reserve(sz); + int count[] = {0, 0, 0, 0}; + + for (int i = 0; i < sz; i++) { + int c = 0; + switch(s[i]) { + case 'A': + c = count[0]; + count[0] += 1; + break; + case 'C': + c = count[1]; + count[1] += 1; + break; + case 'G': + c = count[2]; + count[2] += 1; + break; + case 'T': + c = count[3]; + count[3] += 1; + break; + default: + break; + } + indexes.push_back(c); + + switch(ss[i]) { + case 'A': + if (starts[0] == 0 ) + starts[0] = i; + break; + case 'C': + if (starts[1] == 0 ) + starts[1] = i; + break; + case 'G': + if (starts[2] == 0 ) + starts[2] = i; + break; + case 'T': + if (starts[3] == 0 ) + starts[3] = i; + break; + default: + break; + } + } + + if (debug) { + for (int i = 0; i < sz; i++) { + cout << ss[i] << ";" << s[i] << ";" << indexes[i] << endl; + } + cout << starts[0] << endl; + cout << starts[1] << endl; + cout << starts[2] << endl; + cout << starts[3] << endl; + } + } + + string solve() { + string ret = s; + + ret[sz - 1] = '$'; + for (int i = 0, j = sz - 2; j >= 0; j--) { + ret[j] = s[i]; + char ch = s[i]; + i = indexes[i]; + switch(ch) { + case 'A': + i += starts[0]; + break; + case 'C': + i += starts[1]; + break; + case 'G': + i += starts[2]; + break; + case 'T': + i += starts[3]; + break; + default: + break; + } + } + + return ret; + } + +private: + + void quicksort(string &s, int l, int r) { + if (l >= r) + return; + + int p = l + rand() % (r - l + 1); + swap(s[l], s[p]); + + char c = s[l]; + int i = l + 1; + int j = r; + int k = r; + while (i <= j) { + int C = s[j]; + if (C < c) { + swap(s[i], s[j]); + i++; + } else if (C > c) { + swap(s[j], s[k]); + k--; + j--; + } + else + j--; + } + + quicksort(s, l, i - 1); + quicksort(s, k + 1, r); + } +}; int main() { - string bwt; - cin >> bwt; - cout << InverseBWT(bwt) << endl; + string text; + cin >> text; + cout << InverseBwt(text).solve() << endl; return 0; } diff --git a/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp b/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp index 9353389..10551df 100644 --- a/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp +++ b/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp @@ -12,31 +12,113 @@ using std::map; using std::string; using std::vector; +static bool debug = false; + // Preprocess the Burrows-Wheeler Transform bwt of some text // and compute as a result: -// * starts - for each character C in bwt, starts[C] is the first position -// of this character in the sorted array of +// * starts - for each character C in bwt, starts[C] is the first position +// of this character in the sorted array of // all characters of the text. // * occ_count_before - for each character C in bwt and each position P in bwt, // occ_count_before[C][P] is the number of occurrences of character C in bwt // from position 0 to position P inclusive. -void PreprocessBWT(const string& bwt, - map<char, int>& starts, - map<char, vector<int> >& occ_count_before) { - // Implement this function yourself +void PreprocessBWT(const string& bwt, + map<char, int>& starts, + map<char, vector<int> >& occ_count_before) +{ + starts.insert(std::pair<char,int>('A', -1)); + starts.insert(std::pair<char,int>('C', -1)); + starts.insert(std::pair<char,int>('G', -1)); + starts.insert(std::pair<char,int>('T', -1)); + occ_count_before.insert(std::pair<char,vector<int>>('A', vector<int>(bwt.size() + 1))); + occ_count_before.insert(std::pair<char,vector<int>>('C', vector<int>(bwt.size() + 1))); + occ_count_before.insert(std::pair<char,vector<int>>('G', vector<int>(bwt.size() + 1))); + occ_count_before.insert(std::pair<char,vector<int>>('T', vector<int>(bwt.size() + 1))); + + int count[] = {0, 0, 0, 0, 0}; + for (int i = 0; i < bwt.size(); i ++) { + occ_count_before['A'][i] = count[0]; + occ_count_before['C'][i] = count[1]; + occ_count_before['G'][i] = count[2]; + occ_count_before['T'][i] = count[3]; + switch(bwt[i]) { + case 'A': + count[0] += 1; + break; + case 'C': + count[1] += 1; + break; + case 'G': + count[2] += 1; + break; + case 'T': + count[3] += 1; + break; + default: + break; + } + } + occ_count_before['A'][bwt.size()] = count[0]; + occ_count_before['C'][bwt.size()] = count[1]; + occ_count_before['G'][bwt.size()] = count[2]; + occ_count_before['T'][bwt.size()] = count[3]; + + int x = 1; + if (count[0] != 0) { + starts['A'] = x; + x += count[0]; + } + if (count[1] != 0) { + starts['C'] = x; + x += count[1]; + } + if (count[2] != 0) { + starts['G'] = x; + x += count[2]; + } + if (count[3] != 0) { + starts['T'] = x; + x += count[3]; + } + + if (debug) { + std::cout << bwt << std::endl; + std::cout << "starts : " << std::endl; + for (std::map<char,int>::iterator it = starts.begin(); it != starts.end(); it++) + std::cout << it->first << " => " << it->second << std::endl; + for (std::map<char,vector<int>>::iterator it = occ_count_before.begin(); it != occ_count_before.end(); it++) { + std::cout << it->first << " => "; + for (int i = 0; i <= bwt.size(); i++) + std::cout << it->second[i] << " "; + std::cout << std::endl; + } + std::cout << std::endl; + } } // Compute the number of occurrences of string pattern in the text // given only Burrows-Wheeler Transform bwt of the text and additional // information we get from the preprocessing stage - starts and occ_counts_before. -int CountOccurrences(const string& pattern, - const string& bwt, - const map<char, int>& starts, - const map<char, vector<int> >& occ_count_before) { - // Implement this function yourself - return 0; +int CountOccurrences(const string& pattern, + const string& bwt, + const map<char, int>& starts, + const map<char, vector<int> >& occ_count_before) +{ + for (int top = 0, bottom = bwt.size() - 1, i = pattern.size() - 1; top <= bottom; ) { + if (i >= 0) { + char c = pattern[i]; + i--; + int s = starts.at(c); + const vector<int> &v = occ_count_before.at(c); + top = s + v[top]; + bottom = s + v[bottom + 1] - 1; + } else + return bottom - top + 1; + } + + return 0; } - + int main() { string bwt; diff --git a/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp b/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp index 1dbbcc3..2e0ec6d 100644 --- a/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp +++ b/04-algorithms_on_strings/02-burrows_wheeler/04-suffix_array/suffix_array.cpp @@ -11,16 +11,74 @@ using std::make_pair; using std::pair; using std::string; using std::vector; +using std::swap; + +static void quicksort(const string &s, vector<int> &v, int x, int l, int r) +{ + if (l >= r) + return; + + int p = l + rand() % (r - l + 1); + swap(v[l], v[p]); + + char c = s[v[l] + x]; + int i = l + 1; + int j = r; + int k = r; + while (i <= j) { + int C = s[v[j] + x]; + if (C < c) { + swap(v[i], v[j]); + i++; + } else if (C > c) { + swap(v[j], v[k]); + k--; + j--; + } + else + j--; + } + + quicksort(s, v, x, l, i - 1); + quicksort(s, v, x, k + 1, r); +} + +static void sort(const string &s, vector<int> &v, int x, int l, int r) +{ + if (l >= r) + return; + + quicksort(s, v, x, l, r); + + char c = s[v[l] + x]; + int ll = l; + for (int i = l + 1; i <= r; i++) { + char C = s[v[i] + x]; + if (C != c) { + sort(s, v, x + 1, ll, i - 1); + ll = i; + c = C; + } + } + sort(s, v, x + 1, ll, r); +} // Build suffix array of the string text and // return a vector result of the same length as the text // such that the value result[i] is the index (0-based) // in text where the i-th lexicographically smallest // suffix of text starts. -vector<int> BuildSuffixArray(const string& text) { - vector<int> result; - // Implement this function yourself - return result; +vector<int> BuildSuffixArray(const string& text) +{ + int sz = text.size(); + vector<int> v; + v.reserve(text.size()); + for (int i = 0; i < sz; i++) + v.push_back(i); + + sort(text, v, 0, 0, sz - 1); + + return v; } int main() { |