summaryrefslogtreecommitdiffstats
path: root/04-algorithms_on_strings/02-burrows_wheeler/03-bwmatching/bwmatching.cpp
blob: 10551df1fead04ec1ecae3f9426fa040660b9319 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#include <algorithm>
#include <cstdio>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

using std::cin;
using std::istringstream;
using std::map;
using std::string;
using std::vector;

static bool debug = false;

// Preprocess the Burrows-Wheeler Transform bwt of some text
// and compute as a result:
//   * starts - for each character C in bwt, starts[C] is the first position
//       of this character in the sorted array of
//       all characters of the text.
//   * occ_count_before - for each character C in bwt and each position P in bwt,
//       occ_count_before[C][P] is the number of occurrences of character C in bwt
//       from position 0 to position P inclusive.
void PreprocessBWT(const string& bwt,
                   map<char, int>& starts,
                   map<char, vector<int> >& occ_count_before)
{
    starts.insert(std::pair<char,int>('A', -1));
    starts.insert(std::pair<char,int>('C', -1));
    starts.insert(std::pair<char,int>('G', -1));
    starts.insert(std::pair<char,int>('T', -1));
    occ_count_before.insert(std::pair<char,vector<int>>('A', vector<int>(bwt.size() + 1)));
    occ_count_before.insert(std::pair<char,vector<int>>('C', vector<int>(bwt.size() + 1)));
    occ_count_before.insert(std::pair<char,vector<int>>('G', vector<int>(bwt.size() + 1)));
    occ_count_before.insert(std::pair<char,vector<int>>('T', vector<int>(bwt.size() + 1)));

    int count[] = {0, 0, 0, 0, 0};
    for (int i = 0; i < bwt.size(); i ++) {
        occ_count_before['A'][i] = count[0];
        occ_count_before['C'][i] = count[1];
        occ_count_before['G'][i] = count[2];
        occ_count_before['T'][i] = count[3];
        switch(bwt[i]) {
            case 'A':
                count[0] += 1;
                break;
            case 'C':
                count[1] += 1;
                break;
            case 'G':
                count[2] += 1;
                break;
            case 'T':
                count[3] += 1;
                break;
            default:
                break;
        }
    }
    occ_count_before['A'][bwt.size()] = count[0];
    occ_count_before['C'][bwt.size()] = count[1];
    occ_count_before['G'][bwt.size()] = count[2];
    occ_count_before['T'][bwt.size()] = count[3];

    int x = 1;
    if (count[0] != 0) {
        starts['A'] = x;
        x += count[0];
    }
    if (count[1] != 0) {
        starts['C'] = x;
        x += count[1];
    }
    if (count[2] != 0) {
        starts['G'] = x;
        x += count[2];
    }
    if (count[3] != 0) {
        starts['T'] = x;
        x += count[3];
    }

    if (debug) {
        std::cout << bwt << std::endl;
        std::cout << "starts : " << std::endl;
        for (std::map<char,int>::iterator it = starts.begin(); it != starts.end(); it++)
            std::cout << it->first << " => " << it->second << std::endl;
        for (std::map<char,vector<int>>::iterator it = occ_count_before.begin(); it != occ_count_before.end(); it++) {
            std::cout << it->first << " => ";
            for (int i = 0; i <= bwt.size(); i++)
                std::cout << it->second[i] << " ";
            std::cout << std::endl;
        }
        std::cout << std::endl;
    }
}

// Compute the number of occurrences of string pattern in the text
// given only Burrows-Wheeler Transform bwt of the text and additional
// information we get from the preprocessing stage - starts and occ_counts_before.
int CountOccurrences(const string& pattern,
                     const string& bwt,
                     const map<char, int>& starts,
                     const map<char, vector<int> >& occ_count_before)
{
    for (int top = 0, bottom = bwt.size() - 1, i = pattern.size() - 1; top <= bottom; ) {
        if (i >= 0) {
            char c = pattern[i];
            i--;
            int s = starts.at(c);
            const vector<int> &v = occ_count_before.at(c);
            top = s + v[top];
            bottom = s + v[bottom + 1] - 1;
        } else
            return bottom - top + 1;
    }

    return 0;
}


int main() {
  string bwt;
  cin >> bwt;
  int pattern_count;
  cin >> pattern_count;
  // Start of each character in the sorted list of characters of bwt,
  // see the description in the comment about function PreprocessBWT
  map<char, int> starts;
  // Occurrence counts for each character and each position in bwt,
  // see the description in the comment about function PreprocessBWT
  map<char, vector<int> > occ_count_before;
  // Preprocess the BWT once to get starts and occ_count_before.
  // For each pattern, we will then use these precomputed values and
  // spend only O(|pattern|) to find all occurrences of the pattern
  // in the text instead of O(|pattern| + |text|).
  PreprocessBWT(bwt, starts, occ_count_before);
  for (int pi = 0; pi < pattern_count; ++pi) {
    string pattern;
    cin >> pattern;
    int occ_count = CountOccurrences(pattern, bwt, starts, occ_count_before);
    printf("%d ", occ_count);
  }
  printf("\n");
  return 0;
}