diff options
Diffstat (limited to '04-algorithms_on_strings/01-suffix_trees')
5 files changed, 481 insertions, 41 deletions
diff --git a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp index 08cfe9b..d5f9483 100644 --- a/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/01-trie/trie.cpp @@ -12,7 +12,27 @@ typedef vector<edges> trie;  trie build_trie(vector<string> & patterns) {    trie t; -  // write your code here + +  size_t node; +  size_t n = 1; +  t.push_back(edges()); + +  for (size_t i = 0; i < patterns.size(); i++) { +      node = 0; +      const string& pattern = patterns[i]; +      for (size_t j = 0; j < pattern.size(); j++) { +          char c = pattern[j]; +          int next = t[node][c]; +          if (next != 0) { +              node = next; +          } else { +              t[node][c] = n; +              node = n; +              t.push_back(edges()); +              n += 1; +          } +      } +  }    return t;  } @@ -34,4 +54,4 @@ int main() {    }    return 0; -}
\ No newline at end of file +} diff --git a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp index a59a0db..2517d81 100644 --- a/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/02-trie_matching/trie_matching.cpp @@ -23,27 +23,96 @@ struct Node  	{  	    return (next[0] == NA && next[1] == NA && next[2] == NA && next[3] == NA);  	} + +    void print() +    { +        if (isLeaf()) +            cout << "---- " << endl; +        else +            cout << "A:" << next[0] << " C:" << next[1] +            << " G:" << next[2] << " T:" << next[3] << endl; +    }  }; -int letterToIndex (char letter) +struct Trie  { -	switch (letter) -	{ -		case 'A': return 0; break; -		case 'C': return 1; break; -		case 'G': return 2; break; -		case 'T': return 3; break; -		default: assert (false); return -1; -	} -} +    vector<Node> nodes; + +    Trie(const vector <string>& patterns) +    { +        int v = 1; +        nodes.push_back(Node()); + +        for (int i = 0; i < patterns.size(); i++) { +            int n = 0; +            const string& pattern = patterns[i]; +            for (int j = 0; j < pattern.size(); j++) { +                int c = letterToIndex(pattern[j]); +                int next = nodes[n].next[c]; +                if (next != NA) { +                    n = next; +                } else { +                    nodes[n].next[c] = v; +                    n = v; +                    nodes.push_back(Node()); +                    v += 1; +                } +            } +        } +    } + +    int letterToIndex (char letter) +    { +        switch (letter) +        { +            case 'A': return 0; break; +            case 'C': return 1; break; +            case 'G': return 2; break; +            case 'T': return 3; break; +            default: assert (false); return -1; +        } +    } + +    void print() +    { +        for (int i = 0; i < nodes.size(); i++) { +            cout << i << " " ; +            nodes[i].print(); +        } +    } + +    bool prefixMatch(const string& text) +    { +        int n = 0; + +        for( int i = 0; i < text.size(); i++) { +            int next = nodes[n].next[letterToIndex(text[i])]; +            if (next == NA) +                break; +            n = next; +        } + +        return nodes[n].isLeaf(); +    } + +    void match(const string& text, vector<int>& result) +    { +        for (int i = 0; i < text.size(); i++) { +            if (prefixMatch(&text[i])) +                result.push_back(i); +        } +    } +};  vector <int> solve (const string& text, int n, const vector <string>& patterns)  { -	vector <int> result; +    vector <int> result; -	// write your code here +    Trie t(patterns); +    /* t.print(); */ +    t.match(text, result); -	return result; +    return result;  }  int main (void) diff --git a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp index c68c2f6..c31456e 100644 --- a/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/03-trie_matching_extended/trie_matching_extended.cpp @@ -20,27 +20,96 @@ struct Node  		fill (next, next + Letters, NA);  		patternEnd = false;  	} + +    void print() +    { +        cout << patternEnd << "  A:" << next[0] << " C:" << next[1] +            << " G:" << next[2] << " T:" << next[3] << endl; +    }  }; -int letterToIndex (char letter) +struct Trie  { -	switch (letter) -	{ -		case 'A': return 0; break; -		case 'C': return 1; break; -		case 'G': return 2; break; -		case 'T': return 3; break; -		default: assert (false); return -1; -	} -} +    vector<Node> nodes; + +    Trie(const vector <string>& patterns) +    { +        int v = 1; +        nodes.push_back(Node()); + +        for (int i = 0; i < patterns.size(); i++) { +            int n = 0; +            const string& pattern = patterns[i]; +            for (int j = 0; j < pattern.size(); j++) { +                int c = letterToIndex(pattern[j]); +                int next = nodes[n].next[c]; +                if (next == NA) { +                    nodes[n].next[c] = v; +                    n = v; +                    nodes.push_back(Node()); +                    v += 1; +                } else +                    n = next; +            } +            nodes[n].patternEnd = true; +        } +    } + +    int letterToIndex (char letter) +    { +        switch (letter) +        { +            case 'A': return 0; break; +            case 'C': return 1; break; +            case 'G': return 2; break; +            case 'T': return 3; break; +            default: assert (false); return -1; +        } +    } + +    void print() +    { +        for (int i = 0; i < nodes.size(); i++) { +            cout << i << " " ; +            nodes[i].print(); +        } +    } + +    bool prefixMatch(const string& text) +    { +        int n = 0; + +        for( int i = 0; i < text.size(); i++) { +            Node &node = nodes[n]; +            if (node.patternEnd) +                break; +            int next = node.next[letterToIndex(text[i])]; +            if (next == NA) +                break; +            n = next; +        } + +        return nodes[n].patternEnd; +    } + +    void match(const string& text, vector<int>& result) +    { +        for (int i = 0; i < text.size(); i++) { +            if (prefixMatch(&text[i])) +                result.push_back(i); +        } +    } +}; -vector <int> solve (string text, int n, vector <string> patterns) +vector <int> solve (const string& text, int n, const vector <string>& patterns)  { -	vector <int> result; +    vector <int> result; -	// write your code here +    Trie t(patterns); +    /* t.print(); */ +    t.match(text, result); -	return result; +    return result;  }  int main (void) diff --git a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp index 5d9e2f1..96fdf83 100644 --- a/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/04-suffix_tree/suffix_tree.cpp @@ -1,7 +1,10 @@  #include <iostream> +#include <algorithm> +#include <cassert>  #include <map>  #include <string>  #include <vector> +#include <stack>  using std::cin;  using std::cout; @@ -9,14 +12,140 @@ using std::endl;  using std::map;  using std::string;  using std::vector; +using std::stack; -// Build a suffix tree of the string text and return a vector -// with all of the labels of its edges (the corresponding  -// substrings of the text) in any order. -vector<string> ComputeSuffixTreeEdges(const string& text) { -  vector<string> result; -  // Implement this function yourself -  return result; +int const Letters =    5; +int const NA      =   -1; + +struct Node +{ +    int from; +    int len; +    int childs [Letters]; + +	Node () : from(0), len(0) +	{ +        std::fill(childs, childs + Letters, NA); +	} + +    Node(int f, int l) : from(f), len(l) +    { +        std::fill(childs, childs + Letters, NA); +    } + +    void print(const string& text) +    { +        std::cout << from << ";" << len << " "; +        for(int i = from; i < (from + len); i++) +            std::cout << text[i]; +        std::cout << " - A:" << childs[0] << " C:" << childs[1] +            << " G:" << childs[2] << " T:" << childs[3] << " $:" << childs[4]<< std::endl; +    } +}; + +struct SuffixTree +{ +    const string& text; +    vector<Node> nodes; + +    SuffixTree(const string& t) : text(t) +    { +        nodes.push_back(Node()); +        for (int i = 0; i < t.size(); i++) +            add(i, text.size() - i, 0); +    } + +    void add(int from, int len, int into) +    { +        int i = toInt(text[from]); +        int n = nodes[into].childs[i]; +        if (n == NA) { +            nodes.push_back(Node(from, len)); +            nodes[into].childs[i] = nodes.size() - 1; +        } else { +            insert(from, len, n, into); +        } +    } + +    void insert(int from, int len, int n, int parent) +    { +        Node &node = nodes[n]; + +        int i = from; +        int j = node.from; +        int to = from + std::min(len, node.len); + +        for( ; (text[i] == text[j] && i < to); i++, j++) { } + +        if (i < to) { +            int matchLen = j - node.from; +            int s = nodes.size(); +            // new match node +            nodes.push_back(Node(node.from, matchLen)); +            // shrink n to mismatch part +            nodes[n].from = j; +            nodes[n].len = nodes[n].len - matchLen; +            // parent[from] -> match[j] -> n +            nodes[parent].childs[toInt(text[from])] = s; +            nodes[s].childs[toInt(text[j])] = n; +            // new mismatch node +            nodes.push_back(Node(i, len - matchLen)); +            // match[i] -> mismatch +            nodes[s].childs[toInt(text[i])] = s + 1; +        } else if (i == to) { +            add(i, len - i + from, n); +        } else if (i == from) { +            std::cout << "missmatch at 0 !!!!!!!!!!!!!!!!!!" << std::endl; +            nodes[n].print(text); +            std::cout << &text[from]<< std::endl; +            std::cout << &text[i]<< std::endl; +        } +    } + +    void travers(vector<string> &r) +    { +        stack<int> s; +        s.push(0); +        while (!s.empty()) { +            int n = s.top(); +            s.pop(); +            Node &node = nodes[n]; + +            if (n != 0) +                r.push_back(text.substr(node.from, node.len)); + +            for(int i = 0; i < Letters; i++) { +                int nn = node.childs[i]; +                if (nn != NA) +                    s.push(nn); +            } +        } +    } + +    int toInt(char c) +    { +        switch (c) +        { +            case 'A': return 0; break; +            case 'C': return 1; break; +            case 'G': return 2; break; +            case 'T': return 3; break; +            case '$': return 4; break; +            default: assert (false); return -1; +        } +    } + + +}; + +vector<string> ComputeSuffixTreeEdges(const string& text) +{ +    vector<string> result; + +    SuffixTree t(text); +    t.travers(result); + +    return result;  }  int main() { diff --git a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp index 2f2d98f..4c33d05 100644 --- a/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp +++ b/04-algorithms_on_strings/01-suffix_trees/05-non_shared_substring/non_shared_substring.cpp @@ -1,13 +1,168 @@  #include <iostream>  #include <string> +#include <vector> +#include <stack> +#include <queue> +#include <limits> +#include <cassert>  using namespace std; -string solve (string p, string q) +int const Letters =    6; +int const NA      =   -1; + +struct Node +{ +    int from; +    int len; +    int childs [Letters]; +    bool marked; +    int realFrom; + +	Node () : from(0), len(0), marked(false), realFrom (-1) +	{ +        fill(childs, childs + Letters, NA); +	} + +    Node(int f, int l, bool t, int r) : from(f), len(l), marked(t), realFrom(r) +    { +        fill(childs, childs + Letters, NA); +    } +}; + +struct SuffixTree +{ +    const string& text; +    const int separator; +    vector<Node> nodes; + +    SuffixTree(const string& t, int s) : text(t), separator(s) +    { +        nodes.push_back(Node()); + +        for (int i = 0; i < separator; i++) +            add(i, separator + 1 - i, 0, false, i); + +        add(s, 1, 0, true, s); + +        for (int i = s; i < t.size(); i++) +            add(i, text.size() - i, 0, true, i); +    } + +    void add(int from, int len, int parent, bool marked, int realFrom) +    { +        int i = toInt(text[from]); +        int n = nodes[parent].childs[i]; + +        if (n == NA) { +            nodes.push_back(Node(from, len, marked, realFrom)); +            nodes[parent].childs[i] = nodes.size() - 1; +        } else { +            Node &node = nodes[n]; + +            int i = from; +            int j = node.from; +            int to = from + min(len, node.len); + +            for( ; (text[i] == text[j] && i < to); i++, j++) { } + +            if (i < to) { +                int matchLen = j - node.from; +                int s = nodes.size(); +                // new match node +                nodes.push_back(Node(node.from, matchLen, marked, node.realFrom)); +                // shrink n to mismatch part +                nodes[n].from = j; +                nodes[n].len = nodes[n].len - matchLen; +                // parent[from] -> match[j] -> n +                nodes[parent].childs[toInt(text[from])] = s; +                nodes[s].childs[toInt(text[j])] = n; +                // new mismatch node +                int l = (len - matchLen); +                nodes.push_back(Node(i, l, marked, realFrom)); +                // match[i] -> mismatch +                nodes[s].childs[toInt(text[i])] = s + 1; +            } else if (i == to) { +                node.marked = marked; +                add(i, len - i + from, n, marked, realFrom); +            } else { +                assert (false); +            } +        } +    } + +    int toInt(char c) +    { +        switch (c) +        { +            case 'A': return 0; break; +            case 'C': return 1; break; +            case 'G': return 2; break; +            case 'T': return 3; break; +            case '#': return 4; break; +            case '$': return 5; break; +            default: assert (false); return -1; +        } +    } + +    void graph() +    { +        cout << "digraph G {\nlabel=\"" << text << endl; +        for (int i = 0; i < text.size(); i++) +            cout << i << " "; +        cout << "\"" << endl; +        for (int i = 0; i < nodes.size(); i++) { +            Node &n = nodes[i]; +            cout << i << " [ label=\" " << i  << endl; +            for(int j = n.from; j < (n.from + n.len); j++) +                cout << text[j]; +            cout << endl << n.from - n.realFrom << "\""; +            if (!n.marked) cout << " color=red"; +            cout << " ]" << endl; + +            for (int j = 0; j < Letters; j++) { +                if (n.childs[j] != NA) +                    cout << i << " -> " << n.childs[j] << endl; +            } +        } +        cout << "}" << endl; +    } + +    void solve(string &r) +    { +        int x = 0; +        int d = numeric_limits<int>::max(); + +        for (int i = 0; i < nodes.size(); i++) { +            Node &n = nodes[i]; +            if (!n.marked && n.len > 0  && text[n.from] != '#') { +                int len = n.from - n.realFrom + 1; +                if (len < d) { +                    d = len; +                    x = i; +                } +            } +        } + +        if (x == 0) +            cout << r; +        else +            cout << text.substr(nodes[x].realFrom, d) << '\n'; +    } + +}; + +void solve (string p, string q)  { +    string s = p + "#" + q + "$";  	string result = p; -	return result; +    SuffixTree t(s, p.size()); +#if 0 +    t.graph(); +#else +    t.solve(p); +#endif  }  int main (void) @@ -17,9 +172,7 @@ int main (void)  	string q;  	cin >> q; -	string ans = solve (p, q); - -	cout << ans << endl; +	solve (p, q);  	return 0;  }  | 
