Suppose that DNA sequences of a species is a sequence that consist of A, C, T and G,and the length of sequences is a given integer n.
题目的大意就是给定一组DNA的子序列,要我们计算长度为N的DNA序列中不包含这些子序列的数量有多少个。多串匹配的问题我们首先构造AC自动机,{ACG, C}如下图所示:
实线表示的是Trie的部分,虚线表示的是Fail指针(root的fail指针指向自己,这里我们未画出)。AC自动机表示了根据输入文本应该进行的状态转移,我们可以看到红色的状态是危险的状态,是我们无论如何都想避免到达的。如果我们把每个节点对应不同输入的状态转移画出来,可以得到对应的有向图:
对于一个节点,我们需要把对应的child指针指向接受输入之后应该指的方向。比如对于图中的1节点,输入a之后会走fail指针回到根节点0,但是fail指针的移动是不消耗输入字符的,所以我们可以继续move到a。所以对应的children[a]指针就应该指向a自身。同理对于0节点的children[t]和children[g],我们把他们指向root而不是留为nullptr。因为现在我们是对每个节点对应的每个输入建图,而不是仅仅保有在AC自动机的所拥有的边。建这个图的过程并不复杂,在我们建fail指针的时候,,对于所有不同的输入:
- 如果其有对应的子节点children[c],那么我么已经有有对应的边连接状态转移
- 如果其没有对应的子节点children[c],我们直接把其连向对应fail指针的对应子节点fail->children[c]。这里我们把root的fail指针设为root,这样一层一层下来的话,其一定可以指向对应c输入之后需要转向的节点。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
//POJ 2778 | |
/* | |
It's well known that DNA Sequence is a sequence only contains A, C, T and G, and it's very useful to analyze a segment of DNA Sequence,For example, if a animal's DNA sequence contains segment ATC then it may mean that the animal may have a genetic disease. Until now scientists have found several those segments, the problem is how many kinds of DNA sequences of a species don't contain those segments. | |
Suppose that DNA sequences of a species is a sequence that consist of A, C, T and G,and the length of sequences is a given integer n. | |
Input | |
First line contains two integer m (0 <= m <= 10), n (1 <= n <=2000000000). Here, m is the number of genetic disease segment, and n is the length of sequences. | |
Next m lines each line contain a DNA genetic disease segment, and length of these segments is not larger than 10. | |
Output | |
An integer, the number of DNA sequences, mod 100000. | |
*/ | |
#include <cstdio> | |
#include <queue> | |
#include <string> | |
#include <queue> | |
#include <cstring> | |
const int MOD = 100000; | |
char s[15]; | |
int cnt = -1; | |
using namespace std; | |
int id(int c) | |
{ | |
switch(c) | |
{ | |
case 'A': | |
return 0; | |
case 'C': | |
return 1; | |
case 'G': | |
return 2; | |
case 'T': | |
return 3; | |
default: | |
return -1; | |
} | |
} | |
struct Node | |
{ | |
Node* fail, *children[4]; | |
bool isVirus; | |
int id; | |
Node() | |
{ | |
fail = NULL; | |
for(int i = 0; i < 4; ++i) | |
children[i] = NULL; | |
isVirus = false; | |
id = ++cnt; | |
} | |
}; | |
//Matrix represents state transfer | |
struct Matrix | |
{ | |
long long m[105][105]; | |
Matrix() | |
{ | |
memset(m, 0, sizeof(m)); | |
} | |
Matrix operator * (Matrix rhs) | |
{ | |
Matrix res; | |
for(int i = 0; i <= cnt; ++i) | |
for(int j = 0; j <= cnt; ++j) | |
for(int k = 0; k <= cnt; ++k) | |
res.m[i][j] = (res.m[i][j] + m[i][k] * rhs.m[k][j]) % MOD; | |
return res; | |
} | |
}matrix; | |
//Aho–Corasick algorithm | |
//AC automation | |
class ACAutomaton | |
{ | |
public: | |
ACAutomaton() | |
{ | |
root = new Node(); | |
nodes[cnt] = root; | |
} | |
void insert(const string& str) | |
{ | |
Node* curr = root; | |
int len = str.size(); | |
for(int i = 0; i < len; ++i) | |
{ | |
if(curr->children[id(str[i])] == NULL) | |
{ | |
curr->children[id(str[i])] = new Node(); | |
nodes[cnt] = curr->children[id(str[i])]; | |
} | |
curr = curr->children[id(str[i])]; | |
} | |
curr->isVirus = true; | |
} | |
//build failed pointer for each node | |
void buildFailed() | |
{ | |
//for current node c with edge e, we look for its parent's fail pointer f | |
//if f has the same char as e as its edge to chidlren c', we point c to c' | |
//if not, we recursively look for f's parent until it is root | |
queue<Node*> q; | |
for(int i = 0; i < 4; ++i) | |
{ | |
if(!root->children[i]) | |
root->children[i] = root; | |
else | |
{ | |
q.push(root->children[i]); | |
root->children[i]->fail = root; | |
} | |
} | |
while(q.size()) | |
{ | |
Node* curr = q.front(); | |
q.pop(); | |
for(int i = 0; i < 4; ++i) | |
{ | |
if(curr->children[i]) | |
{ | |
q.push(curr->children[i]); | |
Node* fail = curr->fail; | |
while(fail != root && !fail->children[i]) | |
fail = fail->fail; | |
if(fail->children[i]) | |
{ | |
//if current substring's suffix is virus | |
curr->children[i]->isVirus |= fail->children[i]->isVirus; | |
curr->children[i]->fail = fail->children[i]; | |
} | |
else | |
curr->children[i]->fail = root; | |
} | |
//if there is no such children, we still need to update the | |
//next node given the corresponding input char, which means | |
//we regrads fail move as a no-op(and it is since it doesn't | |
//require any input to make this move). Because we need to construct | |
//the state transfer matrix later | |
else | |
curr->children[i] = curr->fail->children[i]; | |
} | |
} | |
} | |
void buildMatrix() | |
{ | |
//for every valid state(node without isVirus flag), we want to construct | |
//its state transfer matrix. For each node, given input A,C,T,G, we want | |
//to know what state it ends withi only one move | |
for(int i = 0; i <= cnt; ++i) | |
{ | |
Node* curr = nodes[i]; | |
//we skip if start or end state is virus | |
if(curr->isVirus)continue; | |
for(int j = 0; j < 4; ++j) | |
{ | |
Node* next = curr->children[j]; | |
if(!next || next->isVirus)continue; | |
++matrix.m[i][next->id]; | |
} | |
} | |
} | |
private: | |
Node* root, *nodes[105]; | |
}; | |
//pow function of matrix, O(log n) | |
//Let's say we have the state transfer matrix m, where m[i][j] represents | |
//the way we can go to state j from state i with one input char. | |
//Then m^k represents the way we can go to state j from state i with k input char. | |
//Why? take m^2 as an example, if m's dimension is 3, we have | |
//m^2[0][1] = m[0][0] * m[0][1] + matrix[0][1] * matrix[1][1] + matrix[0][2] * matrix[2][1]... | |
//that is, we enumerate all possibilities of first transfer and corresponding second transfer | |
//and we calculate all the ways from 0 to 2 with 2 transfers. That is true for all k. | |
//So we have the graph which represents the AC automaton and the matrix which represents state transfer, | |
//what we want to know is the total ways from start state 0 to a valid end state with n transfers, which is | |
//m^k[0][0], m^k[0][1], m^k[0][2]... | |
//One thing we need to notice when build m is that we don't need to consider those invalid states. | |
Matrix pow(Matrix matrix, int n) | |
{ | |
Matrix res; | |
for(int i = 0; i <= cnt; ++i) | |
res.m[i][i] = 1; | |
while(n) | |
{ | |
if(n & 1) | |
res = res * matrix; | |
matrix = matrix * matrix; | |
n >>= 1; | |
} | |
return res; | |
} | |
int main() | |
{ | |
int m, n; | |
scanf("%d%d", &m, &n); | |
ACAutomaton ac; | |
for(int i = 1; i <= m; i++) | |
{ | |
scanf("%s", s); | |
ac.insert(s); | |
} | |
ac.buildFailed(); | |
ac.buildMatrix(); | |
Matrix res = pow(matrix, n); | |
int ans = 0; | |
for(int i = 0; i <= cnt; i++) | |
{ | |
ans += res.m[0][i]; | |
ans %= MOD; | |
} | |
printf("%d\n",ans); | |
return 0; | |
} |
No comments:
Post a Comment