HDU 5853 Jong Hyok and String（二分+后缀数组/广义后缀自动机）

原创

alpc_qleonardo 2022-08-25 11:05:46 ©著作权

文章标签 后缀自动机 LCP HDU 后缀数组 i++ 文章分类 虚拟化云计算

©著作权归作者所有：来自51CTO博客作者alpc_qleonardo的原创作品，请联系作者获取转载授权，否则将追究法律责任

Jong Hyok and String

Time Limit: 3000/1500 MS (Java/Others) Memory Limit: 65536/65536 K (Java/Others)
Total Submission(s): 795 Accepted Submission(s): 233

Problem Description

Jong Hyok loves strings. One day he gives a problem to his friend you. He writes down n strings Pi in front of you, and asks m questions. For the i-th question, there is a string Qi. We call strange set(s) = {(i, j) | s occurs in Pi and j is the position of its last character in the current occurrence}. And for the i-th question, you must answer the number of different strings t which satisfy strange set(Qi) = strange set(t) and t is a substring of at least one of the given n strings.

Input

First line contains T, a number of test cases.

For each test case, there are two numbers n, m, and then there are n strings Pi and m strings Qj (i = 1…n, j = 1…m).

1 <= T <= 10
1 <= n <= 100000
1 <= m<= 500000
1 <=|Pi|<=100000
1 <=|Qi|<=100000
$\sum_{i=1}^{n} |P_i| \le 100000$
File size is less than 3.5 megabytes.

Output

For each test case, first line contains a line “Case #x:”, x is the number of the case.

For each question, you should print one integer in one line.

Sample Input

1
2 2
aba
ab
a
ab

Sample Output

Hint

Author

金策工业综合大学（DPRK）

Source

2016 Multi-University Training Contest 9

大致题意，给你很多个串，然后给你很多个询问，每个询问对应给出一个子串，问有多少个子串在之前给的串中的出现位置与这个的出现位置完全相同。

包含询问串或者被包含( 询问串是该串的后缀或者该串是询问串的后缀)，而且出现次数与询问出现次数相同，则这个子串一定是满足条件的串。那么问题的关键就是如何求这些串的次数

由于询问串是所求串的后缀，为了利用后缀数组的性质，我们干脆把这两个东西都倒过来，于是就变成了询问串是所求串的前缀。把所有字符串倒着加入后缀数组，两个之间用不同的特殊字符拼接。这样子，我们就可以利用sa数组对后缀排的序，二分出所有前缀是询问串的编号的上下界。这些上下界包含串的个数也即询问串的出现次数，所以所求串的出现次数肯定也与这个次数相等。因此，所求串长度至多为上下界up、down的LCP，而这个LCP就是答案个数的上限。但是，你会发现，在这个长度范围内的子串出现次数可能不是恰好与询问串出现次数相同。因为可能与up后面或者down前面的LCP也是同一个长度，所以要减去这一部分。这一部分也很好计算，max(height[down],height[up+1])就是其最大长度，两个长度之差就是最后答案。

#include<bits/stdc++.h>
#define N 200010

using namespace std;

// Symbol sequence handed to the suffix array: main() writes every reversed
// pattern into it, follows each pattern with the separator value k+255
// (k = 1-based pattern index) and terminates the whole sequence with 0.
int st[N];
// d[i] = index in st of the last character of the pattern that position i
// belongs to. Only written for character positions, never for separators.
int d[N];
// Scratch buffer that receives each pattern / query token read by scanf("%s").
char s[N];

/*
 * Suffix array built by prefix doubling with counting sort, plus the height
 * (adjacent-LCP) array and a sparse table for range-minimum queries on it.
 *
 * Usage: ins(str, len); DA(); cal_height(); then read sa[] / h[] or call lcp().
 *
 * Preconditions the code relies on:
 *   - str[0..len] is readable and str[len] is a sentinel that is strictly
 *     smaller than every other symbol and occurs nowhere else; cmp() and
 *     cal_height() depend on it to stop before running past the end.
 *   - every symbol lies in [0, N) because it indexes the bucket array c[].
 *   - len + 1 < N (index n of h[] / dp[] is touched).
 */
struct Suffix_Array
{
    int dp[N][19],n;          // dp[j][i] = min of h[j .. min(n, j+2^i-1)]; n = len+1 (sentinel included)
    int sa[N],Rank[N],h[N];   // sa[r] = start of the r-th smallest suffix, Rank = inverse of sa,
                              // h[r] = LCP length of the suffixes ranked r-1 and r (r >= 1)
    int xx[N],yy[N],c[N]; int *s;   // xx/yy: rank and scratch buffers, c: counting-sort buckets, s: input symbols

    // True when positions x and y carry equal keys in s both at offset 0 and at
    // offset k, i.e. the two suffixes agree on their first 2k symbols.
    bool cmp(int *s,int x,int y,int k)
    {return (s[x]==s[y])&&(s[x+k]==s[y+k]);}

    // Registers the input: str holds len symbols followed by the sentinel at str[len].
    void ins(int *str,int len) {s=str;n=len+1;}

    // Largest lg with 2^lg <= v (0 for v <= 1). Integer replacement for
    // floor(log(v)/log(2)), which depends on floating-point rounding.
    static int floor_log2(int v)
    {
        int lg=0;
        while(lg<30&&(1<<(lg+1))<=v) lg++;
        return lg;
    }

    // Fills sa[0..n-1] with the start positions of all suffixes in sorted order.
    void DA()
    {
        int *x=xx,*y=yy,*t;
        int m=N,i;                       // m = exclusive upper bound of the current keys

        // Initial counting sort by first symbol.
        memset(c,0,sizeof(c));
        for(i=0;i<n;i++) x[i]=s[i];
        for(i=0;i<n;i++) c[x[i]]++;
        for(i=1;i<m;i++) c[i]+=c[i-1];
        for(i=n-1;i>=0;i--) sa[--c[x[i]]]=i;

        // Each round sorts by the first 2k symbols using the ranks of the first k.
        for(int k=1,tot=0;tot<n;k<<=1,m=tot)
        {
            // Only keys below m exist, so only that many buckets need clearing.
            memset(c,0,sizeof(int)*m);
            for(i=0;i<n;i++) c[x[i]]++;
            for(i=1;i<m;i++) c[i]+=c[i-1];

            // y = suffixes ordered by their second half: those with an empty
            // second half first, then the rest in current sa order.
            tot=0;
            for(i=n-k;i<n;i++) y[tot++]=i;
            for(i=0;i<n;i++) if (sa[i]>=k) y[tot++]=sa[i]-k;

            // Stable counting sort by first-half rank.
            for(i=n-1;i>=0;i--) sa[--c[x[y[i]]]]=y[i];

            // Swap buffers (y now holds the old ranks) and assign new ranks.
            t=x; x=y; y=t;
            x[sa[0]]=0;
            tot=1;
            for(i=1;i<n;i++)
                x[sa[i]]=cmp(y,sa[i-1],sa[i],k)?tot-1:tot++;
        }
    }

    // Computes Rank[], h[] and the sparse table dp[][]. Must run after DA().
    void cal_height()
    {
        for(int i=0;i<n;i++) Rank[sa[i]]=i;

        // Walk the suffixes in text order; the LCP with the predecessor in
        // sorted order drops by at most one between consecutive start positions.
        int k=0;
        for(int i=0;i<n-1;i++)
        {
            if(k) k--;
            int j=sa[Rank[i]-1];
            while(s[i+k]==s[j+k]) k++;
            h[Rank[i]]=k;
        }

        // Ranks 0 and n have no meaningful predecessor pair; give them a defined
        // value so the table below never copies a leftover from an earlier run.
        h[0]=0;
        h[n]=0;

        int levels=floor_log2(n);
        for(int i=1;i<=n;i++) dp[i][0]=h[i];
        for(int lv=1;lv<=levels;lv++)
        {
            int half=1<<(lv-1);
            for(int j=n;j;j--)
            {
                dp[j][lv]=dp[j][lv-1];
                if(j+half<=n) dp[j][lv]=std::min(dp[j][lv],dp[j+half][lv-1]);
            }
        }
    }

    // Minimum of h[l..r] (1 <= l <= r), i.e. the LCP length shared by the
    // suffixes ranked l-1 through r.
    int lcp(int l,int r)
    {
        int m=floor_log2(r-l+1);
        return std::min(dp[l][m],dp[r-(1<<m)+1][m]);
    }

} SA;

// Three-way comparison of the first `len` symbols of st starting at index x
// against the first `len` characters of the query buffer s.
// Returns 1 if st is larger at the first differing position, -1 if it is
// smaller, and 0 if all `len` positions are equal.
int cmp(int x,int len)
{
    int i=0;
    while(i<len)
    {
        const int lhs=st[x+i];
        const int rhs=s[i];
        if (lhs>rhs) return 1;
        if (lhs<rhs) return -1;
        ++i;
    }
    return 0;
}

// Suffix-array solution. Per test case: reverse every pattern, join them in st
// with distinct separators, build the suffix array, then answer each reversed
// query by binary-searching the block of suffixes that start with it.
int main()
{
    int remaining_cases;
    cin>>remaining_cases;
    int case_no=0;
    while(remaining_cases--)
    {
        int pattern_cnt,query_cnt;
        int total=0;                       // number of symbols written to st so far
        scanf("%d%d",&pattern_cnt,&query_cnt);
        for(int id=1;id<=pattern_cnt;++id)
        {
            scanf("%s",s);
            const int plen=strlen(s);
            reverse(s,s+plen);
            const int last=total+plen-1;   // index in st of this pattern's final character
            for(int i=0;s[i];++i)
            {
                d[total]=last;
                st[total]=s[i];
                ++total;
            }
            st[total++]=id+255;            // separator: above every char value, distinct per pattern
        }
        st[total]=0;                       // terminating sentinel
        SA.ins(st,total);
        SA.DA();
        SA.cal_height();
        printf("Case #%d:\n",++case_no);
        while(query_cnt--)
        {
            int down=-1,up=-1;             // first / last rank whose suffix starts with the query
            scanf("%s",s);
            const int qlen=strlen(s);
            reverse(s,s+qlen);

            // Lowest rank whose suffix has the query as a prefix.
            for(int lo=1,hi=total-1;lo<=hi;)
            {
                const int mid=(lo+hi)>>1;
                const int res=cmp(SA.sa[mid],qlen);
                if (res<0) {lo=mid+1;continue;}
                if (res==0) down=mid;
                hi=mid-1;
            }

            // Highest rank whose suffix has the query as a prefix.
            for(int lo=1,hi=total-1;lo<=hi;)
            {
                const int mid=(lo+hi)>>1;
                const int res=cmp(SA.sa[mid],qlen);
                if (res>0) {hi=mid-1;continue;}
                if (res==0) up=mid;
                lo=mid+1;
            }

            if (down==-1) {puts("0");continue;}

            int answer;
            if (up==down)
            {
                // Single occurrence: every length up to the end of the pattern
                // counts, minus what is shared with either neighbouring rank.
                const int pos=SA.sa[up];
                answer=d[pos]-pos+1-max(SA.h[up],SA.h[up+1]);
            }
            else
            {
                // Common prefix of the whole block minus what is also shared
                // with the rank just before or just after the block.
                answer=SA.lcp(down+1,up)-max(SA.h[down],SA.h[up+1]);
            }
            printf("%d\n",answer);
        }
    }
    return 0;
}

~~然后这道题也是可以用后缀自动机来做的，但是今天先到这，后面补。还有点小bug没调出来……~~

终于算是解决了……既然这题都说了出现位置和次数都一样，那么显然是right数组一样。根据这个性质，很容易发现，满足right数组一样的子串个数，很明显就是T[x].len-T[fa].len，其中x表示询问串在后缀自动机里面跑到的位置，然后fa表示其parent指针。原因的话也很简单，fa位置的以及再往前的串的出现次数会比x多，而fa对应的是出现次数严格更多的最长后缀，所以在fa和x之间，共有len之差个子串满足条件。

思路就是这样清晰明了，但是来说说我出现的bug。由于是第一次写多个串的后缀自动机，所以什么都不知道，直接把所有串拼成一个长串，然后直接往自动机里面加。但是，其实这么做是不对的，对于每个字符串，都是从root重新开始加入，如果某个点已经出现可以不重复添加。原理的话自己想想应该可以理解。然后这个东西有一个比较官方的名字——广义后缀自动机。具体的也没什么好说的，只不过是能够解决多个主串的问题，可以对多个串一起匹配。具体代码：

#include<bits/stdc++.h>
#define N 200010
using namespace std;

// Shared input buffer: main() reads every pattern and every query into it with scanf("%s").
char s[N];

/*
 * Generalized suffix automaton over the alphabet 'a'..'z'. Several strings are
 * inserted one after another; before each string the caller resets cur to the
 * root (state 1) and then feeds the characters through ins().
 *
 * State id 0 doubles as "no state": init() zeroes everything, so a transition
 * of 0 means "absent" and the root's suffix link is 0.
 * Each ins() call creates at most two states, so N must be at least twice the
 * total inserted length plus two.
 */
struct Suffix_Automation
{
    int tot,cur;                            // tot: highest state id in use, cur: state of the prefix inserted so far
    struct node{int ch[26],len,fa;} T[N];   // ch: transitions, len: longest string of the state, fa: suffix link

    // Resets the automaton to just the root (state 1).
    void init(){cur=tot=1;memset(T,0,sizeof(T));}

    // Clones q as a state of length T[p].len+1, hangs q below the clone and
    // redirects every x-transition into q along p's suffix-link chain to the
    // clone. Returns the clone's id.
    int split_state(int p,int q,int x)
    {
        int np=++tot;
        memcpy(T[np].ch,T[q].ch,sizeof(T[q].ch));
        T[np].len=T[p].len+1;
        T[np].fa=T[q].fa;
        T[q].fa=np;
        for(;p&&T[p].ch[x]==q;p=T[p].fa) T[p].ch[x]=np;
        return np;
    }

    // Appends character x (0..25) to the string currently being inserted.
    // `len` is the length of the prefix after appending; it is expected to
    // equal T[cur].len+1 and is kept only for call compatibility.
    void ins(int x,int len)
    {
        (void)len;
        int p=cur;
        if (T[p].ch[x])
        {
            // The extended prefix already exists in the automaton.
            int q=T[p].ch[x];
            if (T[q].len==T[p].len+1) {cur=q;return;}
            // q also holds longer strings: split it and continue from the clone,
            // so no unreachable placeholder state is allocated.
            cur=split_state(p,q,x);
            return;
        }
        cur=++tot;T[cur].len=T[p].len+1;
        for(;p&&!T[p].ch[x];p=T[p].fa) T[p].ch[x]=cur;
        if (!p) {T[cur].fa=1;return;}
        int q=T[p].ch[x];
        if (T[p].len+1==T[q].len) {T[cur].fa=q;return;}
        T[cur].fa=split_state(p,q,x);
    }

    // Walks s from the root. Returns 0 if s is not a substring of any inserted
    // string or contains a character outside 'a'..'z'; otherwise returns
    // len - len(suffix link) of the reached state, i.e. how many distinct
    // strings belong to that state.
    int match(const char *s)
    {
        int p=1;
        for(int i=0;s[i];i++)
        {
            int c=s[i]-'a';
            if (c<0||c>=26||!T[p].ch[c]) return 0;
            p=T[p].ch[c];
        }
        return T[p].len-T[T[p].fa].len;
    }

} SAM;

// Suffix-automaton solution. Per test case: insert every pattern into a fresh
// generalized automaton (restarting from the root for each pattern), then
// answer each query with SAM.match().
int main()
{
    int T_T,T=0;
    // All input goes through scanf; a failed read ends the program instead of
    // continuing with indeterminate values.
    if (scanf("%d",&T_T)!=1) return 0;
    while(T_T--)
    {
        int n,m;
        SAM.init();
        if (scanf("%d%d",&n,&m)!=2) return 0;
        while(n--)
        {
            if (scanf("%s",s)!=1) return 0;
            SAM.cur=1;                      // every pattern starts again at the root
            for(int i=0;s[i];i++)
                SAM.ins(s[i]-'a',i+1);
        }
        printf("Case #%d:\n",++T);
        while(m--)
        {
            if (scanf("%s",s)!=1) return 0;
            printf("%d\n",SAM.match(s));
        }
    }
    return 0;
}