CF1535F String Distance

cf1535f,string,distance · 浏览次数 : 0

小编点评

The code generates a sequence of strings and processes each string using two search algorithms, `search1` and `search2`. **Algorithm 1: `search1`** - It traverses the string `a` and builds a 2D array `x_1` and `x_2` to record the left and right boundaries of substrings within the original string. - It also builds an array `y_1` and `y_2` to record the left and right boundaries of substrings within the reversed version of the string `a`. - It then combines these two arrays to determine the left and right boundaries of all substrings in the original string. **Algorithm 2: `search2`** - It traverses the string `a` and builds an array `y_1` and `y_2` to record the left and right boundaries of substrings within the original string. - It also builds an array `x_1` and `x_2` to record the left and right boundaries of substrings within the reversed version of the string `a`. - It then combines these two arrays to determine the left and right boundaries of all substrings in the original string. **Key points:** - The code uses a structure `Q` to store the left and right boundaries of substrings. - It employs two search algorithms, `search1` and `search2`, to determine the left and right boundaries of substrings. - The code also uses the `clear` function to reset the data structures for subsequent use. - It generates the final answer by adding the lengths of all substrings and applying some mathematical operations.

正文

$CF1535F\ \ String\ Distance$

题意

给 $n$ 个长度均为 $len$ 的字符串 $T_1,T_2,\dots T_n$，定义 $f(a,b)$ 为将 $a,b$ 排序后相等的最小排序次数，若无解则为 $1337$(这好像是个黑客用语)。求

\[\sum _{i=1}^{n}\sum _{j=i+1}^{n}f(T_i,T_j) \]

其中

\[n\times len \leq 2\times 10^5 \]

思路分析

~~按照神xrlong的思路模拟即可,%%%%%%~~

~~神说，两个 trie 树，二维偏序肆意维护就过了，不过他终于在我的无数发问下补充了一些细节%%%%%~~

首先发现 $f$ 是个诈骗函数，其值只可能为 $1,2,1337$，因为规定没有相同的串，字符集相同时最多对两个串都排一次序就完成了，所以我们计算 $f=1,2$ 和 $1337$ 的数量即可。

总操作数显然为

\[kobe=\sum _{i=1}^{n}\sum _{j=i+1}^{n}1= \frac{n\times (n-1)}{2} \]

故事的开始前，先学习一下 $nb$ 的二维数点:

类似于二维前缀和，只不过我们没有那么大的空间和时间开销，还是一样的思路:

\[ans(x_1,x_2,y_1,y_2)= sum(x_1-1,y_1-1)+sum(x_2,y_2)-sum(x_1-1,y_2)-sum(x_2,y_1-1) \]

然后我们考虑记录每个 $sum$ 的 $qx,qy,c.\ c$ 是该 $sum$ 是应加上还是减去的贡献。然后排序，树状数组扫描线维护即可。

q[++tot]={x1-1,y1-1,1};
q[++tot]={x2,y2,1};
q[++tot]={x1-1,y2,-1};
q[++tot]={x2,y1-1,-1};
sort(q+1,q+tot+1,[](Q a,Q b){return a.qx<b.qx;});
for(int i=1,j=1;i<=tot;i++){
    for(;lty[j].x<=q[i].qx&&j<=n;j++)
        add(lty[j].y);
    ans+=query(q[i].qy)*q[i].c;
}
复制

(建议先做一下P2163园丁的烦恼)

求$sum(f=1337)$

这个比较水，考虑对新读入的字符串桶排，然后塞到一个 $trie$ 树里，那么从根节点到每个叶子的路径即为一个字符集，记录字符集的个数和一些乱七八糟的东西，然后肆意求出。

CODE

//求sum(f=1337)
int trk8[N][27],numk8;
int c[30],cnt[N];
vector<int>k8he;
int belong[N];//标记叶子节点属于的字符集
int NUM;//字符集个数
vector<int >order[N];//按照字符集顺序遍历
void jjdw(char str[],int id){
    int p=0;for(int i=1;i<=26;++i)  c[i]=0;
    for(int i=0;i<len;++i)  c[str[i]-'a'+1]++;//桶排序
    for(int u=1;u<=26;u++){
        if(c[u]){
            for(int i=1;i<=c[u];++i){
                if(!trk8[p][u])  trk8[p][u]=++numk8;
                p=trk8[p][u];
            }
        }
    }
    if(!cnt[p]){
        k8he.push_back(p);
        belong[p]=++NUM;
    }
    order[belong[p]].push_back(id);
    lty[id].bel=belong[p];
    cnt[p]++;
}
int calc_k8(){//这个计算很好理解
    int kk=0,k8sum=n;
    for(int i=0;i<k8he.size()-1;++i){
        int p=k8he[i];
        k8sum-=cnt[p];
        if(k8sum>0)  kk+=cnt[p]*(k8sum);
    }
    return kk;
}
复制

求$sum(f=1)$

这是本题精髓，考虑一个串，找出其中所有的极长不下降子串(这里不是最长，我被坑惨了)，例如串 $abcdcbdfa$ 中的极长串是 $abcd,c,bdf,a$，对于每个子串，它会将把原串 $s$ 分成一个前缀 $s[1\dots l]$ 和后缀 $s[r\dots len]$，如果存在一个串 $t$，使得:

$s$ 和 $t$ 属于同一字符集，$belong[s]==belong[t]$.
$s,t$ 的前缀和后缀相等，$s[1\dots l]=t[1\dots l],s[r\dots len]=t[r\dots len]$

此时 $f(t,s)=1$(这很显然，只将 $t[l+1,r-1]$ 排序即可)

我们发现 $\sum _{i=1}^{n}\sum _{j=i+1}^{n}f(T_i,T_j)$ 实际就是两两字符串的 $f$，所以枚举每个字符串的每个极长不下降子串子串，求前缀和后缀和它分别相等的串的个数即可。

我们考虑对所有字符串正反建两遍 $trie$，对叶子节点进行编号 $1$ 到 $n$，那么每个串 $s$ 就可以表示为一个二元组 $(x,y)$，其中 $x$ 是 $s$ 在正 $trie$ 上结束的叶子节点编号,$y$ 同理。

我们发现:前缀相同的串在 $trie$ 树上叶子节点的编号是连续的，考虑将前缀在正 $trie$ 上跑，得到一个区间 $[l_1,r_1]$，再将后缀在反 $trie$ 上跑，得到另一个区间 $[l_2,r_2]$，然后在矩形 $[l_1,r_1,l_2,r_2]$ 二维数点即可。直观地，对于 $9$ 个字符串建出正 $trie$，如下图:

反 $trie$ 同理。

CODE

void ins1(char str[],int id){//正trie
    int p=0;
    for(int i=0;i<len;++i){
        int u=str[i]-'a'+1;
        if(!trie1[p][u])  trie1[p][u]=++num1;
        p=trie1[p][u];
    }
    flag1[p]=1;
    ed1[id]=p;
}
void ins2(char str[],int id){//反trie
    int p=0;
    for(int i=len-1;i>=0;i--){
        int u=str[i]-'a'+1;
        if(!trie2[p][u])  trie2[p][u]=++num2;
        p=trie2[p][u];
    }
    flag2[p]=1;
    ed2[id]=p;
}
void dfs1(int x){
    if(flag1[x]){
        l1[x]=r1[x]=++Id1;return;//给叶子结点编号
    }
    for(int i=1;i<=26;++i){
        int y=trie1[x][i];
        if(y){
            dfs1(y);
            if(!l1[x])  l1[x]=l1[y];
            l1[x]=min(l1[x],l1[y]);
            r1[x]=max(r1[x],r1[y]);
            //求该节点覆盖区间
        }
    }
}
void dfs2(int x){
    if(flag2[x]){
        l2[x]=r2[x]=++Id2;return;
    }
    for(int i=1;i<=26;++i){
        int y=trie2[x][i];
        if(y){
            dfs2(y);
            if(!l2[x])  l2[x]=l2[y];
            l2[x]=min(l2[x],l2[y]);
            r2[x]=max(r2[x],r2[y]);
        }
    }
}
复制

其中还有不少~~坑人的sm~~细节:

对于每个区间的计算，在线算的话单次 $O(len)$，我们考虑离线所有要查询的 $al,ar$，这里 $s[al,ar]$ 是一个极长不下降子串，考虑每个串所有的 $al$ 都在正 $trie$ 的一条链上($ar$ 同理)，然后就可以总复杂度 $O(n\times len)$ 水过去了。
我们要统计矩形 $[l_1,r_1,l_2,r_2]$ 内的点，这些点必须与当前串在同一字符集里，我们可以预处理出每个串所属的字符集，然后在数点的时候判断与当前字符集是否一致，也就是这样:

for(int i=1,j=1;i<=tot;i++){
    for(;lty[j].x<=q[i].qx&&j<=n;j++){
        if(lty[j].bel==id)
            add(lty[j].y);
    }
    count1+=query(q[i].qy)*q[i].c;
}
复制

但是这样显然是 $O(n^2)$ 的复杂度，狂 $T$ 不止 $\dots$

~~这是xrlong让我打他的trie树清空做法，我看着眼前二百多行代码很是不舍，然后就出来一个跑飞快的优化~~

一个 $nb$ 的优化，考虑将点集以字符集编号为第一关键字，横坐标第二关键字排序，这样可以去掉无用的遍历，也就是:

sort(lty+1,lty+n+1,[](Node a,Node b){
    if(a.bel==b.bel)  return a.x<b.x;
    return a.bel<b.bel;
});//玄学优化(不玄学)
for(int i=1;i<=n;i++){
    if(lty[i].bel!=lty[i-1].bel){
        ql[lty[i].bel]=i;
        qr[lty[i-1].bel]=i-1;
        //每段字符集对应的左右端点
    }
}
qr[lty[n].bel]=n;
//......
for(int id=1;id<=NUM;++id){
    //...
    for(int i=1,j=ql[id];i<=tot;++i){
        for(;lty[j].x<=q[i].qx&&j<=qr[id];++j)  add(lty[j].y);
        count1+=query(q[i].qy)*q[i].c;
    }
复制

这样就跑飞快了。

考虑跑完一个字符集，进行清空，我们当然不能 $O(n)$ 清空，那么:

你怎么加进去就怎么清空$\ \ \ \ \ \ \ $——$xrlong$

int C[N],topc;
void add(int x){
    C[++topc]=x;
    for(int i=x;i<=n;i+=lowbit(i))  tr[i]++;
}
void clear(){
    for(int x=1;x<=topc;x++){
        for(int i=C[x];i<=n;i+=lowbit(i)){
            if(!tr[i])  break;//剪个小枝
            tr[i]=0;
        }
    }
    topc=0;tot=0;
}
复制

每个字符串前缀后缀都相同的串，显然也包括它自己，所以最后要减去算了多少遍自己，即极长不下降子串个数。

求 $sum(f=2)$

小学生问题。

差不多了，拜谢 $xrlong$

~~也就二百多行，我码量大一定是因为我每个函数都写了两遍，dfs1,dfs2,ins1,ins2...~~

$AC\ \ Code$

#include<bits/stdc++.h>
using namespace std;
#define k8 1337
#define int long long
#define read read()
#define pt puts("")
inline int read
{
    int x=0,f=1;char c=getchar();
    while(c<'0'||c>'9') {if(c=='-')  f=-1;c=getchar();}
    while(c>='0'&&c<='9')   x=(x<<3)+(x<<1)+c-'0',c=getchar();
    return f*x;
}
void write(int x)
{
    if(x<0)  putchar('-'),x=-x;
    if(x>9)  write(x/10);
    putchar(x%10+'0');
    return;
}
const int N = 2e5+10;
int n;
int ans;
int len;
char s[N];
string T[N];
int trie1[N][27],trie2[N][27];
int l1[N],l2[N],r1[N],r2[N];//每个trie树上某节点覆盖的范围[l,r]
int num1,num2;
int Id1,Id2;//给叶子结点编号
int ed1[N],ed2[N];//记录每个串结束的节点
bool flag1[N],flag2[N];//标记为叶子节点
struct Node{
    int x,y;
    int bel;//所属字符集
}lty[N];
void ins1(char str[],int id){//正trie
    int p=0;
    for(int i=0;i<len;++i){
        int u=str[i]-'a'+1;
        if(!trie1[p][u])  trie1[p][u]=++num1;
        p=trie1[p][u];
    }
    flag1[p]=1;
    ed1[id]=p;
}
void ins2(char str[],int id){//反trie
    int p=0;
    for(int i=len-1;i>=0;i--){
        int u=str[i]-'a'+1;
        if(!trie2[p][u])  trie2[p][u]=++num2;
        p=trie2[p][u];
    }
    flag2[p]=1;
    ed2[id]=p;
}
void dfs1(int x){
    if(flag1[x]){
        l1[x]=r1[x]=++Id1;return;//给叶子结点编号
    }
    for(int i=1;i<=26;++i){
        int y=trie1[x][i];
        if(y){
            dfs1(y);
            if(!l1[x])  l1[x]=l1[y];
            l1[x]=min(l1[x],l1[y]);
            r1[x]=max(r1[x],r1[y]);
            //求该节点覆盖区间
        }
    }
}
void dfs2(int x){
    if(flag2[x]){
        l2[x]=r2[x]=++Id2;return;
    }
    for(int i=1;i<=26;++i){
        int y=trie2[x][i];
        if(y){
            dfs2(y);
            if(!l2[x])  l2[x]=l2[y];
            l2[x]=min(l2[x],l2[y]);
            r2[x]=max(r2[x],r2[y]);
        }
    }
}
//求sum(f=1337)
int trk8[N][27],numk8;
int c[30],cnt[N];
vector<int>k8he;
int belong[N];//标记叶子节点属于的字符集
int NUM;//字符集个数
vector<int >order[N];//按照字符集顺序遍历
void jjdw(char str[],int id){
    int p=0;for(int i=1;i<=26;++i)  c[i]=0;
    for(int i=0;i<len;++i)  c[str[i]-'a'+1]++;//桶排序
    for(int u=1;u<=26;u++){
        if(c[u]){
            for(int i=1;i<=c[u];++i){
                if(!trk8[p][u])  trk8[p][u]=++numk8;
                p=trk8[p][u];
            }
        }
    }
    if(!cnt[p]){
        k8he.push_back(p);
        belong[p]=++NUM;
    }
    order[belong[p]].push_back(id);
    lty[id].bel=belong[p];
    cnt[p]++;
}
int calc_k8(){//这个计算很好理解
    int kk=0,k8sum=n;
    for(int i=0;i<k8he.size()-1;++i){
        int p=k8he[i];
        k8sum-=cnt[p];
        if(k8sum>0)  kk+=cnt[p]*(k8sum);
    }
    return kk;
}
int kobe;
int L[N],R[N],top;
int x_1[N<<2],x_2[N<<2],y_1[N<<2],y_2[N<<2];
void search1(string a){//离线查询
    int p=0;
    for(int i=1,j=0;i<=top;++i){
        for(;j<L[i];j++){
            int u=a[j]-'a'+1;
            p=trie1[p][u];
        }
        x_1[i]=l1[p],x_2[i]=r1[p];
    }
}
void search2(string a){
    int p=0;
    for(int i=top,j=len-1;i;i--){
        for(;j>R[i];j--){
            int u=a[j]-'a'+1;
            p=trie2[p][u];
        }
        y_1[i]=l2[p],y_2[i]=r2[p];
    }
}
int tot;
struct Q{
    int qx,qy,c;
}q[N<<3];
int tr[N];
#define lowbit(i) (i&(-i))
int C[N],topc;
void add(int x){
    C[++topc]=x;
    for(int i=x;i<=n;i+=lowbit(i))  tr[i]++;
}
int query(int x){
    int res=0;
    for(int i=x;i;i-=lowbit(i)){
        res+=tr[i];
    }
    return res;
}
void clear(){
    for(int x=1;x<=topc;x++){
        for(int i=C[x];i<=n;i+=lowbit(i)){
            if(!tr[i])  break;
            tr[i]=0;
        }
    }
    topc=0;tot=0;
}
int ql[N],qr[N];
signed main()
{
    n=read;
    kobe=(n-1)*n/2;//总操作数
    for(int i=1;i<=n;++i){
        scanf(" %s",s);
        T[i]=string(s);
        if(!len) len=strlen(s);
        ins1(s,i);ins2(s,i);//正反建trie
        jjdw(s,i);
    }
    dfs1(0);dfs2(0);//编号
    int ck8=calc_k8();//1337的个数
    ans=k8*ck8;
    kobe-=ck8;
    if(!kobe){
        write(ans);return 0;
    }
    for(int i=1;i<=n;++i){
        lty[i].x=l1[ed1[i]];
        lty[i].y=l2[ed2[i]];
    }//每个串对应的坐标
    sort(lty+1,lty+n+1,[](Node a,Node b){
        if(a.bel==b.bel)  return a.x<b.x;
        return a.bel<b.bel;
    });//玄学优化(不玄学)
    for(int i=1;i<=n;i++){
        if(lty[i].bel!=lty[i-1].bel){
            ql[lty[i].bel]=i;
            qr[lty[i-1].bel]=i-1;
        }
    }
    qr[lty[n].bel]=n;//处理每个字符集左右边界
    int al,ar;
    int count1=0;
    string a;
    int self=n;//减去重复
    for(int id=1;id<=NUM;++id){
        for(int i:order[id]){
            al=0;a=T[i];
            for(ar=1;ar<len;++ar){//统计每个极长子串
                if(a[ar]<a[ar-1]){
                    L[++top]=al,R[top]=ar-1;//离线下来一遍求，否则会T
                    al=ar;++self;
                }
            }
            ar--;
            L[++top]=al,R[top]=ar;
            search1(a);search2(a);
            for(int i=1;i<=top;++i){
                int x1=x_1[i],x2=x_2[i],y1=y_1[i],y2=y_2[i];
                q[++tot]={x1-1,y1-1,1};
                q[++tot]={x2,y2,1};
                q[++tot]={x1-1,y2,-1};
                q[++tot]={x2,y1-1,-1};
            }//离线二维数点
            top=0;
        }
        sort(q+1,q+tot+1,[](Q a,Q b){return a.qx<b.qx;});
        for(int i=1,j=ql[id];i<=tot;++i){
            for(;lty[j].x<=q[i].qx&&j<=qr[id];++j)  add(lty[j].y);
            count1+=query(q[i].qy)*q[i].c;
        }
        clear();
    }
    count1-=self;
    ans+=count1;
    kobe-=count1;
    ans+=kobe*2;
    write(ans);
    return 0;
}
复制

唉，我是 $fw$，感觉这个题真是我现在码力的极限了...

与CF1535F String Distance相似的内容：

CF1535F String Distance

$CF1535F\ \ String\ Distance$ 题意给 $n$ 个长度均为 $len$ 的字符串 $T_1,T_2,\dots T_n$，定义 $f(a,b)$ 为将 $a,b$ 排序后相等的最小排序次数，若无解则为 $1337$(这好像是个黑客用语)。求