后缀数组

后缀数组 (Suffix Array, SA) 是把字符串的所有 后缀字典序 排序后得到的数组。

定义:

  • 排名数组rkw(i)\operatorname{rk}_w(i):从第 ii 位开始、长度为 ww 的子串在所有长度为 ww 的子串中的排名;
  • 编号数组saw(i)\operatorname{sa}_w(i):长度为 ww 的子串中字典序第 ii 小的一个的开始位置,与排名数组互逆,即sa(rk(i))=i\operatorname{sa}(\operatorname{rk}(i))=irk(sa(i))=i\operatorname{rk}(\operatorname{sa}(i))=i

用倍增法在Θ(nlogn)\Theta(n\log n) 的时间内求出后缀数组:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
struct SA {
string T;
int n;
vector<int> sa;

SA() {
cin >> T;
init();
}

SA(string s) {
T = s;
init();
}

void init() { // 1-index
n = T.size();
T = ' ' + T;
sa.resize(n + 1);
iota(sa.begin(), sa.end(), 0);
vector<int> rk(n + 1 << 1);
for (int i = 1; i <= n; ++i) rk[i] = T[i];
for (int w = 1; w < n; w <<= 1) {
auto tmp = rk;
auto rp = [&](int x) { return make_pair(rk[x], rk[x + w]); };
sort(sa.begin(), sa.end(), [&](int x, int y) { return rp(x) < rp(y); });
for (int i = 1, p = 0; i <= n; ++i) tmp[sa[i]] = rp(sa[i - 1]) == rp(sa[i]) ? p : ++p;
rk = tmp;
}
}
}; // Suffix Array


void eachT() {
string s;
cin >> s;

SA sa(s);
for (int i = 1; i <= sa.n; ++i) {
printf("%d ", sa.sa[i]);
}
}

优化:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
struct SA {
int n;
vector<int> sa, rk, lc;
// lc 是 sa[i] 和 sa[i - 1] 的最长公共前缀长度
SA(const string& s) {
n = s.length();
sa.resize(n);
lc.resize(n - 1);
rk.resize(n);
iota(sa.begin(), sa.end(), 0);
sort(sa.begin(), sa.end(), [&](int a, int b) {return s[a] < s[b]; });
rk[sa[0]] = 0;
for (int i = 1; i < n; ++i)
rk[sa[i]] = rk[sa[i - 1]] + (s[sa[i]] != s[sa[i - 1]]);
int k = 1;
vector<int> tmp, cnt(n);
tmp.reserve(n);
while (rk[sa[n - 1]] < n - 1) {
tmp.clear();
for (int i = 0; i < k; ++i)
tmp.push_back(n - k + i);
for (auto i : sa)
if (i >= k) tmp.push_back(i - k);
fill(cnt.begin(), cnt.end(), 0);
for (int i = 0; i < n; ++i)
++cnt[rk[i]];
for (int i = 1; i < n; ++i)
cnt[i] += cnt[i - 1];
for (int i = n - 1; i >= 0; --i)
sa[--cnt[rk[tmp[i]]]] = tmp[i];
swap(rk, tmp);
rk[sa[0]] = 0;
for (int i = 1; i < n; ++i)
rk[sa[i]] = rk[sa[i - 1]] + (tmp[sa[i - 1]] < tmp[sa[i]] || sa[i - 1] + k == n || tmp[sa[i - 1] + k] < tmp[sa[i] + k]);
k *= 2;
}
for (int i = 0, j = 0; i < n; ++i) {
if (rk[i] == 0) {
j = 0;
}
else {
for (j -= j > 0; i + j < n && sa[rk[i] - 1] + j < n && s[i + j] == s[sa[rk[i] - 1] + j]; )
++j;
lc[rk[i] - 1] = j;
}
}
}
};

void eachT() {
string s;
cin >> s;

SA sa(s);
for (int i = 0; i < sa.n; ++i) {
printf("%d ", sa.sa[i]);
}
printf("\n0 ");
for (int i = 0; i < sa.n - 1; ++i) {
printf("%d ", sa.lc[i]);
}
printf("\n");
}

例题

求本质不同子串个数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
ll calc(const string& s) {
const int n = s.length();
SA sa(s);

ll res = n * (n + 1ll) / 2;
for (int i = 0; i < n - 1; i++) {
res -= sa.lc[i];
}
return res;
}

ll calc(const string& s) {
const int n = s.length();
SA sa(s);

ll res = 0;
for (int i = 0; i < n; i++) {
res += n - sa.sa[i] - (i == 0 ? 0 : sa.lc[i - 1]);
}
return res;
}

求在两个字符串中各取出一个子串使得这两个子串相同的方案数。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
ll calc(const string& s) {
const int n = s.length();
SA sa(s);

ll res = 0;
stack<int> stk;
vector<int> f(n);
for (int i = 0; i < n - 1; ++i) {
while (!stk.empty() && sa.lc[stk.top()] >= sa.lc[i]) {
stk.pop();
}
f[i] = stk.empty() ? sa.lc[i] * (i + 1)
: f[stk.top()] + sa.lc[i] * (i - stk.top());
stk.push(i);
res += f[i];
}

return res;
}

void eachT() {
string s, t;
cin >> s >> t;

string st = s + "#" + t;
cout << calc(st) - calc(s) - calc(t) << endl;
}

给一个字符串,每次只能从两边取,加入到答案,要求取出来之后答案的字典序最小。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
void eachT() {
int n;
string s;
cin >> n >> s;

string ss = s;
reverse(ss.begin(), ss.end());
ss = s + "#" + ss;

SA sa(ss);
vector<int> f(n), g(n);
for (int i = 0; i < n; ++i) {
f[i] = sa.rk[i];
g[i] = sa.rk[2 * n - i];
}

int l = 0, r = n - 1;
while (l <= r) {
if (f[l] < g[r]) {
cout << s[l++];
}
else {
cout << s[r--];
}
}
}

求最长的出现了kk 次的子串长度:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
void eachT() {
int n, k;
string s;
cin >> n >> k >> s;

SA sa(s);
deque<int> Q;
int res = 0;
for (int i = 0; i < n - 1; ++i) {
while (!Q.empty() && sa.lc[Q.back()] > sa.lc[i]) Q.pop_back();
Q.push_back(i);
if (Q.front() + (k - 1) <= i) Q.pop_front();
if (i >= k - 1) {
res = max(res, sa.lc[Q.front()]);
}
} // 单调队列求长度为k-1的min(lc)最大值

cout << res << '\n';
}

后缀自动机

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
struct SAM {
static constexpr int tot = 26;
struct Node {
int len;
int link;
array<int, tot> next;
Node() : len{}, link{}, next{} {}
};
vector<Node> t;
SAM() {
init();
}
void init() {
t.assign(2, Node());
t[0].next.fill(1);
t[0].len = -1;
}
int newNode() {
t.emplace_back();
return t.size() - 1;
}
int extend(int p, int c) {
if (t[p].next[c]) {
int q = t[p].next[c];
if (t[q].len == t[p].len + 1) {
return q;
}
int r = newNode();
t[r].len = t[p].len + 1;
t[r].link = t[q].link;
t[r].next = t[q].next;
t[q].link = r;
while (t[p].next[c] == q) {
t[p].next[c] = r;
p = t[p].link;
}
return r;
}
int cur = newNode();
t[cur].len = t[p].len + 1;
while (!t[p].next[c]) {
t[p].next[c] = cur;
p = t[p].link;
}
t[cur].link = extend(p, c);
return cur;
}
int extend(int p, char c, char offset = 'a') {
return extend(p, c - offset);
}

int next(int p, int x) {
return t[p].next[x];
}

int next(int p, char c, char offset = 'a') {
return next(p, c - 'a');
}

int link(int p) {
return t[p].link;
}

int len(int p) {
return t[p].len;
}

int size() {
return t.size();
}
};