后缀数组后缀数组 (Suffix Array, SA ) 是把字符串的所有 后缀 按 字典序 排序后得到的数组。
定义:
排名数组 rk w ( i ) \operatorname{rk}_w(i) r k w ( i ) :从第 i i i 位开始、长度为 w w w 的子串在所有长度为 w w w 的子串中的排名;编号数组 sa w ( i ) \operatorname{sa}_w(i) s a w ( i ) :长度为 w w w 的子串中字典序第 i i i 小的一个的开始位置,与排名数组互逆,即sa ( rk ( i ) ) = i \operatorname{sa}(\operatorname{rk}(i))=i s a ( r k ( i ) ) = i ,rk ( sa ( i ) ) = i \operatorname{rk}(\operatorname{sa}(i))=i r k ( s a ( i ) ) = i 。用倍增法在Θ ( n log n ) \Theta(n\log n) Θ ( n log n ) 的时间内求出后缀数组:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 struct SA { string T; int n; vector<int > sa; SA () { cin >> T; init (); } SA (string s) { T = s; init (); } void init () { n = T.size (); T = ' ' + T; sa.resize (n + 1 ); iota (sa.begin (), sa.end (), 0 ); vector<int > rk (n + 1 << 1 ) ; for (int i = 1 ; i <= n; ++i) rk[i] = T[i]; for (int w = 1 ; w < n; w <<= 1 ) { auto tmp = rk; auto rp = [&](int x) { return make_pair (rk[x], rk[x + w]); }; sort (sa.begin (), sa.end (), [&](int x, int y) { return rp (x) < rp (y); }); for (int i = 1 , p = 0 ; i <= n; ++i) tmp[sa[i]] = rp (sa[i - 1 ]) == rp (sa[i]) ? p : ++p; rk = tmp; } } }; void eachT () { string s; cin >> s; SA sa (s) ; for (int i = 1 ; i <= sa.n; ++i) { printf ("%d " , sa.sa[i]); } }
优化:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 struct SA { int n; vector<int > sa, rk, lc; SA (const string& s) { n = s.length (); sa.resize (n); lc.resize (n - 1 ); rk.resize (n); iota (sa.begin (), sa.end (), 0 ); sort (sa.begin (), sa.end (), [&](int a, int b) {return s[a] < s[b]; }); rk[sa[0 ]] = 0 ; for (int i = 1 ; i < n; ++i) rk[sa[i]] = rk[sa[i - 1 ]] + (s[sa[i]] != s[sa[i - 1 ]]); int k = 1 ; vector<int > tmp, cnt (n); tmp.reserve (n); while (rk[sa[n - 1 ]] < n - 1 ) { tmp.clear (); for (int i = 0 ; i < k; ++i) tmp.push_back (n - k + i); for (auto i : sa) if (i >= k) tmp.push_back (i - k); fill (cnt.begin (), cnt.end (), 0 ); for (int i = 0 ; i < n; ++i) ++cnt[rk[i]]; for (int i = 1 ; i < n; ++i) cnt[i] += cnt[i - 1 ]; for (int i = n - 1 ; i >= 0 ; --i) sa[--cnt[rk[tmp[i]]]] = tmp[i]; swap (rk, tmp); rk[sa[0 ]] = 0 ; for (int i = 1 ; i < n; ++i) rk[sa[i]] = rk[sa[i - 1 ]] + (tmp[sa[i - 1 ]] < tmp[sa[i]] || sa[i - 1 ] + k == n || tmp[sa[i - 1 ] + k] < tmp[sa[i] + k]); k *= 2 ; } for (int i = 0 , j = 0 ; i < n; ++i) { if (rk[i] == 0 ) { j = 0 ; } else { for (j -= j > 0 ; i + j < n && sa[rk[i] - 1 ] + j < n && s[i + j] == s[sa[rk[i] - 1 ] + j]; ) ++j; lc[rk[i] - 1 ] = j; } } } }; void eachT () { string s; cin >> s; SA sa (s) ; for (int i = 0 ; i < sa.n; ++i) { printf ("%d " , sa.sa[i]); } printf ("\n0 " ); for (int i = 0 ; i < sa.n - 1 ; ++i) { printf ("%d " , sa.lc[i]); } printf ("\n" ); }
例题求本质不同子串个数。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 ll calc (const string& s) { const int n = s.length (); SA sa (s) ; ll res = n * (n + 1ll ) / 2 ; for (int i = 0 ; i < n - 1 ; i++) { res -= sa.lc[i]; } return res; } ll calc (const string& s) { const int n = s.length (); SA sa (s) ; ll res = 0 ; for (int i = 0 ; i < n; i++) { res += n - sa.sa[i] - (i == 0 ? 0 : sa.lc[i - 1 ]); } return res; }
求在两个字符串中各取出一个子串使得这两个子串相同的方案数。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 ll calc (const string& s) { const int n = s.length (); SA sa (s) ; ll res = 0 ; stack<int > stk; vector<int > f (n) ; for (int i = 0 ; i < n - 1 ; ++i) { while (!stk.empty () && sa.lc[stk.top ()] >= sa.lc[i]) { stk.pop (); } f[i] = stk.empty () ? sa.lc[i] * (i + 1 ) : f[stk.top ()] + sa.lc[i] * (i - stk.top ()); stk.push (i); res += f[i]; } return res; } void eachT () { string s, t; cin >> s >> t; string st = s + "#" + t; cout << calc (st) - calc (s) - calc (t) << endl; }
给一个字符串,每次只能从两边取,加入到答案,要求取出来之后答案的字典序最小。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 void eachT () { int n; string s; cin >> n >> s; string ss = s; reverse (ss.begin (), ss.end ()); ss = s + "#" + ss; SA sa (ss) ; vector<int > f (n) , g (n) ; for (int i = 0 ; i < n; ++i) { f[i] = sa.rk[i]; g[i] = sa.rk[2 * n - i]; } int l = 0 , r = n - 1 ; while (l <= r) { if (f[l] < g[r]) { cout << s[l++]; } else { cout << s[r--]; } } }
求最长的出现了k k k 次的子串长度:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 void eachT () { int n, k; string s; cin >> n >> k >> s; SA sa (s) ; deque<int > Q; int res = 0 ; for (int i = 0 ; i < n - 1 ; ++i) { while (!Q.empty () && sa.lc[Q.back ()] > sa.lc[i]) Q.pop_back (); Q.push_back (i); if (Q.front () + (k - 1 ) <= i) Q.pop_front (); if (i >= k - 1 ) { res = max (res, sa.lc[Q.front ()]); } } cout << res << '\n' ; }
后缀自动机1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 struct SAM { static constexpr int tot = 26 ; struct Node { int len; int link; array<int , tot> next; Node () : len{}, link{}, next{} {} }; vector<Node> t; SAM () { init (); } void init () { t.assign (2 , Node ()); t[0 ].next.fill (1 ); t[0 ].len = -1 ; } int newNode () { t.emplace_back (); return t.size () - 1 ; } int extend (int p, int c) { if (t[p].next[c]) { int q = t[p].next[c]; if (t[q].len == t[p].len + 1 ) { return q; } int r = newNode (); t[r].len = t[p].len + 1 ; t[r].link = t[q].link; t[r].next = t[q].next; t[q].link = r; while (t[p].next[c] == q) { t[p].next[c] = r; p = t[p].link; } return r; } int cur = newNode (); t[cur].len = t[p].len + 1 ; while (!t[p].next[c]) { t[p].next[c] = cur; p = t[p].link; } t[cur].link = extend (p, c); return cur; } int extend (int p, char c, char offset = 'a' ) { return extend (p, c - offset); } int next (int p, int x) { return t[p].next[x]; } int next (int p, char c, char offset = 'a' ) { return next (p, c - 'a' ); } int link (int p) { return t[p].link; } int len (int p) { return t[p].len; } int size () { return t.size (); } };