Poison

1044. Longest Duplicate Substring

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
class Solution {
private static final int PRIME = Boolean.hashCode(true);

public String longestDupSubstring(String s) {
long[] prefixHashArray = new long[s.length() + 1]; // 当下标为 i 时,记录前 i 个字符组成的字符串的 hash 值
long[] factorArray = new long[s.length() + 1]; // 当下标为 i 时,记录当前所乘的因数的个数为 i
factorArray[0] = 1;

for (int i = 0; i < s.length(); i++) {
// 此时计算以 s[i] 结尾的字符串的 hash 值,对应前 i + 1 个字符
prefixHashArray[i + 1] = prefixHashArray[i] * PRIME + s.charAt(i);
factorArray[i + 1] = factorArray[i] * PRIME;
}

String answer = "";

// left 与 right 表示我们要寻找的字符串的长度范围:[left, right]
int left = 1, right = s.length();
while (left <= right) {
int mid = (left + right) >>> 1;
// 如果存在 mid 长度的重复字符串,则不断向更长的字符串逼近
String subStr = findDupSubString(s, mid, prefixHashArray, factorArray);
if (subStr.length() == mid) {
// 找到 mid 长度的字符串,尝试找更大长度的
left = mid + 1;

if (subStr.length() > answer.length()) {
answer = subStr;
}
} else {
// 不存在重复的 mid 长度的字符串,说明不存在比 mid 长度更长的字符串,所以减短长度,继续寻找
right = mid - 1;
}
}

return answer;
}

private String findDupSubString(String s, int length, long[] prefixHashArray, long[] factorArray) {
Set<Long> set = new HashSet<>();

// if i = 0, length = 10, lastIndex = 9, i <= 10 - 10
for (int i = 0; i <= s.length() - length; i++) {
int j = i + length - 1;

long hash = prefixHashArray[j + 1] - prefixHashArray[i] * factorArray[length];
if (set.contains(hash)) {
return s.substring(i, i + length);
} else {
set.add(hash);
}
}

return "";
}
}

注意重复子串的长度最长可达 3 × 104 - 1,容易出现哈希冲突,所以使用了 long 型的数组存储前缀字符串的哈希值。

Reference

1044. Longest Duplicate Substring