Poison

187. Repeated DNA Sequences

Trie
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
class Solution {
private static class Trie {
private final TrieNode root;

public Trie() {
this.root = new TrieNode();
}
}

private static class TrieNode {
private boolean end;
private final TrieNode[] children;

public TrieNode() {
this.end = false;
this.children = new TrieNode[4];
}

public TrieNode getChild(char c) {
return children[charToIndex(c)];
}

private int charToIndex(char c) {
switch (c) {
case 'A':
return 0;
case 'C':
return 1;
case 'G':
return 2;
case 'T':
return 3;
default:
throw new IllegalArgumentException();
}
}

public boolean isEnd() {
return end;
}

public void setEnd(boolean end) {
this.end = end;
}

public TrieNode addChild(char c) {
TrieNode child = new TrieNode();
children[charToIndex(c)] = child;
return child;
}
}

public List<String> findRepeatedDnaSequences(String s) {
Set<String> set = new HashSet<>(); // 注意使用 Trie 树时可能多次到达树中节点,故用 set 去重

Trie trie = new Trie();
for (int i = 0; i < s.length() - 9; i++) {
addCharToTrie(s, i, trie, set);
}

return new ArrayList<>(set);
}

private void addCharToTrie(String s, int i, Trie trie, Set<String> set) {
TrieNode node = trie.root;

for (int j = i; j < i + 10; j++) {
char c = s.charAt(j);
TrieNode childNode = node.getChild(c);
if (childNode != null) {
node = childNode;
} else {
node = node.addChild(c);
}
}

if (node.isEnd()) {
set.add(s.substring(i, i + 10));
} else {
node.setEnd(true);
}
}
}
Hash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
class Solution {
private int hash(char c) {
switch (c) {
case 'A':
return 0b00;
case 'C':
return 0b01;
case 'G':
return 0b10;
case 'T':
return 0b11;
default:
throw new IllegalArgumentException();
}
}

public List<String> findRepeatedDnaSequences(String s) {
List<String> list = new ArrayList<>();
if (s.length() <= 10) {
return list;
}

int hash = 0;
for (int i = 0; i < 10; i++) {
hash = (hash << 2) | hash(s.charAt(i));
}

Map<Integer, Integer> hashToCountMap = new HashMap<>();
hashToCountMap.put(hash, 1);

for (int i = 10; i < s.length(); i++) {
hash = ((hash << 2) | hash(s.charAt(i))) & 0b11111111111111111111;
int count = hashToCountMap.getOrDefault(hash, 0);
if (count == 1) {
list.add(s.substring(i - 9, i + 1));
}

hashToCountMap.put(hash, count + 1);
}

return list;
}
}
Rolling Hash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
class Solution {
private static final int PRIME = Boolean.hashCode(true);

public List<String> findRepeatedDnaSequences(String s) {
List<String> answerList = new ArrayList<>();

int[] prefixHashArray = new int[s.length() + 1]; // 前 i 个字符组成的字符串对应的 hash 值数组
int[] factorArray = new int[s.length() + 1];
factorArray[0] = 1;

// 计算出截止至下标为 i 的字符(含)的前缀字符串哈希
for (int i = 0; i < s.length(); i++) {
prefixHashArray[i + 1] = prefixHashArray[i] * PRIME + s.charAt(i);
factorArray[i + 1] = factorArray[i] * PRIME;
}

Map<Integer, Integer> hashToCountMap = new HashMap<>();
for (int j = 9; j < s.length(); j++) {
int i = j - 9;
int hash = prefixHashArray[j + 1] - prefixHashArray[i] * factorArray[10];
int count = hashToCountMap.getOrDefault(hash, 0);
if (count == 1) {
answerList.add(s.substring(i, j + 1));
}
hashToCountMap.put(hash, count + 1);
}

return answerList;
}
}
Rolling Hash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
class Solution {
private static final long PRIME = Boolean.hashCode(true);

public List<String> findRepeatedDnaSequences(String s) {
if (s.length() <= 10) {
return Collections.emptyList();
}

List<String> seqList = new ArrayList<>();
Map<Long, Integer> hashToFeqMap = new HashMap<>();
long hash = 0;
long product = 1; // target: PRIME ^ 9
for (int i = 0; i < 10; i++) {
if (i != 0) {
product *= PRIME;
}
hash = hash * PRIME + s.charAt(i);
}
hashToFeqMap.put(hash, 1);

for (int i = 10; i < s.length(); i++) {
hash -= s.charAt(i - 10) * product;
hash = hash * PRIME + s.charAt(i);
int feq = hashToFeqMap.getOrDefault(hash, 0);
if (feq == 1) {
seqList.add(s.substring(i - 9, i + 1));
}
hashToFeqMap.put(hash, feq + 1);
}

return seqList;
}
}
Reference

187. Repeated DNA Sequences