187. Repeated DNA Sequences

Trie

class Solution {
    private static class Trie {
        private final TrieNode root;

        public Trie() {
            this.root = new TrieNode();
        }
    }

    private static class TrieNode {
        private boolean end;
        private final TrieNode[] children;

        public TrieNode() {
            this.end = false;
            this.children = new TrieNode[4];
        }

        public TrieNode getChild(char c) {
            return children[charToIndex(c)];
        }

        private int charToIndex(char c) {
            switch (c) {
                case 'A':
                    return 0;
                case 'C':
                    return 1;
                case 'G':
                    return 2;
                case 'T':
                    return 3;
                default:
                    throw new IllegalArgumentException();
            }
        }

        public boolean isEnd() {
            return end;
        }

        public void setEnd(boolean end) {
            this.end = end;
        }

        public TrieNode addChild(char c) {
            TrieNode child = new TrieNode();
            children[charToIndex(c)] = child;
            return child;
        }
    }

    public List<String> findRepeatedDnaSequences(String s) {
        Set<String> set = new HashSet<>(); // 注意使用 Trie 树时可能多次到达树中节点，故用 set 去重

        Trie trie = new Trie();
        for (int i = 0; i < s.length() - 9; i++) {
            addCharToTrie(s, i, trie, set);
        }

        return new ArrayList<>(set);
    }

    private void addCharToTrie(String s, int i, Trie trie, Set<String> set) {
        TrieNode node = trie.root;

        for (int j = i; j < i + 10; j++) {
            char c = s.charAt(j);
            TrieNode childNode = node.getChild(c);
            if (childNode != null) {
                node = childNode;
            } else {
                node = node.addChild(c);
            }
        }

        if (node.isEnd()) {
            set.add(s.substring(i, i + 10));
        } else {
            node.setEnd(true);
        }
    }
}

Hash

class Solution {
    private int hash(char c) {
        switch (c) {
            case 'A':
                return 0b00;
            case 'C':
                return 0b01;
            case 'G':
                return 0b10;
            case 'T':
                return 0b11;
            default:
                throw new IllegalArgumentException();
        }
    }

    public List<String> findRepeatedDnaSequences(String s) {
        List<String> list = new ArrayList<>();
        if (s.length() <= 10) {
            return list;
        }

        int hash = 0;
        for (int i = 0; i < 10; i++) {
            hash = (hash << 2) | hash(s.charAt(i));
        }

        Map<Integer, Integer> hashToCountMap = new HashMap<>();
        hashToCountMap.put(hash, 1);

        for (int i = 10; i < s.length(); i++) {
            hash = ((hash << 2) | hash(s.charAt(i))) & 0b11111111111111111111;
            int count = hashToCountMap.getOrDefault(hash, 0);
            if (count == 1) {
                list.add(s.substring(i - 9, i + 1));
            }

            hashToCountMap.put(hash, count + 1);
        }

        return list;
    }
}

Rolling Hash

class Solution {
    private static final int PRIME = Boolean.hashCode(true);

    public List<String> findRepeatedDnaSequences(String s) {
        List<String> answerList = new ArrayList<>();

        int[] prefixHashArray = new int[s.length() + 1]; // 前 i 个字符组成的字符串对应的 hash 值数组
        int[] factorArray = new int[s.length() + 1];
        factorArray[0] = 1;

        // 计算出截止至下标为 i 的字符（含）的前缀字符串哈希
        for (int i = 0; i < s.length(); i++) {
            prefixHashArray[i + 1] = prefixHashArray[i] * PRIME + s.charAt(i);
            factorArray[i + 1] = factorArray[i] * PRIME;
        }

        Map<Integer, Integer> hashToCountMap = new HashMap<>();
        for (int j = 9; j < s.length(); j++) {
            int i = j - 9;
            int hash = prefixHashArray[j + 1] - prefixHashArray[i] * factorArray[10];
            int count = hashToCountMap.getOrDefault(hash, 0);
            if (count == 1) {
                answerList.add(s.substring(i, j + 1));
            }
            hashToCountMap.put(hash, count + 1);
        }

        return answerList;
    }
}

Rolling Hash

class Solution {
    private static final long PRIME = Boolean.hashCode(true);

    public List<String> findRepeatedDnaSequences(String s) {
        if (s.length() <= 10) {
            return Collections.emptyList();
        }

        List<String> seqList = new ArrayList<>();
        Map<Long, Integer> hashToFeqMap = new HashMap<>();
        long hash = 0;
        long product = 1; // target: PRIME ^ 9
        for (int i = 0; i < 10; i++) {
            if (i != 0) {
                product *= PRIME;
            }
            hash = hash * PRIME + s.charAt(i);
        }
        hashToFeqMap.put(hash, 1);

        for (int i = 10; i < s.length(); i++) {
            hash -= s.charAt(i - 10) * product;
            hash = hash * PRIME + s.charAt(i);
            int feq = hashToFeqMap.getOrDefault(hash, 0);
            if (feq == 1) {
                seqList.add(s.substring(i - 9, i + 1));
            }
            hashToFeqMap.put(hash, feq + 1);
        }

        return seqList;
    }
}

Reference

187. Repeated DNA Sequences