所有DNA由一系列缩写为A,C,G和 T 的核苷酸组成,例如:“ACGAATTCCG”。在研究DNA时,识别DNA中的重复序列有时非常有用。

编写一个函数来查找DNA分子中所有出现超过一次的10个字母长的序列(子串)。

详见:https://leetcode.com/problems/repeated-dna-sequences/description/

Java实现:



class Solution {
public List<String> findRepeatedDnaSequences(String s) {
List<String> res = new ArrayList<>();
if(s.length()<10){
return res;
}
Map<String,Integer> m = new HashMap<>();
for(int i=0;i<s.length()-9;i++){
String subString = s.substring(i,i+10);
if(m.containsKey(subString)){
int count=m.get(subString); //如果为1,则添加进结果,否则继续遍历
if(count==1){
res.add(subString);
}
m.put(subString,count+1);
}else{
m.put(subString,1);
}
}
return res;
}
}


 C++实现:



class Solution {
public:
    /// Finds every 10-character substring of `s` that occurs more than once.
    ///
    /// Each character is reduced to the low three bits of its code, and ten of
    /// those 3-bit codes are packed into one 30-bit integer that serves as the
    /// hash-map key for the current window. In ASCII the codes for 'A', 'C',
    /// 'G' and 'T' are 1, 3, 7 and 4, so distinct windows over that alphabet
    /// always get distinct keys. Other characters may share a code, so the
    /// result is only meaningful when `s` consists of A/C/G/T.
    ///
    /// @param s the sequence to scan
    /// @return each repeated 10-character substring exactly once, in the order
    ///         its second occurrence is met; empty if `s` has at most 10
    ///         characters (a single window cannot repeat)
    std::vector<std::string> findRepeatedDnaSequences(std::string s) {
        const std::size_t kWindow = 10;      // substring length being searched for
        const int kBitsPerBase = 3;          // bits kept per character
        const int kBaseMask = 0x7;           // selects the low three bits of a character
        const int kPrefixMask = 0x7ffffff;   // low 27 bits: the newest 9 characters

        std::vector<std::string> res;
        if (s.size() <= kWindow) {
            return res;
        }

        std::unordered_map<int, int> counts;
        int cur = 0;
        std::size_t i = 0;

        // Preload the first 9 characters so that the main loop completes a full
        // window with every character it consumes.
        for (; i + 1 < kWindow; ++i) {
            cur = (cur << kBitsPerBase) | (s[i] & kBaseMask);
        }

        for (; i < s.size(); ++i) {
            // Drop the oldest character (bits above the 27-bit prefix) and
            // append the newest one.
            cur = ((cur & kPrefixMask) << kBitsPerBase) | (s[i] & kBaseMask);

            // operator[] value-initialises a missing count to 0, so a single
            // lookup both inserts and increments. Report only when the count
            // reaches 2 so each repeated substring appears once in the result.
            if (++counts[cur] == 2) {
                res.push_back(s.substr(i + 1 - kWindow, kWindow));
            }
        }
        return res;
    }
};