For various reasons our MongoDB has accumulated duplicate documents, to the point where they were skewing our statistics.
Before MongoDB 3.0 you could actually deduplicate while building an index: the dropDups option kept the first document for each duplicated key value and silently deleted the rest. That option has since been removed (gone as of the 3.0 release), so the following no longer works:

{unique : true, dropDups : true}  # no longer available
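On current versions the only index-level behaviour left is refusal: building a unique index over a collection that already contains duplicates fails outright, which is why the cleanup has to happen in application code first. A minimal sketch with PyMongo, using hypothetical connection, database, and collection names:

from pymongo import MongoClient, errors

client = MongoClient()  # connection details assumed for illustration
coll = client['mydb']['keywords']  # database/collection names hypothetical

try:
    # With dropDups gone there is no "delete the extras for me" option;
    # the index build aborts if duplicate key values already exist.
    coll.create_index([('keyword', 1), ('country_id', 1)], unique=True)
except errors.OperationFailure as exc:
    print('unique index rejected until duplicates are removed:', exc)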
The general idea: with an aggregation pipeline, first $group the documents by the key fields and keep a count per group, then $match every group whose count is greater than 1. Each such group is one set of duplicates; keep a single document from it and delete the others. The implementation:
from pymongo import DeleteOne
from threading import Thread

from Application.Utils.Log import Log
class DupKeywordRemove:
    def __init__(self):
        # The mongoengine model classes to deduplicate (placeholder names)
        models = [Model1, Model2, Model3, Model4]
        self.pipeline = [
            # Match on the fields that define uniqueness. The {"$exists": True}
            # conditions are deliberate: documents missing one of these fields
            # would otherwise all fall into a single null group in $group.
            {"$match": {
                "keyword": {"$exists": True},
                "country_id": {"$exists": True},
            }},
            # Group documents that share the same key fields, counting each group
            {"$group": {
                "_id": {
                    "keyword": "$keyword",
                    "country_id": "$country_id",
                },
                "count": {"$sum": 1}
            }},
            # Keep only the groups with more than one document: the duplicates
            {"$match": {
                "count": {"$gt": 1}
            }}
        ]
        self.main(models)
    def find_dup_id(self, model):
        try:
            _collection = model._get_collection()
            # allowDiskUse=True lets aggregation stages spill to temp files
            # instead of failing on the 100 MB per-stage memory limit
            all_dup_key = list(_collection.aggregate(self.pipeline, allowDiskUse=True))
            delete_list = []
            # Each result's _id holds one duplicated key pair; use it as a
            # find() filter to fetch the documents that share those values
            for dup_event in all_dup_key:
                match_pipe = dup_event['_id']
                remove_id_list = []
                dups = list(_collection.find(match_pipe))
                if len(dups) >= 2:
                    for doc in dups:
                        remove_id_list.append(doc['_id'])
                    remove_id_list.pop()  # pop one _id: that document is kept
                    for to_del in remove_id_list:
                        delete_list.append(
                            DeleteOne({
                                '_id': to_del
                            })
                        )
            print(_collection, len(delete_list))
            if delete_list:
                print('Deleting duplicate documents')
                _collection.bulk_write(delete_list)
            else:
                print('No duplicates found')
        except Exception as e:
            Log('keyword_dup_remove').info(e)
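    # A possible variant, not part of the original script: let $push collect
    # the duplicate _ids inside the pipeline itself, so no per-group find()
    # round trip is needed. Sketch only, under the same field assumptions.
    def find_dup_id_with_push(self, model):
        _collection = model._get_collection()
        pipeline = list(self.pipeline)
        pipeline[1] = {"$group": {
            "_id": {"keyword": "$keyword", "country_id": "$country_id"},
            "count": {"$sum": 1},
            "dups": {"$push": "$_id"},  # all _ids sharing this key pair
        }}
        delete_list = []
        for dup_event in _collection.aggregate(pipeline, allowDiskUse=True):
            # Keep the first _id of each group and delete the rest
            for to_del in dup_event['dups'][1:]:
                delete_list.append(DeleteOne({'_id': to_del}))
        if delete_list:
            _collection.bulk_write(delete_list)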
    def main(self, models):
        # One worker thread per model; start them all, then wait for each
        threads = []
        for _model in models:
            threads.append(
                Thread(
                    target=self.find_dup_id,
                    kwargs={
                        'model': _model,
                    }
                )
            )
        for t in threads:
            t.start()
        for t in threads:
            t.join()


if __name__ == '__main__':
    DupKeywordRemove()
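After the script has run, re-running the same aggregation is a quick sanity check: it should return an empty list. A minimal standalone sketch, again with hypothetical connection, database, and collection names:

from pymongo import MongoClient

client = MongoClient()  # connection details assumed for illustration
coll = client['mydb']['keywords']  # database/collection names hypothetical

pipeline = [
    {"$match": {"keyword": {"$exists": True},
                "country_id": {"$exists": True}}},
    {"$group": {"_id": {"keyword": "$keyword", "country_id": "$country_id"},
                "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1}}},
]

leftover = list(coll.aggregate(pipeline, allowDiskUse=True))
print('duplicate key pairs remaining:', len(leftover))  # expect 0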