目录
一、基本知识
二、常见的图片相似度去重的方法
三、代码
1、Hash 算法 与 直方图 代码(参考 python OpenCV 图片相似度 Hash算法)
(1) 参照代码
(2)自定义 pHash 类
2、ORB特征检测图片相似度
3、SIFT 算法
Reference
一、基本知识
1、各种图片读取的函数读取到的数据格式是 RGB 还是 BGR?
使用 scipy.misc.imread 读取的图片数据是 RGB 格式(注意:scipy.misc.imread 已在 SciPy 1.2 中移除,新代码可改用 imageio.imread);
使用 cv2.imread 读取的图片数据是 BGR 格式;
使用 PIL.Image.open 读取的图片数据是RGB格式;
2、opencv 与 PIL 读取图片报错问题的原因
(1)cv2.imread 读取的图片数据是 BGR 格式,PIL.Image.open 读取的图片数据是RGB格式;
(2)图片数据本身不是三通道的数据(比如:灰阶图),需要进行判断
二、常见的图片相似度去重的方法
1、opencv各个相似度计算算法的特点
2、Opencv中特征点提取和特征匹配算法详解(ORB SIFT SURF FAST)
三、代码
1、Hash 算法 与 直方图 代码
均值哈希算法、差值哈希算法和感知哈希算法都是值越小,相似度越高;取值范围为 0~64,即两个 64 位 hash 值之间的汉明距离(不相同的位数)。
三直方图和单通道直方图的值为0-1,值越大,相似度越高。
(1) 参照代码
import cv2
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
def aHash(img):
    """Compute the 64-bit average hash (aHash) of an image.

    The image is shrunk to 8x8, converted to grayscale, and every pixel is
    compared against the mean gray level of the thumbnail.

    :param img: OpenCV image array, either multi-channel BGR with shape
        (h, w, c) or already single-channel with shape (h, w).
    :return: 64-character string of '1'/'0' in row-major order; '1' marks a
        pixel strictly brighter than the mean.
    """
    # Shrink to 8x8 so that only the coarse structure of the image remains.
    img = cv2.resize(img, (8, 8))
    # A grayscale input has no channel axis and would make cvtColor raise.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img
    # Let NumPy average in floating point: adding uint8 pixels one at a time
    # can wrap around at 255 depending on the scalar promotion rules in use.
    avg = gray.mean()
    # join() avoids rebuilding the string once per pixel.
    return ''.join('1' if pixel > avg else '0' for pixel in gray.flat)
def dHash(img):
    """Compute the 64-bit difference hash (dHash) of an image.

    The image is shrunk to 9 columns x 8 rows and each pixel is compared with
    its right-hand neighbour, giving 8 comparisons per row.

    :param img: OpenCV image array, either multi-channel BGR with shape
        (h, w, c) or already single-channel with shape (h, w).
    :return: 64-character string of '1'/'0' in row-major order; '1' marks a
        pixel strictly brighter than the pixel to its right.
    """
    # dsize is (width, height): 9 columns yield 8 neighbour pairs per row.
    img = cv2.resize(img, (9, 8))
    # A grayscale input has no channel axis and would make cvtColor raise.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img
    # Compare every column (except the last) with the column to its right.
    brighter_than_next = gray[:, :-1] > gray[:, 1:]
    return ''.join('1' if bit else '0' for bit in brighter_than_next.flat)
def pHash(img):
    """Compute the 64-bit perceptual hash (pHash) of an image.

    The image is shrunk to 32x32, converted to grayscale and transformed with
    a DCT; the top-left 8x8 block of coefficients is thresholded at its mean.

    :param img: OpenCV image array, either multi-channel BGR with shape
        (h, w, c) or already single-channel with shape (h, w).
    :return: list of 64 ints (0 or 1) in row-major order; 1 marks a
        coefficient strictly greater than the block mean.
    """
    img = cv2.resize(img, (32, 32))  # , interpolation=cv2.INTER_CUBIC
    # A grayscale input has no channel axis and would make cvtColor raise.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if img.ndim == 3 else img
    # cv2.dct requires floating-point input.
    dct = cv2.dct(np.float32(gray))
    # Keep only the top-left 8x8 block of DCT coefficients.
    low_freq = dct[0:8, 0:8]
    # Threshold against the block mean; tolist() yields plain Python ints.
    return (low_freq > np.mean(low_freq)).astype(int).ravel().tolist()
def calculate(image1, image2):
    """Return the histogram overlap of channel 0 of two images.

    For every one of the 256 bins the score is 1 when both counts are equal
    and ``1 - |a - b| / max(a, b)`` otherwise; the result is the mean score.

    :param image1: image array accepted by ``cv2.calcHist``; only channel 0
        is used.
    :param image2: second image, same requirements as ``image1``.
    :return: float in [0, 1]; 1.0 means identical channel-0 histograms.
    """
    # calcHist treats the upper range bound as exclusive, so it has to be 256
    # for pixels of value 255 to be counted at all.
    hist1 = cv2.calcHist([image1], [0], None, [256], [0.0, 256.0]).ravel()
    hist2 = cv2.calcHist([image2], [0], None, [256], [0.0, 256.0]).ravel()
    # Equal bins (including two empty ones) overlap fully. Restricting the
    # division to differing bins also guarantees a non-zero denominator.
    differs = hist1 != hist2
    overlap = np.ones(hist1.shape, dtype=np.float64)
    larger = np.maximum(hist1, hist2)
    overlap[differs] = 1.0 - np.abs(hist1[differs] - hist2[differs]) / larger[differs]
    # Return a plain float rather than a one-element array.
    return float(overlap.mean())
def classify_hist_with_split(image1, image2, size=(256, 256)):
    """Average the per-channel histogram similarity of two images.

    Both images are resized to ``size``, split into their channels, and each
    pair of channels is scored with ``calculate``.

    :param image1: first OpenCV image array.
    :param image2: second OpenCV image array.
    :param size: (width, height) both images are resized to before comparing.
    :return: mean of the per-channel scores returned by ``calculate``.
    """
    image1 = cv2.resize(image1, size)
    image2 = cv2.resize(image2, size)
    channel_pairs = list(zip(cv2.split(image1), cv2.split(image2)))
    total = 0
    for channel1, channel2 in channel_pairs:
        total += calculate(channel1, channel2)
    # Divide by the number of channels actually compared instead of assuming
    # three, so single-channel images are not under-scored.
    return total / len(channel_pairs)
def cmpHash(hash1, hash2):
    """Return the Hamming distance between two image fingerprints.

    Both fingerprints must list their bits in the same order. A smaller
    distance means more similar images; 0 means the fingerprints are equal.

    :param hash1: first fingerprint, a sized sequence of bits (str or list).
    :param hash2: second fingerprint, same kind as ``hash1``.
    :return: number of positions at which the two differ, or -1 when the
        lengths differ (signals a bad argument).
    """
    if len(hash1) != len(hash2):
        return -1
    # Walk both fingerprints in lockstep and count the differing positions.
    return sum(1 for bit1, bit2 in zip(hash1, hash2) if bit1 != bit2)
def getImageByUrl(url, timeout=10):
    """Download an image and open it with PIL.

    :param url: http(s) address of the image.
    :param timeout: seconds to wait for the server before giving up.
    :return: ``PIL.Image.Image`` decoded from the response body.
    :raises requests.RequestException: on connection failure, timeout or a
        non-success HTTP status.
    """
    # NOTE(review): verify=False disables TLS certificate validation. It is
    # kept to preserve existing behaviour, but it permits tampering in transit.
    response = requests.get(url, verify=False, timeout=timeout)
    # Fail on 4xx/5xx instead of trying to decode an error page as an image.
    response.raise_for_status()
    image = Image.open(BytesIO(response.content))
    return image
def PILImageToCV(path="/Users/waldenz/Documents/Work/doc/TestImages/t3.png"):
    """Demo: open an image with PIL, convert it to an OpenCV BGR array, plot both.

    Prints whether the object is a NumPy array before and after conversion.

    :param path: image file to open; defaults to the original sample file.
    """
    img = Image.open(path)
    plt.subplot(121)
    plt.imshow(img)
    print(isinstance(img, np.ndarray))
    # Normalise to 3-channel RGB first: palette or grayscale files would
    # otherwise give a 2-D array that cvtColor rejects.
    img = cv2.cvtColor(np.asarray(img.convert("RGB")), cv2.COLOR_RGB2BGR)
    print(isinstance(img, np.ndarray))
    plt.subplot(122)
    # matplotlib reads the array as RGB, so this panel shows swapped red and
    # blue channels - presumably intended to visualise the conversion.
    plt.imshow(img)
    plt.show()
def CVImageToPIL(path="/Users/waldenz/Documents/Work/doc/TestImages/t3.png"):
    """Demo: read an image with OpenCV, convert it to a PIL image, plot both.

    :param path: image file to read; defaults to the original sample file.
    :raises FileNotFoundError: if OpenCV cannot read ``path``.
    """
    img = cv2.imread(path)
    # cv2.imread signals failure by returning None rather than raising.
    if img is None:
        raise FileNotFoundError("cv2.imread could not read image: %s" % path)
    # cv2.imshow("OpenCV",img)
    plt.subplot(121)
    # matplotlib reads the BGR array as RGB, so this panel shows swapped
    # red and blue channels.
    plt.imshow(img)
    img2 = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.subplot(122)
    plt.imshow(img2)
    plt.show()
def bytes_to_cvimage(filebytes):
    """Decode an encoded image (byte stream) into an OpenCV BGR array.

    :param filebytes: encoded image data as ``bytes``/``bytearray``/
        ``memoryview``, or anything ``PIL.Image.open`` accepts (file object
        or path). Raw byte data is wrapped in ``BytesIO`` before opening.
    :return: ``numpy.ndarray`` of shape (h, w, 3) in BGR channel order.
    """
    # Image.open would treat a raw bytes object as a file name, so in-memory
    # data has to be wrapped in a file-like object first.
    if isinstance(filebytes, (bytes, bytearray, memoryview)):
        filebytes = BytesIO(filebytes)
    # Normalise to 3-channel RGB so grayscale or palette images convert too.
    image = Image.open(filebytes).convert("RGB")
    img = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR)
    return img
def _load_bgr_image(source):
    """Load one image, given as an http(s) URL or a local path, as a BGR array."""
    if source.startswith("http"):
        # Normalise to 3-channel RGB so grayscale or palette downloads work.
        pil_image = getImageByUrl(source).convert("RGB")
        return cv2.cvtColor(np.asarray(pil_image), cv2.COLOR_RGB2BGR)
    image = cv2.imread(source)
    # cv2.imread signals failure by returning None rather than raising.
    if image is None:
        raise FileNotFoundError("cv2.imread could not read image: %s" % source)
    return image


def runAllImageSimilaryFun(para1, para2):
    """Run every similarity measure on two images, print the scores, plot both.

    aHash / dHash / pHash give a Hamming distance in 0-64 (smaller means more
    similar, 0 for identical fingerprints). The two histogram measures give a
    value in 0-1 (larger means more similar, 1 for identical histograms).

    Sample results recorded by the original author
    (aHash dHash pHash three-channel-hist single-channel-hist):
        t1,t2  14 19 10 0.70 0.75
        t1,t3  39 33 18 0.58 0.49
        s1,s2   7 23 11 0.83 0.86   (quite similar pictures)
        c1,c2  11 29 17 0.30 0.31

    :param para1: http(s) URL or local path of the first image.
    :param para2: http(s) URL or local path of the second image.
    :raises FileNotFoundError: if a local path cannot be read.
    """
    # Each argument is resolved on its own, so a URL and a local path can be
    # mixed instead of assuming both are of the same kind as para1.
    img1 = _load_bgr_image(para1)
    img2 = _load_bgr_image(para2)

    hash1 = aHash(img1)
    hash2 = aHash(img2)
    n1 = cmpHash(hash1, hash2)
    print('均值哈希算法相似度aHash:', n1)

    hash1 = dHash(img1)
    hash2 = dHash(img2)
    n2 = cmpHash(hash1, hash2)
    print('差值哈希算法相似度dHash:', n2)

    hash1 = pHash(img1)
    hash2 = pHash(img2)
    n3 = cmpHash(hash1, hash2)
    print('感知哈希算法相似度pHash:', n3)

    n4 = classify_hist_with_split(img1, img2)
    print('三直方图算法相似度:', n4)

    n5 = calculate(img1, img2)
    print("单通道的直方图", n5)

    # The histogram helpers may hand back a one-element array; take the
    # scalar explicitly before %-formatting it.
    hist3_score = float(np.ravel(n4)[0])
    hist1_score = float(np.ravel(n5)[0])
    print("%d %d %d %.2f %.2f " % (n1, n2, n3, hist3_score, hist1_score))
    # Same scores with the Hamming distances rescaled to a 0-1 similarity.
    print("%.2f %.2f %.2f %.2f %.2f " % (1-float(n1/64), 1 -
                                         float(n2/64), 1-float(n3/64), hist3_score, hist1_score))

    plt.subplot(121)
    plt.imshow(Image.fromarray(cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)))
    plt.subplot(122)
    plt.imshow(Image.fromarray(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)))
    plt.show()
if __name__ == "__main__":
    # Compare two sample pictures fetched over HTTP.
    sample_urls = (
        "https://ww3.sinaimg.cn/bmiddle/007INInDly1g336j2zziwj30su0g848w.jpg",
        "https://ww2.sinaimg.cn/bmiddle/007INInDly1g336j10d32j30vd0hnam6.jpg",
    )
    runAllImageSimilaryFun(*sample_urls)
(2)自定义 pHash 类
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib3
urllib3.disable_warnings() # 忽略request警告
import cv2
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import pandas as pd
class PHASH():
    """Perceptual-hash (pHash) helpers for images given by local path or URL."""

    def __init__(self,*args,**kwargs):
        pass

    @classmethod
    def get_cv_img(cls,fpath_or_url):
        """Load an image from a local path or an http(s) URL as an OpenCV array.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: tuple ``(image, ndim)``. ``image`` is a BGR array of shape
            (h, w, 3) with ``ndim == 3``, or a grayscale array of shape
            (h, w) with ``ndim == 2``.
        :raises requests.RequestException: if the download fails or the
            server answers with an error status.
        :raises FileNotFoundError: if the local file cannot be read.
        """
        if fpath_or_url.startswith('http'):
            # 'Connection: close' asks the server not to keep the socket
            # alive after the response.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
                'Connection':'close'
            }
            # Retries have to be configured on a mounted adapter; assigning
            # requests.adapters.DEFAULT_RETRIES after import has no effect.
            with requests.Session() as session:
                retrying_adapter = requests.adapters.HTTPAdapter(max_retries=5)
                session.mount('http://', retrying_adapter)
                session.mount('https://', retrying_adapter)
                # NOTE(review): verify=False skips TLS certificate checks;
                # kept to preserve existing behaviour.
                response = session.get(fpath_or_url, headers=headers,
                                       timeout=10, verify=False)
            # Fail on 4xx/5xx rather than decoding an error page as an image.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            # Anything that is neither plain grayscale nor plain RGB (palette,
            # alpha, bilevel, ...) is normalised to RGB so the array below is
            # always either (h, w) or (h, w, 3).
            if image.mode not in ('L', 'RGB'):
                image = image.convert('RGB')
            image_rgb = np.asarray(image)
            if image_rgb.ndim == 2:
                # Already a single-channel grayscale picture.
                return image_rgb, 2
            # PIL delivers RGB; OpenCV expects BGR.
            return cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR), 3
        # cv2.imread(filename, flags) reference:
        #   IMREAD_UNCHANGED (-1)  load as stored, no conversion
        #   IMREAD_GRAYSCALE (0)   convert to 8-bit single-channel
        #   IMREAD_COLOR (1)       convert to 3-channel BGR (the default)
        #   IMREAD_ANYDEPTH (2)    keep 16/32-bit depth, otherwise 8-bit
        #   IMREAD_ANYCOLOR (4)    read in whatever colour format is stored
        # Read the file once; imread returns None instead of raising.
        image_bgr = cv2.imread(fpath_or_url)
        if image_bgr is None:
            raise FileNotFoundError(
                'cv2.imread could not read image: %s' % fpath_or_url)
        return image_bgr, image_bgr.ndim

    @classmethod
    def get_pHash_array(cls,fpath_or_url):
        """Compute the 64-bit perceptual hash of an image.

        :param fpath_or_url: local file path or http(s) URL of the image.
        :return: int32 ``numpy.ndarray`` of shape (64,) holding 0/1 bits; 1
            marks a low-frequency DCT coefficient above the block mean.
        """
        img,dim = cls.get_cv_img(fpath_or_url)
        # Shrink to 32x32; only height and width change, not the channels.
        # Interpolation choices offered by cv2.resize:
        #   INTER_NEAREST  - nearest neighbour
        #   INTER_LINEAR   - bilinear (the default when none is given)
        #   INTER_AREA     - pixel-area resampling, suited to shrinking
        #   INTER_CUBIC    - bicubic over a 4x4 neighbourhood
        #   INTER_LANCZOS4 - Lanczos over an 8x8 neighbourhood
        img = cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)
        # Three-channel input becomes one gray channel: (h, w, c) -> (h, w).
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if dim == 3 else img
        # cv2.dct requires floating-point input; the result is 32x32.
        dct = cv2.dct(np.float32(gray))
        # Keep the top-left 8x8 block of coefficients, flattened to (64,).
        dct_roi = dct[0:8, 0:8].reshape(-1)
        # Threshold against the block mean to obtain the hash bits.
        hash_array = (dct_roi > np.mean(dct_roi)).astype('int32')
        return hash_array
if __name__ == '__main__':
    # Smoke test: print the Hamming distance between the pHash of two
    # remote sample pictures.
    sample_urls = (
        'https://qcloud.dpfile.com/pc/gHG8ZFfVd9L8-dmljeMtcFj2g5GcEgRHlPARVAHUdNr5F47uz-fJtzd92pRPLfg3aLZ23ABw4IThDHW_mlzUpw.jpg',
        'https://qcloud.dpfile.com/pc/QFse1QpHtR6vhauRrr-yzly-0r8wPlk4-vEPwUM51gDDzoQjgbK7C3JTY4bHQchlaLZ23ABw4IThDHW_mlzUpw.jpg',
    )
    first_bits, second_bits = (PHASH.get_pHash_array(url) for url in sample_urls)
    print(np.sum(first_bits != second_bits))
2、ORB特征检测图片相似度
# -*- encoding=utf-8 -*-
import cv2
# 自定义计算两个图片相似度函数
def img_similarity(img1_path,img2_path, ratio=0.55):
    """Estimate the similarity of two image files with ORB feature matching.

    Each ORB descriptor of image 1 is matched against its two nearest
    descriptors in image 2; a match counts as good when the nearest distance
    is below ``ratio`` times the second-nearest distance.

    :param img1_path: path of the first image.
    :param img2_path: path of the second image.
    :param ratio: ratio-test threshold for accepting a match.
    :return: float, number of good matches divided by the number of matches;
        0.0 when the similarity cannot be computed.
    """
    failure_message = '无法计算两张图片相似度'
    try:
        # Read both images as single-channel grayscale.
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals an unreadable file by returning None.
        if img1 is None or img2 is None:
            print(failure_message)
            return 0.0
        # Detect keypoints and compute their ORB descriptors.
        orb = cv2.ORB_create()
        kp1, des1 = orb.detectAndCompute(img1, None)
        kp2, des2 = orb.detectAndCompute(img2, None)
        # Images without detectable features yield no descriptors.
        if des1 is None or des2 is None:
            print(failure_message)
            return 0.0
        # ORB descriptors are binary, hence the Hamming norm.
        bf = cv2.BFMatcher(cv2.NORM_HAMMING)
        # Two nearest neighbours per descriptor, for the ratio test.
        matches = bf.knnMatch(des1, trainDescriptors=des2, k=2)
        if not matches:
            print(failure_message)
            return 0.0
        # A pair can hold fewer than two neighbours when image 2 has very
        # few descriptors; such pairs cannot pass the ratio test.
        good = [pair[0] for pair in matches
                if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance]
        print(len(good))
        print(len(matches))
        similary = len(good) / len(matches)
        print("两张图片相似度为:%s" % similary)
        return similary
    except Exception:
        # Best effort: report the failure, but unlike a bare except this no
        # longer swallows KeyboardInterrupt / SystemExit.
        print(failure_message)
        # Return a float (not the string '0') so the type matches the
        # success path.
        return 0.0
if __name__ == '__main__':
    # Compare two local sample pictures.
    sample_paths = (r'F:\img_spam\test\7ba.jpg', r'F:\img_spam\test\ba.jpg')
    similary = img_similarity(*sample_paths)
3、SIFT 算法
为了排除因为图像遮挡和背景混乱而产生的无匹配关系的关键点,SIFT 的作者 Lowe 提出了比较最近邻距离与次近邻距离的 SIFT 匹配方式:取一幅图像中的一个 SIFT 关键点,并找出其与另一幅图像中欧式距离最近的前两个关键点,在这两个关键点中,如果最近的距离除以次近的距离得到的比率 ratio 小于某个阈值 T,则接受这一对匹配点。因为对于错误匹配,由于特征空间的高维性,相似的距离可能有大量其他的错误匹配,从而它的 ratio 值比较高。显然降低这个比例阈值 T,SIFT 匹配点数目会减少,但更加稳定,反之亦然。
Lowe 推荐 ratio 的阈值为 0.8,但作者对大量任意存在尺度、旋转和亮度变化的两幅图片进行匹配,结果表明 ratio 取值在 0.4~0.6 之间最佳,小于 0.4 的很少有匹配点,大于 0.6 的则存在大量错误匹配点,所以建议 ratio 的取值原则如下:
ratio=0.4:对于准确度要求高的匹配;
ratio=0.6:对于匹配点数目要求比较多的匹配;
ratio=0.5:一般情况下。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib3
urllib3.disable_warnings() # 忽略request警告
import cv2
import numpy as np
import requests
from PIL import Image
from io import BytesIO
class SIFT_FLANN():
    """SIFT feature extraction plus FLANN matching for image similarity."""

    def __init__(self,*args,**kwargs):
        pass

    @classmethod
    def get_cv_img(cls,fpath_or_url):
        """Load an image from a local path or an http(s) URL as an OpenCV array.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: tuple ``(image, ndim)``. ``image`` is a BGR array of shape
            (h, w, 3) with ``ndim == 3``, or a grayscale array of shape
            (h, w) with ``ndim == 2``.
        :raises requests.RequestException: if the download fails or the
            server answers with an error status.
        :raises FileNotFoundError: if the local file cannot be read.
        """
        if fpath_or_url.startswith('http'):
            # 'Connection: close' asks the server not to keep the socket
            # alive after the response.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
                'Connection':'close'
            }
            # Retries have to be configured on a mounted adapter; assigning
            # requests.adapters.DEFAULT_RETRIES after import has no effect.
            with requests.Session() as session:
                retrying_adapter = requests.adapters.HTTPAdapter(max_retries=5)
                session.mount('http://', retrying_adapter)
                session.mount('https://', retrying_adapter)
                # NOTE(review): verify=False skips TLS certificate checks;
                # kept to preserve existing behaviour.
                response = session.get(fpath_or_url, headers=headers,
                                       timeout=10, verify=False)
            # Fail on 4xx/5xx rather than decoding an error page as an image.
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))
            # Anything that is neither plain grayscale nor plain RGB (palette,
            # alpha, bilevel, ...) is normalised to RGB so the array below is
            # always either (h, w) or (h, w, 3).
            if image.mode not in ('L', 'RGB'):
                image = image.convert('RGB')
            image_rgb = np.asarray(image)
            if image_rgb.ndim == 2:
                # Already a single-channel grayscale picture.
                return image_rgb, 2
            # PIL delivers RGB; OpenCV expects BGR.
            return cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR), 3
        # Every non-URL argument is treated as a local file, whatever its
        # extension or letter case; an unreadable file raises instead of
        # silently yielding None. The file is read only once.
        image_bgr = cv2.imread(fpath_or_url)
        if image_bgr is None:
            raise FileNotFoundError(
                'cv2.imread could not read image: %s' % fpath_or_url)
        return image_bgr, image_bgr.ndim

    @classmethod
    def sift_describe(cls,fpath_or_url):
        """Detect SIFT keypoints and descriptors of an image.

        :param fpath_or_url: local file path or http(s) URL of the image.
        :return: tuple ``((keypoints, descriptors), image)`` where ``image``
            is the array returned by ``get_cv_img``.
        """
        img,dim = cls.get_cv_img(fpath_or_url)
        # Three-channel input becomes one gray channel: (h, w, c) -> (h, w).
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) if dim == 3 else img
        # Prefer cv2.SIFT_create (main module in newer OpenCV); fall back to
        # the contrib location used by older builds.
        sift_factory = getattr(cv2, 'SIFT_create', None)
        if sift_factory is None:
            sift_factory = cv2.xfeatures2d.SIFT_create
        sift = sift_factory()
        # kp is a sequence of keypoints; des is a 2-D descriptor array, or
        # None when no keypoint was found.
        kp, des = sift.detectAndCompute(gray, None)
        return (kp,des),img

    @classmethod
    def cal_sift_similarity(cls,img1_info,img2_info,
                            img_array1 = None ,img_array2 = None, ratio=0.55):
        """Score two images from their SIFT keypoints and descriptors.

        :param img1_info: ``(keypoints, descriptors)`` of the first image.
        :param img2_info: ``(keypoints, descriptors)`` of the second image.
        :param img_array1: unused; kept for interface compatibility.
        :param img_array2: unused; kept for interface compatibility.
        :param ratio: ratio-test threshold for accepting a match.
        :return: float, number of good matches divided by the number of
            matches; 0.0 when either image lacks enough descriptors.
        """
        kp1,des1 = img1_info
        kp2,des2 = img2_info
        # The ratio test needs at least one query descriptor and two
        # candidates to compare against.
        if des1 is None or des2 is None or len(des1) == 0 or len(des2) < 2:
            return 0.0
        # Build a FLANN matcher backed by KD-trees.
        FLANN_INDEX_KDTREE = 0
        index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
        search_params = dict(checks=100)  # an empty dict also works
        flann = cv2.FlannBasedMatcher(index_params, search_params)
        matches = flann.knnMatch(des1, des2, k=2)
        if not matches:
            return 0.0
        # Keep a match only when its nearest neighbour is clearly closer
        # than the second nearest; short pairs cannot be judged.
        good = [pair[0] for pair in matches
                if len(pair) == 2 and pair[0].distance < ratio * pair[1].distance]
        similarity = len(good) / len(matches)
        return similarity
if __name__ == '__main__':
    # Smoke test 1: compare one reference picture against four candidates.
    reference_url = "https://qcloud.dpfile.com/pc/hXbfmfeK4W9FkcZwXqshIrXySooIPGgEb9uIVs2KW8oPHEiK039L37r0oM6CtGq9aLZ23ABw4IThDHW_mlzUpw.jpg"
    candidate_urls = [
        "https://qcloud.dpfile.com/pc/nxetlAzTR_7ARUZjAw98l3S5Nku5CCTct6lFTSAnqzgq_97ET71kayhMglYH7dIhaLZ23ABw4IThDHW_mlzUpw.jpg",
        'https://qcloud.dpfile.com/pc/fVfbhCulsdsrzyCRMlF1qBndGogGhBwQSoFJG7_uBtMioGNY_iAGzVZOnx4ayKIqaLZ23ABw4IThDHW_mlzUpw.jpg',
        'https://qcloud.dpfile.com/pc/itfWTAlTRmjNDIr-jTv1QkEtPFnjlNvkF9iLQj35azGtPI2pUtZjhpqN7loP26AeaLZ23ABw4IThDHW_mlzUpw.jpg',
        'https://qcloud.dpfile.com/pc/2Q_ByqDIN63u2iuG3eImgr30TWxVV-0_4b4NXJ1Nj9L0yNmV522HRzWUEZZoF51daLZ23ABw4IThDHW_mlzUpw.jpg',
    ]
    report_template = '{}\n{}\n similarity:{}'
    separator = '=' * 150

    reference_info, reference_image = SIFT_FLANN.sift_describe(reference_url)
    for candidate_url in candidate_urls:
        candidate_info, candidate_image = SIFT_FLANN.sift_describe(candidate_url)
        score = SIFT_FLANN.cal_sift_similarity(
            reference_info, candidate_info, reference_image, candidate_image)
        print(report_template.format(reference_url, candidate_url, score))
        # NOTE(review): the separator's original nesting cannot be recovered
        # from the flattened source; it is printed after every comparison
        # here - confirm against the original script.
        print(separator)

    # Smoke test 2: compare a single pair of pictures.
    first_url = 'https://qcloud.dpfile.com/pc/gHG8ZFfVd9L8-dmljeMtcFj2g5GcEgRHlPARVAHUdNr5F47uz-fJtzd92pRPLfg3aLZ23ABw4IThDHW_mlzUpw.jpg'
    second_url = 'https://qcloud.dpfile.com/pc/QFse1QpHtR6vhauRrr-yzly-0r8wPlk4-vEPwUM51gDDzoQjgbK7C3JTY4bHQchlaLZ23ABw4IThDHW_mlzUpw.jpg'
    first_info, first_image = SIFT_FLANN.sift_describe(first_url)
    second_info, second_image = SIFT_FLANN.sift_describe(second_url)
    pair_score = SIFT_FLANN.cal_sift_similarity(
        first_info, second_info, first_image, second_image)
    print(report_template.format(first_url, second_url, pair_score))