目录

一、基本知识

二、常见的图片相似度去重的方法

三、代码

      1、Hash 算法 与 直方图 代码(参考 python OpenCV 图片相似度 Hash算法)

           (1) 参照代码 

         (2)自定义 pHash 类

 2、ORB特征检测图片相似度

 3、SIFT 算法

 

Reference


一、基本知识

    1、各种图片读取的函数读取到的数据格式是 RGB 还是 BGR?

使用 scipy.misc.imread 读取的图片数据是 RGB 格式(注:scipy.misc.imread 已在 SciPy 1.2 中被移除,新代码可改用 imageio.imread);
使用 cv2.imread 读取的图片数据是 BGR 格式;
使用 PIL.Image.open 读取的图片数据是RGB格式;

     2、opencv 与 PIL 读取图片报错问题的原因

          (1)cv2.imread 读取的图片数据是 BGR 格式,PIL.Image.open 读取的图片数据是RGB格式;

          (2)图片数据本身不是三通道的数据(比如:灰阶图),需要进行判断

                    

二、常见的图片相似度去重的方法

     1、opencv各个相似度计算算法的特点

     2、Opencv中特征点提取和特征匹配算法详解(ORB SIFT SURF FAST)


三、代码

      1、Hash 算法 与 直方图 代码

1. 均值哈希算法(aHash)、差值哈希算法(dHash)和感知哈希算法(pHash)的结果都是汉明距离,即两个 64 位 hash 值中不同的位数,取值范围为 0-64,值越小,相似度越高。
   三通道直方图和单通道直方图的结果取值范围为 0-1,值越大,相似度越高。

           (1) 参照代码 

import cv2
import numpy as np
from PIL import Image
import requests
from io import BytesIO
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
 
 
def aHash(img):
    """Compute the 64-bit average hash (aHash) of an image.

    The image is shrunk to 8x8, converted to grayscale, and every pixel is
    compared with the mean gray level of the 64 pixels.

    :param img: image as a BGR array (the layout produced by ``cv2.imread``).
    :return: string of 64 characters in row-major order, '1' where the pixel
        is brighter than the mean and '0' otherwise.
    """
    # Shrink to 8x8 so only the coarse structure of the image remains.
    img = cv2.resize(img, (8, 8))
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Let NumPy compute the mean. Adding the uint8 pixels one by one to a
    # Python int can stay in uint8 and wrap around at 255, depending on
    # NumPy's scalar promotion rules, which corrupts the average.
    avg = gray.mean()
    return ''.join('1' if brighter else '0' for brighter in (gray > avg).ravel())
 
 
def dHash(img):
    """Compute the 64-bit difference hash (dHash) of an image.

    :param img: image as a BGR array (the layout produced by ``cv2.imread``).
    :return: string of 64 characters in row-major order, '1' where a pixel is
        brighter than its right-hand neighbour and '0' otherwise.
    """
    # 9 columns x 8 rows: every row yields 8 left/right neighbour comparisons.
    resized = cv2.resize(img, (9, 8))
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)
    bits = []
    for row in gray:
        for left, right in zip(row[:-1], row[1:]):
            bits.append('1' if left > right else '0')
    return ''.join(bits)
 
 
def pHash(img):
    """Compute the 64-bit perceptual hash (pHash) of an image.

    :param img: image as a BGR array (the layout produced by ``cv2.imread``).
    :return: list of 64 ints (1 or 0) in row-major order, 1 where a DCT
        coefficient of the top-left 8x8 block exceeds the block mean.
    """
    small = cv2.resize(img, (32, 32))
    gray = cv2.cvtColor(small, cv2.COLOR_BGR2GRAY)
    # cv2.dct operates on floating-point data, hence the float32 conversion.
    coefficients = cv2.dct(np.float32(gray))
    # Only the top-left 8x8 corner of the DCT is used for the fingerprint.
    corner = coefficients[0:8, 0:8]
    threshold = np.mean(corner)
    return [1 if value > threshold else 0 for row in corner for value in row]
 
 
def calculate(image1, image2):
    """Return the histogram overlap of channel 0 of two images.

    For every one of the 256 intensity bins the score is
    ``1 - |h1 - h2| / max(h1, h2)`` (1 when both counts are equal, including
    both being zero); the result is the mean score over all bins.

    :param image1: first image; only channel 0 is used.
    :param image2: second image; only channel 0 is used.
    :return: float between 0 and 1, larger means more similar; identical
        histograms give 1.0.
    """
    # The upper bound of the range is exclusive, so it must be 256 for pixel
    # value 255 to be counted and for each bin to cover exactly one value.
    hist1 = cv2.calcHist([image1], [0], None, [256], [0, 256]).ravel()
    hist2 = cv2.calcHist([image2], [0], None, [256], [0, 256]).ravel()

    larger = np.maximum(hist1, hist2)
    overlap = np.ones_like(larger)
    # Bins that are empty in both images keep the perfect score of 1 and are
    # skipped here, which also avoids dividing by zero.
    occupied = larger > 0
    overlap[occupied] = 1 - np.abs(hist1 - hist2)[occupied] / larger[occupied]
    return float(overlap.mean())
 
 
def classify_hist_with_split(image1, image2, size=(256, 256)):
    """Return the mean per-channel histogram similarity of two images.

    Both images are resized to ``size``, split into channels, and each pair
    of corresponding channels is scored with ``calculate``.

    :param image1: first image array.
    :param image2: second image array.
    :param size: ``(width, height)`` both images are resized to first.
    :return: average of the per-channel scores; larger means more similar.
    """
    image1 = cv2.resize(image1, size)
    image2 = cv2.resize(image2, size)
    channel_pairs = list(zip(cv2.split(image1), cv2.split(image2)))
    total = sum(calculate(ch1, ch2) for ch1, ch2 in channel_pairs)
    # Average over the channels actually compared rather than a fixed 3, so
    # grayscale (1 channel) and BGRA (4 channels) inputs are scaled correctly.
    return total / len(channel_pairs)
 
 
def cmpHash(hash1, hash2):
    """Return the Hamming distance between two image fingerprints.

    Both fingerprints must list their bits in the same order. The fewer
    positions differ, the more similar the images; 0 means the fingerprints
    are identical.

    :param hash1: first fingerprint (string or sequence of bits).
    :param hash2: second fingerprint, same length as ``hash1``.
    :return: number of differing positions, or -1 when the lengths differ
        (signals a bad argument).
    """
    if len(hash1) != len(hash2):
        return -1
    return sum(1 for bit1, bit2 in zip(hash1, hash2) if bit1 != bit2)
 
 
def getImageByUrl(url, timeout=10):
    """Download an image and return it as a PIL image object.

    :param url: address of the image.
    :param timeout: seconds to wait for the server before giving up; without
        a timeout ``requests`` can block indefinitely.
    :return: ``PIL.Image.Image`` opened from the downloaded bytes.
    :raises requests.HTTPError: if the server answers with an error status.
    """
    # NOTE: verify=False disables TLS certificate checks (kept from the
    # original to tolerate hosts with broken certificates).
    response = requests.get(url, verify=False, timeout=timeout)
    # Fail on 4xx/5xx here instead of handing an error page to PIL.
    response.raise_for_status()
    return Image.open(BytesIO(response.content))
 
 
def PILImageToCV():
    """Demo: convert a PIL image to an OpenCV (BGR) array and plot both.

    Prints whether the object is a ``numpy.ndarray`` before and after the
    conversion, then shows the PIL image (left) and the converted array
    (right) side by side.
    """
    sample_path = "/Users/waldenz/Documents/Work/doc/TestImages/t3.png"
    pil_image = Image.open(sample_path)

    plt.subplot(121)
    plt.imshow(pil_image)
    print(isinstance(pil_image, np.ndarray))

    rgb_pixels = np.asarray(pil_image)
    bgr_image = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    print(isinstance(bgr_image, np.ndarray))

    # The BGR array is plotted as-is, so its channels are displayed swapped.
    plt.subplot(122)
    plt.imshow(bgr_image)
    plt.show()
 
 
def CVImageToPIL():
    """Demo: convert an OpenCV (BGR) image to a PIL image and plot both.

    Shows the array read by ``cv2.imread`` (left) and the PIL image built
    from its RGB conversion (right) side by side.
    """
    sample_path = "/Users/waldenz/Documents/Work/doc/TestImages/t3.png"
    bgr_image = cv2.imread(sample_path)

    plt.subplot(121)
    plt.imshow(bgr_image)

    rgb_pixels = cv2.cvtColor(bgr_image, cv2.COLOR_BGR2RGB)
    pil_image = Image.fromarray(rgb_pixels)
    plt.subplot(122)
    plt.imshow(pil_image)
    plt.show()
 
def bytes_to_cvimage(filebytes):
    """Decode an image stream into an OpenCV BGR array.

    :param filebytes: anything ``PIL.Image.open`` accepts, e.g. a binary
        file-like object such as ``BytesIO`` wrapping the image bytes.
    :return: 3-channel BGR ``numpy`` array.
    """
    image = Image.open(filebytes)
    # Normalise to 3-channel RGB first: grayscale or palette images decode
    # to 2-D arrays, which the RGB->BGR conversion below cannot handle.
    rgb = np.asarray(image.convert('RGB'))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
 
def runAllImageSimilaryFun(para1, para2):
    """Run every similarity measure on two images, print and plot the results.

    The three hash measures (aHash, dHash, pHash) are Hamming distances:
    smaller means more similar and identical images give 0. The two
    histogram measures lie between 0 and 1: larger means more similar and
    identical images give 1.

    Sample results (aHash, dHash, pHash, 3-channel hist, 1-channel hist):
        t1,t2   14 19 10   0.70 0.75
        t1,t3   39 33 18   0.58 0.49
        s1,s2    7 23 11   0.83 0.86   (quite similar images)
        c1,c2   11 29 17   0.30 0.31

    :param para1: URL (starting with "http") or local path of the first image.
    :param para2: URL (starting with "http") or local path of the second image.
    :raises ValueError: if a local image file cannot be read.
    """

    def load_bgr(source):
        # Load one image as a BGR array; each argument is inspected on its
        # own so a URL and a local path can be mixed.
        if source.startswith("http"):
            pil_image = getImageByUrl(source)
            # convert("RGB") keeps grayscale/palette downloads from breaking
            # the RGB->BGR conversion.
            return cv2.cvtColor(np.asarray(pil_image.convert("RGB")),
                                cv2.COLOR_RGB2BGR)
        image = cv2.imread(source)
        if image is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise ValueError("cannot read image: %s" % source)
        return image

    img1 = load_bgr(para1)
    img2 = load_bgr(para2)

    n1 = cmpHash(aHash(img1), aHash(img2))
    print('均值哈希算法相似度aHash:', n1)

    n2 = cmpHash(dHash(img1), dHash(img2))
    print('差值哈希算法相似度dHash:', n2)

    n3 = cmpHash(pHash(img1), pHash(img2))
    print('感知哈希算法相似度pHash:', n3)

    n4 = classify_hist_with_split(img1, img2)
    print('三直方图算法相似度:', n4)

    n5 = calculate(img1, img2)
    print("单通道的直方图", n5)
    print("%d %d %d %.2f %.2f " % (n1, n2, n3, n4, n5))
    # Second line: hash distances rescaled to a 0-1 similarity (1 - d/64).
    print("%.2f %.2f %.2f %.2f %.2f " % (1 - n1 / 64, 1 - n2 / 64,
                                         1 - n3 / 64, n4, n5))

    plt.subplot(121)
    plt.imshow(Image.fromarray(cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)))
    plt.subplot(122)
    plt.imshow(Image.fromarray(cv2.cvtColor(img2, cv2.COLOR_BGR2RGB)))
    plt.show()
 
if __name__ == "__main__":
    # Compare two sample images fetched by URL.
    p1, p2 = (
        "https://ww3.sinaimg.cn/bmiddle/007INInDly1g336j2zziwj30su0g848w.jpg",
        "https://ww2.sinaimg.cn/bmiddle/007INInDly1g336j10d32j30vd0hnam6.jpg",
    )
    runAllImageSimilaryFun(p1, p2)

         (2)自定义 pHash 类

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib3
urllib3.disable_warnings() # 忽略request警告

import cv2
import numpy as np
import requests
from PIL import Image
from io import BytesIO
import pandas as pd



class PHASH():
    """Perceptual hash (pHash) of an image given as a local path or a URL.

    The class only groups classmethods; instances hold no state.
    """

    # Connection retries and timeout (seconds) used when downloading an image.
    _MAX_RETRIES = 5
    _TIMEOUT = 30

    def __init__(self, *args, **kwargs):
        # Arguments are accepted and ignored.
        pass

    @classmethod
    def get_cv_img(cls, fpath_or_url):
        """Load an image as an OpenCV array.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: tuple ``(image, ndim)``. ``image`` is a 3-channel BGR array
            (``ndim == 3``) or, for a grayscale download, a 2-D array
            (``ndim == 2``).
        :raises ValueError: if a local file cannot be read.
        :raises requests.HTTPError: if the server answers with an error status.
        """
        if fpath_or_url.startswith('http'):
            # 'Connection: close' asks the server not to keep the connection
            # alive after the download.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
                'Connection': 'close'
            }
            # Retries have to be configured on a mounted adapter; assigning
            # requests.adapters.DEFAULT_RETRIES after import does not enable
            # them, because the adapter's default was bound at import time.
            with requests.Session() as session:
                adapter = requests.adapters.HTTPAdapter(max_retries=cls._MAX_RETRIES)
                session.mount('http://', adapter)
                session.mount('https://', adapter)
                # NOTE: verify=False disables TLS certificate checks (kept
                # from the original to tolerate broken certificates).
                response = session.get(fpath_or_url, headers=headers,
                                       verify=False, timeout=cls._TIMEOUT)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))

            if image.mode == 'L':
                # Genuine 8-bit grayscale image: hand it over as a 2-D array.
                image_gray = np.asarray(image)
                return image_gray, image_gray.ndim
            # Every other mode (palette, RGBA, CMYK, ...) is normalised to
            # RGB so the array has exactly three channels before the
            # conversion to OpenCV's BGR layout.
            image_rgb = np.asarray(image.convert('RGB'))
            return cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR), image_rgb.ndim

        # With the default flag (cv2.IMREAD_COLOR) imread returns a 3-channel
        # BGR array of shape (h, w, 3) even for grayscale files; pass
        # cv2.IMREAD_GRAYSCALE to get a single-channel (h, w) array instead.
        image_bgr = cv2.imread(fpath_or_url)
        if image_bgr is None:
            # imread signals a missing or undecodable file by returning None.
            raise ValueError('cannot read image file: {}'.format(fpath_or_url))
        return image_bgr, image_bgr.ndim

    @classmethod
    def get_pHash_array(cls, fpath_or_url):
        """Compute the 64-bit perceptual hash of an image.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: ``int32`` array of shape (64,) holding 1 where a DCT
            coefficient of the top-left 8x8 block exceeds the block mean and
            0 otherwise.
        """
        img, dim = cls.get_cv_img(fpath_or_url)

        # Resize height and width to 32x32; the channel axis is untouched.
        # INTER_LINEAR (bilinear) is also cv2.resize's default interpolation.
        img = cv2.resize(img, (32, 32), interpolation=cv2.INTER_LINEAR)

        if dim == 3:
            # Three channels -> single-channel grayscale, (h, w, c) -> (h, w).
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img

        # cv2.dct operates on floating-point data; result shape is (32, 32).
        dct = cv2.dct(np.float32(gray))

        # Keep the top-left 8x8 block and flatten it to shape (64,).
        dct_roi = dct[0:8, 0:8].reshape(-1)
        hash_array = (dct_roi > np.mean(dct_roi)).astype('int32')
        return hash_array


if __name__ == '__main__':

    # Quick manual check: print the Hamming distance between the pHash
    # fingerprints of two sample images.
    fpath1 = 'https://qcloud.dpfile.com/pc/gHG8ZFfVd9L8-dmljeMtcFj2g5GcEgRHlPARVAHUdNr5F47uz-fJtzd92pRPLfg3aLZ23ABw4IThDHW_mlzUpw.jpg'
    fpath2 = 'https://qcloud.dpfile.com/pc/QFse1QpHtR6vhauRrr-yzly-0r8wPlk4-vEPwUM51gDDzoQjgbK7C3JTY4bHQchlaLZ23ABw4IThDHW_mlzUpw.jpg'

    rry1 = PHASH.get_pHash_array(fpath1)
    rry2 = PHASH.get_pHash_array(fpath2)
    differing_bits = rry1 != rry2
    print(np.sum(differing_bits))

 

 2、ORB特征检测图片相似度

# -*- encoding=utf-8 -*-

import cv2


# Similarity of two images based on ORB feature matching.
def img_similarity(img1_path, img2_path):
    """Return the share of ORB matches that pass the ratio test.

    :param img1_path: path of the first image.
    :param img2_path: path of the second image.
    :return: ``len(good) / len(matches)`` as a float, or the string '0' when
        the similarity cannot be computed (unreadable image, no descriptors,
        no matches).
    """
    try:
        # Read both images as single-channel grayscale.
        img1 = cv2.imread(img1_path, cv2.IMREAD_GRAYSCALE)
        img2 = cv2.imread(img2_path, cv2.IMREAD_GRAYSCALE)

        # Detect ORB keypoints and compute their descriptors.
        orb = cv2.ORB_create()
        kp1, des1 = orb.detectAndCompute(img1, None)
        kp2, des2 = orb.detectAndCompute(img2, None)

        # Brute-force matcher using Hamming distance.
        bf = cv2.BFMatcher(cv2.NORM_HAMMING)

        # Two nearest neighbours per query descriptor.
        matches = bf.knnMatch(des1, trainDescriptors=des2, k=2)

        # Ratio test: keep a match only when the best neighbour is clearly
        # closer than the second best. knnMatch may return fewer than k
        # neighbours for a descriptor, so such entries are skipped instead of
        # breaking the unpacking.
        good = [pair[0] for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.55 * pair[1].distance]
        print(len(good))
        print(len(matches))
        similary = len(good) / len(matches)
        print("两张图片相似度为:%s" % similary)
        return similary

    except Exception:
        # Deliberate best-effort fallback; unlike a bare except this lets
        # KeyboardInterrupt and SystemExit propagate.
        # NOTE(review): the failure value is the string '0' while success
        # returns a float; kept unchanged for existing callers.
        print('无法计算两张图片相似度')
        return '0'


if __name__ == '__main__':

    # Compare two local sample images with the ORB-based measure.
    img1_path, img2_path = (
        r'F:\img_spam\test\7ba.jpg',
        r'F:\img_spam\test\ba.jpg',
    )
    similary = img_similarity(img1_path, img2_path)

 3、SIFT 算法

为了排除因为图像遮挡和背景混乱而产生的无匹配关系的关键点,SIFT的作者Lowe提出了比较最近邻距离与次近邻距离的SIFT匹配
方式:取一幅图像中的一个SIFT关键点,并找出其与另一幅图像中欧氏距离最近的前两个关键点,在这两个关键点中,如果最近的距
离除以次近的距离得到的比率ratio小于某个阈值T,则接受这一对匹配点。因为对于错误匹配,由于特征空间的高维性,在相近的距
离内可能存在大量其他的错误匹配,从而它的ratio值比较高。显然降低这个比例阈值T,SIFT匹配点数目会减少,但更加稳定,反之亦然。

Lowe推荐ratio的阈值为0.8,但作者对大量任意存在尺度、旋转和亮度变化的两幅图片进行匹配,结果表明ratio取值在0.4~0.6
之间最佳,小于0.4的很少有匹配点,大于0.6的则存在大量错误匹配点,所以建议ratio的取值原则如下:
ratio=0.4:对于准确度要求高的匹配;
ratio=0.6:对于匹配点数目要求比较多的匹配;
ratio=0.5:一般情况下。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import urllib3
urllib3.disable_warnings() # 忽略request警告
import cv2
import numpy as np
import requests
from PIL import Image
from io import BytesIO


class SIFT_FLANN():
    """Image similarity based on SIFT descriptors matched with FLANN.

    The class only groups classmethods; instances hold no state.
    """

    # Connection retries and timeout (seconds) used when downloading an image.
    _MAX_RETRIES = 5
    _TIMEOUT = 30

    def __init__(self, *args, **kwargs):
        # Arguments are accepted and ignored.
        pass

    @classmethod
    def get_cv_img(cls, fpath_or_url):
        """Load an image as an OpenCV array.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: tuple ``(image, ndim)``. ``image`` is a 3-channel BGR array
            (``ndim == 3``) or, for a grayscale download, a 2-D array
            (``ndim == 2``).
        :raises ValueError: if a local file cannot be read.
        :raises requests.HTTPError: if the server answers with an error status.
        """
        if fpath_or_url.startswith('http'):
            # 'Connection: close' asks the server not to keep the connection
            # alive after the download.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.129 Safari/537.36',
                'Connection': 'close'
            }
            # Retries have to be configured on a mounted adapter; assigning
            # requests.adapters.DEFAULT_RETRIES after import does not enable
            # them, because the adapter's default was bound at import time.
            with requests.Session() as session:
                adapter = requests.adapters.HTTPAdapter(max_retries=cls._MAX_RETRIES)
                session.mount('http://', adapter)
                session.mount('https://', adapter)
                # NOTE: verify=False disables TLS certificate checks (kept
                # from the original to tolerate broken certificates).
                response = session.get(fpath_or_url, headers=headers,
                                       verify=False, timeout=cls._TIMEOUT)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content))

            if image.mode == 'L':
                # Genuine 8-bit grayscale image: hand it over as a 2-D array.
                image_gray = np.asarray(image)
                return image_gray, image_gray.ndim
            # Every other mode (palette, RGBA, CMYK, ...) is normalised to
            # RGB so the array has exactly three channels before the
            # conversion to OpenCV's BGR layout.
            image_rgb = np.asarray(image.convert('RGB'))
            return cv2.cvtColor(image_rgb, cv2.COLOR_RGB2BGR), image_rgb.ndim

        # Any local path is tried, not only names containing '.jpg'/'.png'.
        # With the default flag imread returns a 3-channel BGR array.
        image_bgr = cv2.imread(fpath_or_url)
        if image_bgr is None:
            # imread signals a missing or undecodable file by returning None.
            raise ValueError('cannot read image file: {}'.format(fpath_or_url))
        return image_bgr, image_bgr.ndim

    @classmethod
    def sift_describe(cls, fpath_or_url):
        """Detect SIFT keypoints of an image and compute their descriptors.

        :param fpath_or_url: local file path, or a URL starting with 'http'.
        :return: tuple ``((keypoints, descriptors), image)`` where ``image``
            is the array returned by ``get_cv_img``.
        """
        img, dim = cls.get_cv_img(fpath_or_url)

        if dim == 3:
            # Three channels -> single-channel grayscale, (h, w, c) -> (h, w).
            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        else:
            gray = img

        # Newer OpenCV builds expose SIFT as cv2.SIFT_create; older ones only
        # provide it through the contrib module cv2.xfeatures2d.
        sift_factory = getattr(cv2, 'SIFT_create', None)
        if sift_factory is None:
            sift_factory = cv2.xfeatures2d.SIFT_create
        sift = sift_factory()

        # kp is a list of keypoints, des a 2-D descriptor array.
        kp, des = sift.detectAndCompute(gray, None)
        return (kp, des), img

    @classmethod
    def cal_sift_similarity(cls, img1_info, img2_info,
                            img_array1=None, img_array2=None):
        """Return the share of SIFT matches that pass the ratio test.

        :param img1_info: ``(keypoints, descriptors)`` of the first image.
        :param img2_info: ``(keypoints, descriptors)`` of the second image.
        :param img_array1: unused; kept for interface compatibility.
        :param img_array2: unused; kept for interface compatibility.
        :return: ``len(good) / len(matches)`` as a float; 0.0 when either
            image has no usable descriptors or nothing could be matched.
        """
        _, des1 = img1_info
        _, des2 = img2_info

        # Descriptors may be None when no keypoints were detected, and the
        # ratio test needs at least two candidates in the second image.
        if des1 is None or des2 is None or len(des2) < 2:
            return 0.0

        # FLANN matcher using KD-trees.
        FLANN_INDEX_KDTREE = 0
        index_params = dict(algorithm=FLANN_INDEX_KDTREE, trees=5)
        search_params = dict(checks=100)  # an empty dict also works

        flann = cv2.FlannBasedMatcher(index_params, search_params)
        matches = flann.knnMatch(des1, des2, k=2)
        if not matches:
            return 0.0

        # Ratio test: keep a match only when the nearest neighbour is clearly
        # closer than the second nearest; entries with fewer than two
        # neighbours are skipped.
        good = [pair[0] for pair in matches
                if len(pair) == 2 and pair[0].distance < 0.55 * pair[1].distance]
        similarity = len(good) / len(matches)
        return similarity


if __name__ == '__main__':

    # Manual check, part 1: compare one reference image with four candidates.
    su_p1 = "https://qcloud.dpfile.com/pc/hXbfmfeK4W9FkcZwXqshIrXySooIPGgEb9uIVs2KW8oPHEiK039L37r0oM6CtGq9aLZ23ABw4IThDHW_mlzUpw.jpg"
    p2 = "https://qcloud.dpfile.com/pc/nxetlAzTR_7ARUZjAw98l3S5Nku5CCTct6lFTSAnqzgq_97ET71kayhMglYH7dIhaLZ23ABw4IThDHW_mlzUpw.jpg"
    p3 = 'https://qcloud.dpfile.com/pc/fVfbhCulsdsrzyCRMlF1qBndGogGhBwQSoFJG7_uBtMioGNY_iAGzVZOnx4ayKIqaLZ23ABw4IThDHW_mlzUpw.jpg'
    p4 = 'https://qcloud.dpfile.com/pc/itfWTAlTRmjNDIr-jTv1QkEtPFnjlNvkF9iLQj35azGtPI2pUtZjhpqN7loP26AeaLZ23ABw4IThDHW_mlzUpw.jpg'
    p5 = 'https://qcloud.dpfile.com/pc/2Q_ByqDIN63u2iuG3eImgr30TWxVV-0_4b4NXJ1Nj9L0yNmV522HRzWUEZZoF51daLZ23ABw4IThDHW_mlzUpw.jpg'

    report = '{}\n{}\n similarity:{}'

    list_ = [p2, p3, p4, p5]
    img1_info, img_array1 = SIFT_FLANN.sift_describe(su_p1)
    for p in list_:
        img2_info, img_array2 = SIFT_FLANN.sift_describe(p)
        score = SIFT_FLANN.cal_sift_similarity(img1_info, img2_info, img_array1, img_array2)
        print(report.format(su_p1, p, score))

    print('=' * 150)

    # Part 2: compare a separate pair of images.
    fpath1 = 'https://qcloud.dpfile.com/pc/gHG8ZFfVd9L8-dmljeMtcFj2g5GcEgRHlPARVAHUdNr5F47uz-fJtzd92pRPLfg3aLZ23ABw4IThDHW_mlzUpw.jpg'
    fpath2 = 'https://qcloud.dpfile.com/pc/QFse1QpHtR6vhauRrr-yzly-0r8wPlk4-vEPwUM51gDDzoQjgbK7C3JTY4bHQchlaLZ23ABw4IThDHW_mlzUpw.jpg'

    img1_info, img_array1 = SIFT_FLANN.sift_describe(fpath1)
    img2_info, img_array2 = SIFT_FLANN.sift_describe(fpath2)
    score = SIFT_FLANN.cal_sift_similarity(img1_info, img2_info, img_array1, img_array2)
    print(report.format(fpath1, fpath2, score))