1、算法介绍

Soundex是一种语音算法,利用英文单词的读音计算近似值,值由四个字符构成,第一个字符为英文字母,后三个为数字。在拼音文字中,有时会出现会念但拼不出正确写法的情形,此时可用Soundex实现类似模糊匹配的效果。例如Knuth和Kant两个字符串,它们的Soundex值都是“K530”。计算机科学大师高德纳的名著《计算机程序设计艺术》中对该算法有详细的介绍。

2、算法简要说明
  • 第一步:保留第一个字母,去掉第一个字母之后的所有的a, e, i, o, u, y, h, w;
  • 第二步:将英文字按以下规则替换(除第一个字符外):

b f p v -> 1
c g j k q s x z -> 2
d t -> 3
l -> 4
m n -> 5
r -> 6

  • 第三步:对于相邻的重复的数字只保留一个,即相邻的两个被替换为同一个数字的字母只保留一个;
  • 第四步:保留第一个字母后的三位数字,若不足三位则以0补足。
3、Python实现
  • 算法代码
import threading

# Author tag for this script.
__author = 'cyy'

# Version note: this version encodes one word per input.
__Version = '1.0 : for one word'

# Thread-local namespace; the functions in this file store the My_Soundex
# instances they create as attributes on it (local.obj, local.test, ...).
local=threading.local()

class My_Soundex(object):
    """Simplified Soundex encoder for a single English word.

    The code is built in four steps:

    1. Keep the first letter and drop every later A, E, I, O, U, Y, H, W.
    2. Map each remaining letter to a digit through ``Map``.
    3. Collapse runs of adjacent equal digits into a single digit.
    4. Pad with ``'0'`` or truncate so the code is ``size`` characters long.

    Because step 1 runs before step 3, two letters with the same digit that
    were separated only by a dropped letter are merged, and the digit of the
    first letter itself is never compared with the digit that follows it
    (e.g. ``"Pfister"`` encodes to ``"P123"``).  Other Soundex variants may
    give different codes for such words.

    Only the letters ``A``-``Z`` (after upper-casing) are considered; every
    other character is ignored, so ``"long term"`` is treated as the single
    word ``"LONGTERM"``.
    """

    def __init__(self, string):
        """Store the word to encode.

        :param string: raw user input; it is stripped and upper-cased here.
        """
        # Normalised input: surrounding whitespace removed, upper-cased.
        self.string = string.strip().upper()
        # Letters that are dropped when they occur after the first letter.
        self.__noise = ['A', 'E', 'I', 'O', 'U', 'Y', 'H', 'W']
        # Last digit emitted by the most recent Soundex() call (None before
        # the first call or when no digit was emitted).
        self.preNum = None
        # Digit for each letter, indexed by ``ord(letter) - ord('A')``.
        self.Map = [
            # A    B    C    D    E    F    G    H    I    J    K    L    M
            '0', '1', '2', '3', '0', '1', '2', '0', '0', '2', '2', '4', '5',
            # N    O    P    Q    R    S    T    U    V    W    X    Y    Z
            '5', '0', '1', '2', '6', '2', '3', '0', '1', '0', '2', '0', '2',
        ]
        # Code produced by the most recent Soundex() call.
        self.__result = ''

    def Del_Noise(self):
        """Strip non-letters and the ignorable letters after the first one.

        ``self.string`` is reduced to its ``A``-``Z`` letters as a side
        effect.

        :return: the first letter followed by the remaining letters that are
            not in A, E, I, O, U, Y, H, W; or the string ``'Wrong'`` when the
            input contains no ``A``-``Z`` letter at all.  The sentinel cannot
            collide with a real word because words are upper-cased.
        """
        # Restrict to A-Z rather than str.isalpha(): isalpha() also accepts
        # non-ASCII letters, which have no entry in ``Map``.
        letters = ''.join(ch for ch in self.string if 'A' <= ch <= 'Z')
        self.string = letters
        if not letters:
            return 'Wrong'

        kept_tail = ''.join(ch for ch in letters[1:] if ch not in self.__noise)
        return letters[0] + kept_tail

    def __call__(self, *args, **kwargs):
        """Shortcut for :meth:`Del_Noise`; any arguments are ignored."""
        return self.Del_Noise()

    @property
    def result(self):
        """Code computed by the most recent :meth:`Soundex` call ('' before)."""
        return self.__result

    def Soundex(self, size=4):
        """Encode the word and return its Soundex code.

        The method may be called repeatedly (also with different sizes); each
        call recomputes the code from scratch.

        :param size: length of the returned code, 4 by default.
        :return: the first letter followed by digits, padded with ``'0'`` or
            truncated to ``size`` characters.
        :raises ValueError: if the input contains no ``A``-``Z`` letter.
        """
        word = self.Del_Noise()
        if word == 'Wrong':
            # Without this guard the sentinel itself would be encoded and
            # the lookup in ``Map`` would fail with an IndexError.
            raise ValueError('the input contains no English letters')
        self.string = word

        digits = []
        previous = None
        for letter in word[1:]:
            digit = self.Map[ord(letter) - ord('A')]
            # Adjacent letters that map to the same digit are kept once.
            if digit != previous:
                digits.append(digit)
                previous = digit
        self.preNum = previous

        code = word[0] + ''.join(digits)
        # ljust pads short codes; the slice truncates long ones.
        self.__result = code.ljust(size, '0')[:size]
        return self.result
  • Main方法:用于交互及算法使用
def Main():
    '''
    Interactive entry point.

    Repeatedly asks for a word and a result size, prints the de-noised word
    and its Soundex code, then offers a menu: "1" starts over, "3" runs the
    cosine-similarity comparison, "4" runs the edit-distance comparison and
    anything else ends the session.
    :param:
    :return: None
    '''
    while True:
        try:
            your_word = input('Please input your word:')
            # Requested length of the code, usually 4.
            your_size = int(input('Please input the size of the result you want:'))
        except ValueError:
            # Not an integer size: ask for both values again.
            continue

        local.obj = My_Soundex(your_word)
        # Calling the instance runs Del_Noise(); 'Wrong' means no letters.
        de_noise = local.obj()
        if de_noise == 'Wrong':
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            print('please input English word which is right')
            continue

        print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')
        print(f'Your input is {your_word}')
        print(f'After de-noising,your word becomes "{de_noise}"')
        result = local.obj.Soundex(size=your_size)
        print(f'The result is {result}')
        print('>>>>>>>>>>>>>> End       >>>>>>>>>>')

        print('if you want to continue to input,input "1" ; if don’t, input "2" ; ')
        print('Want to compare cos_similarity of two words\' soundex results,input "3"!! ')
        print('Want to get min edit distance of two words\' soundex results,input "4"!!')

        want_continue = input('your input:')
        if want_continue == '1':
            continue
        if want_continue == '3':
            print('>>>>>>>>>>>>>> cos_similarity  : for two words with the same initials >>>>>>>>>>>>>>>>>>>>>>>>')
            compare1()
        elif want_continue == '4':
            print('>>>>>>>>>>>>>> min edit distance : for two words with the same initials >>>>>>>>>>>>>>>>>>>>>>>>')
            compare2()
        return None



def compare1():
    '''
    Prompt for a size and two words, print the cosine similarity of their
    Soundex codes, then ask what to do next: "2" stops, "1" goes back to
    Main(), anything else runs another comparison.
    :param:
    :return: None, or whatever Main() returns when "1" is chosen
    '''
    while True:
        c_size = size_process()
        print('Please input two words with the same initials!!!')
        in_first = process_input('first')
        in_second = process_input('second')
        print(Cosine_similarity(first=in_first, second=in_second, size=c_size))
        print('Don\'t Want to continue comparing? input "2";Want to get the Soundex result of your word? input "1" ')
        check = input('Your input : ')
        if check == '2':
            return None
        if check == '1':
            return Main()

def compare2():
    '''
    Prompt for a size and two words, print the minimum edit distance of
    their Soundex codes, then ask what to do next: "2" stops, "1" goes back
    to Main(), anything else runs another comparison.
    :param:
    :return: None, or whatever Main() returns when "1" is chosen
    '''
    while True:
        c_size = size_process()
        in_first = process_input('first')
        in_second = process_input('second')
        print(run(first=in_first, second=in_second, size=c_size))
        print('Don\'t Want to continue comparing? input "2";Want to get the Soundex result of your word? input "1" ')
        check = input('Your input : ')
        if check == '2':
            return None
        if check == '1':
            return Main()

def size_process():
    '''
    Ask for the comparison size until the answer parses as an integer.
    :params:
    :return: the size entered by the user, as an int
    '''
    while True:
        try:
            return int(input('Please input the size you want:'))
        except ValueError:
            # Not an integer: prompt again.
            continue


def process_input(num):
    '''
    Ask for one of the two words to compare until it contains a letter.
    :param num: label shown in the prompt, e.g. 'first' or 'second'
    :return: the raw word exactly as the user typed it
    '''
    while True:
        try:
            compare_input = input(f'Please input the {num} word:')
        except ValueError:
            continue
        local.test = My_Soundex(compare_input)
        # Calling the instance runs Del_Noise(); 'Wrong' means no letters.
        if local.test() != 'Wrong':
            return compare_input
        print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        print('please input English word which is right')

if __name__ == '__main__':
    # NOTE(review): in this listing the guard sits above the definitions of
    # Cosine_similarity() and run(); if the listing is executed as a single
    # file in this order, menu options "3" and "4" would hit a NameError.
    # Confirm the intended file layout.
    for banner_line in (
        '(The system of this version only supports one word!)',
        'If your input is like "long term",the system will process it as a single word "longterm"!',
    ):
        print(banner_line)
    Main()
  • 测试:用来与wikipedia上Soundex算法的例子及例子输出结果比较
def test():
    '''
    Self-check against the example words of the Wikipedia Soundex entry
    (https://zh.wikipedia.org/wiki/Soundex).

    Prints the test words, the expected codes and the codes produced by
    My_Soundex, followed by a pass message or, on a mismatch, one line per
    differing word and a failure message.
    :param:
    :return: None
    '''
    test_words = ['Knuth', 'Kant', 'Jarovski', 'Resnik', 'Reznick', 'Euler', 'Peterson', 'Jefferson']

    print(f'测试用例列表:{test_words}')

    result_list = ['K530', 'K530', 'J612', 'R252', 'R252', 'E460', 'P362', 'J162']

    print(f'\n对应官方结果列表:{result_list}')

    my_answer_list = [My_Soundex(word).Soundex(size=4) for word in test_words]

    print(f'\n系统输出结果为  :{my_answer_list}')

    if result_list == my_answer_list:
        print('\n系统结果与官方结果一致,测试通过!!')
        return

    # Report every mismatch instead of finishing silently.
    for word, expected, actual in zip(test_words, result_list, my_answer_list):
        if expected != actual:
            print(f'\n{word}: 期望 {expected},实际 {actual}')
    print('\n系统结果与官方结果不一致,测试未通过!!')
  • 相似度比较:相同的首字母
def Cosine_similarity(first, second, size):
    '''
    Compute the cosine similarity of the Soundex codes of two words.

    Both words are encoded with My_Soundex at the given size and both codes
    are printed.  Only the digits after the leading letter are compared, as
    integer vectors; the leading letters are not part of the computation, so
    the score is meant for words with the same initial.
    :param first: first word
    :param second: second word
    :param size: length of the Soundex codes to compare
    :return: cosine similarity as a float; 0.0 when it is undefined because
        one of the digit vectors is all zeros or empty (e.g. code "A000",
        or size 1)
    '''
    import math

    local.cos1 = My_Soundex(first)
    local.cos2 = My_Soundex(second)

    print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')
    result1 = local.cos1.Soundex(size=size)
    print(f'The first result is "{result1}"')
    result2 = local.cos2.Soundex(size=size)
    print(f'The second result is "{result2}"')

    # Digit vectors: everything after the leading letter.
    digits1 = [int(ch) for ch in result1[1:]]
    digits2 = [int(ch) for ch in result2[1:]]

    dot = sum(a * b for a, b in zip(digits1, digits2))
    norm1 = math.sqrt(sum(a * a for a in digits1))
    norm2 = math.sqrt(sum(b * b for b in digits2))

    # A zero vector has no direction; avoid the division by zero.
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)
  • 最小编辑距离
def min_edit_distance(word1, word2):
    '''
    Compute the minimum edit distance between two sequences, where an
    insertion, a deletion and a substitution each cost 1.

    Implemented as a row-by-row dynamic programme over plain Python lists,
    so no third-party package is needed and all arithmetic stays in ints.
    :param word1: first string
    :param word2: second string
    :return: the edit distance as an int
    '''
    # previous[j] = distance between the processed prefix of word1 and
    # word2[:j]; for the empty prefix that is j insertions.
    previous = list(range(len(word2) + 1))
    for i, ch1 in enumerate(word1, start=1):
        # Turning word1[:i] into the empty string takes i deletions.
        current = [i]
        for j, ch2 in enumerate(word2, start=1):
            delta = 0 if ch1 == ch2 else 1
            current.append(min(
                previous[j - 1] + delta,  # substitute or match
                previous[j] + 1,          # delete from word1
                current[j - 1] + 1,       # insert into word1
            ))
        previous = current
    return previous[-1]

def run(first, second, size):
    '''
    Encode two words with My_Soundex at the given size, print both codes
    and report the minimum edit distance between the two codes.
    :param first: first word
    :param second: second word
    :param size: length of the Soundex codes to compare
    :return: a sentence containing the edit distance
    '''
    local.edit1 = My_Soundex(first)
    local.edit2 = My_Soundex(second)

    print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')
    code_a = local.edit1.Soundex(size=size)
    print(f'The first result is "{code_a}"')
    code_b = local.edit2.Soundex(size=size)
    print(f'The second result is "{code_b}"')

    # The distance is taken over the complete codes, leading letter included.
    distance = min_edit_distance(code_a, code_b)
    return 'The min edit distance of these two words is ' + str(distance)