1、算法介绍
Soundex是一种语音算法,利用英文单词的读音计算近似值,该值由四个字符构成:第一个字符为英文字母,后三个为数字。在拼音文字中,有时会出现会念但拼不出正确单词的情形,这时可用Soundex实现类似模糊匹配的效果。例如Knuth和Kant两个字符串,它们的Soundex值都是“K530”。该算法在计算机科学大师高德纳(Donald Knuth)的名著《计算机程序设计艺术》中有详细的介绍。
2、算法简要说明
- 第一步:保留第一个字母,去掉第一个字母之后的所有的a, e, i, o, u, y, h, w;
- 第二步:将英文字母按以下规则替换为数字(第一个字符除外):
b f p v -> 1
c g j k q s x z -> 2
d t -> 3
l -> 4
m n -> 5
r -> 6
- 第三步:对于相邻的重复的数字只保留一个,即相邻的两个被替换为同一个数字的字母只保留一个;
- 第四步:保留第一个字母后的三位数字,若不足三位则以0补足。
3、Python实现
- 算法代码:
import threading
# Author of this script.
__author = 'cyy'
# Version note: this release encodes one word at a time.
__Version = '1.0 : for one word'
# Thread-local namespace; the interactive helpers store their My_Soundex
# instances on it (local.obj, local.test, local.cos1, ...).
local=threading.local()
class My_Soundex(object):
    """Simplified Soundex encoder for a single English word.

    The algorithm implemented here is the simplified variant:

    1. keep the first letter and drop every later A, E, I, O, U, Y, H, W;
    2. replace each remaining letter (except the first) by its digit;
    3. collapse runs of identical adjacent digits into one;
    4. pad with '0' or truncate so the code is exactly ``size`` characters.

    Only ASCII letters take part in the encoding; every other character
    (digits, spaces, punctuation, accented or non-Latin letters) is discarded.
    """

    def __init__(self, string):
        """Store the word to encode.

        :param string: raw user input; it is stripped and upper-cased.
        """
        self.string = string.strip().upper()
        # Letters that are removed when they appear after the first letter.
        self.__noise = ['A', 'E', 'I', 'O', 'U', 'Y', 'H', 'W']
        # Digit of the last letter encoded by the most recent Soundex() call.
        self.preNum = None
        # Letter-to-digit table, indexed by ord(letter) - ord('A').
        self.Map = [#A   B   C   D   E   F   G   H   I   J   K   L   M
                    '0','1','2','3','0','1','2','0','0','2','2','4','5',
                    #N   O   P   Q   R   S   T   U   V   W   X   Y   Z
                    '5','0','1','2','6','2','3','0','1','0','2','0','2']
        # Code produced by the most recent Soundex() call.
        self.__result = ''

    def Del_Noise(self):
        """Strip everything that does not take part in the encoding.

        Non-letter characters are removed from ``self.string`` itself; the
        returned value additionally has the noise letters removed after the
        first letter.

        :return: the de-noised word, or the string 'Wrong' when the input
                 contains no ASCII letter at all.
        """
        # Restrict to A-Z: str.isalpha() also accepts letters such as 'Ü'
        # whose code point lies outside the 26-entry Map.
        self.string = ''.join(ch for ch in self.string if 'A' <= ch <= 'Z')
        if not self.string:
            return 'Wrong'
        head, tail = self.string[0], self.string[1:]
        return head + ''.join(ch for ch in tail if ch not in self.__noise)

    def __call__(self, *args, **kwargs):
        """Shortcut for :meth:`Del_Noise`."""
        return self.Del_Noise()

    @property
    def result(self):
        """Code produced by the most recent Soundex() call ('' before any)."""
        return self.__result

    def Soundex(self, size=4):
        """Compute the Soundex code of the stored word.

        The method keeps no state between calls, so it may be called
        repeatedly (also with different sizes) on the same instance.

        :param size: length of the returned code, 4 by default.
        :return: the code, or 'Wrong' when the word contains no ASCII letter.
        """
        cleaned = self.Del_Noise()
        if cleaned == 'Wrong':
            # Upper-cased input can never equal 'Wrong', so this is always
            # the "no letters" sentinel and must not be encoded.
            self.preNum = None
            self.__result = ''
            return 'Wrong'
        self.string = cleaned

        digits = []
        previous = None
        for letter in cleaned[1:]:
            code = self.Map[ord(letter) - ord('A')]
            # Adjacent letters sharing a digit contribute that digit once.
            if code != previous:
                digits.append(code)
                previous = code
        self.preNum = previous

        encoded = cleaned[0] + ''.join(digits)
        # Pad with zeros when too short, cut from the left when too long.
        self.__result = encoded.ljust(size, '0')[:size]
        return self.__result
- Main方法:用于交互及算法使用
def Main():
    """Run the interactive Soundex prompt.

    Asks for a word and a result size, prints the de-noised word and its
    Soundex code, then shows a menu: "1" encodes another word, "3" hands over
    to compare1(), "4" hands over to compare2(), anything else ends.
    A size that is not an integer, or a word rejected by Del_Noise(), restarts
    the prompt from the word question.

    Retries are done with a loop instead of calling Main() again, so a long
    session cannot exhaust the recursion limit.

    :return: None
    """
    while True:
        # Word typed by the user.
        your_word = input('Please input your word:')
        try:
            # Requested length of the code, usually 4.
            your_size = int(input('Please input the size of the result you want:'))
        except ValueError:
            continue

        local.obj = My_Soundex(your_word)
        # Calling the instance runs Del_Noise(); 'Wrong' means no usable letter.
        de_noise = local.obj()
        if de_noise == 'Wrong':
            print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
            print('please input English word which is right')
            continue

        print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')
        print(f'Your input is {your_word}')
        print(f'After de-noising,your word becomes "{de_noise}"')
        result = local.obj.Soundex(size=your_size)
        print(f'The result is {result}')
        print('>>>>>>>>>>>>>> End >>>>>>>>>>')
        print('if you want to continue to input,input "1" ; if don’t, input "2" ; ')
        print('Want to compare cos_similarity of two words\' soundex results,input "3"!! ')
        print('Want to get min edit distance of two words\' soundex results,input "4"!!')
        want_continue = input('your input:')
        if want_continue == '1':
            continue
        if want_continue == '3':
            print('>>>>>>>>>>>>>> cos_similarity : for two words with the same initials >>>>>>>>>>>>>>>>>>>>>>>>')
            compare1()
        elif want_continue == '4':
            print('>>>>>>>>>>>>>> min edit distance : for two words with the same initials >>>>>>>>>>>>>>>>>>>>>>>>')
            compare2()
        return
def compare1():
    """Interactively print the cosine similarity of two words' Soundex codes.

    Reads a code size and two words, prints the value returned by
    Cosine_similarity(), then asks what to do next: "2" stops, "1" switches
    to Main(), any other answer starts another comparison.

    Further comparisons are done with a loop instead of calling compare1()
    again, so repeated use does not deepen the call stack.

    :return: None
    """
    while True:
        c_size = size_process()
        print('Please input two words with the same initials!!!')
        in_first = process_input('first')
        in_second = process_input('second')
        print(Cosine_similarity(first=in_first, second=in_second, size=c_size))
        print('Don\'t Want to continue comparing? input "2";Want to get the Soundex result of your word? input "1" ')
        check = input('Your input : ')
        if check == '2':
            return None
        if check == '1':
            return Main()
def compare2():
    """Interactively print the minimum edit distance of two words' Soundex codes.

    Reads a code size and two words, prints the message returned by run(),
    then asks what to do next: "2" stops, "1" switches to Main(), any other
    answer starts another comparison.

    Further comparisons are done with a loop instead of calling compare2()
    again, so repeated use does not deepen the call stack.

    :return: None
    """
    while True:
        c_size = size_process()
        in_first = process_input('first')
        in_second = process_input('second')
        print(run(first=in_first, second=in_second, size=c_size))
        print('Don\'t Want to continue comparing? input "2";Want to get the Soundex result of your word? input "1" ')
        check = input('Your input : ')
        if check == '2':
            return None
        if check == '1':
            return Main()
def size_process():
    """Prompt until the user enters an integer code size.

    The prompt is repeated in a loop (not by recursion), so any number of
    invalid entries is tolerated.

    :return: the size entered by the user, as an int.
    """
    while True:
        try:
            return int(input('Please input the size you want:'))
        except ValueError:
            # Not an integer: ask again.
            continue
def process_input(num):
    """Prompt for one of the two words to compare until it is usable.

    A word is accepted when My_Soundex(...)() does not return 'Wrong'.
    The raw text typed by the user is returned, not the de-noised form.

    The prompt is repeated in a loop rather than by recursion. Errors raised
    by input() itself are not swallowed: retrying cannot succeed when the
    input stream is unusable.

    :param num: ordinal shown in the prompt, e.g. 'first' or 'second'.
    :return: the accepted word exactly as typed.
    """
    while True:
        compare_input = input(f'Please input the {num} word:')
        local.test = My_Soundex(compare_input)
        de_noise = local.test()
        if de_noise != 'Wrong':
            return compare_input
        # Nothing usable in the input: explain and ask again.
        print('>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>')
        print('please input English word which is right')
if __name__ == '__main__':
    # Show the usage notes, then start the interactive session.
    for banner_line in (
        '(The system of this version only supports one word!)',
        'If your input is like "long term",the system will process it as a single word "longterm"!',
    ):
        print(banner_line)
    Main()
- 测试:用来与wikipedia上Soundex算法的例子及例子输出结果比较
def test():
    """Compare My_Soundex against the examples of the Wikipedia Soundex entry.

    Reference: https://zh.wikipedia.org/wiki/Soundex

    Prints the test words, the expected codes and the computed codes, then
    reports whether they agree. A mismatch is reported explicitly instead of
    ending without any output.

    :return: None
    """
    test_words = ['Knuth','Kant','Jarovski','Resnik','Reznick','Euler','Peterson','Jefferson']
    print(f'测试用例列表:{test_words}')
    result_list = ['K530', 'K530', 'J612', 'R252', 'R252', 'E460', 'P362', 'J162']
    print(f'\n对应官方结果列表:{result_list}')
    # A fresh encoder per word, always with the standard code length of 4.
    my_answer_list = [My_Soundex(word).Soundex(size=4) for word in test_words]
    print(f'\n系统输出结果为 :{my_answer_list}')
    if result_list == my_answer_list:
        print('\n系统结果与官方结果一致,测试通过!!')
    else:
        print('\n系统结果与官方结果不一致,测试未通过!!')
- 相似度比较:相同的首字母
def Cosine_similarity(first, second, size):
    """Return the cosine similarity of the digit parts of two Soundex codes.

    Both words are encoded with My_Soundex(...).Soundex(size=size) and the
    codes are printed. The leading letter of each code is ignored; the
    remaining digits are treated as integer vectors.

    When a vector has zero length (all digits are 0, or size is 1 so there
    are no digits) the cosine is undefined; in that case 1.0 is returned if
    both digit parts are identical and 0.0 otherwise, instead of failing
    with ZeroDivisionError.

    :param first: first word.
    :param second: second word.
    :param size: length of the Soundex codes to compare.
    :return: the similarity as a float.
    """
    import math

    local.cos1 = My_Soundex(first)
    local.cos2 = My_Soundex(second)
    print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')
    result1 = local.cos1.Soundex(size=size)
    print(f'The first result is "{result1}"')
    result2 = local.cos2.Soundex(size=size)
    print(f'The second result is "{result2}"')

    # Everything after the leading letter is a digit.
    digits1 = [int(digit) for digit in result1[1:]]
    digits2 = [int(digit) for digit in result2[1:]]

    dot_product = sum(a * b for a, b in zip(digits1, digits2))
    norm1 = math.sqrt(sum(a * a for a in digits1))
    norm2 = math.sqrt(sum(b * b for b in digits2))
    if norm1 == 0 or norm2 == 0:
        # Undefined cosine: fall back to plain equality of the digit parts.
        return 1.0 if digits1 == digits2 else 0.0
    return dot_product / (norm1 * norm2)
- 最小编辑距离
def min_edit_distance(word1, word2):
    """Return the Levenshtein distance between two strings.

    The distance is the minimum number of single-character insertions,
    deletions and substitutions that turn ``word1`` into ``word2``.
    Only two rows of the dynamic-programming table are kept, using plain
    integers, so no third-party package is required.

    :param word1: first string.
    :param word2: second string.
    :return: the edit distance as an int.
    """
    # Row 0: turning the empty prefix of word1 into word2[:j] costs j inserts.
    previous_row = list(range(len(word2) + 1))
    for i, char1 in enumerate(word1, start=1):
        # Column 0: turning word1[:i] into the empty string costs i deletes.
        current_row = [i]
        for j, char2 in enumerate(word2, start=1):
            substitute = previous_row[j - 1] + (0 if char1 == char2 else 1)
            delete = previous_row[j] + 1
            insert = current_row[j - 1] + 1
            current_row.append(min(substitute, delete, insert))
        previous_row = current_row
    return previous_row[-1]
def run(first, second, size):
    """Report the minimum edit distance between two words' Soundex codes.

    Both words are encoded with the requested code length, each code is
    printed, and the distance between the two complete codes (leading letter
    included) is computed with min_edit_distance().

    :param first: first word.
    :param second: second word.
    :param size: length of the Soundex codes to compare.
    :return: a sentence stating the distance.
    """
    local.edit1, local.edit2 = My_Soundex(first), My_Soundex(second)
    print('>>>>>>>>>>>>>> Output : >>>>>>>>>>>>>>>>>>>>>>>>')

    result1 = local.edit1.Soundex(size=size)
    print(f'The first result is "{result1}"')

    result2 = local.edit2.Soundex(size=size)
    print(f'The second result is "{result2}"')

    distance = min_edit_distance(result1, result2)
    return 'The min edit distance of these two words is ' + str(distance)