有一个基于 Jaccard 相似度的 UDF 函数(两个字符串字符集合的交集大小 / 并集大小,大于 0.6 时返回 true):

/**
 * Spark UDF that tells whether two strings are similar under the Jaccard index of
 * their character sets.
 *
 * Despite the name, the quantity computed is the Jaccard *similarity*
 * (|A ∩ B| / |A ∪ B| over the distinct characters of each string), not the distance.
 * The UDF returns `true` when that similarity is strictly greater than 0.6.
 *
 * Edge cases:
 *  - If either input is null, the result is `false` (instead of a NullPointerException
 *    thrown from inside the UDF).
 *  - If both strings are empty, the ratio is 0 / 0 = NaN, which is never greater than
 *    the threshold, so the result is `false`.
 */
def jaccardDistance = udf { (string1: String, string2: String) =>
  if (string1 == null || string2 == null) {
    // A SQL NULL arrives here as a null reference; treat it as "not similar".
    false
  } else {
    val chars1: Set[Char] = string1.toSet
    val chars2: Set[Char] = string2.toSet
    // Compute in Double: with Float, a ratio of exactly 3/5 becomes 0.6f, which widens
    // to ~0.60000002 and would wrongly pass the strict `> 0.6` comparison.
    val intersectSize = chars1.intersect(chars2).size.toDouble
    val unionSize = chars1.union(chars2).size.toDouble
    intersectSize / unionSize > 0.6
  }
}

需要用这个 UDF 作为 join 条件,对两张表做模糊匹配:

// Join the two DataFrames using the boolean result of the jaccardDistance UDF as the
// join condition.
// NOTE(review): this is not an equality condition, so Spark presumably has to evaluate
// the UDF for every pair of rows — consistent with the job never finishing; confirm
// against the physical plan.
var resultDF = joinDF1.join(
  joinDF2,
  jaccardDistance(joinDF1("input_column1"), joinDF2("input_column2"))
)

由于 join 条件是 UDF 而不是等值条件,实际上需要对两张表的每一对行都调用一次 UDF,数据量大时完全跑不完。
近似解决方案:预处理出 join 的 key,把模糊匹配转化为等值 join。

/**
 * Returns a new array holding the elements of `a` with the element at index `i` removed.
 * The input array itself is not modified.
 *
 * @param a source characters
 * @param i zero-based index of the element to drop; an out-of-range index is rejected
 *          by the underlying buffer's `remove`
 * @return a fresh array one element shorter than `a`
 */
def removeFromArray(a: Array[Char], i: Int): Array[Char] = {
  // Copy into a growable buffer so the removal never touches the caller's array.
  val remaining = scala.collection.mutable.ArrayBuffer.empty[Char]
  remaining ++= a
  remaining.remove(i)
  remaining.toArray
}

// Deduplicate the characters of the input, sort them, and emit every prefix of the
// sorted string (shortest first) as a candidate join key paired with the original value.
// NOTE(review): the original note claims these prefixes cover every case; this is an
// approximation of the Jaccard match — confirm the recall is acceptable.
val sortedChars = (input_column).toSet.toArray.sorted.mkString
val distinctCount = sortedChars.length
// Prefix lengths start at 3 (the minimum key length), so an input with fewer than
// 3 distinct characters contributes no join key at all.
(3 to distinctCount).foreach { prefixLength =>
  result.append((input_column, sortedChars.take(prefixLength)))
}
result.iterator

两张表都按上述方式生成 joinKey 列之后,直接用 joinKey 做等值 join:

// Equi-join on the precomputed "joinKey" column, which must be present in both DataFrames.
var resultDF = joinDF1.join(joinDF2, Seq("joinKey"))