一、 在Java中对内存中的数据进行去重,一般有如下几种处理方法:
1. ArrayList去重
实现原理:contains方法顺序遍历列表,通过equals方法逐个比较元素是否相等,以此判断是否重复
JDK源代码:
/**
 * Reports whether {@code o} is present in this list.
 *
 * @param o the element to look for; may be {@code null}
 * @return {@code true} when {@code indexOf(o)} yields a non-negative index
 */
public boolean contains(Object o) {
return indexOf(o) >= 0;
}
/**
 * Finds the first position of {@code o} by scanning the first {@code size}
 * slots of {@code elementData} from index 0 upward.
 *
 * @param o the element to look for; may be {@code null}
 * @return the index of the first match, or {@code -1} when no slot matches
 */
public int indexOf(Object o) {
// A null argument cannot call equals, so it is matched by comparing each slot with null.
if (o == null) {
for (int i = 0; i < size; i++)
if (elementData[i]==null)
return i;
} else {
// Non-null arguments are matched with o.equals(element), one slot at a time.
for (int i = 0; i < size; i++)
if (o.equals(elementData[i]))
return i;
}
return -1;
}
2. HashSet去重
实现原理:HashSet内部使用HashMap存储数据,元素作为HashMap的key保存,利用HashMap中key不重复的特性实现去重
JDK源码
// Backing map: every element of the set is stored as a key of this map.
private transient HashMap<E,Object> map;
/**
 * Adds {@code e} by putting it into the backing map as a key.
 * NOTE(review): PRESENT is declared outside this excerpt; presumably a shared placeholder value.
 *
 * @param e the element to add
 * @return {@code true} when {@code map.put} returned {@code null}, i.e. it reported no previous value for this key
 */
public boolean add(E e) {
return map.put(e, PRESENT)==null;
}
3. HashMap去重
实现原理:先比较key的hash值是否一致,如果一致再通过==或equals方法比较key是否相等;相等则覆盖旧值,不会新增条目
JDK源码
/**
 * Stores {@code value} under {@code key}, replacing the value of an existing
 * entry with an equal key instead of adding a second entry.
 *
 * @param key the key; a {@code null} key is delegated to {@code putForNullKey}
 * @param value the value to store
 * @return the value previously stored under an equal key, or {@code null}
 *         when a new entry was added (null keys: whatever {@code putForNullKey} returns)
 */
public V put(K key, V value) {
// Null keys cannot call hashCode(); they take a separate path (not shown in this excerpt).
if (key == null)
return putForNullKey(value);
// Derive the hash from key.hashCode() and map it to a bucket index in the table.
int hash = hash(key.hashCode());
int i = indexFor(hash, table.length);
// Walk the chain of entries in that bucket via e.next.
for (Entry<K,V> e = table[i]; e != null; e = e.next) {
Object k;
// Match: same stored hash AND (same reference OR key.equals(existing key)).
if (e.hash == hash && ((k = e.key) == key || key.equals(k))) {
// Existing key: overwrite the value in place and hand back the old one.
V oldValue = e.value;
e.value = value;
e.recordAccess(this);
return oldValue;
}
}
// No matching entry in the bucket: count the modification and add a new entry.
modCount++;
addEntry(hash, key, value, i);
return null;
}
4. BloomFilter去重
实现原理:用多个哈希函数把每个值映射到BitSet中的若干个位上,添加时将这些位置为1;判断时如果这些位全部为1,则认为该值已存在(可能存在误判)。BitSet是JDK提供的位集合(位向量)类
二、 代码示例
package repart;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
/**
 * Small benchmark that de-duplicates one string array with four different
 * containers and prints, for each, the elapsed milliseconds and the number of
 * values that were kept.
 */
public class Repeat {

    /**
     * Builds an array of 100005 decimal strings ("0" .. "99999" followed by
     * "0" .. "4" again) and runs every de-duplication strategy over it.
     */
    public static void main(String[] args) {
        int total = 100005;
        int distinct = total - 5;
        String[] arrays = new String[total];
        for (int pos = 0; pos < total; pos++) {
            // The last five slots repeat the values 0..4.
            int number = pos < distinct ? pos : pos - distinct;
            arrays[pos] = String.valueOf(number);
        }
        repartList(arrays);
        repartSet(arrays);
        repartMap(arrays);
        repartBloomFilter(arrays);
    }

    /**
     * De-duplicates with an ArrayList: a value is appended only when
     * {@code contains} does not already find it.
     *
     * @param arrays the values to de-duplicate
     */
    public static void repartList(String[] arrays) {
        long begin = System.currentTimeMillis();
        List<String> kept = new ArrayList<String>();
        for (int pos = 0; pos < arrays.length; pos++) {
            String candidate = arrays[pos];
            if (kept.contains(candidate)) {
                continue;
            }
            kept.add(candidate);
        }
        long elapsed = System.currentTimeMillis() - begin;
        System.out.println("repartList cost=" + elapsed);
        System.out.println("size=" + kept.size());
    }

    /**
     * De-duplicates with a HashSet by adding every value to the set.
     *
     * @param arrays the values to de-duplicate
     */
    public static void repartSet(String[] arrays) {
        long begin = System.currentTimeMillis();
        Set<String> kept = new HashSet<String>();
        for (int pos = 0; pos < arrays.length; pos++) {
            kept.add(arrays[pos]);
        }
        long elapsed = System.currentTimeMillis() - begin;
        System.out.println("repartSet cost=" + elapsed);
        System.out.println("size=" + kept.size());
    }

    /**
     * De-duplicates with a HashMap by putting every value as a key with a
     * {@code null} value.
     *
     * @param arrays the values to de-duplicate
     */
    public static void repartMap(String[] arrays) {
        long begin = System.currentTimeMillis();
        Map<String, String> kept = new HashMap<String, String>();
        for (int pos = 0; pos < arrays.length; pos++) {
            kept.put(arrays[pos], null);
        }
        long elapsed = System.currentTimeMillis() - begin;
        System.out.println("repartMap cost=" + elapsed);
        System.out.println("size=" + kept.size());
    }

    /**
     * De-duplicates with a SimpleBloomFilter: a value is recorded in the
     * filter and appended to the result list only when the filter does not
     * already report it.
     *
     * @param arrays the values to de-duplicate
     */
    public static void repartBloomFilter(String[] arrays) {
        long begin = System.currentTimeMillis();
        List<String> kept = new ArrayList<String>();
        SimpleBloomFilter seen = new SimpleBloomFilter();
        for (int pos = 0; pos < arrays.length; pos++) {
            String candidate = arrays[pos];
            if (seen.contains(candidate)) {
                continue;
            }
            seen.add(candidate);
            kept.add(candidate);
        }
        long elapsed = System.currentTimeMillis() - begin;
        System.out.println("repartBloomFilter cost=" + elapsed);
        System.out.println("size=" + kept.size());
    }
}
package repart;
import java.util.BitSet;
/**
 * Minimal Bloom filter for strings backed by a {@link BitSet}.
 *
 * <p>Each value is hashed by one {@link SimpleHash} per seed; {@link #add}
 * sets the resulting bits and {@link #contains} checks that all of them are
 * set. A {@code true} answer can therefore be a false positive (different
 * values may map to the same bits); a {@code false} answer means the value
 * was never added. {@code null} is never a member.
 */
public class SimpleBloomFilter {
    /**
     * Number of bits addressed by the hash functions: {@code 2 << 24}, i.e. 2^25.
     * It is a power of two, which {@link SimpleHash#hash} relies on when it
     * masks with {@code cap - 1}.
     */
    private static final int DEFAULT_SIZE = 2 << 24;

    /** Multipliers for the hash functions, one function per seed. */
    private static final int[] SEEDS = { 5, 7, 11, 13, 31, 37, 61 };

    private final BitSet bits = new BitSet(DEFAULT_SIZE);
    private final SimpleHash[] func = new SimpleHash[SEEDS.length];

    /** Creates an empty filter with one hash function per seed. */
    public SimpleBloomFilter() {
        for (int i = 0; i < SEEDS.length; i++) {
            func[i] = new SimpleHash(DEFAULT_SIZE, SEEDS[i]);
        }
    }

    /**
     * Records {@code value} by setting the bit chosen by every hash function.
     *
     * @param value the value to record; {@code null} is ignored so that this
     *              method agrees with {@link #contains}, which reports
     *              {@code null} as absent
     */
    public void add(String value) {
        if (value == null) {
            return;
        }
        for (SimpleHash f : func) {
            bits.set(f.hash(value));
        }
    }

    /**
     * Checks whether {@code value} may have been added.
     *
     * @param value the value to test; may be {@code null}
     * @return {@code false} for {@code null} or when at least one of the
     *         value's bits is clear; {@code true} when all of its bits are set
     */
    public boolean contains(String value) {
        if (value == null) {
            return false;
        }
        for (SimpleHash f : func) {
            // One clear bit is enough to prove absence; stop at the first one.
            if (!bits.get(f.hash(value))) {
                return false;
            }
        }
        return true;
    }

    /**
     * Polynomial string hash ({@code result = seed * result + char}) reduced
     * to a bit index by masking with {@code cap - 1}.
     */
    public static class SimpleHash {
        private final int cap;
        private final int seed;

        /**
         * @param cap  size of the index space; the mask in {@link #hash} only
         *             covers {@code [0, cap)} evenly when this is a power of two
         * @param seed multiplier applied at every character
         */
        public SimpleHash(int cap, int seed) {
            this.cap = cap;
            this.seed = seed;
        }

        /**
         * Hashes {@code value} to an index.
         *
         * @param value the string to hash; must not be {@code null}
         * @return {@code (cap - 1) & h}, where {@code h} is the polynomial
         *         hash of the characters (int overflow wraps)
         */
        public int hash(String value) {
            int result = 0;
            int len = value.length();
            for (int i = 0; i < len; i++) {
                result = seed * result + value.charAt(i);
            }
            return (cap - 1) & result;
        }
    }

    /** Demo: prints the membership answer before and after adding one value. */
    public static void main(String[] args) {
        String value = "stone2083@yahoo.cn";
        SimpleBloomFilter filter = new SimpleBloomFilter();
        System.out.println(filter.contains(value));
        filter.add(value);
        System.out.println(filter.contains(value));
    }
}
三、 运行结果
repartList cost=24605
size=100000
repartSet cost=22
size=100000
repartMap cost=27
size=100000
repartBloomFilter cost=42
size=100000
四、 总结
1. 执行效率:HashMap和HashSet基本一致,BloomFilter稍微慢一点,ArrayList效率最低。
2. 准确率:HashMap、HashSet、ArrayList是100%去重,BloomFilter可能会有极小的误差(存在误判的可能)。
3. 内存使用:HashMap、HashSet、ArrayList内存占用都差不多,BloomFilter占用最少;随着数据量越来越大,为了避免内存溢出,BloomFilter去重则是最佳选择。
4. 其他去重手段
1) 将数据存入数据库,每次写入前先通过查询(find)判断数据是否已存在,以此去重;建议使用执行效率较高的非关系型数据库,如redis、mongodb
2) 当数据量较大且重复数据较多时,为了避免内存溢出,可以将数据拆分成几份分别去重,最后再合并起来做一次去重