A Small Compression Tool: the Huffman Algorithm
Compression principle diagram
Notes
- Lowercase letters represent original bytes.
- The mapping table records the correspondence between bytes and their new codes: byte a → code A, and so on.
- The uppercase sequence is the compressed file's sequence of codes.
Premises of compression
- A file is a sequence of bytes, e.g. the sequence {a, b, c, a, d, …} in the diagram above.
- Byte frequencies differ from file to file; in the example above, byte a occurs most often.
- Give high-frequency bytes short codes and low-frequency bytes long codes, shortening the file as a whole.
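For instance (hypothetical numbers), if a 1010-byte file contains 1000 occurrences of byte a and 10 of byte z, encoding a with 2 bits and z with 9 bits costs 2·1000 + 9·10 = 2090 bits, versus 8·1010 = 8080 bits at the fixed 8 bits per byte.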
Basic terminology
- A Huffman tree is also known as an optimal tree.
- Path and path length. In a tree, the route from a node down to any of its descendants is called a path; the number of branches along the route is the path length. If the root is defined to be on level 1, the path length from the root to a node on level L is L − 1.
- Node weight and weighted path length. If each node in the tree is assigned a number carrying some meaning, that number is called the node's weight. A node's weighted path length is the path length from the root to that node multiplied by the node's weight.
- Weighted path length of a tree. The weighted path length of a tree is defined as the sum of the weighted path lengths of all its leaf nodes, written WPL.
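For example, with leaf weights a:4, b:2, c:1, d:1 (hypothetical values reused below), an optimal tree places a at depth 1, b at depth 2, and c and d at depth 3, so WPL = 4·1 + 2·2 + 1·3 + 1·3 = 14; a balanced tree with all four leaves at depth 2 would give WPL = (4 + 2 + 1 + 1)·2 = 16.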
Steps to compress a file
- ① Read the file as a byte stream
- ② Count each byte's frequency of occurrence
- ③ Build the Huffman codes from the byte frequencies
- ④ Map the original byte stream through the code table into a new encoded stream
- ⑤ Save the code table and the new encoded stream, completing the compression
① Read the file as a byte stream
public class Compression {
    ...
    private FileInputStream inputStream;
    ...
}
...
inputStream = new FileInputStream(s);
byte[] allBytes = inputStream.readAllBytes();
...
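Note that InputStream.readAllBytes() requires Java 9 or later. A minimal self-contained sketch of this step (the file name input.txt is hypothetical), using the equivalent java.nio.file.Files helper:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Paths;

public class ReadDemo {
    public static void main(String[] args) throws IOException {
        // Read the whole file into memory as one byte array.
        byte[] allBytes = Files.readAllBytes(Paths.get("input.txt"));
        System.out.println("Read " + allBytes.length + " bytes");
    }
}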
② Count each byte's frequency of occurrence
public class Compression {
    ...
    HashMap<Byte, Integer> frequencyHashMap;
    ...
}
public void countFrequency(byte[] allBytes) throws IOException {
    long start = System.currentTimeMillis();
    for (byte b : allBytes) {
        if (frequencyHashMap.get(b) == null) {
            frequencyHashMap.put(b, 1);
        } else {
            frequencyHashMap.put(b, frequencyHashMap.get(b) + 1);
        }
    }
    long stop = System.currentTimeMillis();
    //System.out.println("Frequency counting finished");
    System.out.println("Frequency counting took: " + (stop - start) + " ms");
}
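Since Java 8, the null check can be collapsed into a single HashMap.merge call; a sketch of the same counting loop:

for (byte b : allBytes) {
    // insert 1 if b is absent, otherwise add 1 to its current count
    frequencyHashMap.merge(b, 1, Integer::sum);
}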
③ Build the Huffman codes from the byte frequencies
Notes:
- public Node createHuffmanTree(ArrayList<Node> l)
- Input: a list of nodes
- Output: the root node of the Huffman tree
- void getHuffmanCodes(Node node, String code, StringBuilder sb)
- Generates the Huffman codes recursively
- Appends 0 for a left child and 1 for a right child
- On reaching a leaf, the accumulated bit string is that byte's Huffman code, which is stored in the mapping tables
class Compression {
    ...
    HashMap<Byte, String> compressionmap;
    HashMap<String, Byte> decompressionmap;
    ...
}
/*
 Walks the Huffman tree from the given node, accumulating the code string,
 and loads the byte-to-code mappings into the compression and decompression tables.
*/
public void getHuffmanCodes(Node node, String code, StringBuilder sb) {
    StringBuilder sbthis = new StringBuilder(sb);
    sbthis.append(code);
    if (node != null) {
        if (node.key == null) { // internal node: recurse, appending 0/1
            getHuffmanCodes(node.left, "0", sbthis);
            getHuffmanCodes(node.right, "1", sbthis);
        } else { // leaf node: the accumulated string is this byte's code
            compressionmap.put(node.key, sbthis.toString());
            decompressionmap.put(sbthis.toString(), node.key);
        }
    }
}
/*
 Builds a Huffman tree from an ArrayList<Node>.
 @return the root Node of the Huffman tree
*/
public Node createHuffmanTree(ArrayList<Node> l) {
    while (l.size() > 1) {
        Collections.sort(l); // ascending by weight
        Node leftNode = l.get(0);  // the two nodes with the smallest weights
        Node rightNode = l.get(1);
        Node parent = new Node(null, leftNode.frency + rightNode.frency);
        parent.left = leftNode;
        parent.right = rightNode;
        l.remove(leftNode);
        l.remove(rightNode);
        l.add(parent);
    }
    return l.get(0);
}
class Node implements Comparable<Node> {
    int frency; // weight: the byte's frequency, or the subtree total
    Byte key;   // the byte value; null for internal nodes
    Node left;
    Node right;
    public Node(Byte key, int frency) {
        this.frency = frency;
        this.key = key;
    }
    @Override
    public int compareTo(Node n) {
        return Integer.compare(this.frency, n.frency); // overflow-safe comparison
    }
}
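A minimal usage sketch (a hypothetical demo method; it assumes it lives inside the Compression class so it can reach the maps, which the constructor is assumed to initialize, as the full source below does):

public void demo() {
    ArrayList<Node> l = new ArrayList<>(List.of(
            new Node((byte) 'a', 4), new Node((byte) 'b', 2),
            new Node((byte) 'c', 1), new Node((byte) 'd', 1)));
    getHuffmanCodes(createHuffmanTree(l), "", new StringBuilder());
    // Prints codes such as {97=0, 98=10, 99=110, 100=111}; the exact bit
    // patterns depend on how Collections.sort breaks ties, but the code
    // lengths (1, 2, 3, 3) match the WPL example above.
    System.out.println(compressionmap);
}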
④ Map the original byte stream through the code table into a new encoded stream
/*
 * Converts the original byte array into its Huffman bit string, then packs
 * the bits into a new, compact byte array. (lastCode is a field of the class;
 * see the full source below.)
 * */
public byte[] BytesToBytesByHuffmanCode(byte[] allBytes) {
    StringBuilder sb = new StringBuilder();
    for (byte b : allBytes) {
        sb.append(compressionmap.get(b));
    }
    int len;
    if (sb.length() % 8 == 0) {
        lastCode = 8; // the final byte is full; without this, decoding would drop it
        len = sb.length() / 8;
    } else {
        lastCode = sb.length() % 8; // number of valid bits in the final byte
        len = sb.length() / 8 + 1;
    }
    byte[] bys = new byte[len];
    int index = 0;
    for (int i = 0; i < sb.length(); i += 8) {
        String strByte;
        if (i + 8 > sb.length()) {
            strByte = sb.substring(i);
        } else {
            strByte = sb.substring(i, i + 8);
        }
        // parse the 8-bit (or shorter final) chunk as a binary number
        bys[index] = (byte) Integer.parseInt(strByte, 2);
        index++;
    }
    return bys;
}
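A worked example with hypothetical codes a→0, b→10, c→11: the input abca becomes the bit string 0 + 10 + 11 + 0 = 010110 (6 bits), so len = 1 and lastCode = 6, and Integer.parseInt("010110", 2) stores the single byte 22. During decompression, 22 | 256 is 100010110 in binary, and its last lastCode = 6 characters recover exactly 010110, leading zeros included.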
⑤ Save the code table and the new encoded stream, completing the compression
public void outPutCompressFile(byte[] compressBytes) throws IOException {
    FileOutputStream fos = new FileOutputStream(fileAfterCompression.location);
    ObjectOutputStream foos = new ObjectOutputStream(fos);
    foos.writeObject(compressBytes);
    foos.writeObject(compressionmap);
    foos.close(); // close the object stream first so it can flush its buffer
    fos.close();
}
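The close order matters: the ObjectOutputStream must be closed (which flushes its buffer) before the underlying FileOutputStream. A try-with-resources sketch that gets this right automatically:

try (FileOutputStream fos = new FileOutputStream(fileAfterCompression.location);
     ObjectOutputStream foos = new ObjectOutputStream(fos)) {
    foos.writeObject(compressBytes);
    foos.writeObject(compressionmap);
} // resources close in reverse order of declaration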
⑥ Test
The test driver is the main method at the end of the full source below.
Full source
package EX.compression;

import java.io.*;
import java.util.*;

public class CInOut {
    private File file;
    private int lastCode; // number of valid bits in the final byte of the packed stream
    private FileInputStream inputStream;
    private FileStruct fileBeforeCompression;
    private FileStruct fileAfterCompression;
    HashMap<Byte, Integer> frequencyHashMap;
    HashMap<Byte, String> compressionmap;
    HashMap<String, Byte> decompressionmap;

    public void startCompress() throws IOException {
        byte[] allBytes = inputStream.readAllBytes();
        countFrequency(allBytes);
        initHuffmanCode();
        byte[] compressBytes = BytesToBytesByHuffmanCode(allBytes);
        outPutCompressFile(compressBytes);
    }
    public void startDecompress() throws IOException, ClassNotFoundException {
        InputStream is = new FileInputStream(fileAfterCompression.location);
        ObjectInputStream ois = new ObjectInputStream(is);
        byte[] allBytes = (byte[]) ois.readObject();
        Map<Byte, String> hMap = (Map<Byte, String>) ois.readObject();
        // Note: lastCode is only kept in memory, so decompression works within
        // the same run as compression; persisting lastCode in the archive would
        // make the compressed file self-contained.
        byte[] deBytes = decodeBytes(allBytes, hMap);
        OutputStream os = new FileOutputStream(fileAfterCompression.location + ".txt");
        os.write(deBytes);
        os.close();
        ois.close(); // close the object stream before its underlying stream
        is.close();
    }
    public byte[] decodeBytes(byte[] allBytes, Map<Byte, String> hMap) {
        // Invert the byte -> code table read from the archive into code -> byte.
        Map<String, Byte> codeToByte = new HashMap<>();
        hMap.forEach((k, v) -> codeToByte.put(v, k));
        // Rebuild the bit string from the packed bytes.
        StringBuilder sb = new StringBuilder();
        for (int i = 0; i < allBytes.length; i++) {
            byte b = allBytes[i];
            boolean last = (i == allBytes.length - 1); // is this the final byte?
            int temp = b | 256; // guarantees the binary string has at least 9 chars
            String str = Integer.toBinaryString(temp);
            if (!last) {
                sb.append(str.substring(str.length() - 8));
            } else {
                sb.append(str.substring(str.length() - lastCode)); // valid bits only
            }
        }
        ArrayList<Byte> list = new ArrayList<>();
        // The shortest code length is the starting point for matching.
        int shortest = Integer.MAX_VALUE;
        for (var item : codeToByte.entrySet()) {
            if (shortest > item.getKey().length()) shortest = item.getKey().length();
        }
        // Greedy scan: extend the candidate one bit at a time until it matches a
        // code. This is safe because Huffman codes are prefix-free: no code is a
        // prefix of another, so the first match starting at i is the only one.
        for (int i = 0; i < sb.length(); ) {
            int count = shortest;
            Byte b = null;
            while (b == null) {
                String s = sb.substring(i, i + count);
                b = codeToByte.get(s);
                if (b == null) {
                    count++;
                }
            }
            list.add(b);
            i += count;
        }
        byte[] bytes1 = new byte[list.size()];
        for (int i = 0; i < list.size(); i++) {
            bytes1[i] = list.get(i);
        }
        return bytes1;
    }
    public void outPutCompressFile(byte[] compressBytes) throws IOException {
        FileOutputStream fos = new FileOutputStream(fileAfterCompression.location);
        ObjectOutputStream foos = new ObjectOutputStream(fos);
        foos.writeObject(compressBytes);
        foos.writeObject(compressionmap);
        foos.close(); // close the object stream first so it can flush its buffer
        fos.close();
        fileAfterCompression.filesize = (new File(fileAfterCompression.location)).length();
    }
    public void setInputFile(String s) throws FileNotFoundException {
        file = new File(s);
        fileBeforeCompression.location = s;
        fileBeforeCompression.parentFile = file.getParentFile();
        fileBeforeCompression.filename = file.getName();
        fileBeforeCompression.filesize = file.length();
        fileAfterCompression.parentFile = fileBeforeCompression.parentFile;
        fileAfterCompression.filename = fileBeforeCompression.filename + ".zip";
        fileAfterCompression.location = s + ".zip";
        inputStream = new FileInputStream(s);
    }

    public CInOut() throws FileNotFoundException {
        fileBeforeCompression = new FileStruct();
        fileAfterCompression = new FileStruct();
        frequencyHashMap = new HashMap<>();
        compressionmap = new HashMap<>();
        decompressionmap = new HashMap<>();
        inputStream = null;
    }
    public void countFrequency(byte[] allBytes) throws IOException {
        long start = System.currentTimeMillis();
        for (byte b : allBytes) {
            if (frequencyHashMap.get(b) == null) {
                frequencyHashMap.put(b, 1);
            } else {
                frequencyHashMap.put(b, frequencyHashMap.get(b) + 1);
            }
        }
        long stop = System.currentTimeMillis();
        //System.out.println("Frequency counting finished");
        System.out.println("Frequency counting took: " + (stop - start) + " ms");
    }
    /*
     * Converts the original byte array into its Huffman bit string, then packs
     * the bits into a new, compact byte array.
     * */
    public byte[] BytesToBytesByHuffmanCode(byte[] allBytes) {
        StringBuilder sb = new StringBuilder();
        for (byte b : allBytes) {
            sb.append(compressionmap.get(b));
        }
        int len;
        if (sb.length() % 8 == 0) {
            lastCode = 8; // the final byte is full; without this, decoding would drop it
            len = sb.length() / 8;
        } else {
            lastCode = sb.length() % 8; // number of valid bits in the final byte
            len = sb.length() / 8 + 1;
        }
        byte[] bys = new byte[len];
        int index = 0;
        for (int i = 0; i < sb.length(); i += 8) {
            String strByte;
            if (i + 8 > sb.length()) {
                strByte = sb.substring(i);
            } else {
                strByte = sb.substring(i, i + 8);
            }
            // parse the 8-bit (or shorter final) chunk as a binary number
            bys[index] = (byte) Integer.parseInt(strByte, 2);
            index++;
        }
        return bys;
    }
    // Converts the <byte, frequency> map into an ArrayList of nodes, then derives
    // each byte's Huffman code and records it in the code tables.
    public void initHuffmanCode() {
        ArrayList<Node> l = new ArrayList<>();
        frequencyHashMap.forEach((key, value) -> l.add(new Node(key, value)));
        getHuffmanCodes(createHuffmanTree(l), "", new StringBuilder());
    }
    /*
     Walks the Huffman tree from the given node, accumulating the code string,
     and loads the byte-to-code mappings into the compression and decompression tables.
    */
    public void getHuffmanCodes(Node node, String code, StringBuilder sb) {
        StringBuilder sbthis = new StringBuilder(sb);
        sbthis.append(code);
        if (node != null) {
            if (node.key == null) { // internal node: recurse, appending 0/1
                getHuffmanCodes(node.left, "0", sbthis);
                getHuffmanCodes(node.right, "1", sbthis);
            } else { // leaf node: the accumulated string is this byte's code
                compressionmap.put(node.key, sbthis.toString());
                decompressionmap.put(sbthis.toString(), node.key);
            }
        }
    }
    /*
     Builds a Huffman tree from an ArrayList<Node>.
     @return the root Node of the Huffman tree
    */
    public Node createHuffmanTree(ArrayList<Node> l) {
        while (l.size() > 1) {
            Collections.sort(l); // ascending by weight
            Node leftNode = l.get(0);  // the two nodes with the smallest weights
            Node rightNode = l.get(1);
            Node parent = new Node(null, leftNode.frency + rightNode.frency);
            parent.left = leftNode;
            parent.right = rightNode;
            l.remove(leftNode);
            l.remove(rightNode);
            l.add(parent);
        }
        return l.get(0);
    }
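    // A possible refinement (a sketch, not part of the original tool): a
    // PriorityQueue keeps the smallest-weight nodes at the head, avoiding a
    // full re-sort of the list on every merge.
    public Node createHuffmanTreePQ(Collection<Node> nodes) {
        PriorityQueue<Node> pq = new PriorityQueue<>(nodes);
        while (pq.size() > 1) {
            Node left = pq.poll();  // the two nodes with the smallest weights
            Node right = pq.poll();
            Node parent = new Node(null, left.frency + right.frency);
            parent.left = left;
            parent.right = right;
            pq.add(parent);
        }
        return pq.poll();
    }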
    public static void main(String[] s) throws IOException, ClassNotFoundException {
        CInOut cin = new CInOut();
        cin.setInputFile("src/EX/compression/testlist/斗破苍穹.txt");
        //cin.setInputFile("src/EX/compression/testlist/第一章陨落的天才.txt");
        var start = System.currentTimeMillis();
        cin.startCompress();
        var temptime = System.currentTimeMillis();
        cin.startDecompress();
        var stop = System.currentTimeMillis();
        System.out.println("Done!");
        System.out.println("Compression took: " + (temptime - start) + " ms");
        System.out.println("Decompression took: " + (stop - temptime) + " ms");
        System.out.println("Total time: " + (stop - start) + " ms");
        System.out.println("Before compression: " + cin.fileBeforeCompression.filesize + " bytes");
        System.out.println("After compression: " + cin.fileAfterCompression.filesize + " bytes");
        System.out.println("Compression ratio: " + ((double) cin.fileAfterCompression.filesize / (double) cin.fileBeforeCompression.filesize));
    }
}
class FileStruct {
    String filename;
    File parentFile;
    String location;
    long filesize; // in bytes
    //String fileformat;
    FileStruct() {
    }
}
class Node implements Comparable<Node> {
    int frency; // weight: the byte's frequency, or the subtree total
    Byte key;   // the byte value; null for internal nodes
    Node left;
    Node right;
    public Node(Byte key, int frency) {
        this.frency = frency;
        this.key = key;
    }
    @Override
    public int compareTo(Node n) {
        return Integer.compare(this.frency, n.frency); // overflow-safe comparison
    }
}