最近实习工作要求将一些txt文本从UTF-16(小端模式,即UTF-16LE)编码转为ISO-8859系列之类的编码,我能想到的途径有三种:
1)使用IBM的ICU工具包;
2)使用Java的转换方法;
3)自己通过Map表编写程序;
为了节省时间,我最终选择了方法2),但由于许久没用过Java,而且一开始并不了解Java的编码方式,纠结了好久才搞定,下面就简单地阐述一下具体的方法。
在Java中实现编码转换主要用到了两个函数:
- getBytes(charsetName): 根据charsetName指定的编码,返回该字符串在此编码下的byte数组表示(即编码,encode)
- new String(byte[] b, charsetName):使用charsetName指定的编码将byte[]解码(decode)成字符串,得到的字符串在Java内部统一以UTF-16表示
始终要注意Java的String在内部统一使用UTF-16表示(与平台默认编码Charset.defaultCharset()无关),所以new String得到的字符串也是UTF-16的,转码的关键在于解码和编码时分别指定正确的编码名称。
读写文本文件采用的是InputStream和OutputStream的方式,目的是直接对字节流进行读写,这样就可以避免在读写时改变了编码格式,示例代码如下:
1 import java.io.*;
2 import java.nio.charset.Charset;
3 import java.util.Iterator;
4 import java.util.Set;
5 import java.io.UnsupportedEncodingException;
6
7 public class Convertor {
8 public static final String Old_Charset = "Unicode";
9 public static final String ISO_8859_6 = "ISO-8859-6";
10 public static final String ISO_8859_7 = "ISO-8859-7";
11 public static final String ISO_8859_8 = "ISO-8859-8";
12 public static final String TIS_620 = "TIS-620";
13
14 public static void PrintAllSupportedCharset()
15 {
16 Set<String> charsetNames = Charset.availableCharsets().keySet();
17 System.out.println("-----the number of jdk1.67's charset is "+charsetNames.size()+"-----");
18 for (Iterator<String> it = charsetNames.iterator(); it.hasNext();)
19 {
20 String charsetName = (String) it.next();
21 System.out.println(charsetName);
22 }
23 }
24
25 public static void PrintBytes(byte[] b)
26 {
27 for (int i = 0; i < b.length; i++) {
28 String hex = Integer.toHexString(b[i] & 0xFF);
29 if (hex.length() == 1) {
30 hex = '0' + hex;
31 }
32 System.out.print(hex.toUpperCase() + " ");
33 }
34 System.out.println();
35 }
36
37 public static void ConvertCharset(String filePath, String fileName, String destDir, String oldCharset, String newCharset)
38 {
39 try
40 {
41 System.out.println(filePath);
42 InputStream in = new FileInputStream(filePath);
43
44 String srcStr = "";
45 if(in != null)
46 {
47 int byteNum = in.available();
48 byte[] b = new byte[byteNum];
49 in.read(b);//以字节流方式读入源文件
50 in.close();
51 PrintBytes(b);
52 //Convert
53 srcStr += new String(b, oldCharset); //生成的新的字符串是Unicode16(BE)编码的
54 byte[] nnb = srcStr.getBytes(newCharset);//获取Unicode16编码的字符串对应newCharset的字节数组,从而实现转码
55 PrintBytes(nnb);
56 String destFilePath = destDir + fileName;
57 OutputStream out = new FileOutputStream(destFilePath);
58 out.write(nnb);//以字节流方式输出
59 out.flush();
60 out.close();
61 }
62 }catch(Exception e)
63 {
64 e.printStackTrace();
65 }
66 }
67
68 public static boolean readfile(String filepath, String destDir, String newCharset) throws FileNotFoundException, IOException
69 {
70 try {
71
72 File file = new File(filepath);
73 if (!file.isDirectory()) {
74 System.out.println("文件");
75 System.out.println("path=" + file.getPath());
76 System.out.println("absolutepath=" + file.getAbsolutePath());
77 System.out.println("name=" + file.getName());
78 ConvertCharset(file.getAbsolutePath(), file.getName(), destDir, Convertor.Old_Charset, newCharset);
79
80
81 } else if (file.isDirectory()) {
82 System.out.println("文件夹");
83 String[] filelist = file.list();
84 for (int i = 0; i < filelist.length; i++) {
85 File readfile = new File(filepath + "\\" + filelist[i]);
86 if (!readfile.isDirectory()) {
87 System.out.println("path=" + readfile.getPath());
88 System.out.println("absolutepath="
89 + readfile.getAbsolutePath());
90 System.out.println("name=" + readfile.getName());
91 ConvertCharset(readfile.getAbsolutePath(), readfile.getName(), destDir, Convertor.Old_Charset, newCharset);
92
93 } else if (readfile.isDirectory()) {
94 readfile(filepath + "\\" + filelist[i], destDir, newCharset);
95 }
96 }
97 }
98
99 } catch (FileNotFoundException e) {
100 System.out.println("readfile() Exception:" + e.getMessage());
101 }
102 return true;
103 }
104
105
106
107 public static void main(String args[]) throws FileNotFoundException, IOException
108 {
109 //PrintAllSupportedCharset();
110 //System.out.println(Charset.defaultCharset());
111 String srcPath = "H:\\LYDATA\\四国语言\\Unicode16\\Thai_Unicode\\";
112 String destDir = "H:\\LYDATA\\四国语言\\Thai_TIS-620\\";
113 readfile(srcPath, destDir, Convertor.TIS_620);
114 }
115 }
几种编码方式对应的Unicode映射表:http://www.lingua-systems.com/knowledge/unicode-mappings/iso-8859-7-to-unicode.html