用pdfbox的jar包来解析pdf:

import java.io.ByteArrayOutputStream; 

import java.io.File; 

import java.io.OutputStreamWriter; 


import org.pdfbox.pdmodel.PDDocument; 

import org.pdfbox.util.PDFTextStripper; 


/**
 * Demo that extracts the plain text of a PDF file with PDFBox.
 */
public class Pdf2text {

    /**
     * Extracts the text of the given PDF file.
     *
     * <p>Extraction is best-effort: if anything goes wrong while loading or
     * stripping the document, the stack trace is printed and an empty string
     * is returned. The document is always closed, even on failure.
     *
     * @param f the PDF file to read
     * @return the extracted text, or an empty string when extraction failed
     * @throws Exception declared for backward compatibility with existing
     *         callers; exceptions raised during extraction are reported via
     *         stack trace rather than propagated
     */
    public static String getTxt(File f) throws Exception {
        String ts = "";
        PDDocument pdfdocument = null;
        try {
            pdfdocument = PDDocument.load(f);

            ByteArrayOutputStream out = new ByteArrayOutputStream();
            // Use one explicit charset for both encoding and decoding so the
            // result does not depend on the platform default.
            OutputStreamWriter writer = new OutputStreamWriter(out, "UTF-8");
            PDFTextStripper stripper = new PDFTextStripper();
            stripper.writeText(pdfdocument.getDocument(), writer);

            // Closing the writer flushes any buffered characters into 'out'
            // before the bytes are read back.
            writer.close();

            byte[] contents = out.toByteArray();
            ts = new String(contents, "UTF-8");
            System.out.println(f.getName() + "length is:" + contents.length + "\n");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Release the document even when loading/stripping failed. No
            // 'return' here: returning from finally would discard Errors too.
            if (pdfdocument != null) {
                try {
                    pdfdocument.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
        return ts;
    }

    /**
     * Prints the text of a hard-coded sample PDF to standard output.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:/600536_2008_zzy.pdf");
        try {
            System.out.println(Pdf2text.getTxt(file));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}

====================== 


word,excel和ppt都用POI的jar包来解析: 


 import java.io.File; 


 import org.apache.poi.POITextExtractor; 

 import org.apache.poi.extractor.ExtractorFactory; 


 /**
  * Demo that prints the text of an Office document using POI's
  * ExtractorFactory.
  */
 public class DocxParser {

     /**
      * Extracts the text of a hard-coded sample file and prints it between
      * two separator lines. Any failure is reported as a stack trace.
      *
      * @param args unused
      */
     public static void main(String[] args) {
         final String rule = "====================";
         try {
             // The original author also listed .pptx, .xlsx, .xls, .doc and
             // .ppt variants of this sample path as alternative inputs.
             File source = new File("D:\\test.docx");
             POITextExtractor textExtractor = ExtractorFactory.createExtractor(source);

             System.out.println("Document Text: ");
             System.out.println(rule);
             System.out.println(textExtractor.getText());
             System.out.println(rule);
         } catch (Exception failure) {
             failure.printStackTrace();
         }
     }
 }


import java.io.File; 

import java.io.FileInputStream; 

import java.io.FileNotFoundException; 

import java.io.IOException; 


import org.apache.poi.hwpf.extractor.WordExtractor; 


/**
 * Demo that prints the text of a legacy .doc file using POI's WordExtractor.
 */
public class Word2text {

    /**
     * Opens a hard-coded sample document, prints its text, and always closes
     * the underlying file stream afterwards.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        File file = new File("E:\\2009.doc");
        FileInputStream fis = null;
        try {
            fis = new FileInputStream(file);
            WordExtractor wordExtractor = new WordExtractor(fis);
            System.out.println("【 使用getText()方法提取的Word文件的内容如下所示:】");
            System.out.println(wordExtractor.getText());
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously leaked; release it on every path.
            if (fis != null) {
                try {
                    fis.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}


import java.io.File; 

import java.io.FileInputStream; 

import java.io.FileNotFoundException; 

import java.io.InputStream; 


import org.apache.poi.hslf.HSLFSlideShow; 

import org.apache.poi.hslf.model.TextRun; 

import org.apache.poi.hslf.model.Slide; 

import org.apache.poi.hslf.usermodel.SlideShow; 


/**
 * Demo that prints the text of a legacy .ppt file using POI HSLF.
 */
public class Ppt2text {

    /**
     * Opens a hard-coded sample presentation, prints its text, and closes the
     * file stream afterwards.
     *
     * @param args unused
     * @throws FileNotFoundException if the sample file cannot be opened
     */
    public static void main(String[] args) throws FileNotFoundException {
        File file = new File("E:\\1025681983.ppt");
        InputStream fis = new FileInputStream(file);
        try {
            getDocument(fis);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // The stream was previously leaked; release it on every path.
            try {
                fis.close();
            } catch (java.io.IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Reads a presentation from the stream and prints the text of every
     * slide (all text runs followed by the slide title, when there is one)
     * to standard output. Failures while reading are printed, not rethrown.
     * The stream is not closed here; that is left to the caller.
     *
     * @param is stream positioned at the start of a .ppt file
     * @throws Exception declared for backward compatibility with callers
     */
    public static void getDocument(InputStream is) throws Exception {
        StringBuilder content = new StringBuilder();
        try {
            // Build a SlideShow from the file's input stream.
            SlideShow ss = new SlideShow(new HSLFSlideShow(is));
            Slide[] slides = ss.getSlides();
            for (int i = 0; i < slides.length; i++) {
                // Text runs carry the textual content of the slide.
                TextRun[] runs = slides[i].getTextRuns();
                for (int j = 0; j < runs.length; j++) {
                    content.append(runs[j].getText());
                }
                // Skip a missing title; appending null would emit the
                // literal text "null" into the output.
                String title = slides[i].getTitle();
                if (title != null) {
                    content.append(title);
                }
            }
            System.out.println(content.toString());
        } catch (Exception ex) {
            System.out.println(ex.toString());
        }
    }
}

============= 


对excel的解析也可以用jxl的jar包来解析: 


import java.io.File; 


import jxl.Cell; 

import jxl.CellType; 

import jxl.DateCell; 

import jxl.NumberCell; 

import jxl.Sheet; 

import jxl.Workbook; 



/**
 * Demo that dumps the first columns of the first sheet of an .xls file as
 * tab-separated text using JExcelApi (jxl).
 */
public class Excel2text {

    /**
     * Prints up to three columns of every row of the first sheet of a
     * hard-coded sample workbook. Failures are reported as a stack trace and
     * the workbook is closed on every path.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        Workbook workbook = null;
        try {
            try {
                workbook = Workbook.getWorkbook(new File("e:\\Dealerlist_3.xls"));
            } catch (Exception e) {
                // Keep the original message but preserve the root cause.
                throw new Exception("file to import not found!", e);
            }

            Sheet sheet = workbook.getSheet(0);

            // The demo prints at most three columns; never ask for a column
            // beyond what the sheet reports.
            int columnCount = Math.min(3, sheet.getColumns());
            int rowCount = sheet.getRows();
            for (int i = 0; i < rowCount; i++) {
                for (int j = 0; j < columnCount; j++) {
                    // Note the argument order: column first, then row.
                    Cell cell = sheet.getCell(j, i);
                    // Handle each cell type separately, otherwise formatted
                    // content may come out wrong.
                    if (cell.getType() == CellType.NUMBER) {
                        System.out.print(((NumberCell) cell).getValue());
                    } else if (cell.getType() == CellType.DATE) {
                        System.out.print(((DateCell) cell).getDate());
                    } else {
                        System.out.print(cell.getContents());
                    }
                    System.out.print("\t");
                }
                System.out.print("\n");
            }
        } catch (Exception e) {
            // Previously swallowed silently, which hid every failure.
            e.printStackTrace();
        } finally {
            // Close the workbook, otherwise memory is leaked.
            if (workbook != null) {
                workbook.close();
            }
        }
    }
}


import java.io.*; 

import jxl.*; 

import jxl.write.*; 

import jxl.format.*; 


/**
 * Demo that writes a small formatted .xls file using JExcelApi (jxl).
 */
public class Text2Excel {

    /**
     * Creates a workbook at a hard-coded path with a header, a title row and
     * two detail rows, then writes it to disk. Failures are reported as a
     * stack trace and the workbook is closed on every path.
     *
     * @param args unused
     */
    public static void main(String args[]) {
        WritableWorkbook workbook = null;
        try {
            String outputPath = "e:" + java.io.File.separator + "output00.xls";
            File tempFile = new File(outputPath);
            System.out.println(outputPath);

            workbook = Workbook.createWorkbook(tempFile);
            WritableSheet sheet = workbook.createSheet("TestCreateExcel", 0);

            // Predefined fonts and formats; it is best not to use too many
            // different formats within one Excel file.
            WritableFont headerFont = new WritableFont(WritableFont.ARIAL, 12,
                    WritableFont.BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.BLUE);
            WritableCellFormat headerFormat = new WritableCellFormat(headerFont);

            WritableFont titleFont = new WritableFont(WritableFont.ARIAL, 10,
                    WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.RED);
            WritableCellFormat titleFormat = new WritableCellFormat(titleFont);

            WritableFont detFont = new WritableFont(WritableFont.ARIAL, 10,
                    WritableFont.NO_BOLD, false, UnderlineStyle.NO_UNDERLINE,
                    jxl.format.Colour.BLACK);
            WritableCellFormat detFormat = new WritableCellFormat(detFont);

            NumberFormat nf = new NumberFormat("0.00000"); // format for numbers
            WritableCellFormat priceFormat = new WritableCellFormat(detFont, nf);

            DateFormat df = new DateFormat("yyyy-MM-dd"); // format for dates
            WritableCellFormat dateFormat = new WritableCellFormat(detFont, df);

            // Build cells from the content and formats above and add them to
            // the sheet.
            sheet.addCell(new Label(0, 0, "用于测试的Excel文件", headerFormat));

            // Title row.
            int column = 0;
            sheet.addCell(new Label(column++, 2, "标题", titleFormat));
            sheet.addCell(new Label(column++, 2, "日期", titleFormat));
            sheet.addCell(new Label(column++, 2, "货币", titleFormat));
            sheet.addCell(new Label(column++, 2, "价格", titleFormat));

            // Detail rows: one per currency/price pair, starting at row 3.
            String[] currencies = { "CNY", "SGD" };
            double[] prices = { 5.678, 98832 };
            for (int i = 0; i < currencies.length; i++) {
                int row = i + 3;
                column = 0;
                sheet.addCell(new Label(column++, row, "标题 " + i, detFormat));
                sheet.addCell(new DateTime(column++, row, new java.util.Date(), dateFormat));
                sheet.addCell(new Label(column++, row, currencies[i], detFormat));
                sheet.addCell(new jxl.write.Number(column++, row, prices[i], priceFormat));
            }

            // Column widths.
            column = 0;
            sheet.setColumnView(column++, 20);
            sheet.setColumnView(column++, 20);
            sheet.setColumnView(column++, 10);
            sheet.setColumnView(column++, 20);

            workbook.write();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Close on every path so the output file is released even when
            // building or writing the workbook failed.
            if (workbook != null) {
                try {
                    workbook.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }
}