背景
因项目需求,项目中需要提供pdf压缩功能。将某一页压缩至1M大小。
常见的Java的pdf处理方案有itext、pdfbox以及aspose。
方案一:itext压缩(不推荐)
代码
/**
 * Shrinks a PDF by re-encoding every embedded image stream as an (optionally down-scaled) JPEG.
 *
 * <p>Only streams whose {@code /Subtype} is {@code /Image} are modified; every other object is
 * written back as read, so a document without images will not get smaller.
 *
 * @param src    path of the source PDF
 * @param dest   path the rewritten PDF is written to
 * @param factor scale applied to both image dimensions; if scaling would make either dimension
 *               zero, the image keeps its original size and is only re-encoded
 * @throws PdfCompressException if reading, re-encoding or writing the document fails
 */
public static void compressPdf(String src, String dest, float factor)
    throws PdfCompressException {
    log.info("use radio {} compress file:{}>>>{}", factor, src, dest);
    PdfReader reader = null;
    PdfStamper stamper = null;
    FileOutputStream out = null;
    try {
        reader = new PdfReader(src);
        int n = reader.getXrefSize();
        // Walk the cross-reference table looking for image streams to rewrite in place.
        for (int i = 0; i < n; i++) {
            PdfObject object = reader.getPdfObject(i);
            if (object == null || !object.isStream()) {
                continue;
            }
            PRStream stream = (PRStream) object;
            PdfObject subtype = stream.get(PdfName.SUBTYPE);
            if (subtype == null || !subtype.toString().equals(PdfName.IMAGE.toString())) {
                continue;
            }
            BufferedImage bi = new PdfImageObject(stream).getBufferedImage();
            if (bi == null) {
                continue;
            }
            int width = bi.getWidth();
            int height = bi.getHeight();
            AffineTransform at = AffineTransform.getScaleInstance(1, 1);
            int scaledWidth = (int) (width * factor);
            int scaledHeight = (int) (height * factor);
            // Never scale down to an empty image; fall back to the original dimensions.
            if (scaledWidth > 0 && scaledHeight > 0) {
                width = scaledWidth;
                height = scaledHeight;
                at = AffineTransform.getScaleInstance(factor, factor);
            }
            BufferedImage img = new BufferedImage(width, height, BufferedImage.TYPE_INT_RGB);
            Graphics2D g = img.createGraphics();
            try {
                g.drawRenderedImage(bi, at);
            } finally {
                // Graphics contexts hold native resources and must be released explicitly.
                g.dispose();
            }
            ByteArrayOutputStream imgBytes = new ByteArrayOutputStream();
            ImageIO.write(img, "JPG", imgBytes);
            // Replace the stream content and describe it as an 8-bit RGB JPEG.
            stream.clear();
            stream.setData(imgBytes.toByteArray(), false, PRStream.BEST_COMPRESSION);
            stream.put(PdfName.TYPE, PdfName.XOBJECT);
            stream.put(PdfName.SUBTYPE, PdfName.IMAGE);
            stream.put(PdfName.FILTER, PdfName.DCTDECODE);
            stream.put(PdfName.WIDTH, new PdfNumber(width));
            stream.put(PdfName.HEIGHT, new PdfNumber(height));
            stream.put(PdfName.BITSPERCOMPONENT, new PdfNumber(8));
            stream.put(PdfName.COLORSPACE, PdfName.DEVICERGB);
        }
        out = new FileOutputStream(dest);
        stamper = new PdfStamper(reader, out);
        // Closing the stamper is what writes the document, so a failure here must reach the
        // caller instead of being swallowed during cleanup.
        stamper.close();
        stamper = null;
    } catch (Exception e) {
        log.error("pdf compress error:{}>>>{}", src, dest);
        log.error("pdf compress error:", e);
        throw new PdfCompressException(e.getMessage());
    } finally {
        if (stamper != null) {
            try {
                stamper.close();
            } catch (Exception e) {
                log.error("stamper close failed when compress pdf:", e);
            }
        }
        if (out != null) {
            try {
                // Harmless if the stamper already closed the stream.
                out.close();
            } catch (IOException e) {
                log.error("output close failed when compress pdf:", e);
            }
        }
        if (reader != null) {
            reader.close();
        }
    }
}
方案描述
提供一个压缩方法,先尝试 1倍压缩判断是否小于1M,然后0.9,0.8...0.1,直至factor<=0.1或者压缩后文件小于等于1M才停止压缩。
方案问题
该方案相当于只针对pdf当中的图片进行压缩,之前遇见一个10M的非图片pdf,就是一个表格,但是无论如何都压不下来。
后面采用wps和aspose以及在线的pdf压缩工具同样处理不了。
压缩效率低,内存消耗巨大。
方案二:pdfbox方案(强烈不推荐)
代码
/**
 * Renders one image file as a single-page A4 PDF.
 *
 * <p>Images wider than they are tall are rotated by 90 degrees first. The image is then scaled
 * by the percentage returned from {@code getPercent} and centred on the page.
 *
 * @param inputFile path of the image to convert
 * @param pdfFile   path of the PDF to create
 * @throws Image2PdfException if the image cannot be read or the PDF cannot be written
 */
public static void image2Pdf(String inputFile, String pdfFile) throws Image2PdfException {
    log.info("convert image 2 pdf :{}>>>{}", inputFile, pdfFile);
    Document doc = null;
    PdfWriter pdfWriter = null;
    FileOutputStream pdfOut = null;
    try {
        File file = new File(inputFile);
        String fileName = file.getName();
        // Locale.ROOT keeps extension handling independent of the JVM's default locale.
        String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
        doc = new Document(PageSize.A4, 20, 20, 20, 20);
        pdfOut = new FileOutputStream(pdfFile);
        pdfWriter = PdfWriter.getInstance(doc, pdfOut);
        doc.open();
        doc.newPage();
        Image image;
        if (lowerName.endsWith("jpg") || lowerName.endsWith("jpeg")) {
            // JPEG files are loaded through AWT rather than through iText's own reader.
            java.awt.Image awtImage = Toolkit.getDefaultToolkit().createImage(file.getAbsolutePath());
            image = Image.getInstance(awtImage, null);
        } else {
            image = Image.getInstance(file.getAbsolutePath());
        }
        float height = image.getHeight();
        float width = image.getWidth();
        if (width > height) {
            // Landscape image: rotate it so it is laid out along the portrait page.
            BufferedImage rotated;
            try (FileInputStream fi = new FileInputStream(file)) {
                rotated = RotateImage.rotate(ImageIO.read(fi), 90);
            }
            String type = fileName.substring(fileName.lastIndexOf(".") + 1)
                .toLowerCase(java.util.Locale.ROOT);
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            ImageIO.write(rotated, type, outStream);
            image = Image.getInstance(outStream.toByteArray());
            height = image.getHeight();
            width = image.getWidth();
        }
        int percent = getPercent(height, width);
        image.setAlignment(Image.MIDDLE);
        image.scalePercent(percent);
        float x = (PageSize.A4.getWidth() - image.getScaledWidth()) / 2;
        float y = (PageSize.A4.getHeight() - image.getScaledHeight()) / 2;
        image.setAbsolutePosition(x, y);
        doc.add(image);
    } catch (Exception e) {
        log.error("image 2 pdf failed:{}>>>{}", inputFile, pdfFile);
        log.error("exception info:", e);
        throw new Image2PdfException(e.getMessage());
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                // Closing a document that never received content throws; that is expected here.
                log.info("空文档:", e);
            }
        }
        if (pdfWriter != null) {
            pdfWriter.close();
        }
        if (pdfOut != null) {
            try {
                // Covers the case where the writer was never created; closing twice is harmless.
                pdfOut.close();
            } catch (IOException e) {
                log.error("pdf output close failed:", e);
            }
        }
    }
}
/**
 * Computes the proportional scale (in percent) that fits an image onto an A4 page while
 * leaving 120 units free along the fitted dimension.
 *
 * @param height image height
 * @param weight image width (the parameter name is historical)
 * @return scale percentage, rounded to the nearest integer
 */
private static int getPercent(float height, float weight) {
    // Taller-than-wide images are fitted to the page height, all others to the page width.
    if (height > weight) {
        return Math.round((PageSize.A4.getHeight() - 120) / height * 100);
    }
    return Math.round((PageSize.A4.getWidth() - 120) / weight * 100);
}
/**
 * Converts a PDF into an image-only PDF: every page is rendered to an image at the given dpi,
 * each image becomes a one-page PDF, and those PDFs are merged into {@code targetPdf}.
 *
 * <p>The intermediate image directory and per-image PDF directory are always removed, whether
 * the conversion succeeds or fails, so that a later call for the same source (for example a
 * retry at a lower dpi) does not pick up stale images.
 *
 * @param source    path of the source PDF; must contain a file extension
 * @param targetPdf path of the PDF to create
 * @param dpi       resolution used when rendering pages to images
 * @throws Pdf2ImageException if no page image was produced
 * @throws PdfSplitException  propagated from the page-rendering step
 * @throws Image2PdfException if an image cannot be converted or the merge fails
 */
public static void pdf2ImagePdf(String source, String targetPdf, int dpi)
    throws Pdf2ImageException, PdfSplitException, Image2PdfException {
    String imagePath = source.substring(0, source.lastIndexOf("."));
    File imageDir = YhPdfUtil.pdf2Images(source, imagePath, dpi);
    File fpd = null;
    try {
        File[] files = imageDir.listFiles();
        if (files == null || files.length == 0) {
            throw new Pdf2ImageException("no image found,may pdf 2 image failed");
        }
        if (files.length == 1) {
            log.info("pdf just one img ,just convert");
            YhPdfUtil.image2Pdf(files[0].getAbsolutePath(), targetPdf);
            return;
        }
        log.info("so much images,convert every img and merge all...");
        String tmpPdfDir =
            source.replace("\\", "/").substring(0, source.lastIndexOf(".")) + "-pdf-" + System
                .currentTimeMillis() + "/";
        fpd = new File(tmpPdfDir);
        if (!fpd.exists()) {
            fpd.mkdirs();
        }
        for (int k = 0; k < files.length; k++) {
            String imageName = files[k].getName();
            // The index suffix keeps the generated file names unique.
            String fn = imageName.substring(0, imageName.lastIndexOf(".")) + k + ".pdf";
            YhPdfUtil.image2Pdf(files[k].getAbsolutePath(), tmpPdfDir + fn);
        }
        File[] tps = fpd.listFiles();
        if (tps == null || tps.length == 0) {
            throw new Image2PdfException("no pdf found,may image 2 pdf failed");
        }
        List<String> tst = new ArrayList<>();
        for (File tp : tps) {
            tst.add(tp.getAbsolutePath());
        }
        tst.sort(Comparator.naturalOrder());
        // mergePdf reports failure through its return value; do not treat that as success.
        if (!YhPdfUtil.mergePdf(tst, targetPdf)) {
            throw new Image2PdfException("merge pdf failed:" + targetPdf);
        }
    } finally {
        try {
            FileUtils.deleteDirectory(imageDir);
            if (fpd != null) {
                FileUtils.deleteDirectory(fpd);
            }
        } catch (IOException e) {
            log.error("pdf转纯图pdf后,删除临时文件失败:", e);
        }
    }
}
/**
 * Produces an image-only copy of {@code source} that is at most {@code size} bytes, retrying
 * with half the dpi until the result fits or the dpi drops to 1.
 *
 * <p>If the source already fits it is copied unchanged. If no attempt meets the limit a warning
 * is logged; the last attempt (if any) is left at {@code targetPdf}.
 *
 * @param source    path of the source PDF
 * @param targetPdf path of the PDF to create
 * @param size      maximum size of the result in bytes
 * @throws Pdf2ImageException propagated from the conversion
 * @throws PdfSplitException  propagated from the conversion
 * @throws Image2PdfException propagated from the conversion
 * @throws IOException        if copying the source fails
 */
public static void pdf2ImagePdfWithMax(String source, String targetPdf, long size)
    throws Pdf2ImageException, PdfSplitException, Image2PdfException, IOException {
    File sourceFile = new File(source);
    long sourceLength = sourceFile.length();
    if (sourceLength <= size) {
        log.info("sourceFile's length:{}<=size:{},just copy", sourceLength, size);
        FileUtils.copyFile(sourceFile, new File(targetPdf));
        return;
    }
    // Starting dpi is proportional to the required shrink ratio. The source is larger than the
    // limit here, so the ratio is below 1000 and fits an int without any clamping.
    int startDpi = (int) Math.max(0L, size * 1000 / sourceLength);
    boolean fits = false;
    for (int dpi = startDpi; dpi > 1 && !fits; dpi = dpi / 2) {
        pdf2ImagePdf(source, targetPdf, dpi);
        fits = new File(targetPdf).length() <= size;
    }
    if (!fits) {
        log.warn("unable to compress {} below {} bytes, start dpi was {}", source, size, startDpi);
    }
}
/**
 * Merges several PDF files into one, in list order.
 *
 * <p>The page size of the new document is taken from the first page of the first file.
 *
 * @param fileList   local files to merge, e.g. ["D:/opt/aaa.pdf","D:/opt/bbb.pdf"]
 * @param newPdfPath path the merged file is written to, e.g. "D:/opt/ccc.pdf"
 * @return {@code true} if the merged file was written; {@code false} if the list is empty or
 *         reading/writing failed
 */
public static boolean mergePdf(List<String> fileList, String newPdfPath) {
    if (fileList == null || fileList.isEmpty()) {
        // Nothing to merge: fail before an empty target file is created.
        log.error("pdf合并失败: no input files");
        return false;
    }
    Document document = null;
    FileOutputStream fo = null;
    try {
        fo = new FileOutputStream(newPdfPath);
        // The first file is opened only to read its page size, so release it right away.
        PdfReader first = new PdfReader(fileList.get(0));
        try {
            document = new Document(first.getPageSize(1));
        } finally {
            first.close();
        }
        PdfCopy copy = new PdfCopy(document, fo);
        copy.setFullCompression();
        document.open();
        for (String path : fileList) {
            PdfReader reader = new PdfReader(path);
            try {
                int n = reader.getNumberOfPages();
                for (int j = 1; j <= n; j++) {
                    document.newPage();
                    PdfImportedPage page = copy.getImportedPage(reader, j);
                    copy.addPage(page);
                }
            } finally {
                reader.close();
            }
        }
        // Closing the document finishes the output, so do it before reporting success.
        document.close();
        document = null;
        return true;
    } catch (IOException | DocumentException e) {
        log.error("pdf合并失败:", e);
        return false;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                // Keep going so the file stream below is still closed.
                log.error("pdf合并失败:", e);
            }
        }
        if (fo != null) {
            try {
                fo.close();
            } catch (Exception e) {
                log.error("Io关闭异常:", e);
            }
        }
    }
}
方案描述
该方案是通过pdfbox按某个dpi将pdf拆分成图片,然后再将拆出来的图片通过itext合成为pdf。如果合并后的pdf大于目标体积,则按更小的dpi再来一遍。
问题
其实该方案流程上没有问题,但是在性能上会存在非常大的漏洞及消耗——内存泄漏问题。pdfbox会缓存大量的pdf元数据(字体、字典)等信息,
且无法被GC,或者说,在GC之前,Java服务进程已经被服务器杀死了。刚开始还以为是版本问题,我看最新版本对内存做了优化,但是在升级
最新版本之后,内存增长虽然好了些,但是在有限的内存下,依旧会因内存泄漏问题导致服务宕机。
方案三:采用aspose将pdf转为图片(不推荐)
代码
/**
 * Renders every page of a PDF to an image file inside {@code imageDirPath} using Aspose.
 *
 * <p>A one-page document is rendered directly with a {@code JpegDevice}. Any other document is
 * first split into one PDF per page (in a temporary directory that is removed afterwards) and
 * this method is then called for each page PDF in file-name order.
 *
 * <p>NOTE(review): the image is produced by a JPEG device but saved with a {@code .png}
 * extension; the extension is kept as-is here — confirm before changing it.
 *
 * @param pdfPath      path of the source PDF; must contain a file extension
 * @param imageDirPath directory the images are written to; created if missing
 * @param dpi          rendering resolution
 * @return the image directory
 * @throws Pdf2ImageException if rendering fails with an I/O error
 * @throws PdfSplitException  if splitting produced no page files
 */
public static File pdf2Images(String pdfPath, String imageDirPath, int dpi)
    throws Pdf2ImageException, PdfSplitException {
    imageDirPath = imageDirPath.replace("\\", "/");
    if (!imageDirPath.endsWith("/")) {
        imageDirPath = imageDirPath + "/";
    }
    File file = new File(pdfPath);
    File imageDir = new File(imageDirPath);
    if (!imageDir.exists()) {
        imageDir.mkdirs();
    }
    try {
        com.aspose.pdf.Document pdDocument = new com.aspose.pdf.Document(pdfPath);
        int pages;
        try {
            pages = pdDocument.getPages().size();
            if (pages == 1) {
                String pdfName = file.getName();
                String tmpImage = imageDirPath + pdfName.substring(0, pdfName.lastIndexOf(".")) +
                    "-" + System.currentTimeMillis() + ".png";
                log.info("pdf just one page,use dpi {} pdf file 2 image:{}>>>{}", dpi, pdfPath, tmpImage);
                JpegDevice jpegDevice = new JpegDevice(new Resolution(dpi));
                try (FileOutputStream out = new FileOutputStream(new File(tmpImage))) {
                    jpegDevice.process(pdDocument.getPages().get_Item(1), out);
                    out.flush();
                }
            }
        } finally {
            // Release the document on every path, and before recursing into the page files.
            pdDocument.close();
        }
        if (pages != 1) {
            log.info("the pdf so many pages, split every page before convert...");
            String tmpPdfPath =
                pdfPath.replace("\\", "/").substring(0, pdfPath.lastIndexOf(".")) + "-pdf-" + System
                    .currentTimeMillis() + "/";
            try {
                File tmpPdfDir = splitPerPagePdf(pdfPath, tmpPdfPath);
                File[] files = tmpPdfDir.listFiles();
                if (files == null || files.length == 0) {
                    throw new PdfSplitException("pdf split failed, no result fle found");
                }
                List<File> pdfs = new ArrayList<>(Arrays.asList(files));
                // Order by each page file's own name.
                pdfs.sort(Comparator.comparing(File::getName));
                for (File pagePdf : pdfs) {
                    pdf2Images(pagePdf.getAbsolutePath(), imageDirPath, dpi);
                }
            } finally {
                try {
                    FileUtils.deleteDirectory(new File(tmpPdfPath));
                } catch (IOException e) {
                    // Cleanup is best effort and must not hide the real outcome.
                    log.error("delete tmp pdf dir failed:{}", tmpPdfPath, e);
                }
            }
        }
        return imageDir;
    } catch (IOException e) {
        log.error("pdf转图片失败:{}", pdfPath, e);
        throw new Pdf2ImageException(pdfPath);
    }
}
/**
 * Renders one image file as a single-page A4 PDF.
 *
 * <p>Images wider than they are tall are rotated by 90 degrees first. The image is then scaled
 * by the percentage returned from {@code getPercent} and centred on the page.
 *
 * @param inputFile path of the image to convert
 * @param pdfFile   path of the PDF to create
 * @throws Image2PdfException if the image cannot be read or the PDF cannot be written
 */
public static void image2Pdf(String inputFile, String pdfFile) throws Image2PdfException {
    log.info("convert image 2 pdf :{}>>>{}", inputFile, pdfFile);
    Document doc = null;
    PdfWriter pdfWriter = null;
    FileOutputStream pdfOut = null;
    try {
        File file = new File(inputFile);
        String fileName = file.getName();
        // Locale.ROOT keeps extension handling independent of the JVM's default locale.
        String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);
        doc = new Document(PageSize.A4, 20, 20, 20, 20);
        pdfOut = new FileOutputStream(pdfFile);
        pdfWriter = PdfWriter.getInstance(doc, pdfOut);
        doc.open();
        doc.newPage();
        Image image;
        if (lowerName.endsWith("jpg") || lowerName.endsWith("jpeg")) {
            // JPEG files are loaded through AWT rather than through iText's own reader.
            java.awt.Image awtImage = Toolkit.getDefaultToolkit().createImage(file.getAbsolutePath());
            image = Image.getInstance(awtImage, null);
        } else {
            image = Image.getInstance(file.getAbsolutePath());
        }
        float height = image.getHeight();
        float width = image.getWidth();
        if (width > height) {
            // Landscape image: rotate it so it is laid out along the portrait page.
            BufferedImage rotated;
            try (FileInputStream fi = new FileInputStream(file)) {
                rotated = RotateImage.rotate(ImageIO.read(fi), 90);
            }
            String type = fileName.substring(fileName.lastIndexOf(".") + 1)
                .toLowerCase(java.util.Locale.ROOT);
            ByteArrayOutputStream outStream = new ByteArrayOutputStream();
            ImageIO.write(rotated, type, outStream);
            image = Image.getInstance(outStream.toByteArray());
            height = image.getHeight();
            width = image.getWidth();
        }
        int percent = getPercent(height, width);
        image.setAlignment(Image.MIDDLE);
        image.scalePercent(percent);
        float x = (PageSize.A4.getWidth() - image.getScaledWidth()) / 2;
        float y = (PageSize.A4.getHeight() - image.getScaledHeight()) / 2;
        image.setAbsolutePosition(x, y);
        doc.add(image);
    } catch (Exception e) {
        log.error("image 2 pdf failed:{}>>>{}", inputFile, pdfFile);
        log.error("exception info:", e);
        throw new Image2PdfException(e.getMessage());
    } finally {
        if (doc != null) {
            try {
                doc.close();
            } catch (Exception e) {
                // Closing a document that never received content throws; that is expected here.
                log.info("空文档:", e);
            }
        }
        if (pdfWriter != null) {
            pdfWriter.close();
        }
        if (pdfOut != null) {
            try {
                // Covers the case where the writer was never created; closing twice is harmless.
                pdfOut.close();
            } catch (IOException e) {
                log.error("pdf output close failed:", e);
            }
        }
    }
}
/**
 * Merges several PDF files into one, in list order.
 *
 * <p>The page size of the new document is taken from the first page of the first file.
 *
 * @param fileList   local files to merge, e.g. ["D:/opt/aaa.pdf","D:/opt/bbb.pdf"]
 * @param newPdfPath path the merged file is written to, e.g. "D:/opt/ccc.pdf"
 * @return {@code true} if the merged file was written; {@code false} if the list is empty or
 *         reading/writing failed
 */
public static boolean mergePdf(List<String> fileList, String newPdfPath) {
    if (fileList == null || fileList.isEmpty()) {
        // Nothing to merge: fail before an empty target file is created.
        log.error("pdf合并失败: no input files");
        return false;
    }
    Document document = null;
    FileOutputStream fo = null;
    try {
        fo = new FileOutputStream(newPdfPath);
        // The first file is opened only to read its page size, so release it right away.
        PdfReader first = new PdfReader(fileList.get(0));
        try {
            document = new Document(first.getPageSize(1));
        } finally {
            first.close();
        }
        PdfCopy copy = new PdfCopy(document, fo);
        copy.setFullCompression();
        document.open();
        for (String path : fileList) {
            PdfReader reader = new PdfReader(path);
            try {
                int n = reader.getNumberOfPages();
                for (int j = 1; j <= n; j++) {
                    document.newPage();
                    PdfImportedPage page = copy.getImportedPage(reader, j);
                    copy.addPage(page);
                }
            } finally {
                reader.close();
            }
        }
        // Closing the document finishes the output, so do it before reporting success.
        document.close();
        document = null;
        return true;
    } catch (IOException | DocumentException e) {
        log.error("pdf合并失败:", e);
        return false;
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (Exception e) {
                // Keep going so the file stream below is still closed.
                log.error("pdf合并失败:", e);
            }
        }
        if (fo != null) {
            try {
                fo.close();
            } catch (Exception e) {
                log.error("Io关闭异常:", e);
            }
        }
    }
}
public static void compress(String source, String target,int qa) {
new com.aspose.pdf.Document doc = new new com.aspose.pdf.Document(source);
//设置压缩属性
OptimizationOptions opt = new OptimizationOptions();
//删除PDF不必要的对象
opt.setRemoveUnusedObjects(true);
//链接重复流
opt.setLinkDuplcateStreams(false);
//删除未使用的流
opt.setRemoveUnusedStreams(false);
//删除不必要的字体
opt.setUnembedFonts(true);
//压缩PDF中的图片
opt.setCompressImages(true);
//图片压缩比, 0 到100可选,越低压缩比越大
opt.setImageQuality(qa);
doc.optimizeResources(opt);
//优化web的PDF文档
doc.optimize();
doc.save(target);
}
方案描述
流程是 pdf转图片->图片转pdf->合并->循环压缩至指定大小,该方案解决了pdfbox内存泄漏问题
问题
虽然解决了pdfbox内存泄漏问题,但是内存占用依旧非常严重。几个文件转换,内存飙升4个G。对服务而言,
还是比较危险的,在内存宽裕的情况下,采用这套方案可以,但是在内存紧张的情况下,不建议如此去做。
方案四:ghostscript+ImageMagick(推荐,最终方案)
代码:
// ImageMagick executable (possibly prefixed with sudo), chosen per operating system below.
private static String command = "";
// ImageMagick command template: executable, quality, input path, output path.
private static final String cmdExpress = "%s -density 150 -quality %s -limit memory 10mb -limit map 10mb %s %s";
// Ghostscript executable (possibly prefixed with sudo), chosen per operating system below.
private static String gsCommand = "";
static {
    String osName = System.getProperty("os.name");
    String normalized = osName == null ? "" : osName.toLowerCase();
    boolean windows = normalized.contains("window");
    // NOTE(review): os.name is usually "Linux" on Ubuntu, so this branch may never match — confirm.
    boolean ubuntu = !windows && normalized.contains("ubuntu");
    if (windows) {
        command = "magick";
        gsCommand = "gswin32c";
    } else {
        String prefix = ubuntu ? "sudo " : "";
        command = prefix + "convert";
        gsCommand = prefix + "gs";
    }
}
/**
 * Converts a PDF into an image-only PDF with ImageMagick: the pages are first rendered to PNG
 * files, which are then combined into {@code targetPdf}.
 *
 * <p>The commands are started from an argument list, so paths containing spaces are passed
 * intact. Page images are numbered with a zero-padded index so that the {@code *.png} pattern
 * used for the second step lists them in page order. The image directory is always removed.
 *
 * @param pdfPath   path of the source PDF; must contain a file extension
 * @param targetPdf path of the PDF to create
 * @param qa        value passed to ImageMagick's {@code -quality} option
 * @throws Pdf2ImageException if rendering the pages fails or times out (5 minutes)
 * @throws Image2PdfException if combining the images fails or times out (3 minutes)
 */
public static void pdf2ImagePdf(String pdfPath, String targetPdf, int qa)
    throws Pdf2ImageException, Image2PdfException {
    String imageDirPath = pdfPath.substring(0, pdfPath.lastIndexOf(".")).replace("\\", "/");
    log.info("pdf2image:{}>>>{}", pdfPath, imageDirPath);
    pdfPath = pdfPath.replace("\\", "/");
    String pdfName = new File(pdfPath).getName();
    File imageDir = new File(imageDirPath);
    if (!imageDir.exists()) {
        imageDir.mkdirs();
    }
    try {
        // %04d is expanded by ImageMagick to the page index, giving name-0000.png, name-0001.png...
        String imageFilePath =
            imageDirPath + "/" + pdfName.substring(0, pdfName.lastIndexOf(".")) + "-%04d.png";
        List<String> pdf2ImgCmd = magickArgs(qa, pdfPath, imageFilePath);
        log.info("pdf2ImgCmd:{}", String.join(" ", pdf2ImgCmd));
        try {
            runMagick(pdf2ImgCmd, 5);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.error("pdf转图片失败:", e);
            throw new Pdf2ImageException(e.getMessage());
        }
        // The pattern is handed to ImageMagick as-is; no shell is involved.
        List<String> img2PdfCmd = magickArgs(qa, imageDirPath + "/*.png", targetPdf);
        log.info("convert2PdfCmd:{}", String.join(" ", img2PdfCmd));
        try {
            runMagick(img2PdfCmd, 3);
        } catch (Exception e) {
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            log.error("图片转pdf失败:", e);
            throw new Image2PdfException(e.getMessage());
        }
    } finally {
        FileUtil.del(imageDirPath);
    }
}

/** Builds the ImageMagick argument list: executable tokens, fixed options, input, output. */
private static List<String> magickArgs(int qa, String input, String output) {
    // The configured command may hold several tokens (e.g. "sudo convert").
    List<String> args = new ArrayList<>(Arrays.asList(command.trim().split("\\s+")));
    args.addAll(Arrays.asList("-density", "150", "-quality", String.valueOf(qa),
        "-limit", "memory", "10mb", "-limit", "map", "10mb", input, output));
    return args;
}

/** Runs a command, waiting at most the given number of minutes; kills it and fails on timeout. */
private static void runMagick(List<String> args, long timeoutMinutes)
    throws IOException, InterruptedException {
    // inheritIO keeps the child from blocking on a full, unread output pipe.
    Process pro = new ProcessBuilder(args).inheritIO().start();
    try {
        if (!pro.waitFor(timeoutMinutes, TimeUnit.MINUTES)) {
            throw new IOException("command timed out after " + timeoutMinutes + " minutes: " + args);
        }
        if (pro.exitValue() != 0) {
            // Reported but not fatal, so a run that still produced output is not rejected.
            log.warn("command exited with code {}: {}", pro.exitValue(), args);
        }
    } finally {
        if (pro.isAlive()) {
            pro.destroyForcibly();
        }
    }
}
/**
 * Re-writes a PDF through Ghostscript's {@code pdfwrite} device with the {@code /screen}
 * preset to reduce its size.
 *
 * <p>Ghostscript is started from an argument list, so paths containing spaces are passed
 * intact, and it is killed if it has not finished within 3 minutes.
 *
 * @param src  source file
 * @param dest target file
 * @param qa   value passed to Ghostscript's {@code -r} (resolution) option
 * @throws IOException declared for callers; failures of the conversion itself surface as
 *                     {@code PdfCompressException}
 */
public static void compressPdf(String src, String dest, int qa) throws IOException {
    src = src.replace("\\", "/");
    dest = dest.replace("\\", "/");
    // The configured command may hold several tokens (e.g. "sudo gs").
    List<String> cmd = new ArrayList<>(Arrays.asList(gsCommand.trim().split("\\s+")));
    cmd.addAll(Arrays.asList("-dQUIET", "-dNOSAFER", "-r" + qa, "-sDEVICE=pdfwrite",
        "-dCompatibilityLevel=1.3", "-dPDFSETTINGS=/screen", "-dNOPAUSE", "-dBATCH",
        "-dColorImageResolution=150", "-sOutputFile=" + dest, src));
    log.info("{}", String.join(" ", cmd));
    Process process = null;
    try {
        // inheritIO keeps the child from blocking on a full, unread output pipe.
        process = new ProcessBuilder(cmd).inheritIO().start();
        if (!process.waitFor(3, TimeUnit.MINUTES)) {
            throw new IOException("ghostscript timed out after 3 minutes");
        }
        if (process.exitValue() != 0) {
            // Reported but not fatal, so a run that still produced output is not rejected.
            log.warn("ghostscript exited with code {}", process.exitValue());
        }
    } catch (Exception e) {
        if (e instanceof InterruptedException) {
            Thread.currentThread().interrupt();
        }
        log.error("文档转换失败:", e);
        throw new PdfCompressException(e.getMessage());
    } finally {
        if (process != null && process.isAlive()) {
            process.destroyForcibly();
        }
    }
}
/**
 * Produces a copy of {@code source} that is at most {@code size} bytes.
 *
 * <p>A source that already fits is copied unchanged. Otherwise the PDF is turned into an
 * image-only PDF and then compressed to the requested size on the shared worker pool, with an
 * overall limit of 5 minutes. The intermediate "-tmp.pdf" file is always removed.
 *
 * @param source    path of the source PDF
 * @param targetPdf path of the PDF to create; must contain a file extension
 * @param size      maximum size of the result in bytes
 * @throws IOException if copying fails; conversion failures and timeouts surface as
 *                     {@code PdfCompressException}
 */
public static void pdf2ImagePdfWithMax(String source, String targetPdf, long size)
    throws IOException {
    File sourceFile = new File(source);
    if (sourceFile.length() <= size) {
        log.info("sourceFile's length:{}<=size:{},just copy", sourceFile.length(), size);
        FileUtils.copyFile(sourceFile, new File(targetPdf));
        return;
    }
    String targetTmpPdf = targetPdf.substring(0, targetPdf.lastIndexOf(".")) + "-tmp" + ".pdf";
    FutureTask<Boolean> futureTask = new FutureTask<>(() -> {
        pdf2ImagePdf(source, targetTmpPdf, 96);
        compressPdf2FixLength(targetTmpPdf, targetPdf, size);
        return true;
    });
    try {
        YhConstant.ITEM_POOL.submit(futureTask);
        try {
            futureTask.get(5, TimeUnit.MINUTES);
        } catch (Exception e) {
            // Stop a task that is still running so it does not keep working on a temp file
            // that is about to be deleted; this is a no-op if the task already finished.
            futureTask.cancel(true);
            if (e instanceof InterruptedException) {
                Thread.currentThread().interrupt();
            }
            throw new PdfCompressException("压缩失败:" + e.getMessage());
        }
    } finally {
        File file = new File(targetTmpPdf);
        if (file.exists()) {
            file.delete();
        }
    }
}
方案描述:
流程依旧是 pdf转图片->图片合并成pdf->pdf压缩
只是通过系统层ghostscript+ImageMagick来实现
centos:
yum install -y ghostscript ImageMagick
vi /etc/ImageMagick-6/policy.xml
将 <policy domain="module"这一行取消注释,并改为:
<policy domain="module" rights="read|write" pattern="{PS,PDF,XPS}" />
ubuntu:
apt install -y ghostscript imagemagick
同样需要修改ImageMagick配置目录(例如 /etc/ImageMagick-6)下的policy.xml文件
windows:
自行安装且添加环境变量。
问题
ghostscript压缩pdf稍微费些内存,但是比起java要好很多。建议在ghostscript压缩时加入线程池进行并发控制,降低内存爆掉的风险。
总结
Java就是TMD费内存,JVM优化其实也就那样,5家客户同时在用的saas系统,我只能把xms、xmx设为服务器剩余的4个G,再怎么优化也是醉了。