maven之读写pdf简单实例(pdfbox与itext)与pdfbox源码解析
(访问者模式)
记录学习的脚步
本文使用pdfbox读写pdf。但是pdfbox在写pdf的时候对中文的支持不好,会出现乱码。我尝试修改COSString的源码,试了UTF-8、UTF-16BE几种编码,中文输出还是乱码;接着把pdfbox parent的pom中的 <project.build.sourceEncoding>ISO-8859-1</project.build.sourceEncoding> 属性改为UTF-8,还是不行。好吧,能力有限,还是放弃了。
所幸itext对中文的支持还不错,因此使用itext来写pdf。
参考
1、先看pdfbox的读写pdf的代码
产生pdf的 SavePdfDocument.java 类,必要的地方都加了注释
package com.undergrowth.pdfbox;
import java.io.IOException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.cos.COSString;
import org.apache.pdfbox.exceptions.COSVisitorException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.edit.PDPageContentStream;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.pdmodel.font.PDType1Font;
/**
* SavePdfDocument类⽤于产⽣pdf⽂档
* @author Administrator
* @date 2014-8-31
* @version 1.0.0
getsavefilename*/
public class SavePdfDocument {
/**
* ⽇志常量
*/
public static final Log Log(SavePdfDocument.class);
/**
* 测试产⽣pdf⽂档
* @param sayWhat 要写⼊到pdf⽂档中的内容
* @param filePath 保存pdf的路径
* @throws IOException
* @throws COSVisitorException
*
*/
public boolean helloPdf(String sayWhat,String filePath) throws IOException, COSVisitorException{
boolean f=false;
PDDocument document=getPdDocument();
PDPage page=getPdPage();
PDPage page=getPdPage();
document.addPage(page);
PDFont font=getFont();
PDPageContentStream contentStream=getPdPageContentStream(document, page);
contentStream.beginText();
contentStream.setFont(font, 20);
/* COSString cosString=new COSString(new Bytes(), "UTF-16BE")); contentStream.drawString("hello world"+"\t");*/
//contentStream.drawString("hello world"+String());
contentStream.drawString(sayWhat);
//关闭页⾯内容流
contentStream.close();
document.save(filePath);
document.close();
logger.info("成功创建pdf");
f=true;
return f;
}
/**
* 获取空的pdf⽂档对象
* @return PDDocument
*/
public PDDocument getPdDocument(){
PDDocument document=new PDDocument();
return document;
}
/**
* 通过⽂件名加载⽂档
* @param fileName
* @return PDDocument
* @throws IOException
*/
public PDDocument getPdDocument(String fileName) throws IOException{
PDDocument document=PDDocument.load(fileName);
return document;
}
/**
* 获取空的pdf页⾯对象
* @return PDPage
*/
public PDPage getPdPage(){
PDPage page =new PDPage();
return page;
}
/**
* 获取海维提卡体
* @return PDFont
*/
public PDFont getFont(){
public PDFont getFont(){
PDFont font=PDType1Font.HELVETICA_BOLD;
return font;
}
/**
* 获取页⾯内容流向页⾯添加内容
* @param document PDDocument
* @param page PDPage
* @return PDPageContentStream
* @throws IOException
*/
public PDPageContentStream getPdPageContentStream(PDDocument document,PDPage page) throws IOException{ PDPageContentStream contentStream=new PDPageContentStream(document, page);
return contentStream;
}
}
提取pdf的 PdfTextStripperTest.java
package com.undergrowth.pdfbox;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;
public class PdfTextStripperTest {
public static Log Log(PdfTextStripperTest.class);
/**
* 获取⽂本提取
*
* @param document
* @param writer
* @throws IOException
*/
public void getTextStripper(PDDocument document, Writer writer) throws IOException {
PDFTextStripper textStripper = new PDFTextStripper();
textStripper.writeText(document, writer);
}
/**
* 提取⽂本内容
* @param String fileName 加载⽂档的路径
* @return String
* @throws IOException
*/
public String getText(String fileName) throws IOException {
String textString = "";
SavePdfDocument pdfDocument = new SavePdfDocument();
PDDocument document = PdDocument(fileName); //将提取出来的字节流转换为字符流进⾏显⽰
ByteArrayOutputStream out = new ByteArrayOutputStream();
OutputStreamWriter writer = new OutputStreamWriter(out);
getTextStripper(document, writer);
document.close();
out.close();
writer.close();
byte[] con = ByteArray();
textString = new String(con);
log.info("提取的⽂本内容为:"+textString);
return textString;
}
}
测试类
package com.undergrowth.pdfbox;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.exceptions.COSVisitorException;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
* Unit test for simple App.
*/
public class AppTest
extends TestCase
{
/**
 * Creates the test case.
 *
 * @param testName name of the test case, forwarded to the superclass constructor
 */
public AppTest(String testName) {
    super(testName);
}
/**
 * Builds the suite for this test class.
 *
 * @return a TestSuite constructed from AppTest.class
 */
public static Test suite() {
    final Test allTests = new TestSuite(AppTest.class);
    return allTests;
}
/**
 * Writes a one-line PDF with SavePdfDocument, then extracts the text of a PDF
 * with PdfTextStripperTest and checks that something was extracted.
 *
 * @throws IOException         if writing or reading a PDF fails
 * @throws COSVisitorException if PDFBox fails while saving the document
 */
public void testApp() throws COSVisitorException, IOException {
    SavePdfDocument pdfDocument = new SavePdfDocument();
    String filePath = "e:\\hello.pdf";
    boolean created = pdfDocument.helloPdf("hello world", filePath);
    assertTrue(created);

    // NOTE(review): extraction reads a different file from the one written above,
    // so E:\test11.pdf must already exist for this test to pass — confirm this is intended.
    filePath = "E:\\test11.pdf";
    PdfTextStripperTest textStripperTest = new PdfTextStripperTest();
    String stripperText = textStripperTest.getText(filePath);

    // assertNotSame only compares references, so it could never fail here;
    // check the extracted content instead.
    assertNotNull(stripperText);
    assertFalse(stripperText.isEmpty());
}
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论