使用 PDFBox 将 PDF 中的表格识别并导出为 Excel
1、继承 PageDrawer 和 PDFRenderer,获得文本框坐标
2、通过坐标获取文字
3、通过 EasyExcel 生成表格
public class MyPageDrawer extends PageDrawer {
static final List<Coordinate> COORDINATE_LIST = new ArrayList<>();
double pageHeight;
MyPageDrawer(PageDrawerParameters parameters) throws IOException
{
super(parameters);
this.Page().getBBox().getHeight();
}
PDPage pdPage;
@Override
public void processPage(PDPage aPage) throws IOException {
this.pdPage=aPage;
super.processPage(aPage);
}
@Override
public void fillPath(int windingRule) {
Shape bbox = getLinePath().getBounds2D();
Coordinate startCoordinate = new Bounds().getLocation().x,(Bounds().getLocation().y);
COORDINATE_LIST.add(startCoordinate);
getLinePath().reset();
}
}
/**
 * A {@link PDFRenderer} whose pages are drawn by {@link MyPageDrawer}.
 */
public class MyPDFRenderer extends PDFRenderer {

    /**
     * Creates a renderer for the given document.
     *
     * @param document the document to render
     */
    MyPDFRenderer(PDDocument document) {
        super(document);
    }

    /**
     * Supplies a {@link MyPageDrawer} in place of the default page drawer.
     *
     * @param parameters drawer parameters supplied by the rendering machinery
     * @return a new {@link MyPageDrawer} built from {@code parameters}
     * @throws IOException if the drawer cannot be constructed
     */
    @Override
    protected PageDrawer createPageDrawer(PageDrawerParameters parameters) throws IOException {
        final PageDrawer drawer = new MyPageDrawer(parameters);
        return drawer;
    }
}
public class App {
public static void main(String[] args) throws Exception {
String fileName = "E:\\download\\test\\2020年12⽉北京⼯程造价信息.pdf"; //这⾥先⼿动把绝对路径的⽂件夹给补上。
readPDF(fileName);
}
/**
* 读PDF⽂件,使⽤了pdfbox开源项⽬
* @param fileName
*/
public static void readPDF(String fileName) {
File file = new File(fileName);
FileInputStream in = null;
try {
in = new FileInputStream(fileName);
// 新建⼀个PDF解析器对象
PDFParser parser = new PDFParser(new RandomAccessFile(file,"rw"));
/
/ 对PDF⽂件进⾏解析
parser.parse();
// 获取解析后得到的PDF⽂档对象
PDDocument pdfdocument = PDDocument();
System.out.println("NumberOfPages:"+ NumberOfPages());
PDFRenderer renderer = new MyPDFRenderer(pdfdocument);
int pageNum=12;
BufferedImage image = derImage(pageNum);
ImageIO.write(image, "PNG", new File("test.png"));
// System.out.println("SEG_");
// MyPageDrawer.SEG_LINETO_LIST.stream().forEach(System.out::println);
String resultFileName = "simpleWrite" + System.currentTimeMillis() + ".xlsx";
EasyExcel.write(resultFileName).sheet().doWrite(judgeCoordinate(MyPageDrawer.COORDINATE_LIST, pdfdocument, pageNum)); } catch (Exception e) {
System.out.println("读取PDF⽂件" + AbsolutePath() + "⽣失败!" + e);
e.printStackTrace();
} finally {
if (in != null) {
try {
in.close();
} catch (IOException e1) {
}
}
}
}
/**
* 去重排序
*
* @param coordinateList
* @param document
* @return
*/
private static List<List<String>> judgeCoordinate(List<Coordinate> coordinateList, PDDocument document,int pageNum) {
//去除pdf边界
coordinateList=coordinateList.stream().filter(coordinate -> !(X()<38||Y()<70||Y()>780||X()>558)).List()); // 去重按y,x排序从左上⾓开始计算
coordinateList = coordinateList.stream().sorted(Comparatorparing(Coordinate::getY).thenComparing(Coordinate::getX)).List());
System.out.println("去重,排序后,分组前...");
coordinateList.stream().forEach(System.out::println);
// 去除相近元素
for(int a=0;a<coordinateList.size();a++){
Coordinate coordinateStart = (a);
for (int j = a+1; j < coordinateList.size(); j++) {
Coordinate coordinateC = (j);
if (Math.Y()-Y()) <=2) {
if(Math.X()-X())<=2){
j--;
}else {
int Y()&Y()?Y():Y();
coordinateC.setY(y);
}
}else {
break;
}
}
}
//需要重新排序
coordinateList=coordinateList.stream().sorted(Comparatorparing(Coordinate::getY).thenComparing(Coordinate::getX))
.List());
Map<Integer, List<Coordinate>> groupList = coordinateList.stream()
.upingBy(Coordinate::getY));
Map<Integer, List<Coordinate>> result =new LinkedHashMap<>();
.forEachOrdered(e -> result.Key(), e.getValue()));
System.out.println("总⾏数:"+result.size());
List<List<Coordinate>> resultRow = result.values().stream()
.List());
resultRow=resultRow.stream().filter(item-> (item.size()>1)).List());
System.out.println("去重,排序,分组后...");
resultRow.stream().forEach(System.out::println);
List<List<String>> mapList = new ArrayList<>();
for (int k = 0; k < resultRow.size()-1; k++) {
Map<String,String> map = new HashMap<>();
List<String> listRow=new ArrayList<>();
boolean nullData=false;
for (int i = 0; i < (k).size()-1; i++) {
Coordinate (k).get(i);
List<Coordinate> (k+1);
if(nextRow.size()>i+1){
Coordinate (i+1);
int X() - X();
int Y() - Y();
//左上⾓为原始点向右加宽向下加⾼
try {
String info = X(), Y(),
width,height, document,pageNum);
info = placeAll("\r|\n", "");
sortedlistmap.put("column"+i,info);
if(info==null||info.length()==0){
nullData=true;
}else {
nullData=false;
listRow.add(info);
}
}catch (Exception e){
e.printStackTrace();
}
}
}
if(!nullData){
mapList.add(listRow);
}
}
Gson gson = new Gson();
String mapListString = Json(mapList);
System.out.println(mapListString);
return mapList;
}
private static String readRectangleInfo(int x, int y, int width, int height, PDDocument document , int pageNum) throws Exception {
PDFTextStripperByArea stripper = new PDFTextStripperByArea();
stripper.setSortByPosition(true);
Rectangle rect = new Rectangle(x, y, width, height);
stripper.addRegion("rect", rect);
PDPage firstPage = Page(pageNum);
TextForRegion("rect");
}
}
<!-- Maven dependencies required by the PDF-to-Excel example above. -->
<dependencies>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>pdfbox</artifactId>
<version>2.0.22</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/fontbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
<version>2.0.22</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.pdfbox/jempbox -->
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>jempbox</artifactId>
<version>1.8.16</version>
</dependency>
<dependency>
<groupId>com.google.code.gson</groupId>
<artifactId>gson</artifactId>
<version>2.8.0</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>easyexcel</artifactId>
<version>2.2.7</version>
</dependency>
</dependencies>
版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。
发表评论