|
@@ -1,10 +1,124 @@
|
|
package me.yoqi.pdf;
|
|
package me.yoqi.pdf;
|
|
|
|
|
|
|
|
+import java.io.File;
|
|
|
|
+import java.io.FileWriter;
|
|
|
|
+import java.io.IOException;
|
|
|
|
+import java.util.ArrayList;
|
|
|
|
+import java.util.List;
|
|
|
|
+
|
|
|
|
+import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
|
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
|
|
|
|
+import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
|
+
|
|
|
|
+/**
|
|
|
|
+ * 批量提取指定文件夹中所有pdf文件为txt格式。 并按照每句话分词 随机取出100句话。
|
|
|
|
+ *
|
|
|
|
+ * @author liuyuqi
|
|
|
|
+ *
|
|
|
|
+ */
|
|
public class Main {
|
|
public class Main {
|
|
|
|
+ // 项目目录
|
|
|
|
+ private String projectPath;
|
|
|
|
+ private String resultFile;
|
|
|
|
+ private List<String> suffixList = new ArrayList<String>();
|
|
|
|
|
|
public static void main(String[] args) {
|
|
public static void main(String[] args) {
|
|
- // TODO 自动生成的方法存根
|
|
|
|
|
|
+ Main m = new Main();
|
|
|
|
+ m.init();
|
|
|
|
+ m.bathGetText();
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 初始化参数
|
|
|
|
+ */
|
|
|
|
+ public void init() {
|
|
|
|
+ projectPath = "E:\\data\\workspace\\PDFOperation\\data";
|
|
|
|
+ resultFile="E:\\data\\workspace\\PDFOperation\\output\\result.txt";
|
|
|
|
+ suffixList.add(".pdf");// 增加后缀
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ // 保存的结果,输出
|
|
|
|
+ public void outputData(String fileName, String content) {
|
|
|
|
+ try {
|
|
|
|
+ //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
|
|
|
|
+ FileWriter writer = new FileWriter(fileName, true);
|
|
|
|
+ writer.write(content);
|
|
|
|
+ writer.close();
|
|
|
|
+ } catch (IOException e) {
|
|
|
|
+ e.printStackTrace();
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ public void bathGetText() {
|
|
|
|
+ String filedir = this.projectPath;
|
|
|
|
+ if (null == filedir || "".equals(filedir.trim())) {
|
|
|
|
+ System.out.println("filedir 目录不对!");
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ filedir = filedir.trim();
|
|
|
|
+ if (null == suffixList || suffixList.size() <= 0) {
|
|
|
|
+ System.out.println("suffixList 没有要匹配的后缀!");
|
|
|
|
+ return;
|
|
|
|
+ }
|
|
|
|
+ File f = new File(filedir);
|
|
|
|
+ if (f.isDirectory()) {
|
|
|
|
+ handleDirectory(f);
|
|
|
|
+ } else {
|
|
|
|
+ System.out.println("filedir 必须为目录");
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ private void handleDirectory(File filedir) {
|
|
|
|
+ // 目录
|
|
|
|
+ File[] files = filedir.listFiles();
|
|
|
|
+ for (File subFile : files) {
|
|
|
|
+ if (subFile.isDirectory()) {
|
|
|
|
+ handleDirectory(subFile);
|
|
|
|
+ } else {
|
|
|
|
+ // 文件
|
|
|
|
+ for (String suffix : suffixList) {
|
|
|
|
+ if (subFile.getName().endsWith(suffix)) {
|
|
|
|
+ System.out.println(subFile.getName());
|
|
|
|
+ getTextFromPDF(subFile);
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+ }
|
|
|
|
+
|
|
|
|
+ /**
|
|
|
|
+ * 处理单个pdf为字符串
|
|
|
|
+ *
|
|
|
|
+ * @param pdfFilePath
|
|
|
|
+ * pdf文件路径
|
|
|
|
+ */
|
|
|
|
+ public void getTextFromPDF(File pdfFile) {
|
|
|
|
+ PDDocument document = null;
|
|
|
|
+
|
|
|
|
+ // 方式二:
|
|
|
|
+ try {
|
|
|
|
+ document = PDDocument.load(pdfFile);
|
|
|
|
+
|
|
|
|
+ // 获取页码
|
|
|
|
+ int pages = document.getNumberOfPages();
|
|
|
|
+
|
|
|
|
+ System.out.println(pages);
|
|
|
|
+
|
|
|
|
+ // 读文本内容
|
|
|
|
+ PDFTextStripper stripper = new PDFTextStripper();
|
|
|
|
+ // 设置按顺序输出
|
|
|
|
+ stripper.setSortByPosition(true);
|
|
|
|
+ stripper.setStartPage(1);
|
|
|
|
+ stripper.setEndPage(pages);
|
|
|
|
+ String content = stripper.getText(document);
|
|
|
|
+ // System.out.println(content);
|
|
|
|
+ outputData(resultFile, content);
|
|
|
|
+ } catch (InvalidPasswordException e) {
|
|
|
|
+ System.out.println(121);
|
|
|
|
+ } catch (Exception e) {
|
|
|
|
+ System.out.println(123331);
|
|
|
|
|
|
|
|
+ }
|
|
}
|
|
}
|
|
|
|
|
|
}
|
|
}
|