|
@@ -1,15 +1,5 @@
|
|
|
package me.yoqi.pdf;
|
|
|
|
|
|
-import java.io.File;
|
|
|
-import java.io.FileWriter;
|
|
|
-import java.io.IOException;
|
|
|
-import java.util.ArrayList;
|
|
|
-import java.util.List;
|
|
|
-
|
|
|
-import org.apache.pdfbox.pdmodel.PDDocument;
|
|
|
-import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
|
|
|
-import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
-
|
|
|
/**
|
|
|
* 批量提取指定文件夹中所有pdf文件为txt格式。 并按照每句话分词 随机取出100句话。
|
|
|
*
|
|
@@ -17,112 +7,17 @@ import org.apache.pdfbox.text.PDFTextStripper;
|
|
|
*
|
|
|
*/
|
|
|
public class Main {
|
|
|
- // 项目目录
|
|
|
- private String projectPath;
|
|
|
- private String resultFile;
|
|
|
- private List<String> suffixList = new ArrayList<String>();
|
|
|
|
|
|
public static void main(String[] args) {
|
|
|
- Main m = new Main();
|
|
|
- m.init();
|
|
|
- m.bathGetText();
|
|
|
+ PDFOperation pdf = new PDFOperation();
|
|
|
+ pdf.init();
|
|
|
+ pdf.bathGetText();
|
|
|
}
|
|
|
|
|
|
- /**
|
|
|
- * 初始化参数
|
|
|
- */
|
|
|
- public void init() {
|
|
|
- projectPath = "E:\\data\\workspace\\PDFOperation\\data";
|
|
|
- resultFile="E:\\data\\workspace\\PDFOperation\\output\\result.txt";
|
|
|
- suffixList.add(".pdf");// 增加后缀
|
|
|
- }
|
|
|
-
|
|
|
- // 保存的结果,输出
|
|
|
- public void outputData(String fileName, String content) {
|
|
|
- try {
|
|
|
- //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
|
|
|
- FileWriter writer = new FileWriter(fileName, true);
|
|
|
- writer.write(content);
|
|
|
- writer.close();
|
|
|
- } catch (IOException e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- public void bathGetText() {
|
|
|
- String filedir = this.projectPath;
|
|
|
- if (null == filedir || "".equals(filedir.trim())) {
|
|
|
- System.out.println("filedir 目录不对!");
|
|
|
- return;
|
|
|
- }
|
|
|
- filedir = filedir.trim();
|
|
|
- if (null == suffixList || suffixList.size() <= 0) {
|
|
|
- System.out.println("suffixList 没有要匹配的后缀!");
|
|
|
- return;
|
|
|
- }
|
|
|
- File f = new File(filedir);
|
|
|
- if (f.isDirectory()) {
|
|
|
- handleDirectory(f);
|
|
|
- } else {
|
|
|
- System.out.println("filedir 必须为目录");
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- private void handleDirectory(File filedir) {
|
|
|
- // 目录
|
|
|
- File[] files = filedir.listFiles();
|
|
|
- for (File subFile : files) {
|
|
|
- if (subFile.isDirectory()) {
|
|
|
- handleDirectory(subFile);
|
|
|
- } else {
|
|
|
- // 文件
|
|
|
- for (String suffix : suffixList) {
|
|
|
- if (subFile.getName().endsWith(suffix)) {
|
|
|
- System.out.println(subFile.getName());
|
|
|
- getTextFromPDF(subFile);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- /**
|
|
|
- * 处理单个pdf为字符串
|
|
|
- *
|
|
|
- * @param pdfFilePath
|
|
|
- * pdf文件路径
|
|
|
- */
|
|
|
- public void getTextFromPDF(File pdfFile) {
|
|
|
- PDDocument document = null;
|
|
|
-
|
|
|
- // 方式二:
|
|
|
- try {
|
|
|
- document = PDDocument.load(pdfFile);
|
|
|
-
|
|
|
- // 获取页码
|
|
|
- int pages = document.getNumberOfPages();
|
|
|
-
|
|
|
- System.out.println(pages);
|
|
|
-
|
|
|
- // 读文本内容
|
|
|
- PDFTextStripper stripper = new PDFTextStripper();
|
|
|
- // 设置按顺序输出
|
|
|
- stripper.setSortByPosition(true);
|
|
|
- stripper.setStartPage(1);
|
|
|
- stripper.setEndPage(pages);
|
|
|
- String content = stripper.getText(document);
|
|
|
- // System.out.println(content);
|
|
|
- outputData(resultFile, content);
|
|
|
- } catch (InvalidPasswordException e) {
|
|
|
- System.out.println(121);
|
|
|
- } catch (Exception e) {
|
|
|
- System.out.println(123331);
|
|
|
-
|
|
|
- }
|
|
|
- }
|
|
|
- private void stringWithOutChinese(){
|
|
|
- StringForChinese chinese=new StringForChinese();
|
|
|
- chinese.subStrWithOutChinese(str);
|
|
|
- }
|
|
|
+
|
|
|
+// private void stringWithOutChinese(){
|
|
|
+// StringForChinese chinese=new StringForChinese();
|
|
|
+// chinese.subStrWithOutChinese(str);
|
|
|
+// }
|
|
|
|
|
|
}
|