Browse Source

完成 pdf批量转换为txt
pdf存放data文件夹
txt存放output文件夹

liuyuqi-dellpc 7 years ago
parent
commit
ca15d9768c
3 changed files with 215 additions and 1 deletions
  1. 34 0
      pom.xml
  2. 115 1
      src/me/yoqi/pdf/Main.java
  3. 66 0
      src/me/yoqi/pdf/Test.java

+ 34 - 0
pom.xml

@@ -0,0 +1,34 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+	xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+	<modelVersion>4.0.0</modelVersion>
+	<groupId>PDFOperation</groupId>
+	<artifactId>PDFOperation</artifactId>
+	<version>0.0.1-SNAPSHOT</version>
+	<build>
+		<sourceDirectory>src</sourceDirectory>
+		<plugins>
+			<plugin>
+				<artifactId>maven-compiler-plugin</artifactId>
+				<version>3.5.1</version>
+				<configuration>
+					<source>1.7</source>
+					<target>1.7</target>
+				</configuration>
+			</plugin>
+		</plugins>
+	</build>
+	<dependencies>
+		<dependency>
+			<groupId>org.apache.pdfbox</groupId>
+			<artifactId>pdfbox</artifactId>
+			<version>2.0.6</version>
+		</dependency>
+		<!-- https://mvnrepository.com/artifact/org.bouncycastle/bcprov-jdk15on -->
+		<dependency>
+			<groupId>org.bouncycastle</groupId>
+			<artifactId>bcprov-jdk15on</artifactId>
+			<version>1.54</version>
+		</dependency>
+
+	</dependencies>
+</project>

+ 115 - 1
src/me/yoqi/pdf/Main.java

@@ -1,10 +1,124 @@
 package me.yoqi.pdf;
 package me.yoqi.pdf;
 
 
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.text.PDFTextStripper;
+
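+/**
+ * Batch-extracts the text of every PDF file found under a configured folder
+ * and appends it to a single txt result file. The original description also
+ * mentions splitting the text into sentences and randomly picking 100 of
+ * them; that step is not part of the code in this class.
+ * 
+ * @author liuyuqi
+ *
+ */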
 public class Main {
 public class Main {
+	// Directory that is scanned for input files
+	private String projectPath;
+	private String resultFile;
+	private List<String> suffixList = new ArrayList<String>();
 
 
 	public static void main(String[] args) {
 	public static void main(String[] args) {
-		// TODO auto-generated method stub
+		Main m = new Main();
+		m.init(); // set the hard-coded input folder, result file and suffix list
+		m.bathGetText(); // walk the input folder and extract text from each match
+	}
+
+	/**
+	 * Populates the hard-coded run configuration: the folder that is scanned,
+	 * the file that collects the extracted text, and the file-name suffix to
+	 * match.
+	 */
+	public void init() {
+		// Only file names ending with this suffix are processed.
+		this.suffixList.add(".pdf");
+		this.resultFile = "E:\\data\\workspace\\PDFOperation\\output\\result.txt";
+		this.projectPath = "E:\\data\\workspace\\PDFOperation\\data";
+	}
+
+	/**
+	 * Appends {@code content} to the file named {@code fileName}.
+	 * <p>
+	 * An {@link IOException} is not propagated; its stack trace is printed and
+	 * the method returns normally.
+	 * 
+	 * @param fileName
+	 *            path of the file to append to
+	 * @param content
+	 *            text to write
+	 */
+	public void outputData(String fileName, String content) {
+		// The second constructor argument (true) opens the file in append mode.
+		// try-with-resources closes the writer even when write() throws, which
+		// the previous explicit close() call did not.
+		try (FileWriter writer = new FileWriter(fileName, true)) {
+			writer.write(content);
+		} catch (IOException e) {
+			e.printStackTrace();
+		}
+	}
+
+	public void bathGetText() {
+		String filedir = this.projectPath;
+		if (null == filedir || "".equals(filedir.trim())) {
+			System.out.println("filedir 目录不对!");
+			return;
+		}
+		filedir = filedir.trim();
+		if (null == suffixList || suffixList.size() <= 0) {
+			System.out.println("suffixList 没有要匹配的后缀!");
+			return;
+		}
+		File f = new File(filedir);
+		if (f.isDirectory()) {
+			handleDirectory(f);
+		} else {
+			System.out.println("filedir 必须为目录");
+		}
+	}
+
+	/**
+	 * Recursively walks {@code filedir} and runs text extraction on every file
+	 * whose name ends with one of the configured suffixes.
+	 * 
+	 * @param filedir
+	 *            directory to scan
+	 */
+	private void handleDirectory(File filedir) {
+		File[] files = filedir.listFiles();
+		// listFiles() returns null when the path cannot be listed (not a
+		// directory, or an I/O error); report it instead of failing with a
+		// NullPointerException in the loop below.
+		if (files == null) {
+			System.out.println("filedir 无法读取:" + filedir.getAbsolutePath());
+			return;
+		}
+		for (File subFile : files) {
+			if (subFile.isDirectory()) {
+				handleDirectory(subFile);
+				continue;
+			}
+			// Regular file: process it once, on the first suffix that matches.
+			for (String suffix : suffixList) {
+				if (subFile.getName().endsWith(suffix)) {
+					System.out.println(subFile.getName());
+					getTextFromPDF(subFile);
+					// Stop here so a file matching several suffixes is not
+					// extracted (and appended to the result) more than once.
+					break;
+				}
+			}
+		}
+	}
+
+	/**
+	 * Extracts the text of a single PDF file and appends it to the result file.
+	 * <p>
+	 * Failures are reported on standard output and do not stop the caller, so
+	 * one unreadable file does not abort the rest of the batch.
+	 * 
+	 * @param pdfFile
+	 *            the PDF file to read
+	 */
+	public void getTextFromPDF(File pdfFile) {
+		// try-with-resources closes the document on every path; previously it
+		// was never closed, leaking one open document per processed file.
+		try (PDDocument document = PDDocument.load(pdfFile)) {
+			// Number of pages in the document
+			int pages = document.getNumberOfPages();
+
+			System.out.println(pages);
+
+			// Read the text content
+			PDFTextStripper stripper = new PDFTextStripper();
+			// Emit the text ordered by its position on the page
+			stripper.setSortByPosition(true);
+			stripper.setStartPage(1);
+			stripper.setEndPage(pages);
+			String content = stripper.getText(document);
+			outputData(resultFile, content);
+		} catch (InvalidPasswordException e) {
+			// Presumably raised for password-protected files that cannot be
+			// opened without a password; confirm against the PDFBox docs.
+			System.out.println("PDF 已加密,无法读取:" + pdfFile.getAbsolutePath());
+		} catch (Exception e) {
+			// Deliberately broad: keep the batch going, but say which file
+			// failed and why instead of printing an opaque number.
+			System.out.println("PDF 处理失败:" + pdfFile.getAbsolutePath() + " (" + e + ")");
+		}
 	}
 	}
 
 
 }
 }

+ 66 - 0
src/me/yoqi/pdf/Test.java

@@ -0,0 +1,66 @@
+package me.yoqi.pdf;
+
+import java.io.File;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Recursively deletes the files under a folder whose names end with one of a
+ * given list of suffixes.
+ */
+public class Test {
+	public static void main(String[] args) {
+		String filedir = "F:\\某个目录下";
+		List<String> suffixList = new ArrayList<String>();
+		suffixList.add(".db");
+		// Further suffixes can be added here, e.g. ".tmp", ".html_zh", "_zh.js".
+		Test sweepUnusedFiles = new Test();
+		sweepUnusedFiles.startDeleteFixedFiles(filedir, suffixList);
+		System.out.println("执行完成!");
+	}
+
+	/**
+	 * Validates the arguments and, if they are usable, deletes every matching
+	 * file below {@code filedir}. Each failed check prints a message and
+	 * returns without deleting anything.
+	 * 
+	 * @param filedir
+	 *            path of the folder to sweep; must denote a directory
+	 * @param suffixList
+	 *            file-name suffixes selecting the files to delete
+	 */
+	public void startDeleteFixedFiles(String filedir, List<String> suffixList) {
+		if (null == filedir || "".equals(filedir.trim())) {
+			System.out.println("filedir 目录不对!");
+			return;
+		}
+		filedir = filedir.trim();
+		if (null == suffixList || suffixList.size() <= 0) {
+			System.out.println("suffixList 没有要匹配的后缀!");
+			return;
+		}
+		File f = new File(filedir);
+		if (f.isDirectory()) {
+			handleFile(f, suffixList);
+		} else {
+			// A single file is rejected rather than deleted.
+			System.out.println("filedir 必须为目录");
+		}
+	}
+
+	/**
+	 * Recursively walks {@code filedir} and deletes each file whose name ends
+	 * with one of {@code suffixList}, printing the outcome per file.
+	 * 
+	 * @param filedir
+	 *            directory to scan
+	 * @param suffixList
+	 *            file-name suffixes selecting the files to delete
+	 */
+	private void handleFile(File filedir, List<String> suffixList) {
+		File[] files = filedir.listFiles();
+		// listFiles() returns null when the path cannot be listed (not a
+		// directory, or an I/O error); report it instead of failing with a
+		// NullPointerException in the loop below.
+		if (files == null) {
+			System.out.println("filedir 无法读取:" + filedir.getAbsolutePath());
+			return;
+		}
+		for (File subFile : files) {
+			if (subFile.isDirectory()) {
+				handleFile(subFile, suffixList);
+				continue;
+			}
+			// Regular file: delete it on the first suffix that matches.
+			for (String suffix : suffixList) {
+				if (!subFile.getName().endsWith(suffix)) {
+					continue;
+				}
+				// File.delete() signals failure through its return value, not
+				// an exception, so the result has to be checked explicitly.
+				boolean deleted;
+				try {
+					deleted = subFile.delete();
+				} catch (SecurityException e) {
+					deleted = false;
+				}
+				// getAbsolutePath() already ends with the file name, so the
+				// name is not appended a second time.
+				if (deleted) {
+					System.out.println("已删除文件:" + subFile.getAbsolutePath());
+				} else {
+					System.out.println("文件删除失败:" + subFile.getAbsolutePath());
+				}
+				// One attempt per file, even if several suffixes match.
+				break;
+			}
+		}
+	}
+}