liuyuqi-dellpc 7 years ago
parent
commit
92bebdb4b0
3 changed files with 123 additions and 114 deletions
  1. 8 113
      java/src/me/yoqi/pdf/Main.java
  2. 111 0
      java/src/me/yoqi/pdf/PDFOperation.java
  3. 4 1
      output/说明.txt

+ 8 - 113
java/src/me/yoqi/pdf/Main.java

@@ -1,15 +1,5 @@
 package me.yoqi.pdf;
 
-import java.io.File;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.apache.pdfbox.pdmodel.PDDocument;
-import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
-import org.apache.pdfbox.text.PDFTextStripper;
-
 /**
  * 批量提取指定文件夹中所有pdf文件为txt格式。 并按照每句话分词 随机取出100句话。
  * 
@@ -17,112 +7,17 @@ import org.apache.pdfbox.text.PDFTextStripper;
  *
  */
 public class Main {
-	// 项目目录
-	private String projectPath;
-	private String resultFile;
-	private List<String> suffixList = new ArrayList<String>();
 
 	public static void main(String[] args) {
-		Main m = new Main();
-		m.init();
-		m.bathGetText();
+		PDFOperation pdf = new PDFOperation();
+		pdf.init();
+		pdf.bathGetText();
 	}
 
-	/**
-	 * 初始化参数
-	 */
-	public void init() {
-		projectPath = "E:\\data\\workspace\\PDFOperation\\data";
-		resultFile="E:\\data\\workspace\\PDFOperation\\output\\result.txt";
-		suffixList.add(".pdf");// 增加后缀
-	}
-
-	// 保存的结果,输出
-	public void outputData(String fileName, String content) {
-	    try {
-            //打开一个写文件器,构造函数中的第二个参数true表示以追加形式写文件
-            FileWriter writer = new FileWriter(fileName, true);
-            writer.write(content);
-            writer.close();
-        } catch (IOException e) {
-            e.printStackTrace();
-        }
-	}
-
-	public void bathGetText() {
-		String filedir = this.projectPath;
-		if (null == filedir || "".equals(filedir.trim())) {
-			System.out.println("filedir 目录不对!");
-			return;
-		}
-		filedir = filedir.trim();
-		if (null == suffixList || suffixList.size() <= 0) {
-			System.out.println("suffixList 没有要匹配的后缀!");
-			return;
-		}
-		File f = new File(filedir);
-		if (f.isDirectory()) {
-			handleDirectory(f);
-		} else {
-			System.out.println("filedir 必须为目录");
-		}
-	}
-
-	private void handleDirectory(File filedir) {
-		// 目录
-		File[] files = filedir.listFiles();
-		for (File subFile : files) {
-			if (subFile.isDirectory()) {
-				handleDirectory(subFile);
-			} else {
-				// 文件
-				for (String suffix : suffixList) {
-					if (subFile.getName().endsWith(suffix)) {
-						System.out.println(subFile.getName());
-						getTextFromPDF(subFile);
-					}
-				}
-			}
-		}
-	}
-
-	/**
-	 * 处理单个pdf为字符串
-	 * 
-	 * @param pdfFilePath
-	 *            pdf文件路径
-	 */
-	public void getTextFromPDF(File pdfFile) {
-		PDDocument document = null;
-
-		// 方式二:
-		try {
-			document = PDDocument.load(pdfFile);
-
-			// 获取页码
-			int pages = document.getNumberOfPages();
-
-			System.out.println(pages);
-
-			// 读文本内容
-			PDFTextStripper stripper = new PDFTextStripper();
-			// 设置按顺序输出
-			stripper.setSortByPosition(true);
-			stripper.setStartPage(1);
-			stripper.setEndPage(pages);
-			String content = stripper.getText(document);
-			// System.out.println(content);
-			outputData(resultFile, content);
-		} catch (InvalidPasswordException e) {
-			System.out.println(121);
-		} catch (Exception e) {
-			System.out.println(123331);
-
-		}
-	}
-	private void stringWithOutChinese(){
-		StringForChinese chinese=new  StringForChinese();
-		chinese.subStrWithOutChinese(str);
-	}
+	
+//	private void stringWithOutChinese(){
+//		StringForChinese chinese=new  StringForChinese();
+//		chinese.subStrWithOutChinese(str);
+//	}
 
 }

+ 111 - 0
java/src/me/yoqi/pdf/PDFOperation.java

@@ -0,0 +1,111 @@
+package me.yoqi.pdf;
+
+import java.io.File;
+import java.io.FileWriter;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.pdfbox.pdmodel.PDDocument;
+import org.apache.pdfbox.pdmodel.encryption.InvalidPasswordException;
+import org.apache.pdfbox.text.PDFTextStripper;
+
+public class PDFOperation {
+	// Project directory: root folder that bathGetText() scans for PDFs
+	private String projectPath;
+	private String resultFile;
+	private List<String> suffixList = new ArrayList<String>();
+
+	/**
+	 * Initializes the parameters: input directory, result file and the suffix list.
+	 */
+	public void init() {
+		projectPath = "C:\\Users\\dell\\Downloads\\新建文件夹\\基本法";
+		resultFile="E:\\data\\workspace\\PDFOperation\\output\\基本法.txt";
+		suffixList.add(".pdf");// add a file suffix to match
+	}
+
+	// Output step: appends content to the file named fileName; an IOException is only printed
+	public void outputData(String fileName, String content) {
+	    try {
+            // Open a file writer; the constructor's second argument, true, means append mode
+            FileWriter writer = new FileWriter(fileName, true);
+            writer.write(content);
+            writer.close();
+        } catch (IOException e) {
+            e.printStackTrace();
+        }
+	}
+
+	public void bathGetText() {
+		String filedir = this.projectPath;
+		if (null == filedir || "".equals(filedir.trim())) {
+			System.out.println("filedir 目录不对!");
+			return;
+		}
+		filedir = filedir.trim();
+		if (null == suffixList || suffixList.size() <= 0) {
+			System.out.println("suffixList 没有要匹配的后缀!");
+			return;
+		}
+		File f = new File(filedir);
+		if (f.isDirectory()) {
+			handleDirectory(f);
+		} else {
+			System.out.println("filedir 必须为目录");
+		}
+	}
+
+	private void handleDirectory(File filedir) {
+		// Directory listing. NOTE(review): listFiles() can return null, which the loop below does not guard against
+		File[] files = filedir.listFiles();
+		for (File subFile : files) {
+			if (subFile.isDirectory()) {
+				handleDirectory(subFile);
+			} else {
+				// Plain file: compare its name against each configured suffix
+				for (String suffix : suffixList) {
+					if (subFile.getName().endsWith(suffix)) {
+						System.out.println(subFile.getName());
+						getTextFromPDF(subFile);
+					}
+				}
+			}
+		}
+	}
+
+	/**
+	 * Extracts the text of a single PDF and appends it to the result file.
+	 * 
+	 * @param pdfFile
+	 *            the PDF file to read
+	 */
+	public void getTextFromPDF(File pdfFile) {
+		PDDocument document = null;
+
+		// Approach 2: load from the File. NOTE(review): document is never closed in this method
+		try {
+			document = PDDocument.load(pdfFile);
+
+			// Get the page count
+			int pages = document.getNumberOfPages();
+
+			System.out.println(pages);
+
+			// Read the text content
+			PDFTextStripper stripper = new PDFTextStripper();
+			// Emit the text sorted by position
+			stripper.setSortByPosition(true);
+			stripper.setStartPage(1);
+			stripper.setEndPage(pages);
+			String content = stripper.getText(document);
+			// System.out.println(content);
+			outputData(resultFile, content);
+		} catch (InvalidPasswordException e) {
+			System.out.println(121);
+		} catch (Exception e) {
+			System.out.println(123331);
+
+		}
+	}
+}

+ 4 - 1
output/说明.txt

@@ -1,2 +1,5 @@
 输出文件夹,包含文件:
- 1、
+	output\429-499(69).txt
+	output\法例.txt
+	output\国际条约.txt
+	output\基本法.txt