2 months ago · ed5d5eb382
--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,8 @@ RUN wget http://archive.ubuntu.com/ubuntu/pool/main/libj/libjpeg-turbo/libjpeg-t
 
															 RUN apt install -y ./libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb
														
 
															 RUN wget https://github.com/pdf2htmlEX/pdf2htmlEX/releases/download/v0.18.8.rc1/pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
														
 
															 RUN apt install -y ./pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
														
 
															-RUN apt install -y python3
														
 
															+# Refresh the index and install in a single layer; use apt-get (stable CLI) and drop the apt lists so they do not bloat the image.
															+RUN apt-get update && \
															+    apt-get install -y python3 libreoffice && \
															+    rm -rf /var/lib/apt/lists/*
														
 
															 WORKDIR /app
														
 
															 COPY main.py .
														
--- a/README.md
+++ b/README.md
@@ -1,12 +1,60 @@
 
															 # pdf2html
														
 
															-pdf批量转为html
														
 
															+PDF 批量转换工具，支持转换为 HTML 或 Word 文档（.docx）
														
 
															+## 功能
														
 
															+
														
 
															+- **pdf2html**: 将 PDF 转换为 HTML 格式（使用 pdf2htmlEX）
														
 
															+- **pdf2docs**: 将 PDF 转换为 Word 文档（.docx），保留原始排版（使用 LibreOffice）
														
 
															+
														
 
															+## 使用方法
														
 
															+
														
 
															+### 构建镜像
														
 
															+
														
 
															+```bash
														
 
															+docker build --pull --rm -f "Dockerfile" -t pdf2html:latest .
														
 
															 ```
														
 
															-docker build --pull --rm -f "pdf2html/Dockerfile" -t pdf2html:latest "pdf2html" 
														
 
															+### 创建别名（可选）
														
 
															+
														
 
															+```bash
														
 
															 alias pdf2html='docker run --rm -it -v `pwd`:/app pdf2html:latest'
														
 
															-pdf2html
														
 
															 ```
														
 
															+### 转换为 HTML（默认）
														
 
															+
														
 
															+```bash
														
 
															+docker run --rm -it -v `pwd`:/app pdf2html:latest --mode html
														
 
															+# 或使用别名
														
 
															+pdf2html --mode html
														
 
															+```
														
 
															+
														
 
															+输出文件保存在 `htmls/` 目录
														
 
															+
														
 
															+### 转换为 Word 文档
														
 
															+
														
 
															+```bash
														
 
															+docker run --rm -it -v `pwd`:/app pdf2html:latest --mode docx
														
 
															+# 或使用别名
														
 
															+pdf2html --mode docx
														
 
															+```
														
 
															+
														
 
															+输出文件保存在 `docs/` 目录
														
 
															+
														
 
															+## 参数说明
														
 
															+
														
 
															+- `--mode`: 转换模式
														
 
															+  - `html` (默认): 转换为 HTML 格式
														
 
															+  - `docx`: 转换为 Word 文档格式
														
 
															+
														
 
															+## 输出目录
														
 
															+
														
 
															+- HTML 文件: `htmls/`
														
 
															+- Word 文档: `docs/`
														
 
															+
														
 
															+## 依赖
														
 
															+
														
 
															+- pdf2htmlEX: PDF 转 HTML
														
 
															+- LibreOffice: PDF 转 Word 文档
														
 
															+
														
--- a/docker-compose.debug.yml
+++ b/docker-compose.debug.yml
@@ -1,8 +1,8 @@
 
															-version: '3.4'
														
 
															+
														
 
															 services:
														
 
															   pdf2html:
														
 
															-    image: pdf2html
														
 
															+    image: jianboy/pdf2html:latest
														
 
															     build:
														
 
															       context: .
														
 
															       dockerfile: ./Dockerfile
														
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,10 +1,10 @@
 
															-version: '3.4'
														
 
															+
														
 
															 services:
														
 
															   pdf2html:
														
 
															-    image: pdf2html
														
 
															-    build:
														
 
															-      context: .
														
 
															-      dockerfile: ./Dockerfile
														
 
															+    image: jianboy/pdf2html:latest
														
 
															+    # build:
														
 
															+    #   context: .
														
 
															+    #   dockerfile: ./Dockerfile
														
 
															     ports:
														
 
															       - 3000:3000
														
--- a/main.py
+++ b/main.py
@@ -6,16 +6,19 @@
 
															 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
														
 
															 @Desc    :   enter point
														
 
															-recycle read all files in a directory, and find *.pdf files, then convert x.pdf to x.html
														
 
															+Find the *.pdf files under the current directory and convert each x.pdf to x.html or x.docx
														
 
															 '''
														
 
															-import os,sys,re,shutil
														
 
															+import os, sys, re, shutil, argparse
														
 
															-def convert():
														
 
															-    ''''''
														
 
															+def convert_pdf_to_html():
														
 
															+    """Convert PDF files to HTML format using pdf2htmlEX"""
														
 
															     current_dir = os.getcwd()
														
 
															-    if not os.path.exists(os.path.join(current_dir,'htmls')):
														
 
															-        os.mkdir(os.path.join(current_dir,'htmls'))
														
 
															+    output_dir = os.path.join(current_dir, 'htmls')
														
 
															+
														
 
															+    if not os.path.exists(output_dir):
														
 
															+        os.mkdir(output_dir)
														
 
															+
														
 
															     for root, dirs, files in os.walk(current_dir):
														
 
															         for file in files:
														
 
															             if file.endswith('.pdf'):
														
@@ -24,15 +27,50 @@ def convert():
 
															                     print('convert %s to %s.html'%(file,file))
														
 
															                 except Exception as e:
														
 
															                     print(f'convert failed: {e}')
														
 
															-    
														
 
															-    # move all .html to htmls diretory
														
 
															+
														
 
															+    # move all .html to htmls directory
														
 
															     for root, dirs, files in os.walk(current_dir):
														
 
															         for file in files:
														
 
															             if file.endswith('.html'):
														
 
															                 try:
														
 
															-                    shutil.move(os.path.join(root,file),os.path.join(current_dir,'htmls'))
														
 
															+                    shutil.move(os.path.join(root,file), output_dir)
														
 
															                 except Exception as e:
														
 
															                     print(f'move failed: {e}')
														
 
															+def convert_pdf_to_docx():
															+    """Convert the PDF files in the current directory to DOCX using LibreOffice.
															+
															+    Only ``*.pdf`` files directly inside the current working directory are
															+    picked up (sub-directories are not searched). The results are written to
															+    a ``docs`` directory created under the current working directory.
															+    """
															+    # Imported locally so this function does not depend on the file-level imports.
															+    import glob
															+    import subprocess
															+
															+    current_dir = os.getcwd()
															+    output_dir = os.path.join(current_dir, 'docs')
															+    os.makedirs(output_dir, exist_ok=True)
															+
															+    # glob.escape keeps characters such as '[' in the directory name literal.
															+    pdf_files = sorted(glob.glob(os.path.join(glob.escape(current_dir), '*.pdf')))
															+    if not pdf_files:
															+        print('convert failed: no pdf files found')
															+        return
															+
															+    # The argument list is handed to LibreOffice without a shell, so paths
															+    # containing spaces or shell metacharacters are passed through intact.
															+    # writer_pdf_import opens the PDF in Writer; by default LibreOffice loads
															+    # PDFs into Draw, which cannot export to DOCX.
															+    command = ['libreoffice', '--headless', '--infilter=writer_pdf_import',
															+               '--convert-to', 'docx', '--outdir', output_dir] + pdf_files
															+    try:
															+        subprocess.run(command)
															+    except OSError as e:
															+        print(f'convert failed: {e}')
															+        return
															+
															+    # Report per input file, based on whether its output actually exists.
															+    for pdf_file in pdf_files:
															+        name = os.path.basename(pdf_file)
															+        target = os.path.splitext(name)[0] + '.docx'
															+        if os.path.exists(os.path.join(output_dir, target)):
															+            print(f'convert {name} to {target}')
															+        else:
															+            print(f'convert failed: {name}')
														
 
															+
														
 
															+def convert(mode='html'):
															+    """Run the converter selected by ``mode``.
															+
															+    Args:
															+        mode (str): 'html' or 'docx'; the comparison ignores case.
															+            Any other value only prints a message.
															+    """
															+    handlers = {
															+        'html': convert_pdf_to_html,
															+        'docx': convert_pdf_to_docx,
															+    }
															+    handler = handlers.get(mode.lower())
															+    if handler is None:
															+        print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
															+        return
															+    handler()
														
 
															+
														
 
															 if __name__=='__main__':
															-    convert()
															+    # Command-line entry point: read --mode and hand it to convert().
															+    cli = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
															+    cli.add_argument(
															+        '--mode',
															+        type=str,
															+        default='html',
															+        choices=['html', 'docx'],
															+        help='Conversion mode: html (default) or docx',
															+    )
															+    convert(cli.parse_args().mode)