Browse Source

Add pdf2docs functionality using LibreOffice

- Add --mode parameter to support both html and docx conversion
- Implement convert_pdf_to_docx() using LibreOffice CLI for batch conversion
- Output DOCX files to 'docs/' directory
- Update Dockerfile to include LibreOffice
- Update README with usage instructions for both modes
liuyuqi-cnb 1 week ago
parent
commit
ed5d5eb382
5 changed files with 108 additions and 21 deletions
  1. 2 1
      Dockerfile
  2. 51 3
      README.md
  3. 2 2
      docker-compose.debug.yml
  4. 5 5
      docker-compose.yml
  5. 48 10
      main.py

+ 2 - 1
Dockerfile

@@ -6,7 +6,8 @@ RUN wget http://archive.ubuntu.com/ubuntu/pool/main/libj/libjpeg-turbo/libjpeg-t
 RUN apt install -y ./libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb
 RUN apt install -y ./libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb
 RUN wget https://github.com/pdf2htmlEX/pdf2htmlEX/releases/download/v0.18.8.rc1/pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
 RUN wget https://github.com/pdf2htmlEX/pdf2htmlEX/releases/download/v0.18.8.rc1/pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
 RUN apt install -y ./pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
 RUN apt install -y ./pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb
-RUN apt install -y python3
+RUN apt update && \
+    apt install -y python3 libreoffice
 
 
 WORKDIR /app
 WORKDIR /app
 COPY main.py .
 COPY main.py .

+ 51 - 3
README.md

@@ -1,12 +1,60 @@
 # pdf2html
 # pdf2html
 
 
-pdf批量转为html
+PDF 批量转换工具,支持转换为 HTML 或 Word 文档(.docx)
 
 
+## 功能
+
+- **pdf2html**: 将 PDF 转换为 HTML 格式(使用 pdf2htmlEX)
+- **pdf2docs**: 将 PDF 转换为 Word 文档(.docx),保留原始排版(使用 LibreOffice)
+
+## 使用方法
+
+### 构建镜像
+
+```bash
+docker build --pull --rm -f "Dockerfile" -t pdf2html:latest .
 ```
 ```
-docker build --pull --rm -f "pdf2html/Dockerfile" -t pdf2html:latest "pdf2html" 
 
 
+### 创建别名(可选)
+
+```bash
 alias pdf2html='docker run --rm -it -v `pwd`:/app pdf2html:latest'
 alias pdf2html='docker run --rm -it -v `pwd`:/app pdf2html:latest'
-pdf2html
 ```
 ```
 
 
+### 转换为 HTML(默认)
+
+```bash
+docker run --rm -it -v `pwd`:/app pdf2html:latest --mode html
+# 或使用别名
+pdf2html --mode html
+```
+
+输出文件保存在 `htmls/` 目录
+
+### 转换为 Word 文档
+
+```bash
+docker run --rm -it -v `pwd`:/app pdf2html:latest --mode docx
+# 或使用别名
+pdf2html --mode docx
+```
+
+输出文件保存在 `docs/` 目录
+
+## 参数说明
+
+- `--mode`: 转换模式
+  - `html` (默认): 转换为 HTML 格式
+  - `docx`: 转换为 Word 文档格式
+
+## 输出目录
+
+- HTML 文件: `htmls/`
+- Word 文档: `docs/`
+
+## 依赖
+
+- pdf2htmlEX: PDF 转 HTML
+- LibreOffice: PDF 转 Word 文档
+
 
 

+ 2 - 2
docker-compose.debug.yml

@@ -1,8 +1,8 @@
-version: '3.4'
+
 
 
 services:
 services:
   pdf2html:
   pdf2html:
-    image: pdf2html
+    image: jianboy/pdf2html:latest
     build:
     build:
       context: .
       context: .
       dockerfile: ./Dockerfile
       dockerfile: ./Dockerfile

+ 5 - 5
docker-compose.yml

@@ -1,10 +1,10 @@
-version: '3.4'
+
 
 
 services:
 services:
   pdf2html:
   pdf2html:
-    image: pdf2html
-    build:
-      context: .
-      dockerfile: ./Dockerfile
+    image: jianboy/pdf2html:latest
+    # build:
+    #   context: .
+    #   dockerfile: ./Dockerfile
     ports:
     ports:
       - 3000:3000
       - 3000:3000

+ 48 - 10
main.py

@@ -6,16 +6,19 @@
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
 @Desc    :   entry point
 @Desc    :   entry point
 
 
-recycle read all files in a directory, and find *.pdf files, then convert x.pdf to x.html
+recursively read all files in a directory, find *.pdf files, and convert x.pdf to x.html or x.docx
 '''
 '''
 
 
-import os,sys,re,shutil
+import os, sys, re, shutil, argparse
 
 
-def convert():
-    ''''''
+def convert_pdf_to_html():
+    """Convert PDF files to HTML format using pdf2htmlEX"""
     current_dir = os.getcwd()
     current_dir = os.getcwd()
-    if not os.path.exists(os.path.join(current_dir,'htmls')):
-        os.mkdir(os.path.join(current_dir,'htmls'))
+    output_dir = os.path.join(current_dir, 'htmls')
+
+    if not os.path.exists(output_dir):
+        os.mkdir(output_dir)
+
     for root, dirs, files in os.walk(current_dir):
     for root, dirs, files in os.walk(current_dir):
         for file in files:
         for file in files:
             if file.endswith('.pdf'):
             if file.endswith('.pdf'):
@@ -24,15 +27,50 @@ def convert():
                     print('convert %s to %s.html'%(file,file))
                     print('convert %s to %s.html'%(file,file))
                 except Exception as e:
                 except Exception as e:
                     print(f'convert failed: {e}')
                     print(f'convert failed: {e}')
-    
-    # move all .html to htmls diretory
+
+    # move all .html to htmls directory
     for root, dirs, files in os.walk(current_dir):
     for root, dirs, files in os.walk(current_dir):
         for file in files:
         for file in files:
             if file.endswith('.html'):
             if file.endswith('.html'):
                 try:
                 try:
-                    shutil.move(os.path.join(root,file),os.path.join(current_dir,'htmls'))
+                    shutil.move(os.path.join(root,file), output_dir)
                 except Exception as e:
                 except Exception as e:
                     print(f'move failed: {e}')
                     print(f'move failed: {e}')
 
 
+def convert_pdf_to_docx():
+    """Convert PDF files to DOCX format using LibreOffice"""
+    current_dir = os.getcwd()
+    output_dir = os.path.join(current_dir, 'docs')
+
+    if not os.path.exists(output_dir):
+        os.mkdir(output_dir)
+
+    # Use LibreOffice to convert PDF to DOCX
+    os.system('libreoffice --headless --convert-to "docx" *.pdf --outdir %s' % output_dir)
+
+    # List the converted files found in the output directory
+    for root, dirs, files in os.walk(output_dir):
+        for file in files:
+            if file.endswith('.docx'):
+                print(f'convert {file.replace(".docx", ".pdf")} to {file}')
+
+def convert(mode='html'):
+    """Main conversion function
+
+    Args:
+        mode (str): Conversion mode - 'html' or 'docx'
+    """
+    if mode.lower() == 'html':
+        convert_pdf_to_html()
+    elif mode.lower() == 'docx':
+        convert_pdf_to_docx()
+    else:
+        print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
+
 if __name__=='__main__':
 if __name__=='__main__':
-    convert()
+    parser = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
+    parser.add_argument('--mode', type=str, default='html', choices=['html', 'docx'],
+                        help='Conversion mode: html (default) or docx')
+
+    args = parser.parse_args()
+    convert(args.mode)