Browse Source

Automatic Commit By liuyuqi

liuyuqi-dellpc 1 year ago
commit
089d850cff
6 changed files with 109 additions and 0 deletions
  1. 25 0
      .dockerignore
  2. 14 0
      Dockerfile
  3. 12 0
      README.md
  4. 10 0
      docker-compose.debug.yml
  5. 10 0
      docker-compose.yml
  6. 38 0
      main.py

+ 25 - 0
.dockerignore

@@ -0,0 +1,25 @@
+**/.classpath
+**/.dockerignore
+**/.env
+**/.git
+**/.gitignore
+**/.project
+**/.settings
+**/.toolstarget
+**/.vs
+**/.vscode
+**/*.*proj.user
+**/*.dbmdl
+**/*.jfm
+**/bin
+**/charts
+**/docker-compose*
+**/compose*
+**/Dockerfile*
+**/node_modules
+**/npm-debug.log
+**/obj
+**/secrets.dev.yaml
+**/values.dev.yaml
+LICENSE
+README.md

+ 14 - 0
Dockerfile

@@ -0,0 +1,14 @@
+# The pdf2htmlEX package installed below is the Ubuntu bionic (18.04) build,
+# so the base image stays on 18.04. The stage name is kept so that existing
+# `--target builder` invocations continue to work.
+FROM ubuntu:18.04 AS builder
+
+# Install everything in one layer so `apt-get update` is never cached apart
+# from the installs that depend on it, and so the downloaded .deb files and
+# the apt package lists are removed in the same layer that created them.
+# ca-certificates is listed explicitly because --no-install-recommends would
+# otherwise drop it and the HTTPS download from GitHub would fail.
+# NOTE(review): the downloads are not checksum-verified; add sha256 checks
+# once the expected digests are recorded.
+RUN apt-get update \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        ca-certificates \
+        python3 \
+        wget \
+    && wget -O /tmp/libjpeg-turbo8.deb http://archive.ubuntu.com/ubuntu/pool/main/libj/libjpeg-turbo/libjpeg-turbo8_2.0.3-0ubuntu1_amd64.deb \
+    && wget -O /tmp/pdf2htmlEX.deb https://github.com/pdf2htmlEX/pdf2htmlEX/releases/download/v0.18.8.rc1/pdf2htmlEX-0.18.8.rc1-master-20200630-Ubuntu-bionic-x86_64.deb \
+    && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \
+        /tmp/libjpeg-turbo8.deb \
+        /tmp/pdf2htmlEX.deb \
+    && rm -rf /tmp/libjpeg-turbo8.deb /tmp/pdf2htmlEX.deb /var/lib/apt/lists/*
+
+# Keep the script outside /app: /app is the data directory that callers
+# bind-mount (`-v <dir>:/app`), and a bind mount hides anything the image
+# placed there, which would make the entrypoint script disappear.
+COPY main.py /opt/pdf2html/main.py
+
+# /app is the working directory that main.py scans for PDF files.
+# The container runs as root so it can write into whatever host directory is
+# mounted here; pass `--user "$(id -u):$(id -g)"` to `docker run` to keep the
+# generated files owned by the calling user.
+WORKDIR /app
+VOLUME ["/app"]
+ENTRYPOINT ["python3", "/opt/pdf2html/main.py"]

+ 12 - 0
README.md

@@ -0,0 +1,12 @@
+# pdf2html
+
+pdf批量转为html
+
+```
+docker build --pull --rm -f "pdf2html/Dockerfile" -t pdf2html:latest "pdf2html" 
+
+alias pdf2html='docker run --rm -it -v "$(pwd)":/app pdf2html:latest'
+pdf2html
+```
+
+

+ 10 - 0
docker-compose.debug.yml

@@ -0,0 +1,10 @@
+version: '3.4'
+
+services:
+  pdf2html:
+    image: pdf2html
+    build:
+      context: .
+      dockerfile: ./Dockerfile
+    # The image is a one-shot batch converter: it listens on no port, so no
+    # port is published. It scans /app for PDF files, so the directory that
+    # holds the PDFs is bind-mounted there.
+    volumes:
+      - ./:/app

+ 10 - 0
docker-compose.yml

@@ -0,0 +1,10 @@
+version: '3.4'
+
+services:
+  pdf2html:
+    image: pdf2html
+    build:
+      context: .
+      dockerfile: ./Dockerfile
+    # The image is a one-shot batch converter: it listens on no port, so no
+    # port is published. It scans /app for PDF files, so the directory that
+    # holds the PDFs is bind-mounted there.
+    volumes:
+      - ./:/app

+ 38 - 0
main.py

@@ -0,0 +1,38 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/12/09 14:57:36
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   entry point
+
+Recursively scans the current directory for *.pdf files and converts each x.pdf to x.html.
+'''
+
+import os,sys,re,shutil
+
+def convert():
+    ''''''
+    current_dir = os.getcwd()
+    if not os.path.exists(os.path.join(current_dir,'htmls')):
+        os.mkdir(os.path.join(current_dir,'htmls'))
+    for root, dirs, files in os.walk(current_dir):
+        for file in files:
+            if file.endswith('.pdf'):
+                try:
+                    os.system('pdf2htmlEX --zoom 1.3 --process-outline 0 --page-filename %s.html %s'%(file,file))
+                    print('convert %s to %s.html'%(file,file))
+                except Exception as e:
+                    print(f'convert failed: {e}')
+    
+    # move all .html to htmls diretory
+    for root, dirs, files in os.walk(current_dir):
+        for file in files:
+            if file.endswith('.html'):
+                try:
+                    shutil.move(os.path.join(root,file),os.path.join(current_dir,'htmls'))
+                except Exception as e:
+                    print(f'move failed: {e}')
+
+if __name__=='__main__':
+    convert()