|
@@ -6,16 +6,19 @@
|
|
|
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
@Desc : entry point
|
|
@Desc : entry point
|
|
|
|
|
|
|
|
-recycle read all files in a directory, and find *.pdf files, then convert x.pdf to x.html
|
|
|
|
|
|
|
+recursively read all files in a directory, find the *.pdf files, then convert x.pdf to x.html or x.docx
|
|
|
'''
|
|
'''
|
|
|
|
|
|
|
|
-import os,sys,re,shutil
|
|
|
|
|
|
|
+import os, sys, re, shutil, argparse
|
|
|
|
|
|
|
|
-def convert():
|
|
|
|
|
- ''''''
|
|
|
|
|
|
|
+def convert_pdf_to_html():
|
|
|
|
|
+ """Convert PDF files to HTML format using pdf2htmlEX"""
|
|
|
current_dir = os.getcwd()
|
|
current_dir = os.getcwd()
|
|
|
- if not os.path.exists(os.path.join(current_dir,'htmls')):
|
|
|
|
|
- os.mkdir(os.path.join(current_dir,'htmls'))
|
|
|
|
|
|
|
+ output_dir = os.path.join(current_dir, 'htmls')
|
|
|
|
|
+
|
|
|
|
|
+ if not os.path.exists(output_dir):
|
|
|
|
|
+ os.mkdir(output_dir)
|
|
|
|
|
+
|
|
|
for root, dirs, files in os.walk(current_dir):
|
|
for root, dirs, files in os.walk(current_dir):
|
|
|
for file in files:
|
|
for file in files:
|
|
|
if file.endswith('.pdf'):
|
|
if file.endswith('.pdf'):
|
|
@@ -24,15 +27,50 @@ def convert():
|
|
|
print('convert %s to %s.html'%(file,file))
|
|
print('convert %s to %s.html'%(file,file))
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f'convert failed: {e}')
|
|
print(f'convert failed: {e}')
|
|
|
-
|
|
|
|
|
- # move all .html to htmls diretory
|
|
|
|
|
|
|
+
|
|
|
|
|
+ # move all .html to htmls directory
|
|
|
for root, dirs, files in os.walk(current_dir):
|
|
for root, dirs, files in os.walk(current_dir):
|
|
|
for file in files:
|
|
for file in files:
|
|
|
if file.endswith('.html'):
|
|
if file.endswith('.html'):
|
|
|
try:
|
|
try:
|
|
|
- shutil.move(os.path.join(root,file),os.path.join(current_dir,'htmls'))
|
|
|
|
|
|
|
+ shutil.move(os.path.join(root,file), output_dir)
|
|
|
except Exception as e:
|
|
except Exception as e:
|
|
|
print(f'move failed: {e}')
|
|
print(f'move failed: {e}')
|
|
|
|
|
|
|
|
|
|
def convert_pdf_to_docx():
    """Convert the PDF files in the current directory to DOCX with LibreOffice.

    Only ``*.pdf`` files located directly in the current working directory
    are converted (sub-directories are not searched). The results are
    written to a ``docs`` sub-directory, which is created when missing.

    Failures are reported on stdout instead of being raised, so a missing
    ``libreoffice`` executable or a non-zero exit status does not crash
    the caller.
    """
    # Local imports: these modules are only needed by this function.
    import glob
    import subprocess

    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'docs')
    os.makedirs(output_dir, exist_ok=True)

    # Expand the pattern ourselves so that no shell is involved and an
    # unmatched literal "*.pdf" is never handed to LibreOffice.
    pdf_files = sorted(glob.glob(os.path.join(current_dir, '*.pdf')))
    if not pdf_files:
        print('no pdf files found')
        return

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    command = [
        'libreoffice', '--headless',
        '--convert-to', 'docx',
        '--outdir', output_dir,
    ] + pdf_files
    try:
        result = subprocess.run(command, check=False)
    except OSError as e:
        # Typically raised when the libreoffice executable is not installed.
        print(f'convert failed: {e}')
        return
    if result.returncode != 0:
        print(f'convert failed: libreoffice exited with code {result.returncode}')

    # Report only the inputs of this run whose output file exists, rather
    # than every .docx that happens to be in the output directory.
    for pdf_path in pdf_files:
        pdf_name = os.path.basename(pdf_path)
        docx_name = os.path.splitext(pdf_name)[0] + '.docx'
        if os.path.exists(os.path.join(output_dir, docx_name)):
            print(f'convert {pdf_name} to {docx_name}')
|
|
|
|
|
+
|
|
|
|
|
def convert(mode='html'):
    """Dispatch to the converter selected by ``mode``.

    Args:
        mode (str): Target format, matched case-insensitively against
            'html' and 'docx'. Any other value only prints a hint.
    """
    selected = mode.lower()

    # Guard clause: reject anything that is not a known target format.
    if selected not in ('html', 'docx'):
        print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
        return

    if selected == 'html':
        convert_pdf_to_html()
    else:
        convert_pdf_to_docx()
|
|
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Command-line interface: a single optional --mode switch selects the
    # output format and is passed straight to convert().
    cli = argparse.ArgumentParser(
        description='Convert PDF files to HTML or DOCX',
    )
    cli.add_argument(
        '--mode',
        type=str,
        default='html',
        choices=['html', 'docx'],
        help='Conversion mode: html (default) or docx',
    )
    options = cli.parse_args()
    convert(options.mode)
|