| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576 |
- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2023/12/09 14:57:36
- @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
- @Desc : entry point
- recursively read all files in a directory, find the *.pdf files, then convert x.pdf to x.html or x.docx
- '''
- import os, sys, re, shutil, argparse
def convert_pdf_to_html():
    """Convert every ``*.pdf`` under the current directory to HTML.

    The working directory is walked recursively and each PDF is handed to
    the external ``pdf2htmlEX`` program. Afterwards every ``*.html`` file
    found in the tree (outside the output directory) is moved into
    ``<cwd>/htmls``, which is created when missing.

    Failures are reported on stdout and never raised, so a single bad file
    or a missing ``pdf2htmlEX`` binary does not abort the batch.
    """
    import subprocess  # local import: only the converters need it

    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'htmls')
    os.makedirs(output_dir, exist_ok=True)

    for root, dirs, files in os.walk(current_dir):
        # Prune the output directory so earlier results are not revisited.
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]
        for file in files:
            if not file.endswith('.pdf'):
                continue
            # Argument list instead of a shell string: names with spaces or
            # shell metacharacters are passed through verbatim. The full
            # path is used so PDFs in subdirectories are actually found.
            # NOTE(review): --page-filename appears to name per-page files
            # only when --split-pages is set; the main output name is
            # chosen by pdf2htmlEX itself -- confirm the intended option.
            command = [
                'pdf2htmlEX',
                '--zoom', '1.3',
                '--process-outline', '0',
                '--page-filename', '%s.html' % file,
                os.path.join(root, file),
            ]
            try:
                result = subprocess.run(command)
            except OSError as e:
                # Raised when the pdf2htmlEX executable cannot be started.
                print(f'convert failed: {e}')
                continue
            if result.returncode != 0:
                print(f'convert failed: {file} (exit code {result.returncode})')
                continue
            print('convert %s to %s.html' % (file, file))

    # move all .html to htmls directory
    for root, dirs, files in os.walk(current_dir):
        # Files already inside htmls would only trigger "already exists".
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]
        for file in files:
            if not file.endswith('.html'):
                continue
            try:
                shutil.move(os.path.join(root, file), output_dir)
            except (shutil.Error, OSError) as e:
                print(f'move failed: {e}')
def convert_pdf_to_docx():
    """Convert the ``*.pdf`` files in the current directory to DOCX.

    Only the top level of the working directory is scanned (hidden files
    are skipped, as a shell ``*.pdf`` glob would). All matches are passed
    to one headless ``libreoffice`` invocation that writes into
    ``<cwd>/docs``, which is created when missing.

    A PDF is reported as converted when a ``.docx`` with the same stem is
    present in ``docs`` after the run; otherwise a failure line is printed.
    Nothing is raised if ``libreoffice`` cannot be started.
    """
    import subprocess  # local import: only the converters need it

    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'docs')
    os.makedirs(output_dir, exist_ok=True)

    pdf_names = sorted(
        name for name in os.listdir(current_dir)
        if name.endswith('.pdf')
        and not name.startswith('.')
        and os.path.isfile(os.path.join(current_dir, name))
    )
    if not pdf_names:
        # Without this guard the literal pattern "*.pdf" reached LibreOffice.
        print('no pdf files found')
        return

    # Use LibreOffice to convert PDF to DOCX. An argument list keeps paths
    # containing spaces intact; absolute input paths cannot be mistaken for
    # command-line options.
    # NOTE(review): PDF import into Writer may additionally need
    # --infilter="writer_pdf_import" -- confirm against the installed version.
    command = ['libreoffice', '--headless', '--convert-to', 'docx',
               '--outdir', output_dir]
    command.extend(os.path.join(current_dir, name) for name in pdf_names)
    try:
        subprocess.run(command)
    except OSError as e:
        # Raised when the libreoffice executable cannot be started.
        print(f'convert failed: {e}')
        return

    # Report per input file instead of listing whatever happens to be in
    # the output directory (which included stale results of earlier runs).
    for name in pdf_names:
        docx_name = os.path.splitext(name)[0] + '.docx'
        if os.path.isfile(os.path.join(output_dir, docx_name)):
            print(f'convert {name} to {docx_name}')
        else:
            print(f'convert failed: {name}')
def convert(mode='html'):
    """Run the converter selected by ``mode``.

    Args:
        mode (str): Target format, compared case-insensitively against
            'html' and 'docx'. Any other value prints a usage hint and
            performs no conversion.
    """
    if mode.lower() == 'html':
        convert_pdf_to_html()
        return
    if mode.lower() == 'docx':
        convert_pdf_to_docx()
        return
    # Neither supported format matched: tell the caller what is accepted.
    print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
if __name__ == '__main__':
    # Command-line entry point: read --mode and run the matching converter.
    cli = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
    cli.add_argument(
        '--mode',
        type=str,
        default='html',
        choices=['html', 'docx'],
        help='Conversion mode: html (default) or docx',
    )
    options = cli.parse_args()
    convert(options.mode)
|