main.py 2.5 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2023/12/09 14:57:36
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : entry point
Recursively scan a directory for *.pdf files, then convert each x.pdf to x.html or x.docx
'''
import argparse
import glob
import os
import re
import shutil
import subprocess
import sys
  11. def convert_pdf_to_html():
  12. """Convert PDF files to HTML format using pdf2htmlEX"""
  13. current_dir = os.getcwd()
  14. output_dir = os.path.join(current_dir, 'htmls')
  15. if not os.path.exists(output_dir):
  16. os.mkdir(output_dir)
  17. for root, dirs, files in os.walk(current_dir):
  18. for file in files:
  19. if file.endswith('.pdf'):
  20. try:
  21. os.system('pdf2htmlEX --zoom 1.3 --process-outline 0 --page-filename %s.html %s'%(file,file))
  22. print('convert %s to %s.html'%(file,file))
  23. except Exception as e:
  24. print(f'convert failed: {e}')
  25. # move all .html to htmls directory
  26. for root, dirs, files in os.walk(current_dir):
  27. for file in files:
  28. if file.endswith('.html'):
  29. try:
  30. shutil.move(os.path.join(root,file), output_dir)
  31. except Exception as e:
  32. print(f'move failed: {e}')
  33. def convert_pdf_to_docx():
  34. """Convert PDF files to DOCX format using LibreOffice"""
  35. current_dir = os.getcwd()
  36. output_dir = os.path.join(current_dir, 'docs')
  37. if not os.path.exists(output_dir):
  38. os.mkdir(output_dir)
  39. # Use LibreOffice to convert PDF to DOCX
  40. os.system('libreoffice --headless --convert-to "docx" *.pdf --outdir %s' % output_dir)
  41. # Count converted files
  42. for root, dirs, files in os.walk(output_dir):
  43. for file in files:
  44. if file.endswith('.docx'):
  45. print(f'convert {file.replace(".docx", ".pdf")} to {file}')
  46. def convert(mode='html'):
  47. """Main conversion function
  48. Args:
  49. mode (str): Conversion mode - 'html' or 'docx'
  50. """
  51. if mode.lower() == 'html':
  52. convert_pdf_to_html()
  53. elif mode.lower() == 'docx':
  54. convert_pdf_to_docx()
  55. else:
  56. print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
  57. if __name__=='__main__':
  58. parser = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
  59. parser.add_argument('--mode', type=str, default='html', choices=['html', 'docx'],
  60. help='Conversion mode: html (default) or docx')
  61. args = parser.parse_args()
  62. convert(args.mode)