#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2023/12/09 14:57:36
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   Entry point. Scan the current working directory for *.pdf files
             and convert each x.pdf to x.html (pdf2htmlEX) or x.docx
             (LibreOffice).
'''

import argparse
import os
import re
import shutil
import subprocess
import sys


def _run_command(argv) -> bool:
    """Run an external tool without a shell and report whether it succeeded.

    Args:
        argv (list[str]): Program name followed by its arguments. Each element
            is passed to the program verbatim, so file names containing spaces
            or shell metacharacters need no quoting.

    Returns:
        bool: True only if the program started and exited with status 0.
        Failures are printed, not raised, so one bad file does not abort a
        batch.
    """
    try:
        completed = subprocess.run(argv)
    except OSError as exc:
        # Typically the executable is missing or not runnable.
        print(f'convert failed: {exc}')
        return False
    if completed.returncode != 0:
        print(f'convert failed: {argv[0]} exited with status {completed.returncode}')
        return False
    return True


def convert_pdf_to_html() -> None:
    """Convert PDF files to HTML format using pdf2htmlEX.

    Walks the current working directory recursively, converts every file
    whose name ends with '.pdf', and writes the result as <name>.html into
    the 'htmls' sub-directory (created if missing). A success line is printed
    only when pdf2htmlEX exits with status 0.
    """
    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'htmls')
    os.makedirs(output_dir, exist_ok=True)

    for root, dirs, files in os.walk(current_dir):
        # Prune the output directory so the walk never descends into it.
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]
        for file in files:
            if not file.endswith('.pdf'):
                continue
            html_name = file[:-len('.pdf')] + '.html'
            command = [
                'pdf2htmlEX',
                '--zoom', '1.3',
                '--process-outline', '0',
                # Kept from the original invocation; presumably only relevant
                # when page splitting is enabled -- TODO confirm.
                '--page-filename', f'{file}.html',
                # Write straight into htmls/ instead of sweeping up every
                # *.html in the tree afterwards (which also moved unrelated,
                # pre-existing HTML files).
                '--dest-dir', output_dir,
                # Full path: the PDF may live in a sub-directory of the cwd.
                os.path.join(root, file),
                html_name,
            ]
            if _run_command(command):
                print('convert %s to %s' % (file, html_name))


def convert_pdf_to_docx() -> None:
    """Convert PDF files to DOCX format using LibreOffice.

    Converts the '.pdf' files located directly in the current working
    directory (non-recursive, hidden files skipped, matching the '*.pdf'
    shell glob the script used before) into the 'docs' sub-directory, which
    is created if missing. LibreOffice is not started when there is nothing
    to convert.
    """
    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'docs')
    os.makedirs(output_dir, exist_ok=True)

    pdf_names = sorted(
        name for name in os.listdir(current_dir)
        if name.endswith('.pdf')
        and not name.startswith('.')
        and os.path.isfile(os.path.join(current_dir, name))
    )
    if not pdf_names:
        print(f'no PDF files found in {current_dir}')
        return

    # NOTE(review): some LibreOffice versions reportedly need
    # --infilter="writer_pdf_import" for PDF -> DOCX; confirm on the target
    # installation before adding it.
    command = ['libreoffice', '--headless', '--convert-to', 'docx']
    command += [os.path.join(current_dir, name) for name in pdf_names]
    command += ['--outdir', output_dir]
    if not _run_command(command):
        return

    # Report per input file rather than listing everything in docs/, so
    # unrelated documents already in the directory are not reported.
    for name in pdf_names:
        docx_name = name[:-len('.pdf')] + '.docx'
        if os.path.exists(os.path.join(output_dir, docx_name)):
            print(f'convert {name} to {docx_name}')
        else:
            print(f'convert failed: {name}')


def convert(mode='html'):
    """Main conversion function.

    Args:
        mode (str): Conversion mode - 'html' or 'docx' (case-insensitive).
            Any other value prints a usage hint and does nothing.
    """
    if mode.lower() == 'html':
        convert_pdf_to_html()
    elif mode.lower() == 'docx':
        convert_pdf_to_docx()
    else:
        print(f"Unknown mode: {mode}. Use 'html' or 'docx'")


def main() -> None:
    """Parse the command line and run the requested conversion."""
    parser = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
    parser.add_argument('--mode', type=str, default='html', choices=['html', 'docx'],
                        help='Conversion mode: html (default) or docx')
    args = parser.parse_args()
    convert(args.mode)


if __name__ == '__main__':
    main()