#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact :   liuyuqi.gov@msn.cn
@Time    :   2023/12/09 14:57:36
@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc    :   entry point

Recursively scan a directory for *.pdf files, then convert each x.pdf to x.html or x.docx.
'''

import argparse
import os
import re
import shutil
import subprocess
import sys

def convert_pdf_to_html():
    """Convert every PDF under the current directory to HTML with pdf2htmlEX.

    The current working directory is walked recursively.  ``pdf2htmlEX`` is
    run once per ``*.pdf`` file, from inside the directory that contains the
    file, so PDFs in subdirectories are resolved correctly.  Afterwards every
    ``*.html`` file found in the tree is moved into ``<cwd>/htmls``.

    The ``htmls`` directory itself is excluded from both walks, so results of
    an earlier run are neither converted again nor moved onto themselves.

    Failures (tool not installed, non-zero exit status, name collision while
    moving) are printed to stdout and do not stop the remaining files.
    """
    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'htmls')
    os.makedirs(output_dir, exist_ok=True)

    for root, dirs, files in os.walk(current_dir):
        # Prune in place so os.walk never descends into the output directory.
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]
        for file in files:
            if not file.endswith('.pdf'):
                continue
            # An argument list (no shell) keeps names with spaces or shell
            # metacharacters intact and cannot be used for command injection.
            command = [
                'pdf2htmlEX',
                '--zoom', '1.3',
                '--process-outline', '0',
                '--page-filename', '%s.html' % file,
                file,
            ]
            try:
                # cwd=root: the bare file name is only valid relative to the
                # directory in which os.walk found it.
                subprocess.run(command, cwd=root, check=True)
            except (OSError, subprocess.CalledProcessError) as e:
                print(f'convert failed: {e}')
            else:
                print('convert %s to %s.html' % (file, file))

    # Move all .html files to the htmls directory.
    for root, dirs, files in os.walk(current_dir):
        dirs[:] = [d for d in dirs if os.path.join(root, d) != output_dir]
        for file in files:
            if not file.endswith('.html'):
                continue
            try:
                shutil.move(os.path.join(root, file), output_dir)
            except OSError as e:
                # shutil.Error (e.g. destination already exists) is an OSError.
                print(f'move failed: {e}')

def convert_pdf_to_docx():
    """Convert the PDFs in the current directory to DOCX with LibreOffice.

    Only ``*.pdf`` files directly inside the current working directory are
    considered (no recursion, hidden files skipped), matching what the shell
    pattern ``*.pdf`` would select.  Output is written to ``<cwd>/docs``.

    LibreOffice is invoked once with an explicit argument list, so paths that
    contain spaces or shell metacharacters are passed through unchanged.  When
    there is nothing to convert the tool is not started at all.

    A success line is printed for each input PDF whose matching ``.docx``
    exists in ``docs`` after the run; a failure to launch LibreOffice or a
    non-zero exit status is printed to stdout instead.
    """
    current_dir = os.getcwd()
    output_dir = os.path.join(current_dir, 'docs')
    os.makedirs(output_dir, exist_ok=True)

    pdf_names = sorted(
        name for name in os.listdir(current_dir)
        if name.endswith('.pdf')
        and not name.startswith('.')
        and os.path.isfile(os.path.join(current_dir, name))
    )
    if not pdf_names:
        return

    # Use LibreOffice to convert PDF to DOCX.  Absolute input paths make sure
    # a file name starting with '-' is never mistaken for an option.
    command = ['libreoffice', '--headless', '--convert-to', 'docx',
               '--outdir', output_dir]
    command.extend(os.path.join(current_dir, name) for name in pdf_names)
    try:
        subprocess.run(command, check=True)
    except (OSError, subprocess.CalledProcessError) as e:
        print(f'convert failed: {e}')
        return

    # Report only the inputs of this run, not every .docx already in docs.
    for name in pdf_names:
        docx_name = os.path.splitext(name)[0] + '.docx'
        if os.path.isfile(os.path.join(output_dir, docx_name)):
            print(f'convert {name} to {docx_name}')

def convert(mode='html'):
    """Dispatch to the converter selected by *mode*.

    Args:
        mode (str): Target format, compared case-insensitively. ``'html'``
            runs the HTML conversion and ``'docx'`` runs the DOCX conversion.
            Any other value only prints a usage hint.
    """
    selected = mode.lower()

    # Guard clause: reject anything that is not a supported target format.
    if selected not in ('html', 'docx'):
        print(f"Unknown mode: {mode}. Use 'html' or 'docx'")
        return

    if selected == 'html':
        convert_pdf_to_html()
        return

    convert_pdf_to_docx()

if __name__ == '__main__':
    # Command-line entry point: read --mode and run the matching conversion.
    cli = argparse.ArgumentParser(description='Convert PDF files to HTML or DOCX')
    cli.add_argument(
        '--mode',
        type=str,
        default='html',
        choices=['html', 'docx'],
        help='Conversion mode: html (default) or docx',
    )

    convert(cli.parse_args().mode)