|
|
@@ -4,35 +4,258 @@
|
|
|
@Contact : liuyuqi.gov@msn.cn
|
|
|
@Time : 2023/12/09 14:57:36
|
|
|
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
-@Desc : enter point
|
|
|
+@Desc : PDF批量转换为HTML工具
|
|
|
|
|
|
-recycle read all files in a directory, and find *.pdf files, then convert x.pdf to x.html
|
|
|
+功能说明:
|
|
|
+ 递归读取指定目录中的所有PDF文件,将其转换为HTML格式,
|
|
|
+ 并保存到指定的输出目录中。
|
|
|
+
|
|
|
+使用方法:
|
|
|
+ python main.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR] [--recursive | --no-recursive] [--zoom ZOOM]
|
|
|
+
|
|
|
+参数说明:
|
|
|
+ --input-dir, -i 输入目录,包含PDF文件的目录(默认:当前工作目录)
|
|
|
+ --output-dir, -o 输出目录,保存HTML文件的目录(默认:htmls)
|
|
|
+ --recursive, -r 递归搜索子目录中的PDF文件(默认:True)
|
|
|
+ --no-recursive 不递归搜索子目录
|
|
|
+ --zoom, -z 转换时的缩放比例(默认:1.3)
|
|
|
+ --help, -h 显示帮助信息
|
|
|
'''
|
|
|
|
|
|
-import os,sys,re,shutil
|
|
|
-
|
|
|
-def convert():
|
|
|
- ''''''
|
|
|
- current_dir = os.getcwd()
|
|
|
- if not os.path.exists(os.path.join(current_dir,'htmls')):
|
|
|
- os.mkdir(os.path.join(current_dir,'htmls'))
|
|
|
- for root, dirs, files in os.walk(current_dir):
|
|
|
- for file in files:
|
|
|
- if file.endswith('.pdf'):
|
|
|
- try:
|
|
|
- os.system('pdf2htmlEX --zoom 1.3 --process-outline 0 --page-filename %s.html %s'%(file,file))
|
|
|
- print('convert %s to %s.html'%(file,file))
|
|
|
- except Exception as e:
|
|
|
- print(f'convert failed: {e}')
|
|
|
-
|
|
|
- # move all .html to htmls diretory
|
|
|
- for root, dirs, files in os.walk(current_dir):
|
|
|
- for file in files:
|
|
|
- if file.endswith('.html'):
|
|
|
- try:
|
|
|
- shutil.move(os.path.join(root,file),os.path.join(current_dir,'htmls'))
|
|
|
- except Exception as e:
|
|
|
- print(f'move failed: {e}')
|
|
|
-
|
|
|
-if __name__=='__main__':
|
|
|
- convert()
|
|
|
+import argparse
|
|
|
+import os
|
|
|
+import shutil
|
|
|
+import subprocess
|
|
|
+from pathlib import Path
|
|
|
+
|
|
|
+
|
|
|
def _remove_if_exists(path: Path) -> None:
    '''Delete ``path`` if it is present; a missing file is not an error.'''
    try:
        path.unlink()
    except FileNotFoundError:
        pass


def convert_pdf_to_html(pdf_path: Path, output_dir: Path, zoom: float = 1.3) -> bool:
    '''
    Convert a single PDF file to HTML by running the external ``pdf2htmlEX`` tool.

    The HTML file is named after the PDF (text after the last dot replaced by
    ``.html``) and is written directly into ``output_dir``. If a file with
    that name already exists there, the conversion is skipped.

    Args:
        pdf_path: Path of the PDF file to convert (``Path`` or ``str``).
        output_dir: Directory that receives the HTML file (``Path`` or
            ``str``); it is created if it does not exist.
        zoom: Zoom ratio passed to ``pdf2htmlEX --zoom``. Defaults to 1.3.

    Returns:
        True if the HTML file was produced or already existed, False if the
        tool could not be started, timed out, exited with a non-zero status,
        or reported success without producing the expected file.
    '''
    pdf_path = Path(pdf_path)
    output_dir = Path(output_dir)

    pdf_name = pdf_path.name
    html_name = pdf_name.rsplit('.', 1)[0] + '.html'
    final_output_path = output_dir / html_name

    # Never overwrite an existing result; report it as handled.
    if final_output_path.exists():
        print(f'Skipping {pdf_name} - {html_name} already exists in output directory')
        return True

    # Let pdf2htmlEX write straight into the output directory: --dest-dir
    # selects the folder and the trailing positional argument names the file.
    # (--page-filename only names per-page files when --split-pages is on, so
    # it is not the way to choose the output file.) This avoids guessing where
    # the tool dropped the file and moving it afterwards.
    cmd = [
        'pdf2htmlEX',
        '--zoom', str(zoom),
        '--process-outline', '0',
        '--dest-dir', str(output_dir),
        str(pdf_path),
        html_name,
    ]

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            errors='replace',  # undecodable tool output must not abort the run
            timeout=300,  # 5 minute limit per file
        )
    except subprocess.TimeoutExpired:
        # The target did not exist before we started, so anything there now is
        # a partial result of this run; remove it so a rerun is not skipped.
        _remove_if_exists(final_output_path)
        print(f'Timeout while converting {pdf_name}')
        return False
    except (OSError, ValueError) as e:
        # OSError covers a missing pdf2htmlEX executable and directory errors.
        print(f'Error converting {pdf_name}: {str(e)}')
        return False

    if result.returncode != 0:
        _remove_if_exists(final_output_path)
        print(f'Failed to convert {pdf_name}: {result.stderr}')
        return False

    if not final_output_path.exists():
        print(f'Warning: Conversion command succeeded but {html_name} not found')
        return False

    print(f'Successfully converted {pdf_name} to {html_name}')
    return True
|
|
|
+
|
|
|
+
|
|
|
def find_pdf_files(input_dir: Path, recursive: bool = True) -> list:
    '''
    Collect the PDF files found in a directory.

    A file counts as a PDF when its name ends with ``.pdf``, compared
    case-insensitively. Directories are never returned, even if their name
    ends with ``.pdf``.

    Args:
        input_dir: Directory to search (``Path`` or ``str``).
        recursive: When True (default) all sub-directories are walked as
            well; when False only the direct children of ``input_dir`` are
            considered.

    Returns:
        A sorted list of ``Path`` objects, one per PDF file. Sorting makes
        the processing order reproducible instead of depending on the order
        in which the filesystem happens to enumerate entries.
    '''
    input_dir = Path(input_dir)

    if recursive:
        candidates = (
            Path(root) / name
            for root, _dirs, names in os.walk(input_dir)
            for name in names
        )
    else:
        candidates = (entry for entry in input_dir.iterdir() if entry.is_file())

    return sorted(path for path in candidates if path.name.lower().endswith('.pdf'))
|
|
|
+
|
|
|
+
|
|
|
def main(argv=None):
    '''
    Parse the command-line arguments and run the batch conversion.

    Args:
        argv: Optional list of argument strings. When None (default)
            argparse reads the arguments from ``sys.argv``.

    Returns:
        Exit code for the process: 0 when no PDF was found or every PDF was
        converted or skipped, 1 when the input directory is invalid, the
        output directory cannot be created, or at least one conversion
        failed.
    '''
    parser = argparse.ArgumentParser(
        description='PDF批量转换为HTML工具',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog='''
示例用法:
  python main.py                          # 使用默认设置转换当前目录下的PDF
  python main.py -i ./pdfs -o ./output    # 指定输入和输出目录
  python main.py --no-recursive           # 不搜索子目录
  python main.py --zoom 1.5               # 使用1.5倍缩放比例
        '''
    )

    parser.add_argument(
        '--input-dir', '-i',
        default='.',
        help='输入目录,包含PDF文件的目录(默认:当前工作目录)'
    )

    parser.add_argument(
        '--output-dir', '-o',
        default='htmls',
        help='输出目录,保存HTML文件的目录(默认:htmls)'
    )

    # --recursive and --no-recursive write to the same destination, so they
    # are mutually exclusive; recursion stays on unless --no-recursive is given.
    recursive_group = parser.add_mutually_exclusive_group()
    recursive_group.add_argument(
        '--recursive', '-r',
        action='store_true',
        default=True,
        help='递归搜索子目录中的PDF文件(默认:True)'
    )
    recursive_group.add_argument(
        '--no-recursive',
        action='store_false',
        dest='recursive',
        help='不递归搜索子目录'
    )

    parser.add_argument(
        '--zoom', '-z',
        type=float,
        default=1.3,
        help='转换时的缩放比例(默认:1.3)'
    )

    args = parser.parse_args(argv)

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    # Validate the input directory before touching the filesystem.
    if not input_dir.exists():
        print(f'错误: 输入目录不存在: {input_dir}')
        return 1

    if not input_dir.is_dir():
        print(f'错误: 输入路径不是目录: {input_dir}')
        return 1

    try:
        output_dir.mkdir(parents=True, exist_ok=True)
    except Exception as e:
        print(f'错误: 无法创建输出目录 {output_dir}: {str(e)}')
        return 1

    # Show the effective configuration.
    print('=' * 60)
    print('PDF批量转换为HTML工具')
    print('=' * 60)
    print(f'输入目录: {input_dir}')
    print(f'输出目录: {output_dir}')
    print(f'递归搜索: {"是" if args.recursive else "否"}')
    print(f'缩放比例: {args.zoom}')
    print('=' * 60)

    print('\n正在搜索PDF文件...')
    pdf_files = find_pdf_files(input_dir, args.recursive)

    if not pdf_files:
        print('未找到PDF文件,退出程序。')
        return 0

    total = len(pdf_files)
    print(f'找到 {total} 个PDF文件\n')

    success_count = 0
    failed_count = 0
    skipped_count = 0

    for i, pdf_path in enumerate(pdf_files, 1):
        print(f'[{i}/{total}] 处理: {pdf_path.name}')

        # The converter returns True both for a fresh conversion and for an
        # already-existing output, so whether this file is a skip has to be
        # decided before the call; afterwards the file exists in both cases.
        html_name = pdf_path.name.rsplit('.', 1)[0] + '.html'
        already_present = (output_dir / html_name).exists()

        if not convert_pdf_to_html(pdf_path, output_dir, args.zoom):
            failed_count += 1
        elif already_present:
            skipped_count += 1
        else:
            success_count += 1

    # Summary.
    print('\n' + '=' * 60)
    print('转换完成')
    print('=' * 60)
    print(f'总PDF文件数: {total}')
    print(f'成功转换: {success_count}')
    print(f'跳过(已存在): {skipped_count}')
    print(f'转换失败: {failed_count}')
    print(f'输出目录: {output_dir}')
    print('=' * 60)

    # A non-zero exit code signals that at least one file failed.
    return 0 if failed_count == 0 else 1
|
|
|
+
|
|
|
+
|
|
|
if __name__ == '__main__':
    # Raise SystemExit directly: the bare exit() helper is injected by the
    # site module for interactive use and is missing when site is disabled.
    raise SystemExit(main())
|