# file_utils.py (3.9 KB)
# utils.py
import os
from io import BytesIO

import cv2
import numpy as np
import openpyxl
from docx import Document
from PIL import Image
from skimage.metrics import structural_similarity as ssim

from config import Config
  10. def allowed_file(filename):
  11. return '.' in filename and \
  12. filename.rsplit('.', 1)[1].lower() in Config.ALLOWED_EXTENSIONS
  13. def calculate_similarity(imageA, imageB):
  14. grayA = cv2.cvtColor(imageA, cv2.COLOR_BGR2GRAY)
  15. grayB = cv2.cvtColor(imageB, cv2.COLOR_BGR2GRAY)
  16. score, _ = ssim(grayA, grayB, full=True)
  17. return score
  18. def extract_images_from_excel(file_path, time_label):
  19. workbook = openpyxl.load_workbook(file_path)
  20. images_info = []
  21. for sheet_name in workbook.sheetnames:
  22. sheet = workbook[sheet_name]
  23. for img in sheet._images:
  24. from io import BytesIO
  25. image_data = img._data()
  26. image = Image.open(BytesIO(image_data))
  27. position = (img.anchor._from.row + 1, img.anchor._from.col + 1)
  28. # if time_label == "120分钟": # Placeholder, implement actual logic
  29. images_info.append({
  30. 'image': image,
  31. 'position': position,
  32. 'sheet_name': sheet_name
  33. })
  34. return images_info
  35. def extract_images_from_wps(file_path, time_label):
  36. """
  37. 从WPS文档(.docx)中提取所有图像,并保存到指定文件夹。
  38. :param file_path: WPS文档的路径
  39. :param time_label: 时间标签,用于创建唯一输出文件夹名称
  40. """
  41. output_folder = os.path.join('output_images', f'{time_label}_images')
  42. if not os.path.exists(output_folder):
  43. os.makedirs(output_folder)
  44. document = Document(file_path)
  45. image_counter = 0
  46. images_info = []
  47. for rel in document.part.rels.values():
  48. if "image" in rel.target_ref:
  49. image_counter += 1
  50. image_part = rel.target_part
  51. image_filename = f"image_{image_counter}.png"
  52. image_filepath = os.path.join(output_folder, image_filename)
  53. with open(image_filepath, 'wb') as image_file:
  54. image_file.write(image_part.blob)
  55. # 打开并转换为NumPy数组
  56. pil_image = Image.open(image_filepath)
  57. img_np = np.array(pil_image)
  58. images_info.append({
  59. 'image': img_np,
  60. 'filename': os.path.basename(file_path),
  61. 'position': f'image_{image_counter}', # WPS 文档中没有 sheet_name 概念
  62. 'similarity_score': None # 初始值设置为 None
  63. })
  64. print(f"Finished extracting images from {file_path}. Total images extracted: {image_counter}")
  65. return images_info
  66. def process_files(upload_folder, ref_image_path, similarity_threshold, time_label):
  67. results = []
  68. ref_img = cv2.imread(ref_image_path)
  69. for filename in os.listdir(upload_folder):
  70. file_path = os.path.join(upload_folder, filename)
  71. if filename.endswith(".xlsx"):
  72. images_info = extract_images_from_excel(file_path, time_label)
  73. elif filename.endswith((".docx", ".wps")):
  74. images_info = extract_images_from_wps(file_path, time_label)
  75. else:
  76. continue
  77. for info in images_info:
  78. img_pil = info['image']
  79. img_np = np.array(img_pil) # Convert PIL Image to NumPy array
  80. img_cv2 = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR) if len(img_np.shape) == 3 else img_np
  81. similarity_score = calculate_similarity(ref_img, img_cv2)
  82. if similarity_score >= similarity_threshold:
  83. results.append({
  84. 'filename': filename,
  85. 'sheet_name': info.get('sheet_name', 'N/A'), # Excel 文件有 sheet_name, WPS 没有
  86. 'position': info['position'],
  87. 'similarity_score': similarity_score,
  88. 'image': img_np # 确保这是一个 NumPy 数组
  89. })
  90. return results