# utils.py
"""Helpers for pulling embedded images out of office documents and scoring
them against a reference image with SSIM."""
import os
from io import BytesIO

import cv2
import numpy as np
import openpyxl
from docx import Document
from PIL import Image
from skimage.metrics import structural_similarity as ssim

from config import Config

# PIL modes whose ``np.array`` form is directly usable as pixel data.
# Anything else (palette, 1-bit, CMYK, ...) is converted to RGB first.
_ARRAY_SAFE_MODES = frozenset({"L", "RGB", "RGBA"})


def allowed_file(filename):
    """Return True if *filename* has an extension in Config.ALLOWED_EXTENSIONS.

    The comparison is done on the lower-cased text after the last dot.
    """
    return (
        '.' in filename
        and filename.rsplit('.', 1)[1].lower() in Config.ALLOWED_EXTENSIONS
    )


def _image_to_array(image):
    """Return *image* (PIL image or ndarray) as a new NumPy array.

    PIL images in modes outside ``_ARRAY_SAFE_MODES`` are converted to RGB
    so the array holds real pixel values rather than, e.g., palette indices.
    """
    if isinstance(image, Image.Image) and image.mode not in _ARRAY_SAFE_MODES:
        image = image.convert("RGB")
    return np.array(image)


def _to_gray(image):
    """Return a 2-D grayscale array for a gray, BGR or BGRA input array."""
    if image.ndim == 2:
        # Already single-channel; cvtColor(BGR2GRAY) would reject it.
        return image
    channels = image.shape[2]
    if channels == 1:
        return image[:, :, 0]
    if channels == 4:
        return cv2.cvtColor(image, cv2.COLOR_BGRA2GRAY)
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def calculate_similarity(imageA, imageB):
    """Return the SSIM score between two images.

    :param imageA: reference image array (grayscale, BGR or BGRA)
    :param imageB: candidate image array (grayscale, BGR or BGRA)
    :return: the value returned by ``structural_similarity``

    SSIM needs both inputs to have identical dimensions, so *imageB* is
    resized to *imageA*'s height/width when they differ.
    """
    grayA = _to_gray(imageA)
    grayB = _to_gray(imageB)
    if grayA.shape != grayB.shape:
        height, width = grayA.shape
        # cv2.resize takes the target size as (width, height).
        grayB = cv2.resize(grayB, (width, height), interpolation=cv2.INTER_AREA)
    # Only the scalar score is needed, so the full SSIM map is not requested.
    return ssim(grayA, grayB)


def extract_images_from_excel(file_path, time_label):
    """Collect every image embedded in the worksheets of an .xlsx workbook.

    :param file_path: path of the workbook to read
    :param time_label: accepted for signature parity with
        ``extract_images_from_wps``; not used here
    :return: list of dicts with keys ``'image'`` (PIL image),
        ``'position'`` (1-based ``(row, col)`` of the anchor's top-left cell,
        or ``None`` when the anchor has no ``_from`` marker) and
        ``'sheet_name'``
    """
    workbook = openpyxl.load_workbook(file_path)
    images_info = []
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        # NOTE: ``_images`` / ``_data`` / ``_from`` are private openpyxl
        # attributes; the original code relies on them as well.
        for img in sheet._images:
            image = Image.open(BytesIO(img._data()))
            marker = getattr(img.anchor, '_from', None)
            if marker is None:
                # Presumably an absolute anchor, which is not tied to a cell.
                position = None
            else:
                position = (marker.row + 1, marker.col + 1)
            images_info.append({
                'image': image,
                'position': position,
                'sheet_name': sheet_name
            })
    return images_info


def extract_images_from_wps(file_path, time_label):
    """
    Extract all images from a WPS document (.docx) and save them to disk.

    Each image is written to ``output_images/<time_label>_images`` as
    ``image_<n>.png`` (the bytes are written unchanged, whatever their real
    format) and then decoded into a NumPy array.

    :param file_path: path of the WPS document
    :param time_label: time label used to build the output folder name
    :return: list of dicts with keys ``'image'`` (NumPy array),
        ``'filename'``, ``'position'`` (``'image_<n>'``) and
        ``'similarity_score'`` (always ``None`` here)
    """
    output_folder = os.path.join('output_images', f'{time_label}_images')
    os.makedirs(output_folder, exist_ok=True)

    document = Document(file_path)
    image_counter = 0
    images_info = []

    for rel in document.part.rels.values():
        # Linked (external) relationships have no target part to read from.
        if rel.is_external or "image" not in rel.target_ref:
            continue

        image_counter += 1
        image_filename = f"image_{image_counter}.png"
        image_filepath = os.path.join(output_folder, image_filename)

        with open(image_filepath, 'wb') as image_file:
            image_file.write(rel.target_part.blob)

        # Open and convert to a NumPy array; the context manager releases
        # the file handle once the pixel data has been copied.
        try:
            with Image.open(image_filepath) as pil_image:
                img_np = _image_to_array(pil_image)
        except OSError as exc:
            # Formats PIL cannot decode (e.g. vector metafiles) are reported
            # and skipped instead of aborting the whole document.
            print(f"Skipping undecodable image {image_filepath}: {exc}")
            continue

        images_info.append({
            'image': img_np,
            'filename': os.path.basename(file_path),
            'position': f'image_{image_counter}',  # WPS documents have no sheet_name concept
            'similarity_score': None  # initial value is None
        })

    print(f"Finished extracting images from {file_path}. Total images extracted: {image_counter}")
    return images_info


def process_files(upload_folder, ref_image_path, similarity_threshold, time_label):
    """Find embedded images in *upload_folder* that resemble a reference image.

    :param upload_folder: directory scanned (non-recursively) for
        ``.xlsx`` / ``.docx`` / ``.wps`` files
    :param ref_image_path: path of the reference image, read with ``cv2.imread``
    :param similarity_threshold: minimum SSIM score for an image to be kept
    :param time_label: forwarded to the extraction helpers
    :return: list of dicts with keys ``'filename'``, ``'sheet_name'``,
        ``'position'``, ``'similarity_score'`` and ``'image'`` (NumPy array)
    :raises ValueError: if the reference image cannot be read
    """
    ref_img = cv2.imread(ref_image_path)
    if ref_img is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise ValueError(f"Could not read reference image: {ref_image_path}")

    results = []
    # Sorted so the result order does not depend on directory listing order.
    for filename in sorted(os.listdir(upload_folder)):
        if filename.startswith("~$"):
            # Office lock/temp files share the extension but are not documents.
            continue

        file_path = os.path.join(upload_folder, filename)
        lowered = filename.lower()
        if lowered.endswith(".xlsx"):
            images_info = extract_images_from_excel(file_path, time_label)
        elif lowered.endswith((".docx", ".wps")):
            images_info = extract_images_from_wps(file_path, time_label)
        else:
            continue

        for info in images_info:
            # The Excel helper yields PIL images, the WPS helper yields arrays.
            img_np = _image_to_array(info['image'])
            img_cv2 = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR) if img_np.ndim == 3 else img_np

            similarity_score = calculate_similarity(ref_img, img_cv2)

            if similarity_score >= similarity_threshold:
                results.append({
                    'filename': filename,
                    'sheet_name': info.get('sheet_name', 'N/A'),  # Excel files have a sheet_name, WPS files do not
                    'position': info['position'],
                    'similarity_score': similarity_score,
                    'image': img_np  # make sure this is a NumPy array
                })

    return results