# utils.py
import os
from io import BytesIO

import cv2
import numpy as np
import openpyxl
from docx import Document
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from PIL import Image
from skimage.metrics import structural_similarity as ssim

from config import Config
def allowed_file(filename):
    """Return whether *filename* has an extension the application accepts.

    The text after the last dot is lower-cased and looked up in
    ``Config.ALLOWED_EXTENSIONS``. A name that contains no dot is rejected
    without consulting the configuration.
    """
    if '.' not in filename:
        return False
    _, _, suffix = filename.rpartition('.')
    return suffix.lower() in Config.ALLOWED_EXTENSIONS
def _as_grayscale(image):
    """Return a single-channel view of *image* (colour or already grayscale)."""
    if image.ndim == 2:
        # Already single-channel; BGR2GRAY would reject this input.
        return image
    return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)


def calculate_similarity(imageA, imageB):
    """Return the structural similarity (SSIM) score of two images.

    Both inputs are reduced to grayscale before comparison. Because SSIM is
    only defined for equally sized images, ``imageB`` is resized to the
    dimensions of ``imageA`` when the two differ.

    :param imageA: reference image as a NumPy array (colour arrays are
        treated as BGR, 2-D arrays as grayscale)
    :param imageB: candidate image, same conventions as ``imageA``
    :return: the mean SSIM score computed by ``skimage``
    """
    grayA = _as_grayscale(imageA)
    grayB = _as_grayscale(imageB)

    if grayA.shape != grayB.shape:
        height, width = grayA.shape[:2]
        # cv2.resize expects the target size as (width, height).
        grayB = cv2.resize(grayB, (width, height), interpolation=cv2.INTER_AREA)

    # Only the scalar score is needed, so the full SSIM map is not requested.
    return ssim(grayA, grayB)
def extract_images_from_excel(file_path, time_label):
    """Collect every image embedded in the sheets of an Excel workbook.

    :param file_path: path of the ``.xlsx`` workbook to read
    :param time_label: accepted so the signature matches
        ``extract_images_from_wps``; not used by this function
    :return: list of dicts with the keys ``'image'`` (a ``PIL.Image``),
        ``'position'`` (``(row + 1, col + 1)`` of the anchor's starting
        marker, or ``None`` when the anchor has no such marker) and
        ``'sheet_name'``
    """
    workbook = openpyxl.load_workbook(file_path)
    images_info = []
    for sheet_name in workbook.sheetnames:
        sheet = workbook[sheet_name]
        # NOTE(review): ``_images``, ``_data()`` and ``anchor._from`` are
        # private openpyxl attributes; sheets that do not expose ``_images``
        # (presumably chart sheets - confirm) are treated as having none.
        for img in getattr(sheet, '_images', ()):
            image = Image.open(BytesIO(img._data()))

            # Anchors that are not tied to a cell carry no ``_from`` marker;
            # report those without a position instead of raising.
            marker = getattr(img.anchor, '_from', None)
            if marker is None:
                position = None
            else:
                position = (marker.row + 1, marker.col + 1)

            images_info.append({
                'image': image,
                'position': position,
                'sheet_name': sheet_name,
            })
    return images_info
def extract_images_from_wps(file_path, time_label):
    """Extract the embedded images of a WPS/Word document (``.docx``).

    Every image is written to ``output_images/<time_label>_images`` as
    ``image_<n>.png`` and is also returned as a NumPy array.

    NOTE(review): the ``.png`` suffix is used regardless of the stored image
    format, and documents processed with the same ``time_label`` share one
    folder, so later files overwrite earlier ones on disk. The returned
    arrays are not affected.

    :param file_path: path of the document to read
    :param time_label: label used to build the output folder name
    :return: list of dicts with the keys ``'image'`` (NumPy array),
        ``'filename'`` (base name of ``file_path``), ``'position'``
        (``'image_<n>'``) and ``'similarity_score'`` (initially ``None``)
    """
    output_folder = os.path.join('output_images', f'{time_label}_images')
    os.makedirs(output_folder, exist_ok=True)

    document = Document(file_path)
    image_counter = 0
    images_info = []
    for rel in document.part.rels.values():
        # Select by relationship type rather than by a substring of the
        # target path, and skip linked (external) images: they have no
        # target part to read bytes from.
        if rel.is_external or rel.reltype != RT.IMAGE:
            continue

        image_counter += 1
        image_filename = f"image_{image_counter}.png"
        image_filepath = os.path.join(output_folder, image_filename)
        with open(image_filepath, 'wb') as image_file:
            image_file.write(rel.target_part.blob)

        # Re-open the saved file and convert it to a NumPy array; the context
        # manager releases the file handle once the pixels are loaded.
        with Image.open(image_filepath) as pil_image:
            if pil_image.mode in ('L', 'RGB', 'RGBA'):
                img_np = np.array(pil_image)
            else:
                # Palette, bilevel and similar modes would otherwise yield
                # index or boolean arrays instead of pixel values.
                img_np = np.array(pil_image.convert('RGB'))

        images_info.append({
            'image': img_np,
            'filename': os.path.basename(file_path),
            'position': f'image_{image_counter}',  # documents have no sheet_name concept
            'similarity_score': None  # filled in later by the caller, if at all
        })
    print(f"Finished extracting images from {file_path}. Total images extracted: {image_counter}")
    return images_info
def _to_bgr(img_np):
    """Return *img_np* (grayscale, RGB or RGBA array) as a 3-channel BGR array."""
    if img_np.ndim == 2:
        return cv2.cvtColor(img_np, cv2.COLOR_GRAY2BGR)
    if img_np.shape[2] == 4:
        return cv2.cvtColor(img_np, cv2.COLOR_RGBA2BGR)
    return cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)


def process_files(upload_folder, ref_image_path, similarity_threshold, time_label):
    """Find images in uploaded documents that resemble a reference image.

    Every ``.xlsx``, ``.docx`` and ``.wps`` file in ``upload_folder`` is
    scanned for embedded images, and each image is compared with the
    reference image through ``calculate_similarity``.

    :param upload_folder: directory containing the uploaded documents
    :param ref_image_path: path of the reference image
    :param similarity_threshold: minimum score (inclusive) for a match
    :param time_label: forwarded unchanged to the extraction helpers
    :return: list of dicts with the keys ``'filename'``, ``'sheet_name'``
        (``'N/A'`` when the source has none), ``'position'``,
        ``'similarity_score'`` and ``'image'`` (NumPy array)
    :raises ValueError: if the reference image cannot be read
    """
    ref_img = cv2.imread(ref_image_path)
    if ref_img is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read reference image: {ref_image_path}")

    results = []
    # Sorted so the order of the results does not depend on the file system.
    for filename in sorted(os.listdir(upload_folder)):
        # Office applications create "~$name" lock files next to open
        # documents; they are not parseable documents.
        if filename.startswith("~$"):
            continue

        file_path = os.path.join(upload_folder, filename)
        lowered = filename.lower()  # match extensions case-insensitively
        if lowered.endswith(".xlsx"):
            images_info = extract_images_from_excel(file_path, time_label)
        elif lowered.endswith((".docx", ".wps")):
            images_info = extract_images_from_wps(file_path, time_label)
        else:
            continue

        for info in images_info:
            source = info['image']
            if isinstance(source, Image.Image) and source.mode not in ('L', 'RGB', 'RGBA'):
                # Palette/bilevel images would otherwise turn into index or
                # boolean arrays rather than pixel values.
                source = source.convert('RGB')
            img_np = np.array(source)

            # Always hand a 3-channel BGR array to the comparison, including
            # for grayscale sources.
            img_cv2 = _to_bgr(img_np)
            similarity_score = calculate_similarity(ref_img, img_cv2)
            if similarity_score >= similarity_threshold:
                results.append({
                    'filename': filename,
                    'sheet_name': info.get('sheet_name', 'N/A'),  # only Excel sources have one
                    'position': info['position'],
                    'similarity_score': similarity_score,
                    'image': img_np  # always a NumPy array
                })
    return results
|