4 years ago · 9f1b1a6fd6
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
 
				+/.idea
			
 
				+/videos
			
--- a/OcrUtils.py
+++ b/OcrUtils.py
@@ -0,0 +1,52 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Author  :   liuyuqi
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2020/02/17 22:13:38
			
 
				+@Version :   1.0
			
 
				+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   ocr 文字识别
			
 
				+'''
			
 
				+
			
 
				+import base64
			
 
				+import json,os,sys,re,requests
			
 
				+import user_agent
			
 
				+
			
 
				+class OcrUtils():
			
 
				+    def __init__(self):
			
 
				+        pass
			
 
				+
			
 
				+    def ocrImg(self, imgData, ocrType):
			
 
				+        if ocrType == 0:
			
 
				+            return self.bdGeneralOcr(imgData)
			
 
				+        elif ocrType == 1:
			
 
				+            return self.bdAccurateOcr(imgData)
			
 
				+        elif ocrType == 1:
			
 
				+            return self.sogouMobileOcr(imgData)
			
 
				+        else:
			
 
				+            return self.sogouWebOcr(imgData)
			
 
				+
			
 
				+    def bdGeneralOcr(self, imgData):
			
 
				+        return self.bdBaseOcr(imgData, "general_location")
			
 
				+
			
 
				+    def sogouMobileOcr(self, imgData):
			
 
				+        pass
			
 
				+
			
 
				+    def bdAccurateOcr(self, imgData):
			
 
				+        self.bdBaseOcr(imgData, "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate")
			
 
				+        data={}
			
 
				+        res=requests.post(url=url,data=data,headers=user_agent.getheaders()).content.decode("utf8")
			
 
				+
			
 
				+    def sogouWebOcr(self, imgData):
			
 
				+        url = "https://deepi.sogou.com/api/sogouService"
			
 
				+        referer = "https://deepi.sogou.com/?from=picsearch&tdsourcetag=s_pctim_aiomsg"
			
 
				+        imageData = base64.encode(imgData)
			
 
				+
			
 
				+
			
 
				+    def bdBaseOcr(self, imgData, param):
			
 
				+        urlArr = ["http://ai.baidu.com/tech/ocr/general", "http://ai.baidu.com/index/seccode?action=show"]
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    test=OcrUtils()
			
 
				+    print(test.sogouWebOcr())
			
--- a/README.md
+++ b/README.md
@@ -0,0 +1,26 @@
 
				+# video-subtitle-extract
			
 
				+
			
 
				+对于没有字幕文件的视频，先用 opencv 截取画面中的字幕区域图片，再通过文字识别（OCR）提取其中的文字。
			
 
				+
			
 
				+
			
 
				+## Usage
			
 
				+
			
 
				+1. 把视频文件放到 videos 文件夹中。
			
 
				+
			
 
				+2. 先执行 pre_do.py 文件，查看一下视频的帧速，视频尺寸等信息。
			
 
				+
			
 
				+    ![](assets/BaiduHi_2020-2-18_11-14-28.png)
			
 
				+
			
 
				+3. 根据上一步得到的信息，修改 main.py 中的参数，使字幕图片被很好地截取。然后执行 main.py，会在 videos 文件夹中生成对视频分析后截取的字幕图片（10句合成一个图片，便于识别）。
			
 
				+
			
 
				+    ![](assets/BaiduHi_2020-2-18_11-16-10.png)
			
 
				+
			
 
				+4. 执行 MainFm.java 对上述图片进行批量文字识别（分别调用百度/搜狗等文字识别免费接口）。获得结果保存到 项目resxxx.txt 文件中。
			
 
				+
			
 
				+    ![](assets/BaiduHi_2020-2-18_11-13-33.png)
			
 
				+
			
 
				+
			
 
				+## 注意
			
 
				+
			
 
				+由于文字识别接口可能识别错误，建议多执行几次并比较结果。
			
 
				+
			
--- a/assets/BaiduHi_2020-2-18_11-13-33.png
+++ b/assets/BaiduHi_2020-2-18_11-13-33.png
--- a/assets/BaiduHi_2020-2-18_11-14-28.png
+++ b/assets/BaiduHi_2020-2-18_11-14-28.png
--- a/assets/BaiduHi_2020-2-18_11-16-10.png
+++ b/assets/BaiduHi_2020-2-18_11-16-10.png
--- a/main.py
+++ b/main.py
@@ -0,0 +1,106 @@
 
				+# 基于图像识别和文字识别用 Python 提取视频字幕
			
 
				+import cv2
			
 
				+from PIL import Image
			
 
				+import numpy as np
			
 
				+import os
			
 
				+import datetime
			
 
				+import re
			
 
				+
			
 
				+
			
 
				+def format_time(second):
			
 
				+    hours = second // 3600
			
 
				+    minutes = (second - hours * 3600) // 60
			
 
				+    second = second - hours * 3600 - minutes * 60
			
 
				+    t = datetime.time(hour=hours, minute=minutes, second=second)
			
 
				+    return datetime.time.isoformat(t)
			
 
				+
			
 
				+
			
 
				+def cal_stderr(img, imgo=None):
			
 
				+    '''
			
 
				+    计算方差
			
 
				+    :param img:
			
 
				+    :param imgo:
			
 
				+    :return:
			
 
				+    '''
			
 
				+    if imgo is None:
			
 
				+        return (img ** 2).sum() / img.size * 100
			
 
				+    else:
			
 
				+        return ((img - imgo) ** 2).sum() / img.size * 100
			
 
				+
			
 
				+
			
 
				+def save_image(ex_folder, img: Image, starts: int, ends: int):
			
 
				+    # 保存字幕图片到文件夹
			
 
				+    start_time = format_time(starts)
			
 
				+    end_time = format_time(ends)
			
 
				+    timeline = '-'.join([start_time, end_time])
			
 
				+    timeline = timeline.replace(":", "_") + ".png"
			
 
				+    try:
			
 
				+        imgname = os.path.join(ex_folder, timeline)
			
 
				+        img.save(imgname)
			
 
				+        print('export subtitle at %s' % timeline)
			
 
				+    except Exception:
			
 
				+        print('export subtitle at %s error' % timeline)
			
 
				+
			
 
				+
			
 
				+def export_subtitle(video_filename, skip_frames):
			
 
				+    '''
			
 
				+    导出字幕
			
 
				+    :param video_filename: 视频文件
			
 
				+    :return: 字幕图片截图
			
 
				+    '''
			
 
				+    ex_folder = os.path.splitext(video_filename)[0]
			
 
				+    if not os.path.exists(ex_folder):
			
 
				+        os.mkdir(ex_folder)
			
 
				+    videoCap = cv2.VideoCapture(video_filename)
			
 
				+    for i in range(skip_frames):
			
 
				+        videoCap.read()
			
 
				+    start_frame = skip_frames
			
 
				+    curr_frame = skip_frames
			
 
				+    fps = videoCap.get(cv2.CAP_PROP_FPS)  # 30
			
 
				+    success = True
			
 
				+    subtitle_img = None
			
 
				+    last_img = None
			
 
				+    img_count = 0
			
 
				+    while success:
			
 
				+        for j in range(9):
			
 
				+            videoCap.read()
			
 
				+            curr_frame += 1
			
 
				+        success, frame = videoCap.read()
			
 
				+        curr_frame += 1
			
 
				+        if frame is None:
			
 
				+            print('video: %s finish at %d frame.' % (video_filename, curr_frame))
			
 
				+            break
			
 
				+
			
 
				+        img = frame[:, :, 0]
			
 
				+        img = img[320:640, :]
			
 
				+        _, img = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY)
			
 
				+
			
 
				+        if cal_stderr(img) < 1:  # 两帧字幕相同
			
 
				+            continue
			
 
				+
			
 
				+        if img_count == 0:
			
 
				+            subtitle_img = img
			
 
				+            print('video: %s add subtitle at %d frame.' % (video_filename, curr_frame))
			
 
				+            last_img = img
			
 
				+            img_count += 1
			
 
				+        elif img_count > 10:
			
 
				+            img_count = 0
			
 
				+            subtitle_img = Image.fromarray(subtitle_img)
			
 
				+            save_image(ex_folder, subtitle_img, int(start_frame / fps), int(curr_frame / fps))
			
 
				+            start_frame = curr_frame  # 开始时间往后移
			
 
				+        else:
			
 
				+            if cal_stderr(img, last_img) > 1:
			
 
				+                subtitle_img = np.vstack((subtitle_img, img))
			
 
				+                last_img = img
			
 
				+                img_count += 1
			
 
				+                print('video: %s add subtitle at %d frame.' % (video_filename, curr_frame))
			
 
				+    if img_count > 0:
			
 
				+        subtitle_img = Image.fromarray(subtitle_img)
			
 
				+        save_image(ex_folder, subtitle_img, int(start_frame / fps), int(curr_frame / fps))
			
 
				+    print('video: %s export subtitle finish!' % video_filename)
			
 
				+
			
 
				+
			
 
				+if __name__ == '__main__':
			
 
				+    video_filename = r'videos/大象解说《血战钢锯岭》.mp4'
			
 
				+    skip_frames = 2818
			
 
				+    export_subtitle(video_filename, skip_frames)
			
--- a/pre_do.py
+++ b/pre_do.py
@@ -0,0 +1,66 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Author  :   liuyuqi
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2020/02/17 16:08:13
			
 
				+@Version :   1.0
			
 
				+@License :   (C)Copyright 2019
			
 
				+@Desc    :   预处理
			
 
				+'''
			
 
				+
			
 
				+import cv2
			
 
				+
			
 
				+videoName = r'videos/大象解说《血战钢锯岭》.mp4'
			
 
				+# 截图
			
 
				+videoCap=cv2.VideoCapture(videoName)
			
 
				+
			
 
				+# 帧频
			
 
				+fps = videoCap.get(cv2.CAP_PROP_FPS)
			
 
				+# 视频总帧数
			
 
				+total_frames = int(videoCap.get(cv2.CAP_PROP_FRAME_COUNT))
			
 
				+# 图像尺寸
			
 
				+image_size = (int(videoCap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
			
 
				+              int(videoCap.get(cv2.CAP_PROP_FRAME_WIDTH)))
			
 
				+
			
 
				+print(fps)
			
 
				+print(total_frames)
			
 
				+print(image_size)
			
 
				+
			
 
				+
			
 
				+for i in range(2818):
			
 
				+    sucess, frame = videoCap.read()
			
 
				+
			
 
				+# sucess,frame=videoCap.read(2818)
			
 
				+
			
 
				+
			
 
				+from PIL import Image
			
 
				+# img = Image.fromarray(frame)
			
 
				+# img.show()
			
 
				+
			
 
				+
			
 
				+## 确定字幕范围
			
 
				+im = frame[:, :, 0]
			
 
				+im = im[325:640, :]
			
 
				+img = Image.fromarray(im)
			
 
				+# img.show()
			
 
				+
			
 
				+
			
 
				+# 二值化
			
 
				+thresh = 150
			
 
				+_, im = cv2.threshold(im, thresh, 255, cv2.THRESH_BINARY)
			
 
				+img = Image.fromarray(im)
			
 
				+img.show()
			
 
				+
			
 
				+
			
 
				+# 计算两张图像间每个像素点的平方误差之和的平均值百分比：当 e > 1 时，图像才有字幕。
			
 
				+
			
 
				+
			
 
				+# 接着计算相同字幕和不同字幕图像之间的误差 e： 误差 e > 1 时，字幕发生切换。
			
 
				+
			
 
				+
			
 
				+# 为了方便文字识别，将 20 句字幕组合成一张图片。完整的提取字幕的代码如下：
			
 
				+
			
 
				+
			
 
				+
			
 
				+
			
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,3 @@
 
				+opencv-python
			
 
				+Pillow
			
 
				+numpy
			
--- a/screenshot/BaiduHi_2020-2-18_11-13-33.png
+++ b/screenshot/BaiduHi_2020-2-18_11-13-33.png
--- a/screenshot/BaiduHi_2020-2-18_11-14-28.png
+++ b/screenshot/BaiduHi_2020-2-18_11-14-28.png
--- a/screenshot/BaiduHi_2020-2-18_11-16-10.png
+++ b/screenshot/BaiduHi_2020-2-18_11-16-10.png
--- a/user_agent.py
+++ b/user_agent.py
@@ -0,0 +1,79 @@
 
				+# -*-coding:utf-8 -*-
			
 
				+
			
 
				+import random
			
 
				+
			
 
				+# 返回一个随机的请求头 headers
			
 
				+def getheaders():
			
 
				+    # 各种PC端
			
 
				+    user_agent_list_2 = [
			
 
				+        # Opera
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
			
 
				+        "Opera/8.0 (Windows NT 5.1; U; en)",
			
 
				+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
			
 
				+        # Firefox
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
			
 
				+        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
			
 
				+        # Safari
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
			
 
				+        # chrome
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
			
 
				+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
			
 
				+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
			
 
				+        # 360
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
			
 
				+        # 淘宝浏览器
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
			
 
				+        # 猎豹浏览器
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
			
 
				+        # QQ浏览器
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
			
 
				+        # sogou浏览器
			
 
				+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
			
 
				+        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
			
 
				+        # maxthon浏览器
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
			
 
				+        # UC浏览器
			
 
				+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
			
 
				+    ]
			
 
				+    # 各种移动端
			
 
				+    user_agent_list_3 = [
			
 
				+        # IPhone
			
 
				+        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
			
 
				+        # IPod
			
 
				+        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
			
 
				+        # IPAD
			
 
				+        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
			
 
				+        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
			
 
				+        # Android
			
 
				+        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
			
 
				+        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
			
 
				+        # QQ浏览器 Android版本
			
 
				+        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
			
 
				+        # Android Opera Mobile
			
 
				+        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
			
 
				+        # Android Pad Moto Xoom
			
 
				+        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
			
 
				+        # BlackBerry
			
 
				+        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
			
 
				+        # WebOS HP Touchpad
			
 
				+        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
			
 
				+        # Nokia N97
			
 
				+        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
			
 
				+        # Windows Phone Mango
			
 
				+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
			
 
				+        # UC浏览器
			
 
				+        "UCWEB7.0.2.37/28/999",
			
 
				+        "NOKIA5700/ UCWEB7.0.2.37/28/999",
			
 
				+        # UCOpenwave
			
 
				+        "Openwave/ UCWEB7.0.2.37/28/999",
			
 
				+        # UC Opera
			
 
				+        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
			
 
				+    ]
			
 
				+    UserAgent = random.choice(user_agent_list_2) # 这里只用list1
			
 
				+    headers = {'User-Agent': UserAgent}
			
 
				+    return headers