liuyuqi-dellpc 4 years ago
commit
9f1b1a6fd6

+ 2 - 0
.gitignore

@@ -0,0 +1,2 @@
+/.idea
+/videos

+ 52 - 0
OcrUtils.py

@@ -0,0 +1,52 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/02/17 22:13:38
+@Version :   1.0
+@License :   Copyright © 2017-2020 liuyuqi. All Rights Reserved.
+@Desc    :   ocr 文字识别
+'''
+
+import base64
+import json,os,sys,re,requests
+import user_agent
+
+class OcrUtils():
+    """Route an image to one of several online OCR back ends.
+
+    Only the dispatch logic and parts of the request set-up exist so far;
+    ``sogouMobileOcr`` and ``bdBaseOcr`` are stubs that return ``None``.
+    """
+
+    def __init__(self):
+        pass
+
+    def ocrImg(self, imgData, ocrType):
+        """Run OCR on ``imgData`` with the back end selected by ``ocrType``.
+
+        :param imgData: image payload, handed unchanged to the back end
+        :param ocrType: 0 = Baidu general, 1 = Baidu accurate,
+                        2 = Sogou mobile, anything else = Sogou web
+        :return: whatever the selected back end returns
+        """
+        if ocrType == 0:
+            return self.bdGeneralOcr(imgData)
+        elif ocrType == 1:
+            return self.bdAccurateOcr(imgData)
+        elif ocrType == 2:
+            return self.sogouMobileOcr(imgData)
+        else:
+            return self.sogouWebOcr(imgData)
+
+    def bdGeneralOcr(self, imgData):
+        """Baidu general OCR: delegates to ``bdBaseOcr`` with ``"general_location"``."""
+        return self.bdBaseOcr(imgData, "general_location")
+
+    def sogouMobileOcr(self, imgData):
+        """Sogou mobile OCR. Not implemented yet; returns ``None``."""
+        pass
+
+    def bdAccurateOcr(self, imgData):
+        """POST to Baidu's accurate-OCR endpoint and return the response text.
+
+        NOTE(review): the request body is still an empty dict and ``imgData``
+        is only passed to the ``bdBaseOcr`` stub, so the image is not uploaded
+        yet - the payload still has to be implemented.
+
+        :param imgData: image payload (currently only forwarded to ``bdBaseOcr``)
+        :return: response body decoded as UTF-8
+        """
+        url = "https://aip.baidubce.com/rest/2.0/ocr/v1/accurate"
+        self.bdBaseOcr(imgData, url)
+        data = {}
+        res = requests.post(url=url, data=data, headers=user_agent.getheaders()).content.decode("utf8")
+        return res
+
+    def sogouWebOcr(self, imgData):
+        """Prepare a Sogou web OCR request for ``imgData``.
+
+        Only the base64 encoding step exists; no request is sent and the
+        method returns ``None``.
+
+        :param imgData: bytes-like image payload
+        """
+        # TODO: url, referer and imageData are set up but not used yet.
+        url = "https://deepi.sogou.com/api/sogouService"
+        referer = "https://deepi.sogou.com/?from=picsearch&tdsourcetag=s_pctim_aiomsg"
+        # b64encode works on bytes; base64.encode expects two file objects.
+        imageData = base64.b64encode(imgData)
+
+    def bdBaseOcr(self, imgData, param):
+        """Shared Baidu OCR helper. Not implemented yet; returns ``None``."""
+        urlArr = ["http://ai.baidu.com/tech/ocr/general", "http://ai.baidu.com/index/seccode?action=show"]
+
+if __name__ == '__main__':
+    # Smoke test: sogouWebOcr requires an image payload, so pass empty bytes.
+    test = OcrUtils()
+    print(test.sogouWebOcr(b""))

+ 26 - 0
README.md

@@ -0,0 +1,26 @@
+# video-subtitle-extract
+
+对于没有字幕文件的视频,通过 OpenCV 图像处理截取字幕画面,再用文字识别抽取其中的文字。
+
+
+## Usage
+
+1. 把视频文件放到 videos 文件夹中。
+
+2. 先执行 pre_do.py 文件,查看视频的帧率、总帧数和画面尺寸等信息。
+
+    ![](assets/BaiduHi_2020-2-18_11-14-28.png)
+
+3. 根据上一步的结果,修改 main.py 中的参数,使字幕区域能被完整截取。然后执行 main.py,会在 videos 文件夹中生成从视频中截取的字幕图片(10句合成一张图片,便于识别)。
+
+    ![](assets/BaiduHi_2020-2-18_11-16-10.png)
+
+4. 执行 MainFm.java 对上述图片进行批量文字识别(分别调用百度/搜狗等文字识别免费接口)。获得结果保存到 项目resxxx.txt 文件中。
+
+    ![](assets/BaiduHi_2020-2-18_11-13-33.png)
+
+
+## 注意
+
+由于文字识别接口可能识别错误,可以多执行几次并比较结果。
+

BIN
assets/BaiduHi_2020-2-18_11-13-33.png


BIN
assets/BaiduHi_2020-2-18_11-14-28.png


BIN
assets/BaiduHi_2020-2-18_11-16-10.png


+ 106 - 0
main.py

@@ -0,0 +1,106 @@
+# 基于图像识别和文字识别用 Python 提取视频字幕
+import cv2
+from PIL import Image
+import numpy as np
+import os
+import datetime
+import re
+
+
+def format_time(second):
+    """Render a whole number of seconds as an ``HH:MM:SS`` string.
+
+    :param second: elapsed time in seconds (integer, below 24 hours because
+                   the value is carried in a ``datetime.time``)
+    :return: the time formatted by ``datetime.time.isoformat``
+    """
+    hours, remainder = divmod(second, 3600)
+    minutes, seconds = divmod(remainder, 60)
+    return datetime.time(hour=hours, minute=minutes, second=seconds).isoformat()
+
+
+def cal_stderr(img, imgo=None):
+    '''
+    Mean of the squared elements, scaled by 100.
+
+    With one argument the elements of ``img`` are squared; with two, the
+    element-wise difference ``img - imgo`` is squared instead. The squares are
+    summed, divided by ``img.size`` and multiplied by 100.
+
+    NOTE(review): the arithmetic runs in the dtype of the inputs, so with
+    unsigned 8-bit arrays the subtraction and the squaring wrap around. The
+    ``1`` thresholds used by callers appear tuned to that - confirm before
+    changing the dtype.
+
+    :param img: array-like supporting ``-``, ``**``, ``.sum()`` and ``.size``
+    :param imgo: optional array to compare ``img`` against
+    :return: the scaled mean described above
+    '''
+    delta = img if imgo is None else img - imgo
+    return (delta ** 2).sum() / img.size * 100
+
+
+def save_image(ex_folder, img: Image, starts: int, ends: int):
+    """Save one stitched subtitle image, named after the time span it covers.
+
+    The file name is ``<start>-<end>.png`` where both times come from
+    ``format_time`` and every ``:`` is replaced by ``_``. Any exception raised
+    while building the path or saving is reported on stdout and not re-raised.
+
+    :param ex_folder: directory the image is written into
+    :param img: object whose ``save(path)`` method writes the image
+    :param starts: start of the span in seconds
+    :param ends: end of the span in seconds
+    """
+    span = '-'.join(format_time(moment) for moment in (starts, ends))
+    timeline = span.replace(":", "_") + ".png"
+    try:
+        img.save(os.path.join(ex_folder, timeline))
+        print('export subtitle at %s' % timeline)
+    except Exception:
+        print('export subtitle at %s error' % timeline)
+
+
+def export_subtitle(video_filename, skip_frames):
+    '''
+    Cut the subtitle strip out of a video and save it as stacked images.
+
+    After skipping ``skip_frames`` frames, every 10th frame is sampled. Channel
+    0 of the frame is cropped to rows 320-639 and binarised at threshold 150.
+    Frames whose ``cal_stderr`` score is below 1 are skipped; a frame whose
+    score against the previously kept strip exceeds 1 is stacked below the
+    current batch. A batch is written with ``save_image`` once it holds more
+    than 10 strips, and any remaining batch is written at the end.
+
+    NOTE(review): the frame that triggers a batch flush is not itself added
+    to any batch - confirm this is intended.
+
+    :param video_filename: path of the video file; images are written to a
+                           folder of the same name without the extension
+    :param skip_frames: number of leading frames to read and discard
+    :return: None (images are written to disk)
+    '''
+    ex_folder = os.path.splitext(video_filename)[0]
+    # exist_ok avoids the exists()/mkdir() race and also creates parent folders.
+    os.makedirs(ex_folder, exist_ok=True)
+    videoCap = cv2.VideoCapture(video_filename)
+    try:
+        for i in range(skip_frames):
+            videoCap.read()
+        start_frame = skip_frames
+        curr_frame = skip_frames
+        fps = videoCap.get(cv2.CAP_PROP_FPS)  # 30
+        success = True
+        subtitle_img = None
+        last_img = None
+        img_count = 0
+        while success:
+            # Sample every 10th frame: discard nine reads, keep the tenth.
+            for j in range(9):
+                videoCap.read()
+                curr_frame += 1
+            success, frame = videoCap.read()
+            curr_frame += 1
+            if frame is None:
+                print('video: %s finish at %d frame.' % (video_filename, curr_frame))
+                break
+
+            img = frame[:, :, 0]
+            img = img[320:640, :]
+            _, img = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY)
+
+            if cal_stderr(img) < 1:  # score too low: treat the strip as having no subtitle
+                continue
+
+            if img_count == 0:
+                # First strip of a new batch.
+                subtitle_img = img
+                print('video: %s add subtitle at %d frame.' % (video_filename, curr_frame))
+                last_img = img
+                img_count += 1
+            elif img_count > 10:
+                # Batch is full: write it out and start a new one.
+                img_count = 0
+                subtitle_img = Image.fromarray(subtitle_img)
+                save_image(ex_folder, subtitle_img, int(start_frame / fps), int(curr_frame / fps))
+                start_frame = curr_frame  # move the start time forward
+            else:
+                # Only keep the strip when it differs from the last kept one.
+                if cal_stderr(img, last_img) > 1:
+                    subtitle_img = np.vstack((subtitle_img, img))
+                    last_img = img
+                    img_count += 1
+                    print('video: %s add subtitle at %d frame.' % (video_filename, curr_frame))
+    finally:
+        # Always free the decoder handle, even if processing fails midway.
+        videoCap.release()
+    if img_count > 0:
+        subtitle_img = Image.fromarray(subtitle_img)
+        save_image(ex_folder, subtitle_img, int(start_frame / fps), int(curr_frame / fps))
+    print('video: %s export subtitle finish!' % video_filename)
+
+
+if __name__ == '__main__':
+    # Sample run: skip the first 2818 frames of the demo video, then export.
+    export_subtitle(
+        video_filename=r'videos/大象解说《血战钢锯岭》.mp4',
+        skip_frames=2818,
+    )

+ 66 - 0
pre_do.py

@@ -0,0 +1,66 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2020/02/17 16:08:13
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   预处理
+'''
+
+import cv2
+
+videoName = r'videos/大象解说《血战钢锯岭》.mp4'
+# Number of leading frames to read; the last frame read is the one inspected below.
+SKIP_FRAMES = 2818
+
+# Open the video
+videoCap = cv2.VideoCapture(videoName)
+
+# Frame rate
+fps = videoCap.get(cv2.CAP_PROP_FPS)
+# Total number of frames
+total_frames = int(videoCap.get(cv2.CAP_PROP_FRAME_COUNT))
+# Frame size as (height, width)
+image_size = (int(videoCap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
+              int(videoCap.get(cv2.CAP_PROP_FRAME_WIDTH)))
+
+print(fps)
+print(total_frames)
+print(image_size)
+
+
+frame = None
+for i in range(SKIP_FRAMES):
+    success, frame = videoCap.read()
+# The capture is not needed once the sample frame has been read.
+videoCap.release()
+
+# read() yields None when the file cannot be opened or has too few frames.
+if frame is None:
+    raise SystemExit('cannot read frame %d from %s' % (SKIP_FRAMES, videoName))
+
+
+from PIL import Image
+# img = Image.fromarray(frame)
+# img.show()
+
+
+## Locate the subtitle area
+im = frame[:, :, 0]
+im = im[325:640, :]
+img = Image.fromarray(im)
+# img.show()
+
+
+# Binarise
+thresh = 150
+_, im = cv2.threshold(im, thresh, 255, cv2.THRESH_BINARY)
+img = Image.fromarray(im)
+img.show()
+
+
+# Compute the mean squared error per pixel, as a percentage: the image contains a subtitle only when e > 1.
+
+
+# Then compute the error e between images with the same and with different subtitles: when e > 1 the subtitle has changed.
+
+
+# To make text recognition easier, 20 subtitle lines are combined into one image. The complete subtitle-extraction code is as follows:
+
+
+
+

+ 3 - 0
requirements.txt

@@ -0,0 +1,3 @@
+opencv-python
+Pillow
+numpy

BIN
screenshot/BaiduHi_2020-2-18_11-13-33.png


BIN
screenshot/BaiduHi_2020-2-18_11-14-28.png


BIN
screenshot/BaiduHi_2020-2-18_11-16-10.png


+ 79 - 0
user_agent.py

@@ -0,0 +1,79 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# 返回一个随机的请求头 headers
+def getheaders():
+    """Return request headers carrying a randomly chosen desktop User-Agent.
+
+    :return: dict with the single key ``'User-Agent'``
+    """
+    # Desktop browsers
+    desktop_agents = [
+        # Opera
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
+        "Opera/8.0 (Windows NT 5.1; U; en)",
+        "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
+        # Firefox
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
+        "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
+        # Safari
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+        # Chrome
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
+        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
+        "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
+        # 360 browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
+        "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+        # Taobao browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+        # Liebao (Cheetah) browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+        # QQ browser
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+        # Sogou browser
+        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
+        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
+        # Maxthon browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
+        # UC browser
+        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
+    ]
+    # Mobile browsers (defined for reference; not used by the selection below)
+    mobile_agents = [
+        # iPhone
+        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # iPod
+        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # iPad
+        "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+        # Android
+        "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # QQ browser for Android
+        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+        # Android Opera Mobile
+        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
+        # Android Pad Moto Xoom
+        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
+        # BlackBerry
+        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
+        # WebOS HP Touchpad
+        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
+        # Nokia N97
+        "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
+        # Windows Phone Mango
+        "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
+        # UC browser
+        "UCWEB7.0.2.37/28/999",
+        "NOKIA5700/ UCWEB7.0.2.37/28/999",
+        # UC Openwave
+        "Openwave/ UCWEB7.0.2.37/28/999",
+        # UC Opera
+        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999"
+    ]
+    # Only the desktop list takes part in the random choice.
+    return {'User-Agent': random.choice(desktop_agents)}