|
@@ -0,0 +1,209 @@
|
|
|
+#!/usr/bin/env python
|
|
|
+# -*- encoding: utf-8 -*-
|
|
|
+'''
|
|
|
+@Contact : liuyuqi.gov@msn.cn
|
|
|
+@Time : 2023/07/12 14:58:48
|
|
|
+@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
|
|
|
+@Desc : 获取验证码,处理验证码
|
|
|
+'''
|
|
|
+from PIL import Image
|
|
|
+import requests
|
|
|
+import re
|
|
|
+import sys
|
|
|
+import os
|
|
|
+import time
|
|
|
+import random
|
|
|
+import json
|
|
|
+import base64
|
|
|
+from fuck12306 import api
|
|
|
+import pandas as pd
|
|
|
+import numpy as np
|
|
|
+# import cv2
|
|
|
+# import torch,caffe,tensorflow
|
|
|
+
|
|
|
+
|
|
|
class CrawlCaptcha(object):
    """Download 12306 captcha pictures and pre-process them for recognition.

    A captcha picture is 293 x 190 pixels: a text label strip at the top and a
    grid of 4 columns x 2 rows of candidate tiles underneath.
    """

    # Geometry of the tile grid inside one captcha picture (pixels).
    GRID_COLUMNS = 4
    GRID_ROWS = 2
    TILE_SIZE = 67
    TILE_GAP = 5
    GRID_LEFT = 5
    GRID_TOP = 41

    # Seconds to wait for the captcha server before giving up on a request.
    REQUEST_TIMEOUT = 30

    # A pixel is treated as a dark speck when it is at least this much darker
    # than the mean of its neighbours.
    NOISE_CONTRAST = 100

    def __init__(self, conf=None, sess=None):
        """Create the crawler.

        :param conf: optional configuration mapping; ``conf["proxy"]["proxy"]``
            is read as a ``host:port`` proxy address when present
        :param sess: optional ``requests.Session`` to reuse; a fresh session
            is created when omitted
        """
        self.conf = {} if conf is None else conf
        self.sess = requests.Session() if sess is None else sess

    def _proxy_settings(self):
        """Return a requests-style proxies dict, or ``{}`` if none is configured."""
        try:
            proxy = self.conf["proxy"]["proxy"]
        except (AttributeError, KeyError, TypeError):
            return {}
        if not proxy:
            return {}
        return {
            "http": "http://" + proxy,
            "https": "https://" + proxy,
        }

    @staticmethod
    def _save_picture(directory, index, payload):
        """Write ``payload`` bytes to ``<directory>/pic_<timestamp><index>.png``."""
        # The target folder may not exist yet on a fresh checkout.
        os.makedirs(directory, exist_ok=True)
        target = "%s/pic_%s%s.png" % (directory, int(time.time()), index)
        with open(target, "wb") as file:
            file.write(payload)
        return target

    def download_captcha(self, number=100):
        """Download captcha pictures from the new API into ``data/pic``.

        :param number: the number of captcha pictures to request
        """
        print("............... get code pic ...........")
        agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
        login = 'https://kyfw.12306.cn/otn/login/init'
        domain = 'kyfw.12306.cn'

        # Request headers and proxy settings do not change between requests,
        # so they are built once instead of on every iteration.
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate, sdch, br',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'Cache-Control': 'no-cache',
            'Connection': 'keep-alive',
            'Host': domain,
            'User-Agent': agent,
            'Referer': login,
        }
        proxies = self._proxy_settings()
        if proxies:
            self.sess.proxies.update(proxies)

        for i in range(number):
            print("-----------%s--------------" % (i))
            # A random suffix is appended to the URL for every request.
            res = self.sess.get(
                api.pic_new_url + str(random.uniform(0, 1)),
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            # Pause between requests to avoid hammering the server.
            time.sleep(random.randint(1, 2))
            if res.status_code == 200:
                self._save_picture("data/pic", i, res.content)

    def download_captcha_v2(self, number=100):
        """Download captcha pictures from the old API into ``data/pic2``.

        The old API answers with JSON whose ``image`` field holds the picture
        as a base64 string.

        :param number: the number of captcha pictures to request
        """
        print("............... get code pic ...........")
        for i in range(number):
            print("-----------%s--------------" % (i))
            res = self.sess.get(api.pic_old_url, timeout=self.REQUEST_TIMEOUT)
            time.sleep(random.randint(1, 3))
            if res.status_code == 200:
                img_str = json.loads(res.content)['image']
                self._save_picture("data/pic2", i, base64.b64decode(img_str))

    @staticmethod
    def get_sub_pic(im: "Image.Image", x, y) -> "Image.Image":
        """Cut one candidate tile out of a captcha picture.

        :param im: the full captcha picture
        :param x: tile column, 0..3
        :param y: tile row, 0..1
        :return: the cropped 67 x 67 tile
        :raises ValueError: if ``x`` or ``y`` lies outside the grid
        """
        if not 0 <= x < CrawlCaptcha.GRID_COLUMNS:
            raise ValueError("x must be in 0..%d, got %r" % (CrawlCaptcha.GRID_COLUMNS - 1, x))
        if not 0 <= y < CrawlCaptcha.GRID_ROWS:
            raise ValueError("y must be in 0..%d, got %r" % (CrawlCaptcha.GRID_ROWS - 1, y))

        step = CrawlCaptcha.TILE_SIZE + CrawlCaptcha.TILE_GAP
        left = CrawlCaptcha.GRID_LEFT + step * x
        top = CrawlCaptcha.GRID_TOP + step * y
        right = left + CrawlCaptcha.TILE_SIZE
        bottom = top + CrawlCaptcha.TILE_SIZE

        return im.crop((left, top, right, bottom))

    def get_label(self, im: "Image.Image") -> list:
        """Extract the label strip of a captcha picture.

        The strip is saved to ``label.png``. Text recognition is not
        implemented yet, so a fixed placeholder label is returned.

        :param im: the full captcha picture
        :return: list of label strings
        """
        label = ["叉子"]
        left = 118
        bottom = 29
        im.crop((left, 0, left + 122, bottom)).save("label.png")
        # TODO: OCR the strip, tokenize it and look the words up in the label hash dictionary.
        return label

    def recornize_captcha(self, path="data/pic/pic_26173.png"):
        """Split a captcha picture (293 x 190) into its label and 4 x 2 tiles.

        The label strip is written to ``label.png`` and every tile to
        ``data/<x><y>.png``.

        :param path: captcha picture to process
        """
        # The context manager releases the file handle once the tiles are written.
        with Image.open(path) as img:
            # Extract the label; the returned value is not used yet.
            self.get_label(img)

            os.makedirs("data", exist_ok=True)
            for x in range(self.GRID_COLUMNS):
                for y in range(self.GRID_ROWS):  # two rows, four columns
                    tile = CrawlCaptcha.get_sub_pic(img, x, y)
                    # TODO: hash the tile and compare it against the label hash dictionary.
                    tile.save("data/%s%s.png" % (x, y))

    def train(self):
        """Train the model (not implemented yet).

        Planned steps: bulk-download ~50000 captchas and keep updating them
        daily; recognise captchas to generate labels and a training set;
        split the data into train / test / validation; train the model.
        """
        pass

    def predict(self, img: "Image.Image"):
        """Predict the answer for a captcha picture (not implemented yet).

        Planned behaviour: recognise the label, split the picture into its
        4 x 2 tiles, classify each tile and return the matching result.

        :param img: the full captcha picture
        """
        pass

    def gen_label(self):
        """Write every directory name found under ``data/archive/`` as a label.

        The names are collected recursively in ``os.walk`` order and stored
        in ``label.csv`` (one per row, no header) and ``label.npy``.
        """
        labels = []
        for _root, dirs, _files in os.walk("data/archive/"):
            labels.extend(dirs)
        df = pd.DataFrame(labels)
        df.to_csv("label.csv", header=False, index=False)
        np.save("label.npy", labels)

    def remove_file_noise(self, im: "np.ndarray", threshold=20):
        """Remove isolated dark specks from an image by neighbour interpolation.

        Every pixel that is at least ``NOISE_CONTRAST`` darker than the mean
        of its (up to eight) neighbours is replaced by that mean. Pixels are
        visited column by column and updated in place, so an already repaired
        pixel contributes its new value to later neighbours.

        :param im: image data as an array of shape (H, W) or (H, W, C);
            anything that is not already an ndarray is converted with
            ``np.array`` first
        :param threshold: kept for backward compatibility; NOTE(review): the
            algorithm does not use it, the contrast is ``NOISE_CONTRAST``
        :return: the repaired image; a 2-D ndarray input is modified in place
            and returned, a multi-channel input is left untouched and a
            repaired copy is returned
        :raises ValueError: if the image is neither 2- nor 3-dimensional
        """
        contrast = self.NOISE_CONTRAST

        def repair_channel(channel):
            rows, cols = channel.shape[:2]
            for col in range(cols):
                for row in range(rows):
                    # 3 x 3 neighbourhood clipped at the image border.
                    window = channel[max(row - 1, 0):row + 2,
                                     max(col - 1, 0):col + 2]
                    neighbours = window.size - 1
                    if neighbours < 1:  # 1 x 1 image: nothing to compare with
                        continue
                    centre = float(channel[row, col])
                    # Accumulate in float64 so uint8 pixel sums cannot wrap.
                    total = float(window.sum(dtype=np.float64)) - centre
                    average = total / neighbours
                    if centre + contrast <= average:
                        channel[row, col] = average
            return channel

        if not isinstance(im, np.ndarray):
            im = np.array(im)

        if im.ndim == 2:
            return repair_channel(im)
        if im.ndim == 3:
            # Each channel is repaired independently on a copy of the input.
            result = im.copy()
            for index in range(result.shape[2]):
                repair_channel(result[:, :, index])
            return result
        raise ValueError("expected a 2-D or 3-D image, got %d dimensions" % im.ndim)
|
|
|
+
|
|
|
+
|
|
|
+
|