- #!/usr/bin/env python
- # -*- encoding: utf-8 -*-
- '''
- @Contact : liuyuqi.gov@msn.cn
- @Time : 2023/07/12 14:58:48
- @License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
- @Desc : Fetch captcha images and process them.
- '''
- from PIL import Image
- import requests
- import os
- import time
- import random
- import json
- import base64
- import cv2  # needed by remove_file_noise for channel split/merge
- from fuck12306 import api
- import pandas as pd
- import numpy as np
- # import torch,caffe,tensorflow
- class CrawlCaptcha(object):
-     ''' Captcha fetching and processing class. '''
-     def __init__(self, conf: dict = None):
-         # conf is expected to provide proxy settings, e.g. conf["proxy"]["proxy"]
-         self.conf = conf or {}
-         self.sess = requests.Session()
-     def download_captcha(self, number=100):
-         ''' Download captcha pictures from the new API.
-         :param number: the number of captchas to fetch
-         '''
-         print("............... get code pic ...........")
-         os.makedirs("data/pic", exist_ok=True)
-         # set the proxy and headers once; they do not change per request
-         proxy = self.conf.get("proxy", {}).get("proxy")
-         if proxy:
-             self.sess.proxies.update({
-                 "http": "http://" + proxy,
-                 "https": "https://" + proxy,
-             })
-         agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
-         login = 'https://kyfw.12306.cn/otn/login/init'
-         domain = 'kyfw.12306.cn'
-         headers = {
-             'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
-             'Accept-Encoding': 'gzip, deflate, sdch, br',
-             'Accept-Language': 'zh-CN,zh;q=0.8',
-             'Cache-Control': 'no-cache',
-             'Connection': 'keep-alive',
-             'Host': domain,
-             'User-Agent': agent,
-             'Referer': login
-         }
-         for i in range(number):
-             print("-----------%s--------------" % (i))
-             res = self.sess.get(
-                 api.pic_new_url + str(random.uniform(0, 1)), headers=headers)
-             time.sleep(random.randint(1, 2))
-             if res.status_code == 200:
-                 with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
-                     file.write(res.content)
-     def download_captcha_v2(self, number=100):
-         ''' Download captcha pictures from the old API.
-         :param number: the number of captchas to fetch
-         '''
-         print("............... get code pic ...........")
-         os.makedirs("data/pic2", exist_ok=True)
-         for i in range(number):
-             print("-----------%s--------------" % (i))
-             res = self.sess.get(api.pic_old_url)
-             time.sleep(random.randint(1, 3))
-             if res.status_code == 200:
-                 # the old API wraps the image as a base64 string in a JSON body
-                 img_str = json.loads(res.content)['image']
-                 with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
-                     file.write(base64.b64decode(img_str))
-     @staticmethod
-     def get_sub_pic(im: Image.Image, x, y) -> Image.Image:
-         ''' Crop one sub-image tile out of the captcha picture.
-         The 293*190 captcha has a 41px text banner on top and a 4*2 grid
-         of 67px tiles separated by 5px gaps below it.
-         '''
-         assert 0 <= x <= 3
-         assert 0 <= y <= 1
-         left = 5 + (67 + 5) * x
-         top = 41 + (67 + 5) * y
-         right = left + 67
-         bottom = top + 67
-         return im.crop((left, top, right, bottom))
-     def get_label(self, im: Image.Image) -> list:
-         ''' Extract the label (the text banner) from the captcha image. '''
-         # placeholder label; real recognition is not implemented yet
-         label = ["叉子"]
-         left = 118
-         bottom = 29
-         # crop the text banner region and save it for OCR
-         im.crop((left, 0, 118 + 122, bottom)).save("label.png")
-         # TODO: OCR the text, segment the words, then look them up in the
-         # label hash dictionary
-         return label
-     def recognize_captcha(self):
-         ''' Recognize a captcha; picture size is 293*190. '''
-         # load a sample 12306 captcha
-         img = Image.open("data/pic/pic_26173.png")
-         # recognize the text label
-         label = self.get_label(img)
-         # crop the sub-images: two rows, four columns
-         for x in range(4):
-             for y in range(2):
-                 tmp = CrawlCaptcha.get_sub_pic(img, x, y)
-                 # res = self.sess.post(api.recognize_url, data={"img": base64.b64encode(img)})
-                 # save each tile; its hash is later compared against the keys
-                 # of the label hash dictionary (see the ahash sketch below)
-                 tmp.save("data/%s%s.png" % (x, y))
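-     # A minimal average-hash sketch for the tile comparison mentioned above.
-     # This is an assumption about how the "hash dictionary" lookup could
-     # work, not the project's confirmed algorithm.
-     @staticmethod
-     def ahash(im: Image.Image, hash_size=8) -> str:
-         ''' Shrink the tile, grayscale it, and emit one bit per pixel
-         relative to the mean; pack the bits into a hex string usable as a
-         dict key. (Hedged sketch; hash_size=8 is an assumed default.)
-         '''
-         small = im.convert("L").resize((hash_size, hash_size))
-         pixels = list(small.getdata())
-         avg = sum(pixels) / len(pixels)
-         bits = "".join("1" if p > avg else "0" for p in pixels)
-         return "%0*x" % (hash_size * hash_size // 4, int(bits, 2))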
-     def train(self):
-         ''' Train the model. '''
-         # batch-download 50,000 captchas and keep refreshing them daily
-         # recognize the captchas, generate labels, and build the training set
-         # train the model
-         # data split: train / test / validation (see the split sketch below)
-         pass
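-     # A hedged sketch of the train/test/validation split mentioned in
-     # train() above; the 80/10/10 ratios are an assumption, not project
-     # settings.
-     @staticmethod
-     def split_dataset(files: list, ratios=(0.8, 0.1, 0.1)):
-         ''' Shuffle the sample files and cut them into three subsets. '''
-         files = list(files)
-         random.shuffle(files)
-         n = len(files)
-         n_train = int(n * ratios[0])
-         n_test = int(n * ratios[1])
-         train_set = files[:n_train]
-         test_set = files[n_train:n_train + n_test]
-         val_set = files[n_train + n_test:]
-         return train_set, test_set, val_set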
-     def predict(self, img: Image.Image):
-         ''' Predict the answer for one captcha:
-         recognize the text label, crop the 4*2 sub-images, classify each
-         sub-image, and return the indices of the tiles that match the label
-         (see the predict_by_hash sketch below).
-         '''
-         index = []
-         # TODO: model inference is not implemented yet
-         return index
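-     # Hedged sketch of the pipeline predict() describes, wired through the
-     # ahash helper above; hash_dict is an assumed {hash: label} mapping
-     # built from already-labeled tiles, not a structure the project defines.
-     def predict_by_hash(self, img: Image.Image, hash_dict: dict) -> list:
-         ''' Return the (x, y) grid positions whose tile label matches. '''
-         index = []
-         target = self.get_label(img)
-         for x in range(4):
-             for y in range(2):
-                 tile = CrawlCaptcha.get_sub_pic(img, x, y)
-                 if hash_dict.get(CrawlCaptcha.ahash(tile)) in target:
-                     index.append((x, y))
-         return index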
-     def gen_label(self):
-         ''' Generate the label list from the archive directory names. '''
-         labels = []
-         for root, dirs, files in os.walk("data/archive/"):
-             # for file in files:
-             #     # Access each file in the "data/archive/" directory
-             #     file_path = os.path.join(root, file)
-             #     print(file_path)
-             for dir in dirs:
-                 labels.append(dir)
-         df = pd.DataFrame(labels)
-         df.to_csv("label.csv", header=False, index=False)
-         np.save("label.npy", labels)
-         # df.to_pickle("res.pkl")
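-     # Usage note: the labels written by gen_label above can be loaded back
-     # with pd.read_csv("label.csv", header=None)[0].tolist() or with
-     # np.load("label.npy").tolist().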
-     def remove_file_noise(self, im: np.ndarray, threshold=20):
-         """
-         Remove isolated black speckles from an image, filling each speckle
-         pixel by interpolation from its 8-connected neighbourhood.
-         :param im: image data (grayscale or BGR ndarray)
-         :param threshold: maximum intensity treated as a black speckle
-             (kept for API parity; the comparison below uses the original
-             fixed offset of 100)
-         :return: the denoised image data
-         """
-         def rm_blackNoisy_component(img, threshold):
-             h, w = img.shape[:2]
-             for _x in range(w):
-                 for _y in range(h):
-                     # average the up-to-8 neighbours inside the image bounds
-                     _sum = num = 0
-                     for dy in (-1, 0, 1):
-                         for dx in (-1, 0, 1):
-                             if dy == 0 and dx == 0:
-                                 continue
-                             ny, nx = _y + dy, _x + dx
-                             if 0 <= ny < h and 0 <= nx < w:
-                                 _sum += int(img[ny, nx])
-                                 num += 1
-                     if num < 1:
-                         continue
-                     # an isolated dark pixel sits far below its neighbourhood
-                     # average; replace it with that average (the int() cast
-                     # avoids uint8 overflow in the comparison)
-                     average = _sum / num
-                     if int(img[_y, _x]) + 100 <= average:
-                         img[_y, _x] = average
-             return img
- 
-         if im.ndim == 3:
-             # denoise each colour channel separately, then re-merge
-             b, g, r = cv2.split(im)
-             _b = rm_blackNoisy_component(b, threshold)
-             _g = rm_blackNoisy_component(g, threshold)
-             _r = rm_blackNoisy_component(r, threshold)
-             return cv2.merge([_b, _g, _r])
-         return rm_blackNoisy_component(im, threshold)
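- 
- 
- if __name__ == "__main__":
-     # Hedged usage sketch: the conf layout below is an assumption about the
-     # project's config shape (a "host:port" proxy string), not a documented
-     # format.
-     crawler = CrawlCaptcha(conf={"proxy": {"proxy": "127.0.0.1:8080"}})
-     crawler.download_captcha(number=10)
-     crawler.recognize_captcha()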