crawl_captcha.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2023/07/12 14:58:48
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Fetch captcha images and process them.
'''
from PIL import Image
import requests
import re
import sys
import os
import time
import random
import json
import base64
from fuck12306 import api
import pandas as pd
import numpy as np
import cv2  # needed by remove_file_noise for channel split/merge
# import torch,caffe,tensorflow


class CrawlCaptcha(object):
    ''' Captcha handling class. '''

    def __init__(self, conf: dict = None):
        # conf is assumed to be shaped like {"proxy": {"proxy": "host:port"}},
        # which is how self.conf is read in download_captcha below
        self.conf = conf or {}
        self.sess = requests.Session()
    def download_captcha(self, number=100):
        ''' Fetch captcha pictures (new API).
        :param number: the number of captchas to download
        '''
        print("............... get code pic ...........")
        os.makedirs("data/pic", exist_ok=True)
        for i in range(number):
            print("-----------%s--------------" % (i))
            # set up the HTTP/HTTPS proxy
            proxies = {
                "http": "http://" + self.conf["proxy"]["proxy"],
                "https": "https://" + self.conf["proxy"]["proxy"],
            }
            agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
            login = 'https://kyfw.12306.cn/otn/login/init'
            domain = 'kyfw.12306.cn'
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'Host': domain,
                'User-Agent': agent,
                'Referer': login
            }
            self.sess.proxies.update(proxies)
            res = self.sess.get(
                api.pic_new_url + str(random.uniform(0, 1)), headers=headers)
            time.sleep(random.randint(1, 2))
            if res.status_code == 200:
                with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
                    file.write(res.content)
    def download_captcha_v2(self, number=100):
        ''' Fetch captcha pictures (old API).
        :param number: the number of captchas to download
        '''
        print("............... get code pic ...........")
        os.makedirs("data/pic2", exist_ok=True)
        for i in range(number):
            print("-----------%s--------------" % (i))
            res = self.sess.get(api.pic_old_url)
            time.sleep(random.randint(1, 3))
            if res.status_code == 200:
                # the old API returns the picture as a base64 string in a JSON field
                img_str = json.loads(res.content)['image']
                with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
                    file.write(base64.b64decode(img_str))
    @staticmethod
    def get_sub_pic(im: Image, x, y) -> Image:
        ''' Crop one sub-image (column x, row y) out of the captcha picture. '''
        assert 0 <= x <= 3
        assert 0 <= y <= 1
        WIDTH = HEIGHT = 68  # nominal size of one cell
        left = 5 + (67 + 5) * x
        top = 41 + (67 + 5) * y
        right = left + 67
        bottom = top + 67
        return im.crop((left, top, right, bottom))
    def get_label(self, im: Image) -> list:
        ''' Extract the text label from the captcha image. '''
        label = ["叉子"]  # placeholder label
        left = 118
        img_size = im.size
        bottom = 29
        # crop the prompt-text area and save it for later recognition
        tmp = im.crop((left, 0, 118 + 122, bottom)).save("label.png")
        # OCR the text, segment it, then look it up in the label hash dictionary
        return label
    def recornize_captcha(self,):
        ''' Recognize a captcha; the picture size is 293*190. '''
        # open a downloaded 12306 captcha
        img = Image.open("data/pic/pic_26173.png")
        # recognize the text label
        label = self.get_label(img)
        # read the sub-images: two rows, four columns
        for x in range(4):
            for y in range(2):
                tmp = CrawlCaptcha.get_sub_pic(img, x, y)
                # res = self.sess.post(api.recognize_url, data={"img": base64.b64encode(img)})
                # save the sub-image
                # compare its hash against the keys of the label hash dictionary
                tmp.save("data/%s%s.png" % (x, y))
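    # The comments above mention comparing each sub-image's hash against a label
    # hash dictionary, but no hash function is defined in this file. The helper
    # below is only an illustrative sketch of one common choice (an average hash
    # built with PIL/numpy); the name `average_hash` and the 8x8 size are
    # assumptions, not part of the original code.
    @staticmethod
    def average_hash(im: Image, hash_size: int = 8) -> str:
        ''' Compute a simple average hash: grayscale, shrink, threshold on the mean. '''
        small = im.convert("L").resize((hash_size, hash_size))
        pixels = np.asarray(small, dtype=np.float32)
        bits = (pixels > pixels.mean()).flatten()
        # return the bits as a string so it can be used directly as a dict key
        return "".join("1" if b else "0" for b in bits)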
    def train(self):
        ''' Train the model. '''
        # bulk-download 50000 captchas and keep adding new ones daily
        # recognize the captchas, generate labels, build the training set
        # train the model
        # split the data into training, test and validation sets
        pass
    def predict(self, img: Image):
        ''' Predict the result.
        Given a captcha image: recognize the text label, cut out the 4*2 sub-images,
        classify each sub-image, and return the result.
        '''
        index = []
        pass
    def gen_label(self):
        ''' Generate the label list from the archive directory names. '''
        labels = []
        for root, dirs, files in os.walk("data/archive/"):
            # for file in files:
            #     # Access each file in the "data/archive/" directory
            #     file_path = os.path.join(root, file)
            #     print(file_path)
            for dir in dirs:
                dir_path = os.path.join(root, dir)
                labels.append(dir)
        df = pd.DataFrame(labels)
        df.to_csv("label.csv", header=False, index=False)
        np.save("label.npy", labels)
        # df.to_pickle("res.pkl")
    def remove_file_noise(self, im: np.ndarray, threshold=20):
        """
        Remove isolated black speckles from an image and fill each removed pixel
        by interpolating from its neighbours.
        :param im: image data as a numpy array (grayscale or BGR), e.g. from cv2.imread
        :param threshold: maximum value regarded as a black speckle
        :return: image data of the same shape
        """
        def _min(c_val, t_val):  # unused helper
            return c_val if c_val < t_val else t_val

        def _add(_val, _sum, _num):
            return _val + _sum, _num + 1

        def rm_blackNoisy_component(img, threshold):
            y, x = img.shape[:2]  # image height and width
            for _x in range(0, x, 1):
                for _y in range(0, y, 1):
                    cnt = 0
                    _sum = num = 0
                    # accumulate the values of the up to 8 neighbouring pixels
                    if _y > 0:
                        _sum, num = _add(img[_y - 1, _x], _sum, num)
                    if _y + 1 < y:
                        _sum, num = _add(img[_y + 1, _x], _sum, num)
                    if _x > 0:
                        _sum, num = _add(img[_y, _x - 1], _sum, num)
                    if _x + 1 < x:
                        _sum, num = _add(img[_y, _x + 1], _sum, num)
                    if _y > 0 and _x > 0:
                        _sum, num = _add(img[_y - 1, _x - 1], _sum, num)
                    if _y > 0 and _x + 1 < x:
                        _sum, num = _add(img[_y - 1, _x + 1], _sum, num)
                    if _y + 1 < y and _x > 0:
                        _sum, num = _add(img[_y + 1, _x - 1], _sum, num)
                    if _y + 1 < y and _x + 1 < x:
                        _sum, num = _add(img[_y + 1, _x + 1], _sum, num)
                    if cnt > 0 or num < 1:  # not an isolated point
                        continue
                    # isolated point: replace it with the neighbourhood average
                    # (note: a fixed offset of 100 is used here, not `threshold`)
                    average = _sum / num
                    if img[_y, _x] + 100 <= average:
                        img[_y, _x] = average
            return img

        if im.ndim == 3:
            # colour image: denoise each channel separately and merge them back
            b, g, r = cv2.split(im)
            _b = rm_blackNoisy_component(b, threshold)
            _g = rm_blackNoisy_component(g, threshold)
            _r = rm_blackNoisy_component(r, threshold)
            img = cv2.merge([_b, _g, _r])
        else:
            return rm_blackNoisy_component(im, threshold)
        return img
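

# Minimal usage sketch (not part of the original file): it assumes a conf dict
# shaped like {"proxy": {"proxy": "host:port"}}, which is how self.conf is read
# in download_captcha above. The proxy address below is hypothetical.
if __name__ == "__main__":
    conf = {"proxy": {"proxy": "127.0.0.1:8080"}}
    crawler = CrawlCaptcha(conf)
    crawler.download_captcha(number=10)  # fetch a few captchas via the new API
    crawler.recornize_captcha()          # cut the hard-coded sample into 4*2 sub-images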