2 years ago · b9b1780dfd
--- a/fuck12306/__init__.py
+++ b/fuck12306/__init__.py
@@ -1,5 +1,6 @@
 
				 
			
 
				 from fuck12306.crawl_12306 import Crawl12306
			
 
				+from fuck12306.crawl_captcha import CrawlCaptcha
			
 
				 
			
 
				 def main():
			
 
				     pass
			
--- a/fuck12306/api.py
+++ b/fuck12306/api.py
@@ -1,9 +1,12 @@
 
				 
			
 
				 
			
 
				 # 验证码图片
			
 
				-pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
			
 
				+pic_new_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&"
			
 
				 
			
 
				-pic_url2="https://kyfw.12306.cn/passport/captcha/captcha-image64"
			
 
				+pic_old_url="https://kyfw.12306.cn/passport/captcha/captcha-image64"
			
 
				+
			
 
				+# 验证码识别接口
			
 
				+recognize_url="https://xx/captcha-check"
			
 
				 
			
 
				 # 余票查询接口
			
 
				 remain = "https://kyfw.12306.cn/otn/leftTicket/queryA?leftTicketDTO.train_date=2018-09-30&leftTicketDTO.from_station=IZQ&leftTicketDTO.to_station=CBQ&purpose_codes=ADULT"
			
--- a/fuck12306/crawl_12306.py
+++ b/fuck12306/crawl_12306.py
@@ -51,7 +51,7 @@ class Crawl12306:
 
				                 "https": "https://" + self.conf["proxy"]["proxy"],
			
 
				             }
			
 
				             self.sess.proxies.update(proxies)
			
 
				-            res= self.sess.get(api.pic_url)
			
 
				+            res = self.sess.get(api.pic_new_url)
			
 
				             time.sleep(random.randint(3,5))
			
 
				             if res.status_code == 200:
			
 
				                 with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
			
@@ -62,8 +62,8 @@ class Crawl12306:
 
				         print("............... get code  pic ...........")
			
 
				         for i in range(number):
			
 
				             print("-----------%s--------------" % (i))
			
 
				-            res= self.sess.get(api.pic_url2)
			
 
				-            time.sleep(random.randint(1,3))
			
 
				+            res= self.sess.get(api.pic_old_url)
			
 
				+            time.sleep(random.uniform(0.5, 1))
			
 
				             if res.status_code == 200:
			
 
				                 img_str = json.loads(res.content)['image']
			
 
				                 with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
			
@@ -104,7 +104,8 @@ class Crawl12306:
 
				             time.sleep(5,10)
			
 
				             self.send_mail('有票了')
			
 
				 
			
 
				-    def send_mail(self,content:str):
			
 
				+    def send_mail(self, content: str):
			
 
				+        ''' send mail '''
			
 
				         message = MIMEText(content, 'plain', 'utf-8')
			
 
				         message['From'] = Header("肥肥", 'utf-8') 
			
 
				         message['To'] = Header("圆圆", 'utf-8')
			
@@ -124,5 +125,3 @@ class Crawl12306:
 
				     def check_update():
			
 
				         ''' check application update '''
			
 
				         pass
			
 
				-
			
 
				-        
			
--- a/fuck12306/crawl_captcha.py
+++ b/fuck12306/crawl_captcha.py
@@ -0,0 +1,209 @@
 
				+#!/usr/bin/env python
			
 
				+# -*- encoding: utf-8 -*-
			
 
				+'''
			
 
				+@Contact :   liuyuqi.gov@msn.cn
			
 
				+@Time    :   2023/07/12 14:58:48
			
 
				+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
			
 
				+@Desc    :   获取验证码，处理验证码
			
 
				+'''
			
 
				+from PIL import Image
			
 
				+import requests
			
 
				+import re
			
 
				+import sys
			
 
				+import os
			
 
				+import time
			
 
				+import random
			
 
				+import json
			
 
				+import base64
			
 
				+from fuck12306 import api
			
 
				+import pandas as pd
			
 
				+import numpy as np
			
 
				+# import cv2
			
 
				+# import torch,caffe,tensorflow
			
 
				+
			
 
				+
			
 
				+class CrawlCaptcha(object):
			
 
				+    ''' 验证码处理类 '''
			
 
				+
			
 
				+    def __init__(self):
			
 
				+        pass
			
 
				+
			
 
				+    def download_captcha(self, number=100):
			
 
				+        ''' get code picture 
			
 
				+            :param number: the number of captcha
			
 
				+        '''
			
 
				+        print("............... get code  pic ...........")
			
 
				+        for i in range(number):
			
 
				+            print("-----------%s--------------" % (i))
			
 
				+            # 设置代理
			
 
				+            proxies = {
			
 
				+                "http": "http://" + self.conf["proxy"]["proxy"],
			
 
				+                "https": "https://" + self.conf["proxy"]["proxy"],
			
 
				+            }
			
 
				+            agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
			
 
				+            login = 'https://kyfw.12306.cn/otn/login/init'
			
 
				+            domain = 'kyfw.12306.cn'
			
 
				+
			
 
				+            headers = {
			
 
				+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
			
 
				+                'Accept-Encoding': 'gzip, deflate, sdch, br',
			
 
				+                'Accept-Language': 'zh - CN, zh;q = 0.8',
			
 
				+                'Cache-Control': 'no - cache',
			
 
				+                'Connection': 'keep-alive',
			
 
				+                'Host': domain,
			
 
				+                'User-aget': agent,
			
 
				+                'Referer': login
			
 
				+            }
			
 
				+            self.sess.proxies.update(proxies)
			
 
				+            res = self.sess.get(
			
 
				+                api.pic_new_url+str(random.uniform(0, 1)), headers=headers)
			
 
				+            time.sleep(random.randint(1, 2))
			
 
				+            if res.status_code == 200:
			
 
				+                with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
			
 
				+                    file.write(res.content)
			
 
				+
			
 
				+    def download_captcha_v2(self, number=100):
			
 
				+        ''' get code picture old api 
			
 
				+            :param number: the number of captcha
			
 
				+        '''
			
 
				+        print("............... get code  pic ...........")
			
 
				+        for i in range(number):
			
 
				+            print("-----------%s--------------" % (i))
			
 
				+            res = self.sess.get(api.pic_old_url)
			
 
				+            time.sleep(random.randint(1, 3))
			
 
				+            if res.status_code == 200:
			
 
				+                img_str = json.loads(res.content)['image']
			
 
				+                with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
			
 
				+                    file.write(base64.b64decode(img_str))
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def get_sub_pic(im: Image, x, y) -> Image:
			
 
				+        ''' 验证码图片中截取图片部分 '''
			
 
				+        assert 0 <= x <= 3
			
 
				+        assert 0 <= y <= 2
			
 
				+        WITH = HEIGHT = 68
			
 
				+        left = 5 + (67 + 5) * x
			
 
				+        top = 41 + (67 + 5) * y
			
 
				+        right = left + 67
			
 
				+        bottom = top + 67
			
 
				+
			
 
				+        return im.crop((left, top, right, bottom))
			
 
				+
			
 
				+    def get_label(self, im: Image) -> list:
			
 
				+        ''' 获取image 中的标签 '''
			
 
				+        label = ["叉子"]
			
 
				+        left = 118
			
 
				+        img_size = im.size
			
 
				+        bottom = 29
			
 
				+        tmp = im.crop((left, 0, 118+122, bottom)).save("label.png")
			
 
				+        # 文字识别，分词，读取对于标签的hash字典
			
 
				+        return label
			
 
				+
			
 
				+    def recornize_captcha(self,):
			
 
				+        ''' recornize captcha ，图片尺寸：293*190 '''
			
 
				+        # 获取12306验证码
			
 
				+        img = Image.open("data/pic/pic_26173.png")
			
 
				+
			
 
				+        # 识别label
			
 
				+        label = self.get_label(img)
			
 
				+        # 读取子图
			
 
				+        for x in range(4):
			
 
				+            for y in range(2):  # 两行四列
			
 
				+                tmp = CrawlCaptcha.get_sub_pic(img, x, y)
			
 
				+                # res = self.sess.post(api.recognize_url, data={"img": base64.b64encode(img)})
			
 
				+                # save the tmp image
			
 
				+                # 计算hash值与标签的hash字典key进行比对
			
 
				+                tmp.save("data/%s%s.png" % (x, y))
			
 
				+
			
 
				+    def train(self):
			
 
				+        ''' train the model '''
			
 
				+        # 批量下载50000验证码，并每日不断更新
			
 
				+        # 识别验证码，生成标签，生成训练集
			
 
				+        # 训练模型
			
 
				+        # 数据划分:训练，测试，验证
			
 
				+
			
 
				+        pass
			
 
				+
			
 
				+    def predict(self, img: Image):
			
 
				+        ''' predict result 
			
 
				+            输入验证码图片，识别标签，识别 4*2 子图后，识别子图标签，返回结果
			
 
				+        '''
			
 
				+        index = []
			
 
				+        pass
			
 
				+
			
 
				+    def gen_label(self):
			
 
				+        ''' gen label '''
			
 
				+        labeles = []
			
 
				+        for root, dirs, files in os.walk("data/archive/"):
			
 
				+            # for file in files:
			
 
				+            #     # Access each file in "data/archive/" directory
			
 
				+            #     file_path = os.path.join(root, file)
			
 
				+            #     print(file_path)
			
 
				+
			
 
				+            for dir in dirs:
			
 
				+                dir_path = os.path.join(root, dir)
			
 
				+                labeles.append(dir)
			
 
				+        df = pd.DataFrame(labeles)
			
 
				+        df.to_csv("label.csv", header=False, index=False)
			
 
				+        np.save("label.npy", labeles)
			
 
				+        # df.to_pickle("res.pkl")
			
 
				+
			
 
				+    def remove_file_noise(self, im:Image, threshold=20):
			
 
				+        """
			
 
				+            该函数用来去除图像的中黑色斑点,同时采用插值法进行该点像素的补充
			
 
				+            ,param threshold, 设定黑色斑点的最大值
			
 
				+            ,param img, RGB图像的数据
			
 
				+            ,return, 返回RGB图像数据
			
 
				+        """
			
 
				+        def min(c_val, t_val):
			
 
				+            return c_val if c_val < t_val else t_val
			
 
				+
			
 
				+        def sum(_val, _sum, _num):
			
 
				+            return _val + _sum, _num + 1
			
 
				+
			
 
				+        def rm_blackNoisy_component(img, threshold):
			
 
				+            y, x = img.shape[:2]  # x,y表示坐标
			
 
				+            for _x in range(0, x, 1):
			
 
				+                for _y in range(0, y, 1):
			
 
				+                    cnt = 0
			
 
				+                    _sum = num = 0
			
 
				+                    if _y > 0:
			
 
				+                        _sum, num = sum(img[_y - 1, _x], _sum, num)
			
 
				+                    if _y + 1 < y:
			
 
				+                        _sum, num = sum(img[_y + 1, _x], _sum, num)
			
 
				+                    if _x > 0:
			
 
				+                        _sum, num = sum(img[_y, _x - 1], _sum, num)
			
 
				+                    if _x + 1 < x:
			
 
				+                        _sum, num = sum(img[_y, _x + 1], _sum, num)
			
 
				+                    if _y > 0 and _x > 0:
			
 
				+                        _sum, num = sum(img[_y - 1, _x - 1], _sum, num)
			
 
				+                    if _y > 0 and _x + 1 < x:
			
 
				+                        _sum, num = sum(img[_y - 1, _x + 1], _sum, num)
			
 
				+                    if _y + 1 < y and _x > 0:
			
 
				+                        _sum, num = sum(img[_y + 1, _x - 1], _sum, num)
			
 
				+
			
 
				+                    if _y + 1 < y and _x + 1 < x:
			
 
				+                        _sum, num = sum(img[_y + 1, _x + 1], _sum, num)
			
 
				+                    if cnt > 0 or num < 1:  # 说明不是单个点
			
 
				+                        continue
			
 
				+                    # 如果是孤点,则可以依据权重进行插值
			
 
				+                    average = _sum / num
			
 
				+                    if img[_y, _x] + 100 <= average:
			
 
				+                        img[_y, _x] = average
			
 
				+
			
 
				+            return img
			
 
				+        
			
 
				+        if im.ndim == 3:
			
 
				+            b, g, r = cv2.split(im)
			
 
				+            _b = rm_blackNoisy_component(b, threshold)
			
 
				+            _g = rm_blackNoisy_component(g, threshold)
			
 
				+            _r = rm_blackNoisy_component(r, threshold)
			
 
				+            img = cv2.merge([_b, _g, _r])  # 前面分离出来的三个通道
			
 
				+
			
 
				+        else:
			
 
				+            return rm_blackNoisy_component(im, threshold)
			
 
				+
			
 
				+        return img
			
 
				+    
			
 
				+
			
 
				+    
			
--- a/main.py
+++ b/main.py
@@ -12,12 +12,16 @@ import sys
 
				 import re
			
 
				 import argparse
			
 
				 import logging
			
 
				-from fuck12306 import Crawl12306
			
 
				+from fuck12306 import Crawl12306,CrawlCaptcha
			
 
				 
			
 
				 if __name__ == '__main__':
			
 
				-    print("fuck12306")
			
 
				+    print("---------------fuck12306-----------")
			
 
				     crawl12306 = Crawl12306()
			
 
				     # crawl12306.monitor()
			
 
				     # crawl12306.get_code_pic(299)
			
 
				-    crawl12306.get_code_pic2(10000)
			
 
				+    # crawl12306.get_code_pic2(10000)
			
 
				 
			
 
				+    # 验证码识别
			
 
				+    crawlcaptcha=CrawlCaptcha()
			
 
				+    # crawlcaptcha.recornize_captcha()
			
 
				+    crawlcaptcha.gen_label()