liuyuqi-dellpc committed 9 months ago (commit b9b1780dfd)
5 changed files with 227 additions and 11 deletions
  1. fuck12306/__init__.py  + 1 - 0
  2. fuck12306/api.py  + 5 - 2
  3. fuck12306/crawl_12306.py  + 5 - 6
  4. fuck12306/crawl_captcha.py  + 209 - 0
  5. main.py  + 7 - 3

+ 1 - 0
fuck12306/__init__.py

@@ -1,5 +1,6 @@
 
 from fuck12306.crawl_12306 import Crawl12306
+from fuck12306.crawl_captcha import CrawlCaptcha
 
 def main():
     pass

+ 5 - 2
fuck12306/api.py

@@ -1,9 +1,12 @@
 
 
 # Captcha image endpoints
-pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
+pic_new_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&"
 
-pic_url2="https://kyfw.12306.cn/passport/captcha/captcha-image64"
+pic_old_url = "https://kyfw.12306.cn/passport/captcha/captcha-image64"
+
+# Captcha recognition endpoint
+recognize_url = "https://xx/captcha-check"
 
 # Remaining-ticket query endpoint
 remain = "https://kyfw.12306.cn/otn/leftTicket/queryA?leftTicketDTO.train_date=2018-09-30&leftTicketDTO.from_station=IZQ&leftTicketDTO.to_station=CBQ&purpose_codes=ADULT"
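
The `remain` constant above bakes one specific query into the URL (train_date 2018-09-30, stations IZQ to CBQ). As a minimal sketch of how the same request could be parameterised with requests (the hypothetical query_left_tickets helper is illustrative only; 12306 may additionally require session cookies and browser-like headers):

import requests

def query_left_tickets(train_date: str, from_station: str, to_station: str) -> requests.Response:
    ''' illustrative helper: rebuild the hard-coded `remain` query with variable parameters '''
    url = "https://kyfw.12306.cn/otn/leftTicket/queryA"
    params = {
        "leftTicketDTO.train_date": train_date,      # e.g. "2018-09-30"
        "leftTicketDTO.from_station": from_station,  # e.g. "IZQ"
        "leftTicketDTO.to_station": to_station,      # e.g. "CBQ"
        "purpose_codes": "ADULT",
    }
    return requests.get(url, params=params, timeout=10)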

+ 5 - 6
fuck12306/crawl_12306.py

@@ -51,7 +51,7 @@ class Crawl12306:
                 "https": "https://" + self.conf["proxy"]["proxy"],
             }
             self.sess.proxies.update(proxies)
-            res= self.sess.get(api.pic_url)
+            res = self.sess.get(api.pic_new_url)
             time.sleep(random.randint(3,5))
             if res.status_code == 200:
                 with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
@@ -62,8 +62,8 @@ class Crawl12306:
         print("............... get code  pic ...........")
         for i in range(number):
             print("-----------%s--------------" % (i))
-            res= self.sess.get(api.pic_url2)
-            time.sleep(random.randint(1,3))
+            res = self.sess.get(api.pic_old_url)
+            time.sleep(random.uniform(0.5, 1))
             if res.status_code == 200:
                 img_str = json.loads(res.content)['image']
                 with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
@@ -104,7 +104,8 @@ class Crawl12306:
             time.sleep(random.randint(5, 10))
             self.send_mail('有票了')
 
-    def send_mail(self,content:str):
+    def send_mail(self, content: str):
+        ''' send mail '''
         message = MIMEText(content, 'plain', 'utf-8')
         message['From'] = Header("肥肥", 'utf-8') 
         message['To'] = Header("圆圆", 'utf-8')
@@ -124,5 +125,3 @@ class Crawl12306:
     def check_update():
         ''' check application update '''
         pass
-
-        
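
The hunk above shows only the MIMEText/Header part of send_mail. A minimal sketch of how such a notification could actually be delivered with smtplib follows; the SMTP host, account and password are placeholder assumptions, not values from this repository:

import smtplib
from email.mime.text import MIMEText
from email.header import Header

def send_mail(content: str):
    ''' sketch: deliver a plain-text notification mail over SMTP-over-SSL '''
    message = MIMEText(content, 'plain', 'utf-8')
    message['From'] = Header("肥肥", 'utf-8')
    message['To'] = Header("圆圆", 'utf-8')
    with smtplib.SMTP_SSL("smtp.example.com", 465) as server:   # placeholder SMTP host
        server.login("user@example.com", "app-password")        # placeholder credentials
        server.sendmail("user@example.com", ["to@example.com"], message.as_string())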

+ 209 - 0
fuck12306/crawl_captcha.py

@@ -0,0 +1,209 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2023/07/12 14:58:48
+@License :   Copyright © 2017-2022 liuyuqi. All Rights Reserved.
+@Desc    :   Download and process captcha images
+'''
+from PIL import Image
+import requests
+import re
+import sys
+import os
+import time
+import random
+import json
+import base64
+from fuck12306 import api
+import pandas as pd
+import numpy as np
+# import cv2
+# import torch,caffe,tensorflow
+
+
+class CrawlCaptcha(object):
+    ''' Captcha download and recognition helper '''
+
+    def __init__(self, conf: dict = None):
+        # conf is expected to carry the same proxy settings as Crawl12306
+        self.conf = conf if conf is not None else {}
+        self.sess = requests.Session()
+
+    def download_captcha(self, number=100):
+        ''' download captcha pictures via the new api
+            :param number: the number of captchas to download
+        '''
+        print("............... get code  pic ...........")
+        for i in range(number):
+            print("-----------%s--------------" % (i))
+            # set the proxy only when one is configured
+            if "proxy" in self.conf:
+                self.sess.proxies.update({
+                    "http": "http://" + self.conf["proxy"]["proxy"],
+                    "https": "https://" + self.conf["proxy"]["proxy"],
+                })
+            agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
+            login = 'https://kyfw.12306.cn/otn/login/init'
+            domain = 'kyfw.12306.cn'
+
+            headers = {
+                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
+                'Accept-Encoding': 'gzip, deflate, sdch, br',
+                'Accept-Language': 'zh-CN,zh;q=0.8',
+                'Cache-Control': 'no-cache',
+                'Connection': 'keep-alive',
+                'Host': domain,
+                'User-Agent': agent,
+                'Referer': login
+            }
+            res = self.sess.get(
+                api.pic_new_url+str(random.uniform(0, 1)), headers=headers)
+            time.sleep(random.randint(1, 2))
+            if res.status_code == 200:
+                with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
+                    file.write(res.content)
+
+    def download_captcha_v2(self, number=100):
+        ''' download captcha pictures via the old api
+            :param number: the number of captchas to download
+        '''
+        print("............... get code  pic ...........")
+        for i in range(number):
+            print("-----------%s--------------" % (i))
+            res = self.sess.get(api.pic_old_url)
+            time.sleep(random.randint(1, 3))
+            if res.status_code == 200:
+                img_str = json.loads(res.content)['image']
+                with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
+                    file.write(base64.b64decode(img_str))
+
+    @staticmethod
+    def get_sub_pic(im: Image, x, y) -> Image:
+        ''' crop one of the sub-image tiles out of the captcha picture '''
+        assert 0 <= x <= 3
+        assert 0 <= y <= 2
+        WIDTH = HEIGHT = 67  # each tile is 67x67 px, laid out with a 5 px gap
+        left = 5 + (WIDTH + 5) * x
+        top = 41 + (HEIGHT + 5) * y
+        right = left + WIDTH
+        bottom = top + HEIGHT
+
+        return im.crop((left, top, right, bottom))
+
+    def get_label(self, im: Image) -> list:
+        ''' extract the text label from the top strip of the captcha image '''
+        label = ["叉子"]  # placeholder label
+        left = 118
+        bottom = 29
+        # crop the label strip and save it for later OCR
+        im.crop((left, 0, 118 + 122, bottom)).save("label.png")
+        # TODO: OCR the text, segment the words, and look them up in the label hash dict
+        return label
+
+    def recornize_captcha(self):
+        ''' recognize a captcha; the full image is 293x190 px '''
+        # load a previously downloaded 12306 captcha
+        img = Image.open("data/pic/pic_26173.png")
+
+        # recognize the text label
+        label = self.get_label(img)
+        # read the sub-images
+        for x in range(4):
+            for y in range(2):  # two rows, four columns
+                tmp = CrawlCaptcha.get_sub_pic(img, x, y)
+                # res = self.sess.post(api.recognize_url, data={"img": base64.b64encode(img)})
+                # save the tmp image
+                # compute its hash and compare against the keys of the label hash dict
+                tmp.save("data/%s%s.png" % (x, y))
+
+    def train(self):
+        ''' train the model '''
+        # bulk-download 50000 captchas and keep refreshing the set daily
+        # recognize the captchas and generate labels to build the training set
+        # train the model
+        # split the data into training, test and validation sets
+
+        pass
+
+    def predict(self, img: Image):
+        ''' predict result
+            given a captcha image: recognize its text label, split out the 4x2 sub-images,
+            classify each sub-image, and return the matching result
+        '''
+        index = []
+        pass
+
+    def gen_label(self):
+        ''' gen label '''
+        labels = []
+        for root, dirs, files in os.walk("data/archive/"):
+            # for file in files:
+            #     # Access each file in "data/archive/" directory
+            #     file_path = os.path.join(root, file)
+            #     print(file_path)
+
+            for dir_name in dirs:
+                labels.append(dir_name)
+        df = pd.DataFrame(labels)
+        df.to_csv("label.csv", header=False, index=False)
+        np.save("label.npy", labels)
+        # df.to_pickle("res.pkl")
+
+    def remove_file_noise(self, im: np.ndarray, threshold=20):
+        """ Remove isolated dark speckles from an image and fill those pixels by
+            interpolating their neighbours.
+            :param im: image data as a numpy array (grayscale or BGR)
+            :param threshold: maximum value regarded as a dark speckle
+            :return: the cleaned image data
+        """
+        import cv2  # local import: only this method needs OpenCV
+
+        def sum(_val, _sum, _num):
+            return _val + _sum, _num + 1
+
+        def rm_blackNoisy_component(img, threshold):
+            y, x = img.shape[:2]  # image height and width
+            for _x in range(0, x, 1):
+                for _y in range(0, y, 1):
+                    _sum = num = 0
+                    if _y > 0:
+                        _sum, num = sum(img[_y - 1, _x], _sum, num)
+                    if _y + 1 < y:
+                        _sum, num = sum(img[_y + 1, _x], _sum, num)
+                    if _x > 0:
+                        _sum, num = sum(img[_y, _x - 1], _sum, num)
+                    if _x + 1 < x:
+                        _sum, num = sum(img[_y, _x + 1], _sum, num)
+                    if _y > 0 and _x > 0:
+                        _sum, num = sum(img[_y - 1, _x - 1], _sum, num)
+                    if _y > 0 and _x + 1 < x:
+                        _sum, num = sum(img[_y - 1, _x + 1], _sum, num)
+                    if _y + 1 < y and _x > 0:
+                        _sum, num = sum(img[_y + 1, _x - 1], _sum, num)
+
+                    if _y + 1 < y and _x + 1 < x:
+                        _sum, num = sum(img[_y + 1, _x + 1], _sum, num)
+                    if num < 1:  # no neighbours to sample from
+                        continue
+                    # an isolated dark point is filled in from its neighbour average
+                    average = _sum / num
+                    if int(img[_y, _x]) + 100 <= average:  # cast avoids uint8 overflow
+                        img[_y, _x] = average
+
+            return img
+        
+        if im.ndim == 3:
+            b, g, r = cv2.split(im)
+            _b = rm_blackNoisy_component(b, threshold)
+            _g = rm_blackNoisy_component(g, threshold)
+            _r = rm_blackNoisy_component(r, threshold)
+            img = cv2.merge([_b, _g, _r])  # re-merge the three channels split above
+
+        else:
+            return rm_blackNoisy_component(im, threshold)
+
+        return img
+    
+
+    
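
recornize_captcha above leaves the "compute its hash and compare against the keys of the label hash dict" step as a comment. One common way to implement it is a perceptual average hash per tile, matched by Hamming distance against a dictionary built offline from labelled tiles; the hash_to_label dictionary and the aHash choice are assumptions here, not something shipped in this commit:

from PIL import Image

def average_hash(tile: Image.Image, hash_size: int = 8) -> str:
    ''' aHash: shrink to 8x8 grayscale and threshold every pixel against the mean '''
    small = tile.convert("L").resize((hash_size, hash_size))
    pixels = list(small.getdata())
    mean = sum(pixels) / len(pixels)
    return "".join("1" if p > mean else "0" for p in pixels)

def match_tile(tile: Image.Image, hash_to_label: dict) -> str:
    ''' pick the label whose stored hash is closest (by Hamming distance) to the tile's hash '''
    h = average_hash(tile)
    best = min(hash_to_label, key=lambda k: sum(a != b for a, b in zip(k, h)))
    return hash_to_label[best]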

+ 7 - 3
main.py

@@ -12,12 +12,16 @@ import sys
 import re
 import argparse
 import logging
-from fuck12306 import Crawl12306
+from fuck12306 import Crawl12306, CrawlCaptcha
 
 if __name__ == '__main__':
-    print("fuck12306")
+    print("---------------fuck12306-----------")
     crawl12306 = Crawl12306()
     # crawl12306.monitor()
     # crawl12306.get_code_pic(299)
-    crawl12306.get_code_pic2(10000)
+    # crawl12306.get_code_pic2(10000)
 
+    # captcha recognition
+    crawlcaptcha = CrawlCaptcha()
+    # crawlcaptcha.recornize_captcha()
+    crawlcaptcha.gen_label()
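
gen_label() derives label.csv / label.npy from the sub-directory names under data/archive/, while train() in crawl_captcha.py describes the rest of the pipeline only as comments. A minimal sketch of the train/test/validation split that pipeline calls for, assuming the folder-per-label layout implied by gen_label() and arbitrary 80/10/10 ratios (the split_dataset helper is hypothetical):

import os
import random

def split_dataset(archive_dir: str = "data/archive/", ratios=(0.8, 0.1, 0.1)):
    ''' collect (file_path, label) pairs from a folder-per-label archive and split them '''
    samples = []
    for label in os.listdir(archive_dir):
        label_dir = os.path.join(archive_dir, label)
        if not os.path.isdir(label_dir):
            continue
        for name in os.listdir(label_dir):
            samples.append((os.path.join(label_dir, name), label))
    random.shuffle(samples)
    n_train = int(len(samples) * ratios[0])
    n_test = int(len(samples) * ratios[1])
    return samples[:n_train], samples[n_train:n_train + n_test], samples[n_train + n_test:]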