crawl_captcha.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Contact : liuyuqi.gov@msn.cn
@Time : 2023/07/12 14:58:48
@License : Copyright © 2017-2022 liuyuqi. All Rights Reserved.
@Desc : Fetch captcha images and process them.
'''
from PIL import Image
import requests
import re
import sys
import os
import time
import random
import json
import base64
from fuck12306 import api
import pandas as pd
import numpy as np
import cv2  # needed by remove_file_noise for channel split/merge
# import torch,caffe,tensorflow


class CrawlCaptcha(object):
    ''' Captcha handling class. '''

    def __init__(self, conf: dict = None):
        # conf is assumed to be shaped like {"proxy": {"proxy": "host:port"}},
        # which is how self.conf is read in download_captcha below
        self.conf = conf or {}
        self.sess = requests.Session()
    def download_captcha(self, number=100):
        ''' Fetch captcha pictures (new API).
        :param number: the number of captchas to download
        '''
        print("............... get code pic ...........")
        os.makedirs("data/pic", exist_ok=True)
        for i in range(number):
            print("-----------%s--------------" % (i))
            # set up the HTTP/HTTPS proxy
            proxies = {
                "http": "http://" + self.conf["proxy"]["proxy"],
                "https": "https://" + self.conf["proxy"]["proxy"],
            }
            agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.94 Safari/537.36'
            login = 'https://kyfw.12306.cn/otn/login/init'
            domain = 'kyfw.12306.cn'
            headers = {
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding': 'gzip, deflate, sdch, br',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cache-Control': 'no-cache',
                'Connection': 'keep-alive',
                'Host': domain,
                'User-Agent': agent,
                'Referer': login
            }
            self.sess.proxies.update(proxies)
            res = self.sess.get(
                api.pic_new_url + str(random.uniform(0, 1)), headers=headers)
            time.sleep(random.randint(1, 2))
            if res.status_code == 200:
                with open("data/pic/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
                    file.write(res.content)
    def download_captcha_v2(self, number=100):
        ''' Fetch captcha pictures (old API).
        :param number: the number of captchas to download
        '''
        print("............... get code pic ...........")
        os.makedirs("data/pic2", exist_ok=True)
        for i in range(number):
            print("-----------%s--------------" % (i))
            res = self.sess.get(api.pic_old_url)
            time.sleep(random.randint(1, 3))
            if res.status_code == 200:
                # the old API returns the picture as a base64 string in a JSON field
                img_str = json.loads(res.content)['image']
                with open("data/pic2/pic_%s%s.png" % (int(time.time()), i), "wb") as file:
                    file.write(base64.b64decode(img_str))
    @staticmethod
    def get_sub_pic(im: Image, x, y) -> Image:
        ''' Crop one sub-image (column x, row y) out of the captcha picture. '''
        assert 0 <= x <= 3
        assert 0 <= y <= 1
        WIDTH = HEIGHT = 68  # nominal size of one cell
        left = 5 + (67 + 5) * x
        top = 41 + (67 + 5) * y
        right = left + 67
        bottom = top + 67
        return im.crop((left, top, right, bottom))
    def get_label(self, im: Image) -> list:
        ''' Extract the text label from the captcha image. '''
        label = ["叉子"]  # placeholder label
        left = 118
        img_size = im.size
        bottom = 29
        # crop the prompt-text area and save it for later recognition
        tmp = im.crop((left, 0, 118 + 122, bottom)).save("label.png")
        # OCR the text, segment it, then look it up in the label hash dictionary
        return label
    def recornize_captcha(self,):
        ''' Recognize a captcha; the picture size is 293*190. '''
        # open a downloaded 12306 captcha
        img = Image.open("data/pic/pic_26173.png")
        # recognize the text label
        label = self.get_label(img)
        # read the sub-images: two rows, four columns
        for x in range(4):
            for y in range(2):
                tmp = CrawlCaptcha.get_sub_pic(img, x, y)
                # res = self.sess.post(api.recognize_url, data={"img": base64.b64encode(img)})
                # save the sub-image
                # compare its hash against the keys of the label hash dictionary
                tmp.save("data/%s%s.png" % (x, y))
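    # The comments above mention comparing each sub-image's hash against a label
    # hash dictionary, but no hash function is defined in this file. The helper
    # below is only an illustrative sketch of one common choice (an average hash
    # built with PIL/numpy); the name `average_hash` and the 8x8 size are
    # assumptions, not part of the original code.
    @staticmethod
    def average_hash(im: Image, hash_size: int = 8) -> str:
        ''' Compute a simple average hash: grayscale, shrink, threshold on the mean. '''
        small = im.convert("L").resize((hash_size, hash_size))
        pixels = np.asarray(small, dtype=np.float32)
        bits = (pixels > pixels.mean()).flatten()
        # return the bits as a string so it can be used directly as a dict key
        return "".join("1" if b else "0" for b in bits)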
    def train(self):
        ''' Train the model. '''
        # bulk-download 50000 captchas and keep adding new ones daily
        # recognize the captchas, generate labels, build the training set
        # train the model
        # split the data into training, test and validation sets
        pass
    def predict(self, img: Image):
        ''' Predict the result.
        Given a captcha image: recognize the text label, cut out the 4*2 sub-images,
        classify each sub-image, and return the result.
        '''
        index = []
        pass
    def gen_label(self):
        ''' Generate the label list from the archive directory names. '''
        labels = []
        for root, dirs, files in os.walk("data/archive/"):
            # for file in files:
            #     # Access each file in the "data/archive/" directory
            #     file_path = os.path.join(root, file)
            #     print(file_path)
            for dir in dirs:
                dir_path = os.path.join(root, dir)
                labels.append(dir)
        df = pd.DataFrame(labels)
        df.to_csv("label.csv", header=False, index=False)
        np.save("label.npy", labels)
        # df.to_pickle("res.pkl")
    def remove_file_noise(self, im: np.ndarray, threshold=20):
        """
        Remove isolated black speckles from an image and fill each removed pixel
        by interpolating from its neighbours.
        :param im: image data as a numpy array (grayscale or BGR), e.g. from cv2.imread
        :param threshold: maximum value regarded as a black speckle
        :return: image data of the same shape
        """
        def _min(c_val, t_val):  # unused helper
            return c_val if c_val < t_val else t_val

        def _add(_val, _sum, _num):
            return _val + _sum, _num + 1

        def rm_blackNoisy_component(img, threshold):
            y, x = img.shape[:2]  # image height and width
            for _x in range(0, x, 1):
                for _y in range(0, y, 1):
                    cnt = 0
                    _sum = num = 0
                    # accumulate the values of the up to 8 neighbouring pixels
                    if _y > 0:
                        _sum, num = _add(img[_y - 1, _x], _sum, num)
                    if _y + 1 < y:
                        _sum, num = _add(img[_y + 1, _x], _sum, num)
                    if _x > 0:
                        _sum, num = _add(img[_y, _x - 1], _sum, num)
                    if _x + 1 < x:
                        _sum, num = _add(img[_y, _x + 1], _sum, num)
                    if _y > 0 and _x > 0:
                        _sum, num = _add(img[_y - 1, _x - 1], _sum, num)
                    if _y > 0 and _x + 1 < x:
                        _sum, num = _add(img[_y - 1, _x + 1], _sum, num)
                    if _y + 1 < y and _x > 0:
                        _sum, num = _add(img[_y + 1, _x - 1], _sum, num)
                    if _y + 1 < y and _x + 1 < x:
                        _sum, num = _add(img[_y + 1, _x + 1], _sum, num)
                    if cnt > 0 or num < 1:  # not an isolated point
                        continue
                    # isolated point: replace it with the neighbourhood average
                    # (note: a fixed offset of 100 is used here, not `threshold`)
                    average = _sum / num
                    if img[_y, _x] + 100 <= average:
                        img[_y, _x] = average
            return img

        if im.ndim == 3:
            # colour image: denoise each channel separately and merge them back
            b, g, r = cv2.split(im)
            _b = rm_blackNoisy_component(b, threshold)
            _g = rm_blackNoisy_component(g, threshold)
            _r = rm_blackNoisy_component(r, threshold)
            img = cv2.merge([_b, _g, _r])
        else:
            return rm_blackNoisy_component(im, threshold)
        return img
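

# Minimal usage sketch (not part of the original file): it assumes a conf dict
# shaped like {"proxy": {"proxy": "host:port"}}, which is how self.conf is read
# in download_captcha above. The proxy address below is hypothetical.
if __name__ == "__main__":
    conf = {"proxy": {"proxy": "127.0.0.1:8080"}}
    crawler = CrawlCaptcha(conf)
    crawler.download_captcha(number=10)  # fetch a few captchas via the new API
    crawler.recornize_captcha()          # cut the hard-coded sample into 4*2 sub-images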