123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146 |
- import re
- import ssl
- import urllib
- import requests
- from PIL import Image
- from PIL import ImageFilter
# Best-effort: make the unverified context factory the default for HTTPS
# connections opened through the standard library, which disables certificate
# verification for them.  When the ssl module does not expose the private
# factory, the default is left untouched.
try:
    ssl._create_default_https_context = ssl._create_unverified_context
except AttributeError:
    pass

# Browser-like User-Agent header value sent with the Baidu upload request.
UA = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.89 Safari/537.36"

# URL the captcha image is downloaded from.
pic_url = "https://kyfw.12306.cn/otn/passcodeNew/getPassCodeNew?module=login&rand=sjrand&0.21191171556711197"
def get_img():
    """Download the captcha image and return it as an opened PIL image.

    The raw response bytes are written to ``./tmp.jpg`` and the returned
    image is opened from that file.

    Returns:
        The result of ``Image.open("./tmp.jpg")``.
    """
    # ``urlopen`` lives in ``urllib.request`` on Python 3; the bare
    # ``urllib.urlopen`` spelling only exists on Python 2.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    resp = urlopen(pic_url)
    try:
        raw = resp.read()
    finally:
        # Release the connection even if reading fails.
        resp.close()
    with open("./tmp.jpg", 'wb') as fp:
        fp.write(raw)
    return Image.open("./tmp.jpg")
def get_sub_img(im, x, y):
    """Crop one tile out of the captcha image grid.

    Args:
        im: Image-like object exposing ``crop((left, top, right, bottom))``.
        x: Column index of the tile; must satisfy ``0 <= x <= 3``.
        y: Row index of the tile; must satisfy ``0 <= y <= 2``.

    Returns:
        Whatever ``im.crop`` returns for the tile's bounding box.
    """
    assert 0 <= x <= 3
    assert 0 <= y <= 2
    # NOTE(review): the script's own main loop only requests rows 0 and 1;
    # confirm whether y == 2 is a real row before tightening this bound.
    tile = 67          # edge length of one tile, in pixels
    pitch = tile + 5   # distance between the origins of adjacent tiles
    left = 5 + pitch * x
    top = 41 + pitch * y
    return im.crop((left, top, left + tile, top + tile))
def baidu_image_upload(im):
    """Upload an image to Baidu's picture-upload endpoint.

    The image is first saved to ``./query_temp_img.png``; the bytes of that
    file are then posted as multipart form data together with the form
    fields the endpoint is sent.

    Args:
        im: Image object exposing ``save(path)``.

    Returns:
        ``"http://image.baidu.com"`` concatenated with the response body
        text.
    """
    url = "http://image.baidu.com/pictureup/uploadshitu?fr=flash&fm=index&pos=upload"
    im.save("./query_temp_img.png")
    # Read through a context manager so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open("./query_temp_img.png", 'rb') as fp:
        raw = fp.read()
    files = {
        'fileheight': "0",
        'newfilesize': str(len(raw)),
        'compresstime': "0",
        'Filename': "image.png",
        'filewidth': "0",
        'filesize': str(len(raw)),
        'filetype': 'image/png',
        'Upload': "Submit Query",
        'filedata': ("image.png", raw),
    }
    resp = requests.post(url, files=files, headers={'User-Agent': UA})
    redirect_url = "http://image.baidu.com" + resp.text
    return redirect_url
def baidu_stu_lookup(im):
    """Look an image up on Baidu and return the tags parsed from the result.

    The image is uploaded with ``baidu_image_upload``, the URL it returns is
    fetched, and the page text is handed to ``baidu_stu_html_extract``.

    Args:
        im: Image object accepted by ``baidu_image_upload``.

    Returns:
        The string produced by ``baidu_stu_html_extract`` for the fetched
        page.
    """
    result_page = requests.get(baidu_image_upload(im))
    return baidu_stu_html_extract(result_page.text)
def baidu_stu_html_extract(html):
    """Extract the first ``'multitags'`` value from a result page.

    Args:
        html: Page text to search.

    Returns:
        The whitespace-separated tags of the first ``'multitags'`` field
        joined with ``'|'``; ``'[ERROR?]'`` when no such field is present;
        ``'[UNKOWN]'`` when the field holds no tags.
    """
    found = re.search(r"'multitags':\s*'(.*?)'", html)
    if found is None:
        return '[ERROR?]'
    # Bare split() treats tabs like any other whitespace and never yields
    # empty strings, so no further filtering is required.
    tags = found.group(1).split()
    if not tags:
        return '[UNKOWN]'
    return '|'.join(tags)
def ocr_question_extract(im):
    """OCR the question text region of the captcha image.

    Relies on the optional ``pytesseract`` package
    (git@github.com:madmaze/pytesseract.git), which is imported lazily.

    Args:
        im: Captcha image; the box ``(127, 3, 260, 22)`` is cropped out,
            passed through ``pre_ocr_processing`` and recognised with the
            ``chi_sim`` language data.

    Returns:
        The recognised text with surrounding whitespace stripped, or
        ``None`` when ``pytesseract`` cannot be imported.
    """
    # The import binds the module at module level, as the original did.
    global pytesseract
    try:
        import pytesseract
    except ImportError:
        # Only a missing dependency is expected here; anything else should
        # surface instead of being silently swallowed.
        print("[ERROR] pytesseract not installed")
        return None
    im = im.crop((127, 3, 260, 22))
    im = pre_ocr_processing(im)
    return pytesseract.image_to_string(im, lang='chi_sim').strip()
def pre_ocr_processing(im):
    """Prepare an image for OCR and return a binarized greyscale copy.

    A blurred, max-filtered version of the RGB image is subtracted from the
    image channel by channel; each output pixel is 255 plus the largest
    channel difference, capped at 255.  The result is then thresholded at
    150 with ``binarize``.

    Args:
        im: Source image; it is converted to RGB before processing.

    Returns:
        A new mode ``'L'`` image.
    """
    rgb = im.convert("RGB")
    backdrop = rgb.filter(ImageFilter.BLUR).filter(ImageFilter.MaxFilter(23))
    grey = rgb.convert('L')

    source_px = rgb.load()
    backdrop_px = backdrop.load()
    grey_px = grey.load()

    columns, rows = rgb.size
    for row in range(rows):
        for col in range(columns):
            pixel = source_px[col, row]
            reference = backdrop_px[col, row]
            strongest = max(
                255 + channel - base
                for channel, base in zip(pixel[:3], reference[:3])
            )
            grey_px[col, row] = min(255, strongest)

    output = grey.copy()
    binarize(output, 150)
    return output
def binarize(im, thresh=120):
    """Threshold a greyscale image in place.

    Every pixel below ``thresh`` becomes 0 and every other pixel becomes
    255.

    Args:
        im: Image in mode ``'L'``; it is modified in place.
        thresh: Cut-off value, strictly between 0 and 255.

    Returns:
        None.
    """
    assert 0 < thresh < 255
    assert im.mode == 'L'
    columns, rows = im.size
    for row in range(rows):
        for col in range(columns):
            point = (col, row)
            im.putpixel(point, 0 if im.getpixel(point) < thresh else 255)
if __name__ == '__main__':
    # Script entry point: fetch a captcha, print the OCR'd question, then
    # print the Baidu lookup result for each tile of the 2 x 4 grid.
    im = get_img()
    try:
        # A single pre-formatted argument prints the same text whether
        # ``print`` is a statement (Python 2) or a function (Python 3).
        print('OCR Question: %s' % (ocr_question_extract(im),))
    except Exception as e:
        # OCR is optional; report the failure and carry on with the lookups.
        print('<OCR failed> %s' % (e,))
    for y in range(2):
        for x in range(4):
            im2 = get_sub_img(im, x, y)
            result = baidu_stu_lookup(im2)
            print('%s %s' % ((y, x), result))
|