liuyuqi-dellpc 4 years ago
commit
efdb552c5a

+ 13 - 0
README.md

@@ -0,0 +1,13 @@
+## 新闻电讯爬虫
+
+初步完成:单线程的文明爬虫(每下载完一个文件后随机休息 1-2 秒)。
+
+
+```
+cd my_project_dir
+virtualenv -p /opt/python/bin/python3 venv
+source venv/bin/activate
+pip install -r requirements.txt
+python main.py
+
+```

+ 0 - 0
data/.gitkeep


+ 63 - 0
main.py

@@ -0,0 +1,63 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/11/11 04:40:28
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   按照规则下载 PDF 文件,直到请求无效停止
+'''
+import os
+import sys
+import json
+import re
+import time
+import datetime
+from contextlib import closing
+import requests
+import utils.DownloadProgress as DownloadProgress
+from concurrent.futures import ThreadPoolExecutor
+import random
+import utils.user_agent as user_agent
+
+def get_link():
+    """Placeholder for link discovery; not implemented yet, returns None."""
+    return None
+
+def downNews(url, fileName, timeout=30):
+    """Download ``url`` into ``./data/<fileName>`` and print progress.
+
+    The download is skipped when the target file already exists with exactly
+    the size announced by the server's ``Content-Length`` header.
+
+    Args:
+        url: Address of the file to fetch.
+        fileName: Path relative to ``./data/``; missing parent directories
+            are created.
+        timeout: Seconds ``requests`` waits on the server before giving up.
+
+    Raises:
+        requests.RequestException: If the request fails or times out.
+    """
+    chunkSize = 1024
+    fileD = "./data/" + fileName
+    with closing(requests.get(url=url, headers=user_agent.getheaders(),
+                              stream=True, timeout=timeout)) as response:
+        if response.status_code != 200:
+            # Never store an error page under a .pdf name.
+            print("skip %s: HTTP %s" % (fileName, response.status_code))
+            return
+
+        # Content-Length is optional, so look it up instead of indexing it.
+        length = response.headers.get("content-length")
+        contentSize = int(length) if length and length.isdigit() else None
+
+        if (contentSize is not None and os.path.exists(fileD)
+                and os.path.getsize(fileD) == contentSize):
+            print("跳过" + fileName)
+            return
+
+        # exist_ok avoids failing when the directory appears between a
+        # check and the creation.
+        os.makedirs(os.path.dirname(fileD), exist_ok=True)
+
+        # Without a known total there is nothing meaningful to display.
+        progress = None
+        if contentSize:
+            progress = DownloadProgress.DownloadProgress(
+                fileName, total=contentSize, unit="KB", chunk_size=chunkSize,
+                run_status="downloading", fin_status="downloaded")
+
+        with open(fileD, "wb") as file:
+            for data in response.iter_content(chunk_size=chunkSize):
+                file.write(data)
+                if progress is not None:
+                    progress.refresh(count=len(data))
+
+def crawl(days=3649, pages=16):
+    """Download the PDF pages of each of the last ``days`` days.
+
+    For every day, pages ``01`` .. ``pages`` are probed with a HEAD request
+    and downloaded through ``downNews`` when the server answers 200.
+
+    Args:
+        days: How many days back from today to visit, starting with
+            yesterday. The default matches the original ``range(1, 3650)``.
+        pages: Highest page number probed per day.
+    """
+    today = datetime.date.today()
+    for i in range(1, days + 1):
+        yestday = (today - datetime.timedelta(days=i)).strftime("%Y%m%d")
+        for j in range(1, pages + 1):
+            # "%02d" keeps every page two digits wide; the former "0%s"
+            # turned page 10 into "010.pdf".
+            fileName = "%s/%02d.pdf" % (yestday, j)
+            url = "http://mrdx.cn/PDF/" + fileName
+            try:
+                # Check that the link is valid before downloading the body.
+                response = requests.head(
+                    url, headers=user_agent.getheaders(), timeout=10)
+                if response.status_code != 200:
+                    continue
+                downNews(url, fileName)
+            except requests.RequestException as err:
+                # One unreachable page must not abort the whole crawl.
+                print("request failed for %s: %s" % (url, err))
+                continue
+            time.sleep(random.randint(1, 2))  # polite crawling
+
+if __name__ == "__main__":
+    # Time the whole run; the output directory is created on first use.
+    began = time.time()
+    data_dir = "data"
+    if not os.path.exists(data_dir):
+        os.makedirs(data_dir)
+    crawl()
+    elapsed = time.time() - began
+    print("last time: {} s".format(elapsed))

+ 1 - 0
requirements.txt

@@ -0,0 +1 @@
+requests

+ 41 - 0
test/file_cache.py

@@ -0,0 +1,41 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/11/11 13:38:11
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   文件缓存
+'''
+import json
+import threading
+
+# Value placed in the X-Auth-Token header built by tenantid_to_tenant_name.
+TOKEN = "asfa"
+
+
+def persist_to_file(file_name):
+    """Return a decorator that memoizes a one-argument function in a JSON file.
+
+    The cache is loaded from ``file_name`` when the decorator is applied and
+    rewritten after every miss. Entries are keyed by the JSON encoding of the
+    argument, so a value stored under ``1`` is found again after a reload
+    (JSON object keys are always strings) and ``1`` and ``"1"`` stay distinct.
+
+    Args:
+        file_name: Path of the JSON cache file; a missing or unparsable file
+            starts an empty cache.
+
+    Returns:
+        A decorator for functions taking one JSON-serializable argument and
+        returning a JSON-serializable value.
+    """
+    import functools
+
+    def decorator(original_func):
+        try:
+            with open(file_name, 'r') as cache_file:
+                cache = json.load(cache_file)
+        except (IOError, ValueError):
+            cache = {}
+        if not isinstance(cache, dict):
+            # Valid JSON that is not an object cannot serve as a cache.
+            cache = {}
+
+        @functools.wraps(original_func)
+        def new_func(param):
+            key = json.dumps(param)
+            if key not in cache:
+                cache[key] = original_func(param)
+                with open(file_name, 'w') as cache_file:
+                    json.dump(cache, cache_file, indent=4)
+            return cache[key]
+
+        return new_func
+
+    return decorator
+
+@persist_to_file('cache.dat')
+def tenantid_to_tenant_name(tenantid):
+    """Demo lookup: print ``tenantid`` and return a fixed string.
+
+    The auth header dict is built but nothing in this function uses it.
+    Results are cached in ``cache.dat`` by the decorator.
+    """
+    auth_headers = {'X-Auth-Token': TOKEN}
+    print(tenantid)
+    return "I love you"
+
+if __name__ == "__main__":
+    # Exercise the cached function with the ids 0..99.
+    for tenant in range(100):
+        tenantid_to_tenant_name(tenant)

+ 15 - 0
test/head_request.py

@@ -0,0 +1,15 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/11/11 14:31:42
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   None
+'''
+import requests
+
+# HEAD asks for the response headers only, so no PDF body is requested.
+response = requests.head(r"http://mrdx.cn/PDF/20190228/01.pdf", timeout=10)
+# Print the size announced by the server. ``__sizeof__`` only measured the
+# in-memory Response object, not the remote file.
+print(response.headers.get("content-length"))
+print(response.status_code)

+ 39 - 0
utils/DownloadProgress.py

@@ -0,0 +1,39 @@
+# -*- coding: utf-8 -*-
+'''
+下载进度
+@Auther :liuyuqi.gov@msn.cn
+@date :2019/4/8
+'''
+__author__ = "liuyuqi"
+
+
+class DownloadProgress(object):
+    """Single-line console progress indicator.
+
+    Each ``refresh`` call adds to ``count`` and prints one line. The line
+    ends with a carriage return while ``count`` is below ``total`` and with
+    a newline once ``count`` reaches it.
+    """
+
+    def __init__(self, title, count=0.0, run_status=None, fin_status=None, total=100.0, unit='', sep='/',
+                 chunk_size=1.0):
+        super(DownloadProgress, self).__init__()
+        # Layout: [title] status progress unit separator total unit
+        self.info = "[%s] %s %.2f %s %s %.2f %s"
+        self.title, self.total, self.count = title, total, count
+        self.chunk_size = chunk_size
+        self.unit, self.seq = unit, sep
+        self.status = run_status if run_status else ""
+        # With no finish text, fall back to blanks as wide as the run status.
+        self.fin_status = fin_status if fin_status else " " * len(self.status)
+
+    def __get_info(self):
+        """Build the progress line; count and total are divided by chunk_size."""
+        done = self.count / self.chunk_size
+        whole = self.total / self.chunk_size
+        fields = (self.title, self.status, done, self.unit, self.seq, whole, self.unit)
+        return self.info % fields
+
+    def refresh(self, count=1, status=None):
+        """Add ``count`` to the running total and print the progress line.
+
+        A truthy ``status`` replaces the displayed status. Once the total is
+        reached, ``fin_status`` is shown unless ``status`` was given.
+        """
+        self.count += count
+        if status:
+            self.status = status
+        finished = self.count >= self.total
+        if finished:
+            self.status = status or self.fin_status
+        print(self.__get_info(), end='\n' if finished else "\r")

BIN
utils/__pycache__/DownloadProgress.cpython-36.pyc


BIN
utils/__pycache__/user_agent.cpython-36.pyc


+ 79 - 0
utils/user_agent.py

@@ -0,0 +1,79 @@
+# -*-coding:utf-8 -*-
+
+import random
+
+# Desktop browser User-Agent strings. Built once at import time instead of
+# on every getheaders() call.
+_PC_USER_AGENTS = (
+    # Opera
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
+    "Opera/8.0 (Windows NT 5.1; U; en)",
+    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
+    # Firefox
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
+    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
+    # Safari
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/534.57.2 (KHTML, like Gecko) Version/5.1.7 Safari/534.57.2",
+    # Chrome
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.2171.71 Safari/537.36",
+    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/72.0.1271.64 Safari/537.11",
+    "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/66.0.648.133 Safari/534.16",
+    # 360
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.101 Safari/537.36",
+    "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko",
+    # Taobao Browser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11",
+    # Liebao Browser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER",
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)",
+    # QQ Browser
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
+    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
+    # Sogou Browser
+    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 SE 2.X MetaSr 1.0",
+    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; SE 2.X MetaSr 1.0)",
+    # Maxthon
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Maxthon/4.4.3.4000 Chrome/30.0.1599.101 Safari/537.36",
+    # UC Browser
+    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.122 UBrowser/4.0.3214.0 Safari/537.36",
+)
+
+# Mobile and tablet User-Agent strings.
+_MOBILE_USER_AGENTS = (
+    # iPhone
+    "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # iPod
+    "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # iPad
+    "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5",
+    "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
+    # Android
+    "Mozilla/5.0 (Linux; U; Android 2.2.1; zh-cn; HTC_Wildfire_A3333 Build/FRG83D) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    # QQ Browser for Android
+    "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
+    # Android Opera Mobile
+    "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
+    # Android Pad Moto Xoom
+    "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
+    # BlackBerry
+    "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
+    # WebOS HP Touchpad
+    "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
+    # Nokia N97
+    "Mozilla/5.0 (SymbianOS/9.4; Series60/5.0 NokiaN97-1/20.0.019; Profile/MIDP-2.1 Configuration/CLDC-1.1) AppleWebKit/525 (KHTML, like Gecko) BrowserNG/7.1.18124",
+    # Windows Phone Mango
+    "Mozilla/5.0 (compatible; MSIE 9.0; Windows Phone OS 7.5; Trident/5.0; IEMobile/9.0; HTC; Titan)",
+    # UC Browser
+    "UCWEB7.0.2.37/28/999",
+    "NOKIA5700/ UCWEB7.0.2.37/28/999",
+    # UC Openwave
+    "Openwave/ UCWEB7.0.2.37/28/999",
+    # UC Opera
+    "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
+)
+
+
+def getheaders(mobile=False):
+    """Return request headers carrying a randomly chosen User-Agent.
+
+    Args:
+        mobile: Pick from the mobile/tablet list instead of the desktop
+            list. Defaults to False, which keeps the original behaviour of
+            drawing from the desktop list only.
+
+    Returns:
+        A new dict with the single key ``'User-Agent'``.
+    """
+    pool = _MOBILE_USER_AGENTS if mobile else _PC_USER_AGENTS
+    return {'User-Agent': random.choice(pool)}