Browse Source

user 基本完成

liuyuqi-dellpc 6 years ago
parent
commit
cbb5ccefb4
6 changed files with 180 additions and 178 deletions
  1. 2 0
      .gitignore
  2. 5 1
      README.md
  3. 2 1
      requirements.txt
  4. 171 134
      user/get_user.py
  5. 0 39
      user/main.py
  6. 0 3
      video/get_video.py

+ 2 - 0
.gitignore

@@ -1,3 +1,5 @@
 /.idea
 /__pycache__
 /data
+/.vscode
+/utils/__pycache__/*.pyc

+ 5 - 1
README.md

@@ -1,6 +1,6 @@
 ## crawl_bilibili
 
-bilibili爬虫,主要涉及:
+ bilibili 爬虫,主要涉及:
 
 用户公开数据(用户/性别/地区/注册时间/)
 
@@ -12,6 +12,10 @@ bilibili爬虫,主要涉及:
 
 付费数据()
 
+
 ### 使用
 
+
+
 ### 更新历史
+

+ 2 - 1
requirements.txt

@@ -1 +1,2 @@
-pymysql
+PyMySQL==0.9.3
+requests==2.21.0

+ 171 - 134
user/get_user.py

@@ -1,155 +1,192 @@
-# -*-coding:utf8-*-
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+"""
+@File    :   get_user.py
+@Time    :   2019/05/15 20:28:36
+@Author  :   Liuyuqi 
+@Version :   1.0
+@Contact :   liuyuqi.gov@msn.cn
+@License :   (C)Copyright 2019
+@Desc    :   Fetch user information; the endpoints used are:
+https://space.bilibili.com/521400
+http://space.bilibili.com/ajax/member/GetInfo
+"""
 
 import requests
 import json
 import random
 import pymysql
-import sys
 import datetime
 import time
-from imp import reload
-from multiprocessing.dummy import Pool as ThreadPool
+import os, sys
 
-def datetime_to_timestamp_in_milliseconds(d):
-    def current_milli_time(): return int(round(time.time() * 1000))
+src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"  # NOTE(review): machine-specific absolute path
+os.chdir(src)  # import-time side effect: changes the process working directory
+sys.path.append(src)  # presumably so the `utils.user_agent` import below resolves — TODO confirm
 
-    return current_milli_time()
+from utils.user_agent import getheaders
 
+# Connect to the database
+conn = pymysql.connect(  # created at module import; getsource() uses this one connection for every insert
+    host="192.168.99.100", user="root", passwd="123456", db="bilibili", charset="utf8"  # NOTE(review): hard-coded host and credentials
+)
+cur = conn.cursor()
+# cur.execute("sql")
+# conn.commit()
 
-reload(sys)
-
-
-def LoadUserAgents(uafile):
-    uas = []
-    with open(uafile, 'rb') as uaf:
-        for ua in uaf.readlines():
-            if ua:
-                uas.append(ua.strip()[:-1])
-    random.shuffle(uas)
-    return uas
-
-
-uas = LoadUserAgents("user_agents.txt")
 head = {
-    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
-    'X-Requested-With': 'XMLHttpRequest',
-    'Referer': 'http://space.bilibili.com/45388',
-    'Origin': 'http://space.bilibili.com',
-    'Host': 'space.bilibili.com',
-    'AlexaToolbar-ALX_NS_PH': 'AlexaToolbar/alx-4.0',
-    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4',
-    'Accept': 'application/json, text/javascript, */*; q=0.01',
+    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
+    "X-Requested-With": "XMLHttpRequest",
+    "Referer": "http://space.bilibili.com/45388",
+    "Origin": "http://space.bilibili.com",
+    "Host": "space.bilibili.com",
+    "AlexaToolbar-ALX_NS_PH": "AlexaToolbar/alx-4.0",
+    "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
+    "Accept": "application/json, text/javascript, */*; q=0.01",
 }
 
-# Please replace your own proxies.
-proxies = {
-    'http': 'http://120.26.110.59:8080',
-    'http': 'http://120.52.32.46:80',
-    'http': 'http://218.85.133.62:80',
-}
-time1 = time.time()
+proxies = {"http": "http://120.26.110.59:8080"}  # not used by getsource: its proxies= argument is commented out
 
+time1 = time.time()  # baseline for the elapsed-time printout in getsource, e.g. 1557920724.447739
 urls = []
+uas = []  # immediately overwritten by the next line
+uas = getheaders()  # presumably a sequence of User-Agent strings (getsource calls random.choice on it) — TODO confirm
+
+
+def datetime_to_timestamp_in_milliseconds():  # returns the current time in integer milliseconds; takes no datetime despite the name
+    return int(round(time.time() * 1000))  # e.g. 1557920582757
+
+
+def getsource(url, i):  # POST GetInfo for one user, fetch follower/view counters, INSERT one row into `user`; returns None, errors are printed
+    payload = {
+        "_": datetime_to_timestamp_in_milliseconds(),  # presumably a cache-busting timestamp — TODO confirm
+        "mid": url.replace("https://space.bilibili.com/", ""),  # user id = url with the site prefix stripped
+    }
+    head = {
+        "User-Agent": random.choice(uas),  # random entry from the module-level `uas`
+        "Referer": "https://space.bilibili.com/"
+        + str(i)  # `i` is only used here, to build the Referer
+        + "?from=search&seid="
+        + str(random.randint(10000, 50000)),
+    }
+    jscontent = (
+        requests.session()  # NOTE(review): a new session is created on every call and never closed
+        .post(
+            "http://space.bilibili.com/ajax/member/GetInfo",
+            headers=head,
+            data=payload,
+            # proxies=proxies,
+        )
+        .text
+    )
+    time2 = time.time()  # used only for the elapsed-time printout below
+    try:
+        jsDict = json.loads(jscontent)
+        statusJson = jsDict["status"] if "status" in jsDict.keys() else False
+        if statusJson == True:
+            if "data" in jsDict.keys():
+                jsData = jsDict["data"]
+                mid = jsData["mid"]
+                name = jsData["name"]
+                sex = jsData["sex"]
+                rank = jsData["rank"]
+                face = jsData["face"]
+                # regtimestamp = jsData["regtime"] # this value is not present
+                # regtime_local = time.localtime(regtimestamp)
+                regtime = "2018-05-06 12:22:23"  # hard-coded placeholder; "regtime" is no longer read from the response
+                spacesta = jsData["spacesta"]
+                birthday = (
+                    jsData["birthday"] if "birthday" in jsData.keys() else "nobirthday"
+                )
+                sign = jsData["sign"]
+                level = jsData["level_info"]["current_level"]
+                OfficialVerifyType = jsData["official_verify"]["type"]
+                OfficialVerifyDesc = jsData["official_verify"]["desc"]
+                vipType = jsData["vip"]["vipType"]
+                vipStatus = jsData["vip"]["vipStatus"]
+                toutu = jsData["toutu"]
+                toutuId = jsData["toutuId"]
+                coins = jsData["coins"]
+                print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
+                try:
+                    res = requests.get(
+                        "https://api.bilibili.com/x/relation/stat?vmid="
+                        + str(mid)
+                        + "&jsonp=jsonp"
+                    ).text
+                    viewinfo = requests.get(
+                        "https://api.bilibili.com/x/space/upstat?mid="
+                        + str(mid)
+                        + "&jsonp=jsonp"
+                    ).text
+                    js_fans_data = json.loads(res)
+                    js_viewdata = json.loads(viewinfo)
+                    following = js_fans_data["data"]["following"]
+                    fans = js_fans_data["data"]["follower"]
+                    archiveview = js_viewdata["data"]["archive"]["view"]
+                    article = js_viewdata["data"]["article"]["view"]
+                except:  # NOTE(review): bare except — any failure above silently zeroes all four counters
+                    following = 0
+                    fans = 0
+                    archiveview = 0
+                    article = 0
+            else:
+                print("no data now")  # NOTE(review): execution still reaches the INSERT below with mid/name/... unbound; the NameError is caught and printed there
+            try:
+                cur.execute(  # NOTE(review): values are interpolated with Python % formatting, not passed as query parameters; a double quote in any field ends the SQL literal early
+                    'INSERT INTO user(mid, name, sex, rank, face, regtime, spacesta, \
+                            birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, \
+                            toutu, toutuId, coins, following, fans ,archiveview, article) \
+                VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s",\
+                        "%s","%s","%s","%s","%s", "%s","%s","%s","%s","%s","%s")'
+                    % (
+                        mid,
+                        name,
+                        sex,
+                        rank,
+                        face,
+                        regtime,
+                        spacesta,
+                        birthday,
+                        sign,
+                        level,
+                        OfficialVerifyType,
+                        OfficialVerifyDesc,
+                        vipType,
+                        vipStatus,
+                        toutu,
+                        toutuId,
+                        coins,
+                        following,
+                        fans,
+                        archiveview,
+                        article,
+                    )
+                )
+                conn.commit()  # one commit per inserted row
+            except Exception as e:
+                print(e)
+        else:
+            print("Error: " + url)  # reached when "status" is missing or not equal to True
+    except Exception as e:
+        print(e)
+        pass
 
-# Please change the range data by yourself.
-for m in range(5214, 5215):
-
-    for i in range(m * 100, (m + 1) * 100):
-        url = 'https://space.bilibili.com/' + str(i)
-        urls.append(url)
 
+def crawlUser():
+    """
+    Start crawling: call getsource() for each user id in the range below.
+    param : none
+    return: None
+    """
+    m = 5214
+    for i in range(m * 100, ((m * 100 )+ 1)):  # range(521400, 521401): only id 521400. NOTE(review): the removed version used (m + 1) * 100 — confirm the intended upper bound
+        url = "https://space.bilibili.com/" + str(i)
+        # urls.append(url)
+        getsource(url, i)
 
-    def getsource(url):
-        payload = {
-            '_': datetime_to_timestamp_in_milliseconds(datetime.datetime.now()),
-            'mid': url.replace('https://space.bilibili.com/', '')
-        }
-        ua = random.choice(uas)
-        head = {
-            'User-Agent': ua,
-            'Referer': 'https://space.bilibili.com/' + str(i) + '?from=search&seid=' + str(random.randint(10000, 50000))
-        }
-        jscontent = requests \
-            .session() \
-            .post('http://space.bilibili.com/ajax/member/GetInfo',
-                  headers=head,
-                  data=payload,
-                  proxies=proxies) \
-            .text
-        time2 = time.time()
-        try:
-            jsDict = json.loads(jscontent)
-            statusJson = jsDict['status'] if 'status' in jsDict.keys() else False
-            if statusJson == True:
-                if 'data' in jsDict.keys():
-                    jsData = jsDict['data']
-                    mid = jsData['mid']
-                    name = jsData['name']
-                    sex = jsData['sex']
-                    rank = jsData['rank']
-                    face = jsData['face']
-                    regtimestamp = jsData['regtime']
-                    regtime_local = time.localtime(regtimestamp)
-                    regtime = time.strftime("%Y-%m-%d %H:%M:%S",regtime_local)
-                    spacesta = jsData['spacesta']
-                    birthday = jsData['birthday'] if 'birthday' in jsData.keys() else 'nobirthday'
-                    sign = jsData['sign']
-                    level = jsData['level_info']['current_level']
-                    OfficialVerifyType = jsData['official_verify']['type']
-                    OfficialVerifyDesc = jsData['official_verify']['desc']
-                    vipType = jsData['vip']['vipType']
-                    vipStatus = jsData['vip']['vipStatus']
-                    toutu = jsData['toutu']
-                    toutuId = jsData['toutuId']
-                    coins = jsData['coins']
-                    print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
-                    try:
-                        res = requests.get(
-                            'https://api.bilibili.com/x/relation/stat?vmid=' + str(mid) + '&jsonp=jsonp').text
-                        viewinfo = requests.get(
-                            'https://api.bilibili.com/x/space/upstat?mid=' + str(mid) + '&jsonp=jsonp').text
-                        js_fans_data = json.loads(res)
-                        js_viewdata = json.loads(viewinfo)
-                        following = js_fans_data['data']['following']
-                        fans = js_fans_data['data']['follower']
-                        archiveview = js_viewdata['data']['archive']['view']
-                        article = js_viewdata['data']['article']['view']
-                    except:
-                        following = 0
-                        fans = 0
-                        archiveview = 0
-                        article = 0
-                else:
-                    print('no data now')
-                try:
-                    # Please write your MySQL's information.
-                    conn = pymysql.connect(
-                        host='localhost', user='root', passwd='123456', db='bilibili', charset='utf8')
-                    cur = conn.cursor()
-                    cur.execute('INSERT INTO bilibili_user_info(mid, name, sex, rank, face, regtime, spacesta, \
-                                birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, \
-                                toutu, toutuId, coins, following, fans ,archiveview, article) \
-                    VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s",\
-                            "%s","%s","%s","%s","%s", "%s","%s","%s","%s","%s","%s")'
-                                %
-                                (mid, name, sex, rank, face, regtime, spacesta, \
-                                birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, \
-                                toutu, toutuId, coins, following, fans ,archiveview, article))
-                    conn.commit()
-                except Exception as e:
-                    print(e)
-            else:
-                print("Error: " + url)
-        except Exception as e:
-            print(e)
-            pass
 
 if __name__ == "__main__":
-    pool = ThreadPool(1)
-    try:
-        results = pool.map(getsource, urls)
-    except Exception as e:
-        print(e)
- 
-    pool.close()
-    pool.join()
+    src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"  # same path already assigned at module level
+    os.chdir(src)  # repeats the module-level chdir
+    crawlUser()

+ 0 - 39
user/main.py

@@ -1,39 +0,0 @@
-# -*- coding: utf-8 -*-
-'''
-@Auther :liuyuqi.gov@msn.cn
-@date :2019/4/8
-'''
-__author__ = "liuyuqi"
-
-import json
-import os
-import re
-from contextlib import closing
-
-import requests
-
-import DownloadProgress
-import user_agent
-
-# src = "D:/PycharmProjects/crawl_xuexi/"
-# os.chdir(src)
-
-
-
-def crawl():
-    with open("data/ml.json", "r", encoding="utf8") as f:
-        mlData = json.loads(f.read())
-        for i in range((len(mlData["fpe1ki18v228w00"]))):
-            frst_name = mlData["fpe1ki18v228w00"][i]["frst_name"].replace('\t', ' ')
-            static_page_url = mlData["fpe1ki18v228w00"][i]["static_page_url"]
-            # 打开 mp4 视频网页链接
-            resData = requests.get(static_page_url, headers=user_agent.getheaders()).content.decode("utf8")
-            preUrl = static_page_url.split("/")[3]
-            pattern = r'src="./data(.*?)"></script>'
-            url = "https://www.xuexi.cn/" + preUrl + "/data" + re.findall(pattern, resData, re.I)[0]
-            res = get_video_links(url)[0]
-            downloadVideo(res, file_name=frst_name)
-
-
-if __name__ == '__main__':
-    crawl()

+ 0 - 3
video/get_video.py

@@ -9,9 +9,6 @@
 @License :   (C)Copyright 2019
 @Desc    :   爬取 B 站视频
 '''
-
-# -*-coding:utf8-*-
-
 from lxml import etree
 from multiprocessing.dummy import Pool as ThreadPool
 import requests