liuyuqi-dellpc 5 years ago
parent
commit
3ae41c9091
4 changed files with 133 additions and 14 deletions
  1. 1 1
      README.md
  2. 11 0
      conf/mysql.conf
  3. 24 13
      user/get_user.py
  4. 97 0
      utils/config.py

+ 1 - 1
README.md

@@ -1,6 +1,6 @@
 ## crawl_bilibili
 ## crawl_bilibili
 
 
- bilibili 爬虫,主要涉及:
+bilibili 爬虫,不需要实时数据,设置一分钟爬 6 次。**一个月大概爬 25 万用户**。由于时间跨度大,设置断点续爬功能。主要涉及:
 
 
 用户公开数据(用户/性别/地区/注册时间/)
 用户公开数据(用户/性别/地区/注册时间/)
 
 

+ 11 - 0
conf/mysql.conf

@@ -0,0 +1,11 @@
+[db1]
+host = h5.yoqi.me
+port = 3306
+user = root
+pwd = 123456
+database = bilibili
+charset = utf8
+
+[project]
+workspace = C:/Users/liuyuqi/Desktop/crawl-bilibili
+

+ 24 - 13
user/get_user.py

@@ -7,28 +7,35 @@
 @Version :   1.0
 @Version :   1.0
 @Contact :   liuyuqi.gov@msn.cn
 @Contact :   liuyuqi.gov@msn.cn
 @License :   (C)Copyright 2019
 @License :   (C)Copyright 2019
-@Desc    :   抓取 用户信息,接口为:
-https://space.bilibili.com/521400
+@Desc    :   抓取 用户信息,接口为: https://space.bilibili.com/521400 后面数字穷举法获取所有用户姓名,性别,年龄等等信息。
+
 http://space.bilibili.com/ajax/member/GetInfo
 http://space.bilibili.com/ajax/member/GetInfo
 """
 """
 
 
+import sys
+import os
+src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
+os.chdir(src)
+sys.path.append(src)
+
+import utils.config as conf
+from utils.user_agent import getheaders
+
 import requests
 import requests
 import json
 import json
 import random
 import random
 import pymysql
 import pymysql
 import datetime
 import datetime
 import time
 import time
-import os, sys
 
 
-src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
-os.chdir(src)
-sys.path.append(src)
 
 
-from utils.user_agent import getheaders
+
+print(src)
+exit()
 
 
 # 连接数据库
 # 连接数据库
 conn = pymysql.connect(
 conn = pymysql.connect(
-    host="192.168.99.100", user="root", passwd="123456", db="bilibili", charset="utf8"
+    host=conf.readConf("db1", "host"), user=conf.readConf("db1", "user"), passwd=conf.readConf("db1", "pwd"), db="bilibili", charset="utf8"
 )
 )
 cur = conn.cursor()
 cur = conn.cursor()
 # cur.execute("sql")
 # cur.execute("sql")
@@ -96,7 +103,8 @@ def getsource(url, i):
                 regtime = "2018-05-06 12:22:23"
                 regtime = "2018-05-06 12:22:23"
                 spacesta = jsData["spacesta"]
                 spacesta = jsData["spacesta"]
                 birthday = (
                 birthday = (
-                    jsData["birthday"] if "birthday" in jsData.keys() else "nobirthday"
+                    jsData["birthday"] if "birthday" in jsData.keys(
+                    ) else "nobirthday"
                 )
                 )
                 sign = jsData["sign"]
                 sign = jsData["sign"]
                 level = jsData["level_info"]["current_level"]
                 level = jsData["level_info"]["current_level"]
@@ -107,7 +115,8 @@ def getsource(url, i):
                 toutu = jsData["toutu"]
                 toutu = jsData["toutu"]
                 toutuId = jsData["toutuId"]
                 toutuId = jsData["toutuId"]
                 coins = jsData["coins"]
                 coins = jsData["coins"]
-                print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
+                print("Succeed get user info: " +
+                      str(mid) + "\t" + str(time2 - time1))
                 try:
                 try:
                     res = requests.get(
                     res = requests.get(
                         "https://api.bilibili.com/x/relation/stat?vmid="
                         "https://api.bilibili.com/x/relation/stat?vmid="
@@ -179,8 +188,11 @@ def crawlUser():
     param :
     param :
     return:
     return:
     """
     """
+    # 获得索引头
+    cur.execute("sql")
+    res = conn.commit()
     m = 5214
     m = 5214
-    for i in range(m * 100, ((m * 100 )+ 1)):  # range(521400,521500)
+    for i in range(m * 100, ((m * 100) + 1)):  # range(521400,521500)
         url = "https://space.bilibili.com/" + str(i)
         url = "https://space.bilibili.com/" + str(i)
         # urls.append(url)
         # urls.append(url)
         getsource(url, i)
         getsource(url, i)
@@ -188,5 +200,4 @@ def crawlUser():
 
 
 if __name__ == "__main__":
 if __name__ == "__main__":
     src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
     src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
-    os.chdir(src)
-    crawlUser()
+    crawlUser()

+ 97 - 0
utils/config.py

@@ -0,0 +1,97 @@
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+'''
+@Author  :   liuyuqi
+@Contact :   liuyuqi.gov@msn.cn
+@Time    :   2019/07/23 10:58:48
+@Version :   1.0
+@License :   (C)Copyright 2019
+@Desc    :   设置 数据库参数
+'''
+
+import configparser
+import os
+import pymysql
+
+# Hard-coded project root. Importing this module switches the process working
+# directory to it, so the relative paths below resolve against the project.
+src = "C:/Users/liuyuqi/Desktop/crawl-bilibili"
+os.chdir(src)
+
+# Directory (relative to the working directory set above) that holds the
+# configuration file and the SQL scripts used by the functions below.
+conf_dir = "conf/"
+config_file = os.path.join(conf_dir, "mysql.conf")
+# Name of the database section inside mysql.conf.
+section_name = "db1"
+
+
+def writeConf(user, pwd, database, host="localhost", port=3306, charset="utf8"):
+    cf = configparser.RawConfigParser()
+
+    cf.add_section('db1')
+    cf.set('db1', 'host', host)
+    cf.set('db1', 'port', port)
+    cf.set('db1', 'user', user)
+    cf.set('db1', 'pwd', pwd)
+    cf.set("db1", "database", database)
+    cf.set("db1", "charset", charset)
+
+    cf.add_section("project")
+    cf.set("project", "workspace", "C:/Users/liuyuqi/Desktop/crawl-bilibili")
+    with open(config_file, 'w') as configfile:
+        cf.write(configfile)
+
+
+def readConf(section, key):
+    config = configparser.RawConfigParser()
+    config.read(config_file)
+    port = config.get(section, key)
+    print(port)
+
+
+def readSQL(path):
+    with open(path, "r", encoding="utf-8") as f:
+        sql = ""
+        for line in f.readlines():
+            if not line or line == "\n":
+                continue
+            sql = sql+line
+        return sql
+
+def getDBServer():
+    pass
+
+def getWorkSpace():
+    return readConf("project", "workspace")
+
+def initDB1():
+    '''
+    导入数据,pymysql实在垃圾,只能一条一条执行。无法执行sql文件,也就是SQL文件手动读取为一条条在执行。。
+    '''
+    conn = pymysql.connect("localhost", "lyq", "123456", "bilibili")
+    cursor = conn.cursor()
+
+    # 如果没有数据库,则创建一个
+    # cursor.execute("CREATE DATABASE  bilibili;")
+
+    userSQL = os.path.join(conf_dir, "user.sql")
+    sql = readSQL(userSQL)
+    cursor.execute(sql)
+
+    videoSQL = os.path.join(conf_dir, "video.sql")
+    sql = readSQL(videoSQL)
+    cursor.execute(sql)
+
+
+def initDB():
+    '''
+    导入数据,采用shell命令执行
+    '''
+    userSQL = os.path.join(conf_dir, "user.sql")
+    videoSQL = os.path.join(conf_dir, "video.sql")
+    os.system(
+        "D:/Program-Files/MySQL/mysql-5.7.17-winx64/bin/mysql.exe -uroot -p123456  --default-character-set=utf8 bilibili < "+userSQL)
+    os.system(
+        "D:/Program-Files/MySQL/mysql-5.7.17-winx64/bin/mysql.exe -uroot -p123456  --default-character-set=utf8 bilibili < "+videoSQL)
+
+if __name__ == '__main__':
+    # Manual run: rewrite the configuration file, then read one option back.
+    writeConf("root", "123456", "bilibili", host="h5.yoqi.me")
+    readConf("db1", "host")
+    # Database import is left disabled here.
+    # initDB()
+