get_user.py 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
  3. """
  4. @File : get_user.py
  5. @Time : 2019/05/15 20:28:36
  6. @Author : Liuyuqi
  7. @Version : 1.0
  8. @Contact : liuyuqi.gov@msn.cn
  9. @License : (C)Copyright 2019
  10. @Desc : Crawl user information; the endpoints are:
  11. https://space.bilibili.com/521400
  12. http://space.bilibili.com/ajax/member/GetInfo
  13. """
  14. import requests
  15. import json
  16. import random
  17. import pymysql
  18. import datetime
  19. import time
  20. import os, sys
  21. src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
  22. os.chdir(src)
  23. sys.path.append(src)
  24. from utils.user_agent import getheaders
  25. # 连接数据库
  26. conn = pymysql.connect(
  27. host="192.168.99.100", user="root", passwd="123456", db="bilibili", charset="utf8"
  28. )
  29. cur = conn.cursor()
  30. # cur.execute("sql")
  31. # conn.commit()
  32. head = {
  33. "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36",
  34. "X-Requested-With": "XMLHttpRequest",
  35. "Referer": "http://space.bilibili.com/45388",
  36. "Origin": "http://space.bilibili.com",
  37. "Host": "space.bilibili.com",
  38. "AlexaToolbar-ALX_NS_PH": "AlexaToolbar/alx-4.0",
  39. "Accept-Language": "zh-CN,zh;q=0.8,en;q=0.6,ja;q=0.4",
  40. "Accept": "application/json, text/javascript, */*; q=0.01",
  41. }
  42. proxies = {"http": "http://120.26.110.59:8080"}
  43. time1 = time.time() # 1557920724.447739
  44. urls = []
  45. uas = []
  46. uas = getheaders()
  47. def datetime_to_timestamp_in_milliseconds():
  48. return int(round(time.time() * 1000)) # 1557920582757
  49. def getsource(url, i):
  50. payload = {
  51. "_": datetime_to_timestamp_in_milliseconds(),
  52. "mid": url.replace("https://space.bilibili.com/", ""),
  53. }
  54. head = {
  55. "User-Agent": random.choice(uas),
  56. "Referer": "https://space.bilibili.com/"
  57. + str(i)
  58. + "?from=search&seid="
  59. + str(random.randint(10000, 50000)),
  60. }
  61. jscontent = (
  62. requests.session()
  63. .post(
  64. "http://space.bilibili.com/ajax/member/GetInfo",
  65. headers=head,
  66. data=payload,
  67. # proxies=proxies,
  68. )
  69. .text
  70. )
  71. time2 = time.time()
  72. try:
  73. jsDict = json.loads(jscontent)
  74. statusJson = jsDict["status"] if "status" in jsDict.keys() else False
  75. if statusJson == True:
  76. if "data" in jsDict.keys():
  77. jsData = jsDict["data"]
  78. mid = jsData["mid"]
  79. name = jsData["name"]
  80. sex = jsData["sex"]
  81. rank = jsData["rank"]
  82. face = jsData["face"]
  83. # regtimestamp = jsData["regtime"] #没有这个值
  84. # regtime_local = time.localtime(regtimestamp)
  85. regtime = "2018-05-06 12:22:23"
  86. spacesta = jsData["spacesta"]
  87. birthday = (
  88. jsData["birthday"] if "birthday" in jsData.keys() else "nobirthday"
  89. )
  90. sign = jsData["sign"]
  91. level = jsData["level_info"]["current_level"]
  92. OfficialVerifyType = jsData["official_verify"]["type"]
  93. OfficialVerifyDesc = jsData["official_verify"]["desc"]
  94. vipType = jsData["vip"]["vipType"]
  95. vipStatus = jsData["vip"]["vipStatus"]
  96. toutu = jsData["toutu"]
  97. toutuId = jsData["toutuId"]
  98. coins = jsData["coins"]
  99. print("Succeed get user info: " + str(mid) + "\t" + str(time2 - time1))
  100. try:
  101. res = requests.get(
  102. "https://api.bilibili.com/x/relation/stat?vmid="
  103. + str(mid)
  104. + "&jsonp=jsonp"
  105. ).text
  106. viewinfo = requests.get(
  107. "https://api.bilibili.com/x/space/upstat?mid="
  108. + str(mid)
  109. + "&jsonp=jsonp"
  110. ).text
  111. js_fans_data = json.loads(res)
  112. js_viewdata = json.loads(viewinfo)
  113. following = js_fans_data["data"]["following"]
  114. fans = js_fans_data["data"]["follower"]
  115. archiveview = js_viewdata["data"]["archive"]["view"]
  116. article = js_viewdata["data"]["article"]["view"]
  117. except:
  118. following = 0
  119. fans = 0
  120. archiveview = 0
  121. article = 0
  122. else:
  123. print("no data now")
  124. try:
  125. cur.execute(
  126. 'INSERT INTO user(mid, name, sex, rank, face, regtime, spacesta, \
  127. birthday, sign, level, OfficialVerifyType, OfficialVerifyDesc, vipType, vipStatus, \
  128. toutu, toutuId, coins, following, fans ,archiveview, article) \
  129. VALUES ("%s","%s","%s","%s","%s","%s","%s","%s","%s","%s",\
  130. "%s","%s","%s","%s","%s", "%s","%s","%s","%s","%s","%s")'
  131. % (
  132. mid,
  133. name,
  134. sex,
  135. rank,
  136. face,
  137. regtime,
  138. spacesta,
  139. birthday,
  140. sign,
  141. level,
  142. OfficialVerifyType,
  143. OfficialVerifyDesc,
  144. vipType,
  145. vipStatus,
  146. toutu,
  147. toutuId,
  148. coins,
  149. following,
  150. fans,
  151. archiveview,
  152. article,
  153. )
  154. )
  155. conn.commit()
  156. except Exception as e:
  157. print(e)
  158. else:
  159. print("Error: " + url)
  160. except Exception as e:
  161. print(e)
  162. pass
  163. def crawlUser():
  164. """
  165. 开抓
  166. param :
  167. return:
  168. """
  169. m = 5214
  170. for i in range(m * 100, ((m * 100 )+ 1)): # range(521400,521500)
  171. url = "https://space.bilibili.com/" + str(i)
  172. # urls.append(url)
  173. getsource(url, i)
  174. if __name__ == "__main__":
  175. src = "C:/Users/liuyuqi/Desktop/crawl_bilibili"
  176. os.chdir(src)
  177. crawlUser()