get_barrage.py 1.9 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465
  1. #!/usr/bin/env python
  2. # -*- encoding: utf-8 -*-
'''
@File : get_barrage.py
@Time : 2019/05/15 17:10:38
@Author : Liuyuqi
@Version : 1.0
@Contact : liuyuqi.gov@msn.cn
@License : (C)Copyright 2019
@Desc : Danmaku (bullet-comment) crawler
'''
  12. # -*-coding:utf8-*-
  13. from lxml import etree
  14. import requests
  15. import sys
  16. import re
  17. #reload(sys)
  18. #sys.setdefaultencoding('utf-8')
  19. head = {
  20. 'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.130 Safari/537.36'
  21. }
  22. def spider(av):
  23. url = 'http://bilibili.com/video/av' + str(av)
  24. print(url)
  25. html = requests.get(url, headers=head)
  26. selector = etree.HTML(html.text)
  27. content = selector.xpath("//html")
  28. for each in content:
  29. title = each.xpath('//*[@id="viewbox_report"]/h1/span')
  30. if title:
  31. print(title[0].text)
  32. cid_html_1 = each.xpath('//*[@id="link2"]/@value')
  33. if cid_html_1:
  34. cid_html = cid_html_1[0]
  35. cids = re.findall(r'cid=.+&page', cid_html)
  36. cid = cids[0].replace("cid=", "").replace("&page", "")
  37. comment_url = 'http://comment.bilibili.com/' + \
  38. str(cid) + '.xml'
  39. print(comment_url)
  40. comment_text = requests.get(comment_url, headers=head)
  41. comment_selector = etree.HTML(comment_text.content)
  42. comment_content = comment_selector.xpath('//i')
  43. for comment_each in comment_content:
  44. comments = comment_each.xpath('//d/text()')
  45. if comments:
  46. for comment in comments:
  47. print(comment)
  48. f.writelines(comment + '\n')
  49. else:
  50. print('cid not found!')
  51. else:
  52. print('video not found!')
  53. if __name__ == '__main__':
  54. av = input('input av:')
  55. f = open(av + '.txt', 'w', encoding='utf-8')
  56. spider(av)