TitleSpider.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
  1. # -*- coding: utf-8 -*-
  2. # @Time : 2022/6/21 15:33
  3. # @Author : 刘正阳
  4. # @File : TitleSpider.py
  5. # @Software : PyCharm
  6. import requests
  7. import re
  8. from bs4 import BeautifulSoup
  9. findLink = re.compile(r'"part":"(.*?)","duratio')
  10. global fileName
  11. # 获取网页数据,传入参数:网址
  12. def FinData(url):
  13. dataList = []
  14. getUrl = requests.get(url=url)
  15. bsHtml = BeautifulSoup(getUrl.text, "html.parser")
  16. global urlTitle
  17. urlTitleList = bsHtml.get_text().title().split('\n', 1)
  18. urlTitle = urlTitleList[0][:-30].lstrip()
  19. dataList.append(str(urlTitle))
  20. bsFinData = bsHtml.select('script')
  21. bsData = ''
  22. # 筛选列表数据
  23. for i in bsFinData:
  24. bsData = str(i)
  25. if 'window.__INITIAL_STATE__={' in bsData:
  26. break
  27. # 正则查找,返回列表
  28. reList = re.findall(findLink, bsData)
  29. dataList += reList
  30. return dataList, urlTitle
  31. def saveAsTxt(video_list):
  32. fileTitle = urlTitle + ".txt" # 合成.txt格式 文件名
  33. # 去除标题中的Windows不兼容的的命名字
  34. for s in fileTitle:
  35. cut = ['|', '\\', '/', ':', '?', '"', '<', '>']
  36. if s in cut:
  37. fileTitle = fileTitle.replace(s, ' ')
  38. nameFile = open(fileTitle, "w", encoding="utf-8") # 写入文件
  39. j = 0
  40. for i in video_list:
  41. j += 1
  42. nameFile.write(i + "\n")
  43. nameFile.close()
  44. return fileTitle
  45. def GetTxt(bid):
  46. global fileName
  47. urlPart = 'https://www.bilibili.com/video/'
  48. bv = bid
  49. url = urlPart + bv
  50. dataList, urlTile = FinData(url)
  51. fileName = saveAsTxt(dataList)