TitleSpider.py 1.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566
# -*- coding: utf-8 -*-
  2. # @Time : 2022/6/21 15:33
  3. # @Author : 刘正阳
  4. # @File : TitleSpider.py
  5. # @Software : PyCharm
  6. import requests
  7. import re
  8. from bs4 import BeautifulSoup
  9. findLink = re.compile(r'"part":"(.*?)","duratio')
  10. global fileName
  11. # 获取网页数据,传入参数:网址
  12. def FinData(url):
  13. dataList = []
  14. getUrl = requests.get(url=url)
  15. bsHtml = BeautifulSoup(getUrl.text, "html.parser")
  16. urlTitleList = bsHtml.get_text().title().split('\n', 1)
  17. urlTitle = urlTitleList[0][:-30].lstrip()
  18. # dataList.append(str(urlTitle))
  19. bsFinData = bsHtml.select('script')
  20. bsData = ''
  21. # 筛选列表数据
  22. for i in bsFinData:
  23. bsData = str(i)
  24. if 'window.__INITIAL_STATE__={' in bsData:
  25. break
  26. # 正则查找,返回列表
  27. reList = re.findall(findLink, bsData)
  28. dataList += reList
  29. return dataList, urlTitle
  30. def saveAsTxt(video_list, urlTitle):
  31. fileTitle = urlTitle + ".txt" # 合成.txt格式 文件名
  32. # 去除标题中的Windows不兼容的的命名字
  33. for s in fileTitle:
  34. cut = ['|', '\\', '/', ':', '?', '"', '<', '>']
  35. if s in cut:
  36. fileTitle = fileTitle.replace(s, ' ')
  37. nameFile = open(fileTitle, "w", encoding="utf-8") # 写入文件
  38. j = 0
  39. for i in video_list:
  40. j += 1
  41. nameFile.write(i + "\n")
  42. nameFile.close()
  43. return fileTitle
  44. def GetTxt(bid):
  45. global fileName
  46. urlPart = 'https://www.bilibili.com/video/'
  47. bv = bid
  48. url = urlPart + bv
  49. dataList, urlTile = FinData(url)
  50. fileName = saveAsTxt(dataList, urlTile)