# -*- coding: utf-8 -*-
# @Time : 2022/6/21 15:33
# @Author : 刘正阳
# @File : TitleSpider.py
# @Software : PyCharm
import os
import requests
import re
from bs4 import BeautifulSoup
# Captures the value of every `"part":"..."` entry that is immediately
# followed by a key starting with `"duratio` in the page's embedded state.
findLink = re.compile(r'"part":"(.*?)","duratio')

# Path of the text file most recently written by GetTxt(); None until
# GetTxt() has run. (A bare `global` statement at module level is a no-op and
# left this name undefined, so it is initialised explicitly instead.)
fileName = None
# Fetch the page data; argument: the page URL.
def FinData(url, timeout=10):
    """Download a page and extract its part titles and its page title.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block forever on an unresponsive host.

    Returns:
        A ``(dataList, urlTitle)`` tuple. ``dataList`` is the list of strings
        captured by the module-level ``findLink`` pattern inside the first
        ``<script>`` element containing ``window.__INITIAL_STATE__={``; it is
        empty when no such script exists. ``urlTitle`` is the first line of
        the page text, title-cased, with its last 30 characters removed and
        leading whitespace stripped.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    response = requests.get(url=url, timeout=timeout)
    page = BeautifulSoup(response.text, "html.parser")

    # NOTE(review): ``.title()`` here is ``str.title()`` (it title-cases the
    # whole page text), not the HTML <title> element - confirm this is
    # intended. The 30 trailing characters dropped are presumably a fixed
    # site suffix; verify against a live page.
    first_line = page.get_text().title().split('\n', 1)[0]
    urlTitle = first_line[:-30].lstrip()

    # Pick out the script that carries the page's initial state. If none
    # does, search an empty string rather than whichever unrelated script
    # happened to be last in the document.
    state_script = ''
    for script in page.select('script'):
        script_text = str(script)
        if 'window.__INITIAL_STATE__={' in script_text:
            state_script = script_text
            break

    # Regex search; returns a list of the captured part titles.
    dataList = findLink.findall(state_script)
    return dataList, urlTitle
def saveAsTxt(video_list, urlTitle, path):
    """Write the given titles, one per line, to ``<path>/<urlTitle>.txt``.

    Args:
        video_list: Iterable of strings; each is written followed by a
            newline.
        urlTitle: Base name for the output file. Characters that Windows
            does not allow in file names are replaced by spaces.
        path: Directory in which the file is created.

    Returns:
        The full path of the file that was written.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # Strip the characters Windows rejects in file names (including '*',
    # which the earlier list omitted) in a single pass; each becomes a space.
    illegal_chars = str.maketrans(dict.fromkeys('|\\/:*?"<>', ' '))
    safe_name = (urlTitle + ".txt").translate(illegal_chars)
    fileTitle = os.path.join(path, safe_name)

    # The context manager closes the file even if a write fails part-way.
    with open(fileTitle, "w", encoding="utf-8") as out_file:
        out_file.writelines(title + "\n" for title in video_list)
    return fileTitle
def GetTxt(bid, path):
    """Fetch the video page for ``bid`` and save its part titles under ``path``.

    Builds the video URL from ``bid``, passes it to ``FinData`` and hands the
    result to ``saveAsTxt``. The path returned by ``saveAsTxt`` is stored in
    the module-level ``fileName``; the function itself returns ``None``.

    Args:
        bid: Video identifier appended to the Bilibili video URL prefix.
        path: Directory in which the text file is written.
    """
    global fileName
    video_url = 'https://www.bilibili.com/video/' + bid
    part_titles, page_title = FinData(video_url)
    fileName = saveAsTxt(part_titles, page_title, path)