Versions used: Python 3.6, requests 2.18.4, jieba 0.42.1, nltk 3.2.4, wordcloud 1.8.1
What the program does: given a video's URL, it generates a word cloud of that video's danmaku (bullet comments).
Every video has a corresponding danmaku XML file at https://comment.bilibili.com/(cid).xml, where the cid is an ID unique to each video. Fetch the video's web page and search it for "cid" to see where the cid appears; a regular expression keyed to the characters around it can then pull the cid out in the program. Next, fetch the danmaku file and extract the comment text with another regular expression. Finally, segment the comments into words and count their frequencies.
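Before the full program, here is a minimal sketch of the two regex steps on made-up strings (the sample page fragment and XML below are assumptions for illustration; the real page may embed the cid with different surroundings):

import re

# Made-up fragment of a video page; the real page source may differ.
page = '... href="https://example.com/play?cid=123456789&aid=11111" ...'
cid = re.findall("cid=(.*?)&aid", page)[0]
print("https://comment.bilibili.com/" + cid + ".xml")
# -> https://comment.bilibili.com/123456789.xml

# Made-up danmaku XML; each <d> element wraps one comment.
xml = '<d p="23.8,1,25,16777215">前方高能</d><d p="24.1,1,25,16777215">233333</d>'
print(re.findall("<d p=.*?>(.*?)</d>", xml))
# -> ['前方高能', '233333']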
# -*- coding: utf-8 -*-
import requests
import re
import jieba.analyse
import nltk
import wordcloud
import matplotlib.pyplot as plt


def search_url(url):  # fetch a page and return its decoded content
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36"}
    response = requests.get(url, headers=headers)
    assert response.status_code == 200
    html = response.content.decode()
    return html


def get_xml_url(url):  # build the danmaku file URL from the video page
    html = search_url(url)
    # Non-greedy match: a greedy .* would run to the last "&aid" on the page.
    match = re.findall("cid=(.*?)&aid", html)
    xml_url = "https://comment.bilibili.com/" + match[0] + ".xml"
    return xml_url


def get_comment(url):  # extract the list of danmaku strings
    xml_url = get_xml_url(url)
    xml = search_url(xml_url)
    comments = re.findall("<d p=.*?>(.*?)</d>", xml)  # renamed: "list" shadows the builtin
    return comments


def analyse(comment):  # segment, count, and plot the danmaku
    s = ' '.join(comment)
    jieba.load_userdict("words.txt")           # extra dictionary entries
    jieba.analyse.set_stop_words("stop.txt")   # extra stop words
    words = jieba.analyse.extract_tags(s, 100000)  # keywords to keep
    cut = jieba.lcut(s)
    cnt = nltk.FreqDist(cut)
    cnt = cnt.most_common(5000)
    cnt = [x for x in cnt if x[0] in words]    # keep only the extracted keywords
    cnt = dict(cnt)
    w = wordcloud.WordCloud(font_path=r"C:\Windows\Fonts\simsun.ttc",  # raw string for the Windows path
                            width=1200, height=800,
                            background_color="white",
                            ).fit_words(cnt)
    plt.imshow(w)
    plt.axis("off")
    plt.show()


def get_data(url):  # fetch the danmaku and run the analysis
    comment = get_comment(url)
    analyse(comment)


url = input("Enter the video URL: ")
get_data(url)
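To run the script, words.txt (the extra dictionary) and stop.txt (the stop words) must exist in the working directory, and font_path must point to a font containing CJK glyphs (simsun.ttc ships with Windows; on other systems, substitute any local CJK font file). The counting pipeline in analyse() is the least obvious step: extract_tags selects keywords, FreqDist counts every token, and only tokens that are also keywords keep their counts. A toy sketch of the counting step alone, with made-up tokens standing in for the real segmentation output:

import nltk

# Made-up tokens standing in for jieba.lcut(s).
cut = ["前方", "高能", "高能", "233333", "高能", "233333"]
cnt = dict(nltk.FreqDist(cut).most_common())
print(cnt)  # {'高能': 3, '233333': 2, '前方': 1}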