-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkugou_demo.py
More file actions
55 lines (49 loc) · 1.63 KB
/
kugou_demo.py
File metadata and controls
55 lines (49 loc) · 1.63 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import re
import requests
from lxml import etree
import pandas as pd
# Parallel accumulators filled by get_info(): index i of each list belongs to
# the same scraped row.
rank_list, title_list, name_list, time_list = [], [], [], []

# HTTP request headers sent with every page fetch.
user_agents = dict([
    ('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'),
])
def get_info(root: etree._Element):
    """Scrape rank, title, name and time values from one parsed page.

    Every value found is appended to the module-level lists ``rank_list``,
    ``title_list``, ``name_list`` and ``time_list``; nothing is returned.

    Args:
        root: Root element of the parsed HTML document to query with XPath.
    """
    # Removes newlines, tabs and the " - " separator from scraped text.
    re_cls_obj = re.compile(r'\n|\t| - ')

    def _clean(text):
        # lxml reports absent text as None; treat that as an empty string
        # instead of letting re.sub raise TypeError.
        return re_cls_obj.sub('', text or '')

    # ranks
    for rank_element in root.xpath('//span[@class="pc_temp_num"]'):
        rank = _clean(rank_element.text)
        # Per the original author's note, the top three entries are not
        # plain digits in the span itself: the number sits in a <strong> child.
        if not rank.isdigit():
            strong_element = rank_element.find('./strong')
            # Fall back to the cleaned span text when <strong> is missing or
            # empty, so one entry is still appended per rank element.
            if strong_element is not None and strong_element.text:
                rank = strong_element.text
        rank_list.append(rank)

    # titles: the link's own text is the title, its nested <span> the name
    for title_element in root.xpath('//*[@id="rankWrap"]/div[2]/ul/li/a'):
        title_list.append(_clean(title_element.text))
        span_element = title_element.find('./span')
        # Append an empty name when the <span> is absent so title_list and
        # name_list stay the same length.
        name_list.append(
            _clean(span_element.text) if span_element is not None else ''
        )

    # times: text() nodes are already strings
    for time_element in root.xpath('//span[@class="pc_temp_time"]/text()'):
        time_list.append(_clean(time_element))
if __name__ == '__main__':
    # Fetch pages 1 through 23 and let get_info() accumulate their rows
    # in the module-level lists.
    for page in range(1, 24):
        url = f'https://www.kugou.com/yy/rank/home/{page}-8888.html?from=rank'
        # Without a timeout requests waits indefinitely, so a single stalled
        # connection would hang the whole run.
        html = requests.get(url, headers=user_agents, timeout=10).text
        root = etree.HTML(html)
        get_info(root)

    # Dict of lists: each key becomes one column of the output file.
    data = {
        'rank': rank_list,
        'title': title_list,
        'name': name_list,
        'time': time_list,
    }
    # Build a DataFrame from data and write it out without the index column.
    df = pd.DataFrame(data)
    df.to_excel('kugou_1.xlsx', index=False)