"""Scrape the Douban Top 250 movie list and export it to ``data.xlsx``.

The script requests the ten list pages (25 movies each), extracts the
fields of every movie with XPath, collects them in the module-level lists
below and writes one worksheet row per movie.
"""
from time import sleep

import numpy as np
import requests
from lxml import etree
from openpyxl import Workbook

# Send a browser-like User-Agent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36 Edg/117.0.2045.41'}

# Column-wise storage of the scraped movie data.
titles_cn = []  # Chinese title
titles_en = []  # English (second) title
links = []  # detail-page link
director = []  # director
actors = []  # leading actors (np.nan when the page does not list any)
years = []  # release year
nations = []  # country
types = []  # genre
scores = []  # rating score
rating_nums = []  # number of ratings


def _strip_prefix(text, prefix):
    """Return *text* without a leading *prefix*, trimmed of whitespace.

    ``str.strip(chars)`` treats its argument as a set of characters and
    would also remove matching characters that belong to the name itself,
    so the label is removed as an exact prefix instead.
    """
    if text.startswith(prefix):
        text = text[len(prefix):]
    return text.strip()


# Create the workbook, select its active sheet and write the header row.
wb = Workbook()
ws = wb.active
ws.append(
    ['电影中文名', '电影英文名', '电影详情页链接', '导演', '演员', '上映年份', '国籍', '类型', '评分', '评分人数'])

for i in range(0, 226, 25):
    page_no = i // 25 + 1
    url = f'https://movie.douban.com/top250?start={i}&filter='
    try:
        # A timeout keeps a stalled connection from hanging the script
        # forever; raise_for_status surfaces HTTP error pages instead of
        # silently parsing them as an empty movie list.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException as exc:
        print(f'第{page_no}页请求失败,已跳过: {exc}')
        sleep(1)
        continue
    sleep(1)  # pause between page requests

    html = response.text
    data = etree.HTML(html)
    li_list = data.xpath('//*[@id="content"]/div/div[1]/ol/li')

    for each in li_list:
        title1 = each.xpath('./div/div[2]/div[1]/a/span[1]/text()')[0]
        titles_cn.append(title1)

        # The second title span starts with a "\xa0/\xa0" separator.
        title2 = each.xpath('./div/div[2]/div[1]/a/span[2]/text()')[0].strip('\xa0/\xa0')
        titles_en.append(title2)

        link = each.xpath('./div/div[2]/div[1]/a/@href')[0]
        links.append(link)

        # First text node: "导演: <director>\xa0\xa0\xa0主演: <actors>".
        info1 = each.xpath('./div/div[2]/div[2]/p[1]/text()[1]')[0].strip()
        split_info1 = info1.split('\xa0\xa0\xa0')
        dirt = _strip_prefix(split_info1[0], '导演:')
        director.append(dirt)
        if len(split_info1) == 2:
            ac = _strip_prefix(split_info1[1], '主演:')
            actors.append(ac)
        else:
            # No actor part on this entry: reset `ac` so the row below
            # gets an empty cell instead of the previous movie's cast
            # (or a NameError on the very first movie).
            ac = None
            actors.append(np.nan)

        # Second text node: "<year>\xa0/\xa0<country>\xa0/\xa0<genre>".
        info2 = each.xpath('./div/div[2]/div[2]/p[1]/text()[2]')[0].strip()
        split_info2 = info2.split('\xa0/\xa0')
        year = split_info2[0]
        nation = split_info2[1]
        ftype = split_info2[2]
        years.append(year)
        nations.append(nation)
        types.append(ftype)

        score = each.xpath('./div/div[2]/div[2]/div/span[2]/text()')[0]
        scores.append(score)

        # Character-set strip is safe here: the remaining text is the count
        # itself and none of the stripped characters can occur in it.
        num = each.xpath('./div/div[2]/div[2]/div/span[4]/text()')[0].strip('人评价')
        rating_nums.append(num)

        ws.append([title1, title2, link, dirt, ac, year, nation, ftype, score, num])

    print(f'————————————第{page_no}页爬取完毕!——————————————')

# Save the workbook.
wb.save('data.xlsx')
print('——————————————————————————————————爬虫结束!!!!!————————————————————————————————————————————————')
本站资源均来自互联网，仅供研究学习，禁止违法使用和商用，由此产生的法律纠纷本站概不负责！如果侵犯了您的权益，请与我们联系！
转载请注明出处: 免费源码网-免费的源码资源网站 » bexcel
发表评论 取消回复