# Data fetched by multiple threads would arrive out of order if written straight to Excel,
# so save it to a CSV file plus image files first, then combine them afterwards.
from openpyxl import load_workbook, Workbook        # load_workbook reads, Workbook writes
from openpyxl.drawing.image import Image            # insert images into the Excel sheet
from openpyxl.styles import Alignment               # center cell contents
from PIL import Image as I                          # resize downloaded images
from concurrent.futures import ThreadPoolExecutor   # multi-threaded fetching
import requests
from lxml import etree
import os
import csv
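# Third-party packages used above: requests, lxml, openpyxl, Pillow (PIL) -- install via pip if missing.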
# test one
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36'
}
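# Workflow of this script: GetUrl() collects detail-page links from each list page,
# Get_Data() scrapes title/link/description into game_data.csv and downloads the cover
# image, and Save() merges the CSV rows and cached images into game.xlsx.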
def GetUrl(url, div_num=6):  # the first page has a big headline so its list is div[8]; later pages use div[6]
    # url = "https://www.4399.com/flash/new_2.htm"  # test_demo
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    tree = etree.HTML(response.text)
    lis = tree.xpath(f'/html/body/div[{div_num}]/ul/li')
    # print(len(lis))  # each list page holds 2070 entries
    for li in lis:
        url_a = "https://www.4399.com" + li.xpath('./a/@href')[0]
        # print(url_a)
        every_urls.append(url_a)
def Get_Data(url):
    response = requests.get(url=url, headers=headers)
    response.encoding = 'gb2312'
    if response.status_code != 200:
        return
    tree = etree.HTML(response.text)
    a = 'https://www.4399.com' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/@href')[0]
    title = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[1]/h1/a/text()')[0]
    font = tree.xpath('/html/body/div[7]/div[1]/div[1]/div[2]/div[4]/div/font/text()')[0]
    with open("./game_data.csv", "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow([title, a, font])  # append one row to the CSV
    img = 'https:' + tree.xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div[1]/a/img/@src')[0]
    ext = img.split(".")[-1]
    content = requests.get(url=img, headers=headers).content
    # each thread names its image after the game title, so concurrent writes do not overwrite each other
    with open(f"./游戏图片/{title}.{ext}", "wb") as pic:
        pic.write(content)
    test.append("1")
def Save():
    # combine the saved CSV rows and images into one Excel file
    with open("game_data.csv", "r", encoding="utf-8", errors='ignore') as f:
        datas = list(csv.reader(f))
    length = len(datas)
    # print(len(datas))
    wb = Workbook()
    sheet = wb.active
    # center the text cells
    alignment = Alignment(horizontal='center', vertical='center')
    # set row heights and column widths so the images fit
    for num in range(1, length + 1):
        sheet.row_dimensions[num].height = 75.5
        sheet.column_dimensions['A'].width = 20
        try:
            # shrink the downloaded image to half size and cache it; anything that is
            # not a .jpg (or failed to download) falls back to the placeholder image
            tp = I.open(f'./游戏图片/{datas[num - 1][0]}.jpg')
            w, h = tp.size
            np = tp.resize((w // 2, h // 2))
            np.save(f'./图片缓存/{datas[num - 1][0]}.jpg')
            image_path = f'./图片缓存/{datas[num - 1][0]}.jpg'
        except Exception:
            image_path = './无.jpg'  # placeholder image
        try:  # guard against index errors on malformed rows
            img = Image(image_path)
            # img.anchor = sheet.cell(row=1, column=1).coordinate
            sheet.add_image(img, f'A{num}')  # anchor the image in column A of this row
            sheet.column_dimensions['B'].width = 22
            sheet[f'B{num}'] = datas[num - 1][0]
            sheet[f'B{num}'].alignment = alignment
            sheet.column_dimensions['C'].width = 38.18
            sheet[f'C{num}'] = datas[num - 1][1]
            sheet[f'C{num}'].alignment = alignment
            sheet[f'D{num}'] = datas[num - 1][2]
            sheet[f'D{num}'].alignment = alignment
        except Exception:
            pass
    wb.save('game.xlsx')
if __name__ == '__main__':
    test = []  # counts how many detail pages were actually scraped
    # make sure both image folders exist before the threads start writing into them
    for folder in ('./游戏图片', './图片缓存'):
        if not os.path.exists(folder):
            os.mkdir(folder)
            print("folder created:", folder)
    # open with "w" to truncate the CSV so each run starts from a clean file
    f = open("./game_data.csv", "w", encoding="utf-8", newline="")
    every_urls = []
    # "https://www.4399.com/flash/new.htm" is the first list page
    urls = []
    for i in range(2, 11):
        urls.append(f'https://www.4399.com/flash/new_{i}.htm')
    # print('added the links for pages 2-10', urls)
    GetUrl("https://www.4399.com/flash/new.htm", div_num=8)  # first page uses div[8]
    # print(urls)  # all list-page links
    # crawl the remaining list pages with a thread pool
    with ThreadPoolExecutor(max_workers=10) as e:
        for url in urls:
            e.submit(GetUrl, url)
    print("collected all detail-page urls with the thread pool...")
    # print(every_urls)
    # print(len(every_urls))
    with ThreadPoolExecutor(max_workers=100) as e:
        for url in every_urls:
            e.submit(Get_Data, url)
    print("number of pages actually scraped:", len(test))
    f.close()
    print("waiting for the images and data to finish saving...")
    # combine the CSV data with the images
    Save()
    print("data fetched and saved")
    # test
    # Get_Data('https://www.4399.com/flash/240995.htm')