Python多线程下载黑白网学习资源库文件
前言
前几天(上周),发现一个信息安全的资源网站黑白网,看到一堆关于信息安全的资料。啊这,像我这种看到资料就想收藏的人哪里忍得了,奈何资料有点多,就写个Python3脚本挂着下载
主要是官网显示居然要今年取消一切服务,这还不下载保存??
原始脚本【无多线程】
# Single-threaded downloader: scrape the heibai.org cache index, save the
# file list to heibai.txt, then fetch every file that is not already on disk.
import os
import re
import time
from pathlib import Path

import requests

url = 'https://edu.heibai.org/?c=cache'  # index page listing all cached files
url2 = 'https://edu.heibai.org/'         # base URL the relative paths hang off
path = 'E:\\信息安全\\test\\'             # local download root

# Fetch the index and pull every relative file path out of its <a> links.
req = requests.get(url)
a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)

# Persist the list so the download loop (and later reruns) can read from it.
with open('heibai.txt', 'w+', encoding='utf8') as f:
    for i in a:
        f.write(i + '\n')


def mkdir(path):
    """Create directory *path* if missing; return True if it was created."""
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


with open('heibai.txt', 'r', encoding='utf8') as f:
    for i in f:
        # rstrip instead of i[0:-1]: the last line may lack a trailing
        # newline, and slicing would then eat a real character.
        i = i.rstrip('\n')
        my_file = Path(path + i)
        if not my_file.exists():
            # Make sure the target sub-directory exists before writing.
            if '/' in i:
                new = re.findall(r"(.*)/", i)
                mkdir(path + new[0])
            print('Downloading:' + i + ' ' + time.asctime(time.localtime(time.time())))
            r = requests.get(url2 + i)
            with open(path + i, "wb") as code:
                code.write(r.content)
            print("Finnish!")
挂着下载到本地,后来发现是真的慢,就考虑多线程下载??
多线程脚本
本来没学过多线程,临时抱佛脚学了一会原理,然后就直接搬网上的脚本改了一下
# -*- coding: utf-8 -*-
# Multithreaded downloader: split each file into byte ranges and fetch the
# ranges concurrently via HTTP Range requests, writing through dup'd fds.
import os
import re
import sys
import threading
import time
from pathlib import Path

import requests

url = 'https://edu.heibai.org/?c=cache'          # index page listing all files
url2 = 'https://edu.heibai.org/'                 # base URL for downloads
path = 'E:\\信息安全\\安全书及笔记\\heibai\\'      # local download root


def txt():
    """Scrape the index page and write one relative file path per line."""
    req = requests.get(url)
    a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w+', encoding='utf8') as f:
        for i in a:
            f.write(i + '\n')


def mkdir(path):
    """Create directory *path* if missing; return True if it was created."""
    path = path.strip()
    if not os.path.exists(path):
        os.makedirs(path)
        return True
    return False


class MulThreadDownload(threading.Thread):
    """Download one inclusive byte range [startpos, endpos] of *url* into *f*.

    *f* must be a file object opened for positional binary writes; each
    thread receives its own dup'd descriptor so seeks do not interfere.
    """

    def __init__(self, url, startpos, endpos, f):
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.fd = f

    def download(self):
        # Ask the server for exactly this thread's slice.
        headers = {"Range": "bytes=%s-%s" % (self.startpos, self.endpos)}
        res = requests.get(self.url, headers=headers)
        try:
            self.fd.seek(self.startpos)
            self.fd.write(res.content)
        finally:
            # This thread owns its dup'd descriptor; close it so descriptors
            # are not leaked for every range of every file.
            self.fd.close()

    def run(self):
        self.download()


# Files the site refuses to serve (requests for them get blocked).
Blacklist = ['思维导图/移动安全/.DS_Store', ]

txt()
with open('heibai.txt', 'r', encoding='utf8') as f:
    for filename in f:
        filename = filename.rstrip('\n')
        if filename in Blacklist:
            continue
        my_file = Path(path + filename)
        if not my_file.exists():
            if '/' in filename:
                new = re.findall(r"(.*)/", filename)
                mkdir(path + new[0])
            # Retry the HEAD request until the server answers (the site
            # temporarily bans IPs that hit it too fast).
            while 1:
                try:
                    filesize = int(requests.head(url2 + filename).headers['Content-Length'])
                except Exception:
                    print('10分钟后重试连接服务器!')
                    time.sleep(60 * 10)
                    continue
                break
            print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))
            # Thread count per file; exactly ceil(filesize/step) range threads
            # are spawned below, so no extra semaphore is needed to bound them
            # (the original created an unused BoundedSemaphore — dead code).
            threadnum = 2
            step = filesize // threadnum
            mtd_list = []
            start = 0
            end = -1
            # Truncate/create the target file, then reopen it for
            # arbitrary-position binary read/write.
            tempf = open(path + filename, 'w')
            tempf.close()
            # Renamed from 'f' to avoid shadowing the heibai.txt handle.
            with open(path + filename, 'rb+') as dl_file:
                fileno = dl_file.fileno()
                # Slice bytes [0, filesize) into contiguous inclusive ranges.
                # For an 11-byte file the last valid offset is 10, so clamp
                # to filesize-1 (the original clamped to filesize, requesting
                # one byte past the end, and let end == filesize through).
                while end < filesize - 1:
                    start = end + 1
                    end = min(start + step - 1, filesize - 1)
                    # Duplicate the descriptor so each thread seeks/writes
                    # independently on the same open file.
                    dup = os.dup(fileno)
                    fd = os.fdopen(dup, 'rb+', -1)
                    # Bug fix: download the file itself (url2+filename) —
                    # the original passed 'url', i.e. the index page, so the
                    # threads fetched byte ranges of the index HTML instead.
                    t = MulThreadDownload(url2 + filename, start, end, fd)
                    t.start()
                    mtd_list.append(t)
                # Wait for every range of this file before moving on.
                for i in mtd_list:
                    i.join()
后言
这是闲得慌,硬盘也空得大
写的有点乱
脚本
思维导图/移动安全/.DS_Store
这文件被网站拦截,不能下载
似乎多线程太快了,服务器把我ip给ban了一会,所以是否使用多线程脚本看自己的网速
后来多线程中加入了延时重试连接服务器,应该可以晚上挂着下载了
我只在脚本的多线程下载部分加入重连,故如果脚本运行前就被banIP,则脚本报错
原始脚本下载一段时间,服务器会超时,脚本会卡住
再后来,直接在文件下载之间睡眠5s,好像效果还行
# Excerpt from the per-file loop of the multithreaded script: a 5-second
# pause is inserted before each download so the server's rate limiting does
# not ban the client IP. (Indentation restored; the blog export flattened it.)
if not my_file.exists():
    if '/' in filename:
        new = re.findall(r"(.*)/", filename)
        mkdir(path + new[0])
    print('防止被ban,暂停5s中...')
    time.sleep(5)
    # Retry the HEAD request until the server answers.
    while 1:
        try:
            filesize = int(requests.head(url2 + filename).headers['Content-Length'])
        except Exception:
            print('10分钟后重试连接服务器!')
            time.sleep(60 * 10)
            continue
        break
    print('Downloading:' + filename + ' ' + time.asctime(time.localtime(time.time())))