Python多线程下载黑白网学习资源库文件
前言
前几天(周),发现了一个信息安全资源网站"黑白网",上面有一大堆信息安全相关的资料。啊这,像我这种看到资料就想收藏的人哪里忍得了,奈何资料有点多,就写了个 Python3 脚本挂着下载。
主要是官网显示居然要在今年内取消一切服务,这还不赶紧下载保存??
原始脚本【无多线程】
import os
import re
import time
from pathlib import Path

import requests

# Index page that lists every downloadable file.
url = 'https://edu.heibai.org/?c=cache'
# Base URL that the relative file names from the index are appended to.
url2 = 'https://edu.heibai.org/'
# Local download directory.
# NOTE(review): the doubled backslashes look like an escaping artifact of the
# article; the literal is kept as-is -- confirm the intended directory.
path = 'E:\\\\信息安全\\\\\\\\test\\\\'
# Seconds to wait on any single HTTP request, so a stalled connection raises
# instead of blocking the script forever.
REQUEST_TIMEOUT = 30


def mkdir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Leading/trailing whitespace is stripped from ``path`` first.

    Returns:
        True if the directory was created, False if it already existed.
    """
    path = path.strip()
    if os.path.exists(path):
        return False
    os.makedirs(path)
    return True


# Step 1: fetch the index page and cache the relative file names, one per
# line, in heibai.txt.
req = requests.get(url, timeout=REQUEST_TIMEOUT)
req.raise_for_status()
a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
with open('heibai.txt', 'w', encoding='utf8') as f:
    for i in a:
        f.write(i + '\n')

# Step 2: download every listed file that is not already present locally.
with open('heibai.txt', 'r', encoding='utf8') as f:
    for i in f:
        # Strip only the line terminator; a final line without one must not
        # lose its last character.
        i = i.rstrip('\n')
        if not i:
            continue
        my_file = Path(path + i)
        if my_file.exists():
            continue
        if '/' in i:
            # Everything before the last '/' is the directory part.
            new = re.findall(r"(.*)/", i)
            mkdir(path + new[0])
        print('Downloading:'+i+' '+time.asctime(time.localtime(time.time())))
        try:
            r = requests.get(url2 + i, timeout=REQUEST_TIMEOUT)
            # Do not store an HTTP error page under the file's name.
            r.raise_for_status()
        except requests.RequestException as exc:
            # Nothing has been written yet, so the file is retried on the
            # next run because it still does not exist locally.
            print('Failed:' + i + ' ' + str(exc))
            continue
        with open(path + i, "wb") as code:
            code.write(r.content)
print("Finnish!")
挂着下载到本地,后来发现是真的慢,就考虑改用多线程下载。
多线程脚本
本来没学过多线程,临时抱佛脚学了一会原理,然后就直接搬网上的脚本改了一下
# -*- coding: utf-8 -*-
import os
import re
import sys
import threading
import time
from pathlib import Path

import requests

# Index page that lists every downloadable file.
url = 'https://edu.heibai.org/?c=cache'
# Base URL that the relative file names from the index are appended to.
url2 = 'https://edu.heibai.org/'
# Local download directory.
# NOTE(review): the doubled backslashes look like an escaping artifact of the
# article; the literal is kept as-is -- confirm the intended directory.
path = 'E:\\\\信息安全\\\\安全书及笔记\\\\heibai\\\\'
# Seconds to wait on any single HTTP request before giving up on it.
REQUEST_TIMEOUT = 30
# Number of byte ranges (and therefore threads) used per file. Files are
# processed one at a time and each file spawns at most this many workers, so
# no semaphore is needed to cap concurrency.
threadnum = 2


def txt():
    """Fetch the index page and write the relative file names to heibai.txt.

    One name is written per line. Any network/HTTP error propagates to the
    caller; there is no retry here.
    """
    req = requests.get(url, timeout=REQUEST_TIMEOUT)
    req.raise_for_status()
    a = re.findall(r"<a href = '\./\./(.*)' target = ", req.text)
    with open('heibai.txt', 'w', encoding='utf8') as f:
        for i in a:
            f.write(i + '\n')


def mkdir(path):
    """Create directory ``path`` (including parents) if it does not exist.

    Returns:
        True if the directory was created, False if it already existed.
    """
    path = path.strip()
    if os.path.exists(path):
        return False
    os.makedirs(path)
    return True


class MulThreadDownload(threading.Thread):
    """Worker thread that downloads one inclusive byte range of a URL.

    Args:
        url: Full URL of the file to download.
        startpos: First byte offset of the range (inclusive).
        endpos: Last byte offset of the range (inclusive).
        f: Binary file object opened for writing. It must have its own file
            offset (i.e. come from its own ``open`` call); the thread closes
            it when it finishes.

    After ``join()``, ``error`` holds the exception that aborted the download,
    or None on success.
    """

    def __init__(self, url, startpos, endpos, f):
        super(MulThreadDownload, self).__init__()
        self.url = url
        self.startpos = startpos
        self.endpos = endpos
        self.fd = f
        self.error = None

    def download(self):
        """Request the byte range and write it at ``startpos`` in the file."""
        headers = {"Range": "bytes=%s-%s" % (self.startpos, self.endpos)}
        res = requests.get(self.url, headers=headers, timeout=REQUEST_TIMEOUT)
        res.raise_for_status()
        # A 200 here means the server ignored the Range header and sent the
        # whole file; writing that at startpos would corrupt the output.
        if res.status_code != 206:
            raise RuntimeError(
                'server did not honour Range request (HTTP %s)' % res.status_code
            )
        self.fd.seek(self.startpos)
        self.fd.write(res.content)

    def run(self):
        # Exceptions raised in a thread never reach the main thread, so keep
        # the failure for the caller to inspect after join().
        try:
            with self.fd:
                self.download()
        except Exception as exc:
            self.error = exc


def _split_ranges(filesize, parts):
    """Split ``0..filesize-1`` into at most ``parts`` inclusive byte ranges.

    The last range absorbs the remainder. Files smaller than ``parts`` bytes
    get fewer ranges, and an empty file yields no ranges.
    """
    if filesize <= 0:
        return []
    parts = max(1, min(parts, filesize))
    step = filesize // parts
    ranges = []
    for idx in range(parts):
        start = idx * step
        end = filesize - 1 if idx == parts - 1 else start + step - 1
        ranges.append((start, end))
    return ranges


def _remote_size(file_url):
    """Return the Content-Length reported by a HEAD request on ``file_url``.

    Retries every 10 minutes until a usable size is obtained, which lets the
    script ride out a temporary ban or outage.
    """
    while True:
        try:
            resp = requests.head(file_url, timeout=REQUEST_TIMEOUT)
            return int(resp.headers['Content-Length'])
        except (requests.RequestException, KeyError, ValueError):
            print('10分钟后重试连接服务器!')
            time.sleep(60*10)


# Files the site refuses to serve; they are skipped.
Blacklist = ['思维导图/移动安全/.DS_Store',]

txt()
with open('heibai.txt', 'r', encoding='utf8') as listing:
    for filename in listing:
        # Strip only the line terminator; a final line without one must not
        # lose its last character.
        filename = filename.rstrip('\n')
        if not filename or filename in Blacklist:
            continue
        target = path + filename
        if Path(target).exists():
            continue
        if '/' in filename:
            # Everything before the last '/' is the directory part.
            new = re.findall(r"(.*)/", filename)
            mkdir(path + new[0])

        file_url = url2 + filename
        filesize = _remote_size(file_url)
        print('Downloading:'+filename+' '+time.asctime(time.localtime(time.time())))

        # Create (or truncate) the output file before the workers open it.
        with open(target, 'wb'):
            pass

        mtd_list = []
        for start, end in _split_ranges(filesize, threadnum):
            # Each worker gets its own handle from a separate open() call so
            # its seek()/write() cannot be disturbed by another worker.
            fd = open(target, 'rb+')
            t = MulThreadDownload(file_url, start, end, fd)
            t.start()
            mtd_list.append(t)

        for t in mtd_list:
            t.join()

        errors = [t.error for t in mtd_list if t.error is not None]
        if errors:
            # Remove the partial file so the exists() check above does not
            # treat it as complete on the next run.
            os.remove(target)
            print('Failed:' + filename + ' ' + str(errors[0]))
后言
这是闲得慌,硬盘又闲又大
写得有点乱
脚本
思维导图/移动安全/.DS_Store
这文件被网站拦截,不能下载
似乎多线程太快了,服务器把我ip给ban了一会,所以是否使用多线程脚本看自己的网速
后来多线程中加入了延时重试连接服务器,应该可以晚上挂着下载了
我只在脚本的多线程下载部分加入了重连,所以如果脚本运行前 IP 就已经被 ban,脚本在获取文件列表时就会直接报错
原始脚本下载一段时间,服务器会超时,脚本会卡住
再后来,直接在文件下载之间睡眠5s,好像效果还行
# Excerpt from the body of the per-file loop: `filename`, `my_file`, `path`,
# `url2` and `mkdir` come from the surrounding script.
if not my_file.exists():
    if '/' in filename:
        # Everything before the last '/' is the directory part.
        new = re.findall(r"(.*)/", filename)
        mkdir(path+new[0])
    # Pause between files so the server is less likely to ban the client.
    print('防止被ban,暂停5s中...')
    time.sleep(5)
    while 1:
        try:
            # The timeout makes a stalled connection raise, so it reaches the
            # retry branch below instead of blocking forever.
            filesize = int(requests.head(url2+filename, timeout=30).headers['Content-Length'])
        except Exception:
            # Deliberately broad: any failure (network error, missing or
            # malformed Content-Length) waits 10 minutes and tries again.
            print('10分钟后重试连接服务器!')
            time.sleep(60*10)
            continue
        break
    print('Downloading:'+filename+' '+time.asctime(time.localtime(time.time())))