# Thread pool: create a batch of threads up front. The user just submits tasks
# to the pool; scheduling of the tasks is handled by the pool itself.
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor


def func(name, n=100):
    """Demo task: print *name* together with a counter, *n* times.

    The mangled original called ``range()`` with no argument (a TypeError);
    the intended count is not recoverable, so it is exposed as the
    parameter ``n`` (default 100) — a backward-compatible generalization.

    :param name: label printed on every line
    :param n: how many lines to print
    :return: None (side effect only: writes to stdout)
    """
    for i in range(n):
        print(name, i)


if __name__ == "__main__":
    # Create the thread pool. Leaving the with-block waits until every
    # submitted task has finished before execution continues (join/guard
    # semantics), so the final print only runs after all tasks are done.
    with ThreadPoolExecutor(50) as t:
        # NOTE(review): the original range() argument was lost in the
        # whitespace mangling; 100 tasks is a chosen demo value.
        for i in range(100):
            t.submit(func, name=f"线程{i}")
    print("线程执行完毕")

# Thread pool / process pool in practice: crawl Xinfadi (新发地) with a thread pool
# 1. How to extract the data of a single page
# 2. Add a thread pool so multiple pages can be fetched at the same time
import requests
from lxml import etree
import csv
from concurrent.futures import ThreadPoolExecutor

# NOTE(review): this module-level file handle is shared by all worker threads
# and is never closed/flushed in the visible source — confirm the full file
# closes it after the pool finishes, or rows may be lost on exit.
f = open("data.csv", "w", encoding="utf-8", newline="")
csv_writer = csv.writer(f)


def download_one_page(url):
    """Fetch one Xinfadi listing page and append its table rows to data.csv.

    :param url: page URL; the page number is assumed to be the last path
        component minus a 6-char suffix (``url.split("/")[-1][:-6]``) —
        TODO confirm against the actual URL template in the full file.
    :return: None (side effects: writes CSV rows, prints a progress line)
    """
    # Grab the page source.
    resp = requests.get(url)
    try:
        html = etree.HTML(resp.text)
        table = html.xpath("/html/body/div[2]/div[4]/div[1]/table")[0]
        # Skip the header row. The mangled original read "./tr[position()1]",
        # which is invalid XPath; the adjacent commented-out "./tr"[1:] slice
        # shows the intent, so the predicate is restored to position() > 1.
        trs = table.xpath("./tr[position()>1]")
        for tr in trs:
            # One cell text per <td>; writerow consumes the generator lazily.
            txt = tr.xpath("./td/text()")
            # Light clean-up: drop the "\" and "/" characters from each cell.
            txt = (item.replace("\\", "").replace("/", "") for item in txt)
            csv_writer.writerow(txt)
        page = url.split("/")[-1][:-6]
        print(f"第{page}页下载完成")
    finally:
        # Original closed the response only on the success path; a parse or
        # xpath failure would leak the connection. finally fixes that.
        resp.close()


if __name__ == "__main__":
    # NOTE(review): the source is truncated mid-line here. Only the start of a
    # commented-out sequential loop is visible:
    #     for i in range(1, ...):
    #         url = f"...
    # The page-URL template and the ThreadPoolExecutor fan-out that presumably
    # followed are missing from this view — recover them from the full file
    # before relying on this entry point.
    pass