Python
#-*-coding:utf-8-*-
#By:EastmountCSDN
importurllib.request
importre
frombs4importBeautifulSoup
importcodecs
#-------------------------------------爬虫函数-------------------------------------
defcrawl(url,headers):
page=urllib.request.Request(url,headers=headers)
page=urllib.request.urlopen(page)
contents=page.read()
soup=BeautifulSoup(contents,"html.parser")
infofile.write("")
print(爬取豆瓣电影:\n)
fortaginsoup.find_all(attrs={"class":"item"}):
#爬取序号
num=tag.find(em).get_text()
print(num)
infofile.write(num+"\r\n")
#电影名称
name=tag.find_all(attrs={"class":"title"})
zwname=name[0].get_text()
print([中文名称],zwname)
infofile.write("[中文名称]"+zwname+"\r\n")
#网页链接
url_movie=tag.find(attrs={"class":"hd"}).a
urls=url_movie.attrs[href]
print([网页链接],urls)
infofile.write("[网页链接]"+urls+"\r\n")
#爬取评分和评论数
info=tag.find(attrs={"class":"star"}).get_text()
info=info.replace(\n,)
info=info.lstrip()
print([评分评论],info)
#获取评语
info=tag.find(attrs={"class":"inq"})
if(info):#避免没有影评调用get_text()报错
content=info.get_text()
print([影评],content)
infofile.write(u"[影评]"+content+"\r\n")
print()
#-------------------------------------主函数-------------------------------------
if__name__==__main__:
#存储文件
infofile=codecs.open("Result_Douban.txt",a,utf-8)
#消息头
headers={User-Agent:Mozilla/5.0(WindowsNT10.0;Win64;x64)\
AppleWebKit/.36(KHTML,likeGecko)Chrome/67.0..99Safari/.36}
#翻页
i=0
whilei10:
print(页码,(i+1))
num=i*25#每次显示25部URL序号按25增加
url=