Python crawlers commonly rely on two third-party modules, requests and BeautifulSoup, both of which must be installed manually first (e.g. pip install requests beautifulsoup4 lxml; lxml is the parser used in the examples below).
requests is responsible for downloading the page data; BeautifulSoup is responsible for parsing the page's tags.
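A minimal sketch of that division of labour, assuming the lxml parser is installed (https://example.com is only a placeholder URL):

import requests
import bs4

# requests fetches the raw HTML; BeautifulSoup turns it into a searchable tag tree.
response = requests.get("https://example.com")
page = bs4.BeautifulSoup(response.text, "lxml")
print(page.title.text)  # prints the text of the page's <title> tag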
For details on the BeautifulSoup API (find_all() and related calls), see the API documentation: https://beautifulsoup.readthedocs.io/zh-cn/v4.4.0/#find-all
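As a quick illustration of the two calls used throughout this post, here is a self-contained find_all()/find() sketch on an inline HTML snippet shaped like the Douban markup (the snippet's content is made up):

import bs4

html = """
<div class="reply-doc"><p class="reply-content">first comment a@b.com</p></div>
<div class="reply-doc"><p class="reply-content">second comment</p></div>
"""
soup = bs4.BeautifulSoup(html, "lxml")

# find_all() returns a list of every matching tag
divs = soup.find_all("div", attrs={"class": "reply-doc"})
print(len(divs))  # 2

# find() returns only the first match (or None if nothing matches)
first_p = divs[0].find("p", attrs={"class": "reply-content"})
print(first_p.text)  # first comment a@b.com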
Example: scraping email addresses from Douban comments:
import re        # regular expressions
import requests  # downloads the web page
import bs4       # BeautifulSoup, parses the web page

headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}

mail_list = []  # list that stores the harvested email results

# Douban has an anti-scraping mechanism, so requests must send browser-like headers,
# otherwise it cannot fetch the page at all.
response = requests.get('https://www.douban.com/group/topic/102346598/?_i=5308140i1GN13-', headers=headers1)
# print(response.text)

# Parse the page text with the lxml parser
page_obj = bs4.BeautifulSoup(response.text, "lxml")
reply_divs = page_obj.find_all("div", attrs={"class": "reply-doc"})  # find all comment divs
# print(len(reply_divs))

if reply_divs:
    for div in reply_divs:  # walk the divs and parse each comment
        reply_div = div.find_next("p", attrs={"class": "reply-content"})
        # Match an email with a regular expression; flags=re.A keeps \w ASCII-only,
        # so Chinese characters on either side are excluded from the match.
        mail_re = re.search(r"\w+@\w+\.\w+", reply_div.text, flags=re.A)
        if mail_re:  # the comment contains an email, so also grab its publish time
            times = div.find_next("span", attrs={"class": "pubtime"})
            mail_list.append([mail_re.group(), times.text])

print(mail_list)
print(len(mail_list))
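A short aside on the flags=re.A used above: in Python 3, \w normally matches Unicode word characters, including Chinese, so without the flag the surrounding text would be swallowed into the match. A tiny demonstration with a made-up comment string:

import re

text = "联系我test_01@example.com谢谢"  # made-up comment text with Chinese on both sides

# Without re.A, \w also matches the Chinese characters around the address
print(re.search(r"\w+@\w+\.\w+", text).group())              # 联系我test_01@example.com谢谢
# With re.A, \w is restricted to [A-Za-z0-9_], so only the address is captured
print(re.search(r"\w+@\w+\.\w+", text, flags=re.A).group())  # test_01@example.com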
Douban comments can be paginated; to scrape the comment data from every page, the script can be reworked as follows:
import re        # regular expressions
import requests  # downloads the web pages
import bs4       # BeautifulSoup, parses the web pages

headers1 = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Host': 'www.douban.com',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'
}

def download_page(url1):
    # Fetch the first page
    print(f"downloading page {url1}")
    response = requests.get(url1, headers=headers1)
    page_obj = bs4.BeautifulSoup(response.text, "lxml")
    bs4_page_obj_list = [page_obj]  # keep the first page

    # Download every paginated page first, then extract the emails in one pass
    url_set = set()  # collects the URLs of the remaining pages
    paginator_ele = page_obj.find("div", attrs={"class": "paginator"})
    if paginator_ele:  # a single-page topic has no paginator
        for a_ele in paginator_ele.find_all("a"):
            url_set.add(a_ele.attrs.get("href"))

    for url in url_set:  # walk the other pages (the first page is already stored)
        print(f"downloading page {url}")
        response = requests.get(url, headers=headers1)
        bs4_page_obj = bs4.BeautifulSoup(response.text, "lxml")
        bs4_page_obj_list.append(bs4_page_obj)  # stash it for later parsing

    return bs4_page_obj_list

def fetch_emails(page_obj_list):
    mail_list = []
    for bs4_obj in page_obj_list:  # loop over every downloaded page
        reply_divs = bs4_obj.find_all("div", attrs={"class": "reply-doc"})
        for div in reply_divs:
            reply_div = div.find("p", attrs={"class": "reply-content"})
            mail_re = re.search(r"\w+@\w+\.\w+", reply_div.text, flags=re.A)
            if mail_re:
                pub_time = div.find("span", attrs={'class': "pubtime"})
                print(pub_time.text, mail_re.group())
                mail_list.append([mail_re.group(), pub_time.text])
    print(f'Total number of emails found: {len(mail_list)}')
    return mail_list

all_bs4_page_list = download_page("https://www.douban.com/group/topic/102346598/?_i=5308140i1GN13-")
mail_list = fetch_emails(all_bs4_page_list)
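To keep the harvested addresses instead of only printing them, the mail_list returned by fetch_emails() can be written out with the standard csv module. A minimal sketch; the filename douban_mails.csv is an arbitrary choice:

import csv

# Persist the [email, publish_time] pairs returned by fetch_emails() above.
# "douban_mails.csv" is just a placeholder filename.
with open("douban_mails.csv", "w", newline="", encoding="utf-8") as f:
    writer = csv.writer(f)
    writer.writerow(["email", "pubtime"])  # header row
    writer.writerows(mail_list)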