pyquery 相当于 Python 版的 jQuery:用 jQuery 风格的 API 和 CSS 选择器来解析、查询 HTML 文档
安装
pip install pyquery
导入
from pyquery import PyQuery as pq
对象解析
urlparse = pq(url='https://www.baidu.com/')
# urlparse = pq(url='https://www.baidu.com/',headers=USER_HEADER) # USER_HEADER 为自定义的请求头字典,例如 {'User-Agent': '...'}
fileparse=pq(file='./test.html') # 本文不做过多阐述
reqparse = pq(req.text) # 其中 req 是事先通过 req = requests.get('https://www.baidu.com/') 得到的响应对象;本文不做过多阐述
元素定位:直接使用css选择器
element = reqparse(css表达式).items() # css表达式 是一个CSS选择器字符串,例如 '.herolist img';.items() 返回一个generator,遍历它即可逐个获取匹配到的元素
元素属性值/文本获取:属性值用 .attr('属性名') 或 .attr.属性名 获取,文本用 .text() 获取;下面的示例读取 img 元素的 src 和 alt 属性
from pyquery import PyQuery as pq
import requests
from pathlib import Path

# Download the image of every hero listed on the page into ./hero/,
# naming each file after the image's alt text.
# encoding='gbk' is passed explicitly — presumably the page is GBK-encoded;
# verify against the site if the alt text comes out garbled.
hero_url = pq(url='https://pvp.qq.com/web201605/herolist.shtml', encoding='gbk')
heros = hero_url('.herolist img').items()  # generator of PyQuery elements
Path('hero').mkdir(exist_ok=True)
for hero in heros:
    src = hero.attr.src
    hero_name = hero.attr.alt
    # pyquery returns None for a missing attribute; skip such images instead
    # of failing on `'https:' + None` or writing a file literally named "None".
    if not src or not hero_name:
        continue
    # The script assumes src is protocol-relative ("//host/path"), so only
    # the scheme is prepended.
    img_url = 'https:' + src
    # Fetch before opening the file so a failed download leaves no empty
    # file behind; the timeout keeps a stalled connection from hanging forever.
    response = requests.get(img_url, timeout=10)
    response.raise_for_status()  # do not save an HTTP error page as a .png
    with open(f'./hero/{hero_name}.png', 'wb') as f:
        f.write(response.content)
import requests
from pathlib import Path
from selenium import webdriver
from selenium.webdriver.common.by import By

# Same download task as a browser-driven script: open the hero list in
# Chrome, read each image's src/alt, and save the images into ./hero1/.
driver = webdriver.Chrome()
try:
    driver.get('https://pvp.qq.com/web201605/herolist.shtml')
    driver.maximize_window()  # maximize the browser window
    # Implicit wait: when locating elements, wait up to 5 s if none is found yet.
    driver.implicitly_wait(5)
    # Every hero image on the page; each item is a selenium WebElement.
    heros = driver.find_elements(By.CSS_SELECTOR, '.herolist img')
    # The output directory must exist before open() is called.
    Path('hero1').mkdir(exist_ok=True)
    for hero in heros:
        img_url = hero.get_attribute('src')  # value of the src attribute
        hero_name = hero.get_attribute('alt')
        # Skip images lacking either attribute rather than requesting an
        # empty URL or writing a file named "None".
        if not img_url or not hero_name:
            continue
        # Fetch before opening the file so a failed download leaves no empty
        # file behind; the timeout keeps a stalled connection from hanging.
        response = requests.get(img_url, timeout=10)
        response.raise_for_status()  # do not save an HTTP error page as a .png
        pic_content = response.content
        with open(f'./hero1/{hero_name}.png', 'wb') as f:
            f.write(pic_content)
finally:
    # Always shut the browser down, even if a download fails midway.
    driver.quit()