# Download every image whose URL matches the mi-mall path pattern and save
# it under the file name captured from the URL.
pattern = r'mi-mall/(.*?jpg)'  # raw string for the regex
for src in img_src_list:
    match = re.search(pattern, src)
    if match is None:
        # Guard added: the original called .group(1) unconditionally and
        # would raise AttributeError on any URL without the pattern.
        continue
    img_name = match.group(1)
    img_data = requests.get(url=src, headers=headers).content
    with open(img_name, 'wb') as fp:
        fp.write(img_data)
bs4（BeautifulSoup）是 Python 特有的一种网页解析方式，其他语言没有直接对应的实现。
本地html:
1 2 3 4 5 6 7
from bs4 import BeautifulSoup

# Parse a local HTML file: BeautifulSoup accepts an open file object.
with open(path, 'r', encoding='utf-8') as fp:
    soup = BeautifulSoup(fp, 'lxml')

# Parse a page fetched over HTTP: pass the response body as a string.
html_text = requests.get(url=url, headers=headers).text
soup = BeautifulSoup(html_text, 'lxml')
# Collect every chapter link under div.box_con's <dl> and write the chapter
# titles to titles.txt, one per line.
titles = soup.select('div.box_con dl a')
# utf-8 instead of iso-8859-1: the titles are Chinese text, which latin-1
# cannot encode — the original open(...) would raise UnicodeEncodeError on
# the first write.
with open('titles.txt', 'w', encoding='utf-8') as fp:
    for title in titles:
        if title.string:  # skip anchors with no direct text (string is None)
            fp.write(title.string + "\n")
# Fetch each chapter page and append its title plus body text to contents.txt.
# Pattern hoisted out of the loop — it is loop-invariant.
pattern = '<div id="content"><!--go-->(.*?)<!--over-->'
# utf-8 instead of iso-8859-1: the chapter text is Chinese and latin-1
# cannot encode it (write would raise UnicodeEncodeError).
with open('contents.txt', 'w', encoding='utf-8') as fp:
    for title in titles:
        new_url = url + title['href']
        content_text = requests.get(url=new_url, headers=headers).text
        match = re.search(pattern, content_text, re.S)
        if match is None:
            # Guard added: a page without the expected markers would have
            # crashed the original with AttributeError on .group(1).
            continue
        fp.write(title.string)
        contents = match.group(1)
        # The source markup indents paragraphs with &nbsp; entities; the
        # original replace(' ', ' ') was a no-op (space for space) and was
        # almost certainly meant to strip those. Handle both the entity and
        # the decoded U+00A0 character.
        contents = contents.replace('&nbsp;', ' ').replace('\xa0', ' ')
        contents = contents.replace('<br/>', '\n')
        fp.write(contents)
        fp.write('\n\n\n')
        fp.flush()  # push each chapter to disk as soon as it is written
xpath 是最常用且最便捷高效的解析方式。
1 2 3 4 5 6 7 8
from lxml import etree
# 本地html tree = etree.parse(path)
# Remote HTML: fetch the page over HTTP, then parse the returned string.
resp_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(resp_text)
标签定位。
1 2 3 4 5 6
r = tree.xpath('/html/head/title') # 返回Element的列表,/开头表示从根标签开始 r = tree.xpath('/html//title') # 两个斜杠//表示多个层级 r = tree.xpath('//title') # 所有的title标签 r = tree.xpath('//div[@class="classname"]') # 属性定位 r = tree.xpath('//div/p[3]') # 索引定位,从1开始 r = tree.xpath('//div/a | //div/p') # 定位两个标签
文本获取。
1 2 3
r = tree.xpath('//a/text()') # 直系文本,依然存储在列表中 r = tree.xpath('//a//text()') # 标签下的全部内容 r = tree.xpath('//img/@src') # 属性值
实例
1 2 3 4 5 6 7 8 9
# Scrape the 58.com second-hand-housing listing: one table row per listing,
# printing each title and its price (in units of 10,000 yuan).
url = 'https://www.58.com/ershoufang/'
html = requests.get(url=url, headers=headers).text
tree = etree.HTML(html)
for row in tree.xpath('//div[@id="global"]//tr'):
    # ./ makes the XPath relative to the current row element
    name = row.xpath('./td[2]/a/text()')[0]
    price = row.xpath('./td[3]//text()')[0]
    print(name)
    print(price + "万")
1 2 3 4 5 6 7 8 9 10
# Download every operator icon from the PRTS wiki operator-list page into
# the icon/ directory.
url = 'http://prts.wiki/w/干员一览'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
for node in tree.xpath('//div[@class="smwdata"]'):
    icon_url = node.xpath('./@data-icon')[0]
    # the last path segment is the percent-encoded file name
    file_name = unquote(icon_url.split('/')[-1])
    icon_bytes = requests.get(url=icon_url, headers=headers).content
    with open(os.path.join('icon', file_name), 'wb') as fp:
        fp.write(icon_bytes)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Download every skin poster for every 王者荣耀 (King of Glory) hero.
url = 'https://pvp.qq.com/web201605/js/herolist.json'
hero_list = requests.get(url=url, headers=headers).json()
for hero in hero_list:
    cname = hero["cname"]
    # Bug fix: ename (the numeric hero id used in every URL below) was never
    # read from the record, so the original raised NameError on the first
    # iteration.
    ename = hero["ename"]
    detail_url = "https://pvp.qq.com/web201605/herodetail/{}.shtml".format(ename)
    # the detail pages are GBK-encoded, so decode the raw bytes explicitly
    detail_page = requests.get(url=detail_url, headers=headers).content.decode('gbk')
    tree = etree.HTML(detail_page)
    # data-imgname holds all skin names joined with '|'
    pics = tree.xpath('//div[@class="pic-pf"]/ul/@data-imgname')[0]
    pic_name_list = pics.split('|')
    for i, pic_name in enumerate(pic_name_list):
        # skin posters are numbered from 1
        pic_url = 'https://game.gtimg.cn/images/yxzj/img201606/skin/hero-info/{}/{}-bigskin-{}.jpg'.format(ename, ename, i + 1)
        pic_name = pic_name.split('&')[0]  # drop the suffix after '&' in each skin name
        img_data = requests.get(url=pic_url, headers=headers).content
        with open(os.path.join('skin', cname + '_' + pic_name + '.jpg'), 'wb') as fp:
            fp.write(img_data)
# Spin up a pool of 4 worker processes and apply func to every element of
# array in parallel; map blocks until all results are collected.
from multiprocessing import Pool
pool = Pool(4)
ret = pool.map(func, array)
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
def func(img_url):
    """Download a single icon image and save it into the icon/ directory.

    The file name is the percent-decoded last path segment of the URL.
    Defined at module level so multiprocessing.Pool can pickle it.
    (Fix: the source had 'deffunc' — a missing space made it a syntax error.)
    """
    name = unquote(img_url.split('/')[-1])
    img_data = requests.get(url=img_url, headers=headers).content
    with open(os.path.join('icon', name), 'wb') as fp:
        fp.write(img_data)
if __name__ == "__main__":
    # Collect every icon URL from the operator-list page, then download
    # them concurrently with a pool of 10 worker processes.
    listing_url = 'http://prts.wiki/w/干员一览'
    listing_html = requests.get(url=listing_url, headers=headers).text
    doc = etree.HTML(listing_html)
    icon_urls = doc.xpath('//div[@class="smwdata"]/@data-icon')
    workers = Pool(10)
    workers.map(func, icon_urls)
    workers.close()  # no more tasks will be submitted
    workers.join()   # wait for all downloads to finish
单线程+异步协程。
使用 task 或 future：
1 2 3 4 5 6 7 8 9 10 11 12 13
async def func():
    """A coroutine function: calling it returns a coroutine object; the body
    does not run until the coroutine is driven by an event loop.

    (Fix: the source had 'asyncdeffunc' — missing spaces made it a syntax
    error; it must be 'async def func'.)
    """
    pass
c = func() # 获得协程对象 loop = asyncio.get_event_loop() # 创建事件循环