Python爬虫视频教程零基础小白到scrapy爬虫高手-轻松入门
化工网批量获取,并存储到excel内
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 15 09:35:28 2016

Scrape document links from bzwxw.com search-result pages and write them
into column A of an existing Excel workbook (sites.xlsx / Sheet1).

@author: Administrator
"""
import requests
import bs4
import openpyxl

EXCEL_NAME = "sites.xlsx"
SHEET_NAME = "Sheet1"

# The real site has 2798 result pages; only 10 are used for testing.
# PAGES = 2798
PAGES = 10

# Search-result URL prefix; the 1-based page number is appended.
PAGE_FRONT = (
    "http://www.bzwxw.com/index.php?info%5Bcatid%5D=0&info%5Btitle%5D=gb"
    "&m=content&c=search&a=init&catid=13&dosubmit=1&page="
)


def crawl_onePage_sites(page):
    """Fetch one search-result page and return the list of document hrefs.

    page: full URL of a search-result page.
    Returns a list of href strings (one per matching anchor); may be empty.
    """
    res = requests.get(page, timeout=30)  # timeout so a stalled request can't hang the run
    res.encoding = "gbk"  # site serves GBK-encoded pages
    soup = bs4.BeautifulSoup(res.text, "lxml")
    # Anchors carrying class "blue fn" hold the real document links.
    anchors = soup.find_all(attrs={"class": "blue fn"})
    return [a.get("href") for a in anchors]


def main():
    """Crawl all result pages and write the collected links to the workbook."""
    sites_list = []
    # BUG FIX: range(1, PAGES) skipped the last page; range is half-open,
    # so PAGES + 1 is needed to actually fetch PAGES pages.
    for i in range(1, PAGES + 1):
        sites_list.extend(crawl_onePage_sites(PAGE_FRONT + str(i)))

    wb = openpyxl.load_workbook(EXCEL_NAME)
    # get_sheet_by_name() is deprecated/removed; index the workbook directly.
    sheet = wb[SHEET_NAME]
    # Write links into column A starting at row 2 (row 1 left for a header).
    for row, site in enumerate(sites_list, start=2):
        sheet["A" + str(row)].value = site
    wb.save(EXCEL_NAME)


if __name__ == "__main__":
    main()