简单的爬虫完整代码 教程
爬虫简单流程 1, 分析网页找出目标url 2, python用urllib2下载url内容 3, 用BeautifulSoup (bs4)解析网页 4, 从解析出来的数据中选择爬取目标 5, 将目标写入本地。 注意事项 1, 添加header(主要为cookie和网站代理)防止封ip 2, python编码问题:下载解析过程中不改变编码方式,等最后写入本地时再更改编码方式。
# -*- coding: utf-8 -*-
# Scraper 1: collect every recent notice title from the school site
# http://www.snnu.edu.cn/tzgg/ and save them to result.json (Python 2).
import urllib2                 # downloads web pages
from bs4 import BeautifulSoup  # parses HTML
import json                    # serialises the scraped data
import sys

# Python 2's default codec is ascii; switch it to utf-8 so the str/unicode
# mixing below does not raise UnicodeDecodeError.  This pairs with the
# .decode("unicode-escape") call when writing the file at the end.
reload(sys)
sys.setdefaultencoding('utf-8')

resultData = []  # one {"title": ...} dict per scraped notice

# Notice pages are numbered .../tzgg/218.htm down to .../tzgg/1.htm;
# walk them from the newest page number to the oldest.
url = 'http://www.snnu.edu.cn/tzgg/'
count = 1
for i in range(1, 219):
    num = 219 - i
    url_com = url + str(num) + '.htm'  # build this page's URL
    response = urllib2.urlopen(url_com)
    print(response)
    # print(response.getcode())  # 200 would confirm the fetch succeeded
    soup = BeautifulSoup(response, 'html.parser')
    # "con_newslist" is the container element holding the notice list
    link = soup.find_all(class_='con_newslist')[0]
    links = link.find_all('li')
    for li in links:
        data = {
            'title': li.find('a').text
        }
        print(count)
        count = count + 1
        resultData.append(data)  # keep this page's titles

# json.dumps escapes non-ascii as \uXXXX; decode those escapes so the
# file on disk contains readable utf-8 text.
with open('result.json', 'wb') as f:
    f.write(json.dumps(resultData).decode('unicode-escape'))
# -*- coding: utf-8 -*-
# Scraper 2: Douban Top-250 — movie name, poster URL, info line, score and
# short comment for each film, saved to douban.json (Python 2).
# Pages look like https://movie.douban.com/top250?start=25&filter=
import urllib2  # downloads web pages
import sys      # needed for the default-encoding switch below
import json     # serialises the scraped data

# ascii -> utf-8 default codec (Python 2 only hack)
reload(sys)
sys.setdefaultencoding('utf-8')

from bs4 import BeautifulSoup  # parses HTML

url = "https://movie.douban.com/top250?start="
heads = {
}
num = 0
resultData = []  # one dict per movie
count = 1
for i in range(0, 10):
    # 10 pages of 25 movies each: start=0, 25, 50, ... 225
    num = i * 25
    url_com = url + str(num) + "&filter="
    # To send request headers instead, use:
    #   request = urllib2.Request(url_com, headers=heads)
    #   response = urllib2.urlopen(request)
    response = urllib2.urlopen(url_com)
    soup = BeautifulSoup(response, 'html.parser')
    # "grid_view" is the <ol> holding the 25 movie entries of this page
    link = soup.find_all(class_="grid_view")[0]
    links = link.find_all("li")
    for li in links:
        print(count)
        try:
            data = {
                # attrs gives the tag's attributes, e.g. alt="movie name"
                "movieName": li.find("img").attrs["alt"],
                "movieImg": li.find("img").attrs["src"],
                # Python 2 text is unicode internally; encode to utf-8 so it
                # stores correctly.  strip() removes surrounding whitespace.
                "info": li.find("div", class_="bd").find('p').text.encode('utf8').strip(),
                "score": li.find(class_="star").find(class_="rating_num").text,
                "comment": li.find(class_="inq").text
            }
            json.dumps(data, encoding="UTF-8", ensure_ascii=False)
            resultData.append(data)
            count = count + 1
        except AttributeError:
            # Some movies have no short comment: .find(class_="inq") returns
            # None and .text raises AttributeError, so store a blank comment.
            # NOTE(review): if the error fires before "comment" is evaluated,
            # `data` may still hold the previous movie's fields — verify.
            data["comment"] = " "
            resultData.append(data)
            count = count + 1

with open('douban.json', 'wb') as f:
    # A list object cannot be written to a file directly: dump it to a JSON
    # string first, then un-escape \uXXXX so the file is readable utf-8.
    # indent=4 pretty-prints each entry with 4-space indentation.
    f.write(json.dumps(resultData, indent=4).decode("unicode-escape"))
print("that's all")
# -*- coding: utf-8 -*-
# Scraper 3: query the Baidu Index API for a keyword's daily index values
# over 2017-2018, one request per day (Python 2).  Example endpoint:
#   http://index.baidu.com/api/SearchApi/index?area=0&word=西安&startDate=...&endDate=...
# Impossible dates (e.g. day 31 of a short month) return "bad request" and
# are simply skipped, so no calendar logic is needed.
#
# Handy trick to detect a page's encoding with chardet:
#   import chardet
#   response = urllib2.urlopen(url)
#   print(chardet.detect(response.read()))
import sys
import urllib2
import json

data_f = []  # accumulates one record per (keyword, day)

# ascii -> utf-8 default codec (Python 2 only hack)
reload(sys)
sys.setdefaultencoding('utf-8')

# cookie and User-Agent copied from the browser's developer tools; the API
# rejects requests that do not look like a logged-in browser session.
header = {
    "cookie": "BAIDUID=D990C407249FE8E70BB0E1BCC69067A3:FG=1; BIDUPSID=D990C407249FE8E70BB0E1BCC69067A3; PSTM=1547021385; bdindexid=j946ivqfo9rq50k3mfsa5hcsb5; BDSFRCVID=508sJeCCxG3JggR9c11XMVqhqAFOeQZRddMu3J; H_BDCLCKID_SF=tR30WJbHMTrDHJTg5DTjhPrMhh-jbMT-027OKKOF5b3CbJOxXfrY24_d-H7lW-QIyHrb0p6athF0hDvYh4Oq2KCV-frb-C62aKDsob7I-hcqEpO9QTbkbP-wefbjqPTDWC5f_CnIBb5GoxogbMchDUThDHR02t3-MPoa3RTeb6rjDnCrjj0WKUI8LPbO05JZ5KvNVR8htxnYS4jdKfQPXPLuWR5bKUrtt2LE3-oJqCD5bD-63J; Hm_lvt_d101ea4d2a5c67dab98251f0b5de24dc=1557584073,1557625526,1557626135,1557636493; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; H_PS_PSSID=1465_21119_18560_28519_28774_28722_28964_28831_28585; delPer=0; PSINO=2; BDUSS=VsbWYyUHpDeTZFTUxMbjNiZ0pTM2VqWmhqR3c0d0xQd3JhZTEyd1FzRVFhfjljRVFBQUFBJCQAAAAAAAAAAAEAAABgjFgvemp6MTQyAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABDe11wQ3tdcWk; CHKFORREG=25a112f22b13124156fa13139499188d; Hm_lpvt_d101ea4d2a5c67dab98251f0b5de24dc=1557651091",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
def getdata(key_1, url_comg):
    # Fetch one day's index JSON for one keyword, append the parsed record
    # to the global data_f, then persist the whole list via savedata.
    request = urllib2.Request(url_comg, headers=header)  # send cookie + UA
    response = urllib2.urlopen(request)
    data = json.load(response)
    # Non-existent dates (e.g. Feb 30) answer "bad request": skip quietly.
    if str(data["message"]) == "bad request":
        return
    data = {
        "key": key_1,
        "startData": str(data["data"]["userIndexes"][0]["pc"]["startDate"]),
        "endData": str(data["data"]["userIndexes"][0]["pc"]["endDate"]),
        "mavg": str(data["data"]["generalRatio"][0]["all"]["avg"]),  # overall avg
        "avg": str(data["data"]["generalRatio"][0]["wise"]["avg"])   # mobile avg
    }
    data_f.append(data)
    savedata(data_f)  # rewrite the output file after every record
def savedata(data_fina):
    # Write the accumulated records to baidu_1.json as pretty-printed JSON;
    # decoding the \uXXXX escapes leaves readable utf-8 text on disk
    # (pairs with the sys.setdefaultencoding('utf-8') call at file top).
    with open("baidu_1.json", "wb") as f:
        f.write(json.dumps(data_fina, indent=4).decode("unicode-escape"))
def getkeyword():
    """Return the list of keywords to query the index API for."""
    keylist = ["西安"]
    return keylist
def spider(url1, sy, ey):
    # Build one URL per (keyword, year, month, day) and hand each to
    # getdata.  Days run 1..32: impossible dates get "bad request" from the
    # API and are skipped inside getdata, so no calendar logic is needed.
    # NOTE(review): range(12, 13) covers December only — confirm whether
    # the other months were intentionally excluded.
    key = getkeyword()
    for k in key:
        for i in range(sy, ey):
            for j in range(12, 13):
                for f in range(1, 33):
                    day = str(i) + "-" + str(j) + "-" + str(f)
                    url_com = url1 + k + "&startDate=" + day + "&endDate=" + day
                    # Bug fix: pass the single keyword k, not the whole
                    # keyword list, as getdata's key_1.
                    getdata(k, url_com)
if __name__ == "__main__":
    # Entry point: crawl keyword indexes for 2017, then print everything
    # collected in data_f as readable (non-escaped) JSON.
    url = "http://index.baidu.com/api/SearchApi/index?area=0&word="
    startyear = 2017
    endyear = 2018
    spider(url, startyear, endyear)
    print(json.dumps(data_f).decode("unicode-escape"))
# -*- coding: utf-8 -*-
# Scraper 4: Weibo hot-search titles and (partially) pictures (Python 2).
# Known limitation: only the topic's own picture can be fetched; pictures
# from user posts cause IndexError / wrong-type problems (see get_data).
import urllib    # urlretrieve, for downloading image files
import urllib2   # downloads web pages

from bs4 import BeautifulSoup  # parses HTML

# Cookie and User-Agent copied from the browser's developer tools so the
# requests look like a logged-in browser session.
header = {
    "Cookie": "SINAGLOBAL=9411374167025.424.1552041554396; UOR=,v.ifeng.com; login_sid_t=6bc26fe217fbe09d05225f5654146e49; cross_origin_proto=SSL; _s_tentry=-; Apache=2129606814883.5312.1557713131893; ULV=1557713131897:3:2:1:2129606814883.5312.1557713131893:1557536542243; SUBP=0033WrSXqPxfM725Ws9jqgMF55529P9D9WWahfmB37nahga6ao9IhFid5JpX5o275NHD95QcSK24ShM4e0-0Ws4DqcjJi--Ri-zXi-iWCs-LxK-LB--LBoqLxKqL1-eL1h.LxK.L1K2LB-zt; SSOLoginState=1557713230; SUB=_2A25x3KECDeRhGeBL6lsV9yfPzj-IHXVSq5XKrDV8PUNbmtAKLWHakW9NRyhOGFlGQ_u-eepwRtim_EAJSV9wzzbY; SUHB=0Ela5I-B2gp8kP; ALF=1589249232; wvr=6; webim_unReadCount=%7B%22time%22%3A1557713248360%2C%22dm_pub_total%22%3A0%2C%22chat_group_pc%22%3A0%2C%22allcountNum%22%3A2%2C%22msgbox%22%3A0%7D",
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"
}
datas = []  # scraped picture URLs (see get_data)
pics = []   # reserved for the unfinished title+picture variant
def loadpic(pic):
    # Download every picture URL in pic to the desktop as <index>.jpg.
    for i in range(0, len(pic)):
        # NOTE(review): the backslashes were lost in the original paste;
        # the path is presumably C:\Users\snnucs\Desktop\pic<i>.jpg — confirm.
        home = "C:\\Users\\snnucs\\Desktop\\pic" + str(i) + ".jpg"
        try:
            urllib.urlretrieve(pic[i], home)
        except Exception:
            # best-effort: skip URLs whose format cannot be saved
            print("meiyoufuhegeshi")
def get_data(urls):
    # For each hot-search path in urls, fetch the page (with headers) and
    # try to pull the topic card's image URL into the global datas list.
    print(len(urls))
    for i in range(0, len(urls)):
        request = urllib2.Request("https://s.weibo.com/" + urls[i], headers=header)
        response = urllib2.urlopen(request)
        soup = BeautifulSoup(response, 'html.parser')
        try:
            # This chain can fail: IndexError when the containers are
            # missing, AttributeError when the card has no image.
            pic = soup.find_all(class_="wbs-feed")[0].find_all(class_="m-main")[0].find(class_="card-topic-a").find("img")["src"]
        except AttributeError:
            print("meizhaodao")  # image not found on this page
        except IndexError:
            print("yuejie")      # expected container missing
        # NOTE(review): if the very first page raises above, `pic` is still
        # unbound here and this line raises NameError — needs a guard.
        datas.append(pic)
    return datas
    # Unfinished variant (title + picture together) — kept for reference:
    # data = {
    #     "titile": soup.find_all(class_="wbs-feed")[0].find_all(class_="m-main")[0].find(class_="card-topic-a").find(class_="info").find(class_="title").find("a").text,
    #     "pic": soup.find_all(class_="wbs-feed")[0].find_all(class_="m-main")[0].find(class_="card-topic-a").find("img")["src"]
    # }
    # datas.append(data)
    # return datas
def get_url(url_g)?/两次下载解析的目的是热搜51条里 要先下载一次获取 所有51条