requests模块实践:爬取淘宝商品信息和价格
文章最后更新时间为:2018年08月14日 10:08:28
代码:
import re
import requests
#获取当前url的html文档
def htmlget(url):
try:
kv={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
r=requests.get(url,headers=kv)
r.raise_for_status
r.encoding=r.apparent_encoding
return r.text
except:
return ''
<!--more-->
#对html文档进行解析找到信息存到数组中
def htmlparse(ilt,html):
try:
plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"',html)
tlt = re.findall(r'\"raw_title\"\:\".*?\"',html)
for i in range(len(plt)):
price = eval(plt[i].split(':')[1])
title = eval(tlt[i].split(':')[1])
ilt.append([price , title])
except:
print("")
#把提取到的数组中的数据写到txt文件中去
def htmlreadin(ilt):
file=open("E:/python/taobao.txt","r+",encoding='utf8')
file.write("{:^10}{:^10}{:^30}".format("number","price","name")+'\n')
print("hhh")
for i in range(len(ilt)):
file.write("{:^10}{:^10}{:^30}".format(i, ilt[i][0], ilt[i][1])+'\n')
def main():
goods="篮球"
page=2
url='https://s.taobao.com/search?q='+goods
ilt=[]
for ii in range(page):
try:
html=htmlget(url+'&s='+str(ii*44))#切换商品下一页面,观察得到每两个页面隔44
htmlparse(ilt,html)
except:
continue
htmlreadin(ilt)
main()
效果: