requests模块实践:爬取淘宝商品信息和价格

文章最后更新时间为:2018年08月14日 10:08:28

代码:

import json
import re
from urllib.parse import quote

import requests
#获取当前url的html文档
def htmlget(url): 
    try:
        kv={'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}
        r=requests.get(url,headers=kv)
        r.raise_for_status
        r.encoding=r.apparent_encoding
        return r.text
    except:
        return ''
  


<!--more-->


      
#对html文档进行解析找到信息存到数组中
def htmlparse(ilt,html):
    try:
        plt = re.findall(r'\"view_price\"\:\"[\d\.]*\"',html)
        tlt = re.findall(r'\"raw_title\"\:\".*?\"',html)
        for i in range(len(plt)):
            price = eval(plt[i].split(':')[1])
            title = eval(tlt[i].split(':')[1])
            ilt.append([price , title])
    except:
        print("")
#把提取到的数组中的数据写到txt文件中去
def htmlreadin(ilt):
    file=open("E:/python/taobao.txt","r+",encoding='utf8')
    file.write("{:^10}{:^10}{:^30}".format("number","price","name")+'\n')
    print("hhh")
    for i in range(len(ilt)):
        file.write("{:^10}{:^10}{:^30}".format(i, ilt[i][0], ilt[i][1])+'\n')
def main():
    goods="篮球"
    page=2
    url='https://s.taobao.com/search?q='+goods
    ilt=[]
    for ii in range(page):
        try:
            html=htmlget(url+'&s='+str(ii*44))#切换商品下一页面,观察得到每两个页面隔44
            htmlparse(ilt,html)
        except:
            continue
    htmlreadin(ilt)
main()

    

效果:

1 + 5 =
快来做第一个评论的人吧~