The code first:
import requests
import json
import time
url = "https://rate.taobao.com/feedRateList.htm"
params = {
"auctionNumId":"551228275021",
"userNumId":"1674366195",
"currentPageNum":"1",
"pageSize":"20",
"rateType":"3",
"orderType":"sort_weight",
"attribute":"",
"sku":"",
"hasSku":"false",
"folded":"0",
"ua":"098#E1hvGpvWvPvvUvCkvvvvvjiWRsdO1j3nPFSh6j1VPmPW6jiEPF5yAjrPRLSOlj3E9vhvVe3wjPGIzHi47eR2zAuU6WRM6cnC4IjpKG4ndvhvmpvUGvETJ9vOPUOCvvpvCvvv29hvCvvvMM/Uvpvj2vmC9jHvQ89Cvv9vvUC2jJeO+f9CvmFMMQCGS6vvgQvv96Cvpvp/vvm2phCvhRvvvUnvphvppvvv963vpCmvmvhvLvAkfQvjRNkxfwpwdeQEfwFwaXTAVA5paN2Q+ulApz7QD76XV369D7zUQ8TxEcqUzj7Q+ulAp5c6Ano4hAx/AnCl+b8rwyxlYPexdByvvpvVvmvvvhCvRvhvCvvvvvmevpvhvvmv9F9Cvvpvvvvv",
"_ksTS":"1676613945868_2922",
"callback":"jsonp_tbcrate_reviews_list"
}
headers = {
'cookie': 'isg=BLq63PMdjdrAvwAsQVL_kIOOCODcaz5FVm1iksSz983Et1jxrPkJVfyBB8NrPLbd; l=fBaNSyjqLdRtmRtwBO5IEurza779gIR4zkPzaNbMiIEGa6wdGIZLzNCexBqBPdtfgTCAVetPIAtcpdU9WOzd0ETNJqHzH13bzxJ9-etzR; tfstk=cnP5BA_Yx3x7O1i2d0_V8eYzkmlAZs8sRTiuPgXV_rMOP8afiBOZfrQYKHhEJq1..; t=7726f24eb9e862e31f90beb9a64630a0; cna=ithoHFV8jlYCAXFotH3kFibV; sgcookie=E100d%2BTRD%2Febqlo1lqhlX8QMqWkMiuZ%2BnVUTg4Vk3MGrP3qjqQZw%2FUaD7uaf31EjnwtAdzNfWjK4GqEJo15TQP7zaiY0H9tla9s7dEnxVvKHB0Q%3D; uc3=id2=UUphwocR7BRT9edm5Q%3D%3D&nk2=0o8%2FnXBGkvTgGSdoApFcWQ%3D%3D&lg2=URm48syIIVrSKA%3D%3D&vt3=F8dCvjyn7yIk2zOE0CE%3D; lgc=%5Cu7B14%5Cu6746%5Cu5B50%5Cu529E%5Cu516C%5Cu65D7%5Cu8230%5Cu5E97; uc4=nk4=0%400D4kcvaqbgtfR8uEG8IjbABnAOj3UmMQ%2Ba2o&id4=0%40U2grGR8RjYrYeyzJ7CYb6fFThiG%2FG%2FjO; tracknick=%5Cu7B14%5Cu6746%5Cu5B50%5Cu529E%5Cu516C%5Cu65D7%5Cu8230%5Cu5E97; _cc_=UtASsssmfA%3D%3D; _m_h5_tk=9ad75fdfd7530809ea95d08dce8b331f_1676623602088; _m_h5_tk_enc=4f0cccbf9cdad7cb7383e575d4dd2675; mt=ci=0_1; thw=cn; cookie2=1039d83058f4eaa2f428efb1523370a4; _tb_token_=e697e5856e15b; v=0; uc1=pas=0&cookie15=VT5L2FSpMGV7TQ%3D%3D&tmb=1&cookie21=WqG3DMC9EdFmJgke4tCSRQ%3D%3D&cookie14=UoezSckqI64VdQ%3D%3D&cookie16=VT5L2FSpNgq6fDudInPRgavC%2BQ%3D%3D&existShop=true; xlly_s=1; _samesite_flag_=true; unb=2208092032956; csg=e0ab1db7; cancelledSubSites=empty; cookie17=UUphwocR7BRT9edm5Q%3D%3D; dnk=%5Cu7B14%5Cu6746%5Cu5B50%5Cu529E%5Cu516C%5Cu65D7%5Cu8230%5Cu5E97; skt=be7fdf5b453704b6; existShop=MTY3NjYxMzUwNg%3D%3D; _l_g_=Ug%3D%3D; sg=%E5%BA%9764; _nk_=%5Cu7B14%5Cu6746%5Cu5B50%5Cu529E%5Cu516C%5Cu65D7%5Cu8230%5Cu5E97; cookie1=BYlty4Hl0E049Ew0r6wFoRPzJm4uOVqjRMkbuC2MJuQ%3D; x5sec=7b22726174656d616e616765723b32223a223366376136336265326230636665616631373337373936366437333237363962434a2f42764a3847454c3743784a504d386f725533414561447a49794d4467774f5449774d7a49354e5459374d7a435a7872704251414d3d227d',
'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:109.0) Gecko/20100101 Firefox/110.0',
'Referer': 'https://item.taobao.com/item.htm?spm=a230r.1.14.38.1d4f6788YUHEp4&id=551228275021&ns=1&abbucket=18'
}
html = requests.get(url, params=params, headers=headers).text
# the body is JSONP: jsonp_tbcrate_reviews_list( {...} ), so slice out just the JSON object
start = html.find('{"qnaDisabled"')
ends = html.rfind('}') + 1
# print(html)  # uncomment to inspect the raw response and check the markers
count = 0
for p in json.loads(html[start:ends])["comments"]:
    if not p['photos']:          # skip reviews that have no photos
        continue
    for photo in p['photos']:
        # drop the "_400x400"-style thumbnail suffix to get the full-size image URL
        res = 'https:' + photo['url'].split('_400')[0]
        print(res)
        # T = requests.get(res)
        # with open('pinglun2/{}.jpg'.format(count), 'wb') as f:
        #     f.write(T.content)
        # count += 1
        # time.sleep(5)
After the earlier work on Tmall reviews, this one is finally complete.
If you only print(res) at the end, the output looks like this:
Only one page is scraped here. If you need more pages, just wrap the page number in params (currentPageNum) in a for loop, as sketched below.
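A minimal sketch of that pagination loop, reusing url, params and headers from the script above; the page range and the all_photo_urls list are my own additions for illustration:

all_photo_urls = []
for page in range(1, 5):                          # e.g. pages 1-4; adjust to the real maxPage
    params["currentPageNum"] = str(page)
    html = requests.get(url, params=params, headers=headers).text
    start = html.find('{"qnaDisabled"')
    ends = html.rfind('}') + 1
    for p in json.loads(html[start:ends])["comments"]:
        for photo in p['photos']:
            all_photo_urls.append('https:' + photo['url'].split('_400')[0])
    time.sleep(5)                                  # pause between pages
print(len(all_photo_urls))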
If you then download these results with with open, some of the images will not open; Taobao appears to impose some restriction. I tested quite a few approaches and the success rate is only about half. The comment data itself always downloads fine, so the limit must be something on Taobao's side, because the same code downloads every image correctly from other sites.
That is something to look into later; the main point this time was to learn by working through Taobao reviews.
Finally, a summary of the points that cost the most time:
1. Taobao, like Tmall, changes the cookie roughly every half hour, so it has to be re-captured periodically.
2. What comes back is JSON (inside a JSONP callback), so positioning start and ends requires understanding exactly where the JSON object's braces { } open and close.
3. Taobao reviews are structured like Tmall's: there are regular reviews and follow-up reviews. (While studying this I found that Tmall actually needs an extra check: if a follow-up review includes photos, an extra key/value for those follow-up photos appears, and since Tmall uses the same key for regular-review photos and follow-up photos, the loop throws an error as soon as it reaches a follow-up that has photos.)
4. The if statement checks whether a regular review has photos; reviews without photos are skipped.
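For point 2, here is a small sketch of how the slicing can avoid hard-coded end markers, assuming the body really is a single jsonp_tbcrate_reviews_list( {...} ) wrapper as the callback parameter suggests (the helper name jsonp_to_dict is mine):

def jsonp_to_dict(body):
    # keep everything between the first '(' and the last ')' of callback( {...} )
    start = body.find('(') + 1
    end = body.rfind(')')
    return json.loads(body[start:end])

data = jsonp_to_dict(html)
print(len(data["comments"]))   # should match pageSize while there are enough reviews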
The rest is the same as the previous Tmall-reviews post: download the images into a target folder with with open. I also tried time.sleep(5), thinking that slowing down a bit would let all the images download, but that did not work either.
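For reference, this is the with open download step from the commented-out lines above, pulled into its own loop; it assumes the full-size links were collected into a list called photo_urls (a name I am introducing here), and per the testing described above only about half of the saved files open correctly:

import os

os.makedirs('pinglun2', exist_ok=True)             # same target folder as in the comments
for count, res in enumerate(photo_urls):
    T = requests.get(res)
    with open('pinglun2/{}.jpg'.format(count), 'wb') as f:
        f.write(T.content)
    time.sleep(5)                                  # slowing down did not recover the broken images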
So for now, scraping all the image links and opening them by hand is the more reliable route. But if the images cannot be batch-downloaded, and the cookie also has to be replaced every so often, then scraping reviews from either Taobao or Tmall becomes pretty inefficient. It does not feel all that useful.