
公众号:uncle39py
2022/09/06阅读:52主题:默认主题
7.动态网页分析技术:selenium
动态网站的抓取
动态网页:指html元素通过ajax或者js加载的
通过抓取京东-商品详情页抓取来展开解说
京东是随便点开一个商品,比如手机,会发现像价格、评论之类的信息加载慢,因为这是通过js动态的方式加载过来的。

分析可以看出Elements(js加载完成后)里面有价格信息,而右键网页源码里面却没有


那怎么判断这个网页里面哪些是动态的元素,哪些是静态的元素呢?
我个人的做法是:把网页源码copy一份出来,删除掉其中的<script>
部分,然后用浏览器打开,就可以看到哪些是静态网页的元素.

目标:确定要抓取的内容
抓取的时候尽量把网页上有的信息都抓取下来,否则将来发现某个数据非常有用,就得全部从头抓取一遍,这样还不如第一次抓取的时候就把能看到的细节信息都抓取下来
爬取的内容分为:
- 商品信息
- 商品评价/评论
- 商品评价总结

一.表结构设计
设计数据表的时候有几个重要点一定要注意
char类型, 要设置最大长度
对于无法确定最大长度的字段,可以设置为Text
设计表的时候 采集到的数据要尽量先做格式化处理
default(默认值)和null=True(可为空)
from peewee import *

# NOTE(review): hard-coded local dev credentials — move to config/env before deployment.
db = MySQLDatabase('test',host='127.0.0.1',port=3306,user='root',password='root')


class BaseModel(Model):
    """Base class binding every model in this module to the shared MySQL database."""
    class Meta:
        database = db
class Good(BaseModel):
    """Product master record: one row per JD product detail page."""
    # JD's own numeric product id, reused as the primary key.
    id = IntegerField(primary_key=True,verbose_name='商品id')
    name = CharField(max_length=500,verbose_name='商品名称')
    content = TextField(default='',verbose_name='商品描述')
    # Shop subdomain for third-party sellers; set to "京东" for self-operated items.
    supplier = CharField(max_length=500, default="")
    ggbz = TextField(default="", verbose_name="规格和包装")
    # JSON-encoded list of carousel image URLs. One-to-many data: use a parent/child
    # table only when aggregate queries over the images are needed; otherwise a
    # serialized list in one column is enough.
    image_list = TextField(default="", verbose_name="商品的轮播图")
    price = FloatField(default=0.0, verbose_name="商品价格")
    good_rate = IntegerField(default=0, verbose_name="好评率")
    comments_nums = IntegerField(default=0, verbose_name="评论数")
    has_image_comment_nums = IntegerField(default=0, verbose_name="晒图数")
    has_video_comment_nums = IntegerField(default=0, verbose_name="视频晒单数")
    has_add_comment_nums = IntegerField(default=0, verbose_name="追评数")
    well_comment_nums = IntegerField(default=0, verbose_name="好评数")
    middle_comment_nums = IntegerField(default=0, verbose_name="中评数")
    bad_comment_nums = IntegerField(default=0, verbose_name="差评数")
# Review table: one row per individual customer review of a product.
class GoodEvaluate(BaseModel):
    """A single customer review scraped from the product's review tab."""
    # JD's review guid (a string, hence CharField), taken from the comment
    # node's data-guid attribute on the page.
    id = CharField(primary_key=True)
    good = ForeignKeyField(Good,verbose_name="商品")
    # Avatar image URL of the reviewer.
    user_head_url = CharField(verbose_name="用户头像")
    user_name = CharField(verbose_name="用户名")
    # JSON-encoded list describing the purchased SKU (color, model, ...).
    good_info = CharField(max_length=500, verbose_name="购买的商品的信息")
    evaluate_time = DateTimeField(verbose_name="评价时间")
    content = TextField(default="", verbose_name="评论内容")
    star = IntegerField(default=0, verbose_name="评分")
    comment_nums = IntegerField(default=0, verbose_name="评论数")
    praised_nums = IntegerField(default=0, verbose_name="点赞数")
    # JSON-encoded lists of media URLs attached to the review.
    image_list = TextField(default="", verbose_name="图片")
    video_list = TextField(default="", verbose_name="视频")
# Review-summary table: per-product counts for each review tag shown on the
# page, e.g. "手感不错(120)" -> tag="手感不错", num=120.
class GoodEvaluateSummary(BaseModel):
    """Aggregated review-tag counts for a product."""
    good = ForeignKeyField(Good, verbose_name="商品")
    tag = CharField(max_length=20, verbose_name="标签")
    num = IntegerField(default=0, verbose_name="数量")
if __name__ == '__main__':
    # Create the three tables if they do not exist yet (requires a reachable MySQL).
    db.create_tables([Good,GoodEvaluate,GoodEvaluateSummary])
二.动态网页分析技术:selenium
当数据是通过前端js加载出来时,找出数据是从哪个js加载来的,这叫做逆向工程
如果要通过前端js来分析接口,就比较费力,且不通用.
selenium主要是为了解决requests无法直接执行JS代码的问题。
因为requests只拿回响应数据,js代码还躺在脚本中未被触发执行,js可能的逻辑是会继续向服务器发送ajax请求加载数据
如果没有selenium,则只能手动使用requests再向ajax/js中的url(需要分析)再发请求

其中最重要的是:
- 拿到动态网页加载之后的elements: bro.page_source
- 针对登陆等场景,拿到登陆后的cookies: bro.get_cookies()
- 可以定位到表单以及按钮等元素,自动填写表单及点击按钮

爬取商品详情页代码:
import re
import time
import json
from datetime import datetime
from selenium import webdriver
from scrapy import Selector
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.chrome.options import Options
from jd_spider.models import *
chrome_options = Options()
# Headless mode (no visible browser window); commented out here, presumably for debugging.
# chrome_options.add_argument("--headless")
# Google's docs recommend this flag to work around a GPU-related headless bug.
chrome_options.add_argument('--disable-gpu')
# Skip image loading to speed up page loads.
chrome_options.add_argument("blink-settings=imagesEnabled=false")
# NOTE(review): the executable_path/chrome_options keyword args are deprecated in
# Selenium 4 — confirm the selenium version this project pins.
browser = webdriver.Chrome(executable_path="E:/爬虫0基础入门/chromedriver_win32/chromedriver.exe", chrome_options=chrome_options)
# 1. Start selenium without a visible window (headless)
# 2. Configure selenium not to load images
def process_value(nums_str):
    """Convert a count string scraped from the page into an integer.

    Handles plain digits ("356"), counts with a trailing "+" ("2000+") and
    counts quoted in units of 10 000 ("2.8万" -> 28000) — including the
    decimal form that the previous integer-only regex silently truncated
    ("2.8万" used to parse as 20000).

    :param nums_str: count text, possibly containing "万" and/or "+"
    :return: the parsed count as an int, or 0 when no digits are found
    """
    # Raw-string regex; accept an optional decimal part so "2.8万" is not read as "2万".
    re_match = re.search(r"(\d+(?:\.\d+)?)", nums_str)
    if not re_match:
        return 0
    nums = float(re_match.group(1))
    if "万" in nums_str:
        nums *= 10000
    # Counts are whole numbers; truncate any residual fractional part.
    return int(nums)
def parse_good(good_id):
    """Scrape one JD product detail page and persist the product, its
    review-tag summary rows and every individual review.

    Drives the module-level selenium ``browser``: loads the item page, clicks
    through the "规格与包装" (spec & packaging) and "商品评价" (reviews) tabs,
    then follows the review pager until no "next" link exists.

    :param good_id: JD's numeric product id (used to build the item URL)
    """
    browser.get("https://item.jd.com/{}.html".format(good_id))
    # Parse the JS-rendered DOM (page_source), not the raw HTTP response.
    sel = Selector(text=browser.page_source)
    # --- basic product info ---
    good = Good(id=good_id)
    name = "".join(sel.xpath("//div[@class='sku-name']/text()").extract()).strip()
    # The price is rendered by JS into a span whose class embeds the product id.
    price = float("".join(sel.xpath("//span[@class='price J-p-{}']/text()".format(good_id)).extract()).strip())
    detail = "".join(sel.xpath("//div[@id='detail']//div[@class='tab-con']").extract())
    good_images = sel.xpath("//div[@id='spec-list']//img/@src").extract()
    supplier_info = "".join(sel.xpath("//div[@id='summary-service']").extract())
    # Third-party shops link to <shop>.jd.com; no match means a JD self-operated item.
    re_match = re.search('<a href="//(.*).jd.com', supplier_info)
    if re_match:
        good.supplier = re_match.group(1)
    else:
        good.supplier = "京东"
    good.name = name
    good.price = price
    good.content = detail
    good.image_list = json.dumps(good_images)
    # --- simulate a click on the "spec & packaging" tab ---
    ggbz_ele = browser.find_element_by_xpath("//div[@class='tab-main large']//li[contains(text(), '规格与包装')]")
    ggbz_ele.click()
    time.sleep(3)  # crude fixed wait for the tab content to load via ajax
    sel = Selector(text=browser.page_source)
    ggbz_detail = "".join(sel.xpath("//div[@id='detail']/div[@class='tab-con']").extract())
    good.ggbz = ggbz_detail
    # --- click the "product reviews" tab, then read the aggregated stats ---
    sppj_ele = browser.find_element_by_xpath("//li[@clstag='shangpin|keycount|product|shangpinpingjia_1']")
    sppj_ele.click()
    time.sleep(5)
    sel = Selector(text=browser.page_source)
    tag_list = sel.xpath("//div[@class='tag-list tag-available']//span/text()").extract()
    good_rate = int(sel.xpath("//div[@class='percent-con']/text()").extract()[0])
    good.good_rate = good_rate
    # Counters from the review filter bar (全部评价 / 晒图 / 好评 / ...).
    summary_as = sel.xpath("//ul[@class='filter-list']/li/a")
    for summary in summary_as:
        name = summary.xpath("./text()").extract()[0]
        nums = summary.xpath("./em/text()").extract()[0]
        nums = process_value(nums)
        if name == "晒图":
            good.has_image_comment_nums = nums
        elif name == "视频晒单":
            good.has_video_comment_nums = nums
        elif name == "追评":
            good.has_add_comment_nums = nums
        elif name == "好评":
            good.well_comment_nums = nums
        elif name == "中评":
            good.middle_comment_nums = nums
        elif name == "差评":
            good.bad_comment_nums = nums
        elif name == "全部评价":
            good.comments_nums = nums
    # --- save the product; peewee needs force_insert for new rows with a manual pk ---
    existed_good = Good.select().where(Good.id == good.id)
    if existed_good:
        good.save()
    else:
        good.save(force_insert=True)
    # --- review-tag summaries, e.g. "手感不错(120)" -> ("手感不错", 120) ---
    for tag in tag_list:
        # NOTE(review): non-raw regex string — works, but r"(.*)\((\d+)\)" would be cleaner.
        re_match = re.match("(.*)\((\d+)\)", tag)
        if re_match:
            tag_name = re_match.group(1)
            nums = int(re_match.group(2))
            existed_summarys = GoodEvaluateSummary.select().where(GoodEvaluateSummary.good==good, GoodEvaluateSummary.tag==tag_name)
            if existed_summarys:
                summary = existed_summarys[0]
            else:
                summary = GoodEvaluateSummary(good=good)
            summary.tag = tag_name
            summary.num = nums
            summary.save()
    # --- individual reviews, following the pager until no "next" link is found ---
    has_next_page = True
    while has_next_page:
        all_evalutes = sel.xpath("//div[@class='comment-item']")
        for item in all_evalutes:
            good_evaluate = GoodEvaluate(good=good)
            evaluate_id = item.xpath("./@data-guid").extract()[0]
            print(evaluate_id)
            good_evaluate.id = evaluate_id
            user_head_url = item.xpath(".//div[@class='user-info']//img/@src").extract()[0]
            user_name = "".join(item.xpath(".//div[@class='user-info']/text()").extract()).strip()
            good_evaluate.user_head_url = user_head_url
            good_evaluate.user_name = user_name
            # The star rating is encoded in the element's class name, e.g. "...star5" -> 5.
            star = item.xpath("./div[2]/div[1]/@class").extract()[0]
            star = int(star[-1])
            good_evaluate.star = star
            evaluate = "".join(item.xpath("./div[2]/p[1]/text()").extract()[0]).strip()
            good_evaluate.content = evaluate
            image_list = item.xpath("./div[2]//div[@class='pic-list J-pic-list']/a/img/@src").extract()
            video_list = item.xpath("./div[2]//div[@class='J-video-view-wrap clearfix']//video/@src").extract()
            good_evaluate.image_list = json.dumps(image_list)
            good_evaluate.video_list = json.dumps(video_list)
            praised_nums = int(item.xpath(".//div[@class='comment-op']/a[2]/text()").extract()[0])
            comment_nums = int(item.xpath(".//div[@class='comment-op']/a[3]/text()").extract()[0])
            good_evaluate.praised_nums = praised_nums
            good_evaluate.comment_nums = comment_nums
            # Last span of order-info is the timestamp; the preceding ones describe the SKU.
            comment_info = item.xpath(".//div[@class='order-info']/span/text()").extract()
            order_info = comment_info[:-1]
            evaluate_time = comment_info[-1]
            good_evaluate.good_info = json.dumps(order_info)
            evaluate_time = datetime.strptime(evaluate_time, "%Y-%m-%d %H:%M")
            good_evaluate.evaluate_time = evaluate_time
            # Upsert: update when the review guid already exists, insert otherwise.
            existed_good_evaluates = GoodEvaluate.select().where(GoodEvaluate.id==good_evaluate.id)
            if existed_good_evaluates:
                good_evaluate.save()
            else:
                good_evaluate.save(force_insert=True)
        try:
            next_page_ele = browser.find_element_by_xpath("//div[@id='comment']//a[@class='ui-pager-next']")
            # next_page_ele.click()  # direct click() left disabled; Enter keypress triggers the link instead
            next_page_ele.send_keys("\n")
            time.sleep(5)
            sel = Selector(text=browser.page_source)
        except NoSuchElementException as e:
            # No "next page" link -> we are on the last review page.
            has_next_page = False
if __name__ == "__main__":
    # Example run: scrape a single product by its JD id.
    parse_good(100016931023)
如果需要爬取一个分类的所有商品的详情,可以点击比如手机分类,在整个列表页提取出商品id,调用上面的解析方法即可.
最后
添加客服微信 慕课网爬虫实战课8元购



作者介绍
