博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
scrapy汽车之家车型的简单爬取
阅读量:6408 次
发布时间:2019-06-23

本文共 8478 字,大约阅读时间需要 28 分钟。

汽车之家车型的简单爬取 spider

# -*- coding: utf-8 -*-
"""Scrapy spider that collects car models from the autohome.com.cn index pages.

One index page exists per initial letter (A-Z). Each page lists brands
(``<dl>`` elements); each brand has sub-brands (``div.h3-tit`` headings), and
each sub-brand has a list of cars (``ul.rank-list-ul``).
"""
import string
import sys

import scrapy
from scrapy import Request

from mininova.items import carItem

if sys.version_info[0] == 2:
    # Python 2 only: keep the original default-encoding hack so implicit
    # str/unicode conversions elsewhere in the project keep working.
    # ``reload`` is not a builtin on Python 3, hence the version guard.
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')

# Index page for all brands whose name starts with a given letter.
_INDEX_URL_TEMPLATE = 'https://www.autohome.com.cn/grade/carhtml/{0}.html'
# Unit suffix that follows the upper bound of a price range on the page.
_PRICE_UNIT = u'万'
# Value stored when the page does not show a "low-high" price range.
_PRICE_UNKNOWN = u'暂无'


def _split_price(price):
    """Split a ``low-high<unit>`` quote into ``(min_price, max_price)``.

    Both values are ``_PRICE_UNKNOWN`` when *price* is missing or is not
    exactly two parts separated by ``-``.
    """
    parts = price.split('-') if price else []
    if len(parts) != 2:
        return _PRICE_UNKNOWN, _PRICE_UNKNOWN
    return parts[0], parts[1].replace(_PRICE_UNIT, '')


class SplashSpider(scrapy.Spider):
    """Crawl the per-letter brand index pages and emit one ``carItem`` per car."""

    # Spider name.
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    start_urls = []

    # Per-spider settings: route items through the car pipeline only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter is attached as ``meta['word']`` so ``parse`` can record it.
        It is paired with its own URL here; the original code read the loop
        variable after the loop had finished, so every request carried 'Z'.
        """
        pages = [
            (word, _INDEX_URL_TEMPLATE.format(word))
            for word in string.ascii_uppercase
        ]
        # Rebind on the instance instead of appending to the class-level
        # list, so repeated instantiation does not accumulate duplicates.
        self.start_urls = [url for _, url in pages]
        for word, url in pages:
            yield Request(url, meta={'word': word})

    def parse(self, response):
        """Default callback: return the list of ``carItem`` found on the page."""
        word = response.meta['word']
        self.logger.debug('parsing %s (first letter %s)', response.url, word)
        return list(self._iter_cars(response, word))

    def _iter_cars(self, response, word):
        """Yield a ``carItem`` for every car listed on one index page."""
        for brand_node in response.xpath('//dl'):
            # Brand name.
            brand = brand_node.xpath('./dt/div[1]/a/text()').extract_first()
            if brand is None:
                # A <dl> without a brand link is not a brand entry; skip it
                # rather than aborting the whole page with an IndexError.
                self.logger.warning('brand name missing on %s', response.url)
                continue
            # Brand logo.
            brand_logo_url = brand_node.xpath(
                './dt//img[1]/@src').extract_first(default='')
            # Sub-brand names and their car lists, paired in document order.
            brand_items = brand_node.xpath(
                './dd//div[@class="h3-tit"]/a/text()').extract()
            car_lists = brand_node.xpath('./dd//ul[@class="rank-list-ul"]')

            for brand_item, car_list in zip(brand_items, car_lists):
                for car_node in car_list.xpath('./li[@id]'):
                    # Car name and the URL of its detail page.
                    name = car_node.xpath('./h4/a/text()').extract_first()
                    url = car_node.xpath('./h4/a/@href').extract_first()
                    if name is None or url is None:
                        continue
                    # Quoted price range (lowest-highest).
                    price = car_node.xpath('./div[1]/a/text()').extract_first()
                    min_price, max_price = _split_price(price)
                    self.logger.debug(
                        'car: %s max_price: %s min_price: %s',
                        name, max_price, min_price)

                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    yield car_item

item

# -*- coding: utf-8 -*-
import scrapy


class carItem(scrapy.Item):
    """One car scraped from a brand index page, with its brand context."""

    # Name of the specific car.
    name = scrapy.Field()
    # URL of the car's introduction page.
    url = scrapy.Field()
    # Highest quoted price (unit: 万, i.e. ten thousand).
    max_price = scrapy.Field()
    # Lowest quoted price (unit: 万, i.e. ten thousand).
    min_price = scrapy.Field()
    # Brand name.
    brand = scrapy.Field()
    # Brand logo URL.
    brand_logo_url = scrapy.Field()
    # Sub-brand (brand sub-category) name.
    brand_item = scrapy.Field()
    # Initial letter the brand is indexed under.
    first_word = scrapy.Field()

mongo_car

from mininova.mongodb import Mongo
from mininova.settings import mongo_setting


class MongoCar(object):
    """Persist scraped cars into MongoDB as brand / brand_item / car documents.

    Lookup helpers return the matching document, or ``False`` when there is
    none (the original contract, kept for existing callers).
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Store *item*, creating its brand and sub-brand documents if needed.

        Args:
            item: mapping with the keys ``brand``, ``first_word``,
                ``brand_logo_url``, ``brand_item``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            True when a car document exists after the call, else False.
        """
        brand_where = {'name': item['brand']}
        # The logo is written on first insert too; the original stored it
        # only when the brand was seen a second time.
        brand_doc = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')

        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # The original filter used the key 'name' twice, so the sub-brand
        # value was silently overwritten and cars were matched by name alone.
        # Car documents store the sub-brand as 'brand_item_id', so scope the
        # lookup with that field.
        car_where = {
            'name': item['name'],
            'brand_item_id': brand_item['_id'],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')

        return bool(car)

    def _find_or_false(self, db, set_name, query):
        """Return the first document in *set_name* matching *query*, or False."""
        found = db.set(self.db_name, set_name).find_one(query)
        return False if found is None else found

    def _insert_and_fetch(self, db, set_name, doc):
        """Insert *doc* into *set_name* and return it as read back by find_one."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)

    def update_brand(self, db, brand_where, brand):
        """Apply *brand* as a ``$set`` update and return the matching brand, or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        return self._find_or_false(db, self.brand_set_name, brand_where)

    def brand_exist(self, db, brand):
        """Return the brand document matching *brand*, or False."""
        return self._find_or_false(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert a brand document and return it as read back."""
        return self._insert_and_fetch(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the sub-brand document matching *brand_item*, or False."""
        return self._find_or_false(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert a sub-brand document and return it as read back."""
        return self._insert_and_fetch(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching *car*, or False."""
        return self._find_or_false(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert a car document and return it as read back."""
        return self._insert_and_fetch(db, self.car_set_name, car)

pipeline

import copy
import os

import pymysql

from mininova.db import Bookdb
from mininova.mongo_car import MongoCar
from mininova.mongo_novel import MongoNovel
from mininova.settings import settings


class CarPipeline(object):
    """Item pipeline that stores every scraped car through ``MongoCar``."""

    def __init__(self):
        # Created on first use and then reused; the original built a new
        # MongoCar for every single item.
        self._mongo_car = None

    def _get_store(self):
        """Return the shared ``MongoCar``, creating it on first call."""
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        return self._mongo_car

    def process_item(self, item, spider):
        """Persist *item* and hand it on to the next pipeline stage.

        Returns:
            The same *item*. Scrapy passes the return value to later
            pipelines, so returning nothing would feed them ``None``.
        """
        self._get_store().insert(item)
        print(item['name'])
        print('item insert ok!')
        return item

setting

# MongoDB connection parameters. The host, user and password below are
# placeholders and must be replaced with real values for the deployment.
mongo_setting = {
    'mongo_host': 'xxx.xxx.xxx.xxx',
    'mongo_port': 27017,
    'mongo_user': 'username',
    'mongo_password': 'password',
}

转载于:https://juejin.im/post/5ce62de26fb9a07edb391f15

你可能感兴趣的文章
内存管理之1:x86段式内存管理与保护模式
查看>>
20180925上课截图
查看>>
IO输入/输出流的简单总结
查看>>
JavaScript之DOM-9 HTML DOM(HTML DOM概述、常用HTML DOM对象、HTML表单)
查看>>
技术成长之路(一)
查看>>
中国北方国际五金城硬件选型
查看>>
php.exe 启动时提示缺少 MSVCR110.dll(64 位 Windows 系统)的解决方法
查看>>
判断是否为数字方法
查看>>
[翻译] EF Core in Action 关于这本书
查看>>
js Uncaught TypeError: undefined is not a function
查看>>
数据库存储引擎
查看>>
[2019.2.13]BZOJ4318 OSU!
查看>>
版本号带两个小数点的,如何比较大小?( NSStringCompareOptions )
查看>>
QCustomplot使用分享(三) 图
查看>>
什么是java?
查看>>
WPF路径动画(动态逆向动画)
查看>>
Low Level Reader Protocol (LLRP) 简介
查看>>
[Micropython]TPYBoard v10x NRF24L01无线通讯模块使用教程
查看>>
mysql中show processlist过滤和杀死线程
查看>>
最新Sublime Text 2 激活 汉化
查看>>