汽车之家车型的简单爬取

spider
# -*- coding: utf-8 -*-
import string
import sys

import scrapy
from scrapy import Request

from mininova.items import carItem

# Python 2 only: make utf-8 the implicit str/unicode codec so that the byte
# string literals below can be mixed with the unicode text scraped from pages.
# ``reload`` and ``sys.setdefaultencoding`` do not exist on Python 3.
if sys.version_info[0] < 3:
    reload(sys)  # noqa: F821
    sys.setdefaultencoding('utf8')


class SplashSpider(scrapy.Spider):
    """Crawl the per-letter car index pages and build one ``carItem`` per car.

    One request is issued per capital letter A-Z; each response is parsed
    into brand / brand sub-category / car entries.
    """

    # Spider name
    name = 'car_home'
    allowed_domains = ['autohome.com.cn']
    # Left empty on purpose: the requests are generated in start_requests().
    start_urls = []

    # Per-spider settings: route items through CarPipeline only.
    custom_settings = {
        'ITEM_PIPELINES': {
            'mininova.pipelines.CarPipeline': 300,
        }
    }

    def start_requests(self):
        """Yield one request per initial letter, tagged with that letter.

        The letter travels in ``meta['word']`` so that ``parse`` can record
        it on every item. Each request is created together with its own
        letter; building all URLs first and yielding afterwards would tag
        every request with the last letter.
        """
        for word in string.ascii_uppercase:
            url = 'https://www.autohome.com.cn/grade/carhtml/' + word + '.html'
            yield Request(url, callback=self.parse, meta={'word': word})

    def parse(self, response):
        """Extract every car listed on one letter page.

        Args:
            response: page response carrying the initial letter in
                ``response.meta['word']``.

        Returns:
            list of ``carItem``: one entry per car found on the page.
        """
        print('url')
        print(response.url)
        word = response.meta['word']
        total_cars = []

        # Each <dl> is treated as one brand. Sub-queries are relative to the
        # node, which avoids positional predicates such as //dl[3] (those
        # count per parent element, not per document).
        for brand_node in response.xpath('//dl'):
            # Brand name
            brand = brand_node.xpath('./dt/div[1]/a/text()').extract_first()
            if brand is None:
                # Not a brand section; nothing useful to extract.
                continue
            print('brand:' + brand)
            # Brand logo
            brand_logo_url = brand_node.xpath(
                './dt//img[1]/@src').extract_first(default='')
            # Brand sub-category title links and the car lists that follow them.
            # NOTE(review): assumes the n-th title belongs to the n-th list,
            # as the original positional lookup did - confirm against the page.
            title_links = brand_node.xpath('./dd//div[@class="h3-tit"]/a')
            car_lists = brand_node.xpath('./dd//ul[@class="rank-list-ul"]')

            for index, link in enumerate(title_links):
                # Sub-category name and the URL of its page
                brand_item = link.xpath('./text()').extract_first(default='')
                brand_item_url = link.xpath('./@href').extract_first(default='')
                print('brand_item:' + brand_item)
                print('brand_item_url:' + brand_item_url)

                # All cars in this sub-category
                if index < len(car_lists):
                    cars = car_lists[index].xpath('./li[@id]')
                else:
                    cars = []
                print('cars_count:' + str(len(cars)))

                for car in cars:
                    # Car name
                    name = car.xpath('./h4/a/text()').extract_first()
                    if name is None:
                        # An entry without a name cannot be stored meaningfully.
                        continue
                    # Car page URL
                    url = car.xpath('./h4/a/@href').extract_first(default='')
                    # Quoted price range ("lowest-highest"); may be absent.
                    price = car.xpath(
                        './div[1]/a/text()').extract_first(default='')
                    price_base = '万'
                    min_price, max_price = self._split_price(price, price_base)
                    print('car:' + name + ' max_price:' + str(max_price)
                          + ' min_price:' + str(min_price)
                          + ' price_base:' + price_base)

                    car_item = carItem()
                    car_item['name'] = name
                    car_item['url'] = url
                    car_item['brand_item'] = brand_item
                    car_item['first_word'] = word
                    car_item['brand'] = brand
                    car_item['brand_logo_url'] = brand_logo_url
                    car_item['max_price'] = max_price
                    car_item['min_price'] = min_price
                    total_cars.append(car_item)

        return total_cars

    @staticmethod
    def _split_price(price, price_base):
        """Split a "low-high<unit>" price text into (min_price, max_price).

        Anything that is not exactly two parts separated by '-' yields the
        placeholder '暂无' for both values. The unit suffix is removed from
        the upper bound only, matching the text format "low-high<unit>".
        """
        prices = price.split('-')
        if len(prices) != 2:
            return '暂无', '暂无'
        return str(prices[0]), str(prices[1].replace(price_base, ''))
item
# -*- coding: utf-8 -*-
import scrapy


class carItem(scrapy.Item):
    """Item describing one car together with its brand information."""

    # Car name
    name = scrapy.Field()
    # URL of the car's introduction page
    url = scrapy.Field()
    # Highest quoted price, unit: 万 (ten thousand)
    max_price = scrapy.Field()
    # Lowest quoted price, unit: 万 (ten thousand)
    min_price = scrapy.Field()
    # Brand name
    brand = scrapy.Field()
    # Brand logo URL
    brand_logo_url = scrapy.Field()
    # Brand sub-category name
    brand_item = scrapy.Field()
    # Initial letter of the brand
    first_word = scrapy.Field()
mongo_car
from mininova.mongodb import Mongo
from mininova.settings import mongo_setting


class MongoCar(object):
    """Store scraped car items as brand, brand-item and car documents.

    NOTE(review): ``db.set(db_name, set_name)`` is presumed to return a
    pymongo-style collection (``find_one`` / ``insert_one`` / ``update_one``);
    confirm against ``mininova.mongodb.Mongo``.
    """

    db_name = 'car'
    brand_set_name = 'brand'
    brand_item_set_name = 'brand_item'
    car_set_name = 'car'

    def __init__(self):
        """Open the connection described by ``mongo_setting``."""
        self.db = Mongo(
            mongo_setting['mongo_host'],
            mongo_setting['mongo_port'],
            mongo_setting['mongo_user'],
            mongo_setting['mongo_password'],
        )

    def insert(self, item):
        """Upsert the brand, then insert the brand item and car if missing.

        Args:
            item: mapping with the keys ``brand``, ``first_word``,
                ``brand_logo_url``, ``brand_item``, ``name``, ``url``,
                ``max_price`` and ``min_price``.

        Returns:
            bool: True once the car is stored or was already present.
        """
        brand_where = {'name': item['brand']}
        # The same fields are written on insert and on update, so a brand
        # gets its logo URL the first time it is seen.
        brand_doc = {
            'name': item['brand'],
            'first_word': item['first_word'],
            'logo_url': item['brand_logo_url'],
        }
        if self.brand_exist(self.db, brand_where) is False:
            brand = self.insert_brand(self.db, brand_doc)
            print('brand insert ok!')
        else:
            brand = self.update_brand(self.db, brand_where, brand_doc)
            print('brand_exist!')

        brand_item_where = {'name': item['brand_item']}
        brand_item = self.brand_item_exist(self.db, brand_item_where)
        if brand_item is False:
            brand_item = self.insert_brand_item(self.db, {
                'name': item['brand_item'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
            })
            print('brand_item insert ok!')
        else:
            print('brand_item_exist!')

        # A car is identified by its name within its brand item. Repeating
        # the 'name' key in one dict literal would silently drop the brand
        # item part of the filter, so the stored foreign key is used instead.
        car_where = {
            'name': item['name'],
            'brand_item_id': brand_item['_id'],
        }
        car = self.car_exist(self.db, car_where)
        if car is False:
            car = self.insert_car(self.db, {
                'name': item['name'],
                'url': item['url'],
                'max_price': item['max_price'],
                'min_price': item['min_price'],
                'first_word': item['first_word'],
                'brand_id': brand['_id'],
                'brand_item_id': brand_item['_id'],
            })
            print('car insert ok!')
        else:
            print('car_exist!')

        return car is not False

    def update_brand(self, db, brand_where, brand):
        """Apply ``brand`` via ``$set`` to the matching brand; return it, or False."""
        my_set = db.set(self.db_name, self.brand_set_name)
        my_set.update_one(brand_where, {'$set': brand})
        exist = my_set.find_one(brand_where)
        return False if exist is None else exist

    def brand_exist(self, db, brand):
        """Return the brand document matching ``brand``, or False."""
        return self._find(db, self.brand_set_name, brand)

    def insert_brand(self, db, brand):
        """Insert ``brand`` and return the document read back from the set."""
        return self._insert(db, self.brand_set_name, brand)

    def brand_item_exist(self, db, brand_item):
        """Return the brand-item document matching ``brand_item``, or False."""
        return self._find(db, self.brand_item_set_name, brand_item)

    def insert_brand_item(self, db, brand_item):
        """Insert ``brand_item`` and return the document read back from the set."""
        return self._insert(db, self.brand_item_set_name, brand_item)

    def car_exist(self, db, car):
        """Return the car document matching ``car``, or False."""
        return self._find(db, self.car_set_name, car)

    def insert_car(self, db, car):
        """Insert ``car`` and return the document read back from the set."""
        return self._insert(db, self.car_set_name, car)

    def _find(self, db, set_name, where):
        """Look up one document in ``set_name``; False (not None) when absent."""
        exist = db.set(self.db_name, set_name).find_one(where)
        return False if exist is None else exist

    def _insert(self, db, set_name, doc):
        """Insert ``doc`` into ``set_name`` and return the result of re-reading it."""
        my_set = db.set(self.db_name, set_name)
        my_set.insert_one(doc)
        return my_set.find_one(doc)
pipeline
import copy
import os

import pymysql

from mininova.db import Bookdb
from mininova.mongo_car import MongoCar
from mininova.mongo_novel import MongoNovel
from mininova.settings import settings


class CarPipeline(object):
    """Item pipeline that stores every car item through ``MongoCar``."""

    # Created on first use and then reused, so that a new MongoCar (and the
    # connection its constructor opens) is not built for every single item.
    _mongo_car = None

    def process_item(self, item, spider):
        """Store ``item`` and pass it on.

        Args:
            item: the scraped car item.
            spider: the spider that produced the item (unused).

        Returns:
            The same ``item``, so that any later pipeline still receives it.
        """
        if self._mongo_car is None:
            self._mongo_car = MongoCar()
        self._mongo_car.insert(item)
        print(item['name'])
        print('item insert ok!')
        return item
setting
# MongoDB connection settings, read by key: host, port, user and password.
# The host, user and password below are placeholders to be replaced with
# real values for the target deployment.
mongo_setting = {
    'mongo_host': 'xxx.xxx.xxx.xxx',
    'mongo_port': 27017,
    'mongo_user': 'username',
    'mongo_password': 'password',
}