#!/usr/bin/env python # -*- coding:utf-8 -*- import scrapy import hashlib from beauty.items import JieYiCaiItem from scrapy.http import Request from scrapy.selector import HtmlXPathSelector from scrapy.spiders import CrawlSpider, Rule from scrapy.linkextractors import LinkExtractor class JieYiCaiSpider(scrapy.spiders.Spider): count = 0 url_set = set() name = domain = allowed_domains = [] start_urls = [ , ] rules = [ ] def parse(self, response): md5_obj = hashlib.md5() md5_obj.update(response.url) md5_url = md5_obj.hexdigest() if md5_url in JieYiCaiSpider.url_set: pass else: JieYiCaiSpider.url_set.add(md5_url) hxs = HtmlXPathSelector(response) ): item = JieYiCaiItem() item[] = hxs.select().extract() item[] = hxs.select().re() item[] = hxs.select().extract() item[] = hxs.select().extract() item[] = hxs.select().extract() yield item current_page_urls = hxs.select().extract() for i in range(len(current_page_urls)): url = current_page_urls[i] ): url_ab = JieYiCaiSpider.domain + url yield Request(url_ab, callback=self.parse) spider
上述 spider 代码中:对 URL 计算 MD5 摘要(严格来说是哈希,而非加密)的目的是避免 URL 过长,同时也便于将其保存在缓存或数据库中。
此处代码的关键在于:在 parse 回调中通过 yield Request(url_ab, callback=self.parse) 继续抓取页面中的链接,并借助 HtmlXPathSelector 从响应中提取数据。
# # Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: import json from twisted.enterprise import adbapi import MySQLdb.cursors import re mobile_re = re.compile(r) phone_re = re.compile(r) class JsonPipeline(object): def __init__(self): self.file = open(, ) def process_item(self, item, spider): line = % (item[][0].encode(), item[][0].encode()) self.file.write(line) return item class DBPipeline(object): def __init__(self): self.db_pool = adbapi.ConnectionPool(, db=, user=, passwd=, cursorclass=MySQLdb.cursors.DictCursor, use_unicode=True) def process_item(self, item, spider): query = self.db_pool.runInteraction(self._conditional_insert, item) query.addErrback(self.handle_error) return item def _conditional_insert(self, tx, item): tx.execute(, (item[][0], )) result = tx.fetchone() if result: pass else: phone_obj = phone_re.search(item[][0].strip()) phone = phone_obj.group() mobile_obj = mobile_re.search(item[][1].strip()) mobile = mobile_obj.group() values = ( item[][0], item[][0], phone, mobile, item[][2].strip(), item[][0]) tx.execute(, values) def handle_error(self, e): ,e pipelines
上述 pipelines 代码中定义多个类的目的是:可以将数据同时保存到文件和数据库中;各个 pipeline 的执行优先级可以在配置文件 settings 的 ITEM_PIPELINES 中定义。