Scrapy extension to store spider statistics to Postgesql DB

As am working on a Scrapy project, I wanted to store all spider statistics to Database so as I can access it later, So I wrote the following extension.

1. Create a new table in your database with folowing SQL structure:

CREATE TABLE spider_stats
(
  id serial NOT NULL,
  name character varying(56),
  start_time timestamp without time zone,
  finish_time timestamp without time zone,
  finish_reason character varying(128),
  item_scraped_count bigint,
  images_count bigint,
  images_uptodate bigint,
  images_downloaded bigint,
  request_count bigint,
  response_count bigint,
  response_status_count_200 bigint,
  response_status_count_301 bigint,
  response_status_count_302 bigint,
  created_at timestamp without time zone NOT NULL,
  updated_at timestamp without time zone NOT NULL,
  response_status_count_500 bigint,
  spider_id integer,
  CONSTRAINT spider_stats_pkey PRIMARY KEY (id )
)

2. Create a new variables array with your DB settings and add the extensions to extensions list in scrapy settings file (settings.py). So add the following lines to your scrapy settings file.

SPIDER_STATS_DB = {
  'host' : 'localhost',
  'user': 'user',
  'password' : 'password',
  'dbname' : 'database'
}

EXTENSIONS = {
    'scrapy.contrib.corestats.CoreStats': 500,
    'scrapy.contrib.logstats.LogStats': 500,
    'PROJECT_NAME.statstodb.StatsToDb': 1000, # Replace PROJECT_NAME with your project name
}

3. Create a new files called (statstodb.py), add save it to your scrapy project root folder, and add the following code :

from scrapy.xlib.pydispatch import dispatcher
from scrapy import signals
from scrapy import log
from scrapy.stats import stats
import psycopg2
from PROJECT_NAME.settings import SPIDER_STATS # Replace PROJECT_NAME with your project name
import datetime

class StatsToDb(object):

    def __init__(self):
        dispatcher.connect(self.stats_spider_closed, signal=signals.stats_spider_closed)

        try:
            self.conn = psycopg2.connect("host='%s' dbname='%s' user='%s' password='%s'" % (SPIDER_STATS['host'], SPIDER_STATS['dbname'], SPIDER_STATS['user'], SPIDER_STATS['password']))
            self.cursor = self.conn.cursor()
        except:
            print "Unable to connect to the database."

    def stats_spider_closed(self, spider, spider_stats):

        try:
            self.cursor.execute("INSERT INTO spider_stats (name, created_at, updated_at, start_time, finish_time, finish_reason, item_scraped_count, images_count, images_uptodate, images_downloaded, request_count, response_count, response_status_count_200, response_status_count_301, response_status_count_302, response_status_count_500) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)",
            # name, start_time, finish_time, finish_reason, item_scraped_count,
            # images_count, images_uptodate, images_downloaded, request_count,
            # response_count, response_status_count_200, response_status_count_301,
            # response_status_count_302, response_status_count_500
                           (spider.name,
                            unicode(datetime.datetime.now().replace(microsecond=0)),
                            unicode(datetime.datetime.now().replace(microsecond=0)),
                            unicode(spider_stats['start_time'].replace(microsecond=0)),
                            unicode(spider_stats['finish_time'].replace(microsecond=0)),
                            spider_stats['finish_reason'].encode('utf-8'),
                            spider_stats['item_scraped_count'] if 'item_scraped_count' in spider_stats  else 0,
                            spider_stats['images_count'] if 'images_count' in spider_stats  else 0 ,
                            spider_stats['images_uptodate'] if 'images_uptodate' in spider_stats  else 0,
                            spider_stats['images_downloaded'] if 'images_downloaded' in spider_stats  else 0,
                            spider_stats['downloader/request_count'] if 'request_count' in spider_stats  else 0,
                            spider_stats['downloader/response_count'] if 'downloader/response_count' in spider_stats  else 0,
                            spider_stats['downloader/response_status_count/200'] if 'downloader/response_status_count/200' in spider_stats else 0,
                            spider_stats['downloader/response_status_count/301'] if 'downloader/response_status_count/301' in spider_stats else 0,
                            spider_stats['downloader/response_status_count/302'] if 'downloader/response_status_count/302' in spider_stats else 0,
                            spider_stats['downloader/response_status_count/500'] if 'downloader/response_status_count/500' in spider_stats else 0,
            ))

            self.conn.commit()

        except psycopg2.DatabaseError, e:
            print 'Error %s' % e
            exit(1)

Leave a comment

 

WP-SpamFree by Pole Position Marketing