如題,我希望在爬蟲一開始先讀取postgresql中的資料當作關鍵字,然後再存入mongoDB
如果需要實現這樣的狀況,請問該從何下手呢?
目前是使用postgresql讀取和存入。
spider.py
class SearchSpider(scrapy.Spider):
    # Fragment: intends to read keyword rows from PostgreSQL before issuing requests.
    def start_requests(self):
        # NOTE(review): instantiating the pipeline directly bypasses from_crawler(),
        # so `sessionmaker` only exists here because it is stored as a *class*
        # attribute and Scrapy has (presumably) already built the pipeline —
        # confirm initialization order before relying on this.
        session = pipelines.NewsSaveToPostgresPipeline().sessionmaker()
        # NOTE(review): text() is called with no SQL string — this raises a
        # TypeError as written; the SELECT for the keyword table belongs here.
        # The method also yields no requests yet (snippet is truncated).
        total_sql = session.execute(text())
pipelines.py
class NewsSaveToPostgresPipeline(object):
    """Scrapy item pipeline that persists scraped items to PostgreSQL via SQLAlchemy.

    The engine and session factory are deliberately stored as *class*
    attributes in from_crawler so other components (e.g. a spider's
    start_requests) can reuse the same factory.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the shared engine/session factory from project settings.

        Called once by Scrapy; returns the pipeline instance.
        """
        cls.DB_CON_STR = crawler.settings.get('DB_CON_STR')
        # NullPool avoids keeping idle connections open between uses.
        cls.engine = create_engine(cls.DB_CON_STR, echo=True, poolclass=NullPool)  # connect to the database
        cls.sessionmaker = sessionmaker(bind=cls.engine)
        # Create all mapped tables if they do not exist yet.
        Base.metadata.create_all(cls.engine)
        return cls()

    def open_spider(self, spider):
        # One session per spider run.
        self.session = self.sessionmaker()

    def close_spider(self, spider):
        self.session.close()

    def process_item(self, item, spider):
        """Insert the item into the table matching the spider, then pass it on."""
        # Route proxy items and news items to their respective ORM models.
        if spider.name == 'proxy_example':
            self.session.add(Proxy(**item))
        else:
            self.session.add(WebData(**item))
        try:
            self.session.commit()
        except Exception:
            # Roll back so the session stays usable for subsequent items,
            # then let Scrapy log the failure.
            self.session.rollback()
            raise
        # Scrapy pipelines must return the item so later pipelines receive it.
        return item