Scrapy with selenium
A Scrapy downloader middleware that uses Selenium. It opens multiple Chrome tabs to speed up crawling.
Installation
$ pip install scrapy-chrome
Configuration
-
Make sure the `chromedriver` executable is on your `PATH`.
-
In your spider class
import scrapy
from selenium import webdriver
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities


class YourSpider(scrapy.Spider):
    """Example spider showing the attributes used to configure Chrome."""

    use_chrome = True
    chrome_options = webdriver.ChromeOptions()

    # Maximum number of tabs opened by Chrome (must be set).
    chrome_tab_num = 10

    # Don't load images; comment out the following two lines to load images.
    prefs = {'profile.managed_default_content_settings.images': 2}
    chrome_options.add_experimental_option('prefs', prefs)

    # Use headless Chrome; comment out the following line to run Chrome
    # with a visible window.
    chrome_options.add_argument('headless')

    # Page load settings; comment out the following lines if you don't want
    # to set a maximum page loading time.
    # NOTE(review): this edits the dict returned by DesiredCapabilities().CHROME
    # in place; Selenium's docs suggest using .copy() to avoid touching the
    # shared dict -- confirm the middleware does not rely on the in-place edit.
    desired_capabilities = DesiredCapabilities().CHROME
    desired_capabilities['pageLoadStrategy'] = 'none'
    first_page_load_time = 7  # loading time when opening start_urls
    page_load_time = 3  # loading time when opening each page

    # Encoding used for the HtmlResponse (must be set).
    encoding = 'utf8'

    # Quits Chrome when the spider is closed (must be set).
    def close(self, spider):
        """Quit the Chrome driver if the spider was using Chrome."""
        if spider.use_chrome:
            spider.driver.quit()
-
Add the `ChromeMiddleware` to the downloader middlewares in your project settings:

    DOWNLOADER_MIDDLEWARES = {
        'scrapy_chrome.ChromeMiddleware': 1,
        # ...... (your other middlewares)
    }