scrape websites with infinite scrolling

add-semi-colons picture add-semi-colons · Sep 20, 2012 · Viewed 26.2k times · Source

I have written many scrapers but I am not really sure how to handle infinite scrollers. These days most website etc, Facebook, Pinterest has infinite scrollers.

Answer

Pawan Kumar picture Pawan Kumar · Nov 8, 2014

You can use selenium to scrap the infinite scrolling website like twitter or facebook.

Step 1 : Install Selenium using pip

pip install selenium 

Step 2 : use the code below to automate infinite scroll and extract the source code

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import sys

import unittest, time, re

class Sel(unittest.TestCase):
    def setUp(self):
        self.driver = webdriver.Firefox()
        self.driver.implicitly_wait(30)
        self.base_url = "https://twitter.com"
        self.verificationErrors = []
        self.accept_next_alert = True
    def test_sel(self):
        driver = self.driver
        delay = 3
        driver.get(self.base_url + "/search?q=stckoverflow&src=typd")
        driver.find_element_by_link_text("All").click()
        for i in range(1,100):
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(4)
        html_source = driver.page_source
        data = html_source.encode('utf-8')


if __name__ == "__main__":
    unittest.main()

Step 3 : Print the data if required.