I'm trying to scrape Instagram (built with React) with Node.js / Cheerio. Inspecting the loaded document in the debugger shows that an object is returned, but it doesn't look like a typical response.
I'm guessing this has to do with React. Is there a way to get around this, and pull the rendered DOM to parse with Cheerio? Or am I missing something entirely?
In the general case -- if the website is SEO friendly, you can get the rendered markup by spoofing the user agent string of a web crawler. The site then returns a rendered DOM that can be parsed by Cheerio.
In the specific case -- Instagram returns a rendered DOM on its mobile website. Spoof the user agent string of a mobile phone and you can parse the data that is returned.
// Request options for fetching the user's Instagram page.
// The User-Agent header is set to a mobile Safari (iPhone) string so that the
// request is made as a mobile browser rather than with the client's default
// user agent.
var options = {
  url: user.instagram_url,
  headers: {
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 8_0 like Mac OS X) AppleWebKit/600.1.3 (KHTML, like Gecko) Version/8.0 Mobile/12A4345d Safari/600.1.4'
  }
};

// Fetch the page and hand the returned HTML to Cheerio.
// Callback arguments: `error` (set when the request itself failed),
// `response` (the HTTP response object) and `html` (the response body).
request(options, function (error, response, html) {
  // Report transport-level failures instead of silently ignoring them.
  if (error) {
    console.error('Instagram request failed:', error);
    return;
  }

  // Do not parse error pages (404, rate limiting, ...) as if they were a
  // user page.
  if (!response || response.statusCode !== 200) {
    console.error('Unexpected response from Instagram. HTTP status:', response && response.statusCode);
    return;
  }

  console.log('Scraper running on Instagram user page.');

  // Use Cheerio to load the page.
  var $ = cheerio.load(html);

  // Code to parse the DOM here
});