I'm having a problem with PHP's cURL returning an empty string for some URLs. I'm trying to parse the Open Graph (OG) metadata of different webpages, and it works with every website I've tried except NYTimes. Here is my code so far.
// Example usage: dump the Open Graph properties found on a page.
echo print_r(get_og_metadata('http://somewebsite.com'), true);
/**
 * Fetch the body of a URL with cURL.
 *
 * Redirects are followed, cookies received along the redirect chain are kept
 * in memory for the duration of the request, and a browser-like User-Agent is
 * sent. Some sites answer a bare cURL request with a cookie-dependent redirect
 * or an empty body, which otherwise surfaces here as an empty string.
 *
 * @param string $url The URL to fetch.
 * @return string|false The response body, or false if the transfer failed.
 */
public function get_data($url)
{
    $connect_timeout = 5;   // seconds allowed for establishing the connection
    $total_timeout   = 30;  // seconds allowed for the whole transfer

    $ch = curl_init();
    if ($ch === false) {
        return false;
    }

    // the url to fetch
    curl_setopt($ch, CURLOPT_URL, $url);
    // return result as a string rather than direct output
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // CURLOPT_CONNECTTIMEOUT only limits the connect phase; CURLOPT_TIMEOUT
    // is what actually bounds the total execution time of the request.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, $connect_timeout);
    curl_setopt($ch, CURLOPT_TIMEOUT, $total_timeout);
    // follow "Location:" redirects, but never loop indefinitely
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
    curl_setopt($ch, CURLOPT_AUTOREFERER, true);
    // an empty file name enables the in-memory cookie engine without loading
    // any cookies, so cookies set by one redirect are sent on the next hop
    curl_setopt($ch, CURLOPT_COOKIEFILE, '');
    // identify as a regular browser; some servers reject the default agent
    curl_setopt($ch, CURLOPT_USERAGENT, 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17');

    $data = curl_exec($ch);
    curl_close($ch);

    return $data;
}
/**
 * Extract the Open Graph metadata of a web page.
 *
 * Downloads the page, parses it as HTML and collects every <meta> element
 * whose "property" attribute starts with "og:".
 *
 * @param string $url The page to inspect.
 * @return array Map of property name (e.g. "og:title") to the value of its
 *               "content" attribute. Empty when the page could not be
 *               fetched or parsed, or carries no OG tags. When a property
 *               occurs more than once, the last occurrence wins.
 */
public function get_og_metadata($url)
{
    $result = array();

    $data = $this->get_data($url);
    // A failed transfer yields false and a blocked request yields ''.
    // DOMDocument::loadHTML() rejects an empty source, so stop here.
    if (!is_string($data) || $data === '') {
        return $result;
    }

    // Real-world HTML is rarely valid: buffer libxml's complaints instead of
    // emitting warnings, then discard them and restore the caller's setting
    // so the buffer does not grow and global state is left untouched.
    $previous = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadHTML($data);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    if ($loaded === false) {
        return $result;
    }

    $xpath = new DOMXPath($doc);
    $query = '//*/meta[starts-with(@property, \'og:\')]';
    $metadatas = $xpath->query($query);
    if ($metadatas === false) {
        return $result;
    }

    foreach ($metadatas as $metadata) {
        $property = $metadata->getAttribute('property');
        $content = $metadata->getAttribute('content');
        $result[$property] = $content;
    }

    return $result;
}
// Browser-like transfer options applied to the existing cURL handle $ch.

// Hand the response body back from curl_exec() instead of printing it.
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);

// Follow "Location:" redirects and fill in the Referer header on each hop.
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_AUTOREFERER, true);

// Send a desktop Chrome User-Agent string with the request.
curl_setopt(
    $ch,
    CURLOPT_USERAGENT,
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.52 Safari/537.17'
);

// Emit verbose transfer information, which helps when debugging an empty response.
curl_setopt($ch, CURLOPT_VERBOSE, true);