Welcome to the Treehouse Community
Want to collaborate on code errors? Have bugs you need feedback on? Looking for an extra set of eyes on your latest project? Get support with fellow developers, designers, and programmers of all backgrounds and skill levels here with the Treehouse Community! While you're at it, check out some resources Treehouse students have shared here.
Looking to learn something new?
Treehouse offers a seven day free trial for new students. Get access to thousands of hours of content and join thousands of Treehouse students and alumni in the community today.
Start your free trial
Vittorio Somaschini
33,371 Points
PHP and CURL
Hello.
I was just wondering if we will have a php and curl course coming out.
I do not even know what CURL is, but I am trying to do some web scraping with PHP and that word comes up every time.
Or maybe someone can point me in the right direction for a good CURL tutorial.
Thanks, much appreciated
Vittorio
2 Answers
Richard Duncan
5,568 Points
cURL in PHP is the Client URL library. The manual can be found here.
I wrote this a few years ago to test on my own website and to gain an understanding of how web crawlers work. It's not amazing, but you get the general idea...
You can see it in action here.
/**
 * Minimal page fetcher / link scraper built on the cURL and DOM extensions.
 *
 * Constructing an instance validates the URL, downloads the page with cURL and
 * stores the result in $html / $url. The two public helpers then render either
 * the anchors found in a piece of HTML or its tag-stripped text.
 *
 * Error handling is deliberately unchanged from the original design: a failed
 * validation or transfer prints a message and terminates the script.
 */
class Core {
    /** @var string|null Raw response body of the page fetched by the constructor. */
    public $html;

    /** @var string|null Normalised URL that was fetched. */
    public $url;

    /**
     * Validate the URL and fetch it immediately.
     *
     * @param string $url   Address to fetch; a leading http:// or https:// is optional.
     * @param string $agent One of the preset aliases (FireFox, IE, Chrome, Safari,
     *                      ByteMe). Any other value is sent verbatim as the
     *                      User-Agent header.
     */
    public function __construct($url = "http://kryptonite-dove.com", $agent = "FireFox") {
        // Preset user agents, could be written into a module to more easily update.
        // Keys are quoted: bare words here are undefined constants and fatal on PHP 8.
        $userAgents = array(
            'FireFox' => "Mozilla/5.0 (Windows NT 5.1; rv:2.0b9pre) Gecko/20110105 Firefox/4.0b9pre",
            'IE'      => "IE",
            'Chrome'  => "Chrome",
            'Safari'  => "Safari",
            'ByteMe'  => "Robot Byte.Me.uk",
        );

        // Parse & validate the url
        $url = $this->parseURL($url);

        // Unknown aliases fall back to the literal string instead of a null user agent.
        $useragent = (is_string($agent) && isset($userAgents[$agent])) ? $userAgents[$agent] : $agent;

        $this->initcURL($url, $useragent);
    }

    /**
     * Download $url with cURL and store the body in $this->html.
     *
     * Prints the cURL error number/message and exits if the transfer fails.
     *
     * @param string $url       Already validated URL.
     * @param string $useragent Value for the User-Agent header.
     */
    private function initcURL($url, $useragent) {
        // Initiate curl session
        $ch = curl_init();

        // Set curl headers including user agent
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);

        // Execute
        $response = curl_exec($ch);

        // Only a strict false signals failure; an empty body is a valid response.
        if ($response === false) {
            // Read the error details before releasing the handle.
            $errorNumber = curl_errno($ch);
            $errorMessage = curl_error($ch);
            curl_close($ch);

            echo "<br />Error number: " . $errorNumber;
            echo "\n<br />Error: " . $errorMessage;
            exit;
        }

        curl_close($ch);

        $this->html = $response;
        $this->url = $url;
    }

    /**
     * Normalise a user supplied address to an http:// URL and validate it.
     *
     * Prints an error and exits when the result is not a valid public URL.
     *
     * @param string $urlToParse Address with or without an http(s):// prefix.
     * @return string The normalised URL, always starting with "http://".
     */
    private function parseURL($urlToParse) {
        // Remove http(s):// if entered (case-insensitively)
        $withoutScheme = preg_replace('#^https?://#i', "", (string) $urlToParse);

        // Lower-case only the authority part: host names are case-insensitive,
        // but paths and query strings are not and must be left untouched.
        $authorityLength = strcspn($withoutScheme, '/?#');
        $withoutScheme = strtolower(substr($withoutScheme, 0, $authorityLength))
            . substr($withoutScheme, $authorityLength);

        // Pattern for a valid URL on a public (non-private-range) host
        $regex = '_^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s]*)?$_iuS';

        // Add http
        $candidate = "http://" . $withoutScheme;

        // preg_match returns 1 on a match, 0 on no match and false on error.
        if (preg_match($regex, $candidate) === 1) {
            return $candidate;
        }

        echo "Error: URL is not valid.";
        exit;
    }

    /**
     * Echo an HTML list of every anchor found in $html.
     *
     * Placeholder links ("#", "javascript:void(0)") are skipped. Every value
     * taken from the scraped page is escaped before being written out, because
     * the page content is untrusted.
     *
     * @param string $url  Address the HTML was fetched from (used as heading).
     * @param string $html Markup to scan for anchors.
     */
    public function scrapeAnchors($url, $html) {
        // parse the html into a DOMDocument
        $dom = new DOMDocument();

        // loadHTML() rejects an empty string, so only parse when there is markup.
        if (is_string($html) && $html !== '') {
            // Collect parser warnings internally rather than silencing with "@",
            // then restore whatever error mode the caller had.
            $previousMode = libxml_use_internal_errors(true);
            $dom->loadHTML($html);
            libxml_clear_errors();
            libxml_use_internal_errors($previousMode);
        }

        // grab all the anchors on the page
        $xpath = new DOMXPath($dom);
        $tags = $xpath->evaluate("/html/body//a");

        $placeholders = array("#", "javascript:void(0)", "javascript:void(0);");
        $anchor_count = 0;

        $heading = htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8');
        echo "<h1>{$heading}.</h1>\n\r\n\r";
        echo "\n\r<ul>";

        foreach ($tags as $anchor) {
            $href = $anchor->getAttribute('href');

            if (in_array($href, $placeholders, true)) {
                continue;
            }

            // URL-encode for the query string, HTML-escape for the markup.
            $queryValue = htmlspecialchars(urlencode($href), ENT_QUOTES, 'UTF-8');
            $label = htmlspecialchars($href, ENT_QUOTES, 'UTF-8');

            echo "\n\t<li><a href='//byte.me.uk/index.php?url={$queryValue}'>{$label}</a> </li>";
            $anchor_count++;
        }

        echo "\n\r</ul>";
        echo "\n\r\n\r<i>{$anchor_count} links found.</i>";
    }

    /**
     * Echo $html with all tags removed.
     *
     * @param string $url  Unused; kept so the signature stays compatible.
     * @param string $html Markup to strip.
     */
    public function displayAsText($url, $html) {
        // Strip the markup itself; DOMDocument::loadHTML() only returns a boolean.
        echo strip_tags((string) $html);
    }
}
Vittorio Somaschini
33,371 Points
Nice.
Thanks a lot for the answer and sharing