Welcome to the Treehouse Community

Want to collaborate on code errors? Have bugs you need feedback on? Looking for an extra set of eyes on your latest project? Get support with fellow developers, designers, and programmers of all backgrounds and skill levels here with the Treehouse Community! While you're at it, check out some resources Treehouse students have shared here.

Looking to learn something new?

Treehouse offers a seven day free trial for new students. Get access to thousands of hours of content and join thousands of Treehouse students and alumni in the community today.

Start your free trial

PHP

PHP and CURL

Hello.

I was just wondering if we will have a php and curl course coming out.

I do not even know what CURL is but I am trying to do some web scraping with php and that word comes out every time.

Or maybe someone can point me in the right direction for a good cURL tutorial.

Thanks, much appreciated

Vittorio

2 Answers

cURL in PHP is the Client URL library. The manual can be found here.

I wrote this a few years ago to test on my own website and gain an understanding of how web crawlers work. It's not amazing, but you get the general idea...

You can see it in action here.

/**
 * Minimal web-crawler demo: fetches a page over cURL with a preset
 * user-agent, then offers helpers to list the page's anchors as an
 * HTML list or dump the page as plain text.
 */
class Core {

    // Declared explicitly — creating dynamic properties is deprecated in PHP 8.2+.
    public $html;   // raw response body from curl_exec()
    public $url;    // normalised URL that was actually fetched

    /**
     * Validate/normalise $url and fetch it with the user-agent registered
     * under $agent.
     *
     * @param string $url   Target URL; scheme is optional, http:// is assumed.
     * @param string $agent Key into the preset user-agent table below.
     */
    function __construct($url = "http://kryptonite-dove.com", $agent = "FireFox") {

        // Preset user agents, could be written into a module to more easily update.
        // Fix: keys must be quoted strings — bare words (FireFox => ...) are
        // undefined constants and a fatal error on PHP 8.
        $objUserAgents = new stdClass;
        $objUserAgents->Alias = (object) array(
            "FireFox" => "Mozilla/5.0 (Windows NT 5.1; rv:2.0b9pre) Gecko/20110105 Firefox/4.0b9pre",
            "IE" => "IE",
            "Chrome" => "Chrome",
            "Safari" => "Safari",
            "ByteMe" => "Robot Byte.Me.uk");

        // Parse & encode the url, then fetch it
        $url = $this->parseURL($url);
        $this->initcURL($url, $objUserAgents->Alias->$agent);
    }

    /**
     * Run the cURL request and store the response body in $this->html.
     * On failure, echoes the cURL error and exits (original behaviour kept).
     */
    private function initcURL($url, $useragent) {
        // Initiate curl session
        $ch = curl_init();

        // Set curl headers including user agent
        curl_setopt($ch, CURLOPT_USERAGENT, $useragent);
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
        curl_setopt($ch, CURLOPT_AUTOREFERER, true);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 10);

        // Execute
        $this->html = curl_exec($ch);

        // Check for failure.
        // Fix: with RETURNTRANSFER, curl_exec() returns false on error;
        // the old truthiness test (!$this->html) also rejected a valid
        // empty response body.
        if ($this->html === false) {
            echo "<br />Error number: " . curl_errno($ch);
            echo "\n<br />Error: " . curl_error($ch);
            curl_close($ch);   // fix: release the handle on the error path too
            exit;
        }

        // Else close curl session
        curl_close($ch);

        $this->url = $url;
    }

    /**
     * Normalise and validate a URL: strip any http(s):// prefix, lowercase,
     * re-add "http://", then match against a diegoperini-style URL pattern
     * (which requires a scheme, hence the re-add). Echoes an error and exits
     * when the URL is invalid (original behaviour kept).
     *
     * @param  string $urlToParse
     * @return string The normalised "http://..." URL.
     */
    private function parseURL($urlToParse) {

        // Remove http(s):// if entered
        $urlToParse = preg_replace('#^https?://#', "", strtolower($urlToParse));

        // Check if what's left is a valid URL (pattern rejects private/reserved
        // IPv4 ranges and validates hostnames/ports/paths)
        $regex = '_^(?:(?:https?|ftp)://)(?:\S+(?::\S*)?@)?(?:(?!10(?:\.\d{1,3}){3})(?!127(?:\.\d{1,3}){3})(?!169\.254(?:\.\d{1,3}){2})(?!192\.168(?:\.\d{1,3}){2})(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))|(?:(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)(?:\.(?:[a-z\x{00a1}-\x{ffff}0-9]+-?)*[a-z\x{00a1}-\x{ffff}0-9]+)*(?:\.(?:[a-z\x{00a1}-\x{ffff}]{2,})))(?::\d{2,5})?(?:/[^\s]*)?$_iuS';

        // Add http so the pattern's mandatory scheme part can match
        $urlToParse = "http://" . $urlToParse;

        // Check if URL is a valid match.
        // Fix: the original `if ($isURL = preg_match(...) == true)` relied on
        // confusing assignment-in-condition precedence; compare the match
        // count directly instead.
        if (preg_match($regex, $urlToParse) === 1) {
            return $urlToParse;
        }

        echo "Error: URL is not valid.";
        exit;
    }

    /**
     * Parse $html and echo every usable <a> link as an HTML <ul>, followed
     * by a count of the links found.
     *
     * @param string $url  Page URL shown in the heading.
     * @param string $html Raw HTML to scrape (untrusted remote content).
     */
    public function scrapeAnchors($url, $html) {

        // Parse the html into a DOMDocument.
        // @ suppresses libxml warnings on real-world malformed markup.
        $dom = new DOMDocument();
        @$dom->loadHTML($html);

        // Grab all the anchor tags on the page
        $xpath = new DOMXPath($dom);
        $tags = $xpath->evaluate("/html/body//a");
        $anchor_count = 0;

        echo "<h1>" . htmlspecialchars($url, ENT_QUOTES, 'UTF-8') . ".</h1>\n\r\n\r";
        echo "\n\r<ul>";

        for ($i = 0; $i < $tags->length; $i++) {
            $href = $tags->item($i);
            $url = $href->getAttribute('href');
            // Fix: $title was interpolated below but never assigned (the
            // original assignment was commented out), producing an
            // undefined-variable notice and always-empty output.
            $title = $href->getAttribute('title');

            // Skip dead/JS-only links
            if ($url != "#" && $url != "javascript:void(0)" && $url != "javascript:void(0);") {
                // Fix: escape scraped values before echoing — they come from
                // an untrusted remote page (reflected XSS otherwise).
                $safeUrl = htmlspecialchars($url, ENT_QUOTES, 'UTF-8');
                $safeTitle = htmlspecialchars($title, ENT_QUOTES, 'UTF-8');
                echo "\n\t<li><a href='//byte.me.uk/index.php?url={$safeUrl}'>{$safeUrl}</a> {$safeTitle}</li>";
                $anchor_count++;
            }
        }

        echo "\n\r</ul>";
        echo "\n\r\n\r<i>{$anchor_count} links found.</i>";
    }

    /**
     * Echo the page content with all HTML tags stripped.
     * Fix: the original passed DOMDocument::loadHTML()'s *boolean* return
     * value to strip_tags(), printing "1" instead of the page text.
     *
     * @param string $url  Unused; kept for interface compatibility.
     * @param string $html Raw HTML to display.
     */
    public function displayAsText($url, $html) {
        echo strip_tags($html);
    }

}

Nice.

Thanks a lot for the answer and sharing