Welcome to the Treehouse Community

Want to collaborate on code errors? Have bugs you need feedback on? Looking for an extra set of eyes on your latest project? Get support with fellow developers, designers, and programmers of all backgrounds and skill levels here with the Treehouse Community! While you're at it, check out some resources Treehouse students have shared here.

Looking to learn something new?

Treehouse offers a seven day free trial for new students. Get access to thousands of hours of content and join thousands of Treehouse students and alumni in the community today.

Start your free trial

JavaScript

Radha Bhambwani
Radha Bhambwani
10,182 Points

Web Scraping with Node.js

New to Node.js and I'm trying to scrape some data by looping through an array of 3 urls. The scraped data will be used to store in a mongodb collection.

Right now, I am looping through the array of urls and using node's request module inside the for loop for each url and storing data dynamically in an array called products.

My issue is that when i try to print products.length on to the console outside of the request method, the value is 0, indicating an empty array. Here's my code:

var mongoose = require('mongoose');
var request = require("request");
var cheerio = require("cheerio");
var http = require('http');

//these arrays will store the scraped information from webpage
var prodList = [];
var priceList = [];

//this is the array that will be used to organize and display the scraped info
var products = [];

//store scraped data as an object 
function Prod(prodName, price) {
    this.prodName = prodName;
    this.price = price;
};



var populateArray = function() {

    //urls to scrape
    var nyxLinks = [
        "http://www.nyxcosmetics.ca/en_CA/face?sz=999&viewall=1",
        "http://www.nyxcosmetics.ca/en_CA/lips?sz=999&viewall=1",
        "http://www.nyxcosmetics.ca/en_CA/eyes?sz=999&viewall=1"
    ];

    //empty all arrays
    prodList  = [];
    priceList = [];
    products = [];

    for(var i = 0; i < nyxLinks.length; i++) {

        //define url to download 
        var url = nyxLinks[i];
        console.log(url);


        request(url, function(error, response, body) {
            if(!error) {

                //load page into cheerio
                var $ = cheerio.load(body);

                //for each product on the page store in respective arrays
                $(".product_tile_wrapper").each(function(i, elem) {
                    prodList.push($(this).find($(".product_name")).attr("title"));
                    priceList.push($(this).find($(".product_price")).attr("data-pricevalue"));
                });

                for(var i = 0; i < prodList.length; i++) {
                    //store product info as an object

                    products.push(new Prod(prodList[i], priceList[i]));
                }
            } else {
                console.log("We've encountered an error!")
            }
        }).on("end", function(err, data) {          
            if(!err) {
                console.log("products length " + products.length);
            } else {
                console.log(err);
            }

        });
    }
    console.log("products length " + products.length);

}



mongoose.connect('mongodb://127.0.0.1:27017/makeupdb');

var db = mongoose.connection;
db.on('error', console.error.bind(console, 'Connection Error:'));
db.once('open', function() {
    // we're connected

    populateArray();
    console.log("number of products in products array " + products.length);

    //clear the current collection - db.remove({})

    //insert data in mongodb - db.insert(products)

});

var hostname = '127.0.0.01';
var port = process.env.PORT || 1337;

http.createServer(function(request, response) {
    response.writeHead(200, {'Content-Type': 'text/plain'});
    response.end("Hello World");
}).listen(port, hostname, function() {
    console.log("Server running at http://" + hostname + ":" + port + "/");
});

The console output from this code is:

Server running at http://127.0.0.01:1337/
http://www.nyxcosmetics.ca/en_CA/face?sz=999&viewall=1
http://www.nyxcosmetics.ca/en_CA/lips?sz=999&viewall=1
http://www.nyxcosmetics.ca/en_CA/eyes?sz=999&viewall=1
products length 0
number of products in products array 0
products length 0
products length 31
products length 119

I believe I need to use a callback to be able to access the products array but I am not sure where I would need to use this call back. Any help will be much appreciated.

Thanks,

Radha