asynchronous - Node.js Async mapLimit and memory


Solved, see the answer below.

I have a list of URLs that I fetch using request, and for some reason I am unable to save more than 1720 records to the database when I try to fetch 2000 or more URLs at a time.

If I try 1000 to 2000 and then 2000 to 3000 separately, I get 3000 results in total. But when I try 1000 to 3000 or 4000 to 6000 in one go, the script stops after fetching the 1720th result.

What is the reason for that?

I use mapLimit in order to limit the number of concurrent connections.

app.get('/asynctest', function(req, res) {
  var people = [];
  for (var a = 1000; a < 3000; a++) {
    people.push("http://www.example.com/" + a + "/person.html");
  }

  async.mapLimit(people, 20, function(url, callback) {
    // iterator function
    var options2 = {
      url: url,
      headers: {
        'user-agent': req.headers['user-agent'],
        'content-type': 'application/json; charset=utf-8'
      }
    };

    request(options2, function(error, response, body) {
      if (!error && response.statusCode == 200) {
        async.series([
          // add the person to the database
          function(callback) {
            var $ = cheerio.load(body);
            var name = entities.decodeHTML($('span[itemprop="name"]').text());
            new Person({
              name: name,
              url: url
            }).save();
            callback();
          },

          function(callback) {
            async.waterfall([

              function(callback) {
                var $ = cheerio.load(body);
                var jobs = $('span[itemprop="jobTitle"]').length;
                if (jobs == 0) {
                  console.log("no job");
                  var jobsArr = 0;
                } else {
                  var jobsArr = [];
                  for (var aa = 0; aa < jobs; aa++) {
                    jobsArr.push(entities.decodeHTML($('span[itemprop="jobTitle"]').eq(aa).text()));
                  }
                }

                callback(null, jobsArr);
              },

              function(jobsArr, callback) {
                if (jobsArr == 0) {
                  console.log("this person has no jobs");
                } else {

                  async.map(jobsArr, function(jobs, callback) {
                    PersonRole.where('job_name', jobs).fetch({
                      require: true
                    }).then(function(data1) {
                      data1 = data1.toJSON();
                      Person.where('url', url).fetch().then(function(data2) {
                        data2 = data2.toJSON();
                        new PersonPersonRole({
                          person_id: data2.id,
                          personrole_id: data1.id
                        }).save();
                      });
                    }).catch(function(err) {
                      new PersonRole({
                        job_name: jobs
                      }).save().then(function(data3) {
                        data3 = data3.toJSON();
                        Person.where('url', url).fetch().then(function(data4) {
                          data4 = data4.toJSON();
                          new PersonPersonRole({
                            person_id: data4.id,
                            personrole_id: data3.id
                          }).save();
                        });
                      });
                    });
                  });
                }
                callback(null, "yes");
              }
            ], function(err, result) {
              if (err) {
                console.log(err);
              }
            });
            callback();
          }
        ], function(err, result) {
          if (err) {
            console.log("err3");
          }
        });
      } else {
        console.log("err4");
      }
    });
    callback();
  });
});
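
For orientation, here is a stripped-down sketch of the same mapLimit pattern (the URL list below is only a placeholder). The relevant detail of async.mapLimit is that at most 20 requests run at a time and a slot is freed only when the iterator invokes its callback, so the callback is normally tied to the completion of the request:

var async = require('async');
var request = require('request');

// placeholder URLs, only for illustration
var urls = [
  'http://www.example.com/1000/person.html',
  'http://www.example.com/1001/person.html'
];

async.mapLimit(urls, 20, function (url, callback) {
  request({ url: url }, function (error, response, body) {
    if (error || response.statusCode !== 200) {
      // report null instead of an error so one bad URL does not abort the whole map
      return callback(null, null);
    }
    // parsing and saving would happen here
    callback(null, url);
  });
}, function (err, results) {
  // called once every URL has been processed
  console.log('finished, got %d results', results.length);
});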

Edit #2

The following code is problematic: it adds 1747 records and then stops. If I stop the node app and start it again, it stops at 1747 again.

var Person = require('./models').Person;

app.get('/asynctest', function(req, res) {
  var people = [];

  for (var a = 18000; a < 20000; a++) {
    people.push("random url");
  }

  async.mapLimit(people, 20, function(url, callback) {
    new Person({
      name: "yes",
      url: url
    }).save();
    callback();
  });
});
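
As a side note, Bookshelf's save() returns a promise, and the route above calls callback() before that promise settles, so the mapLimit concurrency of 20 does not actually bound the number of pending inserts. A minimal sketch (assuming the same Person model) that only signals completion once the save has finished:

var async = require('async');
var Person = require('./models').Person;

function savePeople(people, done) {
  async.mapLimit(people, 20, function (url, callback) {
    new Person({ name: "yes", url: url })
      .save()                                    // returns a promise
      .then(function () { callback(); })         // free the slot after the insert completed
      .catch(function (err) { callback(err); });
  }, done);
}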

db.js

var knex = require('knex')({
  client: 'mysql',
  connection: {
    host: '127.0.0.1',
    port: 8889,
    user: 'root',
    password: 'root',
    database: 'mydatabase',
    charset: 'utf8'
  },
  pool: {
    min: 0,
    max: 100
  }
});

var db = require('bookshelf')(knex);
module.exports = db;

models.js

var db = require('./db');

var Person = db.Model.extend({
  tableName: 'people'
});

module.exports = {
  Person: Person
};

Edit #3

Okay, I think I've found the solution.

18k-18.5k - no problem

18k-19k - no problem

18k-19.7k - no problem

18k-20k - RangeError: Maximum call stack size exceeded at new Object (native)

I wrapped the callbacks in a wrapper, as shown below.

async.setImmediate(function () {
  callback();
});

app.get('/async22', function(req, res) {
  var people = [];

  for (var a = 18000; a < 20000; a++) {
    people.push("yes");
  }

  async.mapLimit(people, 20, function(url, callback) {
    new Person({
      name: "yes",
      url: url
    }).save();

    async.setImmediate(function () {
      callback();
    });
  });
});
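
For reference, the same wrapper pattern in isolation (the work items below are placeholders): when the iterator calls its callback synchronously, mapLimit starts the next task on the same call stack, and with enough items that stack eventually overflows; deferring the callback with async.setImmediate lets the stack unwind first.

var async = require('async');

var items = new Array(20000).fill('yes');   // placeholder work items

async.mapLimit(items, 20, function (item, callback) {
  // ...do the (synchronous) work for this item here...

  // Defer the callback so mapLimit does not recurse on the same call
  // stack when the iterator itself never yields to the event loop.
  async.setImmediate(function () {
    callback(null, item);
  });
}, function (err, results) {
  console.log('processed %d items', results.length);
});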

This is still not an answer, more of a big comment.

I suggest you reduce the code to a minimal example and check whether that works (example below, it works for me).

Second thing - add a monitoring route (see /apptest below), so you can check whether the app is still alive and how far the processing has progressed.

If the minimal sample works, start gradually adding more of your code and logic and check whether it still works.

The code, server.js:

var util = require('util');
var express = require('express');
var async = require('async');
var request = require('request');
var cheerio = require('cheerio');

var app = express.createServer();
app.successCount = 0;
app.errorCount = 0;

app.get('/apptest', function(req, res) {
  res.send(
    util.format(
      'I am OK, successCount: %s, errorCount: %s',
      app.successCount, app.errorCount
    ), 200
  );
});

app.get('/asynctest', function(req, res) {
  var people = [];
  for (var a = 1000; a < 3000; a++) {
    people.push("http://www.example.com/" + a + "/person.html");
  }

  async.mapLimit(people, 20, function(url, callback) {
    // iterator function
    var options2 = {
      url: url,
      headers: {
        'user-agent': req.headers['user-agent'],
        'content-type': 'application/json; charset=utf-8'
      }
    };

    request(options2, function(error, response, body) {
      if (!error) {
        console.log('success requesting: ' + options2.url);
        var $ = cheerio.load(body);
        app.successCount += 1;
      } else {
        console.log(
          'error requesting: %s, error: %s, status: %s',
          options2.url, error, response.statusCode
        );
        app.errorCount += 1;
      }
      callback();
    });
  });
});

app.listen(3000, function() {
  console.log(
    "express server listening on port %d in %s mode",
    app.address().port, app.settings.env
  );
});

Dependencies, package.json:

{   "name": "application-name",   "version": "0.0.1",   "private": true,   "dependencies": {     "async": "^1.5.2",     "cheerio": "^0.19.0",     "express": "2.5.8",     "request": "^2.67.0"   },   "devdependencies": {} } 

Run the example with node server.js and open http://localhost:3000/asynctest in a browser; you should see success requesting: xxxx messages in the console. While it is running (or when it stops running), open http://localhost:3000/apptest to check whether the app is still OK and how many URLs have been processed.

