asynchronous - Node.js Async mapLimit and memory -
Solved — please see the answer below.
I have a list of URLs to fetch using request, and for some reason I am unable to save more than 1720 records to the database when I try to fetch 2000 or more URLs at a time.
If I try 1000 to 2000 and then 2000 to 3000, I get 3000 results in total. But when I try 1000 to 3000 or 4000 to 6000, the script stops after fetching the 1720th result.
What could be the reason for that?
I use mapLimit in order to limit the number of concurrent connections.
// Question's original route: builds 2000 person-page URLs, fetches each one
// (at most 20 iterator calls in flight via async.mapLimit), scrapes the
// person's name and job titles with cheerio, and stores them through the
// `person`, `personrole` and `personpersonrole` models.
//
// The control flow is reproduced as the asker wrote it. The NOTE(review)
// comments mark where a callback fires before the asynchronous work it is
// supposed to follow has finished.
app.get('/asynctest', function(req, res) {
    var people = [];
    for (var a = 1000; a < 3000; a++) {
        people.push("http://www.example.com/" + a + "/person.html");
    }

    async.mapLimit(people, 20, function(url, callback) {
        // iterator function
        var options2 = {
            url: url,
            headers: {
                'user-agent': req.headers['user-agent'],
                'content-type': 'application/json; charset=utf-8'
            }
        };

        request(options2, function(error, response, body) {
            if (!error && response.statusCode == 200) {
                async.series([
                    // add person database
                    function(callback) {
                        var $ = cheerio.load(body);
                        var name = entities.decodeHTML($('span[itemprop="name"]').text());
                        // NOTE(review): save() is not waited for; the step
                        // completes immediately.
                        new person({ name: name, url: url }).save();
                        callback();
                    },
                    function(callback) {
                        async.waterfall([
                            // Collect the decoded job titles (or 0 when none).
                            function(callback) {
                                var $ = cheerio.load(body);
                                var jobs = $('span[itemprop="jobtitle"]').length;
                                if (jobs == 0) {
                                    console.log("no job");
                                    var jobsarr = 0;
                                } else {
                                    var jobsarr = [];
                                    for (var aa = 0; aa < jobs; aa++) {
                                        jobsarr.push(entities.decodeHTML($('span[itemprop="jobtitle"]').eq(aa).text()));
                                    }
                                }
                                callback(null, jobsarr);
                            },
                            // Link the person to each role, creating the role
                            // when the lookup fails.
                            function(jobsarr, callback) {
                                if (jobsarr == 0) {
                                    console.log("this person has no jobs");
                                } else {
                                    // NOTE(review): the inner `callback` of
                                    // async.map is never invoked and no final
                                    // callback is passed.
                                    async.map(jobsarr, function(jobs, callback) {
                                        personrole.where('job_name', jobs).fetch({ require: true }).then(function(data1) {
                                            data1 = data1.toJSON();
                                            person.where('url', url).fetch().then(function(data2) {
                                                data2 = data2.toJSON();
                                                new personpersonrole({ person_id: data2.id, personrole_id: data1.id }).save();
                                            });
                                        }).catch(function(err) {
                                            // Role lookup failed: create the
                                            // role, then link it.
                                            new personrole({ job_name: jobs }).save().then(function(data3) {
                                                data3 = data3.toJSON();
                                                person.where('url', url).fetch().then(function(data4) {
                                                    data4 = data4.toJSON();
                                                    new personpersonrole({ person_id: data4.id, personrole_id: data3.id }).save();
                                                });
                                            });
                                        });
                                    });
                                }
                                callback(null, "yes");
                            }
                        ], function(err, result) {
                            if (err) {
                                console.log(err);
                            }
                        });
                        callback();
                    }
                ], function(err, result) {
                    if (err) {
                        console.log("err3");
                    }
                });
            } else {
                console.log("err4");
            }
        });

        // NOTE(review): this runs synchronously, right after request() is
        // started and before any response arrives, so mapLimit considers the
        // item finished immediately and the limit of 20 does not bound the
        // number of in-flight requests.
        callback();
    });
});
edit #2
The following code is also problematic: it adds 1747 records and stops after that. If I stop the Node app and start it again, it again stops at 1747.
var person = require('./models').person;

// Reduced reproduction: no HTTP requests, just 2000 inserts driven by
// async.mapLimit with a limit of 20.
app.get('/asynctest', function(req, res) {
    var people = [];
    for (var a = 18000; a < 20000; a++) {
        people.push("random url");
    }

    async.mapLimit(people, 20, function(url, callback) {
        new person({ name: "yes", url: url }).save();
        // NOTE(review): callback is invoked synchronously, without waiting
        // for save() to finish.
        callback();
    });
});
db.js
// db.js: creates the knex MySQL connection (pool of 0 to 100 connections)
// and exports a Bookshelf instance bound to it.
var knexConfig = {
    client: 'mysql',
    connection: {
        host: '127.0.0.1',
        port: 8889,
        user: 'root',
        password: 'root',
        database: 'mydatabase',
        charset: 'utf8'
    },
    pool: {
        min: 0,
        max: 100
    }
};

var knex = require('knex')(knexConfig);
var db = require('bookshelf')(knex);

module.exports = db;
models.js
var person = db.model.extend({ tablename: 'people' }); module.exports = { person : person };
edit #3
Okay, I think I've found the solution.
18k-18.5k - no problem
18k-19k - no problem
18k-19.7k - no problem
18k-20k - RangeError: Maximum call stack size exceeded at new Object (native)
So I wrapped the callbacks in a wrapper, as shown below.
// The wrapper on its own. This is a fragment: `callback` is the iterator
// callback that async.mapLimit passes in, as in the route that follows.
async.setImmediate(function () {
    callback();
});

// Full route using the wrapper: the iterator callback is deferred with
// async.setImmediate instead of being invoked synchronously.
app.get('/async22', function(req, res) {
    var people = [];
    for (var a = 18000; a < 20000; a++) {
        people.push("yes");
    }

    async.mapLimit(people, 20, function(url, callback) {
        new person({ name: "yes", url: url }).save();
        async.setImmediate(function () {
            callback();
        });
    });
});
This is still not an answer, but it is too big for a comment.
I suggest reducing your code to a minimal example and checking whether it works (the example below works for me).
The second thing: add a monitoring route (see /apptest
below), so that you can check whether the app still works and how far the processing has progressed.
If the minimal sample works, gradually add more of your code logic and check whether it still works.
The code, server.js:
// server.js: minimal example with a crawl route (/asynctest) and a
// monitoring route (/apptest) that reports progress counters.
var util = require('util');
var express = require('express');
var async = require('async');
var request = require('request');
var cheerio = require('cheerio');

var app = express.createServer();

// Progress counters, read by /apptest and updated by /asynctest.
app.successcount = 0;
app.errorcount = 0;

// Monitoring route: reports how many URLs succeeded and failed so far.
app.get('/apptest', function(req, res) {
    res.send(
        util.format(
            'i ok, successcount: %s, errorcount: %s',
            app.successcount,
            app.errorcount
        ),
        200
    );
});

// Crawl route: requests 2000 URLs, at most 20 at a time.
app.get('/asynctest', function(req, res) {
    var people = [];
    for (var a = 1000; a < 3000; a++) {
        people.push("http://www.example.com/" + a + "/person.html");
    }

    async.mapLimit(people, 20, function(url, callback) {
        // iterator function
        var options2 = {
            url: url,
            headers: {
                'user-agent': req.headers['user-agent'],
                'content-type': 'application/json; charset=utf-8'
            }
        };

        request(options2, function(error, response, body) {
            if (!error) {
                console.log('success requesting: ' + options2.url);
                var $ = cheerio.load(body);
                app.successcount += 1;
            } else {
                // `response` may be undefined when the request itself failed,
                // so guard before reading the status code.
                console.log(
                    'error requesting: %s, error: %s, status: %s',
                    options2.url,
                    error,
                    response ? response.statusCode : undefined
                );
                app.errorcount += 1;
            }
            // Signal completion only after the request has finished, so the
            // limit of 20 actually bounds the in-flight requests.
            callback();
        });
    }, function() {
        // Answer the HTTP request once every URL has been processed;
        // otherwise the browser request would hang indefinitely.
        res.send(
            util.format(
                'done, successcount: %s, errorcount: %s',
                app.successcount,
                app.errorcount
            ),
            200
        );
    });
});

app.listen(3000, function() {
    console.log(
        "express server listening on port %d in %s mode",
        app.address().port,
        app.settings.env
    );
});
dependencies, package.json:
{ "name": "application-name", "version": "0.0.1", "private": true, "dependencies": { "async": "^1.5.2", "cheerio": "^0.19.0", "express": "2.5.8", "request": "^2.67.0" }, "devDependencies": {} }
Run the example with node server.js
and open http://localhost:3000/asynctest in your browser; you should see success requesting: xxxx
in the console. While it is running (or when it stops running), open http://localhost:3000/apptest to check whether the app is OK and how many URLs have been processed.
Comments
Post a Comment