We are having an issue with our node environment running under high load that we have not been able to find the source of.
A little background: we are running a clustered Node application using Express as the HTTP framework. Currently there are 3 boxes with 8 CPU cores each, and each box runs a cluster of 6 Node workers. The setup seems to work great, and I've researched all the suggested methodologies, so I believe the setup is solid. We're running node.js 0.8.1 with Express 2.5.11 and XMLHttpRequest 1.4.2.
Here's the issue: We're doing a "dark launch" test of this product (i.e. the browser client code makes JavaScript ajax calls to our APIs in the background, but the results are not used on the page or shown to the user). After a few minutes of running successfully, the system throws:
[RangeError: Maximum call stack size exceeded]
We're catching the error with the 'uncaughtException' event in the cluster controller (which starts each worker); however, there is no stack trace available at that level. I've done extensive research on this issue and can't seem to find anyone with a similar error. After combing through EVERY line of code in the system, here's what I know:
The most telling symptom, I believe, is that the error always happens AFTER a request has been fully served. That is, the server takes a request, finds the proper Express route, calls res.send, and is finished. This really feels like garbage collection to me! I've read that V8 has a very good garbage collector, but I'm wondering how much our heavy load is impacting things.
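To test the GC/memory-pressure theory, one thing I plan to try is logging process.memoryUsage() from each worker on an interval (a rough sketch; the 30-second interval is arbitrary):

// Periodically log heap numbers so we can see whether heap usage climbs under load.
setInterval(function() {
  var mem = process.memoryUsage();
  console.log("[" + process.pid + "] rss=" + mem.rss +
    " heapTotal=" + mem.heapTotal + " heapUsed=" + mem.heapUsed);
}, 30000);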
As I said, the code throws the error even with a basic design. Having taken out most of our custom code, this is the basic setup. Sorry for cutting here; not all variable declarations etc. are included, but the code does work and all of that is in the real code:
Cluster controller. This is a cleaned-up version of what is started on the command line.
cluster = require('cluster');
path = require('path');
fs = require('fs');
app = require('./nodeApi');
_ = require('underscore');
nodeUtil = require('./nodeUtil');
process.on('uncaughtException', function(err) {
var stamp;
stamp = new Date();
console.log("***************************** Exception Caught, " + stamp);
return console.log("Exception is:", err);
});
if (cluster.isMaster) {
if ((nodeUtil.isLiveServer() || nodeUtil.isCluster()) && process.env.IS_CLUSTER !== '0') {
numCPUs = require("os").cpus().length - 2;
if (numCPUs <= 0) {
numCPUs = 1;
}
} else {
numCPUs = 1;
}
console.log("Forking " + numCPUs + " workers...");
for (i = 1; i <= numCPUs; i++) {
worker = cluster.fork();
}
} else {
app.start();
}
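Two controller pieces worth mentioning: the handler for the worker's 'isStarted' IPC message (cut from the code above; the worker sends it below), and an 'exit' listener I'm considering adding so a dead worker gets re-forked. A rough sketch of both (not the exact code):

// Inside the isMaster branch, attached to each forked worker:
worker.on('message', function(msg) {
  if (msg && msg.cmd === 'isStarted') {
    console.log("Worker " + worker.process.pid + " reports started");
  }
});

// Re-fork a worker that dies so the cluster keeps its full size:
cluster.on('exit', function(worker, code, signal) {
  console.log("Worker " + worker.process.pid + " exited (" + code + "), forking a replacement");
  cluster.fork();
});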
nodeWorker code. This uses Express and a simple route to serve a request. The request is wrapped in a callback if JSONP is used (needed for our testing with ajax); an example of the wrapped response is shown after the code.
(function() {
var crypto, express, fs, modroot, path, staticroot, _;
express = require('express');
_ = require('underscore');
fs = require('fs');
path = require('path');
module.exports.start = function() {
logFile = fs.createWriteStream("" + logpath + "/access.log", {
flags: 'a'
});
app = express.createServer();
app.configure(function() {
app.use(express.logger({
stream: logFile,
format: ':remote-addr - [:date] - ":method :url HTTP/:http-version" :status :res[content-length] ":referrer" :response-time ms'
}));
app.use(express.errorHandler({
dumpExceptions: true,
showStack: true
}));
app.use(express.cookieParser());
app.use(express.bodyParser());
app.use(express.session({
secret: "ourMemStoreSecret",
cookie: {
domain: ".ourdomain.com",
maxAge: 7200000 // 2 hours, in ms
},
// The store WAS a redis store. I took it out to eliminate redis as the issue. We don't use sessions anyway.
store: new express.session.MemoryStore({
reapInterval: 60000 * 15
})
}));
app.use(express["static"](staticroot));
app.set('view engine', 'underscore'); // For our template rendering. Not used in this test.
app.set('views', __dirname + '/views/src');
app.set('view options', {
layout: false
});
app.use(app.router);
});
ignore = function(req, res, next) {
if (req.params.api === 'favicon.ico') {
return next('route');
}
return next();
};
wrapCallback = function(req, res, next) {
var callbackName;
if (callbackName = req.query.callback) {
req.wrapCallback = true;
res._send = res.send;
res.send = function(data, status) {
var dataString;
if (_.isObject(data)) {
dataString = encodeURI(JSON.stringify(data));
res.setHeader('Content-Type', 'application/javascript');
return res._send("" + callbackName + "(\"" + dataString + "\")", status);
} else {
data = encodeURI(data);
return res._send("" + callbackName + "(\"" + data + "\")", status);
}
};
}
return next();
};
app.error(function(err, req, res, next) {
console.log("[" + process.pid + "] Error Handler. Ok.", err);
return res.send({
error: err.msg
}, err.statusCode);
});
// Does anyone know how to hard-code a path AND put it into a variable at the same time?
// Kind of like: "/:api=MyTestAPI" ?? That's why this route is here.
setAPIName = function(req, res, next) {
req.params.api = 'MyTestAPI';
return next();
};
app.get("/MyTestAPI", setAPIName, wrapCallback, function(req, res) {
res.send({
hello: 'world'
}, 200);
return console.log("[" + process.pid + "] res.send (no cacher) is done");
});
process.setMaxListeners(0);
process.send({
// For IPC - the controller has a handler for this message
cmd: 'isStarted'
});
return app.listen(process.env.APP_PORT);
};
}).call(this);
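For reference, here's a quick smoke test I can run against a worker, showing roughly what the JSONP-wrapped response looks like (the port is a placeholder for APP_PORT, and the exact percent-encoding comes from encodeURI):

var http = require('http');
// Hits the test route with a callback parameter; the body should come back as
// something like: cb("%7B%22hello%22:%22world%22%7D")
http.get('http://localhost:3000/MyTestAPI?callback=cb', function(res) {
  var body = '';
  res.on('data', function(chunk) { body += chunk; });
  res.on('end', function() {
    console.log(res.statusCode, body);
  });
});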
What the error looks like. Basically, I never see it happen in the middle of a request. There isn't a call stack on the error either; it's only the stack overflow message. Here you can see two worker processes each serving a response, then the error on one of them.
[660] res.send (no cacher) is done
[654] res.send (no cacher) is done
***************************** Exception Caught, Fri Nov 02 2012 10:23:48 GMT-0400 (EDT)
I would really appreciate some feedback on this. The system runs beautifully and is capable of handling our huge traffic with 3 boxes. The load on the boxes is about 40% and humming. I'd love to find the source of this issue so others can be as proud of this system as I am, and show the node.js non-believers that this is a great product!
I have faced the same problem in one of my production environments. During my analysis I found the following; I may be wrong, but I hope this helps you.
This problem is basically associated with sockets. There are options for how many open socket connections the server should accept, and whether a connection can be kept half open.
Normally this kind of exception happens only because of how frequently you hit the server in a particular time period.
Let me explain more clearly.
Assume there are only two socket slots and you have four requests, each needing 5 seconds of processing time.
In general Node.js can serve them perfectly if you send 2 requests at second 0 and the remaining 2 at second 6.
But if you send all 4 requests at second 0, Node.js is ready to serve only 2 of them; it simply closes the sockets for the remaining two requests. Note: if you send the same requests again later, Node.js will accept them and respond.
For more information, please go through the socket.io.js implementation.
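As an illustration of the knobs I mean (just a sketch, and the numbers are made up), a plain Node socket server exposes allowHalfOpen and maxConnections:

// Sketch: cap concurrent connections and disallow half-open sockets.
var net = require('net');
var server = net.createServer({ allowHalfOpen: false }, function(socket) {
  socket.end('handled\n'); // handle the connection
});
server.maxConnections = 1000; // connections beyond this count are rejected
server.listen(8080);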
And my solution is,
Or, if you find any other easy way to solve this, please update this post...
I am waiting to learn of a great solution to this problem.
Thanks