Node.js: Read a very large file (~10 GB), process it line by line, then write to another file

HVT7 picture HVT7 · Jul 17, 2015 · Viewed 9.3k times · Source

I have a 10 GB log file in a particular format. I want to process this file line by line and then write the output to another file after applying some transformations. I am using Node for this operation.

This method works, but it takes far too long. I was able to do this within 30-45 minutes in Java, but in Node it takes more than 160 minutes to do the same job. Following is the code:

Following is the initiation code which reads each line from the input.

// Location of the large input log; fileopsmain() reads it as a stream.
var path = '../10GB_input_file.txt';
// Destination for the transformed lines; deleted at the start of every run
// and then appended to by write_line_ops().
var output_file = '../output.txt';

/**
 * Entry point: removes any output file left over from a previous run, then
 * streams the input file and hands every line, split on ';', to
 * perform_line_ops().
 *
 * @throws {Error} If the stale output file exists but cannot be deleted.
 */
function fileopsmain(){

    // Delete synchronously so the removal is guaranteed to finish before the
    // first line is appended. An asynchronous unlink can complete after
    // writing has started and silently discard output already written.
    try {
        fs.unlinkSync(output_file);
        console.log('successfully deleted ' + output_file);
    } catch (err) {
        // A missing file simply means there is nothing to clean up.
        if (err.code !== 'ENOENT') throw err;
    }

    // `highWaterMark` is the option that controls the read chunk size;
    // `bufferSize` is not an option fs.createReadStream() recognises.
    new lazy(fs.createReadStream(path, {highWaterMark: 128 * 4096}))
        .lines
        .forEach(function(line){
            var line_arr = line.toString().split(';');
            perform_line_ops(line_arr, line_arr[6], line_arr[7], line_arr[10]);
        });

}

This is the method that performs some operations on that line and passes the result to the write method, which writes it into the output file.

/**
 * Builds the output text for one parsed input line and passes it to
 * write_line_ops().
 *
 * @param {string[]} line_arr - Fields of the input line, split on ';' by the caller.
 * @param {string} range_start - Passed by fileopsmain() as line_arr[6].
 * @param {string} range_end - Passed by fileopsmain() as line_arr[7].
 * @param {string} daynums - Passed by fileopsmain() as line_arr[10].
 */
function perform_line_ops(line_arr, range_start, range_end, daynums){

    // Accumulates the text that will be written for this input line.
    var _new_lines = '';
    // NOTE(review): `days` is not declared anywhere in the code shown, and the
    // `daynums` parameter is never used. Presumably the loop bound is meant to
    // be derived from `daynums`. Confirm against the full source.
    for(var i=0; i<days; i++){
        //perform some operation to modify line pass it to print
    }

    // With the loop body elided as shown, `_new_lines` is still '' at this
    // point, and write_line_ops() skips empty strings.
    write_line_ops(_new_lines);
}

Following method is used to write data into a new file.

/**
 * Appends one piece of already-formatted text to the output file.
 *
 * Nothing is written when the argument is null, undefined, or loosely equal
 * to the empty string.
 *
 * @param {string} line - Text to append to `output_file`.
 */
function write_line_ops(line) {
    var nothing_to_write = line == null || line == '';
    if (nothing_to_write) {
        return;
    }
    fs.appendFileSync(output_file, line);
}

I want to bring this time down to 15-20 minutes. Is it possible to do so?

Also, for the record, I'm trying this on an Intel i7 processor with 8 GB of RAM.

Answer

mscdex picture mscdex · Jul 17, 2015

You can do this easily without a module. For example:

var fs = require('fs');
var inspect = require('util').inspect;

// Holds the trailing partial line of the previous chunk until the rest of it
// arrives with the next chunk.
var buffer = '';
// Decode as UTF-8 inside the stream. Without an encoding each chunk is a raw
// Buffer, and converting chunks to strings one at a time corrupts any
// multi-byte character that happens to straddle a chunk boundary.
var rs = fs.createReadStream('foo.log', { encoding: 'utf8' });
rs.on('data', function(chunk) {
  var lines = (buffer + chunk).split(/\r?\n/);
  // The last element is either '' (the chunk ended on a newline) or an
  // incomplete line; keep it for the next 'data' event.
  buffer = lines.pop();
  for (var i = 0; i < lines.length; ++i) {
    // do something with `lines[i]`
    console.log('found line: ' + inspect(lines[i]));
  }
});
rs.on('error', function(err) {
  // Report read failures (e.g. a missing file) instead of crashing on an
  // unhandled 'error' event.
  console.error('failed to read input: ' + err.message);
  process.exitCode = 1;
});
rs.on('end', function() {
  // optionally process `buffer` here if you want to treat leftover data without
  // a newline as a "line"
  if (buffer.length > 0) {
    console.log('ended on non-empty buffer: ' + inspect(buffer));
  }
});