When I am about to implement something, it usually turns out to be a good idea to first start with some prototypes & benchmarks. This post summarizes the outcome of such an experiment in reading data from compressed tar archives efficiently. I did some similar benchmarks about 30 months ago, so I repeated the measurements to see if there have been improvements, and added results for my new Go reader.
DISCLAIMER: I am not doing this to pick on Node.js. I repeated the measurements in the hope that the performance issues had been fixed by now.
Results are the average time measurements (usr + sys) from reading a 277 kB file 1000 times.
Language | Time (lower is better) |
---|---|
Go: | 4.808 ms |
Python: | 54.272 ms |
Node.js: | 324.432 ms |
The performance of the Python implementation is already impressive. But the result from implementing this in Go is amazing — roughly a 10x advantage over Python!
Source code of the Node.js implementation:
const fs = require('fs');
const tar = require('tar');
const zlib = require('zlib');
const Stream = require('stream');

// Stream the archive through a gunzip and a tar parser, counting the
// bytes of the one entry we care about without extracting anything to disk.
fs.createReadStream('supercars-logs-13060317.tgz')
  .on('error', (err) => { console.error(err); process.exitCode = 1; })
  // zlib.createGunzip() is the documented factory; the original called the
  // deprecated Unzip constructor without `new`.
  .pipe(zlib.createGunzip())
  .pipe(tar.Parse())
  .on('entry', (entry) => {
    if (entry.path === 'responsetimes.log.13060317') {
      let bytes = 0;
      entry.on('data', (chunk) => {
        bytes += chunk.length;
      });
      entry.on('end', () => {
        console.log('bytes: ' + bytes);
      });
    }
  });
Source code of the Python implementation:
#!/usr/bin/env python
import sys
import tarfile
import timeit


def count(archive='supercars-logs-13060317.tgz',
          member='responsetimes.log.13060317'):
    """Count the bytes of ``member`` inside the gzipped tar ``archive``.

    Prints the byte count and also returns it (the original returned
    nothing, so the return value is a backward-compatible addition).
    """
    total = 0
    # 'r:gz' forces gzip decompression instead of transparent detection.
    tar = tarfile.open(archive, 'r:gz')
    try:
        f = tar.extractfile(member)
        for line in f:
            total += len(line)
    finally:
        # The original leaked the open archive handle.
        tar.close()
    # Parenthesized form is valid on both Python 2 and 3.
    print("bytes: %d" % total)
    return total


if __name__ == '__main__':
    count()
Source code of the Go implementation:
package main
import (
"archive/tar"
"compress/gzip"
"fmt"
"os"
"bufio"
)
// main prints the byte count of the benchmark log file found inside
// the benchmark archive.
func main() {
	total := countBytes("supercars-logs-13060317.tgz", "responsetimes.log.13060317")
	fmt.Printf("bytes: %d\n", total)
}
func countBytes(archive string, filename string) int {
f, err := os.Open(archive)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
defer f.Close()
gzf, err := gzip.NewReader(f)
if err != nil {
fmt.Println(err)
os.Exit(1)
}
tarReader := tar.NewReader(gzf)
// search filename in archive
for {
header, err := tarReader.Next()
if err != nil {
fmt.Println(err)
os.Exit(1)
}
if header.Typeflag == tar.TypeReg && header.Name == filename {
count := 0
scanner := bufio.NewScanner(tarReader)
scanner.Split(bufio.ScanBytes)
for scanner.Scan() { count++ }
return count
}
}
}
Best, Mark
Resources
- http://stackoverflow.com/questions/17152307/is-something-wrong-with-how-i-read-tgz-files-in-node-js-benchmark-says-it-is-sl