Login

I want to download a zip file from the internet and unzip it in memory without saving to a temporary file. How can I do this?

Here is what I tried:

// Attempt from the question: download the zip over HTTP and try to inflate
// the response body in memory with zlib.
var url = 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip';

var request = require('request'), fs = require('fs'), zlib = require('zlib');

// NOTE(review): no `encoding: null` option is passed, so `file` is presumably
// a decoded string rather than a raw Buffer — confirm against the `request` docs.
request.get(url, function(err, res, file) {
if(err) throw err;
// NOTE(review): zlib.unzip targets gzip/deflate streams; a .zip archive is a
// container format, which is likely why no file text comes out — verify.
zlib.unzip(file, function(err, txt) {
if(err) throw err;
console.log(txt.toString()); //outputs nothing
});
});

[EDIT]
As suggested, I tried using the adm-zip library and I still cannot make this work:

// Second attempt from the question: hand the downloaded body straight to
// adm-zip's ZipEntry and read the decompressed data back.
var ZipEntry = require('adm-zip/zipEntry');
request.get(url, function(err, res, zipFile) {
if(err) throw err;
var zip = new ZipEntry();
// NOTE(review): the body is round-tripped through a UTF-8 string before being
// wrapped in a Buffer; that presumably corrupts the binary zip bytes — verify.
zip.setCompressedData(new Buffer(zipFile.toString('utf-8')));
var text = zip.getData();
console.log(text.toString()); // fails
});

Two days ago the module `node-zip` has been released, which is a wrapper for the JavaScript only version of Zip: [JSZip](

[To see links please register here]

).

// Load the node-zip wrapper.
var NodeZip = require('node-zip');
// The archive is handed over as a base64 string, hence the `base64: true` flag.
var zip = new NodeZip(zipBuffer.toString("base64"), { base64: true });
// Entries are looked up by their file name inside the archive.
var unzipped = zip.files["your-text-file.txt"].data;

If you're under MacOS or Linux, you can use the `unzip` command to unzip from `stdin`.

In this example I'm reading the zip file from the filesystem into a `Buffer` object but it works
with a downloaded file as well:

// Get a Buffer with the zip content
var fs = require("fs")
, zip = fs.readFileSync(__dirname + "/test.zip");

// Now the actual unzipping:
var spawn = require('child_process').spawn
, fileToExtract = "test.js"
// -p tells unzip to extract to stdout
, unzip = spawn("unzip", ["-p", "/dev/stdin", fileToExtract ])
// 'data' can fire many times, so chunks are collected and joined at the end
, stdoutChunks = []
, stderrChunks = []
;

// A missing or unstartable `unzip` binary is reported through 'error';
// without a listener it would be thrown as an uncaught exception.
unzip.on('error', function (err) {
  console.log("There has been an error: ", err.message);
});

// The pipe can break if unzip exits before it has consumed all the input.
unzip.stdin.on('error', function (err) {
  console.log("There has been an error: ", err.message);
});

// Collect error output
unzip.stderr.on('data', function (data) {
  stderrChunks.push(data);
});

// Collect the unzipped stdout
unzip.stdout.on('data', function (data) {
  stdoutChunks.push(data);
});

// 'close' fires once the process has ended and its stdio streams are closed,
// so every chunk has been received. Decoding the joined Buffer avoids
// splitting multi-byte UTF-8 characters across chunk boundaries.
unzip.on('close', function () {
  if (stderrChunks.length > 0) {
    console.log("There has been an error: ", Buffer.concat(stderrChunks).toString("utf-8"));
  }
  if (stdoutChunks.length > 0) {
    console.log("Unzipped file: ", Buffer.concat(stdoutChunks).toString("utf-8"));
  }
});

// Write the Buffer to stdin and signal end of input
unzip.stdin.end(zip);

Which is actually just the node version of:

cat test.zip | unzip -p /dev/stdin test.js

**EDIT**: It's worth noting that this will not work if the input zip is too big to be read in one chunk from stdin. If you need to read bigger files, and your zip file contains only one file, you can use [funzip](

[To see links please register here]

) instead of `unzip`:

// No file arguments: per the note above, funzip is fed the archive on stdin
// and is only suitable when the zip holds a single file.
var unzip = spawn("funzip");

If your zip file contains multiple files (and the file you want isn't the first one) I'm afraid to say you're out of luck. Unzip needs to seek in the `.zip` file since zip files are just a container, and unzip may just unzip the last file in it. In that case you have to save the file temporarily ([node-temp](

[To see links please register here]

) comes in handy).

Sadly, you can't **pipe** the response stream into the unzip job the way node's `zlib` library allows; you have to cache the data and wait for the end of the response. For big files I suggest piping the response to an `fs` stream, otherwise you will fill up your memory in a blink!

I don't completely understand what you are trying to do, but imho this is **the best approach**. You should **keep your data in memory only for as long as you really need it**, and then **stream to the [csv parser](

[To see links please register here]

)**.

*If you want to keep all your data in memory, you can replace the csv parser method `fromPath` with `from`, which takes a buffer instead, and have `getData` return `unzipped` directly.*

You can use `AdmZip` (as @mihai said) instead of `node-zip`; just note that `AdmZip` is not yet published on npm, so you need to install it from GitHub:

$ npm install git://github.com/cthackers/adm-zip.git
**N.B. Assumption: the zip file contains only one file**



var request = require('request'),
fs = require('fs'),
csv = require('csv')
NodeZip = require('node-zip')

/**
 * Downloads a zip archive to a temporary file, then writes every entry of the
 * archive to its own temporary file and reports each resulting path.
 *
 * @param {string} tmpFolder - Path prefix for temporary files; it is
 *   concatenated directly with a generated name, so include the trailing separator.
 * @param {string} url - Address of the zip archive to download.
 * @param {function} callback - Called as `callback(err)` when the download,
 *   the temp-file write, or the archive read fails; otherwise called as
 *   `callback(err, tempFilePath)` once per archive entry.
 */
function getData(tmpFolder, url, callback) {
  var tempZipFilePath = tmpFolder + new Date().getTime() + Math.random()
  var tempZipFileStream = fs.createWriteStream(tempZipFilePath)
  var failed = false

  // Report only the first download/write failure; later events are ignored.
  function fail(err) {
    if (failed) return
    failed = true
    callback(err)
  }

  tempZipFileStream.on('error', fail)

  // Wait for the write stream's 'finish' rather than the request's 'end':
  // 'end' only means the response was fully read, while 'finish' means all
  // of it has been flushed to the temporary file.
  tempZipFileStream.on('finish', function () {
    if (failed) return
    fs.readFile(tempZipFilePath, 'base64', function (err, zipContent) {
      if (err) return callback(err)
      var zip
      try {
        zip = new NodeZip(zipContent, { base64: true })
      } catch (zipErr) {
        // e.g. the download was not a valid zip archive
        return callback(zipErr)
      }
      Object.keys(zip.files).forEach(function (filename) {
        var tempFilePath = tmpFolder + new Date().getTime() + Math.random()
        var unzipped = zip.files[filename].data
        fs.writeFile(tempFilePath, unzipped, function (err) {
          callback(err, tempFilePath)
        })
      })
    })
  })

  request.get({
    url: url,
    encoding: null
  }).on('error', function (err) {
    // The pipe does not close the destination on a source error, so release
    // the file handle explicitly before reporting.
    tempZipFileStream.destroy()
    fail(err)
  }).pipe(tempZipFileStream)
}

// Download and extract the archive, then stream the extracted file through
// the csv parser, separating '#'-prefixed rows from regular data rows.
getData('/tmp/', 'http://bdn-ak.bloomberg.com/precanned/Comdty_Calendar_Spread_Option_20120428.txt.zip', function (err, path) {
  if (err) {
    // Pass the message as a format argument so `%s` is substituted instead of
    // being printed literally in front of the message.
    return console.error('error: %s', err.message)
  }
  // Rows whose NAME starts with '#' are collected here rather than returned
  // from the transform.
  var metadata = []
  csv().fromPath(path, {
    delimiter: '|',
    columns: true
  }).transform(function (data){
    // do things with your data
    if (data.NAME[0] === '#') {
      metadata.push(data.NAME)
    } else {
      return data
    }
  }).on('data', function (data, index) {
    console.log('#%d %s', index, JSON.stringify(data, null, ' '))
  }).on('end',function (count) {
    console.log('Metadata: %s', JSON.stringify(metadata, null, ' '))
    console.log('Number of lines: %d', count)
  }).on('error', function (error) {
    console.error('csv parsing error: %s', error.message)
  })
})

You need a library that can handle buffers. The latest version of `adm-zip` will do:

npm install adm-zip

My solution uses the `http.get` method, since it returns Buffer chunks.

Code:

var file_url = 'http://notepad-plus-plus.org/repository/7.x/7.6/npp.7.6.bin.x64.zip';

var AdmZip = require('adm-zip');
var http = require('http');

// Download the archive into memory, then list its entries and print the text
// of every entry whose name matches /readme/.
http.get(file_url, function(res) {
  // http.get does not follow redirects or treat error statuses specially, so
  // anything other than 200 would not carry the zip archive itself.
  if (res.statusCode !== 200) {
    console.error('Download failed with HTTP status ' + res.statusCode);
    res.resume(); // drain the response so the socket can be released
    return;
  }

  var chunks = [];

  res.on('error', function(err) {
    console.error('Download failed: ' + err.message);
  });

  res.on('data', function(chunk) {
    chunks.push(chunk);

  }).on('end', function() {
    // Buffers cannot be resized, so the collected chunks are joined into one
    // new Buffer once the whole response has arrived.
    var buf = Buffer.concat(chunks);

    var zip = new AdmZip(buf);
    var zipEntries = zip.getEntries();
    console.log(zipEntries.length)

    zipEntries.forEach(function(entry) {
      if (entry.entryName.match(/readme/))
        console.log(zip.readAsText(entry));
    });
  });
}).on('error', function(err) {
  // Connection-level failures are emitted on the request object returned by
  // http.get, not on the response.
  console.error('Download failed: ' + err.message);
});

The idea is to create an array of buffers and concatenate them into a new one at the end. This is due to the fact that buffers cannot be resized.

**Update**

This is a simpler solution that uses the `request` module to obtain the response in a buffer, by setting `encoding: null` in the options. It also follows redirects and resolves http/https automatically.

var file_url = 'https://github.com/mihaifm/linq/releases/download/3.1.1/linq.js-3.1.1.zip';

var AdmZip = require('adm-zip');
var request = require('request');

// Fetch the archive as a raw Buffer (`encoding: null`), then list its entries
// and print the text of every entry whose name matches /readme/i.
request.get({url: file_url, encoding: null}, (err, res, body) => {
  // On a failed request there is no usable body to hand to AdmZip.
  if (err) {
    console.error('Download failed: ' + err.message);
    return;
  }

  var zip = new AdmZip(body);
  var zipEntries = zip.getEntries();
  console.log(zipEntries.length);

  zipEntries.forEach((entry) => {
    if (entry.entryName.match(/readme/i))
      console.log(zip.readAsText(entry));
  });
});

The `body` of the response is a buffer that can be passed directly to `AdmZip`, simplifying the whole process.

burgonet958602

dukeicctitc

Proparafoilkbkkfp

cathead180

disserviceably386389