-
-
Save asiellb/62d6b88be18dc4a8a6ad56d7978bdff8 to your computer and use it in GitHub Desktop.
Parsing And Serializing Large Datasets Using Newline-Delimited JSON In Node.js
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{"id":1,"name":"O Brother, Where Art Thou?"}
{"id":2,"name":"Home for the Holidays"}
{"id":3,"name":"The Firm"}
{"id":4,"name":"Broadcast News"}
{"id":5,"name":"Raising Arizona"}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Require the core node modules.
var chalk = require( "chalk" );
var fileSystem = require( "fs" );
var ndjson = require( "ndjson" );

// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //

// Imagine that we are performing some sort of data migration and we have to move data
// from one database to flat files; then transport those flat files elsewhere; then,
// import those flat files into a different database.
var records = [
	{ id: 1, name: "O Brother, Where Art Thou?" },
	{ id: 2, name: "Home for the Holidays" },
	{ id: 3, name: "The Firm" },
	{ id: 4, name: "Broadcast News" },
	{ id: 5, name: "Raising Arizona" }
	// .... hundreds of thousands of records ....
];

// Traditionally, we might store ONE JSON document PER FILE. However, this has some
// serious implications once we move out of local development environment and into
// production. As the JSON documents grow in size, we run the risk of running out of
// memory (during the serialization and parsing process). To get around this, we can
// use a slightly different storage format in which our data file is not ONE JSON
// document PER FILE, but rather ONE JSON document PER LINE. This is known as "ndjson"
// or "Newline-Delimited JSON". To use this format, we're going to create an ndjson
// Transform stream (aka "through" stream) that takes each JavaScript object and
// writes it as a newline-delimited String to the output stream (which will be a
// file-output stream in our case).
// --
// NOTE: We're using .ndjson - NOT .json - for this storage format.
var transformStream = ndjson.stringify();

// Pipe the ndjson serialized output to the file-system.
var outputStream = transformStream.pipe( fileSystem.createWriteStream( __dirname + "/data.ndjson" ) );

// Surface stream failures (eg, a record that cannot be serialized, or a file-system
// error) explicitly. Without these handlers, an "error" event on either stream would
// crash the process as an uncaught exception with no helpful context.
transformStream.on(
	"error",
	function handleError( error ) {
		console.error( chalk.red( "ndjson serialization error:" ), error );
	}
);
outputStream.on(
	"error",
	function handleError( error ) {
		console.error( chalk.red( "File output error:" ), error );
	}
);

// Iterate over the records and write EACH ONE to the TRANSFORM stream individually.
// Each one of these records will become a line in the output file.
// --
// NOTE: .write() returns false once the stream's internal buffer is full. For a
// record-set that is truly "hundreds of thousands of records", we should stop writing
// when it returns false and resume on the "drain" event - otherwise the entire
// record-set gets buffered in memory, defeating the point of streaming. This demo
// record-set is small enough that the simple loop is safe.
records.forEach(
	function iterator( record ) {
		transformStream.write( record );
	}
);

// Once we've written each record in the record-set, we have to end the stream so that
// the TRANSFORM stream knows to flush and close the file output stream.
transformStream.end();

// Once ndjson has flushed all data to the output stream, let's indicate done.
outputStream.on(
	"finish",
	function handleFinish() {
		console.log( chalk.green( "ndjson serialization complete!" ) );
		console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );
	}
);

// ----------------------------------------------------------------------------------- //
// ----------------------------------------------------------------------------------- //

// Since the stream actions are event-driven (and asynchronous), we have to wait until
// our output stream has been closed before we can try reading it back in.
outputStream.on(
	"finish",
	function handleFinish() {
		// When we read the file back into memory, ndjson will stream, buffer, and split
		// the content based on the newline character. It will then parse each newline-
		// delimited value as a JSON object and emit it from the TRANSFORM stream.
		var inputStream = fileSystem.createReadStream( __dirname + "/data.ndjson" );
		var transformStream = inputStream.pipe( ndjson.parse() );

		// Again, surface failures (eg, an unreadable file, or a malformed JSON line)
		// instead of crashing on an unhandled "error" event.
		inputStream.on(
			"error",
			function handleError( error ) {
				console.error( chalk.red( "File input error:" ), error );
			}
		);
		transformStream.on(
			"error",
			function handleError( error ) {
				console.error( chalk.red( "ndjson parsing error:" ), error );
			}
		);

		transformStream
			// Each "data" event will emit one item from our original record-set.
			.on(
				"data",
				function handleRecord( data ) {
					console.log( chalk.red( "Record (event):" ), data );
				}
			)
			// Once ndjson has parsed all the input, let's indicate done.
			.on(
				"end",
				function handleEnd() {
					console.log( "- - - - - - - - - - - - - - - - - - - - - - -" );
					console.log( chalk.green( "ndjson parsing complete!" ) );
				}
			)
		;
	}
);
Sign up for free to join this conversation on GitHub.
Already have an account? Sign in to comment.