Created
July 22, 2022 14:31
-
-
Save Gopikrishna19/5104160c50827aa0366997b954f9f701 to your computer and use it in GitHub Desktop.
CSV Parser - Final state machine - Javascript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Names of the parser's states. Each value doubles as the key of the matching
 * handler in the state machine that `csvParser` builds.
 */
const states = Object.freeze({
  // Right after a comma: the next character begins a new field.
  startField: 'startField',
  // Initial state, and the state entered after a line break ends a row.
  startRow: 'startRow',
  // A `"` was just read inside a quoted field: it either closes the field or
  // is the first half of a doubled `""`.
  escaped: 'escaped',
  // Inside a field that did not open with a quote.
  nonQuoted: 'nonQuoted',
  // Inside a field that opened with a quote.
  quoted: 'quoted',
});
// Sentinel handed to the current state's handler once every input character
// has been consumed, so the handler can finish (or reject) the pending field.
const EOF = 'eof';
/**
 * Parses CSV text into rows of string fields with a character-driven state
 * machine.
 *
 * - Fields are separated by `,`; rows end at `\n`, `\r` or `\r\n`.
 * - Blank lines are skipped, whichever line-break style is used.
 * - A field wrapped in double quotes may contain commas and line breaks; a
 *   doubled `""` inside it stands for one literal `"`.
 * - Quotes that appear inside an unquoted field are kept verbatim.
 * - A comma at the very end of the input still yields a final empty field,
 *   exactly as it does when followed by a line break.
 *
 * @param {string} content - Raw CSV text.
 * @returns {string[][]} One array per row, holding that row's field values.
 * @throws {Error} If a quoted field is never closed, or a closing quote is
 *   followed by anything other than `"`, `,`, a line break or end of input.
 */
const csvParser = (content) => {
  const data = [];
  let fieldBuffer = [];

  const isLineBreak = (char) => char === '\n' || char === '\r';

  const startRow = () => {
    data.push([]);
  };

  // Moves the buffered characters into the current row as one field.
  const endField = () => {
    data[data.length - 1].push(fieldBuffer.join(''));
    fieldBuffer = [];
  };

  const machineError = () => new Error('Invalid end of state');

  // Each handler consumes one character (or the EOF sentinel) and returns the
  // name of the next state. The value returned for EOF is never used.
  const machine = {
    [states.startRow](char) {
      if (char === EOF) {
        return undefined;
      }
      if (isLineBreak(char)) {
        // Skips blank lines and the `\n` half of a `\r\n` pair. `\r` must be
        // skipped here too, otherwise a blank CR/CRLF line would open a new
        // row whose field starts with a raw `\r`.
        return states.startRow;
      }

      // Rows are created lazily so trailing line breaks add no empty row.
      startRow();

      if (char === ',') {
        endField();
        return states.startField;
      }
      if (char === '"') {
        return states.quoted;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },

    [states.startField](char) {
      if (char === EOF) {
        // The input ended right after a comma: that comma still introduces a
        // final, empty field.
        endField();
        return undefined;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (char === '"') {
        return states.quoted;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },

    [states.escaped](char) {
      if (char === EOF) {
        endField();
        return undefined;
      }
      if (char === '"') {
        // Second half of a doubled quote: emit one literal `"` and stay
        // inside the quoted field.
        fieldBuffer.push(char);
        return states.quoted;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      // Anything else directly after a closing quote is malformed.
      throw machineError();
    },

    [states.quoted](char) {
      if (char === EOF) {
        // The closing quote never arrived.
        throw machineError();
      }
      if (char === '"') {
        return states.escaped;
      }
      // Commas and line breaks are ordinary data inside quotes.
      fieldBuffer.push(char);
      return states.quoted;
    },

    [states.nonQuoted](char) {
      if (char === EOF) {
        endField();
        return undefined;
      }
      if (char === ',') {
        endField();
        return states.startField;
      }
      if (isLineBreak(char)) {
        endField();
        return states.startRow;
      }
      fieldBuffer.push(char);
      return states.nonQuoted;
    },
  };

  let state = states.startRow;
  for (const char of content) {
    state = machine[state](char);
  }
  // Let the final state flush or reject whatever is still pending.
  machine[state](EOF);

  return data;
};

// Guarded so the parser can also be evaluated where no CommonJS `module`
// object exists; under CommonJS this is a plain named export.
if (typeof module !== 'undefined' && module.exports) {
  module.exports.csvParser = csvParser;
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// JEST
const {csvParser} = require('./index');

// Behavioural suite for csvParser: every case feeds raw CSV text and compares
// the parsed rows with the expected array of field arrays.
describe('parser', () => {
  it('should handle no content', () => {
    expect(csvParser('')).toEqual([]);
  });

  it('should handle one word', () => {
    expect(csvParser('hello')).toEqual([['hello']]);
    expect(csvParser('"hello"')).toEqual([['hello']]);
  });

  it('should handle two words', () => {
    expect(csvParser('hello,world')).toEqual([['hello', 'world']]);
    expect(csvParser('hello,"world"')).toEqual([['hello', 'world']]);
  });

  it('should handle escaped words', () => {
    // Quotes inside an unquoted field are kept verbatim; inside a quoted
    // field a doubled quote collapses to a single one.
    expect(csvParser('hello,world""')).toEqual([['hello', 'world""']]);
    expect(csvParser('hello,"""world"""')).toEqual([['hello', '"world"']]);
  });

  it('should handle empty words', () => {
    expect(csvParser('hello,,world')).toEqual([['hello', '', 'world']]);
    expect(csvParser('"hello",,world')).toEqual([['hello', '', 'world']]);
    expect(csvParser(',world')).toEqual([['', 'world']]);
    expect(csvParser(',\n')).toEqual([['', '']]);
    expect(csvParser(',\r')).toEqual([['', '']]);
    expect(csvParser(',\r\n')).toEqual([['', '']]);
  });

  it('should handle lf', () => {
    expect(csvParser('hello\n"hello"')).toEqual([['hello'], ['hello']]);
    expect(csvParser('hello,world\nhello,world')).toEqual([['hello', 'world'], ['hello', 'world']]);
    expect(csvParser('hello,,world\n"hello",,world')).toEqual([['hello', '', 'world'], ['hello', '', 'world']]);
  });

  it('should handle cr', () => {
    expect(csvParser('hello\r"hello"')).toEqual([['hello'], ['hello']]);
    expect(csvParser('hello,world\rhello,world')).toEqual([['hello', 'world'], ['hello', 'world']]);
    expect(csvParser('hello,,world\r"hello",,world')).toEqual([['hello', '', 'world'], ['hello', '', 'world']]);
  });

  it('should handle crlf', () => {
    expect(csvParser('hello\r\n"hello"')).toEqual([['hello'], ['hello']]);
    expect(csvParser('hello,world\r\nhello,world')).toEqual([['hello', 'world'], ['hello', 'world']]);
    expect(csvParser('hello,,world\r\n"hello",,world')).toEqual([['hello', '', 'world'], ['hello', '', 'world']]);
  });

  it('should handle all', () => {
    // Builds the same document with a caller-chosen line break so one
    // expectation covers LF, CR and CRLF input.
    const sampleRows = (lineBreak) => [
      'hello,world,"this,is,good"',
      'this,has,numbers,1234',
      'this,has,special,characters,!@#$%^&*()\'[]{}./\\|-=_+<>?',
      'hello,world,"this,is,escaped""quotation"""',
      'this,,has,,spaces',
      'this has spaces',
      `and,"trailing,spaces",with,eol,${lineBreak}`,
    ].join(lineBreak);

    const lfRows = sampleRows('\n');
    const crRows = sampleRows('\r');
    const crlfRows = sampleRows('\r\n');

    const expected = [
      ['hello', 'world', 'this,is,good'],
      ['this', 'has', 'numbers', '1234'],
      ['this', 'has', 'special', 'characters', '!@#$%^&*()\'[]{}./\\|-=_+<>?'],
      ['hello', 'world', 'this,is,escaped"quotation"'],
      ['this', '', 'has', '', 'spaces'],
      ['this has spaces'],
      ['and', 'trailing,spaces', 'with', 'eol', ''],
    ];

    expect(csvParser(lfRows)).toEqual(expected);
    expect(csvParser(crRows)).toEqual(expected);
    expect(csvParser(crlfRows)).toEqual(expected);
  });
});
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment