Created
June 14, 2019 08:42
-
-
Save sesn/0767b0cfb38220e3cb857e3ee67fe872 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
const config = require('config'); | |
const AWS = require('aws-sdk'); | |
const fs = require('fs'); | |
const path = require('path'); | |
// Global AWS SDK configuration, applied once when this module is loaded.
// The region is read from the `textract` config section.
const textractSettings = config.get('textract');
AWS.config.region = textractSettings.region;
// NOTE(review): credentials are built from the `s3` config section rather than
// from `textract` — confirm this sharing is intentional.
const awsCredentials = new AWS.Credentials(config.get('s3'));
AWS.config.credentials = awsCredentials;
// Textract client used by generateAwsTextract; constructed after the global
// region/credentials assignments above.
const awsTextract = new AWS.Textract();
/**
 * Generate CSV data from an image using AWS Textract table analysis.
 *
 * Reads the file at `inputFile`, sends its bytes to Textract's
 * `analyzeDocument` call with the `TABLES` feature enabled, and renders every
 * TABLE block of the response through `generateAwsTextractTableCsv`.
 *
 * @param {Object} options
 * @param {string} options.inputFile - Path of the input file.
 * @returns {Promise<string|Object>} CSV text for all detected tables (each
 *   table's CSV followed by two newlines), or an empty object `{}` when the
 *   response contains no TABLE block (kept for backward compatibility).
 * @throws Rejects when the file cannot be read or when Textract reports an
 *   error.
 */
async function generateAwsTextract({ inputFile }) {
  // readFileSync already returns a Buffer, so no additional copy is needed.
  const inputBuffer = fs.readFileSync(inputFile);
  const params = {
    Document: {
      Bytes: inputBuffer,
    },
    FeatureTypes: ['TABLES'],
  };

  // Adapt the callback-style SDK call to a promise. On error we must stop
  // immediately: no response data is available to inspect in that case.
  const data = await new Promise((resolve, reject) => {
    awsTextract.analyzeDocument(params, (err, result) => {
      if (err) {
        reject(err);
        return;
      }
      resolve(result);
    });
  });

  const blocks = (data && data.Blocks) || [];

  // Index every block by Id (cells and words are looked up by Id later) and
  // collect the TABLE blocks in response order.
  const blocksMap = {};
  const tableBlocks = [];
  for (const block of blocks) {
    blocksMap[block.Id] = block;
    if (block.BlockType === 'TABLE') {
      tableBlocks.push(block);
    }
  }

  if (tableBlocks.length === 0) {
    // No table detected: existing callers receive an empty object here.
    return {};
  }

  let csv = '';
  tableBlocks.forEach((table, index) => {
    csv += generateAwsTextractTableCsv(table, blocksMap, index);
    csv += '\n\n';
  });
  return csv;
}
/**
 * Build the text content of a block from its CHILD relationships.
 *
 * Each WORD child contributes its `Text` followed by a space; each
 * SELECTION_ELEMENT child whose `SelectionStatus` is `SELECTED` contributes
 * `'X '`. Other child types, non-CHILD relationships, and child ids that are
 * not present in `blocks_map` are skipped.
 *
 * @param {Object} result - Block (e.g. a CELL) whose text is wanted.
 * @param {Object} blocks_map - Lookup of block Id -> block.
 * @returns {string} Concatenated text (with a trailing space when non-empty),
 *   or `''` when the block has no relationships.
 */
function getAwsTextractText(result, blocks_map) {
  let text = '';
  if (!result || !result.Relationships) {
    return text;
  }

  for (const relationship of result.Relationships) {
    if (!relationship || relationship.Type !== 'CHILD') {
      continue;
    }
    for (const childId of relationship.Ids || []) {
      const child = blocks_map[childId];
      if (!child) {
        // Dangling reference: ignore it instead of throwing on `.BlockType`.
        continue;
      }
      if (child.BlockType === 'WORD') {
        text += `${child.Text} `;
      } else if (child.BlockType === 'SELECTION_ELEMENT' && child.SelectionStatus === 'SELECTED') {
        text += 'X ';
      }
    }
  }
  return text;
}
/**
 * Collect the text of every CELL belonging to a table, keyed by row and column.
 *
 * Walks all CHILD relationships of `table_result`, looks each child id up in
 * `blocks_map`, and for every CELL block stores the text produced by
 * `getAwsTextractText` at `rows[RowIndex][ColumnIndex]`.
 *
 * @param {Object} table_result - TABLE block whose cells are collected.
 * @param {Object} blocks_map - Lookup of block Id -> block.
 * @returns {Object} Map of row index -> (column index -> cell text). Always an
 *   object; empty when the table has no relationships or no cells.
 */
function generateAwsTextractRowsColumnMap(table_result, blocks_map) {
  const rows = {};

  for (const relationship of table_result.Relationships || []) {
    if (!relationship || relationship.Type !== 'CHILD') {
      continue;
    }
    for (const id of relationship.Ids || []) {
      const cell = blocks_map[id];
      if (!cell || cell.BlockType !== 'CELL') {
        continue;
      }
      const rowIndex = cell.RowIndex;
      const colIndex = cell.ColumnIndex;
      if (typeof rows[rowIndex] === 'undefined') {
        rows[rowIndex] = {};
      }
      rows[rowIndex][colIndex] = getAwsTextractText(cell, blocks_map);
    }
  }

  // Return only after every relationship has been visited, so cells listed
  // under a later relationship are not dropped.
  return rows;
}
/**
 * Quote a single CSV field when it contains a comma, double quote, or line
 * break (embedded double quotes are doubled); other values pass through as-is.
 */
function escapeAwsTextractCsvField(value) {
  const text = String(value);
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text;
}

/**
 * Render one Textract table as CSV text.
 *
 * Output layout: a header line `BOM Table: Table_<n>` (n is 1-based), then one
 * line per row in which every cell is followed by a comma, then three
 * trailing newlines.
 *
 * @param {Object} table_result - TABLE block to render.
 * @param {Object} blocks_map - Lookup of block Id -> block.
 * @param {number} table_index - Zero-based position of the table.
 * @returns {string} CSV text for the table.
 */
function generateAwsTextractTableCsv(table_result, blocks_map, table_index) {
  // Fall back to an empty map so a table without cells yields just the header.
  const rows = generateAwsTextractRowsColumnMap(table_result, blocks_map) || {};
  // Add 1 numerically before formatting; concatenating onto the label string
  // would append the character "1" instead (Table_01, Table_11, ...).
  let csv = `BOM Table: Table_${table_index + 1}\n`;

  for (const rowKey of Object.keys(rows)) {
    for (const colKey of Object.keys(rows[rowKey])) {
      csv += `${escapeAwsTextractCsvField(rows[rowKey][colKey])},`;
    }
    csv += '\n';
  }

  csv += '\n\n\n';
  return csv;
}
module.exports = { | |
generateAwsTextract | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment