Skip to content

Instantly share code, notes, and snippets.

@nchaulet
Last active August 29, 2015 14:06
Show Gist options
  • Save nchaulet/a6d41c259d3a8048b49d to your computer and use it in GitHub Desktop.
Save nchaulet/a6d41c259d3a8048b49d to your computer and use it in GitHub Desktop.
Parse docx
npm install mammoth
var docxReader = require("mammoth/lib/docx/docx-reader");
var path = require('path');
var fs = require('fs');
var os = require('os').EOL;
// User-adjustable settings: where the generated JSON articles and the
// extracted images are written.
var postOutputDir = '.';
var imageOutputDir = 'images';

// The image directory has to exist before any image is written into it,
// so create it up front when it is missing.
if (fs.existsSync(imageOutputDir) === false) {
  fs.mkdirSync(imageOutputDir);
}
/**
 * Converts .docx documents into JSON "article" files containing the raw text
 * of the document, and extracts embedded images into `imageOutputDir`.
 *
 * The counters live on the object so that article and image file names keep
 * counting up across successive conversions.
 */
var docxConverter = {
  // Number used to name the next article<N>.json file.
  fileIndex: 1,
  // Number used to name the next extracted image file.
  imageIndex: 1,
  // Promises of image writes started by convertElementToRawText that
  // convertDoc has not waited for yet.
  pendingImageWrites: [],

  /**
   * Reads the document at `docPath`, flattens it to raw text and writes
   * `{filename, content}` as JSON to `article<fileIndex>.json` inside
   * `postOutputDir`.
   *
   * @param {string} docPath - Path of the .docx file to convert.
   * @returns {Promise<string>} Resolves with the path of the JSON file once
   *   the images and the JSON have been written; rejects if reading the
   *   document or writing any output fails.
   */
  convertDoc: function (docPath) {
    // Return the chain so callers can wait for completion and see failures.
    return docxReader.read({path: docPath})
      .then(function (documentResult) {
        var converted = documentResult.map(docxConverter.convertElementToRawText);
        // Take ownership of the image writes started while mapping and wait
        // for them right away, so a failed write rejects this conversion
        // instead of being lost.
        var imageWrites = docxConverter.pendingImageWrites;
        docxConverter.pendingImageWrites = [];
        return Promise.all(imageWrites).then(function () {
          return converted;
        });
      })
      .then(function (result) {
        var doc = {
          filename: docPath,
          content: result.value
        };
        var outputPath = path.join(postOutputDir, 'article' + docxConverter.fileIndex + '.json');
        fs.writeFileSync(outputPath, JSON.stringify(doc));
        docxConverter.fileIndex++;
        return outputPath;
      });
  },

  /**
   * Recursively flattens a document element into plain text.
   *
   * Text elements yield their value. Image elements are saved to disk and
   * replaced by an `[[image:<filename>]]` placeholder. Any other element
   * yields the concatenation of its children, followed by a blank line when
   * the element is a paragraph.
   *
   * @param {Object} element - Document element with a `type` property.
   * @returns {string} Raw text for the element and its descendants.
   */
  convertElementToRawText: function (element) {
    if (element.type === 'text') {
      return element.value;
    }

    if (element.type === 'image') {
      // The content type may be missing or lack a subtype; fall back to a
      // generic extension instead of throwing or producing "N.undefined".
      var extension = (element.contentType || '').split('/')[1] || 'bin';
      var filename = docxConverter.imageIndex + '.' + extension;
      // Keep the write promise so convertDoc can wait for it.
      docxConverter.pendingImageWrites.push(docxConverter.saveImage(element, filename));
      docxConverter.imageIndex++;
      return '[[image:' + filename + ']]';
    }

    var tail = element.type === 'paragraph' ? '\n\n' : '';
    return (element.children || []).map(docxConverter.convertElementToRawText).join('') + tail;
  },

  /**
   * Writes the binary content of an image element into `imageOutputDir`.
   *
   * @param {Object} element - Image element exposing `read()`, which returns
   *   a promise for the image buffer.
   * @param {string} filename - File name to use inside `imageOutputDir`.
   * @returns {Promise} Settles once the image has been written.
   */
  saveImage: function (element, filename) {
    return element.read().then(function (imageBuffer) {
      var imagePath = path.join(imageOutputDir, filename);
      fs.writeFileSync(imagePath, imageBuffer);
    });
  }
};
// Entry point: kick off the conversion of the sample document test.docx.
docxConverter.convertDoc("test.docx");
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment