Skip to content

Instantly share code, notes, and snippets.

@mokshchadha
Created May 29, 2025 06:14
Show Gist options
  • Save mokshchadha/c36f92edc711d7a512ef0933d4cea64b to your computer and use it in GitHub Desktop.
Save mokshchadha/c36f92edc711d7a512ef0933d4cea64b to your computer and use it in GitHub Desktop.
A service module that merges multiple DOCX files into a single DOCX while preserving their styles

PDF Merger Script Prerequisites

Required software to run the PDF merger and conversion script.

Prerequisites: Node.js and Python are already installed.

Required Software

1. PDF Toolkit (pdftk)

For merging PDF files.

macOS:

brew install pdftk-java

Linux:

sudo apt install pdftk

2. LibreOffice

For document format conversions.

macOS:

brew install --cask libreoffice

Linux:

sudo apt install libreoffice

Verification

pdftk --version
libreoffice --version
# or
soffice --version

Basic Usage

# Merge PDFs
pdftk file1.pdf file2.pdf cat output merged.pdf

# Convert PDF to ODT, then to DOCX
libreoffice --headless --convert-to odt merged.pdf
libreoffice --headless --convert-to docx merged.odt
const { exec, execFile } = require('child_process');
const fs = require('fs');
const os = require('os');
const path = require('path');
/**
 * Delete every file in `filePaths`, logging the outcome for each one.
 *
 * Missing files are reported and skipped; a failure to delete one file is
 * logged and does not stop the remaining files from being processed.
 *
 * @param {string[]} filePaths - Paths of the temporary files to remove.
 * @returns {void}
 */
function cleanupTempFiles(filePaths) {
  // Handles a single path; all failures are contained here so one bad
  // entry never aborts the whole sweep.
  const removeOne = (target) => {
    // Resolved lazily so the name is only computed when a message is printed.
    const label = () => path.basename(target);
    try {
      if (!fs.existsSync(target)) {
        console.log(`File not found (already deleted?): ${label()}`);
        return;
      }
      fs.unlinkSync(target);
      console.log(`Deleted: ${label()}`);
    } catch (failure) {
      console.error(`Error deleting ${label()}: ${failure.message}`);
    }
  };

  console.log('Starting cleanup of temporary files...');
  filePaths.forEach((entry) => removeOne(entry));
  console.log('Cleanup completed');
}
/**
 * Delete leftover intermediate PDF files from a directory.
 *
 * Only files whose *entire* name matches one of the known patterns
 * (`first.pdf`, `second.pdf`, `merged.pdf`, `temp_*.pdf`) are removed.
 * `*` is the only wildcard; every other character is matched literally.
 *
 * @param {?string} [tempDir=null] - Directory to clean. Falls back to the
 *   system temp directory (`os.tmpdir()`) when omitted or falsy.
 * @returns {{success: true, deletedCount: number} | {success: false, error: string}}
 *   `success: false` is returned when the directory itself cannot be read;
 *   failures to delete individual files are logged and skipped.
 */
function forceCleanupPdfFiles(tempDir = null) {
  const workingDir = tempDir || os.tmpdir();
  console.log(`Force cleaning PDF files in: ${workingDir}`);

  const pdfPatterns = [
    'first.pdf',
    'second.pdf',
    'merged.pdf',
    'temp_*.pdf',
  ];

  // Compile each pattern once into an anchored regex. Literal parts are
  // escaped so that "." only matches a dot, and the anchors stop a pattern
  // such as "temp_*.pdf" from matching "xtemp_1.pdf" or "temp_1.pdf.bak".
  const matchers = pdfPatterns.map((pattern) => {
    const source = pattern
      .split('*')
      .map((literal) => literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'))
      .join('[\\s\\S]*');
    return new RegExp(`^${source}$`);
  });

  try {
    const files = fs.readdirSync(workingDir);
    let deletedCount = 0;

    files.forEach((file) => {
      if (!matchers.some((matcher) => matcher.test(file))) {
        return;
      }
      try {
        fs.unlinkSync(path.join(workingDir, file));
        console.log(`Force deleted: ${file}`);
        deletedCount++;
      } catch (error) {
        // The file vanished between listing and deletion: nothing to do.
        if (error.code === 'ENOENT') {
          return;
        }
        console.error(`Failed to delete ${file}: ${error.message}`);
      }
    });

    console.log(`Force cleanup completed. Deleted ${deletedCount} files.`);
    return { success: true, deletedCount };
  } catch (error) {
    console.error(`Error during force cleanup: ${error.message}`);
    return { success: false, error: error.message };
  }
}
/**
 * Merge two Word documents into a single DOCX by round-tripping through PDF.
 *
 * Pipeline:
 *   0. any `.doc` input is first converted to `.docx` with LibreOffice;
 *   1-2. each input is converted to PDF with LibreOffice;
 *   3. `pdftk` concatenates the two PDFs;
 *   4. `pdf2docx` converts the merged PDF back to DOCX. It is run from a
 *      `pdf_env` Python virtualenv in the current working directory, which
 *      is created (and `pdf2docx` installed into it) if the directory does
 *      not exist yet;
 *   5. the result is copied to `outputDocx`.
 *
 * Every intermediate file lives in a fresh, uniquely named directory under
 * `os.tmpdir()` that is removed before `callback` is invoked, on success and
 * on failure alike. Each input gets its own sub-directory so two inputs that
 * share a base name cannot overwrite each other's PDF, and concurrent calls
 * do not share any file names.
 *
 * External programs are started without a shell (except step 4, which needs
 * bash for the virtualenv), so file names containing quotes, `$` or
 * backticks are passed through verbatim.
 *
 * @param {string} firstFileName - Path of the document that comes first.
 * @param {string} secondFileName - Path of the document appended after it.
 * @param {string} outputDocx - Path the merged DOCX is written to.
 * @param {function(?string, string=): void} callback - Called once with
 *   `(errorMessage)` on failure (a string, not an Error) or
 *   `(null, successMessage)` on success.
 * @returns {void}
 */
function mergeDocx(firstFileName, secondFileName, outputDocx, callback) {
  const platform = os.platform();
  const libreOfficeCmd = platform === 'darwin' ? 'soffice' : 'libreoffice';
  console.log(`Platform detected: ${platform}`);
  console.log(`Using LibreOffice command: ${libreOfficeCmd}`);

  fs.mkdtemp(path.join(os.tmpdir(), 'merge-docx-'), (mkdtempError, createdDir) => {
    if (mkdtempError) {
      return callback(`Error creating temporary directory: ${mkdtempError.message}`);
    }

    const workDir = path.resolve(createdDir);
    const firstDir = path.join(workDir, 'first');
    const secondDir = path.join(workDir, 'second');
    const mergedPdf = path.join(workDir, 'merged.pdf');
    const mergedDocx = path.join(workDir, 'merged.docx');

    // Single exit point: remove the private work directory, then report.
    function finish(errorMessage, successMessage) {
      console.log('Starting cleanup of temporary files...');
      try {
        fs.rmSync(workDir, { recursive: true, force: true });
      } catch (cleanupError) {
        console.error(`Error deleting ${workDir}: ${cleanupError.message}`);
      }
      console.log('Cleanup completed');
      if (errorMessage) {
        return callback(errorMessage);
      }
      console.log(`Document merge completed successfully: ${outputDocx}`);
      callback(null, successMessage);
    }

    // Run one headless LibreOffice conversion and hand back the output path.
    function convertWithLibreOffice(inputFile, format, outDir, done) {
      // An absolute path can never be mistaken for a command-line option.
      const absoluteInput = path.resolve(inputFile);
      const baseName = path.basename(absoluteInput, path.extname(absoluteInput));
      const outputPath = path.join(outDir, `${baseName}.${format}`);
      const args = ['--headless', '--convert-to', format, '--outdir', outDir, absoluteInput];
      execFile(libreOfficeCmd, args, (error) => {
        if (error) {
          return done(error);
        }
        // LibreOffice may exit successfully without writing anything (for
        // example when the source cannot be loaded), so verify the output.
        if (!fs.existsSync(outputPath)) {
          return done(new Error(`LibreOffice did not produce ${path.basename(outputPath)}`));
        }
        done(null, outputPath);
      });
    }

    // Step 0: legacy .doc inputs (any letter case) become .docx; others pass through.
    function convertDocToDocx(inputFile, outDir, done) {
      if (path.extname(inputFile).toLowerCase() !== '.doc') {
        return done(null, inputFile);
      }
      console.log(`Converting ${inputFile} from .doc to .docx...`);
      convertWithLibreOffice(inputFile, 'docx', outDir, (error, docxPath) => {
        if (error) {
          return done(`Error converting ${inputFile} to DOCX: ${error.message}`);
        }
        console.log(`Step 0: Successfully converted ${inputFile} to DOCX`);
        done(null, docxPath);
      });
    }

    // Steps 1-2: render both inputs to PDF, each in its own directory.
    function convertBothToPdf(firstInput, secondInput) {
      convertWithLibreOffice(firstInput, 'pdf', firstDir, (firstError, firstPdf) => {
        if (firstError) {
          return finish(`Error converting first file to PDF: ${firstError.message}`);
        }
        console.log('Step 1: First file converted to PDF');
        convertWithLibreOffice(secondInput, 'pdf', secondDir, (secondError, secondPdf) => {
          if (secondError) {
            return finish(`Error converting second file to PDF: ${secondError.message}`);
          }
          console.log('Step 2: Second file converted to PDF');
          mergePdfs(firstPdf, secondPdf);
        });
      });
    }

    // Step 3: concatenate the two PDFs with pdftk.
    function mergePdfs(firstPdf, secondPdf) {
      execFile('pdftk', [firstPdf, secondPdf, 'cat', 'output', mergedPdf], (error) => {
        if (error) {
          return finish(`Error merging PDFs: ${error.message}`);
        }
        console.log('Step 3: PDFs merged successfully');
        convertMergedPdfToDocx();
      });
    }

    // Step 4: convert the merged PDF to DOCX inside the pdf_env virtualenv.
    function convertMergedPdfToDocx() {
      // Paths travel through environment variables, so the script text is
      // constant and never contains file names.
      const setupAndConvertCmd = [
        '# Check if virtual environment exists, create if not',
        'if [ ! -d "pdf_env" ]; then',
        'python3 -m venv pdf_env',
        'source pdf_env/bin/activate',
        'pip install pdf2docx',
        'else',
        'source pdf_env/bin/activate',
        'fi',
        '# Convert PDF to DOCX',
        'pdf2docx convert "$MERGED_PDF" "$MERGED_DOCX"',
      ].join('\n');
      const env = Object.assign({}, process.env, {
        MERGED_PDF: mergedPdf,
        MERGED_DOCX: mergedDocx,
      });
      exec(setupAndConvertCmd, { shell: '/bin/bash', env }, (error) => {
        if (error) {
          return finish(`Error converting PDF to DOCX: ${error.message}`);
        }
        console.log('Step 4: PDF converted to DOCX');
        copyResult();
      });
    }

    // Step 5: publish the merged document at the caller's requested path.
    function copyResult() {
      fs.copyFile(mergedDocx, outputDocx, (error) => {
        if (error) {
          return finish(`Error copying final file: ${error.message}`);
        }
        finish(null, `Successfully merged documents into ${outputDocx}`);
      });
    }

    try {
      fs.mkdirSync(firstDir);
      fs.mkdirSync(secondDir);
    } catch (mkdirError) {
      return finish(`Error creating temporary directory: ${mkdirError.message}`);
    }

    convertDocToDocx(firstFileName, firstDir, (firstError, processedFirstFile) => {
      if (firstError) {
        return finish(firstError);
      }
      convertDocToDocx(secondFileName, secondDir, (secondError, processedSecondFile) => {
        if (secondError) {
          return finish(secondError);
        }
        console.log('Doc to Docx conversion completed. Proceeding with PDF conversion...');
        convertBothToPdf(processedFirstFile, processedSecondFile);
      });
    });
  });
}
/**
 * Platform-checked entry point for merging two documents.
 *
 * On macOS and Linux this delegates to `mergeDocx`, which already selects
 * the LibreOffice command for those platforms (`soffice` on darwin,
 * `libreoffice` otherwise). Every other platform is rejected up front with
 * an error passed to `callback`, so the callback is always invoked.
 *
 * @param {string} firstFileName - Path of the document that comes first.
 * @param {string} secondFileName - Path of the document appended after it.
 * @param {string} outputDocx - Path the merged DOCX is written to.
 * @param {function(?string, string=): void} callback - Called with
 *   `(errorMessage)` on failure or `(null, successMessage)` on success.
 * @returns {void}
 */
function mergeDocxCrossPlatform(firstFileName, secondFileName, outputDocx, callback) {
  const platform = os.platform();
  switch (platform) {
    case 'darwin': // macOS
    case 'linux':
      return mergeDocx(firstFileName, secondFileName, outputDocx, callback);
    case 'win32': // Windows
      // LibreOffice is typically installed at
      // "C:\Program Files\LibreOffice\program\soffice.exe", but mergeDocx
      // runs its PDF-to-DOCX step through /bin/bash, so the pipeline cannot
      // complete here. Fail fast instead of leaving the caller waiting.
      return callback(`Unsupported platform: ${platform} (the merge pipeline requires /bin/bash)`);
    default:
      return callback(`Unsupported platform: ${platform}`);
  }
}
// Usage examples:
//
// Basic usage:
// mergeDocx('document1.docx', 'document2.docx', 'output_merged.docx', (error, result) => {
// if (error) {
// console.error('Error:', error);
// } else {
// console.log('Success:', result);
// }
// });
//
// Force cleanup if needed:
// forceCleanupPdfFiles(); // Cleans system temp directory
// forceCleanupPdfFiles('/custom/temp/path'); // Cleans specific directory
// Public API: the two merge entry points plus the temp-file cleanup helpers.
module.exports = { mergeDocx, mergeDocxCrossPlatform, cleanupTempFiles, forceCleanupPdfFiles };
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment