/**
* This module provides functions to transform document files (primarily DOCX)
* to HTML and Markdown using Pandoc, handling asset extraction and URL updates.
* @module transform-files
*/
import path from "path";
import fs from "fs-extra";
import { glob } from "glob";
import { pandoc } from "./pandoc.js";
import { urlize, cleanHtml, markdownRemoveImageDimensions } from "./utils.js";
import createLogger from "@ulu/node-logger";
import { hasRequiredProps } from "@ulu/utils/object.js";
// Module-level logger shared by every function in this file.
// NOTE(review): the string argument is presumably used as a label/prefix on each message — confirm in @ulu/node-logger.
const logger = createLogger("Pandoc Adapter: Glob:");
/**
 * Formats a filename for log messages by wrapping it in parentheses.
 * @param {string} filename - The name to display.
 * @returns {string} The name surrounded by "(" and ")".
 */
const logFilename = (filename) => {
  return `(${filename})`;
};
/**
 * Names of the options a caller must supply; they are also listed in the
 * error message raised when one is missing.
 *
 * `assetRelDir` and `assetPublicPath` are not enforced here
 * (`assetPublicPath` has a value in `defaults`).
 */
const requiredOptions = ["inputDir", "outputDir", "assetDir"];
// Checker built from requiredOptions; transformFiles calls it with the user's
// options object and throws when the result is falsy.
const hasRequiredOptions = hasRequiredProps(requiredOptions);
/**
* Default options for the transformFiles function.
* @typedef {object} TransformFilesOptions
* @property {string} inputDir - The directory containing the input files.
* @property {string} outputDir - The directory where the output files will be written.
* @property {string} assetDir - The directory where extracted media (images, etc.) will be written.
* @property {string} assetPublicPath - The final asset path (images/etc paths are modified to point at this absolute path, like the base path in site it will be used in).
* @property {string} pattern - The glob pattern used to select input files.
* @property {boolean} emptyOutputDir - Whether to empty (delete the contents of) each file's output directory before writing to it.
* @property {boolean} emptyAssetDir - Whether to empty (delete the contents of) each file's asset directory before media is extracted into it.
* @property {boolean} updateAssetUrls - Whether you want the asset paths (absolute based on assetDir) to be updated to assetPublicPath (relative as though in a website/etc)
* @property {import("./pandoc").PandocAdapterOptions} adapterOptions - Options to pass to the pandoc adapter.
* @property {boolean} adapterOptions.allowError - Allow errors from pandoc.
* @property {boolean} adapterOptions.allowStdoutError - Allow errors from pandoc stdout.
* @property {object} adapterOptions.execFile - Options for the child_process.execFile.
* @property {number} adapterOptions.execFile.maxBuffer - Maximum buffer size.
* @property {function} getFileOutputPath - Function to determine the output file path.
* @property {function} getFileOutputDir - Function to determine the output directory.
* @property {function} getFileAssetDir - Function to determine the asset extraction directory.
* @property {function} getHtmlArgs - Function to generate Pandoc arguments for HTML conversion.
* @property {function} getMarkdownArgs - Function to generate Pandoc arguments for Markdown conversion.
* @property {function} beforeWrite - Function to modify the output before writing to disk.
*/
/**
 * Default configuration for `transformFiles`. User-supplied options are
 * shallow-merged over this object, so any key given by the caller replaces
 * the whole default value for that key (including `adapterOptions`).
 * @type {TransformFilesOptions}
 */
export const defaults = {
  // The three directory options have no usable default and must be supplied.
  inputDir: null,
  outputDir: null,
  assetDir: null,
  assetPublicPath: "/assets/extracted",
  // `[!~]` rejects names starting with "~" and `?*` requires at least one more
  // character before ".docx" (presumably to skip Word's "~$" lock files).
  pattern: "[!~]?*.docx",
  emptyOutputDir: true,
  emptyAssetDir: true,
  updateAssetUrls: true,
  adapterOptions: {
    allowError: true,
    allowStdoutError: true,
    execFile: {
      maxBuffer: 1024 * 1000 * 1000
    }
  },

  /**
   * Builds the path of the converted file: `<outputDir>/<slug>/converted.<ext>`.
   * @param {object} ctx - The file processing context (reads `options`, `slug`, `ext`).
   * @returns {string} The output file path.
   */
  getFileOutputPath(ctx) {
    const { options, slug, ext } = ctx;
    const relativeFile = `${ slug }/converted.${ ext }`;
    return path.join(options.outputDir, relativeFile);
  },

  /**
   * Builds the per-file output directory: `<outputDir>/<slug>/`.
   * @param {object} ctx - The file processing context (reads `options`, `slug`).
   * @returns {string} The output directory.
   */
  getFileOutputDir(ctx) {
    const { options, slug } = ctx;
    return path.join(options.outputDir, `${ slug }/`);
  },

  /**
   * Builds the per-file media extraction directory: `<assetDir>/<slug>/`.
   * @param {object} ctx - The file processing context (reads `options`, `slug`).
   * @returns {string} The asset extraction directory.
   */
  getFileAssetDir(ctx) {
    const { options, slug } = ctx;
    return path.join(options.assetDir, `${ slug }/`);
  },

  /**
   * Produces the Pandoc command-line arguments for the DOCX to HTML pass.
   * @param {object} ctx - The file processing context (reads `index`, `options`).
   * @returns {string[]} An array of Pandoc arguments.
   */
  getHtmlArgs(ctx) {
    const idPrefix = `--id-prefix=d${ ctx.index }r`;
    const extractMedia = `--extract-media=${ ctx.options.getFileAssetDir(ctx) }`;
    return ["--from=docx", "--to=html", idPrefix, extractMedia];
  },

  /**
   * Produces the Pandoc command-line arguments for the DOCX to Markdown pass.
   * @param {object} ctx - The file processing context (reads `index`, `options`).
   * @returns {string[]} An array of Pandoc arguments.
   */
  getMarkdownArgs(ctx) {
    const idPrefix = `--id-prefix=d${ ctx.index }r`;
    const extractMedia = `--extract-media=${ ctx.options.getFileAssetDir(ctx) }`;
    return [
      "--from=docx",
      "--to=markdown-grid_tables-multiline_tables-pipe_tables-bracketed_spans-native_spans",
      "--reference-location=block",
      "--columns=110",
      idPrefix,
      extractMedia,
    ];
  },

  /**
   * Hook invoked with the final markup just before it is written to disk.
   * The default implementation returns the markup unchanged.
   * @param {string} markup - The markup string (HTML or Markdown).
   * @param {object} ctx - The file processing context.
   * @returns {string} The markup to write.
   */
  beforeWrite: (markup, ctx) => markup
};
/**
 * Transforms every file matched by `options.pattern` inside `options.inputDir`
 * to HTML and Markdown using Pandoc.
 *
 * All matched files are started together and this function waits for every one
 * of them to settle, so no conversion is still running when the returned
 * promise resolves. Failures (from globbing or from any individual file) are
 * logged and do not reject the returned promise.
 * @async
 * @param {TransformFilesOptions} userOptions - User-provided options, shallow-merged over `defaults`.
 * @returns {Promise<Array|undefined>} Resolves with the per-file results when every file
 *   succeeded, or with `undefined` when globbing or at least one file failed.
 * @throws {Error} When one of the required options is missing from `userOptions`.
 */
export async function transformFiles(userOptions = {}) {
  if (!hasRequiredOptions(userOptions)) {
    throw new Error(`Missing a required option, options required: ${ requiredOptions.join(", ") }`);
  }
  const options = Object.assign({}, defaults, userOptions);
  try {
    const files = await glob(options.pattern, { cwd: options.inputDir });
    // allSettled (rather than all) so one failing file neither hides the
    // failures of the others nor lets this function resolve while the
    // remaining conversions are still writing to disk.
    const outcomes = await Promise.allSettled(
      files.map((filepath, index) => processFile(filepath, index, options))
    );
    const failures = outcomes.filter((outcome) => outcome.status === "rejected");
    if (failures.length > 0) {
      for (const failure of failures) {
        logger.error(failure.reason);
      }
      return undefined;
    }
    return outcomes.map((outcome) => outcome.value);
  } catch (error) {
    logger.error(error);
    return undefined;
  }
}
/**
 * Processes a single file: converts it to HTML, then to Markdown, writing each
 * result to the path returned by `options.getFileOutputPath`.
 *
 * For each format the Pandoc output is post-processed (`cleanHtml` for HTML,
 * `markdownRemoveImageDimensions` for Markdown), optionally has every
 * occurrence of `options.assetDir` replaced by `options.assetPublicPath`, and
 * is passed through `options.beforeWrite` before being written.
 * @async
 * @param {string} filepath - The path to the file, relative to `options.inputDir`.
 * @param {number} index - The index of the file in the list (used in the Pandoc id prefix).
 * @param {TransformFilesOptions} options - The merged processing options.
 * @returns {Promise<void>} A Promise that resolves when both outputs are written.
 * @throws Re-throws any error raised during either conversion after logging it.
 */
async function processFile(filepath, index, options) {
  const absolute = path.resolve(options.inputDir, filepath);
  const file = fs.readFileSync(absolute);
  // Strip the final extension to get the display name / slug source.
  const name = path.basename(absolute).replace(/\.[^/.]+$/, "");
  const slug = urlize(name);
  const ctx = { absolute, name, index, slug, options };
  const ctxHtml = { ...ctx, ext: "html", type: "html" };
  const ctxMarkdown = { ...ctx, ext: "md", type: "markdown" };
  const fileOutputDir = options.getFileOutputDir(ctx);
  const fileAssetDir = options.getFileAssetDir(ctx);
  // Replace the asset directory as a literal string. Building a RegExp from
  // a filesystem path misreads characters such as ".", "+", "(" or "\" as
  // pattern syntax, and a regex replace would also expand "$" sequences in
  // the public path.
  const updateAssetUrls = content => content.split(options.assetDir).join(options.assetPublicPath);

  fs.ensureDirSync(fileOutputDir);
  if (options.emptyOutputDir) {
    logger.log(`Emptying (${fileOutputDir})...`);
    fs.emptyDirSync(fileOutputDir);
  }
  if (options.emptyAssetDir) {
    logger.log(`Emptying (${fileAssetDir})...`);
    fs.emptyDirSync(fileAssetDir);
  }

  try {
    const html = await pandoc({
      input: file,
      args: options.getHtmlArgs(ctxHtml),
      ...options.adapterOptions
    });
    let finalHtml = await cleanHtml(html);
    const outputTo = options.getFileOutputPath(ctxHtml);
    if (options.updateAssetUrls) {
      finalHtml = updateAssetUrls(finalHtml);
    }
    if (options.beforeWrite) {
      finalHtml = options.beforeWrite(finalHtml, ctxHtml);
    }
    fs.writeFileSync(outputTo, finalHtml);
    logger.log("Successfully converted to html!", logFilename(name));
  } catch (error) {
    handleError(`Unable to convert file to html! ${logFilename(name)}`, filepath, error);
    throw error; // Re-throw so the caller (transformFiles) sees the failure
  }

  try {
    const markdown = await pandoc({
      input: file,
      args: options.getMarkdownArgs(ctxMarkdown),
      ...options.adapterOptions
    });
    const outputTo = options.getFileOutputPath(ctxMarkdown);
    let finalMarkdown = markdownRemoveImageDimensions(markdown);
    if (options.updateAssetUrls) {
      finalMarkdown = updateAssetUrls(finalMarkdown);
    }
    if (options.beforeWrite) {
      finalMarkdown = options.beforeWrite(finalMarkdown, ctxMarkdown);
    }
    fs.writeFileSync(outputTo, finalMarkdown);
    logger.log("Successfully converted to markdown!", logFilename(name));
  } catch (error) {
    handleError(`Unable to convert to markdown! ${logFilename(name)}`, filepath, error);
    throw error; // Re-throw so the caller (transformFiles) sees the failure
  }
}
/**
 * Reports a failure that occurred while processing a file.
 *
 * The title and file path go through the module logger at error level; the
 * error object itself is then printed separately with `console.log`.
 * This function does not throw or re-throw; callers decide whether to propagate.
 * @param {string} title - The error title.
 * @param {string} filepath - The path to the file being processed.
 * @param {Error} error - The error object.
 */
function handleError(title, filepath, error) {
logger.error(title, filepath);
// NOTE(review): console.log writes to stdout, not stderr — confirm that is intended for error details.
console.log(error);
}