transform-files.js

/**
 * This module provides functions to transform document files (primarily DOCX)
 * to HTML and Markdown using Pandoc, handling asset extraction and URL updates.
 * @module transform-files
 */

import path from "path";
import fs from "fs-extra";
import { glob } from "glob";
import { pandoc } from "./pandoc.js";
import { urlize, cleanHtml, markdownRemoveImageDimensions } from "./utils.js";
import createLogger from "@ulu/node-logger";
import { hasRequiredProps } from "@ulu/utils/object.js";

// Module-wide logger; the string passed here is handed to the logger factory as its label.
const logger = createLogger("Pandoc Adapter: Glob:");

/**
 * Wraps a filename in parentheses so it stands out in log messages.
 * @param {string} filename - The name to format.
 * @returns {string} The filename surrounded by parentheses.
 */
function logFilename(filename) {
  return `(${ filename })`;
}

// Option keys the caller must always supply. "assetRelDir" and
// "assetPublicPath" are deliberately not part of this list.
const requiredOptions = ["inputDir", "outputDir", "assetDir"];

// Predicate built from the list above; called with the user's options object.
const hasRequiredOptions = hasRequiredProps(requiredOptions);
/**
 * Options accepted by the transformFiles function.
 * @typedef {object} TransformFilesOptions
 * @property {string} inputDir - The directory containing the input files.
 * @property {string} outputDir - The directory where the output files will be written.
 * @property {string} assetDir - The directory where extracted media (images, etc.) will be written. 
 * @property {string} assetPublicPath - The final asset path (images/etc paths are modified to point at this absolute path, like the base path in site it will be used in).
 * @property {string} pattern - The glob pattern used to select input files.
 * @property {boolean} emptyOutputDir - Empty (delete contents) of output directory before populating
 * @property {boolean} emptyAssetDir - Empty (delete contents) of asset directory before populating
 * @property {boolean} updateAssetUrls - Whether you want the asset paths (absolute based on assetDir) to be updated to assetPublicPath (relative as though in a website/etc)
 * @property {import("./pandoc").PandocAdapterOptions} adapterOptions - Options to pass to the pandoc adapter.
 * @property {boolean} adapterOptions.allowError - Allow errors from pandoc.
 * @property {boolean} adapterOptions.allowStdoutError - Allow errors from pandoc stdout.
 * @property {object} adapterOptions.execFile - Options for the child_process.execFile.
 * @property {number} adapterOptions.execFile.maxBuffer - Maximum buffer size.
 * @property {function} getFileOutputPath - Function to determine the output file path.
 * @property {function} getFileOutputDir - Function to determine the output directory.
 * @property {function} getFileAssetDir - Function to determine the asset extraction directory. 
 * @property {function} getHtmlArgs - Function to generate Pandoc arguments for HTML conversion.
 * @property {function} getMarkdownArgs - Function to generate Pandoc arguments for Markdown conversion.
 * @property {function} beforeWrite - Function to modify the output before writing to disk.
 */


/**
 * Fallback option values for `transformFiles`. Required directories are left
 * as `null` here and must be provided by the caller.
 * @type {TransformFilesOptions}
 */
export const defaults = {
  inputDir: null,
  outputDir: null,
  assetDir: null,
  assetPublicPath: "/assets/extracted",
  pattern: "[!~]?*.docx",
  emptyOutputDir: true,
  emptyAssetDir: true,
  updateAssetUrls: true,
  adapterOptions: {
    allowError: true,
    allowStdoutError: true,
    execFile: {
      maxBuffer: 1024 * 1000000,
    },
  },

  /**
   * Builds the path of the converted file inside the per-document folder.
   * @param {object} ctx - The file processing context (reads `options.outputDir`, `slug`, `ext`).
   * @returns {string} `<outputDir>/<slug>/converted.<ext>`
   */
  getFileOutputPath: ({ options, slug, ext }) =>
    path.join(options.outputDir, `${ slug }/converted.${ ext }`),

  /**
   * Builds the per-document output folder.
   * @param {object} ctx - The file processing context (reads `options.outputDir`, `slug`).
   * @returns {string} `<outputDir>/<slug>/`
   */
  getFileOutputDir: ({ options, slug }) =>
    path.join(options.outputDir, `${ slug }/`),

  /**
   * Builds the per-document folder that media is extracted into.
   * @param {object} ctx - The file processing context (reads `options.assetDir`, `slug`).
   * @returns {string} `<assetDir>/<slug>/`
   */
  getFileAssetDir: ({ options, slug }) =>
    path.join(options.assetDir, `${ slug }/`),

  /**
   * Produces the Pandoc argument list for the DOCX to HTML conversion.
   * @param {object} ctx - The file processing context.
   * @returns {string[]} The Pandoc arguments.
   */
  getHtmlArgs(ctx) {
    // The id prefix is derived from the file index (d<index>r).
    const idPrefix = `--id-prefix=d${ ctx.index }r`;
    // Resolved through ctx.options so a caller-supplied getFileAssetDir is honoured.
    const extractMedia = `--extract-media=${ ctx.options.getFileAssetDir(ctx) }`;
    return ["--from=docx", "--to=html", idPrefix, extractMedia];
  },

  /**
   * Produces the Pandoc argument list for the DOCX to Markdown conversion.
   * @param {object} ctx - The file processing context.
   * @returns {string[]} The Pandoc arguments.
   */
  getMarkdownArgs(ctx) {
    const idPrefix = `--id-prefix=d${ ctx.index }r`;
    const extractMedia = `--extract-media=${ ctx.options.getFileAssetDir(ctx) }`;
    return [
      "--from=docx",
      "--to=markdown-grid_tables-multiline_tables-pipe_tables-bracketed_spans-native_spans",
      "--reference-location=block",
      "--columns=110",
      idPrefix,
      extractMedia,
    ];
  },

  /**
   * Hook invoked with the final markup right before it is written; the
   * default implementation hands the markup back untouched.
   * @param {string} markup - The markup string (HTML or Markdown).
   * @param {object} ctx - The file processing context.
   * @returns {string} The markup to write.
   */
  beforeWrite: (markup, ctx) => markup,
};


/**
 * Transforms every file matching `options.pattern` inside `options.inputDir`
 * from DOCX to HTML and Markdown using Pandoc.
 *
 * All files are processed concurrently and the function waits for every one of
 * them to settle, so no conversion is still writing to disk when the returned
 * promise resolves. Failures are logged (one entry per failed file) instead of
 * being thrown.
 * @async
 * @param {TransformFilesOptions} userOptions - User-provided options to override the defaults.
 * @returns {Promise<Array<void>|undefined>} Resolves with one entry per processed file when
 *   everything succeeded, or with `undefined` when globbing or any conversion failed.
 * @throws {Error} When one of the required options is missing.
 */
export async function transformFiles(userOptions = {}) {
  if (!hasRequiredOptions(userOptions)) {
    throw new Error(`Missing a required option, options required: ${ requiredOptions.join(", ") }`);
  }

  // Shallow merge: a user-supplied value replaces the default wholesale.
  const options = Object.assign({}, defaults, userOptions);

  try {
    const files = await glob(options.pattern, { cwd: options.inputDir });
    // allSettled (rather than all) keeps waiting after the first rejection, so
    // sibling conversions are never left running in the background.
    const results = await Promise.allSettled(
      files.map((filepath, index) => processFile(filepath, index, options))
    );
    const failures = results.filter(result => result.status === "rejected");

    if (failures.length > 0) {
      failures.forEach(failure => logger.error(failure.reason));
      return undefined;
    }
    return results.map(result => result.value);
  } catch (error) {
    // Reached when globbing itself fails; conversions report through the branch above.
    logger.error(error);
  }
}

/**
 * Processes a single file, converting it to HTML and then to Markdown.
 *
 * For each format the Pandoc result is post-processed, optionally has every
 * occurrence of `options.assetDir` replaced by `options.assetPublicPath`, is
 * passed through `options.beforeWrite`, and is written to the path returned by
 * `options.getFileOutputPath`.
 * @async
 * @param {string} filepath - The path to the file to process (resolved against `options.inputDir`).
 * @param {number} index - The index of the file in the list.
 * @param {TransformFilesOptions} options - The processing options.
 * @returns {Promise<void>} A Promise that resolves when the file is processed.
 * @throws Rethrows, after logging, any error raised during either conversion.
 */
async function processFile(filepath, index, options) {
  const absolute = path.resolve(options.inputDir, filepath);
  const file = fs.readFileSync(absolute);
  // Strip the last extension to get the document name.
  const name = path.basename(absolute).replace(/\.[^/.]+$/, "");
  const slug = urlize(name);
  const ctx = { absolute, name, index, slug, options };
  const ctxHtml = { ...ctx, ext: "html", type: "html" };
  const ctxMarkdown = { ...ctx, ext: "md", type: "markdown" };
  const fileOutputDir = options.getFileOutputDir(ctx);
  const fileAssetDir = options.getFileAssetDir(ctx);
  // assetDir is a filesystem path, not a pattern: escape regex metacharacters
  // so directories containing ".", "(", "+", "\" etc. are matched literally.
  const assetDirRegex = new RegExp(
    String(options.assetDir).replace(/[.*+?^${}()|[\]\\]/g, "\\$&"),
    "g"
  );
  // A function replacer keeps "$" sequences in assetPublicPath literal.
  const updateAssetUrls = content => content.replace(assetDirRegex, () => options.assetPublicPath);

  fs.ensureDirSync(fileOutputDir);

  if (options.emptyOutputDir) {
    logger.log(`Emptying (${fileOutputDir})...`);
    fs.emptyDirSync(fileOutputDir);
  }

  if (options.emptyAssetDir) {
    logger.log(`Emptying (${fileAssetDir})...`);
    fs.emptyDirSync(fileAssetDir);
  }

  try {
    const html = await pandoc({
      input: file,
      args: options.getHtmlArgs(ctxHtml),
      ...options.adapterOptions
    });

    let finalHtml = await cleanHtml(html);
    const outputTo = options.getFileOutputPath(ctxHtml);

    if (options.updateAssetUrls) {
      finalHtml = updateAssetUrls(finalHtml);
    }

    if (options.beforeWrite) {
      finalHtml = options.beforeWrite(finalHtml, ctxHtml);
    }
    fs.writeFileSync(outputTo, finalHtml);
    logger.log("Successfully converted to html!", logFilename(name));
  } catch (error) {
    handleError(`Unable to convert file to html!  ${logFilename(name)}`, filepath, error);
    throw error; // Re-throw so transformFiles can report the failure
  }

  try {
    const markdown = await pandoc({
      input: file,
      args: options.getMarkdownArgs(ctxMarkdown),
      ...options.adapterOptions
    });
    const outputTo = options.getFileOutputPath(ctxMarkdown);

    let finalMarkdown = markdownRemoveImageDimensions(markdown);

    if (options.updateAssetUrls) {
      finalMarkdown = updateAssetUrls(finalMarkdown);
    }

    if (options.beforeWrite) {
      finalMarkdown = options.beforeWrite(finalMarkdown, ctxMarkdown);
    }
    fs.writeFileSync(outputTo, finalMarkdown);
    logger.log("Successfully converted to markdown!", logFilename(name));
  } catch (error) {
    handleError(`Unable to convert to markdown! ${logFilename(name)}`, filepath, error);
    throw error; // Re-throw so transformFiles can report the failure
  }
}

/**
 * Reports a failure that occurred while processing a file.
 *
 * The title and file path go through the module logger; the raw error object
 * is then printed via `console.error` so it goes to stderr rather than stdout.
 * @param {string} title - The error title.
 * @param {string} filepath - The path to the file being processed.
 * @param {Error} error - The error object.
 */
function handleError(title, filepath, error) {
  logger.error(title, filepath);
  console.error(error);
}