Move to a custom substitutions file format

* The new format is easier to read and write, at the cost of a custom parser.
* The extra validation during parsing caught a few cases where we were accidentally replacing text with "undefined", due to typos. In particular, Worm Imago 21.3 and Ward Gleaming 9.11.
* We now have separate substitutions files for each book.
* We now read the substitutions from disk once per book, and pass the parsed result to the worker, which should be more efficient.
domenic · Jan 19, 2025 · 67ec627 · 67ec627
1 parent 281b3dd
commit 67ec627
Show file tree

Hide file tree

Showing 10 changed files with 7,487 additions and 9,342 deletions.
diff --git a/README.md b/README.md
@@ -68,6 +68,6 @@ You can see all the chosen character-name titles in the [`book-data/`](./book-da
 
 ## Text fixups
 
-This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code; there's [`convert-worker.js`](https://github.com/domenic/worm-scraper/blob/master/lib/convert-worker.js), where some things are handled generally, and [`substitutions.json`](https://github.com/domenic/worm-scraper/blob/master/lib/substitutions.json), for one-off fixes.
+This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code: there's [`convert-worker.js`](lib/convert-worker.js), where some things are handled generally, and the [`substitutions/` directory](./substitutions/), for one-off fixes.
 
 This process is designed to be extensible, so if you notice any problems with the original text that you think should be fixed, file an issue to let me know, and we can update the fixup code so that the resulting ebook is improved. (Or better yet, send a pull request!)
diff --git a/lib/convert-worker.js b/lib/convert-worker.js
@@ -2,15 +2,19 @@
 const workerpool = require("workerpool");
 const fs = require("fs");
 const { JSDOM } = require("jsdom");
-const substitutions = require("./substitutions.json");
 
 workerpool.worker({ convertChapter });
 
-function convertChapter(chapter, bookTitle, inputPath, outputPath) {
+function convertChapter(chapter, bookTitle, inputPath, outputPath, chapterSubstitutions) {
   const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });
 
   const rawChapterJSDOM = new JSDOM(contents);
-  const { output, warnings } = getChapterString(chapter, bookTitle, rawChapterJSDOM.window.document);
+  const { output, warnings } = getChapterString(
+    chapter,
+    bookTitle,
+    chapterSubstitutions,
+    rawChapterJSDOM.window.document
+  );
 
   // TODO: this should probably not be necessary... jsdom bug I guess!?
   rawChapterJSDOM.window.close();
@@ -19,9 +23,9 @@ function convertChapter(chapter, bookTitle, inputPath, outputPath) {
   return warnings;
 }
 
-function getChapterString(chapter, bookTitle, rawChapterDoc) {
+function getChapterString(chapter, bookTitle, chapterSubstitutions, rawChapterDoc) {
   const { xml, warnings } =
-    getBodyXML(chapter, bookTitle, rawChapterDoc.querySelector(".entry-content"));
+    getBodyXML(chapter, bookTitle, chapterSubstitutions, rawChapterDoc.querySelector(".entry-content"));
 
   const output = `<?xml version="1.0" encoding="utf-8" ?>
 <!DOCTYPE html>
@@ -40,7 +44,7 @@ ${xml}
   return { output, warnings };
 }
 
-function getBodyXML(chapter, bookTitle, contentEl) {
+function getBodyXML(chapter, bookTitle, chapterSubstitutions, contentEl) {
   const warnings = [];
 
   // Remove initial Next Chapter and Previous Chapter <p>
@@ -287,21 +291,21 @@ function getBodyXML(chapter, bookTitle, contentEl) {
   xml = fixParahumansOnline(xml, bookTitle);
 
   // One-off fixes
-  for (const substitution of substitutions[chapter.url] || []) {
+  for (const substitution of chapterSubstitutions) {
     if (substitution.before) {
       const indexOf = xml.indexOf(substitution.before);
       if (indexOf === -1) {
         warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
-                      `updated at the source, in which case, you should edit substitutions.json.`);
+                      `updated at the source, in which case, you should edit the substitutions file.`);
       }
       if (indexOf !== xml.lastIndexOf(substitution.before)) {
         warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
-                      `Update substitutions.json for a more precise substitution.`);
+                      `Update the substitutions file for a more precise substitution.`);
       }
 
       xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
     } else if (substitution.regExp) {
-      xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
+      xml = xml.replace(substitution.regExp, substitution.replacement);
     } else {
       warnings.push(`Invalid substitution specified for ${chapter.url}`);
     }
@@ -336,7 +340,7 @@ function fixTruncatedWords(xml) {
   xml = xml.replace(/[‘’][Cc]age(?![a-z])/ug, "’Cage");
 
   // We can't do "’Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an
-  // open quote, so we do that in substitutions.json.
+  // open quote, so we do that in the substitutions file.
 
   return xml;
 }
@@ -355,9 +359,9 @@ function fixDialogueTags(xml) {
   //
   // This sometimes overcorrects, as in the following example:
   // > “Basically,” Alec said, “For your powers to manifest, ...
-  // Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
+  // Here instead we should lowercase the "f". We handle that via one-offs in the substitutions file.
   //
-  // This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
+  // This applies to ~800 instances, so although we have to correct back in the substitutions file a decent number of
   // times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
   // capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
   xml = xml.replace(/,” ([A-Za-z]+ [A-Za-z]+), “([A-Z])/ug, ",” $1. “$2");
@@ -535,8 +539,8 @@ function fixCapitalization(xml, bookTitle) {
     /patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
     (_, $1) => `Patrol ${$1.toLowerCase()}`
   );
-  // This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
-  // Worm, where it is never needed:
+  // This usually works in Ward (some instances corrected back in the substitutions file), and has a few false positives
+  // in Worm, where it is never needed:
   if (bookTitle === "Ward") {
     xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
   }
@@ -572,13 +576,14 @@ function fixCapitalization(xml, bookTitle) {
   xml = xml.replace(/(?<! {2}|“|>)Flock/ug, "flock");
 
   // Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
-  // can be reasonably sure they don't start a sentence, although more specific instances are done in
-  // substitutions.json, and some need to be back-corrected.
+  // can be reasonably sure they don't start a sentence, although more specific instances are done in the substitutions
+  // file, and some need to be back-corrected.
   //
   // Note: "Master" is specifically omitted because it fails poorly on Worm Interlude 4. Other instances need to be
-  // corrected via substitutions.json.
+  // corrected via the substitutions file.
   //
-  // This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
+  // This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in the substitutions
+  // file.
   xml = xml.replace(
     // eslint-disable-next-line max-len
     /(?<! {2}|“|>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
@@ -615,7 +620,7 @@ function fixCapitalization(xml, bookTitle) {
 
   // "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
   // instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
-  // substitutions.json.
+  // the substitutions file.
   xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
   xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");
 
@@ -635,8 +640,8 @@ function fixCapitalization(xml, bookTitle) {
   xml = xml.replace(/ Neo-/ug, " neo-");
 
   // Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
-  // adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
-  // case of "French toast".)
+  // adjective capitalized. The books mostly use lowercase, so let's stick with that. (The substitutions file corrects
+  // one case of "French toast".)
   xml = xml.replace(/english(?! muffin)/ug, "English");
   xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");
 
@@ -652,7 +657,7 @@ function fixCapitalization(xml, bookTitle) {
     // All plural discussions of "Titans" are after Sundown 17.y.
     xml = xml.replace(/titans/ug, "Titans");
 
-    // Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
+    // Since we can't safely change all instances of "titan", most are in the substitutions file. We can do a few here,
     // though.
     xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
     xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
@@ -725,7 +730,7 @@ function fixHyphens(xml) {
   // Preemptive(ly) is often hyphenated (not always). It should not be.
   xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");
 
-  // These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
+  // These should be hyphenated only when used as a verb. We correct those cases back in the substitutions file.
   xml = xml.replace(/fist-bump/ug, "fist bump");
   xml = xml.replace(/high-five/ug, "high five");
 
@@ -762,7 +767,7 @@ function standardizeSpellings(xml) {
   xml = xml.replace(/(\b)tv(\b)/ug, "$1TV$2");
   xml = xml.replace(/t\.v\./uig, "TV");
 
-  // "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
+  // "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via the substitutions file when people are
   // writing notes and thus probably the intention was to be less formal. Also it seems per
   // https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" should be capitalized.
   xml = xml.replace(/Ok([,. ])/ug, "Okay$1");

diff --git a/lib/convert.js b/lib/convert.js
@@ -10,6 +10,7 @@ module.exports = async (
   chapterDataPath,
   contentPath,
   bookData,
+  substitutionsPath,
   concurrentJobs,
   chapterTitleStyle
 ) => {
@@ -19,6 +20,9 @@ module.exports = async (
   await fs.writeFile(chapterDataPath, JSON.stringify(chapterData, null, 2));
   const flattenedChapters = chapterData.flatMap(arc => arc.chapters);
 
+  const substitutionsText = await fs.readFile(substitutionsPath, { encoding: "utf-8" });
+  const substitutions = parseSubstitutions(substitutionsText);
+
   console.log("Converting raw downloaded HTML to EPUB chapters");
   const progress = progressUtils.start(flattenedChapters.length);
 
@@ -32,8 +36,15 @@ module.exports = async (
   await Promise.all(flattenedChapters.map(async chapter => {
     const inputPath = path.resolve(cachePath, chapter.inputFilename);
     const outputPath = path.resolve(contentPath, chapter.outputFilename);
+    const chapterSubstitutions = substitutions.get(chapter.url) || [];
 
-    warnings.push(...await pool.exec("convertChapter", [chapter, bookData.title, inputPath, outputPath]));
+    warnings.push(...await pool.exec("convertChapter", [
+      chapter,
+      bookData.title,
+      inputPath,
+      outputPath,
+      chapterSubstitutions
+    ]));
 
     progressUtils.increment(progress);
   }));
@@ -91,3 +102,133 @@ function chooseChapterTitle(chapterData, chapterTitleStyle) {
 
   throw new Error(`Invalid chapter title style: ${chapterTitleStyle}`);
 }
+
+/**
+ * Parses the text of a substitutions file into a map from chapter URL to that chapter's list of substitutions.
+ *
+ * Every non-blank line must start (at column 0) with one of these sigils:
+ *   "@ <url>"        starts the section for a chapter; <url> must already be in canonical form.
+ *   "  - <text>"     the exact text to look for (the "before" text).
+ *   "  + <text>"     the text to replace it with; completes the preceding "-" line. A bare "  +" means empty.
+ *   "  r <pattern>"  a regular expression source, compiled with the "ug" flags.
+ *   "  s <text>"     the replacement string for the preceding "r" line.
+ *   "  # <text>"     a comment; ignored.
+ *
+ * In before/after/replacement text, a literal backslash-n stands for a newline. Before/after text is additionally
+ * passed through beforeAfterLineToString().
+ *
+ * @param {string} text - The full contents of a substitutions file.
+ * @returns {Map<string, Array<{before: string, after: string}|{regExp: RegExp, replacement: string}>>} The
+ *   substitutions, keyed by chapter URL, in file order.
+ * @throws {Error} If a line has no valid sigil, appears out of order, or a "-"/"r" line is never completed.
+ */
+function parseSubstitutions(text) {
+  const lines = text.split("\n");
+  const result = new Map();
+
+  let currentChapter = null;
+  let currentBefore = null;
+  let currentRegExp = null;
+
+  for (const [index, line] of lines.entries()) {
+    // Skip empty lines
+    if (!line.trim()) {
+      continue;
+    }
+
+    const errorPrefix = `Error in substitutions line "${line}" (line number ${index + 1}): `;
+
+    // The pattern is anchored to the start of the line. Without the anchor, an over-indented line would be accepted,
+    // and a mis-indented line that merely contains e.g. "  + " somewhere in its text would be misread as an after line.
+    const match = /^(@ | {2}- | {2}\+ ?| {2}r | {2}s | {2}# )(.*)/u.exec(line);
+    if (match === null) {
+      throw new Error(`${errorPrefix}invalid line format`);
+    }
+    const [, sigil, content] = match;
+
+    switch (sigil) {
+      // New chapter
+      case "@ ": {
+        // A pending "-" or "r" line would otherwise be dropped silently when the chapter changes.
+        if (currentBefore !== null) {
+          throw new Error(`${errorPrefix}appeared after a before (-) line`);
+        }
+        if (currentRegExp !== null) {
+          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
+        }
+        if (!isCanonicalizedURL(content)) {
+          throw new Error(`${errorPrefix}invalid chapter URL`);
+        }
+
+        currentChapter = content;
+        if (!result.has(currentChapter)) {
+          result.set(currentChapter, []);
+        }
+
+        break;
+      }
+
+      // Before line
+      case "  - ": {
+        if (currentChapter === null) {
+          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
+        }
+        if (currentBefore !== null) {
+          throw new Error(`${errorPrefix}appeared after a before (-) line`);
+        }
+        if (currentRegExp !== null) {
+          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
+        }
+        if (content === "") {
+          throw new Error(`${errorPrefix}before (-) text must not be empty`);
+        }
+
+        // Stored raw; escapes are decoded once, when the matching after (+) line arrives.
+        currentBefore = content;
+
+        break;
+      }
+
+      // After line
+      case "  +":
+      case "  + ": {
+        if (currentChapter === null || currentBefore === null) {
+          throw new Error(`${errorPrefix}missing previous current chapter (@) or before (-) line`);
+        }
+        if (currentRegExp !== null) {
+          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
+        }
+
+        result.get(currentChapter).push({
+          before: beforeAfterLineToString(currentBefore),
+          after: beforeAfterLineToString(content)
+        });
+        currentBefore = null;
+
+        break;
+      }
+
+      // RegExp line
+      case "  r ": {
+        if (currentChapter === null) {
+          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
+        }
+        if (currentBefore !== null) {
+          throw new Error(`${errorPrefix}appeared after a before (-) line`);
+        }
+        // Two consecutive "r" lines would otherwise silently discard the first pattern.
+        if (currentRegExp !== null) {
+          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
+        }
+
+        currentRegExp = new RegExp(content, "ug");
+
+        break;
+      }
+
+      // RegExp substitution
+      case "  s ": {
+        if (currentChapter === null || currentRegExp === null) {
+          throw new Error(`${errorPrefix}missing previous current chapter (@) or regexp (r) line`);
+        }
+
+        result.get(currentChapter).push({
+          regExp: currentRegExp,
+          replacement: content.replaceAll("\\n", "\n")
+        });
+        currentRegExp = null;
+
+        break;
+      }
+
+      // Comment
+      case "  # ": {
+        if (currentChapter === null) {
+          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
+        }
+
+        break;
+      }
+    }
+  }
+
+  // A trailing "-" or "r" line with no partner would otherwise be ignored without any warning.
+  if (currentBefore !== null) {
+    throw new Error(`Error in substitutions: the last before (-) line "${currentBefore}" has no after (+) line`);
+  }
+  if (currentRegExp !== null) {
+    throw new Error(`Error in substitutions: the last regexp (r) line "${currentRegExp.source}" has no ` +
+                    `substitution (s) line`);
+  }
+
+  return result;
+}
+
+/**
+ * Reports whether a string is a URL that is already in its serialized (canonical) form.
+ *
+ * @param {string} urlString - The candidate URL.
+ * @returns {boolean} true if urlString parses as a URL and serializes back to exactly urlString; false if it does
+ *   not parse at all, or parses but serializes differently.
+ */
+function isCanonicalizedURL(urlString) {
+  // URL.parse() returns null (rather than throwing) for unparseable input, so guard before reading .href.
+  const parsed = URL.parse(urlString);
+  return parsed !== null && parsed.href === urlString;
+}
+
+/**
+ * Decodes the escapes permitted in before (-) and after (+) lines of a substitutions file.
+ *
+ * @param {string} line - The raw text following the sigil.
+ * @returns {string} The text with every backslash-n turned into a newline, and a run of backslash-s sequences at the
+ *   very end turned into the same number of spaces.
+ */
+function beforeAfterLineToString(line) {
+  const withNewlines = line.replaceAll("\\n", "\n");
+
+  // Each trailing "\s" escape is two characters wide and stands for exactly one space.
+  const escapesToSpaces = escapes => " ".repeat(escapes.length / 2);
+  return withNewlines.replace(/(?:\\s)+$/u, escapesToSpaces);
+}