Skip to content

Commit

Permalink
Move to a custom substitutions file format
Browse files Browse the repository at this point in the history
* The new format is easier to read and write, at the cost of a custom parser.

* The extra validation during parsing caught a few cases where we were accidentally replacing text with "undefined", due to typos. In particular, Worm Imago 21.3 and Ward Gleaming 9.11.

* We now have separate substitutions files for each book.

* We now read the substitutions from disk once per book, and pass the parsed result to the worker, which should be more efficient.
  • Loading branch information
domenic committed Jan 19, 2025
1 parent 281b3dd commit 67ec627
Show file tree
Hide file tree
Showing 10 changed files with 7,487 additions and 9,342 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,6 @@ You can see all the chosen character-name titles in the [`book-data/`](./book-da

## Text fixups

This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code; there's [`convert-worker.js`](https://github.com/domenic/worm-scraper/blob/master/lib/convert-worker.js), where some things are handled generally, and [`substitutions.json`](https://github.com/domenic/worm-scraper/blob/master/lib/substitutions.json), for one-off fixes.
This project makes a lot of fixups to the original text, mostly around typos, punctuation, capitalization, and consistency. You can get a more specific idea of what these are via the code: there's [`convert-worker.js`](lib/convert-worker.js), where some things are handled generally, and the [`substitutions/` directory](./substitutions/), for one-off fixes.

This process is designed to be extensible, so if you notice any problems with the original text that you think should be fixed, file an issue to let me know, and we can update the fixup code so that the resulting ebook is improved. (Or better yet, send a pull request!)
55 changes: 30 additions & 25 deletions lib/convert-worker.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@
const workerpool = require("workerpool");
const fs = require("fs");
const { JSDOM } = require("jsdom");
const substitutions = require("./substitutions.json");

workerpool.worker({ convertChapter });

function convertChapter(chapter, bookTitle, inputPath, outputPath) {
function convertChapter(chapter, bookTitle, inputPath, outputPath, chapterSubstitutions) {
const contents = fs.readFileSync(inputPath, { encoding: "utf-8" });

const rawChapterJSDOM = new JSDOM(contents);
const { output, warnings } = getChapterString(chapter, bookTitle, rawChapterJSDOM.window.document);
const { output, warnings } = getChapterString(
chapter,
bookTitle,
chapterSubstitutions,
rawChapterJSDOM.window.document
);

// TODO: this should probably not be necessary... jsdom bug I guess!?
rawChapterJSDOM.window.close();
Expand All @@ -19,9 +23,9 @@ function convertChapter(chapter, bookTitle, inputPath, outputPath) {
return warnings;
}

function getChapterString(chapter, bookTitle, rawChapterDoc) {
function getChapterString(chapter, bookTitle, chapterSubstitutions, rawChapterDoc) {
const { xml, warnings } =
getBodyXML(chapter, bookTitle, rawChapterDoc.querySelector(".entry-content"));
getBodyXML(chapter, bookTitle, chapterSubstitutions, rawChapterDoc.querySelector(".entry-content"));

const output = `<?xml version="1.0" encoding="utf-8" ?>
<!DOCTYPE html>
Expand All @@ -40,7 +44,7 @@ ${xml}
return { output, warnings };
}

function getBodyXML(chapter, bookTitle, contentEl) {
function getBodyXML(chapter, bookTitle, chapterSubstitutions, contentEl) {
const warnings = [];

// Remove initial Next Chapter and Previous Chapter <p>
Expand Down Expand Up @@ -287,21 +291,21 @@ function getBodyXML(chapter, bookTitle, contentEl) {
xml = fixParahumansOnline(xml, bookTitle);

// One-off fixes
for (const substitution of substitutions[chapter.url] || []) {
for (const substitution of chapterSubstitutions) {
if (substitution.before) {
const indexOf = xml.indexOf(substitution.before);
if (indexOf === -1) {
warnings.push(`Could not find text "${substitution.before}" in ${chapter.url}. The chapter may have been ` +
`updated at the source, in which case, you should edit substitutions.json.`);
`updated at the source, in which case, you should edit the substitutions file.`);
}
if (indexOf !== xml.lastIndexOf(substitution.before)) {
warnings.push(`The text "${substitution.before}" occurred twice, and so the substitution was ambiguous. ` +
`Update substitutions.json for a more precise substitution.`);
`Update the substitutions file for a more precise substitution.`);
}

xml = xml.replace(new RegExp(escapeRegExp(substitution.before), "u"), substitution.after);
} else if (substitution.regExp) {
xml = xml.replace(new RegExp(substitution.regExp, "ug"), substitution.replacement);
xml = xml.replace(substitution.regExp, substitution.replacement);
} else {
warnings.push(`Invalid substitution specified for ${chapter.url}`);
}
Expand Down Expand Up @@ -336,7 +340,7 @@ function fixTruncatedWords(xml) {
xml = xml.replace(/[][Cc]age(?![a-z])/ug, "’Cage");

// We can't do "’Clear" (short for Crystalclear) here because it appears too much as a normal word preceded by an
// open quote, so we do that in substitutions.json.
// open quote, so we do that in the substitutions file.

return xml;
}
Expand All @@ -355,9 +359,9 @@ function fixDialogueTags(xml) {
//
// This sometimes overcorrects, as in the following example:
// > “Basically,” Alec said, “For your powers to manifest, ...
// Here instead we should lowercase the "f". We handle that via one-offs in substitutions.json.
// Here instead we should lowercase the "f". We handle that via one-offs in the substitutions file.
//
// This applies to ~800 instances, so although we have to correct back in substitutions.json a decent number of
// This applies to ~800 instances, so although we have to correct back in the substitutions file a decent number of
// times, it definitely pays for itself. Most of the instances we have to correct back we also need to fix the
// capitalization anyway, and that's harder to do automatically, since proper names/"I"/etc. stay capitalized.
xml = xml.replace(/, ([A-Za-z]+ [A-Za-z]+), ([A-Z])/ug, ",” $1. “$2");
Expand Down Expand Up @@ -535,8 +539,8 @@ function fixCapitalization(xml, bookTitle) {
/patrol (block|group|leader|guard|student|uniform|squad|soldier|officer|crew|girl|bus|training)/uig,
(_, $1) => `Patrol ${$1.toLowerCase()}`
);
// This usually works in Ward (some instances corrected back in substitutions.json), and has a few false positives in
// Worm, where it is never needed:
// This usually works in Ward (some instances corrected back in the substitutions file), and has a few false positives
// in Worm, where it is never needed:
if (bookTitle === "Ward") {
xml = xml.replace(/the patrol(?!s|ling)/ug, "the Patrol");
}
Expand Down Expand Up @@ -572,13 +576,14 @@ function fixCapitalization(xml, bookTitle) {
xml = xml.replace(/(?<! {2}||>)Flock/ug, "flock");

// Especially early in Worm, PRT designations are capitalized; they should not be. This fixes the cases where we
// can be reasonably sure they don't start a sentence, although more specific instances are done in
// substitutions.json, and some need to be back-corrected.
// can be reasonably sure they don't start a sentence, although more specific instances are done in the substitutions
// file, and some need to be back-corrected.
//
// Note: "Master" is specifically omitted because it fails poorly on Worm Interlude 4. Other instances need to be
// corrected via substitutions.json.
// corrected via the substitutions file.
//
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in substitutions.json.
// This also over-de-capitalizes "The Stranger" in Ward (a titan name). Those also get fixed in the substitutions
// file.
xml = xml.replace(
// eslint-disable-next-line max-len
/(?<! {2}||>|\n|: )(Mover|Shaker|Brute|Breaker|Tinker|Blaster|Thinker|Striker|Changer|Trump|Stranger|Shifter|Shaper)(?! [A-Z])/ug,
Expand Down Expand Up @@ -615,7 +620,7 @@ function fixCapitalization(xml, bookTitle) {

// "Mom" and "Dad" should be capitalized when used as a proper name. These regexps are tuned to catch a good amount of
// instances, without over-correcting for non-proper-name-like cases. Many other instances are handled in
// substitutions.json.
// the substitutions file.
xml = xml.replace(/(?<!mom), dad(?![a-z])/ug, ", Dad");
xml = xml.replace(/, mom(?![a-z-])/ug, ", Mom");

Expand All @@ -635,8 +640,8 @@ function fixCapitalization(xml, bookTitle) {
xml = xml.replace(/ Neo-/ug, " neo-");

// Style guides disagree on whether items like "english muffin", "french toast", and "french kiss" need their
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (substitutions.json corrects one
// case of "French toast".)
// adjective capitalized. The books mostly use lowercase, so let's stick with that. (The substitutions file corrects
// one case of "French toast".)
xml = xml.replace(/english(?! muffin)/ug, "English");
xml = xml.replace(/(?<! {2})English muffin/ug, "english muffin");

Expand All @@ -652,7 +657,7 @@ function fixCapitalization(xml, bookTitle) {
// All plural discussions of "Titans" are after Sundown 17.y.
xml = xml.replace(/titans/ug, "Titans");

// Since we can't safely change all instances of "titan", most are in substitutions.json. We can do a few here,
// Since we can't safely change all instances of "titan", most are in the substitutions file. We can do a few here,
// though.
xml = xml.replace(/dauntless titan/uig, "Dauntless Titan"); // Sometimes "Dauntless" isn't even capitalized.
xml = xml.replace(/Kronos titan/ug, "Kronos Titan");
Expand Down Expand Up @@ -725,7 +730,7 @@ function fixHyphens(xml) {
// Preemptive(ly) is often hyphenated (not always). It should not be.
xml = xml.replace(/([Pp])re-emptive/ug, "$1reemptive");

// These should be hyphenated only when used as a verb. We correct those cases back in substitutions.json.
// These should be hyphenated only when used as a verb. We correct those cases back in the substitutions file.
xml = xml.replace(/fist-bump/ug, "fist bump");
xml = xml.replace(/high-five/ug, "high five");

Expand Down Expand Up @@ -762,7 +767,7 @@ function standardizeSpellings(xml) {
xml = xml.replace(/(\b)tv(\b)/ug, "$1TV$2");
xml = xml.replace(/t\.v\./uig, "TV");

// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via substitutions.json when people are
// "okay" is preferred to "ok" or "o.k.". This sometimes gets changed back via the substitutions file when people are
// writing notes and thus probably the intention was to be less formal. Also it seems per
// https://en.wikipedia.org/wiki/A-ok the "A" in "A-okay" should be capitalized.
xml = xml.replace(/Ok([,. ])/ug, "Okay$1");
Expand Down
143 changes: 142 additions & 1 deletion lib/convert.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ module.exports = async (
chapterDataPath,
contentPath,
bookData,
substitutionsPath,
concurrentJobs,
chapterTitleStyle
) => {
Expand All @@ -19,6 +20,9 @@ module.exports = async (
await fs.writeFile(chapterDataPath, JSON.stringify(chapterData, null, 2));
const flattenedChapters = chapterData.flatMap(arc => arc.chapters);

const substitutionsText = await fs.readFile(substitutionsPath, { encoding: "utf-8" });
const substitutions = parseSubstitutions(substitutionsText);

console.log("Converting raw downloaded HTML to EPUB chapters");
const progress = progressUtils.start(flattenedChapters.length);

Expand All @@ -32,8 +36,15 @@ module.exports = async (
await Promise.all(flattenedChapters.map(async chapter => {
const inputPath = path.resolve(cachePath, chapter.inputFilename);
const outputPath = path.resolve(contentPath, chapter.outputFilename);
const chapterSubstitutions = substitutions.get(chapter.url) || [];

warnings.push(...await pool.exec("convertChapter", [chapter, bookData.title, inputPath, outputPath]));
warnings.push(...await pool.exec("convertChapter", [
chapter,
bookData.title,
inputPath,
outputPath,
chapterSubstitutions
]));

progressUtils.increment(progress);
}));
Expand Down Expand Up @@ -91,3 +102,133 @@ function chooseChapterTitle(chapterData, chapterTitleStyle) {

throw new Error(`Invalid chapter title style: ${chapterTitleStyle}`);
}

/**
 * Parses the text of a substitutions file into per-chapter lists of substitutions.
 *
 * The format is line-based. Blank lines are skipped; every other line must begin with one of these sigils:
 *
 *   "@ "    starts a chapter section; the rest of the line must be the chapter's canonicalized URL
 *   "  - "  the literal text to find ("before"); must be completed by a "+" line
 *   "  + "  the text to replace it with ("after"); a bare "  +" gives an empty replacement
 *   "  r "  a regular expression source, compiled with the "u" and "g" flags; must be completed by an "s" line
 *   "  s "  the replacement string for the pending "r" line
 *   "  # "  a comment (only allowed once a chapter section has started)
 *
 * In "-", "+", and "s" lines, the two-character sequence `\n` stands for a newline. In "-" and "+" lines, trailing
 * `\s` sequences stand for trailing spaces (see `beforeAfterLineToString()`).
 *
 * @param {string} text - The full contents of a substitutions file.
 * @returns {Map<string, Array<{before: string, after: string} | {regExp: RegExp, replacement: string}>>}
 *   A map from chapter URL to that chapter's substitutions, in file order.
 * @throws {Error} If any line is malformed, appears out of order, or is left without its completing line.
 */
function parseSubstitutions(text) {
  // Tolerate a leading byte order mark and CRLF line endings, so that the anchored sigil match below does not depend
  // on how the file happened to be saved.
  const lines = text.replace(/^\uFEFF/u, "").split(/\r?\n/u);
  const result = new Map();

  let currentChapter = null;
  let currentBefore = null;
  let currentRegExp = null;
  // 1-based line number of the "-" or "r" line still waiting for its "+" or "s" line; used for end-of-file errors.
  let pendingLineNumber = null;

  for (const [index, line] of lines.entries()) {
    // Skip empty lines
    if (!line.trim()) {
      continue;
    }

    const errorPrefix = `Error in substitutions line "${line}" (line number ${index + 1}): `;

    // Anchored at the start of the line so that stray text or extra indentation before a sigil is rejected instead
    // of being silently skipped over.
    const match = /^(@ | {2}- | {2}\+ ?| {2}r | {2}s | {2}# )(.*)/u.exec(line);
    if (match === null) {
      throw new Error(`${errorPrefix}invalid line format`);
    }
    const [, sigil, content] = match;

    // The case labels must be spelled exactly as the regular expression above captures them, including the two
    // leading spaces; otherwise the line would fall through the switch and be ignored.
    switch (sigil) {
      // New chapter
      case "@ ": {
        if (!isCanonicalizedURL(content)) {
          throw new Error(`${errorPrefix}invalid chapter URL`);
        }
        // A pending "-" or "r" line would otherwise be discarded here without ever being applied.
        if (currentBefore !== null) {
          throw new Error(`${errorPrefix}appeared after a before (-) line`);
        }
        if (currentRegExp !== null) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }

        currentChapter = content;
        if (!result.has(currentChapter)) {
          result.set(currentChapter, []);
        }

        break;
      }

      // Before line
      case "  - ": {
        if (currentChapter === null) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }
        if (currentBefore !== null) {
          throw new Error(`${errorPrefix}appeared after a before (-) line`);
        }
        if (currentRegExp !== null) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }
        if (content === "") {
          throw new Error(`${errorPrefix}before (-) line must not be empty`);
        }

        currentBefore = content.replaceAll("\\n", "\n");
        pendingLineNumber = index + 1;

        break;
      }

      // After line
      case "  +":
      case "  + ": {
        if (currentChapter === null || currentBefore === null) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) or before (-) line`);
        }
        if (currentRegExp !== null) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }

        result.get(currentChapter).push({
          before: beforeAfterLineToString(currentBefore),
          after: beforeAfterLineToString(content)
        });
        currentBefore = null;
        pendingLineNumber = null;

        break;
      }

      // RegExp line
      case "  r ": {
        if (currentChapter === null) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }
        if (currentBefore !== null) {
          throw new Error(`${errorPrefix}appeared after a before (-) line`);
        }
        if (currentRegExp !== null) {
          throw new Error(`${errorPrefix}appeared after a regexp (r) line`);
        }

        try {
          currentRegExp = new RegExp(content, "ug");
        } catch (err) {
          // Re-throw with the offending line attached; the bare SyntaxError does not say where the pattern came from.
          throw new Error(`${errorPrefix}invalid regular expression: ${err.message}`, { cause: err });
        }
        pendingLineNumber = index + 1;

        break;
      }

      // RegExp substitution
      case "  s ": {
        if (currentChapter === null || currentRegExp === null) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) or regexp (r) line`);
        }

        result.get(currentChapter).push({
          regExp: currentRegExp,
          replacement: content.replaceAll("\\n", "\n")
        });
        currentRegExp = null;
        pendingLineNumber = null;

        break;
      }

      // Comment
      case "  # ": {
        if (currentChapter === null) {
          throw new Error(`${errorPrefix}missing previous current chapter (@) line`);
        }

        break;
      }
    }
  }

  // A "-" or "r" line at the end of the file with nothing completing it would otherwise be dropped silently.
  if (currentBefore !== null) {
    throw new Error(`Error in substitutions: before (-) line on line number ${pendingLineNumber} has no after (+) line`);
  }
  if (currentRegExp !== null) {
    throw new Error(
      `Error in substitutions: regexp (r) line on line number ${pendingLineNumber} has no substitution (s) line`
    );
  }

  return result;
}

/**
 * Checks whether a string is a parseable absolute URL that is already written in its serialized form, i.e. parsing
 * it and reading back `href` yields exactly the same string.
 *
 * @param {string} urlString - The candidate URL.
 * @returns {boolean} `true` if the string round-trips unchanged through the URL parser; `false` if it is not a
 *   parseable URL or if serialization would alter it (for example by adding a trailing slash or lowercasing the
 *   scheme).
 */
function isCanonicalizedURL(urlString) {
  // Unparseable input must yield false rather than throw: reading `.href` off the null that URL.parse() returns for
  // such input would raise a TypeError and hide the caller's own "invalid chapter URL" error.
  return URL.canParse(urlString) && new URL(urlString).href === urlString;
}

/**
 * Decodes the escape sequences allowed in before (-) and after (+) lines of a substitutions file.
 *
 * Every two-character `\n` sequence becomes a newline, and each `\s` sequence in the run at the very end of the
 * line becomes one space. `\s` sequences anywhere else are left untouched.
 *
 * @param {string} line - The raw line content, without its sigil.
 * @returns {string} The decoded text.
 */
function beforeAfterLineToString(line) {
  const withNewlines = line.split("\\n").join("\n");

  const trailingEscapes = /(?:\\s)+$/u.exec(withNewlines);
  if (trailingEscapes === null) {
    return withNewlines;
  }

  // Each escape is two characters long (a backslash and an "s") and stands for a single space.
  const spaceCount = trailingEscapes[0].length / 2;
  return withNewlines.slice(0, trailingEscapes.index) + " ".repeat(spaceCount);
}
Loading

0 comments on commit 67ec627

Please sign in to comment.