forked from WorldBrain/Memex
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtransform-page-text.js
120 lines (92 loc) · 4.2 KB
/
transform-page-text.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import urlRegex from 'url-regex' // Check https://mathiasbynens.be/demo/url-regex for results RE: this pattern
import sw from 'remove-stopwords'
import rmDiacritics from './remove-diacritics'
import { DEFAULT_TERM_SEPARATOR } from 'src/search/util'
// Same pattern source as DEFAULT_TERM_SEPARATOR, rebuilt with the global (g) and unicode (u) flags
const termSeparator = new RegExp(DEFAULT_TERM_SEPARATOR.source, 'gu')
// Any run of one or more whitespace characters
const allWhitespacesPattern = /\s+/g
// Punctuation/symbol characters to blank out: the \u2000-\u206F and \u2E00-\u2E7F ranges
// plus an explicit list of ASCII and other individual symbols
const nonWordsPattern = /[\u2000-\u206F\u2E00-\u2E7F\\!"#$%&()*+,./:;<=>?[\]^_`{|}~«»。()ㅇ©ºø°]/gi
// Straight (') and curly (’) apostrophes
const apostrophePattern = /['’]/g
// Two runs of non-whitespace joined by a dash, e.g. "chevron-right"
const wantedDashPattern = /(\S+)-(\S+)/g
// A dash with whitespace on both sides, i.e. one that does not connect two words
const unwantedDashPattern = /\s+-\s+/g
// Standalone runs of 30 or more word characters
const longWords = /\b\w{30,}\b/gi
// Standalone digit runs of length 1-3 or 5+; 4-digit numbers do not match
// (presumably so that years survive — TODO confirm intent)
const randomDigits = /\b(\d{1,3}|\d{5,})\b/gi
// URL-matching pattern as returned by the `url-regex` package
const urlPattern = urlRegex()
/**
 * Replaces every match of `urlPattern` in the text with a single space.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text with URL matches blanked out.
 */
const removeUrls = (text = '') => {
    const withoutUrls = text.replace(urlPattern, ' ')
    return withoutUrls
}
/**
 * Replaces every character matched by `nonWordsPattern` with a single space.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text with matched punctuation/symbols blanked out.
 */
const removePunctuation = (text = '') => {
    const withoutPunctuation = text.replace(nonWordsPattern, ' ')
    return withoutPunctuation
}
/**
 * Collapses every run of whitespace into a single space, then trims both ends.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text with normalised whitespace.
 */
const cleanupWhitespaces = (text = '') => {
    const collapsed = text.replace(allWhitespacesPattern, ' ')
    return collapsed.trim()
}
/**
 * Drops repeated words from a string. The text is split on `termSeparator`,
 * de-duplicated through a `Set` (so the first occurrence of each word keeps
 * its position), and re-joined with single spaces.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} Version of `text` without duplicate words.
 */
export const removeDupeWords = (text = '') => {
    const uniqueWords = new Set(text.split(termSeparator))
    return Array.from(uniqueWords).join(' ')
}
/**
 * Splits the text on `termSeparator`, hands the resulting word list to
 * `sw.removeStopwords` together with `lang`, and re-joins what comes back
 * with single spaces.
 *
 * @param {string} [text=''] Input string.
 * @param {*} lang Forwarded untouched as the second argument of `sw.removeStopwords`.
 * @returns {string} The remaining words joined by spaces.
 */
const removeUselessWords = (text = '', lang) =>
    sw.removeStopwords(text.split(termSeparator), lang).join(' ')
/**
 * Deletes every character matched by `apostrophePattern`, which fuses the
 * parts of a word on either side of it together.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text without the matched characters.
 */
const combinePunctuation = (text = '') => {
    const fused = text.replace(apostrophePattern, '')
    return fused
}
/**
 * Extracts the individual words from any words-connected-by-dashes and appends
 * them to the end of the text, keeping the original dashed words in place.
 * Dashes that have whitespace on both sides (and so connect nothing) are
 * replaced by a single space.
 *
 * Example: "chevron-right" => "chevron-right chevron right"
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text plus the derived words, without free-standing dashes.
 */
const splitDashes = (text = '') => {
    const dashedWords = text.match(wantedDashPattern)

    // No dash-connected words at all: only the free-standing dashes need handling
    if (dashedWords == null) {
        return text.replace(unwantedDashPattern, ' ')
    }

    // Collect the pieces of every dashed word into one flat list. Pushing into a
    // single array keeps this linear in the number of pieces, where folding with
    // array spreads would re-copy the accumulator on every match.
    const derivedWords = []
    for (const dashedWord of dashedWords) {
        for (const piece of dashedWord.split('-')) {
            derivedWords.push(piece)
        }
    }

    // Ensure to remove any other dash/hyphens in the text that don't connect words (have spaces around)
    return `${text} ${derivedWords.join(' ')}`.replace(unwantedDashPattern, ' ')
}
/**
 * Thin wrapper around the imported `rmDiacritics` helper that defaults a
 * missing argument to the empty string before delegating.
 *
 * @param {string} [text=''] Input string.
 * @returns {*} Whatever `rmDiacritics` returns for `text`.
 */
const removeDiacritics = (text = '') => rmDiacritics(text)
/**
 * Replaces every match of the `randomDigits` pattern with a single space.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text with the matched digit runs blanked out.
 */
const removeRandomDigits = (text = '') => {
    const withoutDigits = text.replace(randomDigits, ' ')
    return withoutDigits
}
/**
 * Replaces every match of the `longWords` pattern with a single space.
 *
 * @param {string} [text=''] Input string.
 * @returns {string} The text with the matched words blanked out.
 */
const removeLongWords = (text = '') => {
    const withoutLongWords = text.replace(longWords, ' ')
    return withoutLongWords
}
/**
 * Takes in some text content and strips it of unneeded data. Currently handles
 * URLs, punctuation, diacritics, duplicate words, digit runs, stop-words,
 * over-long words and whitespace.
 * TODO: pass in options to disable certain functionality.
 *
 * @param {Object} [content={}] Content to transform.
 * @param {string} [content.text=''] The raw text.
 * @param {string} [content.lang='en'] Language code; used for locale-aware
 *  lower-casing and forwarded to the stop-word removal step.
 * @returns {{text: string, lenBefore: number, lenAfter: number}} Object
 *  containing the transformed `text` + less important `lenBefore`, `lenAfter`
 *  stats. For empty or whitespace-only input the text is returned untouched
 *  with both stats set to 0.
 */
export default function transform({ text = '', lang = 'en' } = {}) {
    // Short circuit if no text
    if (!text.trim().length) {
        return { text, lenAfter: 0, lenBefore: 0 }
    }

    // Each step takes the current text and returns the next version of it
    const steps = [
        // Remove URLs first before we start messing with things
        removeUrls,
        // Removes ' from words effectively combining them
        // Example O'Grady => OGrady
        combinePunctuation,
        // Adds the parts of words joined by - as separate words
        // Example "chevron-right" => "chevron-right chevron right"
        splitDashes,
        // Changes accented characters to regular letters
        removeDiacritics,
        removePunctuation,
        removeDupeWords,
        // Removes standalone numbers that are 1-3 or 5+ digits long
        removeRandomDigits,
        // Removes 'stopwords' such as they'll, don't, however etc.
        (current) => removeUselessWords(current, lang),
        // Removes all words 30+ characters long
        removeLongWords,
        // We don't care about non-single-space whitespace (' ' is cool).
        // Must stay last: the steps above replace what they remove with spaces.
        cleanupWhitespaces,
    ]

    const searchableText = steps.reduce(
        (current, step) => step(current),
        text.toLocaleLowerCase(lang)
    )

    return {
        text: searchableText,
        lenBefore: text.length,
        lenAfter: searchableText.length,
    }
}