Skip to content

Commit

Permalink
feat: enhance FlexSearch encoding for CJK support (#553)
Browse files Browse the repository at this point in the history
- Added support for CJK (Chinese, Japanese, Korean) languages in FlexSearch encoding.
- Introduced `isCJK` function to detect language and select appropriate encoding method.
- Implemented `encodeCJK` and `encodeDefault` functions for different tokenization strategies.
  • Loading branch information
imfing authored Jan 18, 2025
1 parent a1232ec commit 14036ff
Showing 1 changed file with 12 additions and 0 deletions.
12 changes: 12 additions & 0 deletions assets/js/flexsearch.js
Original file line number Diff line number Diff line change
Expand Up @@ -195,8 +195,19 @@ document.addEventListener("DOMContentLoaded", function () {
*/
async function preloadIndex() {
const tokenize = '{{- site.Params.search.flexsearch.tokenize | default "forward" -}}';

/**
 * Report whether the page language is Chinese, Japanese or Korean.
 *
 * Reads the `lang` attribute of the root `<html>` element and falls back to
 * "en" when it is missing or empty.
 *
 * @returns {boolean} true when the primary language subtag is zh, ja or ko.
 */
const isCJK = () => {
  const lang = document.documentElement.lang || "en";
  // Match the primary subtag as a whole and ignore case: language tags are
  // case-insensitive ("ZH-CN" is valid), and a bare prefix test would also
  // accept unrelated languages such as "kok" (Konkani) or "jam".
  return /^(?:zh|ja|ko)(?:[-_]|$)/i.test(lang);
};

/**
 * Encoder for CJK pages: drop every ASCII character and emit one token per
 * remaining character.
 *
 * @param {*} str - Text to encode; coerced to a string first.
 * @returns {string[]} One token per non-ASCII code point.
 */
const encodeCJK = (str) =>
  // Spread iterates by code point, so ideographs outside the Basic
  // Multilingual Plane (e.g. "𠮷") stay whole instead of being cut into two
  // unmatched surrogate halves as `.split("")` would do.
  [...("" + str).replace(/[\x00-\x7F]/g, "")];
/**
 * Encoder for non-CJK pages: lower-case the text and split it into words on
 * any run of Unicode separators, symbols, punctuation or control characters.
 *
 * @param {*} str - Text to encode; coerced to a string first.
 * @returns {string[]} The non-empty word tokens, in order.
 */
const encodeDefault = (str) =>
  ("" + str)
    .toLocaleLowerCase()
    .split(/[\p{Z}\p{S}\p{P}\p{C}]+/u)
    // split() yields "" when the text is empty or starts/ends with a
    // separator; an empty token carries nothing to index or search for.
    .filter(Boolean);
// Choose the encoder once from the page language: the per-character CJK
// encoder when isCJK() is true, otherwise the word-splitting default.
const encodeFunction = isCJK() ? encodeCJK : encodeDefault;

window.pageIndex = new FlexSearch.Document({
tokenize,
encode: encodeFunction,
cache: 100,
document: {
id: 'id',
Expand All @@ -207,6 +218,7 @@ document.addEventListener("DOMContentLoaded", function () {

window.sectionIndex = new FlexSearch.Document({
tokenize,
encode: encodeFunction,
cache: 100,
document: {
id: 'id',
Expand Down

0 comments on commit 14036ff

Please sign in to comment.