Do not treat fullwidth Latin and symbols as unbroken script

Until now, all Unicode codepoints in the Halfwidth and Fullwidth Forms block (U+FF00..U+FFEF) were treated as unbroken script. As a result, terms consisting of fullwidth Latin characters in this range were not lowercased before indexing, so queries failed to find such text. This patch changes the word-breaker to consider only halfwidth Katakana and Hangul characters (U+FF65..U+FFDC) as unbroken script, and to handle all fullwidth Latin characters, numbers and symbols in this block as broken script.
cyrusimap · Jan 8, 2024 · bc5bd91 · bc5bd91
1 parent df2ed66
commit bc5bd91
Show file tree

Hide file tree

Showing 3 changed files with 17 additions and 2 deletions.
diff --git a/xapian-core/queryparser/word-breaker.cc b/xapian-core/queryparser/word-breaker.cc
@@ -102,8 +102,10 @@ is_unbroken_script(unsigned p)
 	0xF900 - 1, 0xFAFF,
 	// FE30..FE4F; CJK Compatibility Forms
 	0xFE30 - 1, 0xFE4F,
-	// FF00..FFEF; Halfwidth and Fullwidth Forms
-	0xFF00 - 1, 0xFFEF,
+	// FF00..FF60: Fullwidth Numbers, Latin Characters, Punctuation
+	// FF61..FF64: Halfwidth Punctuation
+	0xFF65 - 1, 0xFFDC, // Halfwidth Katakana and Hangul
+	// FFE0..FFEF; Fullwidth and Halfwidth Symbols
 	// 1AFF0..1AFFF; Kana Extended-B
 	// 1B000..1B0FF; Kana Supplement
 	// 1B100..1B12F; Kana Extended-A

diff --git a/xapian-core/tests/api_queryparser.cc b/xapian-core/tests/api_queryparser.cc
@@ -758,6 +758,11 @@ static const test test_or_queries[] = {
     // Test Khmer (added in 1.5.0).
     { "\"សៀវភៅនេះថ្លៃណាស់ \"", "(សៀវភៅ@1 PHRASE 4 នេះ@1 PHRASE 4 ថ្លៃ@1 PHRASE 4 ណាស់@1)" },
 
+    // Test fullwidth Latin
+    { "\"ｈｅｌｌｏ ，ｗｏｒｌｄ！\"", "(ｈｅｌｌｏ@1 PHRASE 2 ｗｏｒｌｄ@2)" },
+    { "ＵＦＪ", "ｕｆｊ@1" },
+    { "\"三菱ＵＦＪファクター\"", "(三菱@1 PHRASE 3 ｕｆｊ@2 PHRASE 3 ファクター@3)" },
+
     { "\"久有归天愿\"", "(久@1 PHRASE 4 有@1 PHRASE 4 归天@1 PHRASE 4 愿@1)" },
     { "\"久有test归天\"", "(久@1 PHRASE 4 有@1 PHRASE 4 test@2 PHRASE 4 归天@3)" },
     { "\"归天\"", "归天@1" },

diff --git a/xapian-core/tests/api_termgen.cc b/xapian-core/tests/api_termgen.cc
@@ -176,6 +176,14 @@ static const test test_simple[] = {
     { "prefix=", "インtestタ", "test[2] イン[1] タ[3]" },
     { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
 
+    // Mixed fullwidth script:
+    { "", "三菱ＵＦＪファクター株式会社", "ファクター[3] 三菱[1] 株式会社[4] ｕｆｊ[2]" },
+
+    // Fullwidth vs. halfwidth script:
+    { "", "シーサイドライナー", "シーサイド[1] ライナー[2]" },
+    { "", "ｼｰｻｲﾄﾞﾗｲﾅｰ", "ｼｰｻｲﾄﾞ[1] ﾗｲﾅｰ[2]" },
+    { "", "ｈｅｌｌｏ ，ｗｏｒｌｄ！", "ｈｅｌｌｏ[1] ｗｏｒｌｄ[2]" },
+
     // Test non-word characters in a script without explicit word breaks.
     //
     // The text here contains U+FF01 FULLWIDTH EXCLAMATION MARK which is both a