From bc5bd910851fae3a5213ecab5e635288162c5dd4 Mon Sep 17 00:00:00 2001
From: Robert Stepanek <rsto@fastmailteam.com>
Date: Mon, 8 Jan 2024 13:47:32 +0100
Subject: [PATCH] Do not treat fullwidth Latin and symbols as unbroken script

Up until now all Unicode codepoints in the Halfwidth and
Fullwidth Forms Block (U+FF00..U+FFEF) were treated as
unbroken script. As a result, terms consisting of fullwidth
Latin characters in this range were not lowercased before
indexing, so queries failed to find such text.

This patch changes the word breaker to consider only halfwidth
Katakana and Hangul characters as unbroken script, handling
all fullwidth Latin characters, numbers and symbols in this
block as broken script.
---
 xapian-core/queryparser/word-breaker.cc | 6 ++++--
 xapian-core/tests/api_queryparser.cc    | 5 +++++
 xapian-core/tests/api_termgen.cc        | 8 ++++++++
 3 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/xapian-core/queryparser/word-breaker.cc b/xapian-core/queryparser/word-breaker.cc
index 8108523ccd5..6122dcdccc9 100644
--- a/xapian-core/queryparser/word-breaker.cc
+++ b/xapian-core/queryparser/word-breaker.cc
@@ -102,8 +102,10 @@ is_unbroken_script(unsigned p)
 	0xF900 - 1, 0xFAFF,
 	// FE30..FE4F; CJK Compatibility Forms
 	0xFE30 - 1, 0xFE4F,
-	// FF00..FFEF; Halfwidth and Fullwidth Forms
-	0xFF00 - 1, 0xFFEF,
+	// FF00..FF60: Fullwidth Numbers, Latin Characters, Punctuation
+	// FF61..FF64: Halfwidth Punctuation
+	0xFF65 - 1, 0xFFDC, // Halfwidth Katakana and Hangul
+	// FFE0..FFEF; Fullwidth and Halfwidth Symbols
 	// 1AFF0..1AFFF; Kana Extended-B
 	// 1B000..1B0FF; Kana Supplement
 	// 1B100..1B12F; Kana Extended-A
diff --git a/xapian-core/tests/api_queryparser.cc b/xapian-core/tests/api_queryparser.cc
index 4e0af3ffeee..a2511cfb2a7 100644
--- a/xapian-core/tests/api_queryparser.cc
+++ b/xapian-core/tests/api_queryparser.cc
@@ -758,6 +758,11 @@ static const test test_or_queries[] = {
     // Test Khmer (added in 1.5.0).
     { "\"សៀវភៅនេះថ្លៃណាស់ \"", "(សៀវភៅ@1 PHRASE 4 នេះ@1 PHRASE 4 ថ្លៃ@1 PHRASE 4 ណាស់@1)" },
 
+    // Test fullwidth Latin
+    { "\"ｈｅｌｌｏ ，ｗｏｒｌｄ！\"", "(ｈｅｌｌｏ@1 PHRASE 2 ｗｏｒｌｄ@2)" },
+    { "ＵＦＪ", "ｕｆｊ@1" },
+    { "\"三菱ＵＦＪファクター\"", "(三菱@1 PHRASE 3 ｕｆｊ@2 PHRASE 3 ファクター@3)" },
+
     { "\"久有归天愿\"", "(久@1 PHRASE 4 有@1 PHRASE 4 归天@1 PHRASE 4 愿@1)" },
     { "\"久有test归天\"", "(久@1 PHRASE 4 有@1 PHRASE 4 test@2 PHRASE 4 归天@3)" },
     { "\"归天\"", "归天@1" },
diff --git a/xapian-core/tests/api_termgen.cc b/xapian-core/tests/api_termgen.cc
index f9dadf1ff6e..075ddf5f739 100644
--- a/xapian-core/tests/api_termgen.cc
+++ b/xapian-core/tests/api_termgen.cc
@@ -176,6 +176,14 @@ static const test test_simple[] = {
     { "prefix=", "インtestタ", "test[2] イン[1] タ[3]" },
     { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" },
 
+    // Mixed fullwidth script:
+    { "", "三菱ＵＦＪファクター株式会社", "ファクター[3] 三菱[1] 株式会社[4] ｕｆｊ[2]" },
+
+    // Fullwidth vs. halfwidth script:
+    { "", "シーサイドライナー", "シーサイド[1] ライナー[2]" },
+    { "", "ｼｰｻｲﾄﾞﾗｲﾅｰ", "ｼｰｻｲﾄﾞ[1] ﾗｲﾅｰ[2]" },
+    { "", "ｈｅｌｌｏ ，ｗｏｒｌｄ！", "ｈｅｌｌｏ[1] ｗｏｒｌｄ[2]" },
+
     // Test non-word characters in a script without explicit word breaks.
     //
     // The text here contains U+FF01 FULLWIDTH EXCLAMATION MARK which is both a