From bc5bd910851fae3a5213ecab5e635288162c5dd4 Mon Sep 17 00:00:00 2001 From: Robert Stepanek Date: Mon, 8 Jan 2024 13:47:32 +0100 Subject: [PATCH] Do not treat fullwidth Latin and symbols as unbroken script Up until now, all Unicode codepoints in the Halfwidth and Fullwidth Forms Block (U+FF00..U+FFEF) were treated as unbroken script. This causes terms that consist of fullwidth Latin characters in this range not to be lowercased before indexing, resulting in queries not finding such text. This patch changes word-breaker to only consider halfwidth Katakana and Hangul characters as unbroken script, handling all fullwidth Latin characters, numbers and symbols in this block as broken script. --- xapian-core/queryparser/word-breaker.cc | 6 ++++-- xapian-core/tests/api_queryparser.cc | 5 +++++ xapian-core/tests/api_termgen.cc | 8 ++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) diff --git a/xapian-core/queryparser/word-breaker.cc b/xapian-core/queryparser/word-breaker.cc index 8108523ccd5..6122dcdccc9 100644 --- a/xapian-core/queryparser/word-breaker.cc +++ b/xapian-core/queryparser/word-breaker.cc @@ -102,8 +102,10 @@ is_unbroken_script(unsigned p) 0xF900 - 1, 0xFAFF, // FE30..FE4F; CJK Compatibility Forms 0xFE30 - 1, 0xFE4F, - // FF00..FFEF; Halfwidth and Fullwidth Forms - 0xFF00 - 1, 0xFFEF, + // FF00..FF60: Fullwidth Numbers, Latin Characters, Punctuation + // FF61..FF64: Halfwidth Punctuation + 0xFF65 - 1, 0xFFDC, // Halfwidth Katakana and Hangul + // FFE0..FFEF; Fullwidth and Halfwidth Symbols // 1AFF0..1AFFF; Kana Extended-B // 1B000..1B0FF; Kana Supplement // 1B100..1B12F; Kana Extended-A diff --git a/xapian-core/tests/api_queryparser.cc b/xapian-core/tests/api_queryparser.cc index 4e0af3ffeee..a2511cfb2a7 100644 --- a/xapian-core/tests/api_queryparser.cc +++ b/xapian-core/tests/api_queryparser.cc @@ -758,6 +758,11 @@ static const test test_or_queries[] = { // Test Khmer (added in 1.5.0). 
{ "\"សៀវភៅនេះថ្លៃណាស់ \"", "(សៀវភៅ@1 PHRASE 4 នេះ@1 PHRASE 4 ថ្លៃ@1 PHRASE 4 ណាស់@1)" }, + // Test fullwidth Latin + { "\"hello ,world!\"", "(hello@1 PHRASE 2 world@2)" }, + { "UFJ", "ufj@1" }, + { "\"三菱UFJファクター\"", "(三菱@1 PHRASE 3 ufj@2 PHRASE 3 ファクター@3)" }, + { "\"久有归天愿\"", "(久@1 PHRASE 4 有@1 PHRASE 4 归天@1 PHRASE 4 愿@1)" }, { "\"久有test归天\"", "(久@1 PHRASE 4 有@1 PHRASE 4 test@2 PHRASE 4 归天@3)" }, { "\"归天\"", "归天@1" }, diff --git a/xapian-core/tests/api_termgen.cc b/xapian-core/tests/api_termgen.cc index f9dadf1ff6e..075ddf5f739 100644 --- a/xapian-core/tests/api_termgen.cc +++ b/xapian-core/tests/api_termgen.cc @@ -176,6 +176,14 @@ static const test test_simple[] = { { "prefix=", "インtestタ", "test[2] イン[1] タ[3]" }, { "", "配this is合a个 test!", "a[5] is[3] test[7] this[2] 个[6] 合[4] 配[1]" }, + // Mixed fullwidth script: + { "", "三菱UFJファクター株式会社", "ファクター[3] 三菱[1] 株式会社[4] ufj[2]" }, + + // Fullwidth vs. halfwidth script: + { "", "シーサイドライナー", "シーサイド[1] ライナー[2]" }, + { "", "シーサイドライナー", "シーサイド[1] ライナー[2]" }, + { "", "hello ,world!", "hello[1] world[2]" }, + // Test non-word characters in a script without explicit word breaks. // // The text here contains U+FF01 FULLWIDTH EXCLAMATION MARK which is both a