From 3dea72d90f16626824493fa0e5f9794e09bf8c6f Mon Sep 17 00:00:00 2001 From: Marcus Aspin Date: Mon, 3 Feb 2025 13:05:24 +0000 Subject: [PATCH] PI-2526 Remove repeated punctuation strings to prevent model errors due to "too many tokens": ``` Input validation error: `inputs` must have less than 512 tokens Given: 566 ``` caused by strings such as: ``` --------------------------------------------------------- Comment added by name on 01/02/2023 at 12:34 Report Edited: 01/02/2023 at 12:34 --------------------------------------------------------- Comment added by name on 01/02/2023 at 12:34 Report Edited: 01/02/2023 at 12:34 ... ``` --- .../pipelines/contact/index/ingest-pipeline.tpl.json | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/projects/person-search-index-from-delius/container/pipelines/contact/index/ingest-pipeline.tpl.json b/projects/person-search-index-from-delius/container/pipelines/contact/index/ingest-pipeline.tpl.json index 782196fb4..2bd853343 100644 --- a/projects/person-search-index-from-delius/container/pipelines/contact/index/ingest-pipeline.tpl.json +++ b/projects/person-search-index-from-delius/container/pipelines/contact/index/ingest-pipeline.tpl.json @@ -1,6 +1,14 @@ { "description": "Split text into chunks and generate embeddings", "processors": [ + { + "gsub": { + "tag": "Remove any repeated non-alphanumeric strings. The pattern looks for 2 or more non-alphanumeric characters surrounded by whitespace.", + "field": "notes", + "pattern": "(^|\\s)([^\\w\\s]{2,}(\\s|$))+", + "replacement": " " + } + }, { "text_chunking": { "algorithm": {