Merge pull request #81 from harmonydata/h_score

Add H-score as output value
harmonydata · Jan 22, 2025 · 61d3488 · 61d3488
2 parents 8f34b4a + 91d5ebf
commit 61d3488
Show file tree

Hide file tree

Showing 14 changed files with 267 additions and 95 deletions.
diff --git a/src/harmony/matching/default_matcher.py b/src/harmony/matching/default_matcher.py
@@ -33,6 +33,8 @@
 from numpy import ndarray
 from sentence_transformers import SentenceTransformer
 
+from harmony.schemas.responses.text import HarmonyMatchResponse
+
 if (
         os.environ.get("HARMONY_SENTENCE_TRANSFORMER_PATH", None) is not None
         and os.environ.get("HARMONY_SENTENCE_TRANSFORMER_PATH", None) != ""
@@ -76,7 +78,7 @@ def match_instruments(
         mhc_embeddings: np.ndarray = np.zeros((0, 0)),
         texts_cached_vectors: dict[str, List[float]] = {}, batch_size: int = 1000, max_batches: int = 2000,
         is_negate: bool = True
-) -> tuple:
+) -> HarmonyMatchResponse:
     return match_instruments_with_function(
         instruments=instruments,
         query=query,

diff --git a/src/harmony/matching/instrument_to_instrument_similarity.py b/src/harmony/matching/instrument_to_instrument_similarity.py
@@ -0,0 +1,61 @@
+import operator
+
+import numpy as np
+
+from harmony.schemas.responses.text import InstrumentToInstrumentSimilarity
+
+
+def get_precision_recall_f1(item_to_item_similarity_matrix: np.ndarray, min_similarity: float = 0.0) -> tuple:
+    """
+    Score how well the items of two instruments pair up one-to-one.
+
+    Item pairs are picked greedily in descending order of absolute similarity. Each row and each
+    column of the matrix can take part in at most one pair.
+
+    :param item_to_item_similarity_matrix: 2-D matrix of item-to-item similarities (rows are the items
+        of one instrument, columns the items of the other). Only absolute values are used, so the
+        polarity of a match is ignored.
+    :param min_similarity: lowest absolute similarity that still counts as a match. The default of 0
+        accepts every pair whose similarity is not NaN.
+    :return: tuple (precision, recall, f1). Precision is the fraction of columns that were matched,
+        recall is the fraction of rows that were matched, and f1 is the arithmetic mean of the two
+        (note: this is not the harmonic mean that "F1" usually denotes). All three are 0.0 when the
+        matrix has no rows or no columns.
+    """
+    abs_similarities_between_instruments = np.abs(item_to_item_similarity_matrix)
+    n_rows, n_cols = abs_similarities_between_instruments.shape
+
+    # An instrument without items cannot match anything; returning early also avoids dividing by zero.
+    if n_rows == 0 or n_cols == 0:
+        return 0.0, 0.0, 0.0
+
+    coord_to_sim = {
+        (y, x): abs_similarities_between_instruments[y, x]
+        for y in range(n_rows)
+        for x in range(n_cols)
+    }
+
+    is_used_x = set()
+    is_used_y = set()
+    # sorted() is stable, so equally similar pairs keep their row-major order.
+    for (y, x), sim in sorted(coord_to_sim.items(), key=operator.itemgetter(1), reverse=True):
+        # A NaN similarity compares False here, so undefined similarities are never matched.
+        if x not in is_used_x and y not in is_used_y and sim >= min_similarity:
+            is_used_x.add(x)
+            is_used_y.add(y)
+
+    precision = len(is_used_x) / n_cols
+    recall = len(is_used_y) / n_rows
+
+    f1 = np.mean((precision, recall))
+
+    return precision, recall, f1
+
+
+def get_instrument_similarity(instruments, similarity_with_polarity):
+    """
+    Compute precision, recall and F1 for every unordered pair of instruments.
+
+    :param instruments: the instruments; each must expose `questions` and `instrument_name`.
+    :param similarity_with_polarity: item-to-item similarity matrix, sliced here by the cumulative
+        question counts of `instruments` (assumes rows and columns follow the instruments' questions
+        in order -- confirm against the caller).
+    :return: list of InstrumentToInstrumentSimilarity, one per pair (i, j) with i < j.
+    """
+    # Half-open [start, end) range of each instrument's questions within the matrix.
+    spans = []
+    offset = 0
+    for instrument in instruments:
+        next_offset = offset + len(instrument.questions)
+        spans.append((offset, next_offset))
+        offset = next_offset
+
+    pairwise_scores = []
+
+    for first_idx, first_instrument in enumerate(instruments):
+        first_start, first_end = spans[first_idx]
+        for second_idx in range(first_idx + 1, len(instruments)):
+            second_instrument = instruments[second_idx]
+            second_start, second_end = spans[second_idx]
+
+            # Sub-matrix holding first instrument's items (rows) against second instrument's items (columns).
+            pair_matrix = similarity_with_polarity[first_start:first_end, second_start:second_end]
+            precision, recall, f1 = get_precision_recall_f1(pair_matrix)
+
+            pairwise_scores.append(
+                InstrumentToInstrumentSimilarity(
+                    instrument_1_idx=first_idx,
+                    instrument_2_idx=second_idx,
+                    instrument_1_name=first_instrument.instrument_name,
+                    instrument_2_name=second_instrument.instrument_name,
+                    precision=precision,
+                    recall=recall,
+                    f1=f1,
+                )
+            )
+
+    return pairwise_scores
diff --git a/src/harmony/matching/matcher.py b/src/harmony/matching/matcher.py
@@ -24,40 +24,42 @@
 SOFTWARE.
 """
 
-import statistics
 import heapq
+import os
+import statistics
 from collections import Counter, OrderedDict
 from typing import List, Callable
 
 import numpy as np
 from numpy import dot, matmul, ndarray, matrix
 from numpy.linalg import norm
 
+from harmony.matching.instrument_to_instrument_similarity import get_instrument_similarity
 from harmony.matching.negator import negate
 from harmony.schemas.catalogue_instrument import CatalogueInstrument
 from harmony.schemas.catalogue_question import CatalogueQuestion
 from harmony.schemas.requests.text import (
     Instrument,
     Question,
 )
+from harmony.schemas.responses.text import HarmonyMatchResponse
 from harmony.schemas.text_vector import TextVector
 
-import os
-
 
 # This has been tested on 16 GB RAM production server, 1000 seems a safe number (TW, 15 Dec 2024)
-def get_batch_size(default=1000): 
+def get_batch_size(default=1000):
     try:
         batch_size = int(os.getenv("BATCH_SIZE", default))
         return max(batch_size, 0)
     except (ValueError, TypeError):
         return default
+
+
 def process_items_in_batches(items, llm_function):
     batch_size = get_batch_size()
 
     if batch_size == 0:
-         return llm_function(items)
-
+        return llm_function(items)
 
     batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]
 
@@ -156,12 +158,9 @@ def create_full_text_vectors(
     # Texts with no cached vector
     texts_not_cached = [x.text for x in text_vectors if not x.vector]
 
-
-
     # Get vectors for all texts not cached
     new_vectors_list: List = process_items_in_batches(texts_not_cached, vectorisation_function)
 
-
     # Create a dictionary with new vectors
     new_vectors_dict = {}
     for vector, text in zip(new_vectors_list, texts_not_cached):
@@ -577,7 +576,7 @@ def match_instruments_with_function(
         mhc_embeddings: np.ndarray = np.zeros((0, 0)),
         texts_cached_vectors: dict[str, List[float]] = {},
         is_negate: bool = True
-) -> tuple:
+) -> HarmonyMatchResponse:
     """
     Match instruments.
 
@@ -673,9 +672,17 @@ def match_instruments_with_function(
             for question in all_questions:
                 question.topics_auto = []
 
-    return (
-        all_questions,
-        similarity_with_polarity,
-        query_similarity,
-        new_vectors_dict
-    )
+    instrument_to_instrument_similarities = get_instrument_similarity(instruments, similarity_with_polarity)
+
+    return HarmonyMatchResponse(questions=all_questions,
+                                similarity_with_polarity=similarity_with_polarity,
+                                query_similarity=query_similarity,
+                                new_vectors_dict=new_vectors_dict,
+                                instrument_to_instrument_similarities=instrument_to_instrument_similarities)
+    # return (
+    #     all_questions,
+    #     similarity_with_polarity,
+    #     query_similarity,
+    #     new_vectors_dict,
+    #     instrument_to_instrument_similarities
+    # )
diff --git a/src/harmony/schemas/responses/text.py b/src/harmony/schemas/responses/text.py
@@ -25,12 +25,25 @@
 
 '''
 
-from typing import List
+from typing import List, Any
+
+import numpy as np
+from pydantic import BaseModel, Field, RootModel
 
 from harmony.schemas.catalogue_instrument import CatalogueInstrument
 from harmony.schemas.requests.text import Instrument
 from harmony.schemas.requests.text import Question
-from pydantic import BaseModel, Field, RootModel
+
+class InstrumentToInstrumentSimilarity(BaseModel):
+    # Similarity scores for one pair of instruments; the two *_idx fields identify the pair by
+    # position in the list of instruments passed to Harmony.
+    instrument_1_idx: int = Field(
+        description="The index of the first instrument in the similarity pair in the list of instruments passed to Harmony (zero-indexed)")
+    instrument_2_idx: int = Field(
+        description="The index of the second instrument in the similarity pair in the list of instruments passed to Harmony (zero-indexed)")
+    # The two name descriptions previously ended in the truncated word "pai".
+    instrument_1_name: str = Field(description="The name of the first instrument in the similarity pair")
+    instrument_2_name: str = Field(description="The name of the second instrument in the similarity pair")
+    precision: float = Field(description="The precision score of the match between Instrument 1 and Instrument 2")
+    recall: float = Field(description="The recall score of the match between Instrument 1 and Instrument 2")
+    f1: float = Field(description="The F1 score of the match between Instrument 1 and Instrument 2")
 
 
 class MatchResponse(BaseModel):
@@ -47,6 +60,9 @@ class MatchResponse(BaseModel):
         description="The closest catalogue instrument matches in the catalogue for all the instruments, "
                     "the first index contains the best match etc."
     )
+    instrument_to_instrument_similarities: List[InstrumentToInstrumentSimilarity] = Field(
+        None, description="A list of similarity values (precision, recall, F1) between instruments"
+    )
 
 
 class SearchInstrumentsResponse(BaseModel):
@@ -60,3 +76,21 @@ class InstrumentList(RootModel):
 class CacheResponse(BaseModel):
     instruments: List[Instrument] = Field(description="A list of instruments")
     vectors: List[dict] = Field(description="A list of vectors")
+
+
+
+# For internal use in the Python library only, *not* in the API, because the NumPy ndarrays it holds do not serialise
+class HarmonyMatchResponse(BaseModel):
+    # Container for the values returned by the matcher (previously returned as a plain tuple).
+    # Only `questions` and `similarity_with_polarity` are required; the other fields default to None.
+    questions: List[Question] = Field(
+        description="The questions which were matched, in an order matching the order of the matrix"
+    )
+    # Typed as Any, so pydantic applies no validation to the matrix objects stored here.
+    similarity_with_polarity: Any = Field(description="Matrix of cosine similarity matches")
+    query_similarity: Any = Field(
+        None, description="Similarity metric between query string and items"
+    )
+    new_vectors_dict: dict = Field(
+        None, description="Vectors for the cache. These should be stored by the Harmony API to reduce unnecessary calls to the LLM"
+    )
+    # One entry per pair of instruments, carrying precision, recall and F1 for that pair.
+    instrument_to_instrument_similarities: List[InstrumentToInstrumentSimilarity] = Field(
+        None, description="A list of similarity values (precision, recall, F1) between instruments"
+    )
diff --git a/tests/test_batch.py b/tests/test_batch.py
@@ -27,35 +27,34 @@
 
 import sys
 import unittest
+
 import numpy
 
 sys.path.append("../src")
 
 from harmony.matching.default_matcher import convert_texts_to_vector
 
+
 class createModel:
     def encode(self, sentences, convert_to_numpy=True):
         # Generate a dummy embedding with 768 dimensions for each sentence
         return numpy.array([[1] * 768] * len(sentences))
 
 
-
 model = createModel()
 
+
 class TestBatching(unittest.TestCase):
     def test_convert_texts_to_vector_with_batching(self):
         # Create a list of 10 dummy texts
         texts = ["text" + str(i) for i in range(10)]
 
-
         batch_size = 5
         max_batches = 2
         embeddings = convert_texts_to_vector(texts, batch_size=batch_size, max_batches=max_batches)
 
-
         self.assertEqual(embeddings.shape[0], 10)
 
-
         self.assertEqual(embeddings.shape[1], 384)
 
 

diff --git a/tests/test_batching_in_matcher.py b/tests/test_batching_in_matcher.py
@@ -1,11 +1,9 @@
-import sys
 import os
+import sys
 import unittest
-import numpy
 
 sys.path.append("../src")
 from unittest import TestCase, mock
-from harmony.matching.matcher import get_batch_size
 from harmony.matching.matcher import process_items_in_batches
 
 

diff --git a/tests/test_cluster.py b/tests/test_cluster.py
@@ -31,24 +31,26 @@
 sys.path.append("../src")
 
 from harmony.matching.cluster import cluster_questions
-from harmony import create_instrument_from_list, import_instrument_into_harmony_web
 from harmony.schemas.requests.text import Instrument, Question
 
 
 class TestCluster(unittest.TestCase):
     def setUp(self):
-        self. all_questions_real = [Question(question_no="1", question_text="Feeling nervous, anxious, or on edge"),
-                              Question(question_no="2", question_text="Not being able to stop or control worrying"),
-                              Question(question_no="3", question_text="Little interest or pleasure in doing things"),
-                              Question(question_no="4", question_text="Feeling down, depressed, or hopeless"),
-                              Question(question_no="5",
-                                       question_text="Trouble falling/staying asleep, sleeping too much"), ]
+        self.all_questions_real = [Question(question_no="1", question_text="Feeling nervous, anxious, or on edge"),
+                                   Question(question_no="2",
+                                            question_text="Not being able to stop or control worrying"),
+                                   Question(question_no="3",
+                                            question_text="Little interest or pleasure in doing things"),
+                                   Question(question_no="4", question_text="Feeling down, depressed, or hopeless"),
+                                   Question(question_no="5",
+                                            question_text="Trouble falling/staying asleep, sleeping too much"), ]
         self.instruments = Instrument(questions=self.all_questions_real)
 
     def test_cluster(self):
         clusters_out, score_out = cluster_questions(self.instruments, 2, False)
-        assert(len(clusters_out) == 5)
+        assert (len(clusters_out) == 5)
         assert score_out
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_convert_text.py b/tests/test_convert_text.py
@@ -27,8 +27,6 @@
 
 import sys
 import unittest
-from harmony.parsing.text_parser import convert_text_to_instruments
-from harmony.schemas.requests.text import RawFile, FileType
 
 sys.path.append("../src")
 

diff --git a/tests/test_crosswalk.py b/tests/test_crosswalk.py
@@ -83,8 +83,8 @@ def test_generate_crosswalk_table_empty(self):
         self.assertTrue(result.empty)
 
     def test_generate_crosswalk_table_real(self):
-        all_questions, similarity_with_polarity, _, _ = match_instruments(self.instruments)
-        result = generate_crosswalk_table(self.instruments, similarity_with_polarity, self.threshold,
+        match_response = match_instruments(self.instruments)
+        result = generate_crosswalk_table(self.instruments, match_response.similarity_with_polarity, self.threshold,
                                           is_allow_within_instrument_matches=True)
         expected_matches = []
 
@@ -94,7 +94,7 @@ def test_generate_crosswalk_table_real(self):
         self.assertEqual(len(result), len(expected_matches))
 
         lower_threshold = 0.5
-        result = generate_crosswalk_table(self.instruments, similarity_with_polarity, lower_threshold,
+        result = generate_crosswalk_table(self.instruments, match_response.similarity_with_polarity, lower_threshold,
                                           is_allow_within_instrument_matches=True)
 
         self.assertEqual(len(result), 1)
@@ -106,8 +106,9 @@ def test_crosswalk_two_instruments_allow_many_to_one_matches(self):
             ["Feeling afraid, as if something awful might happen", "Feeling nervous, anxious, or on edge"])
         instruments = [instrument_1, instrument_2]
 
-        all_questions, similarity_with_polarity, _, _ = match_instruments(instruments)
-        result = generate_crosswalk_table(instruments, similarity_with_polarity, 0, is_enforce_one_to_one=False)
+        match_response = match_instruments(instruments)
+        result = generate_crosswalk_table(instruments, match_response.similarity_with_polarity, 0,
+                                          is_enforce_one_to_one=False)
 
         self.assertEqual(2, len(result))
 
@@ -118,8 +119,9 @@ def test_crosswalk_two_instruments_enforce_one_to_one_matches(self):
             ["Feeling afraid, as if something awful might happen", "Feeling nervous, anxious, or on edge"])
         instruments = [instrument_1, instrument_2]
 
-        all_questions, similarity_with_polarity, _, _ = match_instruments(instruments)
-        result = generate_crosswalk_table(instruments, similarity_with_polarity, 0, is_enforce_one_to_one=True)
+        match_response = match_instruments(instruments)
+        result = generate_crosswalk_table(instruments, match_response.similarity_with_polarity, 0,
+                                          is_enforce_one_to_one=True)
 
         self.assertEqual(1, len(result))