Skip to content

Commit

Permalink
Merge pull request #81 from harmonydata/h_score
Browse files Browse the repository at this point in the history
Add H-score as output value
  • Loading branch information
woodthom2 authored Jan 22, 2025
2 parents 8f34b4a + 91d5ebf commit 61d3488
Show file tree
Hide file tree
Showing 14 changed files with 267 additions and 95 deletions.
4 changes: 3 additions & 1 deletion src/harmony/matching/default_matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,8 @@
from numpy import ndarray
from sentence_transformers import SentenceTransformer

from harmony.schemas.responses.text import HarmonyMatchResponse

if (
os.environ.get("HARMONY_SENTENCE_TRANSFORMER_PATH", None) is not None
and os.environ.get("HARMONY_SENTENCE_TRANSFORMER_PATH", None) != ""
Expand Down Expand Up @@ -76,7 +78,7 @@ def match_instruments(
mhc_embeddings: np.ndarray = np.zeros((0, 0)),
texts_cached_vectors: dict[str, List[float]] = {}, batch_size: int = 1000, max_batches: int = 2000,
is_negate: bool = True
) -> tuple:
) -> HarmonyMatchResponse:
return match_instruments_with_function(
instruments=instruments,
query=query,
Expand Down
61 changes: 61 additions & 0 deletions src/harmony/matching/instrument_to_instrument_similarity.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import operator

import numpy as np

from harmony.schemas.responses.text import InstrumentToInstrumentSimilarity


def get_precision_recall_f1(item_to_item_similarity_matrix: np.ndarray) -> tuple:
    """
    Score how completely the rows and columns of a similarity matrix can be paired one-to-one.

    Cells of the absolute-valued matrix are visited from strongest to weakest; a cell is accepted
    as a pairing when neither its row nor its column has been paired already (greedy matching).

    :param item_to_item_similarity_matrix: 2-D matrix of item-to-item similarities. The sign of
        each value is ignored (only the magnitude is used).
    :return: A tuple (precision, recall, f1) where precision is the fraction of columns that were
        paired, recall is the fraction of rows that were paired, and f1 is the arithmetic mean of
        the two. NOTE: this is not the harmonic mean of the conventional F1 score; the arithmetic
        mean is kept so existing outputs do not change. If either dimension of the matrix is
        empty, (0.0, 0.0, 0.0) is returned instead of dividing by zero.
    """
    abs_similarities = np.abs(item_to_item_similarity_matrix)
    n_rows, n_cols = abs_similarities.shape[0], abs_similarities.shape[1]

    # Nothing can be paired when one side has no items; avoid a ZeroDivisionError below.
    if n_rows == 0 or n_cols == 0:
        return 0.0, 0.0, 0.0

    # Row-major enumeration plus a stable sort keeps tie-breaking deterministic:
    # among equally strong cells, the one with the lowest (row, column) wins.
    cells_strongest_first = sorted(
        np.ndindex(n_rows, n_cols),
        key=lambda cell: abs_similarities[cell],
        reverse=True,
    )

    used_rows = set()
    used_cols = set()
    for row, col in cells_strongest_first:
        if row in used_rows or col in used_cols:
            continue
        # The comparison is False only for NaN, so NaN cells never form a pairing.
        if abs_similarities[row, col] >= 0:
            used_rows.add(row)
            used_cols.add(col)

    precision = len(used_cols) / n_cols
    recall = len(used_rows) / n_rows

    f1 = np.mean((precision, recall))

    return precision, recall, f1


def get_instrument_similarity(instruments, similarity_with_polarity):
    """
    Compute precision/recall/F1 similarity scores for every unordered pair of instruments.

    :param instruments: Sequence of instruments; each must expose `questions` and `instrument_name`.
    :param similarity_with_polarity: 2-D item-to-item similarity matrix, sliced here by cumulative
        question counts. Assumes its rows and columns list the questions of all instruments
        back-to-back in the order of `instruments` - TODO confirm against the caller.
    :return: A list of InstrumentToInstrumentSimilarity, one per pair (i, j) with i < j.
    """
    # Half-open [start, end) span occupied by each instrument's questions in the matrix.
    question_spans = []
    offset = 0
    for instrument in instruments:
        next_offset = offset + len(instrument.questions)
        question_spans.append((offset, next_offset))
        offset = next_offset

    pairwise_similarities = []

    for first_idx, first_instrument in enumerate(instruments):
        first_start, first_end = question_spans[first_idx]
        for second_idx in range(first_idx + 1, len(instruments)):
            second_instrument = instruments[second_idx]
            second_start, second_end = question_spans[second_idx]

            # Rows: questions of the first instrument; columns: questions of the second.
            pair_matrix = similarity_with_polarity[first_start:first_end, second_start:second_end]

            precision, recall, f1 = get_precision_recall_f1(pair_matrix)

            pairwise_similarities.append(
                InstrumentToInstrumentSimilarity(
                    instrument_1_idx=first_idx,
                    instrument_2_idx=second_idx,
                    instrument_1_name=first_instrument.instrument_name,
                    instrument_2_name=second_instrument.instrument_name,
                    precision=precision,
                    recall=recall,
                    f1=f1,
                )
            )

    return pairwise_similarities
39 changes: 23 additions & 16 deletions src/harmony/matching/matcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,40 +24,42 @@
SOFTWARE.
"""

import statistics
import heapq
import os
import statistics
from collections import Counter, OrderedDict
from typing import List, Callable

import numpy as np
from numpy import dot, matmul, ndarray, matrix
from numpy.linalg import norm

from harmony.matching.instrument_to_instrument_similarity import get_instrument_similarity
from harmony.matching.negator import negate
from harmony.schemas.catalogue_instrument import CatalogueInstrument
from harmony.schemas.catalogue_question import CatalogueQuestion
from harmony.schemas.requests.text import (
Instrument,
Question,
)
from harmony.schemas.responses.text import HarmonyMatchResponse
from harmony.schemas.text_vector import TextVector

import os


# This has been tested on 16 GB RAM production server, 1000 seems a safe number (TW, 15 Dec 2024)
def get_batch_size(default=1000):
    """
    Read the batch size from the BATCH_SIZE environment variable.

    :param default: Value used when BATCH_SIZE is unset, and returned as-is when the configured
        value cannot be converted to an integer.
    :return: The configured batch size, with negative values clamped to 0.
    """
    configured = os.getenv("BATCH_SIZE", default)
    try:
        parsed = int(configured)
    except (ValueError, TypeError):
        return default
    # Negative sizes make no sense; clamp them to zero.
    return parsed if parsed > 0 else 0


def process_items_in_batches(items, llm_function):
batch_size = get_batch_size()

if batch_size == 0:
return llm_function(items)

return llm_function(items)

batches = [items[i:i + batch_size] for i in range(0, len(items), batch_size)]

Expand Down Expand Up @@ -156,12 +158,9 @@ def create_full_text_vectors(
# Texts with no cached vector
texts_not_cached = [x.text for x in text_vectors if not x.vector]



# Get vectors for all texts not cached
new_vectors_list: List = process_items_in_batches(texts_not_cached, vectorisation_function)


# Create a dictionary with new vectors
new_vectors_dict = {}
for vector, text in zip(new_vectors_list, texts_not_cached):
Expand Down Expand Up @@ -577,7 +576,7 @@ def match_instruments_with_function(
mhc_embeddings: np.ndarray = np.zeros((0, 0)),
texts_cached_vectors: dict[str, List[float]] = {},
is_negate: bool = True
) -> tuple:
) -> HarmonyMatchResponse:
"""
Match instruments.
Expand Down Expand Up @@ -673,9 +672,17 @@ def match_instruments_with_function(
for question in all_questions:
question.topics_auto = []

return (
all_questions,
similarity_with_polarity,
query_similarity,
new_vectors_dict
)
instrument_to_instrument_similarities = get_instrument_similarity(instruments, similarity_with_polarity)

return HarmonyMatchResponse(questions=all_questions,
similarity_with_polarity=similarity_with_polarity,
query_similarity=query_similarity,
new_vectors_dict=new_vectors_dict,
instrument_to_instrument_similarities=instrument_to_instrument_similarities)
# return (
# all_questions,
# similarity_with_polarity,
# query_similarity,
# new_vectors_dict,
# instrument_to_instrument_similarities
# )
38 changes: 36 additions & 2 deletions src/harmony/schemas/responses/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,25 @@
'''

from typing import List
from typing import List, Any

import numpy as np
from pydantic import BaseModel, Field, RootModel

from harmony.schemas.catalogue_instrument import CatalogueInstrument
from harmony.schemas.requests.text import Instrument
from harmony.schemas.requests.text import Question
from pydantic import BaseModel, Field, RootModel

class InstrumentToInstrumentSimilarity(BaseModel):
    # Similarity scores (precision, recall, F1) for one pair of instruments, identified both by
    # their positions in the list of instruments passed to Harmony and by their names.
    instrument_1_idx: int = Field(
        description="The index of the first instrument in the similarity pair in the list of instruments passed to Harmony (zero-indexed)")
    instrument_2_idx: int = Field(
        description="The index of the second instrument in the similarity pair in the list of instruments passed to Harmony (zero-indexed)")
    # The two name descriptions previously ended in the truncated word "pai".
    instrument_1_name: str = Field(description="The name of the first instrument in the similarity pair")
    instrument_2_name: str = Field(description="The name of the second instrument in the similarity pair")
    precision: float = Field(description="The precision score of the match between Instrument 1 and Instrument 2")
    recall: float = Field(description="The recall score of the match between Instrument 1 and Instrument 2")
    f1: float = Field(description="The F1 score of the match between Instrument 1 and Instrument 2")


class MatchResponse(BaseModel):
Expand All @@ -47,6 +60,9 @@ class MatchResponse(BaseModel):
description="The closest catalogue instrument matches in the catalogue for all the instruments, "
"the first index contains the best match etc."
)
instrument_to_instrument_similarities: List[InstrumentToInstrumentSimilarity] = Field(
None, description="A list of similarity values (precision, recall, F1) between instruments"
)


class SearchInstrumentsResponse(BaseModel):
Expand All @@ -60,3 +76,21 @@ class InstrumentList(RootModel):
class CacheResponse(BaseModel):
instruments: List[Instrument] = Field(description="A list of instruments")
vectors: List[dict] = Field(description="A list of vectors")



# For use internally in the Python library but *not* the API because the NDarrays don't serialise
class HarmonyMatchResponse(BaseModel):
    # Result bundle produced by the instrument matching functions.
    # The two matrix fields are typed as Any, so pydantic does not validate their contents.

    # Required fields.
    questions: List[Question] = Field(
        description="The questions which were matched, in an order matching the order of the matrix",
    )
    similarity_with_polarity: Any = Field(
        description="Matrix of cosine similarity matches",
    )

    # Optional fields, defaulting to None when not supplied.
    query_similarity: Any = Field(
        default=None,
        description="Similarity metric between query string and items",
    )
    new_vectors_dict: dict = Field(
        default=None,
        description="Vectors for the cache. These should be stored by the Harmony API to reduce unnecessary calls to the LLM",
    )
    instrument_to_instrument_similarities: List[InstrumentToInstrumentSimilarity] = Field(
        default=None,
        description="A list of similarity values (precision, recall, F1) between instruments",
    )
7 changes: 3 additions & 4 deletions tests/test_batch.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,35 +27,34 @@

import sys
import unittest

import numpy

sys.path.append("../src")

from harmony.matching.default_matcher import convert_texts_to_vector


class createModel:
    """Stand-in embedding model for tests: returns constant vectors instead of real embeddings."""

    def encode(self, sentences, convert_to_numpy=True):
        """Return one 768-dimensional row of ones per sentence; `convert_to_numpy` is ignored."""
        dummy_row = [1] * 768
        return numpy.array([dummy_row for _ in range(len(sentences))])



model = createModel()


class TestBatching(unittest.TestCase):
def test_convert_texts_to_vector_with_batching(self):
# Create a list of 10 dummy texts
texts = ["text" + str(i) for i in range(10)]


batch_size = 5
max_batches = 2
embeddings = convert_texts_to_vector(texts, batch_size=batch_size, max_batches=max_batches)


self.assertEqual(embeddings.shape[0], 10)


self.assertEqual(embeddings.shape[1], 384)


Expand Down
4 changes: 1 addition & 3 deletions tests/test_batching_in_matcher.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,9 @@
import sys
import os
import sys
import unittest
import numpy

sys.path.append("../src")
from unittest import TestCase, mock
from harmony.matching.matcher import get_batch_size
from harmony.matching.matcher import process_items_in_batches


Expand Down
18 changes: 10 additions & 8 deletions tests/test_cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,24 +31,26 @@
sys.path.append("../src")

from harmony.matching.cluster import cluster_questions
from harmony import create_instrument_from_list, import_instrument_into_harmony_web
from harmony.schemas.requests.text import Instrument, Question


class TestCluster(unittest.TestCase):
def setUp(self):
self. all_questions_real = [Question(question_no="1", question_text="Feeling nervous, anxious, or on edge"),
Question(question_no="2", question_text="Not being able to stop or control worrying"),
Question(question_no="3", question_text="Little interest or pleasure in doing things"),
Question(question_no="4", question_text="Feeling down, depressed, or hopeless"),
Question(question_no="5",
question_text="Trouble falling/staying asleep, sleeping too much"), ]
self.all_questions_real = [Question(question_no="1", question_text="Feeling nervous, anxious, or on edge"),
Question(question_no="2",
question_text="Not being able to stop or control worrying"),
Question(question_no="3",
question_text="Little interest or pleasure in doing things"),
Question(question_no="4", question_text="Feeling down, depressed, or hopeless"),
Question(question_no="5",
question_text="Trouble falling/staying asleep, sleeping too much"), ]
self.instruments = Instrument(questions=self.all_questions_real)

def test_cluster(self):
clusters_out, score_out = cluster_questions(self.instruments, 2, False)
assert(len(clusters_out) == 5)
assert (len(clusters_out) == 5)
assert score_out


if __name__ == '__main__':
unittest.main()
2 changes: 0 additions & 2 deletions tests/test_convert_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@

import sys
import unittest
from harmony.parsing.text_parser import convert_text_to_instruments
from harmony.schemas.requests.text import RawFile, FileType

sys.path.append("../src")

Expand Down
16 changes: 9 additions & 7 deletions tests/test_crosswalk.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ def test_generate_crosswalk_table_empty(self):
self.assertTrue(result.empty)

def test_generate_crosswalk_table_real(self):
all_questions, similarity_with_polarity, _, _ = match_instruments(self.instruments)
result = generate_crosswalk_table(self.instruments, similarity_with_polarity, self.threshold,
match_response = match_instruments(self.instruments)
result = generate_crosswalk_table(self.instruments, match_response.similarity_with_polarity, self.threshold,
is_allow_within_instrument_matches=True)
expected_matches = []

Expand All @@ -94,7 +94,7 @@ def test_generate_crosswalk_table_real(self):
self.assertEqual(len(result), len(expected_matches))

lower_threshold = 0.5
result = generate_crosswalk_table(self.instruments, similarity_with_polarity, lower_threshold,
result = generate_crosswalk_table(self.instruments, match_response.similarity_with_polarity, lower_threshold,
is_allow_within_instrument_matches=True)

self.assertEqual(len(result), 1)
Expand All @@ -106,8 +106,9 @@ def test_crosswalk_two_instruments_allow_many_to_one_matches(self):
["Feeling afraid, as if something awful might happen", "Feeling nervous, anxious, or on edge"])
instruments = [instrument_1, instrument_2]

all_questions, similarity_with_polarity, _, _ = match_instruments(instruments)
result = generate_crosswalk_table(instruments, similarity_with_polarity, 0, is_enforce_one_to_one=False)
match_response = match_instruments(instruments)
result = generate_crosswalk_table(instruments, match_response.similarity_with_polarity, 0,
is_enforce_one_to_one=False)

self.assertEqual(2, len(result))

Expand All @@ -118,8 +119,9 @@ def test_crosswalk_two_instruments_enforce_one_to_one_matches(self):
["Feeling afraid, as if something awful might happen", "Feeling nervous, anxious, or on edge"])
instruments = [instrument_1, instrument_2]

all_questions, similarity_with_polarity, _, _ = match_instruments(instruments)
result = generate_crosswalk_table(instruments, similarity_with_polarity, 0, is_enforce_one_to_one=True)
match_response = match_instruments(instruments)
result = generate_crosswalk_table(instruments, match_response.similarity_with_polarity, 0,
is_enforce_one_to_one=True)

self.assertEqual(1, len(result))

Expand Down
Loading

0 comments on commit 61d3488

Please sign in to comment.