Skip to content

Commit

Permalink
Fix: Fix wrong JC implementation
Browse files Browse the repository at this point in the history
Closes #20
  • Loading branch information
anergictcell committed Mar 23, 2024
1 parent 2db9e64 commit ac1cc7f
Showing 5 changed files with 96 additions and 90 deletions.
49 changes: 45 additions & 4 deletions docs/similarity.rst
Original file line number Diff line number Diff line change
@@ -1,15 +1,55 @@
Similarity
==========

Builtin Similarity algorithms
*****************************

Resnik
------
.. autoclass:: pyhpo.similarity.defaults.Resnik


Lin
---
.. autoclass:: pyhpo.similarity.defaults.Lin


JC (Jiang & Conrath)
--------------------
.. autoclass:: pyhpo.similarity.defaults.JC


Relevance
---------
.. autoclass:: pyhpo.similarity.defaults.Relevance


InformationCoefficient
----------------------
.. autoclass:: pyhpo.similarity.defaults.InformationCoefficient


GraphIC
-------
.. autoclass:: pyhpo.similarity.defaults.GraphIC


Distance
--------
.. autoclass:: pyhpo.similarity.defaults.Distance


.. _custom-similarity-methods:

Custom Similarity algorithms
****************************
The ``similarity`` submodule allows you to create custom Similarity calculations
for comparison of single terms or term-sets.

It provides a simple interface to register custom Similarity handlers, so
that they can be called directly on an :class:`pyhpo.term.HPOTerm` or an
:class:`pyhpo.set.HPOSet`.

SimilarityBase
**************
.. autoclass:: pyhpo.similarity.base.SimilarityBase

__call__
@@ -18,7 +58,7 @@ __call__


Examples
********
--------

.. code:: python
@@ -62,4 +102,5 @@ Examples
method='custom_method'
)
assert sim_score == 1
assert sim_score == 1
4 changes: 2 additions & 2 deletions pyhpo/set.py
Original file line number Diff line number Diff line change
@@ -334,11 +334,11 @@ def similarity(
kind: str, default ``''``
Which kind of information content should be calculated.
Options are ['omim', 'orpha', 'decipher', 'gene']
See :func:`pyhpo.HPOTerm.similarity_score` for options
See :func:`pyhpo.term.HPOTerm.similarity_score` for options
method: string, default ``''``
The method to use to calculate the similarity.
See :func:`pyhpo.HPOTerm.similarity_score` for options
See :func:`pyhpo.term.HPOTerm.similarity_score` for options
Additional options:
74 changes: 25 additions & 49 deletions pyhpo/similarity/defaults.py
Original file line number Diff line number Diff line change
@@ -7,7 +7,9 @@

class Resnik(SimilarityBase):
"""
Based on Resnik P, Proceedings of the 14th IJCAI, (1995)
Based on *Resnik P, Proceedings of the 14th IJCAI, (1995)*
https://www.ijcai.org/Proceedings/95-1/Papers/059.pdf
"""

def __call__(
@@ -28,7 +30,9 @@ def __call__(

class Lin(SimilarityBase):
"""
Based on Lin D, Proceedings of the 15th ICML, (1998)
Based on *Lin D, Proceedings of the 15th ICML, (1998)*
https://dl.acm.org/doi/10.5555/645527.657297
"""

dependencies: List[str] = ["resnik"]
@@ -50,20 +54,17 @@ def __call__(

class JC(SimilarityBase):
"""
This method is the same as the source code in
the R package ``hposim``
.. code-block:: r
Jiang & Conrath similarity Score, based on
*Jiang J, Conrath D, ROCLING X, (1997) and
Deng Y, et al., PLoS One, (2015)*
res= - 1/ ( 1 + 2*IC[IC[,1]==an,3] - IC[IC[,1]==term1,3]
- IC[IC[,1]==term2,3] )
https://aclanthology.org/O97-1002.pdf
.. note::
This method is *not recommended*.
This method was previously wrongly implemented
and fixed in 3.3.0 based on `this discussion <https://github.com/anergictcell/pyhpo/issues/20>`_
See :func:`pyhpo.term._jc_similarity_score_2`
for an alternative way to calculate Jiang & Conrath
"""

dependencies: List[str] = ["resnik"]
@@ -81,46 +82,17 @@ def __call__(
ic_t1 = term1.information_content[kind]
ic_t2 = term2.information_content[kind]

return -1 / (1 + (2 * dependencies[0]) - ic_t1 - ic_t2)


class JC2(SimilarityBase):
"""
Jiang & Conrath similarity Score, based on
Jiang J, Conrath D, ROCLING X, (1997) and
Deng Y, et. al., PLoS One, (2015)
This method is the same as the description
in the paper for the R package ``hposim``
Deng Y, et. al., PLoS One, (2015)
::
sim[JC](t1,t2) = 1-(IC(t1)+IC(t2)−2×IC(t[MICA]))
"""

dependencies: List[str] = ["resnik"]

def __call__(
self,
term1: "pyhpo.HPOTerm",
term2: "pyhpo.HPOTerm",
kind: str,
dependencies: List[float],
) -> float:
if term1 == term2:
return 1.0

ic_t1 = term1.information_content[kind]
ic_t2 = term2.information_content[kind]
if ic_t1 == 0.0 or ic_t2 == 0.0:
return 0.0

return 1 - (ic_t1 + ic_t2 - (2 * dependencies[0]))
return 1.0 / (ic_t1 + ic_t2 - (2.0 * dependencies[0]) + 1.0)


class Relevance(SimilarityBase):
"""
Based on Schlicker A, et.al., BMC Bioinformatics, (2006)
Based on *Schlicker A, et.al., BMC Bioinformatics, (2006)*
https://bmcbioinformatics.biomedcentral.com/articles/10.1186/1471-2105-7-302
"""

dependencies: List[str] = ["resnik", "lin"]
@@ -137,7 +109,9 @@ def __call__(

class InformationCoefficient(SimilarityBase):
"""
Based on Li B, et. al., arXiv, (2010)
Based on *Li B, et. al., arXiv, (2010)*
https://arxiv.org/abs/1001.0958
"""

dependencies: List[str] = ["resnik", "lin"]
@@ -155,7 +129,9 @@ def __call__(
class GraphIC(SimilarityBase):
"""
Graph based Information coefficient, based on
Deng Y, et. al., PLoS One, (2015)
*Deng Y, et. al., PLoS One, (2015)*
https://pubmed.ncbi.nlm.nih.gov/25664462/
"""

def __call__(
@@ -210,7 +186,7 @@ def register_defaults(simscore: "pyhpo.similarity.base._Similarity") -> None:
simscore.register("resnik", Resnik)
simscore.register("lin", Lin)
simscore.register("jc", JC)
simscore.register("jc2", JC2)
simscore.register("jc2", JC)
simscore.register("rel", Relevance)
simscore.register("ic", InformationCoefficient)
simscore.register("graphic", GraphIC)
22 changes: 8 additions & 14 deletions pyhpo/term.py
Original file line number Diff line number Diff line change
@@ -476,21 +476,15 @@ def similarity_score(
Available options:
* **resnik** - Resnik P, Proceedings of the 14th IJCAI, (1995)
* **lin** - Lin D, Proceedings of the 15th ICML, (1998)
* **jc** - Jiang J, Conrath D, ROCLING X, (1997)
Implementation according to R source code
* **jc2** - Jiang J, Conrath D, ROCLING X, (1997)
Implementation according to paper from R ``hposim`` library
Deng Y, et. al., PLoS One, (2015)
* **rel** - Relevance measure - Schlicker A, et.al.,
BMC Bioinformatics, (2006)
* **ic** - Information coefficient - Li B, et. al., arXiv, (2010)
* **graphic** - Graph based Information coefficient -
Deng Y, et. al., PLoS One, (2015)
* **dist** - Distance between terms
* **resnik** - :class:`pyhpo.similarity.defaults.Resnik`
* **lin** - :class:`pyhpo.similarity.defaults.Lin`
* **jc** - :class:`pyhpo.similarity.defaults.JC`
* **jc2** - :class:`pyhpo.similarity.defaults.JC` (**deprecated**)
* **rel** - :class:`pyhpo.similarity.defaults.Relevance`
* **ic** - :class:`pyhpo.similarity.defaults.InformationCoefficient`
* **dist** - :class:`pyhpo.similarity.defaults.Distance`
* Additional methods can be registered separately (
see :class::`pyhpo.similarity.base._Similarity`)
see :ref:`custom-similarity-methods`)
Raises
------
37 changes: 16 additions & 21 deletions tests/test_similarity_default_methods.py
Original file line number Diff line number Diff line change
@@ -107,14 +107,26 @@ def test_lin_zero(self):
assert res == 0.0, res

def test_jc(self):
# Resnik returns 0.9
self.simscore.register("jc", d.JC)
terms = make_terms()
terms[0].information_content.omim = 0.5
terms[1].information_content.omim = 0.7
terms[0].information_content.omim = 1.6
terms[1].information_content.omim = 1.2
# 1 / (1.6 + 1.2 - 2 x 0.9 + 1)
# 1 / (2.8 - 1.8 + 1)
# 1 / 2
res = self.simscore(terms[0], terms[1], method="jc")
assert int(res * 10) == 5, res

# -1 / ((1 + 1.8) - 0.5 - 0.7)
terms[0].information_content.omim = 0
terms[1].information_content.omim = 1.2
res = self.simscore(terms[0], terms[1], method="jc")
assert res == -0.625, res
assert res == 0.0, res

terms[0].information_content.omim = 1.8
terms[1].information_content.omim = 0
res = self.simscore(terms[0], terms[1], method="jc")
assert res == 0.0, res

def test_jc_identical(self):
self.simscore.register("jc", d.JC)
@@ -123,23 +135,6 @@ def test_jc_identical(self):
res = self.simscore(term, term, method="jc")
assert res == 1

def test_jc2(self):
self.simscore.register("jc2", d.JC2)
terms = make_terms()
terms[0].information_content.omim = 0.5
terms[1].information_content.omim = 0.7

# 1 - (0.5 + 0.7 - 1.8)
res = self.simscore(terms[0], terms[1], method="jc2")
assert res == 1.6, res

def test_jc2_identical(self):
self.simscore.register("jc2", d.JC2)

term = "foo"
res = self.simscore(term, term, method="jc2")
assert res == 1


class TestSimilarity_resnik_lin_dependencies(unittest.TestCase):
def setUp(self):

0 comments on commit ac1cc7f

Please sign in to comment.