anergictcell · anergictcell · Mar 24, 2024 · Mar 20, 2024 · Mar 23, 2024 · Mar 23, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,119 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+
+## [0.8.2] - 2024-03-09
+
+### Data
+
+- Update to HPO 2024-03-09
+
+### Refactor
+
+- Update dependencies
+
+
+## [0.8.1] - 2023-06-25
+
+### Feature
+
+- Derive `Clone` for `Ontology`
+
+
+## [0.8.0] - 2023-05-22
+
+### Feature
+
+- Add method to calculate hypergeometric enrichment of genes and diseases in HpoSets
+- Add method to create dendrogram clusters based on similarity
+
+### Refactor
+
+- Allow custom Similarity implementations to use Matrix
+
+
+## [0.7.1] - 2023-04-27
+
+### Refactor
+
+- Derive `Debug` trait on more public structs
+
+
+## [0.7.0] - 2023-04-22
+
+### Feature
+
+- New method to retrieve the shortest path between two HpoTerm
+- Add modifier flag and categories of HpoTerm
+
+### Refactor
+
+- Use SmallVec for HpoGroup with default size 30
+- Add more benchmarks
+- Improve performance for adding, or-ing and comparing HpoGroups
+
+
+## [0.6.3] - 2023-04-11
+
+### Bugfix
+
+- Fix issue parsing new HPO masterdata format
+
+
+## [0.6.2] - 2023-04-05
+
+### Bugfix
+
+- Fix Subontology to not include all parents or children
+
+### Refactor
+
+- Add benchmark tests for Criterion
+
+
+## [0.6.1] - 2023-03-30
+
+### Documentation
+
+- Add plenty of documentation
+
+
+## [0.6.0] - 2023-03-18
+
+### Feature
+
+- Replace obsolete terms in an HpoSet
+- allow different versions of binary masterdata
+
+### Refactor
+
+- add stricter clippy rules
+- switch from `log` to `tracing`
+
+
+## [0.5.0] - 2023-03-07
+
+### Refactor
+
+- clean up Similarity methods
+- Simplify iterators across the full crate and add new ones
+
+
+## [0.4.2] - 2023-02-11
+
+### Feature
+
+- new similarity method: Mutation
+
+
+## [0.4.0] - 2023-02-04
+
+### Feature
+
+- Create a sub-ontology
+- Calculate hypergeometric enrichment
+
+### Bugfix
+
+- Collecting into a HpoGroup will maintain order of the IDs internally
diff --git a/RELEASE_CHECKLIST.md b/RELEASE_CHECKLIST.md
@@ -0,0 +1,32 @@
+# Release checklist
+
+This document contains the workflows to follow for all changes and releases to `hpo`. 
+The workflow ensures that the `main` branch always holds a functional version of `hpo` with all tests passing. The `main` branch can be ahead of the official `crates.io` release. New versions for `crates.io` releases are created independently of the regular updates and will contain all changes present in the `main` branch at that point. My goal is to automate the version bump and release process using GitHub Actions at some point.
+
+This procedure is just a suggestion at this point and can be modified if the need arises.
+
+
+## Regular updates / Normal development
+
+- [ ] Develop in a dedicated branch (or your own fork): `git checkout -b <MY_FEATURE_NAME>`
+- [ ] Rebase onto `main`: `git rebase main <MY_FEATURE_NAME>`
+- [ ] Double check for good code, sensible API and well-explained docs
+- [ ] Run format, clippy, tests and doc-generation: `cargo fmt --check && cargo clippy && cargo test && cargo doc`
+- [ ] Push to remote: `git push -u origin <MY_FEATURE_NAME>`
+- [ ] Create merge/pull request to `main` branch
+- [ ] Once CI/CD passes, changes are merged to `main`
+
+
+## Version bumps
+
+- [ ] Make dedicated branch named after version: `git checkout main && git pull && git checkout -b release/<MAJOR>.<MINOR>.<PATCH>`
+- [ ] Update Cargo.toml with new version
+- [ ] Update dependencies if needed and possible
+- [ ] Check if README or docs need update
+- [ ] Add Changelog summary of changes
+- [ ] Run format, clippy, tests and doc-generation: `cargo fmt --check && cargo clippy && cargo test && cargo doc`
+- [ ] add git tag with version: `git tag v<MAJOR>.<MINOR>.<PATCH>`
+- [ ] push to remote, also push tags: `git push -u origin release/<MAJOR>.<MINOR>.<PATCH> && git push --tags`
+- [ ] Merge into main
+- [ ] update main branch locally: `git checkout main && git pull`
+- [ ] release to cargo: `cargo release`
diff --git a/examples/compare_similarities.rs b/examples/compare_similarities.rs
@@ -40,8 +40,17 @@ fn main() {
     let sim1 = Builtins::new(&sim1_name, ic_kind).expect("invalid algoritm 1");
     let sim2 = Builtins::new(&sim2_name, ic_kind).expect("invalid algoritm 2");
 
+    let mut n_terms = 100_000;
+
+    if let Some(n) = args.next() {
+        if let Ok(items) = n.parse::<usize>() {
+            n_terms = items;
+        }
+    }
+
     let scores: Vec<String> = ontology
         .into_iter()
+        .take(n_terms)
         .par_bridge()
         .map(|term1| {
             let mut inner_score = Vec::new();

diff --git a/examples/search_by_name.rs b/examples/search_by_name.rs
@@ -4,9 +4,9 @@ use hpo::Ontology;
 
 fn main() {
     let ontology = Ontology::from_binary("tests/ontology.hpo").unwrap();
-    let cystinosis = ontology.disease_by_name("Cystinosis").unwrap();
+    let cystinosis = ontology.omim_disease_by_name("Cystinosis").unwrap();
     println!("first match: {:?}", cystinosis.name());
-    for result in ontology.diseases_by_name("Cystinosis") {
+    for result in ontology.omim_diseases_by_name("Cystinosis") {
         println!("{:?}", result.name());
     }
 }
diff --git a/src/ontology.rs b/src/ontology.rs
@@ -3,7 +3,7 @@ use std::collections::hash_map::Values;
 use std::collections::{HashMap, HashSet};
 use std::fs::File;
 use std::io::Read;
-use std::iter::Filter;
+
 use std::ops::BitOr;
 use std::path::Path;
 
@@ -322,14 +322,27 @@ impl Debug for Ontology {
     }
 }
 
-pub struct DiseaseIter<'a, F> {
-    inner: Filter<Values<'a, OmimDiseaseId, OmimDisease>, F>,
+/// Iterates [`OmimDisease`] that match the query string
+///
+/// This struct is returned by [`Ontology::omim_diseases_by_name`]
+pub struct OmimDiseaseFilter<'a> {
+    iter: Values<'a, OmimDiseaseId, OmimDisease>,
+    query: &'a str,
+}
+
+impl<'a> OmimDiseaseFilter<'a> {
+    fn new(iter: Values<'a, OmimDiseaseId, OmimDisease>, query: &'a str) -> Self {
+        OmimDiseaseFilter { iter, query }
+    }
 }
 
-impl<'a, F: FnMut(&&'a OmimDisease) -> bool + 'a> Iterator for DiseaseIter<'a, F> {
+impl<'a> Iterator for OmimDiseaseFilter<'a> {
     type Item = &'a OmimDisease;
+
     fn next(&mut self) -> Option<Self::Item> {
-        self.inner.next()
+        self.iter
+            .by_ref()
+            .find(|&item| item.name().contains(self.query))
     }
 }
 
@@ -767,20 +780,12 @@ impl Ontology {
     /// use hpo::Ontology;
     /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap();
     ///
-    /// for result in ontology.diseases_by_name("Cystinosis") {
+    /// for result in ontology.omim_diseases_by_name("Cystinosis") {
     ///     println!("{:?}", result.name());
     ///  }
     /// ```
-    pub fn diseases_by_name<'a>(
-        &'a self,
-        substring: &'a str,
-    ) -> DiseaseIter<impl FnMut(&&'a OmimDisease) -> bool + 'a> {
-        DiseaseIter {
-            inner: self
-                .omim_diseases
-                .values()
-                .filter(move |disease| disease.name().contains(substring)),
-        }
+    pub fn omim_diseases_by_name<'a>(&'a self, substring: &'a str) -> OmimDiseaseFilter {
+        OmimDiseaseFilter::new(self.omim_diseases.values(), substring)
     }
 
     /// Returns the first matching [`OmimDisease`] whose name contains the provided
@@ -794,9 +799,9 @@ impl Ontology {
     /// use hpo::Ontology;
     /// let ontology = Ontology::from_binary("tests/example.hpo").unwrap();
     ///
-    /// let cystinosis = ontology.disease_by_name("Cystinosis");
+    /// let cystinosis = ontology.omim_disease_by_name("Cystinosis");
     /// ```
-    pub fn disease_by_name(&self, substring: &str) -> Option<&OmimDisease> {
+    pub fn omim_disease_by_name(&self, substring: &str) -> Option<&OmimDisease> {
         self.omim_diseases
             .values()
             .find(|&disease| disease.name().contains(substring))
@@ -1815,25 +1820,31 @@ mod test {
     #[test]
     fn diseases_by_name() {
         let ont = Ontology::from_binary("tests/example.hpo").unwrap();
-        assert_eq!(ont.diseases_by_name("Cystinosis").count(), 3);
-        assert_eq!(ont.diseases_by_name("Macdermot-Winter syndrome").count(), 1);
-        assert_eq!(ont.diseases_by_name("anergictcell syndrome").count(), 0);
+        assert_eq!(ont.omim_diseases_by_name("Cystinosis").count(), 3);
+        assert_eq!(
+            ont.omim_diseases_by_name("Macdermot-Winter syndrome")
+                .count(),
+            1
+        );
+        assert_eq!(
+            ont.omim_diseases_by_name("anergictcell syndrome").count(),
+            0
+        );
 
-        let cystinosis = vec![
+        let cystinosis = [
             "Cystinosis, adult nonnephropathic",
             "Cystinosis, late-onset juvenile or adolescent nephropathic",
             "Cystinosis, nephropathic",
         ];
+        assert!(cystinosis.contains(&ont.omim_disease_by_name("Cystinosis").unwrap().name()));
+
         assert_eq!(
-            cystinosis.contains(&ont.disease_by_name("Cystinosis").unwrap().name()),
-            true
-        );
-        assert_eq!(
-            ont.disease_by_name("Macdermot-Winter syndrome")
+            ont.omim_disease_by_name("Macdermot-Winter syndrome")
                 .unwrap()
                 .name(),
             "Macdermot-Winter syndrome"
         );
-        assert_eq!(ont.disease_by_name("anergictcell syndrome").is_none(), true);
+
+        assert!(ont.omim_disease_by_name("anergictcell syndrome").is_none());
     }
 }
diff --git a/src/similarity/defaults.rs b/src/similarity/defaults.rs
@@ -142,13 +142,19 @@ impl Similarity for Lin {
 
 /// Similarity score from Jiang & Conrath
 ///
-/// For a detailed description see [Jiang J, Conrath D, ROCLING X, (1997)](https://aclanthology.org/O97-1002.pdf)
+/// For a detailed description see [Jiang J, Conrath D, Rocling X, (1997)](https://aclanthology.org/O97-1002.pdf)
 ///
 /// # Note
 ///
-/// This algorithm is an implementation as described in the paper cited above. It is different
-/// from the `JC` implementation in the `HPOSim` R library. It is identical to the `JC2`
-/// implementation in [`PyHPO`](https://pypi.org/project/pyhpo/)
+/// This algorithm is an implementation as described in the paper cited above, with minor
+/// modifications. It is different from the `JC` implementation in the `HPOSim` R library.
+/// For a discussion on the correct implementation see
+/// [this issue from pyhpo](https://github.com/anergictcell/pyhpo/issues/20).
+///
+/// # Note
+///
+/// The logic of the JC similarity was changed in version `0.8.3`. Ensure you update
+/// to at least `0.8.3` before using it.
 #[derive(Debug)]
 pub struct Jc {
     kind: InformationContentKind,
@@ -179,12 +185,16 @@ impl Similarity for Jc {
             return 1.0;
         }
 
-        let ic_combined = a.information_content().get_kind(&self.kind)
-            + b.information_content().get_kind(&self.kind);
+        let ic1 = a.information_content().get_kind(&self.kind);
+        let ic2 = b.information_content().get_kind(&self.kind);
+
+        if ic1 == 0.0 || ic2 == 0.0 {
+            return 0.0;
+        }
 
         let resnik = Resnik::new(self.kind).calculate(a, b);
 
-        1.0 - (ic_combined - 2.0 * resnik)
+        1.0 / (ic1 + ic2 - 2.0 * resnik + 1.0)
     }
 }
 

diff --git a/src/stats.rs b/src/stats.rs
@@ -219,6 +219,16 @@ impl<K: Clone> Iterator for Counts<'_, K> {
     }
 }
 
+/// We have to frequently do divisions starting with u64 values
+/// and need to return f64 values. To ensure some kind of safety
+/// we use this method to panic in case of overflows.
+fn f64_from_u64(n: u64) -> f64 {
+    let intermediate: u32 = n
+        .try_into()
+        .expect("cannot safely create f64 from large u64");
+    intermediate.into()
+}
+
 #[cfg(test)]
 mod test {
     use super::*;
@@ -263,13 +273,3 @@ mod test {
         assert!(iter.next().is_none());
     }
 }
-
-/// We have to frequently do divisions starting with u64 values
-/// and need to return f64 values. To ensure some kind of safety
-/// we use this method to panic in case of overflows.
-fn f64_from_u64(n: u64) -> f64 {
-    let intermediate: u32 = n
-        .try_into()
-        .expect("cannot safely create f64 from large u64");
-    intermediate.into()
-}