Skip to content

Commit d71440c

Browse files
authored
Add Sample Datatool (asreview#41)
Adds a new tool to datatools that samples old, new and random records from a dataset.
1 parent 65f439e commit d71440c

File tree

5 files changed

+112
-2
lines changed

5 files changed

+112
-2
lines changed

README.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ LAB](https://github.com/asreview/asreview) that can be used to:
99
- [**Deduplicate**](#data-dedup) data
1010
- [**Stack**](#data-vstack-experimental) multiple datasets
1111
- [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets
12-
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations.
12+
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations
13+
- [**Sample**](#sample) old, random, and new papers in order to check if the terminology has changed over time.
1314

1415
Several [tutorials](Tutorials.md) are available that show how
1516
`ASReview-Datatools` can be used in different scenarios.
@@ -288,6 +289,15 @@ One thing to note is that OpenAlex will handle data requests faster if the sende
288289
asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
289290
```
290291

292+
## Sample
293+
294+
This datatool is used to sample old, random and new records from your dataset by using the `asreview data sample` command. The sampled records are then stored in an output file. This can be useful for detecting concept drift, meaning that the words used for certain concepts change over time. This script assumes that the dataset includes a column named `publication_year`. An example would be:
295+
296+
```bash
297+
asreview data sample input_dataset.xlsx output_dataset.xlsx 50
298+
```
299+
This samples the `50` oldest and `50` newest records from `input_dataset.xlsx` and samples `50` records randomly (without overlap from the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.
300+
291301
## License
292302

293303
This extension is published under the [MIT license](/LICENSE).

asreviewcontrib/datatools/entrypoint.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@
1010
from asreviewcontrib.datatools.convert import convert
1111
from asreviewcontrib.datatools.describe import _parse_arguments_describe
1212
from asreviewcontrib.datatools.describe import describe
13+
from asreviewcontrib.datatools.sample import _parse_arguments_sample
14+
from asreviewcontrib.datatools.sample import sample
1315
from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
1416
from asreviewcontrib.datatools.snowball import snowball
1517
from asreviewcontrib.datatools.stack import _parse_arguments_vstack
1618
from asreviewcontrib.datatools.stack import vstack
1719

18-
DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball"]
20+
DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
1921

2022

2123
class DataEntryPoint(BaseEntryPoint):
@@ -104,6 +106,10 @@ def execute(self, argv):
104106
args_snowballing_parser = _parse_arguments_snowball()
105107
args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
106108
snowball(**args_snowballing)
109+
if argv[0] == "sample":
110+
args_sample_parser = _parse_arguments_sample()
111+
args_sample = vars(args_sample_parser.parse_args(argv[1:]))
112+
sample(**args_sample)
107113
if argv[0] == "vstack":
108114
args_vstack_parser = _parse_arguments_vstack()
109115
args_vstack = args_vstack_parser.parse_args(argv[1:])

asreviewcontrib/datatools/sample.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import argparse
2+
3+
import pandas as pd
4+
from asreview import ASReviewData
5+
from asreview.data.base import load_data
6+
7+
8+
def sample(input_path, output_path, nr_records, year_column="publication_year"):
    """Sample the oldest, newest and a random selection of records from a dataset.

    The output contains ``3 * nr_records`` rows: the ``nr_records`` oldest
    dated records, ``nr_records`` records sampled at random from the rest of
    the dataset (no overlap with the old/new partitions), and the
    ``nr_records`` newest dated records, written in that order.

    Parameters
    ----------
    input_path : str
        Path to the input dataset.
    output_path : str
        Path where the sampled dataset is written.
    nr_records : int
        Number of records to take for each of the three partitions
        (old, random, new). Must be at least 1.
    year_column : str, optional
        Name of the column holding the publication year
        (default ``"publication_year"``).

    Raises
    ------
    ValueError
        If ``nr_records`` is smaller than 1, the year column is missing or
        empty, or the dataset is too small for the requested sample.
    """
    df_input = load_data(input_path).df

    if nr_records < 1:
        raise ValueError("• The number of records to sample should be at least 1.")

    # Check for presence of the year column
    if year_column not in df_input.columns:
        raise ValueError(f"• The input file should have a {year_column} column.")

    # The three partitions together may not exceed the dataset size.
    if nr_records * 3 > len(df_input):
        raise ValueError(
            "• The number of records to sample is too large."
            f" Only {len(df_input)} records are present in the input file."
            f" You are trying to sample {nr_records * 3} records."
        )

    # Only records with a known year can be ranked as old/new.
    dated_records = df_input[df_input[year_column].notnull()]

    if dated_records.empty:
        raise ValueError(f"• The input file has no {year_column} values.")

    if len(dated_records) < nr_records * 2:
        raise ValueError("• Not enough dated records to sample from.")

    sorted_records = dated_records.sort_values(year_column, ascending=True)

    # Take the nr_records oldest and nr_records newest records.
    old_records = sorted_records.head(nr_records)
    new_records = sorted_records.tail(nr_records)

    # Sample nr_records records without overlap with the old/new partitions.
    records_to_exclude = pd.concat([old_records, new_records]).index
    remaining_records = df_input[~df_input.index.isin(records_to_exclude)]

    sampled_records = remaining_records.sample(nr_records)

    # Combine old, random, and new records (in that order).
    df_out = pd.concat([old_records, sampled_records, new_records])

    asdata = ASReviewData(df=df_out)
    asdata.to_file(output_path)
52+
53+
54+
def _parse_arguments_sample():
55+
parser = argparse.ArgumentParser(prog="asreview data sample")
56+
parser.add_argument("input_path", type=str, help="The input file path.")
57+
parser.add_argument("output_path", type=str, help="The output file path.")
58+
parser.add_argument(
59+
"nr_records",
60+
type=int,
61+
help="The amount of records for old, random, and new records each.",
62+
)
63+
parser.add_argument(
64+
"--year_column",
65+
default="publication_year",
66+
type=str,
67+
help="The name of the column containing the publication year.",
68+
)
69+
70+
return parser

tests/demo_data/sample_data.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
title, doi, publication_year
2+
title1, doi1, 2005
3+
title2, doi2, 2001
4+
title3, doi3,
5+
title4, doi4, 2003
6+
title5, doi5, 2004
7+
title6, doi6, 2000

tests/test_sample.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Unit tests for asreviewcontrib.datatools.sample.
2+
from pathlib import Path
3+
4+
import pandas as pd
5+
6+
from asreviewcontrib.datatools.sample import sample
7+
8+
INPUT_DIR = Path(__file__).parent / "demo_data" / "sample_data.csv"
9+
10+
11+
def test_sample(tmpdir):
    """End-to-end check: sample 1 old, 1 random and 1 new record."""
    output_path = tmpdir / "output.csv"
    sample(INPUT_DIR, output_path, 1, "publication_year")

    df_output = pd.read_csv(output_path)

    # One old + one random + one new record.
    assert len(df_output) == 3
    assert "publication_year" in df_output.columns
    # Output is ordered old -> random -> new, so the first row holds the
    # oldest year in the fixture (2000) and the last row the newest (2005).
    assert df_output.iloc[0]["publication_year"] == 2000
    assert df_output.iloc[2]["publication_year"] == 2005

0 commit comments

Comments
 (0)