Skip to content

Commit d71440c

Browse files
authored
Add Sample Datatool (asreview#41)
Adds a new tool to datatools that samples old, new and random records from a dataset.
1 parent 65f439e commit d71440c

File tree

5 files changed

+112
-2
lines changed

5 files changed

+112
-2
lines changed

README.md

+11-1
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,8 @@ LAB](https://github.com/asreview/asreview) that can be used to:
99
- [**Deduplicate**](#data-dedup) data
1010
- [**Stack**](#data-vstack-experimental) multiple datasets
1111
- [**Compose**](#data-compose-experimental) a single (labeled, partly labeled, or unlabeled) dataset from multiple datasets
12-
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations.
12+
- [**Snowball**](#snowball) a dataset to find incoming or outgoing citations
13+
- [**Sample**](#sample) old, random, and new papers in order to check if the terminology has changed over time.
1314

1415
Several [tutorials](Tutorials.md) are available that show how
1516
`ASReview-Datatools` can be used in different scenarios.
@@ -288,6 +289,15 @@ One thing to note is that OpenAlex will handle data requests faster if the sende
288289
asreview data snowball input_dataset.csv output_dataset.csv --backward --email my_email@provider.com
289290
```
290291

292+
## Sample
293+
294+
This datatool is used to sample old, random and new records from your dataset by using the `asreview data sample` command. The sampled records are then stored in an output file. This can be useful for detecting concept drift, meaning that the words used for certain concepts change over time. This script assumes that the dataset includes a column named `publication_year`. An example would be:
295+
296+
```bash
297+
asreview data sample input_dataset.xlsx output_dataset.xlsx 50
298+
```
299+
This samples the `50` oldest and `50` newest records from `input_dataset.xlsx` and samples `50` records randomly (without overlap from the old and new partitions!). The resulting 150 records are written to `output_dataset.xlsx`.
300+
291301
## License
292302

293303
This extension is published under the [MIT license](/LICENSE).

asreviewcontrib/datatools/entrypoint.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -10,12 +10,14 @@
1010
from asreviewcontrib.datatools.convert import convert
1111
from asreviewcontrib.datatools.describe import _parse_arguments_describe
1212
from asreviewcontrib.datatools.describe import describe
13+
from asreviewcontrib.datatools.sample import _parse_arguments_sample
14+
from asreviewcontrib.datatools.sample import sample
1315
from asreviewcontrib.datatools.snowball import _parse_arguments_snowball
1416
from asreviewcontrib.datatools.snowball import snowball
1517
from asreviewcontrib.datatools.stack import _parse_arguments_vstack
1618
from asreviewcontrib.datatools.stack import vstack
1719

18-
DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball"]
20+
DATATOOLS = ["describe", "dedup", "convert", "compose", "vstack", "snowball", "sample"]
1921

2022

2123
class DataEntryPoint(BaseEntryPoint):
@@ -104,6 +106,10 @@ def execute(self, argv):
104106
args_snowballing_parser = _parse_arguments_snowball()
105107
args_snowballing = vars(args_snowballing_parser.parse_args(argv[1:]))
106108
snowball(**args_snowballing)
109+
if argv[0] == "sample":
110+
args_sample_parser = _parse_arguments_sample()
111+
args_sample = vars(args_sample_parser.parse_args(argv[1:]))
112+
sample(**args_sample)
107113
if argv[0] == "vstack":
108114
args_vstack_parser = _parse_arguments_vstack()
109115
args_vstack = args_vstack_parser.parse_args(argv[1:])

asreviewcontrib/datatools/sample.py

+70
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,70 @@
1+
import argparse
2+
3+
import pandas as pd
4+
from asreview import ASReviewData
5+
from asreview.data.base import load_data
6+
7+
8+
def sample(input_path, output_path, nr_records, year_column="publication_year"):
    """Sample the oldest, newest and a random selection of records from a dataset.

    The output contains ``3 * nr_records`` rows: the ``nr_records`` oldest
    dated records, ``nr_records`` records sampled at random from the rest of
    the dataset (no overlap with the old/new partitions), and the
    ``nr_records`` newest dated records, written in that order.

    Parameters
    ----------
    input_path : str
        Path to the input dataset.
    output_path : str
        Path where the sampled dataset is written.
    nr_records : int
        Number of records to take for each of the three partitions
        (old, random, new). Must be at least 1.
    year_column : str, optional
        Name of the column holding the publication year
        (default ``"publication_year"``).

    Raises
    ------
    ValueError
        If ``nr_records`` is smaller than 1, the year column is missing or
        empty, or the dataset is too small for the requested sample.
    """
    df_input = load_data(input_path).df

    if nr_records < 1:
        raise ValueError("• The number of records to sample should be at least 1.")

    # Check for presence of the year column
    if year_column not in df_input.columns:
        raise ValueError(f"• The input file should have a {year_column} column.")

    # The three partitions together may not exceed the dataset size.
    if nr_records * 3 > len(df_input):
        raise ValueError(
            "• The number of records to sample is too large."
            f" Only {len(df_input)} records are present in the input file."
            f" You are trying to sample {nr_records * 3} records."
        )

    # Only records with a known year can be ranked as old/new.
    dated_records = df_input[df_input[year_column].notnull()]

    if dated_records.empty:
        raise ValueError(f"• The input file has no {year_column} values.")

    if len(dated_records) < nr_records * 2:
        raise ValueError("• Not enough dated records to sample from.")

    sorted_records = dated_records.sort_values(year_column, ascending=True)

    # Take the nr_records oldest and nr_records newest records.
    old_records = sorted_records.head(nr_records)
    new_records = sorted_records.tail(nr_records)

    # Sample nr_records records without overlap with the old/new partitions.
    records_to_exclude = pd.concat([old_records, new_records]).index
    remaining_records = df_input[~df_input.index.isin(records_to_exclude)]

    sampled_records = remaining_records.sample(nr_records)

    # Combine old, random, and new records (in that order).
    df_out = pd.concat([old_records, sampled_records, new_records])

    asdata = ASReviewData(df=df_out)
    asdata.to_file(output_path)
52+
53+
54+
def _parse_arguments_sample():
55+
parser = argparse.ArgumentParser(prog="asreview data sample")
56+
parser.add_argument("input_path", type=str, help="The input file path.")
57+
parser.add_argument("output_path", type=str, help="The output file path.")
58+
parser.add_argument(
59+
"nr_records",
60+
type=int,
61+
help="The amount of records for old, random, and new records each.",
62+
)
63+
parser.add_argument(
64+
"--year_column",
65+
default="publication_year",
66+
type=str,
67+
help="The name of the column containing the publication year.",
68+
)
69+
70+
return parser

tests/demo_data/sample_data.csv

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
title, doi, publication_year
2+
title1, doi1, 2005
3+
title2, doi2, 2001
4+
title3, doi3,
5+
title4, doi4, 2003
6+
title5, doi5, 2004
7+
title6, doi6, 2000

tests/test_sample.py

+17
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
# Unit tests for asreviewcontrib.datatools.sample.
2+
from pathlib import Path
3+
4+
import pandas as pd
5+
6+
from asreviewcontrib.datatools.sample import sample
7+
8+
INPUT_DIR = Path(__file__).parent / "demo_data" / "sample_data.csv"
9+
10+
11+
def test_sample(tmpdir):
    """End-to-end check: sample 1 old, 1 random and 1 new record."""
    output_path = tmpdir / "output.csv"
    sample(INPUT_DIR, output_path, 1, "publication_year")

    df_output = pd.read_csv(output_path)

    # One old + one random + one new record.
    assert len(df_output) == 3
    assert "publication_year" in df_output.columns
    # Output is ordered old -> random -> new, so the first row holds the
    # oldest year in the fixture (2000) and the last row the newest (2005).
    assert df_output.iloc[0]["publication_year"] == 2000
    assert df_output.iloc[2]["publication_year"] == 2005

0 commit comments

Comments
 (0)