|
| 1 | +import argparse |
| 2 | + |
| 3 | +import pandas as pd |
| 4 | +from asreview import ASReviewData |
| 5 | +from asreview.data.base import load_data |
| 6 | + |
| 7 | + |
def sample(input_path, output_path, nr_records, year_column="publication_year"):
    """Write a sample of the oldest, newest, and randomly drawn records.

    The dataset at ``input_path`` is loaded and three disjoint groups of
    ``nr_records`` rows each are selected: the rows with the smallest
    ``year_column`` values, the rows with the largest ``year_column`` values,
    and a random draw from all rows that are in neither of those groups
    (rows without a year value can only end up in the random group). The
    groups are concatenated in the order old, random, new and written to
    ``output_path``.

    Parameters
    ----------
    input_path
        Location of the dataset to read; passed to ``load_data``.
    output_path
        Location to write the sampled dataset to; passed to
        ``ASReviewData.to_file``.
    nr_records
        Number of rows in each of the three groups, so the output holds
        ``3 * nr_records`` rows.
    year_column
        Name of the column to sort on when picking the old and new rows.

    Raises
    ------
    ValueError
        If ``year_column`` is missing, ``nr_records`` is smaller than 1, the
        input has fewer than ``3 * nr_records`` rows, no row has a year
        value, or fewer than ``2 * nr_records`` rows have a year value.
    """
    df_input = load_data(input_path).df

    # Only the exact column name is accepted; no alternative spellings are tried.
    if year_column not in df_input.columns:
        raise ValueError(f"• The input file should have a {year_column} column.")

    # Reject non-positive sizes before doing any arithmetic with them.
    if nr_records < 1:
        raise ValueError("• The number of records to sample should be at least 1.")

    # Three non-overlapping groups of nr_records rows are drawn in total.
    nr_requested = nr_records * 3
    if nr_requested > len(df_input):
        raise ValueError(
            "• The number of records to sample is too large."
            f" Only {len(df_input)} records are present in the input file."
            f" You are trying to sample {nr_requested} records."
        )

    # Rows without a year cannot be ranked as old or new.
    dated_records = df_input[df_input[year_column].notnull()]

    if dated_records.empty:
        raise ValueError(f"• The input file has no {year_column} values.")

    # The old and new groups must not overlap, so 2 * nr_records dated rows
    # are required.
    if len(dated_records) < nr_records * 2:
        raise ValueError("• Not enough dated records to sample from.")

    sorted_records = dated_records.sort_values(year_column, ascending=True)

    # Oldest rows sit at the top of the ascending sort, newest at the bottom.
    old_records = sorted_records.head(nr_records)
    new_records = sorted_records.tail(nr_records)

    # Draw the random group from every row (dated or not) that was not
    # already picked as old or new.
    records_to_exclude = pd.concat([old_records, new_records]).index
    remaining_records = df_input[~df_input.index.isin(records_to_exclude)]

    sampled_records = remaining_records.sample(nr_records)

    # Output order: old, random, new.
    df_out = pd.concat([old_records, sampled_records, new_records])

    asdata = ASReviewData(df=df_out)
    asdata.to_file(output_path)
| 52 | + |
| 53 | + |
| 54 | +def _parse_arguments_sample(): |
| 55 | + parser = argparse.ArgumentParser(prog="asreview data sample") |
| 56 | + parser.add_argument("input_path", type=str, help="The input file path.") |
| 57 | + parser.add_argument("output_path", type=str, help="The output file path.") |
| 58 | + parser.add_argument( |
| 59 | + "nr_records", |
| 60 | + type=int, |
| 61 | + help="The amount of records for old, random, and new records each.", |
| 62 | + ) |
| 63 | + parser.add_argument( |
| 64 | + "--year_column", |
| 65 | + default="publication_year", |
| 66 | + type=str, |
| 67 | + help="The name of the column containing the publication year.", |
| 68 | + ) |
| 69 | + |
| 70 | + return parser |
0 commit comments