Merge pull request #96 from p-lambda/unlabeledsquashed
Added unlabeled data to WILDS datasets
teetone authored Dec 13, 2021
2 parents 3da4b35 + e79f7f4 commit a7a452c
Showing 90 changed files with 9,302 additions and 787 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -1,5 +1,9 @@
__pycache__
.idea
build
data
logs
dist
venv
wilds.egg-info
.DS_Store
321 changes: 205 additions & 116 deletions README.md

Large diffs are not rendered by default.

168 changes: 168 additions & 0 deletions dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py
@@ -0,0 +1,168 @@
import argparse
import csv
import os

import numpy as np
import pandas as pd

# Fix the seed for reproducibility
np.random.seed(0)

"""
Create unlabeled splits for Amazon.
Usage:
python dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py <path>
"""

NOT_IN_DATASET = -1

# Splits
# 'train': 0, 'val': 1, 'id_val': 2, 'test': 3, 'id_test': 4,
# 'val_unlabeled': 11, 'test_unlabeled': 12, 'extra_unlabeled': 13
(
TRAIN,
OOD_VAL,
ID_VAL,
OOD_TEST,
ID_TEST,
) = range(5)
VAL_UNLABELED, TEST_UNLABELED, EXTRA_UNLABELED = range(11, 14)


def main(dataset_path):
def output_split_sizes():
print("-" * 50)
print(f'Train size: {len(split_df[split_df["split"] == TRAIN])}')
print(f'Val size: {len(split_df[split_df["split"] == OOD_VAL])}')
print(f'ID Val size: {len(split_df[split_df["split"] == ID_VAL])}')
print(f'Test size: {len(split_df[split_df["split"] == OOD_TEST])}')
print(f'ID Test size: {len(split_df[split_df["split"] == ID_TEST])}')
print(
f'OOD Val Unlabeled size: {len(split_df[split_df["split"] == VAL_UNLABELED])}'
)
print(
f'OOD Test Unlabeled size: {len(split_df[split_df["split"] == TEST_UNLABELED])}'
)
print(
f'Extra Unlabeled size: {len(split_df[split_df["split"] == EXTRA_UNLABELED])}'
)
print(
f'Number of examples not included: {len(split_df[split_df["split"] == NOT_IN_DATASET])}'
)
print(f'Number of unclean reviews: {len(split_df[~split_df["clean"]])}')
print("-" * 50)
print("\n")

def set_unlabeled_split(split, reviewers):
        # Assign unused, clean reviews written by users in `reviewers` to `split`
split_df.loc[
(split_df["split"] == NOT_IN_DATASET)
& split_df["clean"]
& data_df["reviewerID"].isin(reviewers),
"split",
] = split

def validate_split(split, expected_reviewers_count):
        # Sanity check:
        # Ensure the split contains the expected number of reviewers
        # and that each reviewer has at least 75 reviews.
actual_reviewers_counts = (
data_df[(split_df["split"] == split)]["reviewerID"].unique().size
)
assert (
actual_reviewers_counts == expected_reviewers_count
), "The number of reviewers ({}) did not equal {}".format(
actual_reviewers_counts, expected_reviewers_count
)
min_reviewers_count = (
data_df[(split_df["split"] == split)]["reviewerID"].value_counts().min()
)
assert (
min_reviewers_count >= 75
), "Each reviewer should have at least 75 reviews, but got a minimum of {} reviews.".format(
min_reviewers_count
)

data_df = pd.read_csv(
os.path.join(dataset_path, "reviews.csv"),
dtype={
"reviewerID": str,
"asin": str,
"reviewTime": str,
"unixReviewTime": int,
"reviewText": str,
"summary": str,
"verified": bool,
"category": str,
"reviewYear": int,
},
keep_default_na=False,
na_values=[],
quoting=csv.QUOTE_NONNUMERIC,
)
user_csv_path = os.path.join(dataset_path, "splits", "user.csv")
split_df = pd.read_csv(user_csv_path)
assert split_df.shape[0] == data_df.shape[0]
output_split_sizes()

ood_val_reviewers_ids = data_df[
split_df["split"] == OOD_VAL
].reviewerID.unique() # 1334 users
set_unlabeled_split(VAL_UNLABELED, ood_val_reviewers_ids)

ood_test_reviewers_ids = data_df[
split_df["split"] == OOD_TEST
].reviewerID.unique() # 1334 users
set_unlabeled_split(TEST_UNLABELED, ood_test_reviewers_ids)

# For EXTRA_UNLABELED, use any users not in any of the other splits
existing_reviewer_ids = np.concatenate(
[
ood_test_reviewers_ids,
ood_val_reviewers_ids,
data_df[split_df["split"] == TRAIN].reviewerID.unique(),
data_df[split_df["split"] == ID_VAL].reviewerID.unique(),
data_df[split_df["split"] == ID_TEST].reviewerID.unique(),
]
)
# There are 151,736 extra reviewers
extra_reviewers_ids = data_df[
~data_df.reviewerID.isin(existing_reviewer_ids)
].reviewerID.unique()
set_unlabeled_split(EXTRA_UNLABELED, extra_reviewers_ids)

    # Exclude reviewers with fewer than 75 reviews.
review_counts = data_df[(split_df["split"] == EXTRA_UNLABELED)][
"reviewerID"
].value_counts()
reviewers_to_filter_out = review_counts[review_counts < 75].keys()
split_df.loc[
(split_df["split"] == EXTRA_UNLABELED)
& data_df["reviewerID"].isin(reviewers_to_filter_out),
"split",
] = NOT_IN_DATASET

# We are done splitting, output stats.
output_split_sizes()

# Sanity checks
validate_split(VAL_UNLABELED, ood_val_reviewers_ids.size)
validate_split(TEST_UNLABELED, ood_test_reviewers_ids.size)
# After filtering out unclean reviews and ensuring >= 75 reviews per reviewer, we are left with 21,694 reviewers.
validate_split(EXTRA_UNLABELED, 21694)

# Write out the new unlabeled split to user.csv
split_df.to_csv(user_csv_path, index=False)
print("Done.")


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Create unlabeled splits for Amazon.")
parser.add_argument(
"path",
type=str,
help="Path to the Amazon dataset",
)
args = parser.parse_args()
main(args.path)
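
For reference, a minimal sketch of how the regenerated `user.csv` split column might be consumed downstream. The column name `split` and the integer codes are taken from the script above; the file path is a placeholder you would point at your own data directory.

```python
import pandas as pd

# Split codes as defined in create_unlabeled_amazon.py
VAL_UNLABELED, TEST_UNLABELED, EXTRA_UNLABELED = range(11, 14)

# Placeholder path; adjust to wherever the Amazon dataset lives
split_df = pd.read_csv("data/amazon/splits/user.csv")

# Select only the rows assigned to the new unlabeled splits
unlabeled = split_df[
    split_df["split"].isin([VAL_UNLABELED, TEST_UNLABELED, EXTRA_UNLABELED])
]
print(unlabeled["split"].value_counts())
```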
24 changes: 24 additions & 0 deletions dataset_preprocessing/camelyon17/unlabeled/README.md
@@ -0,0 +1,24 @@
## Unlabeled Camelyon17-WILDS patch processing

#### Requirements

- openslide-python>=1.1.2
- opencv-python>=4.4.0

openslide-python requires OpenSlide to be installed first;
see [installation instructions](https://github.com/openslide/openslide-python).

#### Instructions

1. Download the [CAMELYON17 training data](https://drive.google.com/drive/folders/0BzsdkU4jWx9BSEI2X1VOLUpYZ3c?resourcekey=0-41XIPJNyEAo598wHxVAP9w)
into `SLIDE_ROOT`.

2. Run `python generate_all_patch_coords.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to generate a .csv of all
potential patches as well as the tissue masks for each WSI. `OUTPUT_ROOT` is wherever you would like the
patches to eventually be written.

3. Then run `python generate_final_metadata.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT`
to generate the metadata.csv file for unlabeled Camelyon.

4. Finally, run `python extract_final_patches_to_disk.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to
   extract the chosen patches from the WSIs and write them to disk (a driver sketch chaining all three steps follows this list).
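
The three steps above can also be chained from a small driver; a minimal sketch, assuming it is run from the directory containing the three scripts and that `python` is on the path. `SLIDE_ROOT` and `OUTPUT_ROOT` are placeholders for your own paths.

```python
import subprocess

SLIDE_ROOT = "/path/to/CAMELYON17/training"   # placeholder
OUTPUT_ROOT = "/path/to/unlabeled_patches"    # placeholder

# Run the three processing stages in the order described above
for script in [
    "generate_all_patch_coords.py",
    "generate_final_metadata.py",
    "extract_final_patches_to_disk.py",
]:
    subprocess.run(
        ["python", script, "--slide_root", SLIDE_ROOT, "--output_root", OUTPUT_ROOT],
        check=True,
    )
```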
79 changes: 79 additions & 0 deletions dataset_preprocessing/camelyon17/unlabeled/extract_final_patches_to_disk.py
@@ -0,0 +1,79 @@
import argparse
import os
import pdb
from tqdm import tqdm

import openslide
import pandas as pd

from generate_all_patch_coords import PATCH_LEVEL, CENTER_SIZE


def write_patch_images_from_df(slide_root, output_root):
print("Writing patch images to disk...")
read_df = pd.read_csv(
os.path.join(output_root, "metadata.csv"), index_col=0, dtype={"patient": "str"}
)

patch_level = PATCH_LEVEL
center_size = CENTER_SIZE
patch_size = center_size * 3

for idx in tqdm(read_df.index):
orig_x = read_df.loc[idx, "x_coord"]
orig_y = read_df.loc[idx, "y_coord"]
center = read_df.loc[idx, "center"]
patient = read_df.loc[idx, "patient"]
node = read_df.loc[idx, "node"]

patch_folder = os.path.join(
output_root, "patches", f"patient_{patient}_node_{node}"
)
patch_path = os.path.join(
patch_folder,
f"patch_patient_{patient}_node_{node}_x_{orig_x}_y_{orig_y}.png",
)

os.makedirs(patch_folder, exist_ok=True)
if os.path.isfile(patch_path):
continue

slide_path = os.path.join(
slide_root,
f"center_{center}",
f"patient_{patient}",
f"patient_{patient}_node_{node}.tif",
)
slide = openslide.OpenSlide(slide_path)

# Coords are at patch_level
# First shift coords to top left corner of the entire patch
x = orig_x - center_size
y = orig_y - center_size
# Then match to level 0 coords so we can use read_region
x = int(
round(
x
* slide.level_dimensions[0][0]
/ slide.level_dimensions[patch_level][0]
)
)
y = int(
round(
y
* slide.level_dimensions[0][1]
/ slide.level_dimensions[patch_level][1]
)
)

        # read_region takes level-0 coordinates plus the pyramid level to read
        patch = slide.read_region((x, y), 2, (patch_size, patch_size))
patch.save(patch_path)
print("Done.")


if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--slide_root", required=True)
parser.add_argument("--output_root", required=True)
args = parser.parse_args()
write_patch_images_from_df(slide_root=args.slide_root, output_root=args.output_root)
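
As an illustration of the coordinate conversion performed above, the snippet below reproduces the shift-then-scale arithmetic with made-up level dimensions (the values are hypothetical, not taken from any particular slide).

```python
# Hypothetical slide where level 0 is 4x the resolution of the patch level
level0_dims = (100_000, 80_000)      # level-0 (width, height)
patch_level_dims = (25_000, 20_000)  # patch-level (width, height)

orig_x, orig_y, center_size = 1_200, 900, 32  # patch-level coords and center size

# Shift to the top-left corner of the full patch, then scale to level-0 coords
x = int(round((orig_x - center_size) * level0_dims[0] / patch_level_dims[0]))
y = int(round((orig_y - center_size) * level0_dims[1] / patch_level_dims[1]))
print(x, y)  # prints: 4672 3472
```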