Merge pull request #96 from p-lambda/unlabeledsquashed
Added unlabeled data to WILDS datasets

Showing 90 changed files with 9,302 additions and 787 deletions.
@@ -1,5 +1,9 @@
__pycache__
.idea
build
data
logs
dist
venv
wilds.egg-info
.DS_Store
168 changes: 168 additions & 0 deletions
dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py
@@ -0,0 +1,168 @@
import argparse
import csv
import os

import numpy as np
import pandas as pd

# Fix the seed for reproducibility
np.random.seed(0)

"""
Create unlabeled splits for Amazon.
Usage:
    python dataset_preprocessing/amazon_yelp/create_unlabeled_amazon.py <path>
"""

NOT_IN_DATASET = -1

# Splits
# 'train': 0, 'val': 1, 'id_val': 2, 'test': 3, 'id_test': 4,
# 'val_unlabeled': 11, 'test_unlabeled': 12, 'extra_unlabeled': 13
(
    TRAIN,
    OOD_VAL,
    ID_VAL,
    OOD_TEST,
    ID_TEST,
) = range(5)
VAL_UNLABELED, TEST_UNLABELED, EXTRA_UNLABELED = range(11, 14)


def main(dataset_path):
    def output_split_sizes():
        print("-" * 50)
        print(f'Train size: {len(split_df[split_df["split"] == TRAIN])}')
        print(f'Val size: {len(split_df[split_df["split"] == OOD_VAL])}')
        print(f'ID Val size: {len(split_df[split_df["split"] == ID_VAL])}')
        print(f'Test size: {len(split_df[split_df["split"] == OOD_TEST])}')
        print(f'ID Test size: {len(split_df[split_df["split"] == ID_TEST])}')
        print(
            f'OOD Val Unlabeled size: {len(split_df[split_df["split"] == VAL_UNLABELED])}'
        )
        print(
            f'OOD Test Unlabeled size: {len(split_df[split_df["split"] == TEST_UNLABELED])}'
        )
        print(
            f'Extra Unlabeled size: {len(split_df[split_df["split"] == EXTRA_UNLABELED])}'
        )
        print(
            f'Number of examples not included: {len(split_df[split_df["split"] == NOT_IN_DATASET])}'
        )
        print(f'Number of unclean reviews: {len(split_df[~split_df["clean"]])}')
        print("-" * 50)
        print("\n")

    def set_unlabeled_split(split, reviewers):
        # Assign unused, clean reviews written by users in `reviewers` to `split`
        split_df.loc[
            (split_df["split"] == NOT_IN_DATASET)
            & split_df["clean"]
            & data_df["reviewerID"].isin(reviewers),
            "split",
        ] = split

    def validate_split(split, expected_reviewers_count):
        # Sanity checks:
        # Ensure the split has the expected number of reviewers
        # and that each reviewer has at least 75 reviews.
        actual_reviewers_counts = (
            data_df[(split_df["split"] == split)]["reviewerID"].unique().size
        )
        assert (
            actual_reviewers_counts == expected_reviewers_count
        ), "The number of reviewers ({}) did not equal {}".format(
            actual_reviewers_counts, expected_reviewers_count
        )
        min_reviewers_count = (
            data_df[(split_df["split"] == split)]["reviewerID"].value_counts().min()
        )
        assert (
            min_reviewers_count >= 75
        ), "Each reviewer should have at least 75 reviews, but got a minimum of {} reviews.".format(
            min_reviewers_count
        )

    data_df = pd.read_csv(
        os.path.join(dataset_path, "reviews.csv"),
        dtype={
            "reviewerID": str,
            "asin": str,
            "reviewTime": str,
            "unixReviewTime": int,
            "reviewText": str,
            "summary": str,
            "verified": bool,
            "category": str,
            "reviewYear": int,
        },
        keep_default_na=False,
        na_values=[],
        quoting=csv.QUOTE_NONNUMERIC,
    )
    user_csv_path = os.path.join(dataset_path, "splits", "user.csv")
    split_df = pd.read_csv(user_csv_path)
    assert split_df.shape[0] == data_df.shape[0]
    output_split_sizes()

    ood_val_reviewers_ids = data_df[
        split_df["split"] == OOD_VAL
    ].reviewerID.unique()  # 1334 users
    set_unlabeled_split(VAL_UNLABELED, ood_val_reviewers_ids)

    ood_test_reviewers_ids = data_df[
        split_df["split"] == OOD_TEST
    ].reviewerID.unique()  # 1334 users
    set_unlabeled_split(TEST_UNLABELED, ood_test_reviewers_ids)

    # For EXTRA_UNLABELED, use any users not in any of the other splits
    existing_reviewer_ids = np.concatenate(
        [
            ood_test_reviewers_ids,
            ood_val_reviewers_ids,
            data_df[split_df["split"] == TRAIN].reviewerID.unique(),
            data_df[split_df["split"] == ID_VAL].reviewerID.unique(),
            data_df[split_df["split"] == ID_TEST].reviewerID.unique(),
        ]
    )
    # There are 151,736 extra reviewers
    extra_reviewers_ids = data_df[
        ~data_df.reviewerID.isin(existing_reviewer_ids)
    ].reviewerID.unique()
    set_unlabeled_split(EXTRA_UNLABELED, extra_reviewers_ids)

    # Exclude reviewers with fewer than 75 reviews.
    review_counts = data_df[(split_df["split"] == EXTRA_UNLABELED)][
        "reviewerID"
    ].value_counts()
    reviewers_to_filter_out = review_counts[review_counts < 75].keys()
    split_df.loc[
        (split_df["split"] == EXTRA_UNLABELED)
        & data_df["reviewerID"].isin(reviewers_to_filter_out),
        "split",
    ] = NOT_IN_DATASET

    # We are done splitting; output stats.
    output_split_sizes()

    # Sanity checks
    validate_split(VAL_UNLABELED, ood_val_reviewers_ids.size)
    validate_split(TEST_UNLABELED, ood_test_reviewers_ids.size)
    # After filtering out unclean reviews and ensuring >= 75 reviews per reviewer,
    # we are left with 21,694 reviewers.
    validate_split(EXTRA_UNLABELED, 21694)

    # Write out the new unlabeled split to user.csv
    split_df.to_csv(user_csv_path, index=False)
    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Create unlabeled splits for Amazon.")
    parser.add_argument(
        "path",
        type=str,
        help="Path to the Amazon dataset",
    )
    args = parser.parse_args()
    main(args.path)
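
After the script finishes, one quick sanity check is to re-read `splits/user.csv` and inspect the split counts. The snippet below is illustrative only (not part of this PR); `dataset_path` is a placeholder for the same `<path>` passed to the script, and the expected codes come from the constants defined above.

```python
import os

import pandas as pd

dataset_path = "/path/to/amazon"  # placeholder for the <path> argument used above
split_df = pd.read_csv(os.path.join(dataset_path, "splits", "user.csv"))

# 0-4 are the labeled splits, 11-13 are the new unlabeled splits, -1 means not included.
print(split_df["split"].value_counts().sort_index())
```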
@@ -0,0 +1,24 @@
## Unlabeled Camelyon17-WILDS patch processing

#### Requirements

- openslide-python>=1.1.2
- opencv-python>=4.4.0

openslide-python relies on first installing OpenSlide; see [installation instructions](https://github.com/openslide/openslide-python).

#### Instructions

1. Download the [CAMELYON17 training data](https://drive.google.com/drive/folders/0BzsdkU4jWx9BSEI2X1VOLUpYZ3c?resourcekey=0-41XIPJNyEAo598wHxVAP9w) into `SLIDE_ROOT`.

2. Run `python generate_all_patch_coords.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to generate a .csv of all potential patches as well as the tissue masks for each WSI. `OUTPUT_ROOT` is wherever you would like the patches to eventually be written.

3. Then run `python generate_final_metadata.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to generate the metadata.csv file for unlabeled Camelyon.

4. Finally, run `python extract_final_patches_to_disk.py --slide_root SLIDE_ROOT --output_root OUTPUT_ROOT` to extract the chosen patches from the WSIs and write them to disk (a sketch chaining all three commands follows this list).
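
For convenience, here is a minimal driver sketch that chains the three commands above from Python. It is illustrative only and not part of the repository; `SLIDE_ROOT` and `OUTPUT_ROOT` are placeholders for your own paths, and the scripts are assumed to be in the current working directory.

```python
import subprocess
import sys

SLIDE_ROOT = "/path/to/camelyon17/slides"   # placeholder: where the CAMELYON17 data was downloaded
OUTPUT_ROOT = "/path/to/unlabeled_patches"  # placeholder: where patches and metadata.csv should go

# Run each preprocessing step in order; check=True stops the pipeline if a step fails.
for script in [
    "generate_all_patch_coords.py",
    "generate_final_metadata.py",
    "extract_final_patches_to_disk.py",
]:
    subprocess.run(
        [sys.executable, script, "--slide_root", SLIDE_ROOT, "--output_root", OUTPUT_ROOT],
        check=True,
    )
```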
79 changes: 79 additions & 0 deletions
dataset_preprocessing/camelyon17/unlabeled/extract_final_patches_to_disk.py
@@ -0,0 +1,79 @@
import argparse
import os
from tqdm import tqdm

import openslide
import pandas as pd

from generate_all_patch_coords import PATCH_LEVEL, CENTER_SIZE


def write_patch_images_from_df(slide_root, output_root):
    print("Writing patch images to disk...")
    read_df = pd.read_csv(
        os.path.join(output_root, "metadata.csv"), index_col=0, dtype={"patient": "str"}
    )

    patch_level = PATCH_LEVEL
    center_size = CENTER_SIZE
    patch_size = center_size * 3

    for idx in tqdm(read_df.index):
        orig_x = read_df.loc[idx, "x_coord"]
        orig_y = read_df.loc[idx, "y_coord"]
        center = read_df.loc[idx, "center"]
        patient = read_df.loc[idx, "patient"]
        node = read_df.loc[idx, "node"]

        patch_folder = os.path.join(
            output_root, "patches", f"patient_{patient}_node_{node}"
        )
        patch_path = os.path.join(
            patch_folder,
            f"patch_patient_{patient}_node_{node}_x_{orig_x}_y_{orig_y}.png",
        )

        os.makedirs(patch_folder, exist_ok=True)
        if os.path.isfile(patch_path):
            continue

        slide_path = os.path.join(
            slide_root,
            f"center_{center}",
            f"patient_{patient}",
            f"patient_{patient}_node_{node}.tif",
        )
        slide = openslide.OpenSlide(slide_path)

        # Coords are at patch_level.
        # First shift coords to the top-left corner of the entire patch,
        x = orig_x - center_size
        y = orig_y - center_size
        # then rescale to level-0 coords, since read_region expects its
        # location argument in level-0 coordinates.
        x = int(
            round(
                x
                * slide.level_dimensions[0][0]
                / slide.level_dimensions[patch_level][0]
            )
        )
        y = int(
            round(
                y
                * slide.level_dimensions[0][1]
                / slide.level_dimensions[patch_level][1]
            )
        )

        # read_region reads at the given pyramid level (hard-coded to 2 here)
        # and returns an RGBA PIL image, which is saved as a PNG.
        patch = slide.read_region((x, y), 2, (patch_size, patch_size))
        patch.save(patch_path)
    print("Done.")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--slide_root", required=True)
    parser.add_argument("--output_root", required=True)
    args = parser.parse_args()
    write_patch_images_from_df(slide_root=args.slide_root, output_root=args.output_root)