feat: added cleanvision to all projects

danellecline committed Dec 23, 2024
1 parent ce51cc8 commit ca24a03
Showing 11 changed files with 742 additions and 49 deletions.
12 changes: 9 additions & 3 deletions aipipeline/prediction/download_crop_pipeline.py
@@ -1,5 +1,5 @@
# aipipeline, Apache-2.0 license
# Filename: aipipeline/prediction/download-crop-pipeline.py
# Filename: aipipeline/prediction/download_crop_pipeline.py
# Description: Download dataset of images and prepare them for running vss pipelines
import glob
from datetime import datetime
@@ -12,7 +12,8 @@
import logging

from aipipeline.config_setup import extract_labels_config, setup_config
from aipipeline.prediction.library import download, crop_rois_voc, clean
from aipipeline.prediction.library import download, crop_rois_voc, clean, compute_stats, generate_multicrop_views, \
clean_images, remove_multicrop_views

logger = logging.getLogger(__name__)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
@@ -86,6 +87,9 @@ def run_pipeline(argv=None):
if not args.skip_clean:
clean(download_path.as_posix())

# Always remove any previous augmented data before starting
remove_multicrop_views(download_path.as_posix())

if args.download_dir:
config_dict["data"]["processed_path"] = args.download_dir

@@ -97,7 +101,9 @@
(
p
| "Start download" >> beam.Create([labels])
| "Download labeled data" >> beam.Map(download, conf_files=conf_files, config_dict=config_dict)
| "Crop ROI" >> beam.Map(crop_rois_voc, config_dict=config_dict)
| "Generate views" >> beam.Map(generate_multicrop_views)
| "Clean bad examples" >> beam.Map(clean_images, config_dict=config_dict)
| "Log results" >> beam.Map(logger.info)
)

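For context on the composition above: extra keyword arguments to beam.Map are forwarded to the mapped callable on each element, which is how every stage receives config_dict. A minimal, self-contained sketch of that mechanism (hypothetical stage function and config values, not the project's code):

    import apache_beam as beam

    def tag_elements(elements, config_dict):
        # elements is the pipeline element; config_dict arrives via the Map kwarg
        return [f"{e}:{config_dict['data']['version']}" for e in elements]

    with beam.Pipeline() as p:
        (
            p
            | "Start" >> beam.Create([["label_a", "label_b"]])
            | "Tag" >> beam.Map(tag_elements, config_dict={"data": {"version": "Baseline"}})
            | "Log" >> beam.Map(print)
        )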
80 changes: 46 additions & 34 deletions aipipeline/prediction/library.py
@@ -76,29 +76,34 @@ def generate_multicrop_views2(image) -> List[tuple]:
return data


def clean_bad_images(element) -> tuple:
def clean_bad_images(element, config_dict: Dict) -> tuple:
count, crop_path, save_path = element
num_removed = 0
# Check if any images exist
if count == 0:
return count, crop_path, save_path
imagelab = Imagelab(data_path=crop_path)
imagelab.find_issues()
issues = {
issue["name"]: {key: value for key, value in issue.items() if key != "name"}
for issue in config_dict["data"]["cleanvision_issues"]
}
imagelab.find_issues(issues)
imagelab.report()
# Columns to check for issues
issue_columns = ["is_dark_issue", "is_blurry_issue", "is_exact_duplicates_issue"]
bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index
# Create column names for issues, e.g. is_dark_issue, is_blurry_issue, is_exact_duplicates_issue
# from dark, blurry, and exact_duplicates
issue_columns = [f"is_{issue}_issue" for issue in issues.keys()]
bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index
for img in bad_images:
os.remove(img)
num_removed += 1
logger.info(f"Removed {num_removed} dark or blurry images in {crop_path}")
logger.info(f"Removed {num_removed} images in {crop_path} using cleanvision {issues}")
return count - num_removed, crop_path, save_path


def clean_images(elements) -> List[tuple]:
def clean_images(elements, config_dict: Dict) -> List[tuple]:
logger.info(f"Cleaning bad images in {elements} ")
for element in elements:
clean_bad_images(element)
clean_bad_images(element, config_dict)

return elements
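As a standalone illustration of the config-driven cleaning above, here is a minimal sketch that mirrors the dict comprehension in clean_bad_images; the data path and config excerpt are hypothetical:

    from cleanvision import Imagelab

    # Hypothetical excerpt of the cleanvision_issues lists added to the config.yml files
    cleanvision_issues = [
        {"name": "low_information", "threshold": 0.53},
        {"name": "dark"},
        {"name": "exact_duplicates"},
    ]
    # "name" selects the issue type; any remaining keys become that issue's parameters
    issues = {
        issue["name"]: {k: v for k, v in issue.items() if k != "name"}
        for issue in cleanvision_issues
    }
    # issues == {"low_information": {"threshold": 0.53}, "dark": {}, "exact_duplicates": {}}

    imagelab = Imagelab(data_path="/tmp/crops")  # hypothetical crop directory
    imagelab.find_issues(issues)
    issue_columns = [f"is_{name}_issue" for name in issues]
    bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index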

@@ -257,6 +262,38 @@ def gen_machine_friendly_label(label: str) -> str:
label_machine_friendly = label_machine_friendly.replace(".", "")
return label_machine_friendly

def compute_stats(labels_filter: List[str], config_dict: Dict, processed_dir: str = None) -> List[tuple]:
if processed_dir is None:
processed_data = config_dict["data"]["processed_path"]
else:
processed_data = processed_dir
base_path = os.path.join(processed_data, config_dict["data"]["version"])

# Find the file stats.json and read it as a json file
stats_file = Path(f"{base_path}/crops/stats.json")
if not stats_file.exists():
logger.error(f"Cannot find {stats_file}. Did voc-cropper run successfully?")
return []

data = []
with stats_file.open("r") as f:
stats = json.load(f)
logger.info(f"Found stats: {stats}")
total_labels = stats["total_labels"]
labels = list(total_labels.keys())
logger.info(f"Found labels: {labels}")
for label, count in total_labels.items():
if count == 0:
logger.info(f"Skipping label {label} with 0 crops")
continue
if labels_filter and 'all' not in labels_filter and label not in labels_filter:
logger.info(f"Skipping label {label} not in {labels_filter}")
continue
logger.info(f"Found {count} crops for label {label}")
# Total number of crops, and paths to crops and cluster output respectively
data.append((count, f"{base_path}/crops/{label}", f"{base_path}/cluster/{label}"))
logger.debug(data)
return data
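For reference, compute_stats expects stats.json to hold per-label crop counts under a total_labels key, roughly shaped like this (labels and counts invented for illustration):

    {
      "total_labels": {
        "copepod": 1200,
        "artifact": 0
      }
    }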

def crop_rois_voc(labels_filter: List[str], config_dict: Dict, processed_dir: str = None, image_dir: str = None) -> List[
tuple]:
@@ -317,32 +354,7 @@ def crop_rois_voc(labels_filter: List[str], config_dict: Dict, processed_dir: st
logger.error(f"All {n} attempts failed. Giving up.")
return []

# Find the file stats.txt and read it as a json file
stats_file = Path(f"{base_path}/crops/stats.json")
if not stats_file.exists():
logger.error(f"Cannot find {stats_file}. Did voc-cropper run successfully?")
return []

data = []
with stats_file.open("r") as f:
stats = json.load(f)
logger.info(f"Found stats: {stats}")
total_labels = stats["total_labels"]
labels = list(total_labels.keys())
logger.info(f"Found labels: {labels}")
for label, count in total_labels.items():
if count == 0:
logger.info(f"Skipping label {label} with 0 crops")
continue
if labels_filter and 'all' not in labels_filter and label not in labels_filter:
logger.info(f"Skipping label {label} not in {labels_filter}")
continue
logger.info(f"Found {count} crops for label {label}")
# Total number of crops, and paths to crops and cluster output respectively
data.append((count, f"{base_path}/crops/{label}", f"{base_path}/cluster/{label}"))
logger.debug(data)
return data

return compute_stats(labels_filter, config_dict, processed_dir=processed_dir)
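With the stats logic factored out, callers can recompute per-label counts from an existing stats.json without re-running the cropper; a small sketch with hypothetical config values:

    # Hypothetical: reuse compute_stats directly, skipping the crop step
    config_dict = {"data": {"processed_path": "/tmp/processed", "version": "Baseline"}}
    data = compute_stats(["all"], config_dict)
    # -> [(count, ".../crops/<label>", ".../cluster/<label>"), ...]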

def clean(base_path: str) -> str:
# Remove any existing data, except for downloaded images
5 changes: 2 additions & 3 deletions aipipeline/prediction/vss_init_pipeline.py
@@ -17,12 +17,11 @@
from aipipeline.prediction.library import (
download,
crop_rois_voc,
generate_multicrop_views,
get_short_name,
gen_machine_friendly_label,
clean,
batch_elements,
ProcessClusterBatch, remove_multicrop_views, clean_images,
ProcessClusterBatch, remove_multicrop_views, clean_images, generate_multicrop_views,
)

logger = logging.getLogger(__name__)
@@ -157,7 +156,7 @@ def run_pipeline(argv=None):
start
| "Crop ROI" >> beam.Map(crop_rois_voc, config_dict=config_dict)
| "Generate views" >> beam.Map(generate_multicrop_views)
| "Clean dark blurry examples" >> beam.Map(clean_images)
| "Clean bad examples" >> beam.Map(clean_images, config_dict=config_dict)
| 'Batch cluster ROI elements' >> beam.FlatMap(lambda x: batch_elements(x, batch_size=batch_size))
| 'Process cluster ROI batches' >> beam.ParDo(ProcessClusterBatch(config_dict=config_dict, min_detections=MIN_DETECTIONS))
| "Load exemplars" >> beam.Map(load_exemplars, config_dict=config_dict, conf_files=conf_files)
6 changes: 6 additions & 0 deletions aipipeline/projects/bio/config/config.yml
@@ -10,6 +10,12 @@ data:
version: "mega-vits-track-gcam"
labels: "all"
download_args: ["--verified --min-saliency 1000"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

mounts:
- name: "video"
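Each cleanvision_issues entry maps onto an Imagelab.find_issues key: name selects the issue type, and any remaining keys (here threshold) are passed through as that issue's parameters, so the block above amounts to roughly (using the Imagelab instance from the earlier sketch):

    imagelab.find_issues({"low_information": {"threshold": 0.53}, "blurry": {}, "dark": {}, "exact_duplicates": {}})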
6 changes: 6 additions & 0 deletions aipipeline/projects/cfe/config/config.yml
@@ -17,6 +17,12 @@ data:
version: "Baseline"
labels: "bloom,long_particle_blur,diatom_chain,aggregate,artifact,phaeocystis,copepod,rhizaria,particle_blur,larvacean,fecal_pellet,football,centric_diatom,gelatinous"
download_args: ["--verified"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
2 changes: 2 additions & 0 deletions aipipeline/projects/i2map/config/config.yml
@@ -9,6 +9,8 @@ data:
version: "Baseline"
labels: "all"
download_args: ["--verified"]
cleanvision_issues:
- name: "exact_duplicates"

mounts:
- name: "video"
2 changes: 2 additions & 0 deletions aipipeline/projects/i2mapbulk/config/config.yml
@@ -11,6 +11,8 @@ data:
version: "Baseline"
labels: "Acanthamunnopsis miller,Bathochordaeus sinker,Bolinopsis,Cydippida,Eusergestes similis,Solmissus,Actinopterygii,Pyrosoma,Salpa fusiformis,krill molt,Merluccius productus,Radiozoa,Thalassocracy inconstans"
download_args: ["--verified"]
cleanvision_issues:
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
6 changes: 6 additions & 0 deletions aipipeline/projects/m3/config/config.yml
@@ -11,6 +11,12 @@ data:
# labels: "Ctenophora sp. A,Aegina citrea,Aegina rosea,Aegina sp. 1,Aglantha,Aglantha digitale,Beroe,Beroe abyssicola,Beroe forskalii,Beroe gracilis,Colobonema,Colobonema sericeum,Merluccius productus,Pantachogon,Pantachogon haeckeli,Praya dubia,Praya dubia nectosome,Teuthoidea,Thalassocalyce,Thalassocalyce inconstans,Vampyroteuthis infernalis,Vitreosalpa gemini,larvacean house,larvacean house outer filter"
labels: "all"
download_args: [""]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
638 changes: 638 additions & 0 deletions aipipeline/projects/uav/clean_vision.ipynb


6 changes: 6 additions & 0 deletions aipipeline/projects/uav/config/config.yml
@@ -23,6 +23,12 @@ data:
version: "Baseline"
labels: "all"
download_args: ["--verified"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "MBARI/yolov5x6-uavs-oneclass"
28 changes: 19 additions & 9 deletions justfile
@@ -20,13 +20,23 @@ install: update_trackers
update_trackers:
conda env update --file environment.yml --prune

# Copy dev code to the project on doris
cp-dev:
cp ./aipipeline/projects/bio/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/
cp ./aipipeline/projects/bio/model/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/model/
cp ./aipipeline/projects/bio/core/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/core/

# Copy core dev code to the project on doris
cp-core:
cp ./aipipeline/prediction/*.py /Volumes/dcline/code/aipipeline/aipipeline/prediction/
cp ./aipipeline/metrics/*.py /Volumes/dcline/code/aipipeline/aipipeline/metrics/
cp justfile /Volumes/dcline/code/aipipeline/justfile

# Copy uav dev code to the project on doris
cp-dev-uav:
cp ./aipipeline/projects/uav/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/uav/
cp ./aipipeline/projects/uav/config/* /Volumes/dcline/code/aipipeline/aipipeline/projects/uav/config/

# Copy bio dev code to the project on doris
cp-dev-bio:
cp ./aipipeline/projects/bio/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/
cp ./aipipeline/projects/bio/config/* /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/config/
cp ./aipipeline/projects/bio/model/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/model/

# Generate a tsne plot of the VSS database
plot-tsne-vss project='uav':
@@ -172,12 +182,12 @@ crop project='uav' *more_args="":
{{more_args}}
# Download and crop
download-crop project='uav':
download-crop project='uav' *more_args="":
#!/usr/bin/env bash
export PYTHONPATH=.
time conda run -n aipipeline --no-capture-output python3 aipipeline/prediction/download_crop_pipeline.py \
--config ./aipipeline/projects/{{project}}/config/config.yml \
--skip-clean True
{{more_args}}
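With the pass-through in place, extra flags can be supplied at the call site; a hypothetical invocation forwarding the pipeline's own --skip-clean flag:

    just download-crop uav --skip-clean True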
# Download only
download project='uav':
@@ -261,10 +271,10 @@ run-mega-track-bio-dive dive='/mnt/M3/mezzanine/Ventana/2022/09/4432' gpu_id='0'
echo "Processing $video"
time conda run -n aipipeline --no-capture-output python3 aipipeline/projects/bio/process.py \
--config ./aipipeline/projects/bio/config/config.yml \
--max-frames-tracked 200 --min-score-det 0.1 --batch-size 60 --min-score-track 0.1 --min-frames 5 --version mega-vits-track-gcam \
--max-frames-tracked 200 --min-score-det 0.1 --batch-size 34 --min-score-track 0.1 --min-frames 5 --version mega-vits-track-gcam \
--vits-model /mnt/DeepSea-AI/models/m3midwater-vit-b-16 \
--det-model /mnt/DeepSea-AI/models/megadet \
--stride-fps 1 --video $video --gpu-id {{gpu_id}}
--stride 16 --video $video --gpu-id {{gpu_id}}
}
export -f process_file
# Run 1 video in parallel
