feat: added cleanvision to all projects

danellecline committed Dec 23, 2024
1 parent ce51cc8 commit ca24a03
Showing 11 changed files with 742 additions and 49 deletions.
12 changes: 9 additions & 3 deletions aipipeline/prediction/download_crop_pipeline.py
@@ -1,5 +1,5 @@
# aipipeline, Apache-2.0 license
# Filename: aipipeline/prediction/download-crop-pipeline.py
# Filename: aipipeline/prediction/download_crop_pipeline.py
# Description: Download dataset of images and prepare them for running vss pipelines
import glob
from datetime import datetime
@@ -12,7 +12,8 @@
import logging

from aipipeline.config_setup import extract_labels_config, setup_config
from aipipeline.prediction.library import download, crop_rois_voc, clean
from aipipeline.prediction.library import download, crop_rois_voc, clean, compute_stats, generate_multicrop_views, \
clean_images, remove_multicrop_views

logger = logging.getLogger(__name__)
formatter = logging.Formatter("%(asctime)s %(levelname)s %(message)s")
@@ -86,6 +87,9 @@ def run_pipeline(argv=None):
if not args.skip_clean:
clean(download_path.as_posix())

# Always remove any previous augmented data before starting
remove_multicrop_views(download_path.as_posix())

if args.download_dir:
config_dict["data"]["processed_path"] = args.download_dir

@@ -97,7 +101,9 @@
(
p
| "Start download" >> beam.Create([labels])
| "Download labeled data" >> beam.Map(download, conf_files=conf_files, config_dict=config_dict)
| "Crop ROI" >> beam.Map(crop_rois_voc, config_dict=config_dict)
| "Generate views" >> beam.Map(generate_multicrop_views)
| "Clean bad examples" >> beam.Map(clean_images, config_dict=config_dict)
| "Log results" >> beam.Map(logger.info)
)

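For context on the composition above: extra keyword arguments to beam.Map are forwarded to the mapped callable on each element, which is how every stage receives config_dict. A minimal, self-contained sketch of that mechanism (hypothetical stage function and config values, not the project's code):

    import apache_beam as beam

    def tag_elements(elements, config_dict):
        # elements is the pipeline element; config_dict arrives via the Map kwarg
        return [f"{e}:{config_dict['data']['version']}" for e in elements]

    with beam.Pipeline() as p:
        (
            p
            | "Start" >> beam.Create([["label_a", "label_b"]])
            | "Tag" >> beam.Map(tag_elements, config_dict={"data": {"version": "Baseline"}})
            | "Log" >> beam.Map(print)
        )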
80 changes: 46 additions & 34 deletions aipipeline/prediction/library.py
@@ -76,29 +76,34 @@ def generate_multicrop_views2(image) -> List[tuple]:
return data


def clean_bad_images(element) -> tuple:
def clean_bad_images(element, config_dict: Dict) -> tuple:
count, crop_path, save_path = element
num_removed = 0
# Check if any images exist
if count == 0:
return count, crop_path, save_path
imagelab = Imagelab(data_path=crop_path)
imagelab.find_issues()
issues = {
issue["name"]: {key: value for key, value in issue.items() if key != "name"}
for issue in config_dict["data"]["cleanvision_issues"]
}
imagelab.find_issues(issues)
imagelab.report()
# Columns to check for issues
issue_columns = ["is_dark_issue", "is_blurry_issue", "is_exact_duplicates_issue"]
bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index
# Create column names for issues, e.g. is_dark_issue, is_blurry_issue, is_exact_duplicates_issue
# from dark, blurry, and exact_duplicates
issue_columns = [f"is_{issue}_issue" for issue in issues.keys()]
bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index
for img in bad_images:
os.remove(img)
num_removed += 1
logger.info(f"Removed {num_removed} dark or blurry images in {crop_path}")
logger.info(f"Removed {num_removed} images in {crop_path} using cleanvision {issues}")
return count - num_removed, crop_path, save_path


def clean_images(elements) -> List[tuple]:
def clean_images(elements, config_dict: Dict) -> List[tuple]:
logger.info(f"Cleaning bad images in {elements} ")
for element in elements:
clean_bad_images(element)
clean_bad_images(element, config_dict)

return elements
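As a standalone illustration of the config-driven cleaning above, here is a minimal sketch that mirrors the dict comprehension in clean_bad_images; the data path and config excerpt are hypothetical:

    from cleanvision import Imagelab

    # Hypothetical excerpt of the cleanvision_issues lists added to the config.yml files
    cleanvision_issues = [
        {"name": "low_information", "threshold": 0.53},
        {"name": "dark"},
        {"name": "exact_duplicates"},
    ]
    # "name" selects the issue type; any remaining keys become that issue's parameters
    issues = {
        issue["name"]: {k: v for k, v in issue.items() if k != "name"}
        for issue in cleanvision_issues
    }
    # issues == {"low_information": {"threshold": 0.53}, "dark": {}, "exact_duplicates": {}}

    imagelab = Imagelab(data_path="/tmp/crops")  # hypothetical crop directory
    imagelab.find_issues(issues)
    issue_columns = [f"is_{name}_issue" for name in issues]
    bad_images = imagelab.issues[imagelab.issues[issue_columns].any(axis=1)].index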

@@ -257,6 +262,38 @@ def gen_machine_friendly_label(label: str) -> str:
label_machine_friendly = label_machine_friendly.replace(".", "")
return label_machine_friendly

def compute_stats(labels_filter: List[str], config_dict: Dict, processed_dir: str = None) -> List[tuple]:
if processed_dir is None:
processed_data = config_dict["data"]["processed_path"]
else:
processed_data = processed_dir
base_path = os.path.join(processed_data, config_dict["data"]["version"])

# Find the file stats.json and read it as a json file
stats_file = Path(f"{base_path}/crops/stats.json")
if not stats_file.exists():
logger.error(f"Cannot find {stats_file}. Did voc-cropper run successfully?")
return []

data = []
with stats_file.open("r") as f:
stats = json.load(f)
logger.info(f"Found stats: {stats}")
total_labels = stats["total_labels"]
labels = list(total_labels.keys())
logger.info(f"Found labels: {labels}")
for label, count in total_labels.items():
if count == 0:
logger.info(f"Skipping label {label} with 0 crops")
continue
if labels_filter and 'all' not in labels_filter and label not in labels_filter:
logger.info(f"Skipping label {label} not in {labels_filter}")
continue
logger.info(f"Found {count} crops for label {label}")
# Total number of crops, and paths to crops and cluster output respectively
data.append((count, f"{base_path}/crops/{label}", f"{base_path}/cluster/{label}"))
logger.debug(data)
return data
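For reference, compute_stats expects stats.json to hold per-label crop counts under a total_labels key, roughly shaped like this (labels and counts invented for illustration):

    {
      "total_labels": {
        "copepod": 1200,
        "artifact": 0
      }
    }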

def crop_rois_voc(labels_filter: List[str], config_dict: Dict, processed_dir: str = None, image_dir: str = None) -> List[
tuple]:
@@ -317,32 +354,7 @@ def crop_rois_voc(labels_filter: List[str], config_dict: Dict, processed_dir: st
logger.error(f"All {n} attempts failed. Giving up.")
return []

# Find the file stats.txt and read it as a json file
stats_file = Path(f"{base_path}/crops/stats.json")
if not stats_file.exists():
logger.error(f"Cannot find {stats_file}. Did voc-cropper run successfully?")
return []

data = []
with stats_file.open("r") as f:
stats = json.load(f)
logger.info(f"Found stats: {stats}")
total_labels = stats["total_labels"]
labels = list(total_labels.keys())
logger.info(f"Found labels: {labels}")
for label, count in total_labels.items():
if count == 0:
logger.info(f"Skipping label {label} with 0 crops")
continue
if labels_filter and 'all' not in labels_filter and label not in labels_filter:
logger.info(f"Skipping label {label} not in {labels_filter}")
continue
logger.info(f"Found {count} crops for label {label}")
# Total number of crops, and paths to crops and cluster output respectively
data.append((count, f"{base_path}/crops/{label}", f"{base_path}/cluster/{label}"))
logger.debug(data)
return data

return compute_stats(labels_filter, config_dict, processed_dir=processed_dir)
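With the stats logic factored out, callers can recompute per-label counts from an existing stats.json without re-running the cropper; a small sketch with hypothetical config values:

    # Hypothetical: reuse compute_stats directly, skipping the crop step
    config_dict = {"data": {"processed_path": "/tmp/processed", "version": "Baseline"}}
    data = compute_stats(["all"], config_dict)
    # -> [(count, ".../crops/<label>", ".../cluster/<label>"), ...]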

def clean(base_path: str) -> str:
# Remove any existing data, except for downloaded images
5 changes: 2 additions & 3 deletions aipipeline/prediction/vss_init_pipeline.py
@@ -17,12 +17,11 @@
from aipipeline.prediction.library import (
download,
crop_rois_voc,
generate_multicrop_views,
get_short_name,
gen_machine_friendly_label,
clean,
batch_elements,
ProcessClusterBatch, remove_multicrop_views, clean_images,
ProcessClusterBatch, remove_multicrop_views, clean_images, generate_multicrop_views,
)

logger = logging.getLogger(__name__)
@@ -157,7 +156,7 @@ def run_pipeline(argv=None):
start
| "Crop ROI" >> beam.Map(crop_rois_voc, config_dict=config_dict)
| "Generate views" >> beam.Map(generate_multicrop_views)
| "Clean dark blurry examples" >> beam.Map(clean_images)
| "Clean bad examples" >> beam.Map(clean_images, config_dict=config_dict)
| 'Batch cluster ROI elements' >> beam.FlatMap(lambda x: batch_elements(x, batch_size=batch_size))
| 'Process cluster ROI batches' >> beam.ParDo(ProcessClusterBatch(config_dict=config_dict, min_detections=MIN_DETECTIONS))
| "Load exemplars" >> beam.Map(load_exemplars, config_dict=config_dict, conf_files=conf_files)
6 changes: 6 additions & 0 deletions aipipeline/projects/bio/config/config.yml
@@ -10,6 +10,12 @@ data:
version: "mega-vits-track-gcam"
labels: "all"
download_args: ["--verified --min-saliency 1000"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

mounts:
- name: "video"
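Each cleanvision_issues entry maps onto an Imagelab.find_issues key: name selects the issue type, and any remaining keys (here threshold) are passed through as that issue's parameters, so the block above amounts to roughly (using the Imagelab instance from the earlier sketch):

    imagelab.find_issues({"low_information": {"threshold": 0.53}, "blurry": {}, "dark": {}, "exact_duplicates": {}})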
6 changes: 6 additions & 0 deletions aipipeline/projects/cfe/config/config.yml
@@ -17,6 +17,12 @@ data:
version: "Baseline"
labels: "bloom,long_particle_blur,diatom_chain,aggregate,artifact,phaeocystis,copepod,rhizaria,particle_blur,larvacean,fecal_pellet,football,centric_diatom,gelatinous"
download_args: ["--verified"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
2 changes: 2 additions & 0 deletions aipipeline/projects/i2map/config/config.yml
@@ -9,6 +9,8 @@ data:
version: "Baseline"
labels: "all"
download_args: ["--verified"]
cleanvision_issues:
- name: "exact_duplicates"

mounts:
- name: "video"
2 changes: 2 additions & 0 deletions aipipeline/projects/i2mapbulk/config/config.yml
@@ -11,6 +11,8 @@ data:
version: "Baseline"
labels: "Acanthamunnopsis miller,Bathochordaeus sinker,Bolinopsis,Cydippida,Eusergestes similis,Solmissus,Actinopterygii,Pyrosoma,Salpa fusiformis,krill molt,Merluccius productus,Radiozoa,Thalassocracy inconstans"
download_args: ["--verified"]
cleanvision_issues:
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
6 changes: 6 additions & 0 deletions aipipeline/projects/m3/config/config.yml
@@ -11,6 +11,12 @@ data:
# labels: "Ctenophora sp. A,Aegina citrea,Aegina rosea,Aegina sp. 1,Aglantha,Aglantha digitale,Beroe,Beroe abyssicola,Beroe forskalii,Beroe gracilis,Colobonema,Colobonema sericeum,Merluccius productus,Pantachogon,Pantachogon haeckeli,Praya dubia,Praya dubia nectosome,Teuthoidea,Thalassocalyce,Thalassocalyce inconstans,Vampyroteuthis infernalis,Vitreosalpa gemini,larvacean house,larvacean house outer filter"
labels: "all"
download_args: [""]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "hustvl/yolos-tiny"
638 changes: 638 additions & 0 deletions aipipeline/projects/uav/clean_vision.ipynb


6 changes: 6 additions & 0 deletions aipipeline/projects/uav/config/config.yml
@@ -23,6 +23,12 @@ data:
version: "Baseline"
labels: "all"
download_args: ["--verified"]
cleanvision_issues:
- name: "low_information"
threshold: 0.53
- name: "blurry"
- name: "dark"
- name: "exact_duplicates"

sdcat:
model: "MBARI/yolov5x6-uavs-oneclass"
28 changes: 19 additions & 9 deletions justfile
@@ -20,13 +20,23 @@ install: update_trackers
update_trackers:
conda env update --file environment.yml --prune

# Copy dev code to the project on doris
cp-dev:
cp ./aipipeline/projects/bio/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/
cp ./aipipeline/projects/bio/model/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/model/
cp ./aipipeline/projects/bio/core/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/core/

# Copy core dev code to the project on doris
cp-core:
cp ./aipipeline/prediction/*.py /Volumes/dcline/code/aipipeline/aipipeline/prediction/
cp ./aipipeline/metrics/*.py /Volumes/dcline/code/aipipeline/aipipeline/metrics/
cp justfile /Volumes/dcline/code/aipipeline/justfile

# Copy uav dev code to the project on doris
cp-dev-uav:
cp ./aipipeline/projects/uav/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/uav/
cp ./aipipeline/projects/uav/config/* /Volumes/dcline/code/aipipeline/aipipeline/projects/uav/config/

# Copy bio dev code to the project on doris
cp-dev-bio:
cp ./aipipeline/projects/bio/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/
cp ./aipipeline/projects/bio/config/* /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/config/
cp ./aipipeline/projects/bio/model/*.py /Volumes/dcline/code/aipipeline/aipipeline/projects/bio/model/

# Generate a tsne plot of the VSS database
plot-tsne-vss project='uav':
@@ -172,12 +182,12 @@ crop project='uav' *more_args="":
{{more_args}}
# Download and crop
download-crop project='uav':
download-crop project='uav' *more_args="":
#!/usr/bin/env bash
export PYTHONPATH=.
time conda run -n aipipeline --no-capture-output python3 aipipeline/prediction/download_crop_pipeline.py \
--config ./aipipeline/projects/{{project}}/config/config.yml \
--skip-clean True
{{more_args}}
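With the pass-through in place, extra flags can be supplied at the call site; a hypothetical invocation forwarding the pipeline's own --skip-clean flag:

    just download-crop uav --skip-clean True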
# Download only
download project='uav':
@@ -261,10 +271,10 @@ run-mega-track-bio-dive dive='/mnt/M3/mezzanine/Ventana/2022/09/4432' gpu_id='0'
echo "Processing $video"
time conda run -n aipipeline --no-capture-output python3 aipipeline/projects/bio/process.py \
--config ./aipipeline/projects/bio/config/config.yml \
--max-frames-tracked 200 --min-score-det 0.1 --batch-size 60 --min-score-track 0.1 --min-frames 5 --version mega-vits-track-gcam \
--max-frames-tracked 200 --min-score-det 0.1 --batch-size 34 --min-score-track 0.1 --min-frames 5 --version mega-vits-track-gcam \
--vits-model /mnt/DeepSea-AI/models/m3midwater-vit-b-16 \
--det-model /mnt/DeepSea-AI/models/megadet \
--stride-fps 1 --video $video --gpu-id {{gpu_id}}
--stride 16 --video $video --gpu-id {{gpu_id}}
}
export -f process_file
# Run 1 video in parallel
