<!-- index.html -->

<!DOCTYPE html>
<!-- The document language is declared so screen readers, translation tools and search engines treat the text as English. -->
<html lang="en">

<head>
  <!-- The encoding declaration comes first so the parser sees it before any non-ASCII text. -->
  <meta charset="utf-8">
  <!-- Render at device width on phones/tablets instead of a zoomed-out desktop layout. -->
  <meta name="viewport" content="width=device-width, initial-scale=1">
  <title>Double Y: Building Extraction Generalization</title>
  <link rel="icon" type="image/png" href="static/images/orbit.png">
  <link href="https://fonts.googleapis.com/css?family=Google+Sans|Noto+Sans|Castoro" rel="stylesheet">

  <!-- Meta tags for social media banners; these should be filled in appropriately as they are your "business card" -->
  <meta name="description" content="Double Y: Building Extraction Generalization">
  <meta property="og:title" content="Double Y: Building Extraction Generalization">
  <meta property="og:description" content="Cross-City Building Instance Segmentation: From More Data to Diffusion-Augmentation">
  <meta property="og:url" content="https://github.com/DoubleY-BEGC2024">

  <!-- Stylesheets: Bulma and its carousel/slider add-ons, icon fonts, then the page-specific overrides last. -->
  <link rel="stylesheet" href="static/css/bulma.min.css">
  <link rel="stylesheet" href="static/css/bulma-carousel.min.css">
  <link rel="stylesheet" href="static/css/bulma-slider.min.css">
  <link rel="stylesheet" href="static/css/fontawesome.all.min.css">
  <link rel="stylesheet" href="https://cdn.jsdelivr.net/gh/jpswalsh/academicons@1/css/academicons.min.css">
  <link rel="stylesheet" href="static/css/index.css">

  <!-- Scripts. NOTE(review): presumably index.js relies on jQuery and the Bulma plugins being loaded first; keep this order unless verified. -->
  <script src="https://ajax.googleapis.com/ajax/libs/jquery/3.5.1/jquery.min.js"></script>
  <script src="https://documentcloud.adobe.com/view-sdk/main.js"></script>
  <script defer src="static/js/fontawesome.all.min.js"></script>
  <script src="static/js/bulma-carousel.min.js"></script>
  <script src="static/js/bulma-slider.min.js"></script>
  <script src="static/js/index.js"></script>
</head>

<body>

  <!-- Title block: paper title, authors, team/competition line and the row of resource buttons -->
  <section class="hero">
    <div class="hero-body">
      <div class="container is-max-desktop">
        <div class="columns is-centered">
          <div class="column has-text-centered">
            <h1 class="title is-1 publication-title">Cross-City Building Instance Segmentation: From More Data to Diffusion-Augmentation</h1>
            <div class="is-size-5 publication-authors">
              <!-- Paper authors -->
              <span class="author-block">
                <a href="https://www.linkedin.com/in/wongyijie/" target="_blank" rel="noopener">Yi Jie WONG</a>,</span>
              <span class="author-block">
                <a href="https://www.linkedin.com/in/yinloonkhor/" target="_blank" rel="noopener">Yin Loon KHOR</a></span>
            </div>

            <div class="is-size-5 publication-authors">
              <span class="author-block"><strong>Group Name:</strong> Double-Y | <strong>Public
                  Leaderboard:</strong> 1st out of 68 Entrants <br>
                <a href="https://www.kaggle.com/competitions/building-extraction-generalization-2024/overview">IEEE BigData Cup 2024: Building
                Extraction Generalization Challenge</a>
              </span>

              <!-- <span class="eql-cntrb"><small><br><sup>*</sup>Indicates Equal Contribution</small></span> -->
            </div>

            <div class="column has-text-centered">
              <div class="publication-links">

                <!-- Technical report (local PDF; spaces in the file name are percent-encoded so the URL is valid) -->
                <span class="link-block">
                  <a href="static/pdfs/DoubleY%20Technical%20Report.pdf" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Technical Report</span>
                  </a>
                </span>

                <!-- Full paper (TechRxiv DOI) -->
                <span class="link-block">
                  <a href="https://doi.org/10.36227/techrxiv.173091008.80781383/v1" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fas fa-file-pdf"></i>
                    </span>
                    <span>Full Paper</span>
                  </a>
                </span>

                <!-- Github link -->
                <span class="link-block">
                  <a href="https://github.com/DoubleY-BEGC2024/OurSolution" target="_blank" rel="noopener"
                    class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-github"></i>
                    </span>
                    <span>Source Code</span>
                  </a>
                </span>

                <!-- Best model (query-string ampersands are written as &amp; so they are never read as character references) -->
                <span class="link-block">
                  <a href="https://www.dropbox.com/scl/fi/cdrl62i3mx9p82lqwpik5/yolov8m-seg_LasVegas.pt?rlkey=8ao7a5zz7xnqfd74deffprix2&amp;st=m5kth3w0&amp;dl=0"
                    target="_blank" rel="noopener" class="external-link button is-normal is-rounded is-dark">
                    <span class="icon">
                      <i class="fab fa-dropbox"></i>
                    </span>
                    <span>Best Model</span>
                  </a>
                </span>

              </div>
            </div>
          </div>
        </div>
      </div>
    </div>
  </section>


<!-- Teaser GIF: a single image of model predictions shown directly below the title block -->
<section class="hero teaser">
  <div class="container is-max-desktop">
    <div class="hero-body">
      <img src="static/images/segmentations/output.gif" alt="Inference samples predicted by our trained model.">
      <!-- The subtitle heading acts as the visible caption for the image above -->
      <h2 class="subtitle">
        Examples of Building Segmentation.
      </h2>
    </div>
  </div>
</section>
<!-- End teaser GIF -->


  <!-- Summary (paper abstract) -->
  <!-- NOTE(review): the baseline private F1 quoted below (0.663) differs from the baseline row of the
       comparison table further down the page (0.66531); confirm which value is correct. -->
  <section class="section hero is-light">
    <div class="container is-max-desktop">
      <div class="columns is-centered has-text-centered">
        <div class="column is-four-fifths">
          <h2 class="title is-3">Summary</h2>
          <div class="content has-text-justified">
            <p>Deep learning has significantly advanced the field of building extraction from remote sensing images, providing robust solutions
              for identifying and delineating building footprints. However, a major challenge persists in the form of domain adaptation, particularly
              when addressing cross-city variations. The primary challenge lies in the significant differences in building appearances across cities,
              influenced by variations in building shapes and environmental characteristics. Consequently, models trained on data from one city often
              struggle to accurately identify buildings in another city. In this paper, we address this challenge from a data-centric perspective,
              focusing on diversifying the training set. Our empirical results show that improving data diversity via open-source datasets and
              diffusion augmentation significantly improved the performance of the segmentation model. Our baseline model, trained with no extra dataset,
              only achieved a private F1 score of 0.663. On the other hand, our best model, trained with the additional Las Vegas building footprints
              extracted from the Microsoft Building Footprint dataset, achieved a high private F1 score of 0.703. Surprisingly, we found that diffusion
              augmentation helps improve our model score to 0.681 without requiring an extra dataset, which is higher than the baseline model. Finally,
              we also experimented with the Non-Maximal Suppression (NMS) hyperparameter to improve the model’s performance in segmenting dense and
              small objects, which gave us a high private F1 score of 0.897. Our source code and the pretrained models are publicly available at
              <a href="https://github.com/DoubleY-BEGC2024/OurSolution" target="_blank" rel="noopener">https://github.com/DoubleY-BEGC2024/OurSolution</a>.
            </p>
          </div>
        </div>
      </div>
    </div>
  </section>
  <!-- End summary -->

  <!-- Competition Overview: objective, training data and test data of the challenge -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Competition Overview</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p><strong>1. Objective:</strong> This competition takes on this challenge by utilizing a building footprint dataset from the Tokyo area
            as the primary training set, with plans to extend testing to other Japanese regions. This approach aims to inspire the development of models
            with robust generalization capabilities, capable of overcoming the hurdles of automatic building footprint detection and extraction across
            various landscapes. Overcoming this challenge signifies the creation of a novel approach for efficient, cost-effective, and precise building
            footprint extraction at a national level with minimal regional data, showcasing its potential applicability worldwide.</p>
          <p><strong>2. Mandatory Training Data:</strong> The training set uses 0.3-meter Google Earth satellite images complemented by meticulously, manually annotated building outlines.
              A total of 4717 images are provided, all of which are extracted from within the Tokyo vicinity. The training data was divided into a training set
              and a validation set with a ratio of 8:2.</p>
          <p><strong>3. Mandatory Test Data:</strong> The imagery and building annotations for both test sets derive from the open-source Japanese 3D city model, the Plateau project
              (<a href="https://www.mlit.go.jp/plateau/" target="_blank" rel="noopener">https://www.mlit.go.jp/plateau/</a>),
              enhanced with manual adjustments following visual inspection. All test images were randomly selected from 42 cities in Japan, but a balance of different types of
              areas was maintained. A total of 250 images were taken from each region, totaling 1,000 images.</p>
        </div>
      </div>
      <br><br>
    </div>
  </div>
  <!-- End competition overview -->


  <!-- Methodology Overview: one figure per method. Spaces in the image file names are percent-encoded so the URLs are valid. -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <h2 class="title is-3" style="white-space: nowrap;">Method 1: Additional Open-Sourced Dataset</h2>
      <div class="content has-text-justified">
        <div style="text-align: center;">
          <img src="static/images/ROI%20cropped%20from%20Microsoft%20BF%20Dataset.png" alt="Microsoft BF Dataset" width="820">
          <p class="caption" style="width: 100%; text-align: justify;">Figure 1: The region of interest for the building footprints
            extracted from the Microsoft Building Footprint (BF) dataset. (a) Redmond, Washington. (b) Las Vegas, Nevada.
            For simplicity's sake, we refer to the former as the Redmond dataset, and the latter as the Las Vegas dataset.</p>
        </div>
      </div>
      <br>
    </div>
  </div>

  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <h2 class="title is-3" style="white-space: nowrap;">Method 2: Diffusion Augmentation</h2>
      <div class="content has-text-justified">
        <div style="text-align: center;">
          <img src="static/images/Segmentation%20Guided%20Diffusion.jpg" alt="Segmentation Guided Diffusion" width="820">
          <p class="caption" style="width: 100%; text-align: justify;">Figure 2: The proposed diffusion augmentation pipeline.
            (1) Use a pretrained segmentation model to generate the semantic segmentation.
            (2) Refine the segmentation mask using the building polygon labels.
            (3) Concatenate the input image with the semantic mask.
            (4) Train the segmentation-guided diffusion model using the concatenated inputs.</p>
        </div>
      </div>
      <br>
    </div>
  </div>
  <!-- End methodology overview -->


  <!-- Model Selection: comparison table of candidate architectures followed by the authors' observations -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Model Selection</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>The YOLOv8 series comes with several instance segmentation models, ranging from the smallest nano (n) variant to the
            largest extra-large (x) variant. We performed several experiments to select the best YOLOv8 variant for our task,
            considering both the F1 score and model complexity, as shown in Table I. Additionally, we compared the performance of
            YOLOv8-based instance segmentation models with other state-of-the-art models, including YOLOv9, Mask R-CNN, and EfficientNet.
            All models are trained for 50 epochs with an image size of 640. During test time and submission, the confidence and NMS IoU
            thresholds are set to 0.20 and 0.70, unless stated otherwise. We also tested the F1-score of the models with a confidence
            threshold of 0.50, primarily to evaluate how confident the models are rather than for actual submission.
          </p>
          <!-- Two-level header: scope attributes tie each data cell to its column (group) for assistive technology -->
          <table class="table is-bordered is-hoverable">
            <thead>
              <tr>
                <th rowspan="2" scope="col">Model</th>
                <th rowspan="2" scope="col">Pretrained Weights</th>
                <th rowspan="2" scope="col">Batch Size</th>
                <th rowspan="2" scope="col">Params (M)</th>
                <th rowspan="2" scope="col">FLOPs (G)</th>
                <th colspan="2" scope="colgroup">Public F1-Score</th>
              </tr>
              <tr>
                <th scope="col">Conf = 0.50</th>
                <th scope="col">Conf = 0.20</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>YOLOv8n-seg</td>
                <td rowspan="4">DOTAv1 Aerial Detection</td>
                <td>16</td>
                <td>3.4</td>
                <td>12.6</td>
                <td>0.510</td>
                <td>0.645</td>
              </tr>
              <tr>
                <td>YOLOv8s-seg</td>
                <td>16</td>
                <td>11.8</td>
                <td>42.6</td>
                <td>0.535</td>
                <td>0.654</td>
              </tr>
              <tr>
                <td>YOLOv8m-seg</td>
                <td>16</td>
                <td>27.3</td>
                <td>110.2</td>
                <td>0.592</td>
                <td>0.649</td>
              </tr>
              <tr>
                <td>YOLOv8x-seg</td>
                <td>8</td>
                <td>71.8</td>
                <td>344.1</td>
                <td>0.579</td>
                <td>0.627</td>
              </tr>
              <tr>
                <td>YOLOv9c-seg</td>
                <td rowspan="2">COCO Segmentation</td>
                <td>4</td>
                <td>27.9</td>
                <td>159.4</td>
                <td>0.476</td>
                <td>0.577</td>
              </tr>
              <tr>
                <td>Mask R-CNN (MPViT-Tiny)</td>
                <td>4</td>
                <td>17</td>
                <td>196.0</td>
                <td>-</td>
                <td>0.596</td>
              </tr>
              <tr>
                <td>EfficientNet-b0-YOLO-seg</td>
                <td>ImageNet</td>
                <td>4</td>
                <td>6.4</td>
                <td>12.5</td>
                <td>-</td>
                <td>0.560</td>
              </tr>
            </tbody>
          </table>

          <p>Our observations:</p>
          <!-- The start tag previously carried stray tokens (padding-left: 0;") outside any attribute value; they were
               parsed as bogus attributes and had no effect, so dropping them keeps the rendering unchanged. -->
          <ol type="1">
            <li style="margin-bottom: 5px;">Generally, we observe that the F1 score increases when scaling up the model from
              the smallest YOLOv8n-seg to the medium-sized YOLOv8m-seg. Notably, there is a significant jump in F1 score
              from the YOLOv8s-seg to the YOLOv8m-seg when evaluated at a confidence threshold of 0.50, with the score improving
              from 0.535 to 0.592. </li>
            <li style="margin-bottom: 5px;">Interestingly, the F1 score of YOLOv8m-seg is slightly lower than the smaller YOLOv8s-seg
              when setting the confidence threshold to 0.20. This observation suggests that the m-variant still has more room for
              improvement compared to the s-variant. </li>
            <li style="margin-bottom: 5px;">Meanwhile, the largest YOLOv8x-seg variant has a lower F1 score than YOLOv8m-seg at
              both confidence thresholds, 0.50 and 0.20. This suggests that further improvements in F1 score beyond the m-variant
              may be minimal unless we enhance the quality of the training dataset or address generalizability issues.</li>
            <li style="margin-bottom: 5px;"> We also tried training a YOLOv9 instance segmentation model. Specifically, we chose YOLOv9c,
              which corresponds to the m-variant of YOLOv8 (YOLOv8m-seg). However, we find that YOLOv9 is hard to train and slower
              due to the high FLOPs, with no F1-score improvement.</li>
            <li style="margin-bottom: 5px;"> Other than the YOLO family, we also tried using Mask R-CNN with the MPViT backbone. However,
              due to resource constraints, we were only able to test the smallest MPViT-Tiny for our Mask R-CNN backbone. The modified
              Mask R-CNN has a slightly higher F1 score than YOLOv9c, but is inferior to all YOLOv8 variants we tried. It is not only
              slow, but also not accurate.</li>
            <li style="margin-bottom: 5px;"> Lastly, we tried replacing the YOLOv5 backbone with EfficientNet, which is a lightweight yet
              effective CNN model designed for settings with low computational power. We use the smallest EfficientNet-b0 as the replacement backbone.
              It is able to reach a considerably high F1 score (close to both YOLOv9c and Mask R-CNN) with significantly lower FLOPs.
              However, the F1 score is still significantly lower than that of the YOLOv8 series. We believe the performance could be further improved
              by pretraining the EfficientNet with COCO segmentation, and scaling up the EfficientNet backbone.</li>
            <li style="margin-bottom: 5px;"> In short, we preferred using the YOLOv8 series due to its exceptional balance between speed and
              accuracy. Specifically, we use the YOLOv8m-seg variant due to its high performance.</li>
          </ol>

        </div>
      </div>
    </div>
  </div>
  <!-- End model selection -->


  <!-- Dataset: public F1 score of YOLOv8m-seg for each training-set variant, followed by per-setup notes -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Dataset and Performance Improvement</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>We experimented with the performance of YOLOv8m-seg by varying the training dataset, as shown in the table below:
          </p>
          <!-- Header row lives in <thead> (like the other tables on this page) so it is exposed as a header, not as data -->
          <table class="table is-bordered is-hoverable">
            <thead>
              <tr>
                <th scope="col">Setup</th>
                <th scope="col">Dataset</th>
                <th scope="col">Public F1 Score</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>A</td>
                <td>BEGC 2024</td>
                <td>0.649</td>
              </tr>
              <tr>
                <td>B</td>
                <td>BEGC 2024 + Redmond Dataset</td>
                <td>0.660</td>
              </tr>
              <tr>
                <td>C</td>
                <td>BEGC 2024 + Las Vegas Dataset</td>
                <td>0.686</td>
              </tr>
              <tr>
                <td>D</td>
                <td>BEGC 2024 + Diffusion Augmentation</td>
                <td>0.672</td>
              </tr>
              <tr>
                <td>E</td>
                <td>BEGC 2024 + CutMix Dataset</td>
                <td>0.650</td>
              </tr>
            </tbody>
          </table>


          <p>Our observations:</p>
          <ul style="list-style-type: none; padding-left: 0;">
            <li style="margin-bottom: 5px;"><strong>Setup A:</strong> 0.649 - This setup represents the performance baseline,
              where YOLOv8m-seg was trained solely on the provided BEGC2024 training dataset. As expected, this setup
              resulted in the lowest F1-score, likely due to the lack of diversity in the training data. </li>
            <li style="margin-bottom: 5px;"><strong>Setup B:</strong> 0.660 - We trained YOLOv8m-seg using both the BEGC2024
              training set and our Redmond dataset. This simple step of diversifying the training data led to a significant
              increase in the F1-score, from 0.649 to 0.660.  </li>
            <li style="margin-bottom: 5px;"><strong>Setup C:</strong> 0.686 - Surprisingly, using the Las Vegas dataset resulted
              in an even higher public F1-score of 0.686, as shown in Setup C. We believe the Las Vegas dataset
              yields a greater improvement in F1 score because of its greater semantic difference from the BEGC2024 training set,
              which helps enhance the model's ability to generalize to the test set. </li>
            <li style="margin-bottom: 5px;"><strong>Setup D:</strong> 0.672 - Surprisingly, the YOLOv8m-seg model
              trained with the BEGC2024 dataset using diffusion augmentation achieved a considerably high F1 score of 0.672.
              This F1 score is even higher than that of Setup B, which was trained with the Redmond dataset. This observation
              demonstrates that our diffusion augmentation method successfully created semantically different images that were
              sufficient to diversify the BEGC2024 training set. </li>
            <li style="margin-bottom: 5px;"><strong>Setup E:</strong> 0.650 - We also tried CutMix augmentation to diversify the
              training dataset and improve the generalization of our model. However, we found this method to be less effective,
              achieving an F1 score of only 0.650, as shown in Setup E of Table III. The F1 score improvement was almost negligible
              compared to our baseline in Setup A. We believe this is due to the lack of variation in building structures, as we
              only changed the backgrounds. This highlights the importance of diversifying both building shapes and background
              textures to improve the model's generalization. </li>
          </ul>

        </div>
      </div>
    </div>
  </div>
  <!-- End dataset -->


  <!-- Kaggle Leaderboard: the team's four configurations compared with the 2nd and 3rd place solutions -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Comparison with 2nd and 3rd Place Entrants</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>We compare our solutions with the 2nd and 3rd place entries on the leaderboard:
          </p>
          <!-- Both header rows use <th> inside <thead>; "Public"/"Private" are column headers, not data cells -->
          <table class="table is-bordered is-hoverable">
            <thead>
              <tr>
                <th rowspan="2" scope="col">Solution</th>
                <th rowspan="2" scope="col">FLOPs (G)</th>
                <th colspan="2" scope="colgroup">F1-Score</th>
              </tr>
              <tr>
                <th scope="col">Public</th>
                <th scope="col">Private</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>YOLOv8m-seg + BEGC 2024</td>
                <td rowspan="4">110.2</td>
                <td>0.64926</td>
                <td>0.66531</td>
              </tr>
              <tr>
                <td>YOLOv8m-seg + BEGC 2024 + Redmond Dataset</td>
                <td>0.65951</td>
                <td>0.67133</td>
              </tr>
              <tr>
                <td>YOLOv8m-seg + BEGC 2024 + Las Vegas Dataset</td>
                <td>0.68627</td>
                <td>0.70326</td>
              </tr>
              <tr>
                <td>YOLOv8m-seg + BEGC 2024 + Diffusion Augmentation</td>
                <td>0.67189</td>
                <td>0.68096</td>
              </tr>
              <tr>
                <td>2nd place (RTMDet-x + Alabama Buildings Segmentation Dataset)</td>
                <td>141.7</td>
                <td>0.6813</td>
                <td>0.68453</td>
              </tr>
              <tr>
                <td>3rd place (Custom Mask-RCNN + No extra Dataset)</td>
                <td>124.1</td>
                <td>0.59314</td>
                <td>0.60649</td>
              </tr>
            </tbody>
          </table>

          <p>Our observations:</p>
          <!-- The start tag previously carried stray tokens (padding-left: 0;") outside any attribute value; they were
               parsed as bogus attributes and had no effect, so dropping them keeps the rendering unchanged. -->
          <ol type="1">
            <li style="margin-bottom: 5px;">Generally, using an additional dataset, whether it is an open-sourced dataset or a
              synthetic dataset, helps improve the training of the model. </li>
            <li style="margin-bottom: 5px;">However, you might sample high-quality or low-quality additional datasets from
              open-sourced databases without careful engineering. For instance, using the Redmond dataset only slightly
              improves the F1 score compared to using the BEGC 2024 dataset alone. On the other hand, using the Las Vegas dataset
              significantly improves the F1 score, achieving the top F1 score among all methods.</li>
            <li style="margin-bottom: 5px;">On the other hand, using our diffusion augmentation, we can generate a synthetic dataset
              to train YOLOv8m-Seg without needing an additional dataset (which means no extra manual annotation is required).
              Using BEGC2024 combined with the synthetic dataset, our YOLOv8m-Seg model achieved an F1 score that is significantly
              higher than the baseline and close to our top-1 score (using the Las Vegas dataset) and the 2nd-place solution.</li>
            <li style="margin-bottom: 5px;">Note that the 2nd-place solution uses a bigger model (higher FLOPs) with an additional
              dataset to reach a high F1 score, whereas our diffusion augmentation pipeline allows our model (lower FLOPs) to
              achieve a surprisingly close F1 score without an additional dataset.</li>
          </ol>

        </div>
      </div>
    </div>
  </div>
  <!-- End Kaggle leaderboard -->


  <!-- NMS IoU Threshold: private F1 score per training set across NMS IoU thresholds from 0.70 to 0.95 -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Extra Trick: NMS IoU Threshold</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>Non-maximal suppression (NMS) can be less effective at detecting small, densely packed objects, as it relies on
            IoU to suppress overlapping bounding boxes. In scenarios involving small and dense objects, the bounding boxes
            often overlap significantly, which can lead to the suppression of true positives. We can mitigate this issue by
            increasing the IoU threshold in the NMS layer to prevent unnecessary reduction of bounding boxes. We experimented
            by increasing the IoU threshold in the NMS layer of YOLOv8m-seg from the default 0.70 to 0.95, in increments of 0.05.
          </p>
          <!-- Two-level header: scope attributes tie each data cell to its threshold column for assistive technology.
               A "-" cell means no score is reported for that combination. -->
          <table class="table is-bordered is-hoverable">
            <thead>
              <tr>
                <th rowspan="2" scope="col">Dataset</th>
                <th colspan="6" scope="colgroup">Private F1 Score (using different NMS IoU Threshold)</th>
              </tr>
              <tr>
                <th scope="col">0.70</th>
                <th scope="col">0.75</th>
                <th scope="col">0.80</th>
                <th scope="col">0.85</th>
                <th scope="col">0.90</th>
                <th scope="col">0.95</th>
              </tr>
            </thead>
            <tbody>
              <tr>
                <td>BEGC2024 + Redmond Dataset</td>
                <td>0.672</td>
                <td>0.677</td>
                <td>-</td>
                <td>-</td>
                <td>0.748</td>
                <td>0.866</td>
              </tr>
              <tr>
                <td>BEGC2024 + Las Vegas Dataset</td>
                <td>0.703</td>
                <td>0.693</td>
                <td>0.686</td>
                <td>0.721</td>
                <td>0.766</td>
                <td>0.897</td>
              </tr>
              <tr>
                <td>BEGC2024 + Diffusion Augmentation</td>
                <td>0.681</td>
                <td>-</td>
                <td>0.694</td>
                <td>0.711</td>
                <td>0.751</td>
                <td>0.887</td>
              </tr>
            </tbody>
          </table>

          <p>Our observations:</p>
          <!-- The start tag previously carried stray tokens (padding-left: 0;") outside any attribute value; they were
               parsed as bogus attributes and had no effect, so dropping them keeps the rendering unchanged. -->
          <ol type="1">
            <li style="margin-bottom: 5px;">Generally, we found that IoU thresholds of 0.90 and 0.95 work best compared to
              other threshold settings. </li>
            <li style="margin-bottom: 5px;">Note that simply increasing the IoU threshold does not directly translate to
              better performance, as it may lead to an increase in false positives that should have been suppressed by the
              NMS layers. </li>
            <li style="margin-bottom: 5px;">For instance, setting the IoU threshold between 0.75 and 0.80 is generally
              worse than the default 0.70 threshold. </li>
            <li style="margin-bottom: 5px;">Hence, our final submission is the YOLOv8m-seg model trained on the BEGC2024 and
              Las Vegas datasets, with the IoU threshold for NMS set to 0.95.  </li>
            <li style="margin-bottom: 5px;">In future work, we will consider trying more advanced NMS variants, including attention-based NMS
              and density-based NMS, to better mitigate this problem.  </li>
          </ol>

        </div>
      </div>
    </div>
  </div>
  <!-- End NMS IoU threshold -->

  
  <!-- Conclusion: three key takeaways. Claims here are kept consistent with the scores in the tables above. -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Key Takeaways</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>
            <strong>1. Dataset quality is what you need:</strong> There are 2 observations from our study. Firstly,
            data diversity is important to mitigate the generalization challenge. For instance, the Las Vegas dataset offers higher diversity
            (i.e., desert backgrounds, different building shapes) as compared to the Redmond dataset, which is semantically more
            similar to the provided BEGC2024 training set. Hence, the performance of our model trained with BEGC2024 + Las Vegas dataset
            is better than with BEGC2024 + Redmond dataset.
          </p>
          <p>
            <strong>2. Diffusion Augmentation is label-efficient:</strong> Diffusion augmentation is what you need if you do not have
            an extra dataset that is sufficiently different from the original training set. For instance, the Redmond dataset is not as useful
            as the Las Vegas dataset. However, it might be difficult and/or costly to find a suitable extra dataset. On the other
            hand, we do not need an extra dataset to prepare our diffusion augmentation pipeline. Even better, BEGC2024 + Diffusion Augmentation
            outperforms BEGC2024 + Redmond dataset and the 3rd place entrant, and comes close to the 2nd place entrant!
          </p>
          <p>
            <strong>3. Start with a small model:</strong> We recommend starting with a smaller model. It is unwise to use a
            larger model when dealing with a limited dataset, as it may lead to overfitting. Our empirical study agrees with
            this hypothesis, as we failed to achieve a high F1 score using the biggest YOLOv8 version (YOLOv8x-seg). Given more time,
            we would explore training YOLOv8x-seg with all the extra datasets we gathered, and also using our diffusion
            augmentation pipeline.
          </p>
        </div>
      </div>
      <br><br>
    </div>
  </div>
  <!-- End conclusion -->

  <!-- Logo Acknowledgment: each logo links to the project's home page, so the alt text names the link destination -->
  <div class="columns is-centered has-text-centered">
    <div class="column is-four-fifths">
      <!-- h2, not h4: this is a top-level section of the page; the visual size comes from the "title is-3" classes -->
      <h2 class="title is-3" style="white-space: nowrap;">Technological Stack</h2>
      <div class="content has-text-justified">
        <div style="text-align: justify;">
          <p>
            <a href="https://github.com/ultralytics/ultralytics" target="_blank" rel="noopener"><img
                src="static/icons/ultralyticsyolo-logo.svg" alt="Ultralytics YOLO" style="width: 200px;"></a>
            <a href="https://pytorch.org/" target="_blank" rel="noopener"><img src="static/icons/pytorch-logo.svg" alt="PyTorch"
                style="width: 210px;"></a>
            <a href="https://jupyter.org/" target="_blank" rel="noopener"><img src="static/icons/jupyter-logo.png" alt="Jupyter"
                style="width: 200px;"></a>
            <a href="https://www.python.org/" target="_blank" rel="noopener"><img src="static/icons/python-logo.svg" alt="Python"
                style="width: 200px;"></a>
          </p>
        </div>
      </div>
      <br><br>
    </div>
  </div>
  <!-- End logo acknowledgment -->

  <!-- Page footer: template attribution and content licence -->
  <footer class="footer">
    <div class="container">
      <div class="columns is-centered">
        <div class="column is-8">
          <div class="content">

            <p>
              This page was built using the <a href="https://github.com/eliahuhorwitz/Academic-project-page-template"
                target="_blank" rel="noopener">Academic Project Page Template</a> which was adopted from the <a
                href="https://nerfies.github.io" target="_blank" rel="noopener">Nerfies</a> project page.
              You are free to borrow the source code of this website, we just ask that you link back to this page in the footer.
            </p>

            <p>
              This website is licensed under a <a rel="license noopener"
                href="https://creativecommons.org/licenses/by-sa/4.0/" target="_blank">Creative
                Commons Attribution-ShareAlike 4.0 International License</a>.
            </p>

          </div>
        </div>
      </div>
    </div>
  </footer>

  <!-- Default Statcounter code for EY project website -->
  <!-- 
  <script type="text/javascript">
    var sc_project = 12976265;
    var sc_invisible = 1;
    var sc_security = "c70be6f1"; 
  </script>
  <script type="text/javascript" src="https://www.statcounter.com/counter/counter.js" async></script>
  <noscript>
    <div class="statcounter"><a title="Web Analytics" href="https://statcounter.com/" target="_blank"><img
          class="statcounter" src="https://c.statcounter.com/12976265/0/c70be6f1/1/" alt="Web Analytics"
          referrerPolicy="no-referrer-when-downgrade"></a></div>
  </noscript>
  -->
  <!-- End of Statcounter Code -->

</body>

</html>