@Proceedings{MIDL-2024,
editor = "Ninon Burgos and Caroline Petitjean and Maria Vakalopoulou and Stergios Christodoulidis and Pierrick Coupe and Herv\'e Delingette and Carole Lartizien and Diana Mateus",
address = "Paris, France",
start = "2024-07-03",
end = "2024-07-05",
published = "2024-12-23",
year = "2024",
volume = "250",
booktitle = "Proceedings of The 7nd International Conference on Medical Imaging with Deep Learning",
name = "Medical Imaging with Deep Learning",
shortname = "MIDL",
conference_url = "https://2024.midl.io",
conference_number = "7"
}
@InProceedings{preface,
title = {Preface for International Conference on Medical
Imaging with Deep Learning 2024},
pages = {i--xviii},
author = {Ninon Burgos and Caroline Petitjean and Maria
Vakalopoulou and Stergios Christodoulidis and
Pierrick Coupe and Herv\'e Delingette and Carole
Lartizien and Diana Mateus},
abstract = {This volume contains the Proceedings of the Seventh
International Conference on Medical Imaging with
Deep Learning - MIDL 2024. The conference was held
from July 3 to 5, 2024, in Paris, France and it was
organized by co-chairs Ninon Burgos, Caroline
Petitjean and Maria Vakalopoulou from Centre
national de la recherche scientifique (CNRS) - Paris
Brain Institute, University of Rouen Normandie and
CentraleSupélec Université Paris Saclay,
respectively. The scientific program was organized
by a team of program chairs from CentraleSupelec,
CNRS - LaBRI, Inria Sophia-Antipolis, CNRS - CREATIS
and Centrale Nantes - LS2N.
Similar to the previous
editions of the conference, MIDL 2024 had two
submission tracks: full papers and short papers. 181
valid full papers and 124 short papers underwent a
transparent review process through the OpenReview
platform. Both the full paper and short paper track
review processes were single-blind. The papers and
reviews are publicly available through OpenReview;
papers with a final decision of rejection are not
listed on the OpenReview platform. The full
paper submissions underwent a rigorous single-blind
review process that involved a team of 5 Program
Chairs (PC), 27 Area Chairs (AC), and 202
reviewers. At least three reviews were ensured for
each submission. After desk rejection of incomplete
submissions by PC, each of the remaining 217 papers
received at least three reviews as well as a
meta-review by an AC, and the authors were allowed
to respond to the reviews during a rebuttal
period. The PC then discussed each borderline paper
over a virtual meeting to make the accept/reject
decisions and to select oral presentations. The
acceptance rate of the full paper track was $54 \%$,
with 36 oral presentations and 81 posters.
The
short paper submissions underwent a more streamlined
single-blind review process involving a team of the
same 5 PCs and 25 reviewers, most of whom had also
served as AC for the full paper track. Of the 170
submissions, 105 were accepted as posters.
We want
to thank the Area Chairs and reviewers for their
careful reviews and constructive feedback to the
authors, which made it possible to create this
robust technical program. We are grateful to our
sponsors for their financial support of the
conference. Finally, we would also like to thank the
OpenReview team for their tech support throughout
the entire process.
The articles in these
proceedings are presented in alphabetical order by
first author surnames. The papers cover a wide
range of topics, including segmentation,
representation learning, multimodal methods,
semi/weakly-supervised learning, clinical
translation \& domain adaptation, geometric deep
learning, federated learning, synthesis, explainable
AI, uncertainty and foundation models.
*Paris,
September 25, 2024*}
}
@InProceedings{xu2024,
title = "Feasibility and benefits of joint learning from MRI databases with different brain diseases and modalities for segmentation",
author = "Wentian Xu and Matthew Moffat and Thalia Seale and Ziyun Liang and Felix Wagner and Daniel Whitehouse and David Menon and Virginia Newcombe and Natalie Voets and Abhirup Banerjee and Konstantinos Kamnitsas",
abstract = "Models for segmentation of brain lesions in multi-modal MRI are commonly trained for a specific pathology using a single database with a predefined set of MRI modalities, determined by a protocol for the specific disease.This work explores the following open questions: Is it feasible to train a model using multiple databases that contain varying sets of MRI modalities and annotations for different brain pathologies? Will this joint learning benefit performance on the sets of modalities and pathologies available during training? Will it enable analysis of new databases with different sets of modalities and pathologies? We develop and compare different methods and show that promising results can be achieved with appropriate, simple and practical alterations to the model and training framework. We experiment with 7 databases containing 5 types of brain pathologies and different sets of MRI modalities. Results demonstrate, for the first time, that joint training on multi-modal MRI databases with different brain pathologies and sets of modalities is feasible and offers practical benefits. It enables a single model to segment pathologies encountered during training in diverse sets of modalities, while facilitating segmentation of new types of pathologies such as via follow-up fine-tuning. The insights this study provides into the potential and limitations of this paradigm should prove useful for guiding future advances in the direction. Code and pretrained models: https://github.com/WenTXuL/MultiUnet",
pages = "1771--1784"
}
@InProceedings{louiset2024,
title = "SepVAE: a contrastive VAE to separate pathological patterns from healthy ones",
author = "Robin Louiset and Edouard Duchesnay and Grigis Antoine and Benoit Dufumier and Pietro Gori",
abstract = "Contrastive Analysis VAE (CA-VAEs) is a family of Variational auto-encoders (VAEs) that aims at separating the common factors of variation between a \textit{background} dataset (BG) (\textit{i.e.,} healthy subjects) and a \textit{target} dataset (TG) (\textit{i.e.,} patients) from the ones that only exist in the target dataset. To do so, these methods separate the latent space into a set of \textbf{salient} features (\textit{i.e.,} proper to the target dataset) and a set of \textbf{common} features (\textit{i.e.,} exist in both datasets). Currently, all CA-VAEs models fail to prevent sharing of information between the latent spaces and to capture all salient factors of variation. To this end, we introduce two crucial regularization losses: a disentangling term between common and salient representations and a classification term between background and target samples in the salient space. We show a better performance than previous CA-VAEs methods on three medical applications and a natural images dataset (CelebA).",
pages = "918--936"
}
@InProceedings{xie2024,
title = "Semi-Supervised Segmentation via Embedding Matching",
author = "Weiyi Xie and Nathalie Willems and Nikolas Lessmann and Tom Gibbons and Daniele De Massari",
abstract = "Deep convolutional neural networks are widely used in medical image segmentation but require many labeled images for training. Annotating three-dimensional medical images is a time-consuming and costly process. To overcome this limitation, we propose a novel semi-supervised segmentation method that leverages mostly unlabeled images and a small set of labeled images in training. Our approach involves assessing prediction uncertainty to identify reliable predictions on unlabeled voxels from the teacher model. These voxels serve as pseudo-labels for training the student model. In voxels where the teacher model produces unreliable predictions, pseudo-labeling is carried out based on voxel-wise embedding correspondence using reference voxels from labeled images.We applied this method to automate hip bone segmentation in CT images, achieving notable results with just 4 CT scans. The proposed approach yielded a Hausdorff distance with 95th percentile (HD95) of 3.30 and IoU of 0.929, surpassing existing methods achieving HD95 (4.07) and IoU (0.927) at their best.",
pages = "1741--1753"
}
@InProceedings{maack2024,
title = "Efficient Anatomy Segmentation in Laparoscopic Surgery using Multi-Teacher Knowledge Distillation",
author = "Lennart Maack and Finn Behrendt and Debayan Bhattacharya and Sarah Latus and Alexander Schlaefer",
abstract = "Automatic segmentation of anatomical structures in laparoscopic images or videos is an important prerequisite for visual assistance tools which are designed to increase efficiency and safety during an intervention.In order to be used in a realistic clinical scenario, both high accuracy and real-time capability are required.Current deep learning networks for anatomy segmentation show high accuracy, but are not suitable for real-time clinical application due to their large size.As smaller, real-time capable deep learning networks show lower segmentation performance, we propose a multi-teacher knowledge distillation approach applicable to partially labeled datasets.We leverage the knowledge of multiple anatomy-specific, high-accuracy teacher networks to improve the segmentation performance of a single and efficient student network capable of segmenting multiple anatomies simultaneously.To do so, we minimize the Kullback-Leibler divergence between the normalized anatomy-specific teacher logits and the respective normalized logits of the student.We conduct experiments on the Dresden Surgical Anatomy Dataset, which provides multiple subsets of binary segmented anatomical structures.Results show that our approach can increase the overall Dice score for different real-time capable network architectures for anatomy segmentation.",
pages = "937--948"
}
@InProceedings{chakravarty2024,
title = "Predicting Age-related Macular Degeneration Progression from Retinal Optical Coherence Tomography with Intra-Subject Temporal Consistency",
author = "Arunava Chakravarty and Taha Emre and Dmitrii Lachinov and Antoine Rivail and Ursula Schmidt-Erfurth and Hrvoje Bogunovi\'c",
abstract = "The wide variability in the progression rates of Age-Related Macular Degeneration (AMD) and the absence of well-established clinical biomarkers make it difficult to predict an individual\'s risk of AMD progression from intermediate stage (iAMD) to late dry stage (dAMD) using Optical Coherence Tomography (OCT) scans.To address this challenge, we propose to jointly train an AMD stage classifier to discriminate between iAMD and dAMD with a Neural-ODE that models the future trajectory of the disease progression in the learned embedding space. A temporal ordering is imposed such that the distance of a scan from the decision hyperplane of the AMD stage classifier is inversely related to its time-to-conversion. In addition, an intra-subject temporal consistency in the predicted conversion risk scores is ensured by incorporating a pair of longitudinal scans from the same eye during training. We evaluated our proposed method on a longitudinal dataset comprising 235 eyes (3,534 OCT scans) with 40 converters. The results demonstrate the effectiveness of our approach, achieving an average area under the ROC of 0.84 for predicting conversion within the next 6, 12, 18 and 24 months. Additionally, the Concordance Index of 0.78 surpasses the performance of several popular methods for survival analysis.",
pages = "184--198"
}
@InProceedings{omidi2024,
title = "Unsupervised Domain Adaptation of Brain MRI Skull Stripping Trained on Adult Data to Newborns: Combining Synthetic Data with Domain Invariant Features",
author = "Abbas Omidi and Amirmohammad Shamaei and Anouk Verschuu and Regan King and Lara Leijser and Roberto Souza",
abstract = "Skull-stripping constitutes a crucial initial step in neuroimaging analysis, and supervised deep-learning models have demonstrated considerable success in automating this task. However, a notable challenge is the limited availability of publicly accessible newborn brain MRI datasets. Furthermore, these datasets frequently use diverse post-processing techniques to improve image quality, which may not be consistently feasible in all clinical settings. Additionally, manual segmentation of newborn brain MR images is labor-intensive and demands specialized expertise, rendering it inefficient. While various adult brain MRI datasets with skull-stripping masks are publicly available, applying supervised models trained on these datasets directly to newborns poses a challenge due to domain shift. We propose a methodology that combines domain adversarial models to learn domain-invariant features between newborn and adult data, along with the integration of synthetic data generated using a Gaussian Mixture Model (GMM) as well as data augmentation procedures. The GMM method facilitates the creation of synthetic brain MR images, ensuring a diverse and representative input from multiple domains within our source dataset during model training. The data augmentation procedures were tailored to make the adult MRI data distribution closer to the newborn data distribution. Our results yielded an overall Dice coefficient of 0.9308 ± 0.0297 (mean± std), outperforming all compared unsupervised domain adaptation models and surpassing some supervised techniques previously trained on newborn data. This project\'s code and trained models\' weights are publicly available at https://github.com/abbasomidi77/GMM-Enhanced-DAUnet",
pages = "1073--1085"
}
@InProceedings{schwarz2024,
title = "A Patch-based Student-Teacher Pyramid Matching Approach to Anomaly Detection in 3D Magnetic Resonance Imaging",
author = {Johannes Schwarz and Lena Will and J\"org Wellmer and Axel Mosig},
abstract = "Anomaly detection on 3D magnet resonance images (MRI) is of high medical relevance in the context of detecting lesions associated with different diseases. Yet, reliable anomaly detection in MRI images involves major challenges, specifically taking into account information in 3D, and the need to localize relatively small and subtle abnormalities within the context of whole organ MRIs. In this paper, a top-down approach, which uses student-teacher feature pyramid matching (STFPM) for detecting anomalies at image and voxel level, is applied to 3D brain MRI inputs. The combination of a 3D patch based self-supervised pre-training and axial-coronal-sagittal (ACS) convolutions pushes the performance above that of f-AnoGAN (bottom-up). The evaluation is based on a tumor dataset.",
pages = "1357--1370"
}
@InProceedings{ren2024,
title = "Re-DiffiNet: Modeling discrepancy in tumor segmentation using diffusion models",
author = "Tianyi Ren and Abhishek Sharma and Juampablo E Heras Rivera and Lakshmi Harshitha Rebala and Ethan Honey and Agamdeep Chopra and Mehmet Kurt",
abstract = "Identification of tumor margins is essential for surgical decision-making for glioblastoma patients and provides reliable assistance for neurosurgeons. Despite improvements in deep learning architectures for tumor segmentation over the years, creating a fully autonomous system suitable for clinical floors remains a formidable challenge because the model predictions have not yet reached the desired level of accuracy and generalizability for clinical applications. Generative modeling techniques have seen significant improvements in recent times. Specifically, Generative Adversarial Networks (GANs) and Denoising diffusion probabilistic models (DDPMs) have been used to generate higher-quality images with fewer artifacts and finer attributes. In this work, we introduce a framework called Re-Diffinet for modeling the discrepancy between the outputs of a segmentation model like U-Net and the ground truth, using DDPMs. By explicitly modeling the discrepancy, the results show an average improvement of 0.55\% in the Dice score and 16.28\% in 95\% Hausdorff Distance from cross-validation over 5-folds, compared to the state-of-the-art U-Net segmentation model. The code is available:",
pages = "1257--1266"
}
@InProceedings{varma2024,
title = "VariViT: A Vision Transformer for Variable Image Sizes",
author = "Aswathi Varma and Suprosanna Shit and Chinmay Prabhakar and Daniel Scholz and Hongwei Bran Li and Bjoern Menze and Daniel Rueckert and Benedikt Wiestler",
abstract = "Vision Transformers (ViTs) have emerged as the state-of-the-art architecture in representation learning, leveraging self-attention mechanisms to excel in various tasks. ViTs split images into fixed-size patches, constraining them to a predefined size and necessitating pre-processing steps like resizing, padding, or cropping. This poses challenges in medical imaging, particularly with irregularly shaped structures like tumors. A fixed bounding box crop size produces input images with highly variable foreground-to-background ratios. Resizing medical images can degrade information and introduce artifacts, impacting diagnosis. Hence, tailoring variable-sized crops to regions of interest can enhance feature representation capabilities. Moreover, large images are computationally expensive, and smaller sizes risk information loss, presenting a computation-accuracy tradeoff. We propose VariViT, an improved ViT model crafted to handle variable image sizes while maintaining a consistent patch size. VariViT employs a novel positional embedding resizing scheme for a variable number of patches. We also implement a new batching strategy within VariViT to reduce computational complexity, resulting in faster training and inference times. In our evaluations on two 3D brain MRI datasets, VariViT surpasses vanilla ViTs and ResNet in glioma genotype prediction and brain tumor classification. It achieves F1-scores of 75.5\% and 76.3\%, respectively, learning more discriminative features. Our proposed batching strategy reduces computation time by up to 30\% compared to conventional architectures. These findings underscore the efficacy of VariViT in image representation learning.",
pages = "1571--1583"
}
@InProceedings{alekseenko2024,
title = "Distance-Aware Non-IID Federated Learning for Generalization and Personalization in Medical Imaging Segmentation",
author = "Julia Alekseenko and Alexandros Karargyris and Nicolas Padoy",
abstract = "Federated learning (FL) in healthcare suffers from non-identically distributed (non-IID) data, impacting model convergence and performance. While existing solutions for the non-IID problem often do not quantify the degree of non-IID nature between clients in the federation, assessing it can improve training experiences and outcomes, particularly in real-world scenarios with unfamiliar datasets. The paper presents a practical non-IID assessment methodology for a medical segmentation problem, highlighting its significance in medical FL. We propose a simple yet effective solution that utilizes distance measurements in the embedding space of medical images and statistical measurements calculated over their metadata. Our method, designed for medical imaging and integrated into federated averaging, improves model generalization by downgrading the contribution from the most distant client, treating it as an outlier. Additionally, it enhances model personalization by introducing distance-based clustering of clients. To the best of our knowledge, this method is the first to use distance-based techniques for providing a practical solution to the non-IID problem within the medical imaging FL domain. Furthermore, we validate our approach on three public FL imaging radiology datasets (FeTS, Prostate, and Fed-KITS2019) to demonstrate its effectiveness across various radiology imaging scenarios.",
pages = "33--47"
}
@InProceedings{yi2024,
title = "PAAN: Pyramid Attention Augmented Network for polyp segmentation",
author = "Sida Yi and Yuesheng Zhu and Guibo Luo",
abstract = "Polyp segmentation is a task of segmenting polyp lesion regions from normal tissues in medical images, which is crucial for medical diagnosis and treatment planning. However, existing methods still suffer from low accuracy in polyp boundary delineation and insufficient suppression of irrelevant background due to the blur boundaries and textures of polyps. To overcome these limitations, in this paper a Pyramid Attention Augmented Network (PAAN) is proposed, in which a pyramid feature diversion structure with spatial attention mechanism is developed so that good feature representation with low information loss can be achieved by conducting channel attention-based feature diversion and inter-layer fusion, while reducing computational complexity. Also, our framework includes an Enhanced Spatial Attention module (ESA), which can improve the quality of initial polyp segmentation predictions through spatial self-attention mechanism and multi-scale feature fusion. Our approach is evaluated on five challenging polyp datasets— Kvasir, CVC-ClinicDB, CVC-300, ETIS, and CVC-colonDB and achieves excellent results. In particular, we achieve 94.2\% Dice and 89.7\% IoU on Kvasir, outperforming other state-of-the-art methods.",
pages = "1823--1840"
}
@InProceedings{buess2024,
title = "Video-CT MAE: Self-supervised Video-CT Domain Adaptation for Vertebral Fracture Diagnosis",
author = "Lukas Buess and Marijn F. Stollenga and David Schinz and Benedikt Wiestler and Jan Kirschke and Andreas Maier and Nassir Navab and Matthias Keicher",
abstract = "Early and accurate diagnosis of vertebral body anomalies is crucial for effectively treating spinal disorders, but the manual interpretation of CT scans can be time-consuming and error-prone. While deep learning has shown promise in automating vertebral fracture detection, improving the interpretability of existing methods is crucial for building trust and ensuring reliable clinical application. Vision Transformers (ViTs) offer inherent interpretability through attention visualizations but are limited in their application to 3D medical images due to reliance on 2D image pretraining. To address this challenge, we propose a novel approach combining the benefits of transfer learning from video-pretrained models and domain adaptation with self-supervised pretraining on a task-specific but unlabeled dataset. Compared to naive transfer learning from Video MAE, our method shows improved downstream task performance by 8.3 in F1 and a training speedup of factor 2. This closes the gap between videos and medical images, allowing a ViT to learn relevant anatomical features while adapting to the task domain. We demonstrate that our framework enables ViTs to effectively detect vertebral fractures in a low data regime, outperforming CNN-based state-of-the-art methods while providing inherent interpretability. Our task adaptation approach and dataset not only improve the performance of our proposed method but also enhance existing self-supervised pretraining approaches, highlighting the benefits of task-specific self-supervised pretraining for domain adaptation. The code is publicly available.",
pages = "151--167"
}
@InProceedings{billot2024,
title = "Network conditioning for synergistic learning on partial annotations",
author = "Benjamin Billot and Neel Dey and Esra Abaci Turk and Ellen Grant and Polina Golland",
abstract = "The robustness and accuracy of multi-organ segmentation networks is limited by the scarcity of labels. A common strategy to alleviate the annotation burden is to use partially labelled datasets, where each image can be annotated for a subset of all organs of interest. Unfortunately, this approach causes inconsistencies in the background class since it can now include target organs. Moreover, we consider the even more relaxed setting of region-based segmentation, where voxels can be labelled for super-regions, thus causing further inconsistencies across annotations. Here we propose CoNeMOS (Conditional Network for Multi-Organ Segmentation), a framework that leverages a label-conditioned network for synergistic learning on partially labelled region-based segmentations. Conditioning is achieved by combining convolutions with expressive Feature-wise Linear Modulation (FiLM) layers, whose parameters are controlled by an auxiliary network. In contrast to other conditioning methods, FiLM layers are stable to train and add negligible computation overhead, which enables us to condition the entire network. As a result, the network can learn where it needs to extract shared or label-specific features, instead of imposing it with the architecture (e.g., with different segmentation heads). By encouraging flexible synergies across labels, our method obtains state-of-the-art results for the segmentation of challenging low-resolution fetal MRI data. Our code is available at https://github.com/BBillot/CoNeMOS.",
pages = "119--130"
}
@InProceedings{bhattacharya2024,
title = "GazeDiff: A radiologist visual attention guided diffusion model for zero-shot disease classification",
author = "Moinak Bhattacharya and Prateek Prasanna",
abstract = "We present GazeDiff, a novel architecture that leverages radiologists\' eye gaze patterns as controls to text-to-image diffusion models for zero-shot classification. Eye-gaze patterns provide important cues during the visual exploration process; existing diffusion-based models do not harness the valuable insights derived from these patterns during image interpretation. GazeDiff utilizes a novel expert visual attention-conditioned diffusion model to generate robust medical images. This model offers more than just image generation capabilities; the density estimates derived from the gaze-guided diffusion model can effectively improve zero-shot classification performance. We show the zero-shot classification efficacy of GazeDiff on four publicly available datasets for two common pulmonary disease types, namely pneumonia, and tuberculosis.",
pages = "103--118"
}
@InProceedings{poudel2024,
title = "Exploring Transfer Learning in Medical Image Segmentation using Vision-Language Models",
author = "Kanchan Poudel and Manish Dhakal and Prasiddha Bhandari and Rabin Adhikari and Safal Thapaliya and Bishesh Khanal",
abstract = "Medical image segmentation allows quantifying target structure size and shape, aiding in disease diagnosis, prognosis, surgery planning, and comprehension. Building upon recent advancements in foundation Vision-Language Models (VLMs) from natural image-text pairs, several studies have proposed adapting them to Vision-Language Segmentation Models (VLSMs) that allow using language text as an additional input to segmentation models. Introducing auxiliary information via text with human-in-the-loop prompting during inference opens up unique opportunities, such as open vocabulary segmentation and potentially more robust segmentation models against out-of-distribution data.Although transfer learning from natural to medical images has been explored for image-only segmentation models, the joint representation of vision-language in segmentation problems remains underexplored. This study introduces the first systematic study on transferring VLSMs to 2D medical images, using carefully curated $11$ datasets encompassing diverse modalities and insightful language prompts and experiments. Our findings demonstrate that although VLSMs show competitive performance compared to image-only modelsfor segmentation after finetuning in limited medical image datasets, not all VLSMs utilize the additional information from language prompts, with image features playing a dominant role. While VLSMs exhibit enhanced performance in handling pooled datasets with diversemodalities and show potential robustness to domain shifts compared to conventional segmentation models, our results suggest that novel approaches are required to enable VLSMs to leverage the various auxiliary information available through language prompts. The code and datasets are available at https://github.com/naamiinepal/medvlsm.",
pages = "1142--1165"
}
@InProceedings{zhang2024,
title = "Boundary-aware Contrastive Learning for Semi-supervised Nuclei Instance Segmentation",
author = "Ye Zhang and Ziyue Wang and Yifeng Wang and Hao Bian and Linghan Cai and Hengrui Li and Lingbo Zhang and Yongbing Zhang",
abstract = "Semi-supervised segmentation methods have demonstrated promising results in natural scenarios, providing a solution to reduce dependency on manual annotation. However, these methods face significant challenges when directly applied to pathological images due to the subtle color differences between nuclei and tissues, as well as the significant morphological variations among nuclei. Consequently, the generated pseudo-labels often contain much noise, especially at the nuclei boundaries. To address the above problem, this paper proposes a boundary-aware contrastive learning network to denoise the boundary noise in a semi-supervised nuclei segmentation task. The model has two key designs: a low-resolution denoising (LRD) module and a cross-RoI contrastive learning (CRC) module. The LRD improves the smoothness of the nuclei boundary by pseudo-labels denoising, and the CRC enhances the discrimination between foreground and background by boundary feature contrastive learning. We conduct extensive experiments to demonstrate the superiority of our proposed method over existing semi-supervised instance segmentation methods.",
pages = "1851--1861"
}
@InProceedings{perez-caballero2024,
title = "Unsupervised Deep Learning Method for Bias Correction",
author = "Maria Perez-Caballero and Sergio Morell-Ortega and Marina Ruiz Perez and Pierrick Coupe and Jose V Manjon",
abstract = "In this paper, a new method for automatic MR image inhomogeneity correction is proposed. This method, based on deep learning, uses unsupervised learning to estimate the bias corrected images minimizing a cost function based on the entropy of the corrupted image, the derivative of the estimated bias field and corrected image statistics. The proposed method has been compared with the state-of-the-art method N4 providing improved results.",
pages = "1098--1106"
}
@InProceedings{decroocq2024,
title = "Multi-scale Stochastic Generation of Labelled Microscopy Images for Neuron Segmentation",
author = "Meghane Decroocq and Binbin Xu and Katherine L Thompson-Peer and Adrian Moore and Henrik Skibbe",
abstract = "We introduce a novel method leveraging conditional generative adversarial networks (cGANs) to generate diverse, high-resolution microscopy images for neuron tracing model training. This approach addresses the challenge of limited annotated data availability, a significant obstacle in automating neuron dendrite tracing. Our technique utilizes a multi-scale cascade process to generate synthetic images from single neuron tractograms, accurately replicating the complex characteristics of real microscopy images, encompassing imaging artifacts and background structures. In experiments, our method generates diverse images that mimic the characteristics of two distinct neuron microscopy datasets, which were successfully used as training data in the segmentation task of real neuron images.",
pages = "352--366"
}
@InProceedings{konstantin2024,
title = "ASMR: Angular Support for Malfunctioning Client Resilience in Federated Learning",
author = "Mirko Konstantin and Moritz Fuchs and Anirban Mukhopadhyay",
abstract = "Federated Learning (FL) allows the training of deep neural networks in a distributed andprivacy-preserving manner. However, this concept suffers from malfunctioning updatessent by the attending clients that cause global model performance degradation. Reasonsfor this malfunctioning might be technical issues, disadvantageous training data, or mali-cious attacks. Most of the current defense mechanisms are meant to require impracticalprerequisites like knowledge about the number of malfunctioning updates, which makesthem unsuitable for real-world applications. To counteract these problems, we introducea novel method called ASMR, that dynamically excludes malfunctioning clients based ontheir angular distance. Our novel method does not require any hyperparameters or knowl-edge about the number of malfunctioning clients. Our experiments showcase the detectioncapabilities of ASMR in an image classification task on a histopathological dataset, whilealso presenting findings on the significance of dynamically adapting decision boundaries.",
pages = "754--767"
}
@InProceedings{chebykin2024,
title = "Hyperparameter-Free Medical Image Synthesis for Sharing Data and Improving Site-Specific Segmentation",
author = "Alexander Chebykin and Peter Bosman and Tanja Alderliesten",
abstract = "Sharing synthetic medical images is a promising alternative to sharing real images that can improve patient privacy and data security. To get good results, existing methods for medical image synthesis must be manually adjusted when they are applied to unseen data. To remove this manual burden, we introduce a Hyperparameter-Free distributed learning method for automatic medical image Synthesis, Sharing, and Segmentation called HyFree-S3. For three diverse segmentation settings (pelvic MRIs, lung X-rays, polyp photos), the use of HyFree-S3 results in improved performance over training only with site-specific data (in the majority of cases). The hyperparameter-free nature of the method should make data synthesis and sharing easier, potentially leading to an increase in the quantity of available data and consequently the quality of the models trained that may ultimately be applied in the clinic. Our code is available at https://github.com/AwesomeLemon/HyFree-S3",
pages = "199--219"
}
@InProceedings{ibarra2024,
title = "A recurrent network for segmenting the thrombus on brain MRI in patients with hyper-acute ischemic stroke",
author = "Sofia Vargas Ibarra and Vincent Martin VIGNERON and Sonia Garcia Salicetti and Hichem Maaref and Jonathan Kobold and Nicolas Chausson and Yann Lhermitte and Didier Smadja",
abstract = "In the stroke workflow, timely decision-making is crucial. Identifying, localizing, and measuring occlusive arterial thrombi during initial imaging is a critical step that triggers the choice of therapeutic treatment for optimizing vascular re-canalization. We present a recurrent model that segments the thrombus in patients suffering from a hyper-acute stroke. A cross-attention module is defined to merge the diffusion and susceptibility-weighted modalities available in Magnetic Resonance Imaging (MRI), which are fed to a modified version of the Convolutional Long-Short-Term Memory (CLSTM) model. It detects almost all the thrombi with a Dice higher than 0.6. The lesion segmentation prediction reduces the false positives to almost zero and the performance is comparable between distal and proximal occlusions.",
pages = "657--671"
}
@InProceedings{harten2024,
title = "REINDIR: Repeated Embedding Infusion for Neural Deformable Image Registration",
author = "Louis van Harten and Rudolf Leonardus Mirjam Van Herten and Ivana Isgum",
abstract = "The use of implicit neural representations (INRs) has been explored for medical image registration in a number of recent works. Using these representations has several advantages over both classic optimization-based methods and deep learning-based methods, but it is hindered by long optimization times during inference. To address this issue, we propose REINDIR: Repeated Embedding Infusion for Neural Deformable Image Registration. REINDIR is a meta-learning framework that uses a combination of an image encoder and template representations, which are infused with image embeddings to specialize them for a pair of test images. This specialization results in a better initialization for the subsequent optimization process. By broadcasting the encodings to fill our modulation weight matrices, we greatly reduce the required size of the encoder compared to approaches that predict the complete weight matrices directly. Additionally, our method retains the flexibility to infuse arbitrarily large encodings. The presented approach greatly improves the efficiency of deformable registration with INRs when applied to (near-)IID data, while remaining robust to severe domain shifts from the distribution the method is trained on.",
pages = "577--595"
}
@InProceedings{vries2024,
title = "Accelerating physics-informed neural fields for fast CT perfusion analysis in acute ischemic stroke",
author = "Lucas de Vries and Rudolf Leonardus Mirjam Van Herten and Jan W. Hoving and Ivana Isgum and Bart Emmer and Charles B. Majoie and Henk Marquering and Stratis Gavves",
abstract = "Spatio-temporal perfusion physics-informed neural networks were introduced as a new method (SPPINN) for CT perfusion (CTP) analysis in acute ischemic stroke. SPPINN leverages physics-informed learning and neural fields to perform a robust analysis of noisy CTP data. However, SPPINN faces limitations that hinder its application in practice, namely its implementation as a slice-based (2D) method, lengthy computation times, and the lack of infarct core segmentation. To address these challenges, we introduce a new approach to accelerate physics-informed neural fields for fast, volume-based (3D), CTP analysis including infarct core segmentation: ReSPPINN. To accommodate 3D data while simultaneously reducing computation times, we integrate efficient coordinate encodings. Furthermore, to ensure even faster model convergence, we use a meta-learning strategy. In addition, we also segment the infarct core. We employ acute MRI reference standard infarct core segmentations to evaluate ReSPPINN and we compare the performance with two commercial software packages. We show that meta-learning allows for full-volume perfusion map generation in 1.2 minutes without comprising quality, compared to over 40 minutes required by SPPINN. Moreover, ReSPPINN\'s infarct core segmentation outperforms commercial software.",
pages = "1606--1626"
}
@InProceedings{wundram2024,
title = "Leveraging Probabilistic Segmentation Models for Improved Glaucoma Diagnosis: A Clinical Pipeline Approach",
author = "Anna M. Wundram and Paul Fischer and Stephan Wunderlich and Hanna Faber and Lisa M. Koch and Philipp Berens and Christian F. Baumgartner",
abstract = "The accurate segmentation of the optic cup and disc in fundus images is essential for diagnostic processes such as glaucoma detection. The inherent ambiguity in locating these structures often poses a significant challenge, leading to potential misdiagnosis. To model such ambiguities, numerous probabilistic segmentation models have been proposed. In this paper, we investigate the integration of these probabilistic segmentation models into a multistage pipeline closely resembling clinical practice. Our findings indicate that leveraging the uncertainties provided by these models substantially enhances the quality of glaucoma diagnosis compared to relying on a single segmentation only.",
pages = "1725--1740"
}
@InProceedings{dahan2024a,
title = "The Multiscale Surface Vision Transformer",
author = "Simon Dahan and Logan Zane John Williams and Daniel Rueckert and Emma Claire Robinson",
abstract = "Surface meshes are a favoured domain for representing structural and functional information on the human cortex, but their complex topology and geometry pose significant challenges for deep learning analysis. While Transformers have excelled as domain-agnostic architectures for sequence-to-sequence learning, the quadratic cost of the self-attention operation remains an obstacle for many dense prediction tasks. Inspired by some of the latest advances in hierarchical modelling with vision transformers, we introduce the Multiscale Surface Vision Transformer (MS-SiT) as a backbone architecture for surface deep learning. The self-attention mechanism is applied within local-mesh-windows to allow for high-resolution sampling of the underlying data, while a shifted-window strategy improves the sharing of information between windows. Neighbouring patches are successively merged, allowing the MS-SiT to learn hierarchical representations suitable for any prediction task. Results demonstrate that the MS-SiT outperforms existing surface deep learning methods for neonatal phenotyping prediction tasks using the Developing Human Connectome Project (dHCP) dataset. Furthermore, building the MS-SiT backbone into a U-shaped architecture for surface segmentation demonstrates competitive results on cortical parcellation using the UK Biobank (UKB) and manually-annotated MindBoggle datasets. Code and trained models are publicly available at https://github.com/metrics-lab/surface-vision-transformers.",
pages = "289--305"
}
@InProceedings{ranem2024,
title = "UnCLe SAM: Unleashing SAM’s Potential for Continual Prostate MRI Segmentation",
author = "Amin Ranem and Mohamed Afham Mohamed Aflal and Moritz Fuchs and Anirban Mukhopadhyay",
abstract = "Continual medical image segmentation primarily explores the utilization of U-Net and its derivatives within the realm of medical imaging, posing significant challenges in meeting the demands of shifting domains over time. Foundation models serve as robust knowledge repositories, offering unique advantages such as general applicability, knowledge transferability, and continuous improvements. By leveraging pre-existing domain insights, adaptability, generalization, and performance across diverse tasks can be enhanced.In this work, we show how to deploy Segment Anything Model\'s (SAM) natural image pretraining for the continual medical image segmentation, where data is sparse.We introduce UnCLe SAM, a novel approach that uses the knowledge of the pre-trained SAM foundation model to make it suitable for continual segmentation in dynamic environments.We demonstrate that UnCLe SAM is a robust alternative to U-Net-based approaches and showcase its state-of-the-art (SOTA) continual medical segmentation capabilities.The primary objective of UnCLe SAM is to strike a delicate balance between model rigidity and plasticity, effectively addressing prevalent pitfalls within CL methodologies.We assess UnCLe SAM through a series of prostate segmentation tasks, applying a set of different CL methods. Comparative evaluations against the SOTA Lifelong nnU-Net framework reveal the potential application of UnCLe SAM in dynamically changing environments like healthcare.Our code base will be made public upon acceptance.",
pages = "1207--1220"
}
@InProceedings{royer2024,
title = "MultiMedEval: A Benchmark and a Toolkit for Evaluating Medical Vision-Language Models",
author = "Corentin Royer and Bjoern Menze and Anjany Sekuboyina",
abstract = "We introduce MultiMedEval, an open-source toolkit for fair and reproducible evaluation of large, medical vision-language models (VLM). MultiMedEval comprehensively assesses the models’ performance on a broad array of six multi-modal tasks, conducted over 23 datasets, and spanning over 11 medical domains. The chosen tasks and performance metrics are based on their widespread adoption in the community and their diversity, ensuring a thorough evaluation of the model’s overall generalizability. We open-source a Python toolkit (https://anonymous.4open.science/r/MultiMedEval-C780) with a simple interface and setup process, enabling the evaluation of any VLM in just a few lines of code. Our goal is to simplify the intricate landscape of VLM evaluation, thus promoting fair and uniform benchmarking of future VLMs.",
pages = "1310--1327"
}
@InProceedings{junjiang2024,
title = "Train Once, Deploy Anywhere: Edge-Guided Single-source Domain Generalization for Medical Image Segmentation",
author = "Jun Jiang and Shi Gu",
abstract = {In medical image analysis, unsupervised domain adaptation models require retraining when receiving samples from a new data distribution, and multi-source domain generalization methods might be infeasible when there is only a single source domain. These issues pose formidable obstacles to model deployment. To this end, we take ``Train Once, Deploy Anywhere'' as our objective and consider a challenging but practical problem: Single-source Domain Generalization (SDG). Meanwhile, we note that (i) in medical image segmentation applications, generalization errors often come from imprecise predictions at the ambiguous boundaries of anatomies, and (ii) the edge of the image is domain-invariant, which can reduce the domain shift between the source and target domain in all network layers. Specifically, we borrow the prior knowledge from Digital Image Processing and take the edge of the image as input to enhance the model attention at the boundary of anatomies and improve the generalization performance on unknown target domains. Extensive experiments on three typical medical image segmentation datasets, which cover cross-sequence, cross-center, and cross-modality settings with various anatomical structures, demonstrate that our method achieves superior generalization performance compared to the state-of-the-art SDG methods. The code is available at https://github.com/thinkdifferentor/EGSDG.},
pages = "722--741"
}
@InProceedings{behrendt2024,
title = "Combining Reconstruction-based Unsupervised Anomaly Detection with Supervised Segmentation for Brain MRIs",
author = {Finn Behrendt and Debayan Bhattacharya and Lennart Maack and Julia Kr\"uger and Roland Opfer and Alexander Schlaefer},
abstract = "In contrast to supervised deep learning approaches, unsupervised anomaly detection (UAD) methods can be trained with healthy data only and do not require pixel-level annotations, enabling the identification of unseen pathologies. While this is promising for clinical screening tasks, reconstruction-based UAD methods fall short in segmentation accuracy compared to supervised models. Therefore, self-supervised UAD approaches have been proposed to improve segmentation accuracy. Typically, synthetic anomalies are used to train a segmentation network in a supervised fashion. However, this approach does not effectively generalize to real pathologies. We propose a framework combining reconstruction-based and self-supervised UAD methods to improve both segmentation performance for known anomalies and generalization to unknown pathologies. The framework includes an unsupervised diffusion model trained on healthy data to produce pseudo-healthy reconstructions and a supervised Unet trained to delineate anomalies from deviations between input- reconstruction pairs. Besides the effective use of synthetic training data, this framework allows for weakly-supervised training with small annotated data sets, generalizing to unseen pathologies. Our results show that with our approach, utilizing annotated data sets during training can substantially improve the segmentation performance for in-domain data while maintaining the generalizability of reconstruction-based approaches to pathologies unseen during training.",
pages = "87--102"
}
@InProceedings{zolotarev2024,
title = "Predicting Atrial Fibrillation Treatment Outcome with Siamese Multi-modal Fusion and Cardiac Digital Twins",
author = "Alexander M. Zolotarev and Abbas Khan Rayabat Khan and Gregory Slabaugh and Caroline Roney",
abstract = "Atrial fibrillation, the most common heart rhythm disorder, presents challenges in treatment due to difficulty pinpointing the patient-specific regions of abnormal electrical activity. While biophysical simulations of cardiac electrophysiology create a digital twin of atrial electrical activity based on CT or MRI scans, testing various treatment strategies on them is time-consuming and impractical on clinical timescales.Our proposed pipeline, incorporating Siamese architecture, fuses latent representations of multi-modal features extracted from atrial digital twin before any therapy and predicts the outcomes of several treatment strategies.A large in-silico dataset of 1000 virtual patients, generated from clinical data, was utilized to provide the biophysical simulations before (used for feature extraction) and after (used for calculating ground truth labels depending on whether atrial fibrillation terminates or not) various treatment strategies. By accurately predicting freedom from atrial fibrillation, our pipeline paves the way for personalized atrial fibrillation therapy with a fast and precise selection of optimal treatments.",
pages = "1927--1938"
}
@InProceedings{yassin2024,
title = "Evaluating Age-Related Anatomical Consistency in Synthetic Brain MRI against Real-World Alzheimer's Disease Data.",
author = "Hadya Yassin and Jana Fehr and Wei-Cheng Lai and Alina Krichevsky and Alexander Rakowski and Christoph Lippert",
abstract = "This study examines the realism of medical images created with deep generative models, specifically their replication of aging and Alzheimer\'s disease (AD) related anatomical changes. Previous research focused on developing generative methods with limited attention to image fidelity. We aim to assess the resemblance of brain MRI generated by a StyleGAN3 model with causal controls to neurodegenerative changes. For a benchmark, we conducted a visual Turing test (VTT) to see if radiologists could distinguish between synthetic and real images. Then, we employed a U-Net-based model to segment hallmarks relevant to normal aging and (AD). Finally, we conducted statistical tests for our hypothesis that no significant differences existed between real and synthetic images. (VTT) results showed radiologists struggled to differentiate between image types, highlighting (VTT)\'s limitations due to subjectivity and time constraints. We found slight hippocampus distribution differences ($\textit{P}$ = 5.7e-2) and significant lateral ventricle discrepancies ($\textit{P}$s $<$ 5.0e-2), indicating higher hippocampus realism and ventricle size inconsistencies. The model more effectively simulated changes in the hippocampus than in the lateral ventricles, where difficulties were encountered with certain subgroups. We conclude that the (VTT) alone is inadequate for a comprehensive quality evaluation, promoting a more objective approach. Future research could adapt our approach to evaluate other generated medical images intended for different downstream tasks. For reproducibility, we provide detailed code implementation$^1$.",
pages = "1801--1822"
}
@InProceedings{deshpande2024,
title = "Auto-Generating Weak Labels for Real \& Synthetic Data to Improve Label-Scarce Medical Image Segmentation",
author = "Tanvi Deshpande and Eva Prakash and Elsie Gyang Ross and Curtis Langlotz and Andrew Y. Ng and Jeya Maria Jose Valanarasu",
abstract = "The high cost of creating pixel-by-pixel gold-standard labels, limited expert availability, and presence of diverse tasks make it challenging to generate segmentation labels to train deep learning models for medical imaging tasks. In this work, we present a new approach to overcome the hurdle of costly medical image labeling by leveraging foundation models like Segment Anything Model (SAM) and its medical alternate MedSAM. Our pipeline has the ability to generate *weak labels* for any unlabeled medical image and subsequently use it to augment label-scarce datasets. We perform this by leveraging a model trained on a few gold-standard labels and using it to intelligently prompt MedSAM for weak label generation. This automation eliminates the manual prompting step in MedSAM, creating a streamlined process for generating labels for both real and synthetic images, regardless of quantity. We conduct experiments on label-scarce settings for multiple tasks pertaining to modalities ranging from ultrasound, dermatology, and X-rays to demonstrate the usefulness of our pipeline. The code will be made public after review.",
pages = "391--405"
}
@InProceedings{sourget2024,
title = "[Citation needed] Data usage and citation practices in medical imaging conferences",
author = "Th\'eo Sourget and Ahmet Akko\c{c} and Stinna Winther and Christine Lyngbye Galsgaard and Amelia Jim\'enez-S\'anchez and Dovile Juodelyte and Caroline Petitjean and Veronika Cheplygina",
abstract = "Medical imaging papers often focus on methodology, but the quality of the algorithms and the validity of the conclusions are highly dependent on the datasets used. As creating datasets requires a lot of effort, researchers often use publicly available datasets, there is however no adopted standard for citing the datasets used in scientific papers, leading to difficulty in tracking dataset usage.In this work, we present two open-source tools we created that could help with the detection of dataset usage, a pipeline using OpenAlex and full-text analysis, and a PDF annotation software used in our study to manually label the presence of datasets. We applied both tools on a study of the usage of 20 publicly available medical datasets in papers from MICCAI and MIDL. We compute the proportion and the evolution between 2013 and 2023 of 3 types of presence in a paper: cited, mentioned in the full text, cited and mentioned. Our findings demonstrate the concentration of the usage of a limited set of datasets. We also highlight different citing practices, making the automation of tracking difficult.",
pages = "1475--1496"
}
@InProceedings{ahmad2024,
title = "Interpretable Uncertainty-Aware Deep Regression with Cohort Saliency Analysis for Three-Slice CT Imaging Studies",
author = {Nouman Ahmad and Johan \"Ofverstedt and Sambit Tarai and G\"oran Bergstr\"om and H\r{a}kan Ahlstr\"om and Joel Kullberg},
abstract = "Obesity is associated with an increased risk of morbidity and mortality. Achieving a healthy body composition, which involves maintaining a balance between fat and muscle mass, is important for metabolic health and preventing chronic diseases. Computed tomography (CT) imaging offers detailed insights into the body’s internal structure, aiding in understanding body composition and its related factors. In this feasibility study, we utilized CT image data from 2,724 subjects from the large metabolic health cohort studies SCAPIS and IGT. We train and evaluate an uncertainty-aware deep regression based ResNet-50 network, which outputs its prediction as mean and variance, for quantification of cross-sectional areas of liver, visceral adipose tissue (VAT), and thigh muscle. This was done using collages of three single-slice CT images from the liver, abdomen, and thigh regions. The model demonstrated promising results with the evaluation metrics – including R-squared ($R^2$) and mean absolute error (MAE) for predictions. Additionally, for interpretability, the model was evaluated with saliency analysis based on Grad-CAM (Gradient-weighted Class Activation Mapping) at stages 2, 3, and 4 of the network. Deformable image registration to a template subject further enabled cohort saliency analysis that provide group-wise visualization of image regions of importance for associations to biomarkers of interest. We found that the networks focus on relevant regions for each target, according to prior knowledge. The source code is available at: \url{https://github.com/noumannahmad/dr\_3slice\_ct}.",
pages = "17--32"
}
@InProceedings{demir2024,
title = "Multimodal Image Registration Guided by Few Segmentations from One Modality",
author = "Basar Demir and Marc Niethammer",
abstract = "Registration of multimodal images is challenging, especially when dealing with different anatomical structures and samples without segmentations. The main difficulty arises from the use of registration loss functions that are inadequate in the absence of corresponding regions. In this work, we present the first registration and segmentation approach tailored to this challenge. In particular, we assume the practically highly relevant scenario that only a limited number of segmentations are available for one modality and none for the other. First, we augment our few segmented samples using unsupervised deep registration within one modality, thereby providing many anatomically plausible samples to train a segmentation network. The resulting segmentation network then allows us to train a segmentation network on the target modality without available segmentations by using an unsupervised domain adaptation architecture. Finally, we train a deep registration network to register multimodal image pairs purely based on predictions of their segmentation networks. Our work demonstrates that using a small number of segmentations from one modality enables training a segmentation network on a target modality without the need for additional manual segmentations on that modality. Additionally, we show that registration based on these segmentations provides smooth and accurate deformation fields on anatomically different image pairs, unlike previous methods. We evaluate our approach on 2D medical image segmentation and registration between knee DXA and X-ray images. Our experiments show that our approach outperforms existing methods. Code is available at https://github.com/uncbiag/SegGuidedMMReg.",
pages = "367--390"
}
@InProceedings{geissler2024,
title = "Structure Size as Confounder in Uncertainty Based Segmentation Quality Prediction",
author = "Kai Gei{\ss}ler and Jochen G. Hirsch and Stefan Heldmann and Hans Meine",
abstract = "Various uncertainty estimation methods have been proposed for deep learning-based image segmentation models. An uncertainty measure is treated useful if it can be used to accurately predict segmentation quality. Therefore, structure-wise uncertainty measures are frequently correlated with measures like the Dice score. However, it is known that the Dice score highly depends on the size of the structure of interest. It is less well-known that popular structure-wise uncertainty measures also correlate with structure size. Therefore, the structure size acts as confounding variable when trying to quantify the performance of such uncertainty measures via correlation. We investigate this for the popular uncertainty measures structure-wise epistemic uncertainty, mean pairwise Dice and volume variation coefficient based on test-time-augmentation, Monte Carlo Dropout and model ensembles. We propose to use a partial correlation coefficient to address structure size as confounding variable and arrive at lower correlation estimates which better reflect the true relationship between segmentation quality and structure-wise uncertainty.",
pages = "504--519"
}
@InProceedings{sharma2024,
title = "Lupus Nephritis Subtype Classification with only Slide Level Labels",
author = "Amit Sharma and Ekansh Chauhan and Megha S Uppin and Liza Rajasekhar and C.V. Jawahar and P K Vinod",
abstract = "Lupus Nephritis classification has historically relied on labor-intensive and meticulous glomerular-level labeling of renal structures in whole slide images (WSIs). However, this approach presents a formidable challenge due to its tedious and resource-intensive nature, limiting its scalability and practicality in clinical settings. In response to this challenge, our work introduces a novel methodology that utilizes only slide-level labels, eliminating the need for granular glomerular-level labeling. A comprehensive multi-stained lupus nephritis digital histopathology WSI dataset was created from the Indian population, which is the largest of its kind. LupusNet, a deep learning MIL-based model, was developed for the sub- type classification of LN. The results underscore its effectiveness, achieving an AUC score of 91.0\%, an F1-score of 77.3\%, and an accuracy of 81.1\% on our dataset in distinguishing membranous and diffused classes of LN.",
pages = "1401--1411"
}
@InProceedings{fuchs2024,
title = "HARP: Unsupervised Histopathology Artifact Restoration",
author = {Moritz Fuchs and Ssharvien Kumar R Sivakumar and Mirko Sch\"ober and Niklas Woltering and Marie-Lisa Eich and Leonille Schweizer and Anirban Mukhopadhyay},
abstract = "Histopathological analysis, vital for medical diagnostics, is often challenged by artifacts insample preparation and imaging, such as staining inconsistencies and physical obstructions.Addressing this, our work introduces a novel, fully unsupervised histopathological artifactrestoration pipeline (HARP). HARP integrates artifact detection, localization, and restorationinto one pipeline. The first step to make artifact restoration applicable is an analysisof anomaly detection algorithms. Then, HARP leverages the power of unsupervised segmentationtechniques to propose localizations for potential artifacts, for which we select thebest localization based on our novel inpainting denoising diffusion model. Finally, HARPemploys an inpainting model for artifact restoration while conditioning it on the artifact localizations.We evaluate the artifact detection quality along with the image reconstructionquality, surpassing the state-of-the-art artifact restoration. Furthermore, we demonstratethat HARP improves the robustness and reliability of downstream models and show thatpathologists can not tell the difference between clean images and images restored throughHARP. This demonstrates that HARP significantly improves image quality and diagnosticreliability, enhancing histopathological examination accuracy for AI systems.",
pages = "465--479"
}
@InProceedings{gu2024,
title = "Predicting 3D forearm fracture angle from biplanar Xray images with rotational bone pose estimation",
author = "Hanxue Gu and Roy Colglazier and Jikai Zhang and Robert Lark and Benjamin Alman and Maciej A Mazurowski",
abstract = "Two-dimensional X-ray images, while widely used, have limitations to reflect 3D information of the imaged objects. Several studies have tried to recover such information from multiple X-ray images of the same object. Still, those approaches often fail due to the unrealistic assumption that the target does not move between views and those two views are perfectly orthogonal.A problem where 3D information would be highly valuable but is very difficult to assess from 2D X-ray images is the measurement of the actual 3D fracture angles in the forearm. To address this problem, we propose a deep learning-based method that predicts the rotational movement and skeletal posture from biplanar X-ray images, offering a novel and precise solution.Our strategy comprises the following steps: (1) automatic segmentation of the ulna and radius bones of the forearm on two X-ray images by a neural network; (2) prediction of the rotational parameters of the bones by a pose prediction network; (3) automatic detection of fracture locations and assessment of the fracture angles on 2D images; and (4) reconstruction of the real 3D fracture angle by inferring it from the 2D fracture information and the skeleton pose parameters collected from the two images. Our experiments on X-ray images show that our method can accurately measure 2D fracture angles and infer the pose of the forearm bones. By simulating X-ray images for various types of fractures, we show that our method could provide more accurate measurements of fracture angles in 3D. We are the first attempt for the fully automatic fracture angle measurements on both 2D and 3D versions, and we show the robustness of our method even in extreme cases where the two views are highly nonorthogonal.",
pages = "554--576"
}
@InProceedings{chen2024,
title = "FedFDD: Federated Learning with Frequency Domain Decomposition for Low-Dose CT Denoising",
author = "Xuhang Chen and Zeju Li and Zikun Xu and Kaijie Xu and Cheng Ouyang and Chen Qin",
abstract = "Low-dose computed tomography (LDCT) enables imaging with minimal radiation exposure but typically results in noisy outputs. Deep learning algorithms have been emerging as popular tools for denoising LDCT images, where they typically rely on large data sets requiring data from multiple centers. However, LDCT images collected from different centers (clients) can present significant data heterogeneity, and the sharing of them between clients is also constrained by privacy regulations. In this work, we propose a personalized federated learning (FL) approach for enhancing model generalization across different organ images from multiple local clients while preserving data privacy. Empirically, we find that earlier FL methods tend to underperform single-set models on non-IID LDCT data due to the presence of data heterogeneity characterized by varying frequency patterns. To address this, we introduce a Federated Learning with Frequency Domain Decomposition (FedFDD) approach, which decomposes images into different frequency components and then updates high-frequency signals in an FL setting while preserving local low-frequency characteristics. Specifically, we leverage an adaptive frequency mask with discrete cosine transformation for the frequency domain decomposition. The proposed algorithm is evaluated on LDCT datasets of different organs and our experimental results show that FedFDD can surpass state-of-the-art FL methods as well as both localized and centralized models, especially on challenging LDCT denoising cases. Our code is available at https://github.com/xuhang2019/FedFDD.",
pages = "234--249"
}
@InProceedings{graf2024,
title = "Modeling the acquisition shift between axial and sagittal MRI for diffusion superresolution to enable axial spine segmentation",
author = {Robert Graf and Hendrik M\"oller and Julian McGinnis and Sebastian R\"uhling and Maren Weihrauch and Matan Atad and Suprosanna Shit and Bjoern Menze and Mark M\"uhlau and Johannes C. Paetzold and Daniel Rueckert and Jan Kirschke},
abstract = "Spine MRIs are usually acquired in highly anisotropic 2D axial or sagittal slices. Vertebra structures are not fully resolved in these images, and multi-image superresolution by aligning scans to pair them is difficult due to partial volume effects and inter-vertebral movement during acquisition. Hence, we propose an unpaired inpainting superresolution algorithm that extrapolates the missing spine structures. We generate synthetic training pairs by multiple degradation functions that model the data shift and acquisition errors between sagittal slices and sagittal views of axial images. Our method employs modeling of the k-space point spread function and the interslice gap. Further, we imitate different MR acquisition challenges like histogram shifts, bias fields, interlace movement artifacts, Gaussian noise, and blur. This enables the training of diffusion-based superresolution models on scaling factors larger than 6$\times$ without real paired data. The low z-resolution in axial images prevents existing approaches from separating individual vertebrae instances. By applying this superresolution model to the z-dimension, we can generate images that allow a pre-trained segmentation model to distinguish between vertebrae and enable automatic segmentation and processing of axial images. We experimentally benchmark our method and show that diffusion-based superresolution outperforms state-of-the-art super-resolution models.",
pages = "520--537"
}
@InProceedings{vs2024,
title = "Target and task specific source-free domain adaptive image segmentation",
author = "Vibashan VS and Jeya Maria Jose Valanarasu and Vishal M. Patel",
abstract = "Solving the domain shift problem during inference is essential in medical imaging as most deep-learning based solutions suffer from it. In practice, domain shifts are tackled by performing Unsupervised Domain Adaptation (UDA), where a model is adapted to an unlabeled target domain by leveraging the labelled source domain. In medical scenarios, the data comes with huge privacy concerns making it difficult to apply standard UDA techniques. Hence, a closer clinical setting is Source-Free UDA (SFUDA), where we have access to source trained model but not the source data during adaptation. Methods trying to solve SFUDA typically address the domain shift using pseudo-label based self-training techniques. However due to domain shift, these pseudo-labels are usually of high entropy and denoising them still does not make them perfect labels to supervise the model. Therefore, adapting the source model with noisy pseudo labels reduces its segmentation capability while addressing the domain shift. To this end, we propose a two-stage approach for source-free domain adaptive image segmentation: 1) Target-specific adaptation followed by 2) Task-specific adaptation. In the Stage-I, we focus on learning target-specific representation and generating high-quality pseudo labels by leveraging a proposed ensemble entropy minimization loss and selective voting strategy. In Stage II, we focus on improving segmentation performance by utilizing teacher-student self-training and augmentation-guided consistency loss, leveraging the pseudo labels obtained from Stage I. We evaluate our proposed method on both 2D fundus datasets and 3D MRI volumes across 7 different domain shifts where we achieve better performance than recent UDA and SF-UDA methods for medical image segmentation. Code is available at https://github.com/Vibashan/tt-sfuda.",
pages = "1627--1639"
}
@InProceedings{rizhko2024,
title = "Improving Identically Distributed and Out-of-Distribution Medical Image Classification with Segmentation-Guided Attention in Small Dataset Scenarios",
author = "Mariia Rizhko and Lauren Erdman and Mandy Rickard and Kunj Sheth and Daniel Alvarez and Kyla N Velaer and Megan A. Bonnett and Christopher S. Cooper and Gregory E. Tasian and John Weaver and Alice Xiang and Armando J. Lorenzo and Anna Goldenberg",
abstract = "We propose a new approach for training medical image classification models using segmentation masks, particularly effective in small dataset scenarios. By guiding the model’s attention with segmentation masks toward relevant features, we significantly improve accuracy for diagnosing Hydronephrosis. Evaluation of our model on identically distributed data showed either the same or better performance with improvement up to 0.28 in AUROC and up to 0.33 in AUPRC. Our method showed better generalization ability than baselines, improving from 0.02 to 0.75 in AUROC and from 0.09 to 0.47 in AUPRC for four different out-of-distribution datasets. The results show that models trained on smaller datasets using our approach can achieve comparable results to those trained on datasets 25 times larger.",
pages = "1282--1296"
}
@InProceedings{afzal2024,
title = "A Comprehensive Benchmark of Supervised and Self-supervised Pre-training on Multi-view Chest X-ray Classification",
author = "Muhammad Muneeb Afzal and Muhammad Osama Khan and Yi Fang",
abstract = "Chest X-ray analysis in medical imaging has largely focused on single-view methods. However, recent advancements have led to the development of multi-view approaches that harness the potential of multiple views for the same patient. Although these methods have shown improvements, it is especially difficult to collect large multi-view labeled datasets owing to the prohibitive annotation costs and acquisition times. Hence, it is crucial to address the multi-view setting in the low data regime. Pre-training is a critical component to ensure efficient performance in this low data regime, as evidenced by its improvements in natural and medical imaging. However, in the multi-view setup, such pre-training strategies have received relatively little attention and ImageNet initialization remains largely the norm. We bridge this research gap by conducting an extensive benchmarking study illustrating the efficacy of 10 strong supervised and self-supervised models pre-trained on both natural and medical images for multi-view chest X-ray classification. We further examine the performance in the low data regime by training these methods on 1\%, 10\%, and 100\% fractions of the training set. Moreover, our best models yield significant improvements compared to existing state-of-the-art multi-view approaches, outperforming them by as much as 9.9\%, 8.8\% and 1.6\% on the 1\%, 10\%, and 100\% data fractions respectively. We hope this benchmark will spur the development of stronger multi-view medical imaging models, similar to the role of such benchmarks in other computer vision and medical imaging domains. As open science, we make our code publicly available to aid in the development of stronger multi-view models.",
pages = "1--16"
}
@InProceedings{moens2024,
title = "Laparoflow-SSL: Image Analysis From a Tiny Dataset Through Self-Supervised Transformers Leveraging Unlabeled Surgical Video",
author = "Karel Moens and Jonas De Vylder and Matthew B. Blaschko and Tinne Tuytelaars",
abstract = "During minimally invasive surgery, surgeons monitor their actions and the relevant tissue through a camera. This provides an ideal environment for artificial intelligence (AI) assisted surgery. For the development of such AI components, the need for expert annotations remains a key bottleneck. In this paper, we study the application of self-supervised learning (SSL) on surgical data. In a self-supervised setting, a representation backbone is trained on information that is inherently present in the data. There is no need for annotations, leaving the backbone free to train on all recordings, not just labeled ones. We leveraged optical flow for weighting pairs in a view-contrastive self-supervised learning loss. Constructed as an Info Noise-Contrastive Estimation (InfoNCE) loss, it contrasted the pixel representations of two differently, photometrically and geometrically transformed views. The importance of each contrasted pixel pair is determined by computing the difference between the optical flows of the respective pixels. In this way, the optical flow guided the representations of pixels that move together to similar vectors. We tested the usefulness of the representation vectors by training simple networks for semantic segmentation or robotic instrument key point detection. These networks showed competitive performance, even when using over 92\% fewer annotated samples than other works. For semantic segmentation, we used as little as 99.73\% fewer samples for training, originating from the m2caiSeg dataset, and remained competitive even when testing on the unseen cholecSeg8k dataset.",
pages = "986--1010"
}
@InProceedings{loizillon2024,
title = "Detecting Brain Anomalies in Clinical Routine with the $\beta$-VAE: Feasibility Study on Age-Related White Matter Hyperintensities",
author = "Sophie Loizillon and Yannick Jacob and Maire Aur\'elien and Didier Dormont and Olivier Colliot and Ninon Burgos and Apprimage Study Group",
abstract = "This experimental study assesses the ability of variational autoencoders (VAEs) to perform anomaly detection in clinical routine, in particular the detection of age-related white matter lesions in brain MRIs acquired at different hospitals and gathered in a clinical data warehouse (CDW). We pre-trained a state-of-the-art $\beta$-VAE on a healthy cohort of over 10,000 FLAIR MR images from the UK Biobank to learn the distribution of healthy brains. The model was then fine-tuned on a cohort of nearly 700 healthy FLAIR images coming from a CDW. We first ensured the good performance of our pre-trained model compared with the state-of-the-art using a widely used public dataset (MSSEG). We then validated it on our target task, age-related WMH detection, on ADNI3 and on a curated clinical dataset from a single-site neuroradiology department, for which we had manually delineated lesion masks. Next, we applied the fine-tuned $\beta$-VAE for anomaly detection in a CDW characterised by an exceptional heterogeneity in terms of hospitals, scanners and image quality. We found a correlation between the Fazekas scores extracted from the radiology reports and the volumes of the lesions detected by our model, providing a first insight into the performance of VAEs in a clinical setting. We also observed that our model was robust to image quality, which strongly varies in the CDW. However, despite these encouraging results, such approach is not ready for an application in clinical routine yet due to occasional failures in detecting certain lesions, primarily attributed to the poor quality of the images reconstructed by the VAE.",
pages = "903--917"
}
@InProceedings{hu2024,
title = "ICL-SAM: Synergizing In-context Learning Model and SAM in Medical Image Segmentation",
author = "Jiesi Hu and Yang Shang and Yanwu Yang and Xutao Guo and Hanyang Peng and Ting Ma",
abstract = "Medical image segmentation, a field facing domain shifts due to diverse imaging modal- ities and biomedical domains, has made strides with the development of robust models. The In-Context Learning (ICL) model, like UniverSeg, demonstrates robustness to domain shifts with support image-label pairs in varied medical imaging segmentation tasks. How- ever, its performance is still unsatisfied. On the other hand, the Segment Anything Model (SAM) stands out as a powerful universal segmentation model. In this work, we intro- duce a novel methodology, ICL-SAM, that integrates the superior performance of SAM with the ICL model to create more effective segmentation models within the in-context learning paradigm. Our approach employs SAM to refine segmentation results from ICL model and leverages ICL model to generate prompts for SAM, eliminating the need for manual prompt provision. Additionally, we introduce a semantic confidence map gener- ation method into our framework to guide the prediction of both ICL model and SAM, thereby further enhancing segmentation accuracy. Our method has been extensively eval- uated across multiple medical imaging contexts, including fundus, MRI, and CT images, spanning five datasets. The results demonstrate significant performance improvements, particularly in settings with few support pairs, where our method can achieve over a 10\% increase in the Dice coefficient compared to cutting edge ICL model. Our code will be publicly available.",
pages = "641--656"
}
@InProceedings{hoebel2024,
title = "Beyond Structured Attributes: Image-Based Predictive Trends for Chest X-Ray Classification",
author = "Katharina V Hoebel and Jesseba Fernando and William Lotter",
abstract = {A commonly emphasized challenge in medical AI is the drop in performance when testing on data from institutions other than those used for training. However, even if models trained on distinct datasets perform similarly well overall, they may still exhibit other systematic differences. Here, we study these potential dataset-centric prediction variations using two popular chest x-ray datasets, CheXpert (CXP) and MIMIC-CXR (MMC). While CXP-trained models generally perform better on CXP than MMC test data and vice versa, this performance decrease is not uniform across individual images. We find that image-level variations in predictions are not random but can be inferred well above chance, even for pathologies where the overall performance gap is small, suggesting that there are systematic tendencies of models trained on different datasets. Furthermore, these ``predictive tendencies'' are not solely explained by image statistics or attributes like radiographic position or patient sex, but rather are pathology-specific and related to higher-order image characteristics. Our findings stress the complexity of AI robustness and generalization, highlighting the need for a nuanced approach that especially considers the diversity of pathology presentation.},
pages = "610--640"
}
@InProceedings{shi2024,
title = "ThickV-Stain: Unprocessed Thick Tissues Virtual Staining for Rapid Intraoperative Histology",
author = "Lulin Shi and Xingzhong Hou and Ivy H. M. Wong and Simon C. K. Chan and Zhenghui Chen and Claudia T. K. Lo and Terence T. W. Wong",
abstract = "Virtual staining has shown great promise in realizing a rapid and low-cost clinical alternative for pathological examinations, eliminating the need for chemical reagents and laborious staining procedures. However, most of the previous studies mainly focus on thin slice samples, which still require tissue sectioning and are unsuitable for intraoperative use. In this paper, we propose a multi-scale model to virtually stain label-free and slide-free biological tissues, allowing hematoxylin- and eosin- (H\&E) staining generation in less than a minute for an image with 100 million pixels. We name this ThickV-Stain model, specifically developed to virtually stain intricated and unprocessed thick tissues. We harness the ability of a multi-scale network to encourage the model to capture multiple-level micromorphological characteristics from low-resolution images. Experimental results highlight the advantages of our multi-scale method for virtual staining on unprocessed thick samples. We also show the effectiveness of ThickV-Stain on thin sections, showing generalizability to other clinical workflows. The proposed method enables us to obtain virtually stained images from unstained samples within minutes and can be seamlessly integrated with downstream pathological analysis tasks, providing an efficient alternative scheme for intraoperative assessment as well as general pathological examination.",
pages = "1434--1447"
}
@InProceedings{moris2024,
title = "Semi-supervised learning with Noisy Students improves domain generalization in optic disc and cup segmentation in uncropped fundus images",
author = "Eugenia Moris and Ignacio Larrabide and Jos\'e Ignacio Orlando",
abstract = "Automated optic disc (OD) and cup (OC) segmentation in fundus images has been widely explored for computer-aided diagnosis of glaucoma. However, existing models usually suffer from drops in performance when applied on images significantly different than those used for training.Several domain generalization strategies have been introduced to mitigate this issue, although they are trained and evaluated using images manually cropped around the optic nerve head. This operation eliminates most sources of domain variation, therefore overestimating their actual ability to cope with new, unseen patterns. In this paper, we analyze the most recent and accurate methods for domain generalization in OD/OC segmentation by applying them on uncropped fundus pictures, observing notorious degradations in their performance when trained and evaluated under this setting.To overcome their drawbacks, we also introduce a simple semi-supervised learning approach for domain generalization based on the Noisy Student framework.Using a Teacher model trained on a combination of domains, we pseudo-labeled a dataset of 18.000 originally unlabeled images that are then used for training a Student model.This semi-supervised setting allowed the Student network to capture additional sources of variability while retaining the original cues and patterns used by the Teacher through the weak annotations.Our results on eight different public datasets show improvements in every unseen domain over all alternative methods, and are available in https://github.com/eugeniaMoris/Noisy\_student\_ODOC\_MIDL\_2024.",
pages = "1056--1072"
}
@InProceedings{mordacq2024,
title = "ADAPT: Multimodal Learning for Detecting Physiological Changes under Missing Modalities",
author = "Julie Mordacq and Leo Milecki and Maria Vakalopoulou and Steve Oudot and Vicky Kalogeiton",
abstract = "Multimodality has recently gained attention in the medical domain, where imaging or video modalities may be integrated with biomedical signals or health records. Yet, two challenges remain: balancing the contributions of modalities, especially in cases with a limited amount of data available, and tackling missing modalities. To address both issues, in this paper, we introduce the AnchoreD multimodAl Physiological Transformer (ADAPT), a multimodal, scalable framework with two key components: (i) aligning all modalities in the space of the strongest, richest modality (called anchor) to learn a joint embedding space, and (ii) a Masked Multimodal Transformer, leveraging both inter- and intra-modality correlations while handling missing modalities. We focus on detecting physiological changes in two real-life scenarios: stress in individuals induced by specific triggers and fighter pilots\' loss of consciousness induced by g-forces. We validate the generalizability of ADAPT through extensive experiments on two datasets for these tasks, where we set the new state of the art while demonstrating its robustness across various modality scenarios and its high potential for real-life applications. Our code is available at https://github.com/jumdc/ADAPT.git.",
pages = "1040--1055"
}
@InProceedings{islam2024,
title = "Uncertainty-aware retinal layer segmentation in OCT through probabilistic signed distance functions",
author = "Mohammad Mohaiminul Islam and Coen de Vente and Bart Liefers and Caroline Klaver and Erik J Bekkers and Clara I. S\'anchez",
abstract = "In this paper, we present a new approach for uncertainty-aware retinal layer segmentation in Optical Coherence Tomography (OCT) scans using probabilistic signed distance functions (SDF). Traditional pixel-wise and regression-based methods primarily encounter difficulties in precise segmentation and lack of geometrical grounding respectively. To address these shortcomings, our methodology refines the segmentation by predicting a signed distance function (SDF) that effectively parameterizes the retinal layer shape via level set. We further enhance the framework by integrating probabilistic modeling, applying Gaussian distributions to encapsulate the uncertainty in the shape parameterization. This ensures a robust representation of the retinal layer morphology even in the presence of ambiguous input, imaging noise, and unreliable segmentations. Both quantitative and qualitative evaluations demonstrate superior performance when compared to other methods. Additionally, we conducted experiments on artificially distorted datasets with various noise types—shadowing, blinking, speckle, and motion—common in OCT scans to showcase the effectiveness of our uncertainty estimation. Our findings demonstrate the possibility of obtaining reliable segmentation of retinal layers, as well as an initial step towards the characterization of layer integrity, a key biomarker for disease progression. Our code is available at \url{https://github.com/niazoys/RLS\_PSDF}.",
pages = "672--693"
}
@InProceedings{patel2024,
title = "Resolution and Field of View Invariant Generative Modelling with Latent Diffusion Models",
author = "Ashay Patel and Mark S Graham and Vicky Goh and Sebastien Ourselin and M. Jorge Cardoso",
abstract = "Large dataset requirements for deep learning methods can pose a challenge in the medical field, where datasets tend to be relatively small. Synthetic data can provide a suitable solution to this problem, when complemented with real data. However current generative methods normally require all data to be of the same resolution and, ideally, aligned to an atlas. This not only creates more stringent restrictions on the training data but also limits what data can be used for inference. To overcome this our work proposes a latent diffusion model that is able to control sample geometries by varying their resolution, field of view, and orientation. We demonstrate this work on whole body CT data, using a spatial conditioning mechanism. We showcase how our model provides samples as good as an ordinary latent diffusion model trained fully on whole body single resolution data. This is in addition to the benefit of further control over resolution, field of view, orientation, and even the emergent behaviour of super-resolution. We found that our model could create realistic images across the varying tasks showcasing the potential of this application.",
pages = "1086--1097"
}
@InProceedings{builtjes2024,
title = "Evaluating ChatGPT's Performance in Generating and Assessing Dutch Radiology Report Impressions",
author = "Luc Builtjes and Monique Brink and Souraya Belkhir and Bram van Ginneken and Alessa Hering",
abstract = "The integration of Large Language Models (LLMs), such as ChatGPT, in radiology couldoffer insight and interpretation to the increasing number of radiological findings generatedby Artificial Intelligence (AI). However, the complexity of medical text presents many chal-lenges for LLMs, particularly in uncommon languages such as Dutch. This study thereforeaims to evaluate ChatGPT’s ability to generate accurate ‘Impression’ sections of radiol-ogy reports, and its effectiveness in evaluating these sections compared against humanradiologist judgments. We utilized a dataset of CT-thorax radiology reports to fine-tuneChatGPT and then conducted a reader study with two radiologists and GPT-4 out-of-the-box to evaluate the AI-generated ‘Impression’ sections in comparison to the originals. Theresults revealed that human experts rated original impressions higher than AI-generatedones across correctness, completeness, and conciseness, highlighting a gap in the AI’s abil-ity to generate clinically reliable medical text. Additionally, GPT-4’s evaluations weremore favorable towards AI-generated content, indicating limitations in its out-of-the-boxuse as an evaluator in specialized domains. The study emphasizes the need for cautiousintegration of LLMs into medical domains and the importance of expert validation, yetalso acknowledges the inherent subjectivity in interpreting and evaluating medical reports.",
pages = "168--183"
}
@InProceedings{ercan2024,
title = "Predicting DNA Content Abnormalities in Barrett’s Esophagus: A Weakly Supervised Learning Paradigm",
author = "Caner Ercan and Xiaoxi Pan and Thomas G. Paulson and Matthew D. Stachler and Carlo C. Maley and William M. Grady and Yinyin Yuan",
abstract = "Barrett’s esophagus (BE) is the sole precursor to esophageal adenocarcinoma (EAC), and is an opportunity for developing biomarkers for cancer risk assessment. DNA content abnormalities, including aneuploidy, have been implicated in the progression to EAC in BE patients, but molecular assays require valuable tissue for its detection. We propose utilizing images from routine histology to detect ploidy status using deep learning.Employing a weakly supervised deep learning approach, multi-instance learning (MIL), we trained a model to predict ploidy using hematoxylin and eosin-stained whole slide images of endoscopic biopsies and flow cytometry results. The study introduces a novel data augmentation method for MIL, sequentially altering features from original and augmented images during training loops. This method improved the average area under curve (AUC) from 0.43, 0.64 and 0.81 for ResNet50, DenseNet121 and REMEDIS foundation model, respectively (training without any augmentation), to 0.61, 0.87 and 0.91 with the proposed augmentation strategy.The top-performing model, using REMEDIS foundation model as the backbone, achieved 0.93 AUC and 0.83 balanced accuracy to predict aneuploidy in the test cohort biopsies (n=279). Across all the patients (n=123), predicted aneuploidy status was correlated with progression to EAC (p=6.55e-06), similar to correlation with ploidy status based on flow cytometry results (p=2.84e-7). Supporting the findings, histologic nuclear features typically associated with dysplasia and DNA content abnormalities such as enlarged, hyperchromatic nuclei and loss of nuclear polarity, were seen in the samples called abnormal compared to the control diploid samples.In conclusion, our deep learning model efficiently predicts aneuploidy, a mechanism that has been shown to underpin BE progression to EAC. This method, preserving precious biopsy tissues, complements routine histology, offering potential for identifying individuals at high risk of progression through molecular-based advancements.",
pages = "426--438"
}
@InProceedings{sideri-lampretsa2024,
title = "SINR: Spline-enhanced implicit neural representation for multi-modal registration",
author = "Vasiliki Sideri-Lampretsa and Julian McGinnis and Huaqi Qiu and Magdalini Paschali and Walter Simson and Daniel Rueckert",
abstract = "Deformable image registration has undergone a transformative shift with the advent of deep learning. While convolutional neural networks (CNNs) allow for accelerated registration, they exhibit reduced accuracy compared to iterative pairwise optimization methods and require extensive training cohorts. Based on the advances in representing signals with neural networks, implicit neural representations (INRs) have emerged in the registration community to model dense displacement fields continuously. Using a pairwise registration setup, INRs mitigate the bias learned over a cohort of patients while leveraging advanced methodology and gradient-based optimization. However, the coordinate sampling scheme makes dense transformation parametrization with an INR prone to generating physiologically implausible configurations resulting in spatial folding. In this paper, we introduce SINR - a method to parameterize the continuous deformable transformation represented by an INR using Free Form Deformations (FFD). SINR allows for multi-modal deformable registration while mitigating folding issues found in current INR-based registration methods. SINR outperforms existing state-of-the-art methods on both 3D mono- and multi-modal brain registration on the CamCAN dataset, demonstrating its capabilities for pairwise mono- and multi-modal image registration.",
pages = "1462--1474"
}
@InProceedings{molnar2024,
title = "IHCScoreGAN: An unsupervised generative adversarial network for end-to-end ki67 scoring for clinical breast cancer diagnosis",
author = "Carl Molnar and Thomas E. Tavolara and Christopher A. Garcia and David S. McClintock and Mark D. Zarella and Wenchao Han",
abstract = "Ki67 is a biomarker whose activity is routinely measured and scored by pathologists through immunohistochemistry (IHC) staining, which informs clinicians of patient prognosis and guides treatment. Currently, most clinical laboratories rely on a tedious, inconsistent manual scoring process to quantify the percentage of Ki67-positive cells. While many works have shown promise for Ki67 quantification using computational approaches, the current state-of-the-art methods have limited real-world feasibility: they either require large datasets of meticulous cell-level ground truth labels to train, or they provide pre-trained weights that may not generalize well to in-house data. To overcome these challenges, we propose IHCScoreGAN, the first unsupervised deep learning framework for end-to-end Ki67 scoring without the need for any ground truth labels. IHCScoreGAN only requires IHC image samples and unpaired synthetic data, yet it learns to generate colored cell segmentation masks while simultaneously predicting cell center point and biomarker expressions for Ki67 scoring, made possible through our novel dual-branch generator structure. We validated our framework on a large cohort of 2,136 clinically signed-out cases, yielding an accuracy of 0.97 and an F1-score of 0.95 and demonstrating substantially better performance than a pre-trained state-of-the-art supervised model. By removing ground truth requirements, our unsupervised technique constitutes an important step towards easily-trained Ki67 scoring solutions which can train on out-of-domain data in an unsupervised manner.",
pages = "1011--1025"
}
@InProceedings{shaharabany2024,
title = "Zero-Shot Medical Image Segmentation Based on Sparse Prompt Using Finetuned SAM",
author = "Tal Shaharabany and Lior Wolf",
abstract = "Segmentation of medical images plays a critical role in various clinical applications, facilitat- ing precise diagnosis, treatment planning, and disease monitoring. However, the scarcity of annotated data poses a significant challenge for training deep learning models in the medical imaging domain. In this paper, we propose a novel approach for minimally-guided zero-shot segmentation of medical images using the Segment Anything Model (SAM), orig- inally trained on natural images. The method leverages SAM’s ability to segment arbitrary objects in natural scenes and adapts it to the medical domain without the need for labeled medical data, except for a few foreground and background points on the test image it- self. To this end, we introduce a two-stage process, involving the extraction of an initial mask from self-similarity maps and test-time fine-tuning of SAM. We run experiments on diverse medical imaging datasets, including AMOS22, MoNuSeg and the Gland segmen- tation (GlaS) challenge, and demonstrate the effectiveness of our approach.",
pages = "1387--1400"
}
@InProceedings{kamath2024,
title = "Comparing the Performance of Radiation Oncologists versus a Deep Learning Dose Predictor to Estimate Dosimetric Impact of Segmentation Variations for Radiotherapy",
author = "Amith Jagannath Kamath and Zahira Mercado Auf der Maur and Robert Poel and Jonas Willmann and Ekin Ermis and Elena Riggenbach and Nicolaus Andratschke and Mauricio Reyes",
abstract = "Current evaluation methods for quality control of manual/automated tumor and organs-at- risk segmentation for radiotherapy are driven mostly by geometric correctness. It is however known that geometry-driven segmentation quality metrics cannot characterize potentially detrimental dosimetric effects of sub-optimal tumor segmentation. In this work, we build on prior studies proposing deep learning-based dose prediction models to extend its use for the task of contour quality evaluation of brain tumor treatment planning. Using a test set of 54 contour variants and their corresponding dose plans, we show that our model can be used to dosimetrically assess the quality of contours and can outperform clinical expert radiation oncologists while estimating sub-optimal situations. We compare results against three such experts and demonstrate improved accuracy in addition to time savings. Our code is available at https://github.com/ubern-mia/radonc-vs-dldp.",
pages = "742--753"
}
@InProceedings{seince2024,
title = "Dense Self-Supervised Learning for Medical Image Segmentation",
author = {Maxime Seince and Lo{\"\i}c Le Folgoc and Luiz Facury De Souza and Elsa Angelini},
abstract = "Deep learning has revolutionized medical image segmentation, but it relies heavily on high-quality annotations. The time, cost and expertise required to label images at the pixel-level for each new task has slowed down widespread adoption of the paradigm. We propose Pix2Rep, a self-supervised learning (SSL) approach for few-shot segmentation, that reduces the manual annotation burden by learning powerful pixel-level representations directly from unlabeled images. Pix2Rep is a novel pixel-level loss and pre-training paradigm for contrastive SSL on whole images. It is applied to generic encoder-decoder deep learning backbones (e.g., U-Net). Whereas most SSL methods enforce invariance of the learned image-level representations under intensity and spatial image augmentations, Pix2Rep enforces equivariance of the pixel-level representations. We demonstrate the framework on a task of cardiac MRI segmentation. Results show improved performance compared to existing semi- and self-supervised approaches; and a 5-fold reduction in the annotation burden for equivalent performance versus a fully supervised U-Net baseline. This includes a 30\% (resp. 31\%) DICE improvement for one-shot segmentation under linear-probing (resp. fine-tuning). Finally, we also integrate the novel Pix2Rep concept with the Barlow Twins non-contrastive SSL, which leads to even better segmentation performance.",
pages = "1371--1386"
}
@InProceedings{andresen2024,
title = "FluidRegNet: Longitudinal registration of retinal OCT images with new pathological fluids",
author = "Julia Andresen and Jan Ehrhardt and Claus von der Burchard and Ayse Tatli and Johann Roider and Heinz Handels and Hristina Uzunova",
abstract = "Eye diseases such as the chronic central serous chorioretinopathy are characterized by fluid deposits that alter the retina and impair vision. These fluids occur at irregular intervals and may dissolve spontaneously or thanks to treatment. Accurately capturing this behavior within an image registration framework is challenging due to the resulting prominent tissue deformations and missing image correspondences between visits. This paper presents FluidRegNet, a convolutional neural network for the registration of successive optical coherence tomography images of the retina. The correspondence between time points is established by predicting the position of the origin of the fluids by creating a fluid seed in the form of sparse intensity offsets in the moving image and registering the fluid seed to the affected area in the follow-up image. We show that this leads to deformation fields that more accurately reflect the actual dynamics of retinal fluid growth compared to other image registration methods. In addition, the network outputs are used for unsupervised fluid segmentation.",
pages = "48--60"
}
@InProceedings{shokrollahi2024,
title = "Advancing Multiplex Immunofluorescence Imaging Cell Detection using Semi-Supervised Learning with Pseudo-Labeling",
author = "Yasin Shokrollahi and Karina Pinao Gonzales and Maria Esther Salvatierra and Simon P. Castillo and Tanishq Gautam and Pingjun Chen and B. Leticia Rodriguez and Sara Ranjbar and Patient Mosaic Team and Luisa Solis Soto and Yinyin Yuan and Xiaoxi Pan",
abstract = "Accurate cell detection in multiplex immunofluorescence (mIF) is crucial for quantifying and analyzing the spatial distribution of complex cellular patterns within the tumor microenvironment. Despite its importance, cell detection in mIF is challenging, primarily due to difficulties obtaining comprehensive annotations. To address the challenge of limited and unevenly distributed annotations, we introduced a streamlined semi-supervised approach that effectively leveraged partially pathologist-annotated single-cell data in multiplexed images across different cancer types. We assessed three leading object detection models, Faster R-CNN, YOLOv5s, and YOLOv8s, with partially annotated data, selecting YOLOv8s for optimal performance. This model was subsequently used to generate pseudo labels, which enriched our dataset by adding more detected labels than the original partially annotated data, thus increasing its generalization and the comprehensiveness of cell detection. By fine-tuning the detector on the original dataset and the generated pseudo labels, we tested the refined model on five distinct cancer types using fully annotated data by pathologists. Our model achieved an average precision of 90.42\%, recall of 85.09\%, and an F1 Score of 84.75\%, underscoring our semi-supervised model\'s robustness and effectiveness. This study contributes to analyzing multiplexed images from different cancer types at cellular resolution by introducing sophisticated object detection methodologies and setting a novel approach to effectively navigate the constraints of limited annotated data with semi-supervised learning.",
pages = "1448--1461"
}
@InProceedings{chelebian2024,
title = "Learned morphological features guide cell type assignment of deconvolved spatial transcriptomics",
author = "Eduard Chelebian and Christophe Avenel and Julio Leon and Chung-Chau Hon and Carolina Wahlby",
abstract = "Spatial transcriptomics enables to study the relationship between gene expression and tissue organization. Despite many recent advancements, existing sequencing-based methods have a spatial resolution that limits identification of individual cells. To address this, several cell type deconvolution methods have been proposed to integrate spatial gene expression with single-cell and single-nucleus RNA sequencing, producing per spot cell typing. However, these methods often overlook the contribution of morphology, which means cell identities are randomly assigned to the nuclei within a spot. In this paper, we introduce MHAST, a morphology-guided hierarchical permutation-based framework which efficiently reassigns cell types in spatial transcriptomics. We validate our method on simulated data, synthetic data, and a use case on the broadly used Tangram cell type deconvolution method with Visium data. We show that deconvolution-based cell typing using morphological tissue features from self-supervised deep learning lead to a more accurate annotation of the cells.",
pages = "220--233"
}
@InProceedings{cho2024,
title = "Pretraining Vision-Language Model for Difference Visual Question Answering in Longitudinal Chest X-rays",
author = "Yeongjae Cho and Taehee Kim and Heejun Shin and Sungzoon Cho and Dongmyung Shin",
abstract = "Difference visual question answering (diff-VQA) is a challenging task that requires answering complex questions based on differences between a pair of images. This task is particularly important in reading chest X-ray images because radiologists often compare multiple images of the same patient taken at different times to track disease progression and changes in its severity in their clinical practice. However, previous works focused on designing specific network architectures for the diff-VQA task, missing opportunities to enhance the model\'s performance using a pretrained vision-language model (VLM). Here, we introduce a novel VLM called PLURAL, which is pretrained on natural and longitudinal chest X-ray data for the diff-VQA task. The model is developed using a step-by-step approach, starting with being pretrained on natural images and texts, followed by being trained using longitudinal chest X-ray data. The longitudinal data consist of pairs of X-ray images, along with question-answer sets and radiologist’s reports that describe the changes in lung abnormalities and diseases over time. Our experimental results show that the PLURAL model outperforms state-of-the-art methods not only in diff-VQA for longitudinal X-rays but also in conventional VQA for a single X-ray image. Through extensive experiments, we demonstrate the effectiveness of the proposed VLM architecture and pretraining method in improving the model’s performance.",
pages = "263--275"
}
@InProceedings{heinrich2024,
title = "Implicit neural obfuscation for privacy preserving medical image sharing",
author = "Mattias P Heinrich and Lasse Hansen",
abstract = "Despite its undeniable success, deep learning for medical imaging with large public datasets leads to an often overlooked risk of leaking sensitive patient information. A person\'s X-ray, even with proper anonymisation applied, can readily serve as fingerprint and would enable a highly accurate re-identification of the same individual in a large pool of scans. Common practices for reducing privacy risks involve a synthetic deterioration of image quality, e.g. by adding noise or downsampling images, before sharing them publicly. Yet, this also adversely affects the quality of downstream image recognition models trained on such datasets. We propose a novel strategy for finding a better compromise of model quality and privacy preservation by means of implicit neural obfuscation. Our method jointly overfits a neural network to a small batch of patients\' X-ray scans and applies a substantial compression - the number of network parameters representing the images is more than 6x smaller than the original images. In addition, we introduce a k-anonymity mixing that injects partial information from other patients for each reconstruction. That way identifiable information is efficiently obfuscated, while we manage to maintain the quality of relevant image parts for the intended downstream task. Experimental validation on the public RANZCR CLiP dataset demonstrates improved segmentation quality and up to 3 times reduced privacy risks compared to a more basic image obfuscation baselines. In contrast to other recent work that learn specific anonymous representations, which no longer resemble visually meaningful scans, our approach remains interpretable and is not tied to a certain downstream network. Source code and a demo dataset are available at https://github.com/mattiaspaul/neuralObfuscation.",
pages = "596--609"
}
@InProceedings{yuksel2024,
title = "Style Randomization Improves the Robustness of Breast Density Estimation in MR Images",
author = "Goksenin Yuksel and Koen Eppenhof and Jaap Kroes and Marcel Worring",
abstract = "Breast density, a crucial risk factor for future breast cancer development, is defined bythe ratio of fat to fibro-glandular tissue (FGT) in the breast. Accurate breast and FGTsegmentation is essential for robust density estimation. Previous research on FGT segmen-tation in MRI has highlighted the significance of training on both images with and withoutfat suppression to enhance network’s robustness. In this study, we propose a novel dataaugmentation technique to further exploit the multi-modal training setup motivated by theresearch in style randomization. We demonstrate that the network trained with the pro-posed augmentation is resilient to variations in fat content, showcasing improved robustnesscompared to solely training with multi-modal data. Our method effectively improves FGTsegmentation, thereby enhancing the overall reliability of breast density estimation",
pages = "1841--1850"
}
@InProceedings{philipp2024,
title = "Annotation-Efficient Strategy for Segmentation of 3D Body Composition",
author = "Lena Philipp and Maarten de Rooij and John Hermans and Matthieu Rutten and Horst Karl Hahn and Bram van Ginneken and Alessa Hering",
abstract = "Body composition as a diagnostic and prognostic biomarker is gaining importance in various medical fields such as oncology. Therefore, accurate quantification methods are necessary, like analyzing CT images. While several studies introduced deep learning approaches to automatically segment a single slice, quantifying body composition in 3D remains understudied due to the high required annotation effort. This study proposes an annotation-efficient strategy using an iterative self-learning approach with sparse annotations to develop a segmentation model for the abdomen and pelvis, significantly reducing manual annotation needs. The developed model demonstrates outstanding performance with Dice scores for skeletal muscle (SM): 0.97+/-0.01, inter-/intra-muscular adipose tissue (IMAT): 0.83 +/- 0.07, visceral adipose tissue (VAT): 0.94 +/-0.04, and subcutaneous adipose tissue (SAT): 0.98 +/-0.02. A reader study supported these findings, indicating that most cases required negligible to no correction for accurate segmentation for SM, VAT and SAT. The variability in reader evaluations for IMAT underscores the challenge of achieving consensus on its quantification and signals a gap in our understanding of the precision required for accurately assessing this tissue through CT imaging. Moreover, the findings from this study offer advancements in annotation efficiency and present a robust tool for body composition analysis, with potential applications in enhancing diagnostic and prognostic assessments in clinical settings.",
pages = "1107--1127"
}
@InProceedings{rahman2024,
title = "UltraMAE: Multi-modal Masked Autoencoder for Ultrasound Pre-training",
author = "Aimon Rahman and Vishal M. Patel",
abstract = "Pre-training on a large dataset such as ImageNet followed by supervised fine-tuning has brought success in various deep learning-based tasks. However, the modalities of natural images and ultrasound images have considerable differences, making pre-training on natural images ineffective for ultrasound-related tasks. In this paper, we introduce a unified masking-based model for both ultrasound images and videos that learns better visual representation than the network with single-modality representations. This is the first large-scale generalized ultrasound pre-training network that simultaneously utilizes 100,000+ videos and images of different parts of the human anatomy such as the liver, bones, heart, thyroids, nerves, etc, making the network an effective benchmark pretrained model for any ultrasound-specific downstream tasks. We propose a novel method for ultrasound image analysis that utilizes an ultrasound-specific confidence map to guide low-level representation learning through masked feature acquisition. Our pre-trained network has demonstrated remarkable efficacy and versatility in tackling both classification and segmentation tasks across a range of ultrasound pathologies, highlighting its potential for widespread adoption and impact in the ultrasound field. In addition, we show that our pre-training model can be leveraged to learn efficiently with a small number of labeled ultrasound images.",
pages = "1196--1206"
}
@InProceedings{wang2024a,
title = "Joint Motion Estimation with Geometric Deformation Correction for Fetal Echo Planar Images Via Deep Learning",
author = "Jian Wang and Razieh Faghihpirayesh and Deniz Erdogmus and Ali Gholipour",
abstract = "In this paper, we introduce a novel end-to-end predictive model for efficient fetal motion correction using deep neural networks. Diverging from conventional methods that estimate fetal brain motions and geometric distortions separately, our approach introduces a newly developed joint learning framework that not only reliably estimates various degrees of rigid movements, but also effectively corrects local geometric distortions of fetal brain images. Specifically, we first develop a method to learn rigid motion through a closed-form update integrated into network training. Subsequently, we incorporate a diffeomorphic deformation estimation model to guide the motion correction network, particularly in regions where local distortions and deformations occur. To the best of our knowledge, our study is the first to simultaneously track fetal motion and address geometric deformations in fetal echo-planar images. We validated our model using real fetal functional magnetic resonance imaging data with simulated and real motions. Our method demonstrates significant practical value to measure, track, and correct fetal motion in fetal MRI.",
pages = "1640--1651"
}
@InProceedings{fathi2024,
title = "DeCoDEx: Confounder Detector Guidance for Improved Diffusion-based Counterfactual Explanations",
author = "Nima Fathi and Amar Kumar and Brennan Nichyporuk and Mohammad Havaei and Tal Arbel",
abstract = "Deep learning classifiers are prone to latching onto dominant confounders present in a dataset rather than on the causal markers associated with the target class, leading to poor generalization and biased predictions. Although explainability via counterfactual image generation has been successful at exposing the problem, bias mitigation strategies that permit accurate explainability in the presence of dominant and diverse artifacts remain unsolved. In this work, we propose the DeCoDEx framework and show how an external, pre-trained binary artifact detector can be leveraged during inference to guide a diffusion-based counterfactual image generator towards accurate explainability. Experiments on the CheXpert dataset, using both synthetic artifacts and real visual artifacts (support devices), show that the proposed method successfully synthesizes the counterfactual images that change the causal pathology markers associated with Pleural Effusion while preserving or ignoring the visual artifacts. Augmentation of ERM and Group-DRO classifiers with the DeCoDEx generated images substantially improves the results across underrepresented groups that are out of distribution for each class. The code is made publicly available at https://github.com/NimaFathi/DeCoDEx.",
pages = "439--451"
}
@InProceedings{kulkarni2024a,
title = "Hidden in Plain Sight: Undetectable Adversarial Bias Attacks on Vulnerable Patient Populations",
author = "Pranav Kulkarni and Andrew Chan and Nithya Navarathna and Skylar Chan and Paul Yi and Vishwa Sanjay Parekh",
abstract = "The proliferation of artificial intelligence (AI) in radiology has shed light on the risk of deep learning (DL) models exacerbating clinical biases towards vulnerable patient populations. While prior literature has focused on quantifying biases exhibited by trained DL models, demographically targeted adversarial bias attacks on DL models and its implication in the clinical environment remains an underexplored field of research in medical imaging. In this work, we demonstrate that demographically targeted label poisoning attacks can introduce undetectable underdiagnosis bias in DL models. Our results across multiple performance metrics and demographic groups like sex, age, and their intersectional subgroups show that adversarial bias attacks demonstrate high-selectivity for bias in the targeted group by degrading group model performance without impacting overall model performance. Furthermore, our results indicate that adversarial bias attacks result in biased DL models that propagate prediction bias even when evaluated with external datasets.",
pages = "793--821"
}
@InProceedings{roy2024,
title = "Anomaly-focused Single Image Super-resolution with Artifact Removal for Chest X-rays using Distribution-aware Diffusion Model",
author = "Dattatreyo Roy and Angshuman Paul",
abstract = "Single image super-resolution (SISR) is a crucial task in the field of medical imaging. It transforms low-resolution images into high-resolution counterparts. Performing SISR on chest x-ray images enhances image quality, aiding better diagnosis. However, artifacts may be present in the images. We propose an anomaly-guided SISR process utilizing the denoising mechanism of the diffusion model to iteratively remove noise and restore the original image. We train the model to learn the data distribution, enabling it to eliminate artifacts within the images. Additionally, we ensure reconstruction of the disease regions by prioritizing their reconstruction. Our research experiment over the publicly available dataset and find that the existing SISR methods are unable to learn and remove these artificially added artifacts. On the other hand, our proposed model not only prioritizes superior image reconstruction but also remove the artifacts. Our method is found to outperform the existing methods. The code is publicly available at https://github.com/Datta-IITJ/MIDL\_code.git.",
pages = "1297--1309"
}
@InProceedings{dutt2024,
title = "Parameter-Efficient Fine-Tuning for Medical Image Analysis: The Missed Opportunity",
author = "Raman Dutt and Linus Ericsson and Pedro Sanchez and Sotirios A. Tsaftaris and Timothy Hospedales",
abstract = "Foundation models have significantly advanced medical image analysis through the pre-train fine-tune paradigm. Among various fine-tuning algorithms, Parameter-Efficient Fine-Tuning (PEFT) is increasingly utilized for knowledge transfer across diverse tasks, including vision-language and text-to-image generation. However, its application in medical image analysis is relatively unexplored due to the lack of a structured benchmark for evaluating PEFT methods. This study fills this gap by evaluating 17 distinct PEFT algorithms across convolutional and transformer-based networks on image classification and text-to-image generation tasks using six medical datasets of varying size, modality, and complexity. Through a battery of over 700 controlled experiments, our findings demonstrate PEFT\'s effectiveness, particularly in low data regimes common in medical imaging, with performance gains of up to 22\% in discriminative and generative tasks. These recommendations can assist the community in incorporating PEFT into their workflows and facilitate fair comparisons of future PEFT methods, ensuring alignment with advancements in other areas of machine learning and AI.",
pages = "406--425"
}
@InProceedings{quan2024,
title = "Slide-SAM: Medical SAM Meets Sliding Window",
author = "Quan Quan and Fenghe Tang and Zikang Xu and Heqin Zhu and S Kevin Zhou",
abstract = "The Segment Anything Model (SAM) has achieved a notable success in two-dimensional image segmentation in natural images. However, the substantial gap between medical and natural images hinders its direct application to medical image segmentation tasks. Particularly in 3D medical images, SAM struggles to learn contextual relationships between slices, limiting its practical applicability. Moreover, applying 2D SAM to 3D images requires prompting the entire volume, which is time- and label-consuming. To address these problems, we propose Slide-SAM, which treats a stack of three adjacent slices as a prediction window. It firstly takes three slices from a 3D volume and point- or bounding box prompts on the central slice as inputs to predict segmentation masks for all three slices. Subsequently, the masks of the top and bottom slices are then used to generate new prompts for adjacent slices. Finally, step-wise prediction can be achieved by sliding the prediction window forward or backward through the entire volume. Our model is trained on multiple public and private medical datasets and demonstrates its effectiveness through extensive 3D segmetnation experiments, with the help of minimal prompts. Code is available at https://github.com/Curli-quan/Slide-SAM.",
pages = "1179--1195"
}
@InProceedings{zhou2024a,
title = "Conditional Generation of 3D Brain Tumor Regions via VQGAN and Temporal-Agnostic Masked Transformer",
author = "Meng Zhou and Farzad Khalvati",
abstract = "Neuroradiology studies often suffer from lack of sufficient data to properly train deep learning models. Generative Adversarial Networks (GANs) can mitigate this problem by generating synthetic images to augment training datasets. However, GANs sometimes are unstable and struggle to produce high-resolution, realistic, and diverse images. An alternative solution is Diffusion Probabilistic Models, but these models require extensive computational resources. Additionally, most of the existing generation models are designed to generate the entire image volumes, rather than the regions of interest (ROIs) such as the tumor region. Research on brain tumor classification using magnetic resonance imaging (MRIs) has shown that it is easier to classify the ROIs compared to the entire image volumes. To this end, we present a class-conditioned ROI generation framework that combines a conditional vector-quantization GAN and a class-conditioned masked Transformer to generate high-resolution and diverse 3D brain tumor ROIs. We also propose a temporal-agnostic masking strategy to effectively learn relationships between semantic tokens in the latent space. Our experiments demonstrate that the proposed method can generate high-quality 3D MRIs of brain tumor regions for both low- and high-grade glioma (LGG/HGG) in the BraTS 2019 dataset. Using the generated data, our approach demonstrates superior performance compared to several baselines in a downstream task of brain tumor type classification. Our proposed method has the potential to facilitate accurate diagnosis of rare brain tumors using MRI-based machine learning models.",
pages = "1878--1897"
}
@InProceedings{rebbah2024,
title = "Deep blind arterial input function: signal correction in perfusion cardiac magnetic resonance",
author = "Habib Rebbah and Magalie Viallon and Pierre Croisille and Timoth\'e Boutelier",
abstract = "Objectives: The non-linear relationship between gadolinium concentration and the signal in perfusion cardiac magnetic resonance (CMR) poses a significant challenge for accurate quantification of pharmacokinetic parameters. This phenomenon primarily impacts the arterial input function (AIF), causing it to appear saturated in comparison to the temporal concentration profile. This study aims to leverage a blind deconvolution strategy through a deep-learning approach to address the saturation in the AIF.Methods: We propose the utilization of a convolutional neural network (CNN) architecture with the saturated AIF and a set of myocardial tissue signals as inputs, generating the corrected AIF as the output. To train the network, a dataset comprising over 3×10^6 simulated AIFs with associated signals from five simulated tissues response for each instance was employed. To assess the effectiveness of the approach, the trained network was evaluated using a dual-saturation sequence to compare the corrected AIF with the unsaturated version. The clinical dataset encompassed scans from 43 patients.Results: The mean square error (MSE) for the testing subset of the simulated database was 0.69\% of the peak. In the in vivo dataset, the coefficient of determination R2 was 0.26 and 0.86 for the saturated and corrected AIF, respectively, in comparison to the unsaturated AIF.Conclusion: The proposed network successfully corrects the acquisition-induced effects on the AIF. Moreover, the extensive simulated database, featuring diverse acquisition parameters, facilitates the robust generalization of the network\'s application.",
pages = "1237--1256"
}
@InProceedings{verhuelsdonk2024,
title = "Shape of my heart: Cardiac models through learned signed distance functions",
author = {Jan Verh\"ulsdonk and Thomas Grandits and Francisco Sahli Costabal and Thomas Pinetz and Rolf Krause and Angelo Auricchio and Gundolf Haase and Simone Pezzuto and Alexander Effland},
abstract = "The efficient construction of anatomical models is one of the major challenges of patientspecific in-silico models of the human heart. Current methods frequently rely on linear statistical models, allowing no advanced topological changes, or requiring medical image segmentation followed by a meshing pipeline, which strongly depends on image resolution, quality, and modality. These approaches are therefore limited in their transferability to other imaging domains. In this work, the cardiac shape is reconstructed by means of threedimensional deep signed distance functions with Lipschitz regularity. For this purpose, the shapes of cardiac MRI reconstructions are learned to model the spatial relation of multiple chambers. We demonstrate that this approach is also capable of reconstructing anatomical models from partial data, such as point clouds from a single ventricle, or modalities different from the trained MRI, such as the electroanatomical mapping (EAM).",
pages = "1584--1605"
}
@InProceedings{cheng2024,
title = "MFIF-Net: A Multi-Focal Image Fusion Network for Implantation Outcome Prediction of Blastocyst",
author = "Yi Cheng and Tingting Chen and Yaojun Hu and Xiangqian Meng and Zuozhu Liu and Danny Chen and Jian Wu and Haochao Ying",
abstract = "Accurately predicting implantation outcomes based on blastocyst developmental potential is valuable in in-vitro fertilization (IVF). Clinically, embryologists analyze multiple focal-plane images (FP-images) to comprehensively assess embryo grades, which is extremely cumbersome and easily prone to inconsistency. Developing automatic computer-aided methods for analyzing embryo images is highly desirable. However, effectively fusing multiple FP-images for prediction remains a largely under-explored issue. To this end, we propose a novel Multiple Focal-plane Image Fusion Network, called MFIF-Net, to predict implantation outcomes of blastocyst. Specifically, our MFIF-Net consists of two sub-networks: a Core Image Generation Network (CI-Gen) and a Key Feature Fusion Network (KFFNet). In CI-Gen, we fuse multiple FP-images to generate a core image by pixel-wise weighting since different FP-images can have different focus positions. To further capture key features in each FP-image, we propose KFFNet to extract key information from the FP-images again and fuse them with the core image. In KFFNet, a Fusion Module is designed to capture key information of each FP-image, for which Squeeze Multi-Headed Attention is developed to exchange features and mitigate computationally intensive issues in attention. Comprehensive experiments validate the superiority and the rationality of our MFIF-Net approach over state-of-the-art methods in various metrics. Ablation studies also confirm the positive impact of each component in our MFIF-Net.",
pages = "250--262"
}
@InProceedings{wilde2024,
title = "Medical diffusion on a budget: Textual Inversion for medical image generation",
author = "Bram de Wilde and Anindo Saha and Maarten de Rooij and Henkjan Huisman and Geert Litjens",
abstract = "Diffusion models for text-to-image generation, known for their efficiency, accessibility, and quality, have gained popularity. While inference with these systems on consumer-grade GPUs is increasingly feasible, training from scratch requires large captioned datasets and significant computational resources. In medical image generation, limited availability of large, publicly accessible datasets with text reports poses challenges due to legal and ethical concerns. This work shows that adapting pre-trained Stable Diffusion models to medical imaging modalities is achievable by training text embeddings using Textual Inversion.In this study, we experimented with small medical datasets (100 samples each from three modalities) and trained within hours to generate diagnostically accurate images, as judged by an expert radiologist. Experiments with Textual Inversion training and inference parameters reveal the necessity of larger embeddings and more examples in the medical domain. Classification experiments show an increase in diagnostic accuracy (AUC) for detecting prostate cancer on MRI, from 0.78 to 0.80. Further experiments demonstrate embedding flexibility through disease interpolation, combining pathologies, and inpainting for precise disease appearance control. Notably, the trained embeddings are compact (less than 1 MB), enabling easy data sharing with reduced privacy concerns.",
pages = "1687--1706"
}
@InProceedings{lai2024,
title = "Heterogeneous Medical Data Integration with Multi-Source StyleGAN",
author = "Wei-Cheng Lai and Matthias Kirchler and Hadya Yassin and Jana Fehr and Alexander Rakowski and Hampus Olsson and Ludger Starke and Jason M. Millward and Sonia Waiczies and Christoph Lippert",
abstract = "Conditional deep generative models have emerged as powerful tools for generating realistic images enabling fine-grained control over latent factors. In the medical domain, data scarcity and the need to integrate information from diverse sources present challenges for existing generative models, often resulting in low-quality image generation and poor controllability. To address these two issues, we propose Multi-Source StyleGAN (MSSG). MSSG learns jointly from multiple heterogeneous data sources with different available covariates and can generate new images controlling all covariates together, thereby overcoming both data scarcity and heterogeneity.We validate our method on semi-synthetic data of hand-written digit images with varying morphological features and in controlled multi-source simulations on retinal fundus images and brain magnetic resonance images. Finally, we apply MSSG in a real-world setting of brain MRI from different sources. Our proposed algorithm offers a promising direction for unbiased data generation from disparate sources. For the reproducibility of our experimental results, we provide [detailed code implementation](https://github.com/weslai/msstylegans).",
pages = "857--887"
}
@InProceedings{dauchelle2024,
title = "An unexpected confounder: how brain shape can be used to classify MRI scans ?",
author = {Valentine Wargnier Dauchelle and Thomas Grenier and Micha\"el Sdika},
abstract = "Although deep learning has proved its effectiveness in the analysis of medical images, its great ability to extract complex features makes it susceptible to base its decision on spurious confounders present in the images. However, especially for medical applications, network decisions must be based on relevant elements. Numerous confounding factors have been identified in the case of brain scans such as gender, age, MRI sites or scanners, etc. Nevertheless, although skull stripping is a classic preprocessing step for brain scans, brain shape has never been considered as a possible confounder. In this work, we show that brain shape is used in the classification of brain MRI scans from different databases, even when it should not be considered as a clinically relevant factor. To this purpose, we introduce a rigorous two steps method to assess whether a factor is a confounder or not, and we apply it to identify the brain shape as a confounding variable in brain images classification. Lastly, we propose to use a deformable registration in the data preprocessing pipeline to align the brain contours of the images in the datasets, whereas standard pipelines often do nothing more than affine registration. Including this deformable registration step makes the classification free from the brain shape confounding effect.",
pages = "338--351"
}
@InProceedings{pina2024,
title = "Cell-DETR: Efficient cell detection and classification in WSIs with transformers",
author = "Oscar Pina and Eduard Dorca and Veronica Vilaplana",
abstract = "Understanding cell interactions and subpopulation distribution is crucial for pathologists to support their diagnoses. This cell information is traditionally extracted from segmentation methods, which poses significant computational challenges on processing Whole Slide Images (WSIs) due to their giga-size nature. Nonetheless, the clinically relevant tasks are nuclei detection and classification rather than segmentation. In this manuscript, we undertake a comprehensive exploration of the applicability of detection transformers for cell detection and classification (Cell-DETR). Not only do we demonstrate the effectiveness of the method by achieving state-of-the-art performance on well-established benchmarks, but we also develop a pipeline to tackle these tasks on WSIs at scale to enable the development of downstream applications. We show its efficiency and feasibility by reporting a x3.4 faster inference time on a dataset featuring large WSIs. By addressing the challenges associated with large-scale cell detection, our work contributes valuable insights that paves the way for the development of scalable diagnosis pipelines based on cell-level information.",
pages = "1128--1141"
}
@InProceedings{jiang2024,
title = "Self-supervised pretraining in the wild imparts image acquisition robustness to medical image transformers: an application to lung cancer segmentation",
author = "Jue Jiang and Harini Veeraraghavan",
abstract = {Self-supervised learning (SSL) is an approach to pretrain models with unlabeled datasets and extract useful feature representations such that these models can be easily fine-tuned for various downstream tasks. Self-pretraining applies SSL on curated task-specific datasets. Increasing availability of public data repositories has now made it possible to utilize diverse and large task unrelated datasets to pretrain models in the \"wild\" using SSL. However, the benefit of such wild-pretraining over self-pretraining has not been studied in the context of medical image analysis. Hence, we analyzed transformers (Swin and ViT) and a convolutional neural network created using wild- and self-pretraining trained to segment lung tumors from 3D-computed tomography (CT) scans in terms of: (a) accuracy, (b) fine-tuning epoch efficiency, and (c) robustness to image acquisition differences (contrast versus non-contrast, slice thickness, and image reconstruction kernels). We also studied feature reuse using centered kernel alignment (CKA) with the Swin networks. Our analysis with two independent testing (public N = 139; internal N = 196) datasets showed that wild-pretrained Swin models significantly outperformed self-pretrained Swin for the various imaging acquisitions. Fine-tuning epoch efficiency was higher for both wild-pretrained Swin and ViT models compared to their self-pretrained counterparts. Feature reuse close to the final encoder layers was lower than in the early layers for wild-pretrained models irrespective of the pretext tasks used in SSL. Models and code will be made available through GitHub upon manuscript acceptance.},
pages = "708--721"
}
@InProceedings{xue2024,
title = "Erase to Enhance: Data-Efficient Machine Unlearning in MRI Reconstruction",
author = "Yuyang Xue and Jingshuai Liu and Steven McDonagh and Sotirios A. Tsaftaris",
abstract = "Machine unlearning is a promising paradigm for removing unwanted data samples from a trained model, towards ensuring compliance with privacy regulations and limiting harmful biases. Although unlearning has been shown in, e.g., classification and recommendation systems, its potential in medical image-to-image translation, specifically in image reconstruction, has not been thoroughly investigated. This paper shows that machine unlearning is possible in MRI tasks and has the potential to benefit for bias removal. We set up a protocol to study how much shared knowledge exists between datasets of different organs, allowing us to effectively quantify the effect of unlearning. Our study reveals that combining training data can lead to hallucinations and reduced image quality in the reconstructed data. We use unlearning to remove hallucinations as a proxy exemplar of undesired data removal. Indeed, we show that machine unlearning is possible without full retraining. Furthermore, our observations indicate that maintaining high performance is feasible even when using only a subset of retain data. We have made our code publicly accessible.",
pages = "1785--1800"
}
@InProceedings{zheng2024,
title = "Towards a Collective Medical Imaging AI: Enabling Continual Learning from Peers",
author = "Guangyao Zheng and Vladimir Braverman and Jeffrey Leal and Steven Rowe and Doris Leung and Michael A. Jacobs and Vishwa Sanjay Parekh",
abstract = "Federated learning is an exciting area within machine learning that allows cross-silo training of large-scale machine learning models on disparate or similar tasks in a privacy-preserving manner. However, conventional federated learning frameworks require a synchronous training schedule and are incapable of lifelong learning. To that end, we propose an asynchronous decentralized federated lifelong learning (ADFLL) method that allows agents in the system to asynchronously and continually learn from their own previous experiences and others\', thus overcoming the potential drawbacks of conventional federated learning. We evaluate the ADFLL framework in two experimental setups for deep reinforcement learning (DRL) based landmark localization across different imaging modalities, orientations, and sequences. The ADFLL was compared to central aggregation and conventional lifelong learning for upper-bound comparison and with a conventional DRL model for lower-bound comparison. Across all the experiments, ADFLL demonstrated excellent capability to collaboratively learn all tasks across all the agents compared to the baseline models in in-distribution and out-of-distribution test sets. In conclusion, we provide a flexible, efficient, and robust federated lifelong learning framework that can be deployed in real-world applications.",
pages = "1862--1877"
}
@InProceedings{grewal2024,
title = "Multi-Objective Learning for Deformable Image Registration",
author = "Monika Grewal and Henrike Westerveld and Peter Bosman and Tanja Alderliesten",
abstract = "Deformable image registration (DIR) involves optimization of multiple conflicting objectives, however, not many existing DIR algorithms are multi-objective (MO). Further, while there has been progress in the design of deep learning algorithms for DIR, there is no work in the direction of MO DIR using deep learning. In this paper, we fill this gap by combining a recently proposed approach for MO training of neural networks with a well-known deep neural network for DIR and create a deep learning based MO DIR approach. We evaluate the proposed approach for DIR of pelvic magnetic resonance imaging (MRI) scans. We experimentally demonstrate that the proposed MO DIR approach -- providing multiple DIR outputs for each patient that each correspond to a different trade-off between the objectives -- has additional desirable properties from a clinical use point-of-view as compared to providing a single DIR output. The experiments also show that the proposed MO DIR approach provides a better spread of DIR outputs across the entire trade-off front than simply training multiple neural networks with weights for each objective sampled from a grid of possible values.",
pages = "538--553"
}
@InProceedings{magg2024,
title = "Training-free Prompt Placement by Propagation for SAM Predictions in Bone CT Scans",
author = "Caroline Magg and Lukas P.E. Verweij and Maaike A. ter Wee and George S. Buijs and Johannes G.G. Dobbe and Geert J. Streekstra and Leendert Blankevoort and Clara I. S\'anchez",
abstract = "The Segment Anything Model (SAM) is an interactive foundation segmentation model, showing impressive results for 2D natural images using prompts such as points and boxes. Transferring these results to medical image segmentation is challenging due to the 3D nature of medical images and the high demand of manual interaction. As a 2D architecture, SAM is applied slice-per-slice to a 3D medical scan. This hinders the application of SAM for volumetric medical scans since at least one prompt per class for each single slice is needed. In our work, the applicability is improve by reducing the number of necessary user-generated prompts. We introduce and evaluate multiple training-free strategies to automatically place box prompts in bone CT volumes, given only one initial box prompt per class. The average performance of our methods ranges from 54.22\% Dice to 88.26\% Dice. At the same time, the number of annotated pixels is reduced significantly from a few millions to two pixels per class. These promising results underline the potential of foundation models in medical image segmentation, paving the way for annotation-efficient, general approaches.",
pages = "964--985"
}
@InProceedings{rio-torto2024,
title = "Parameter-Efficient Generation of Natural Language Explanations for Chest X-ray Classification",
author = "Isabel Rio-Torto and Jaime S Cardoso and Luis Filipe Teixeira",
abstract = "The increased interest and importance of explaining neural networks\' predictions, especially in the medical community, associated with the known unreliability of saliency maps, the most common explainability method, has sparked research into other types of explanations. Natural Language Explanations (NLEs) emerge as an alternative, with the advantage of being inherently understandable by humans and the standard way that radiologists explain their diagnoses. We extend upon previous work on NLE generation for multi-label chest X-ray diagnosis by replacing the traditional decoder-only NLE generator with an encoder-decoder architecture. This constitutes a first step towards Reinforcement Learning-free adversarial generation of NLEs when no (or few) ground-truth NLEs are available for training, since the generation is done in the continuous encoder latent space, instead of in the discrete decoder output space.However, in the current scenario, large amounts of annotated examples are still required, which are especially costly to obtain in the medical domain, given that they need to be provided by clinicians. Thus, we explore how the recent developments in Parameter-Efficient Fine-Tuning (PEFT) can be leveraged for this use-case. We compare different PEFT methods and find that integrating the visual information into the NLE generator layers instead of only at the input achieves the best results, even outperforming the fully fine-tuned encoder-decoder-based model, while only training 12\% of the model parameters. Additionally, we empirically demonstrate the viability of supervising the NLE generation process on the encoder latent space, thus laying the foundation for RL-free adversarial training in low ground-truth NLE availability regimes. The code is publicly available at https://github.com/to\_be\_added.",
pages = "1267--1281"
}
@InProceedings{koehler2024,
title = "Efficiently correcting patch-based segmentation errors to control image-level performance in retinal images",
author = {Patrick K\"ohler and Jeremiah Fadugba and Philipp Berens and Lisa M. Koch},
abstract = "Segmentation models which are deployed into clinical practice need to meet a quality standard for each image. Even when models perform well on average, they may fail at segmenting individual images with a sufficiently high quality. We propose a combined quality control and error correction framework to reach the desired segmentation quality in each image. Our framework recommends the necessary number of local patches for manual review and estimates the impact of the intervention on the Dice Score of the corrected segmentation. This allows to trade off segmentation quality against time invested into manual review. We select the patches based on uncertainty maps obtained from an ensemble of segmentation models. We evaluated our method on retinal vessel segmentation on fundus images, where the Dice Score increased substantially after reviewing only a few patches. Our method accurately estimated the review’s impact on the Dice Score and we found that our framework controls the quality standard efficiently, i.e. reviewing as little as necessary.",
pages = "841--856"
}
@InProceedings{chou2024,
title = "Finite Volume Informed Graph Neural Network for Myocardial Perfusion Simulation",
author = "Raoul Sall\'e de Chou and Matthew Sinclair and Sabrina Lynch and Nan Xiao and Laurent Najman and Irene Vignon-Clementel and Hugues Talbot",
abstract = {Medical imaging and numerical simulation of partial differential equations (PDEs) representing biophysical processes have been combined in the past few decades to provide noninvasive diagnostic and treatment prediction tools for various diseases. Most approaches involve solving computationally expensive PDEs, which can hinder their effective deployment in clinical settings. To overcome this limitation, deep learning has emerged as a promising method to accelerate numerical solvers. One challenge persists, however, in the generalization abilities of these models, given the wide variety of patient morphologies. This study addresses this challenge by introducing a physics-informed graph neural network designed to solve Darcy equations for the simulation of myocardial perfusion. Leveraging a finite volume discretization of the equations as a \"physics-informed\" loss, our model was successfully trained and tested on a 3D synthetic dataset, namely meshes representing simplified myocardium shapes. Subsequent evaluation on a genuine myocardium mesh, extracted from patient Computed Tomography images, demonstrated promising results and generalized capabilities. Such a fast solver, within a differentiable learning framework, will enable tackling inverse problems based on $\text{H}_2$O-PET perfusion imaging data.},
pages = "276--288"
}
@InProceedings{macpherson2024,
title = "Automated ranking of chest x-ray radiological finding severity in a binary label setting",
author = "Matthew Macpherson and Keerthini Muthuswamy and Ashik Amlani and Vicky Goh and Giovanni Montana",
abstract = "Machine learning has demonstrated the ability to match or exceed human performance in detecting a range of abnormalities in chest x-rays. However, current models largely operate within a binary classification paradigm with findings either present or absent using fixed decision thresholds, whereas many clinical findings can be more usefully described on a scale of severity which a skilled radiologist will incorporate into a more nuanced report. This limitation is due, in part, to the difficulty and expense of manually annotating fine-grained labels for training and test images versus the relative ease of automatically extracting binary labels from the associated free text reports using NLP algorithms. In this paper we examine the ability of models trained with only binary training data to give useful abnormality severity information from their raw outputs. We assess performance on a ranking task using manually ranked test sets for each of five findings: cardiomegaly, consolidation, paratracheal hilar changes, pleural effusion and subcutaneous emphysema. We find the raw model output agrees with human-assessed severity ranking with Spearman\'s rank coefficients between 0.563 - 0.848. Using patient age as an additional radiological finding with full ground truth ranking available, we go on to compare a binary classifier output against a fully supervised RankNet model, quantifying the reduction in training data required in the fully supervised setting for equivalent performance. We show that model performance is improved using a semi-supervised approach supplementing a smaller set of fully supervised images with a larger set of binary labelled images.",
pages = "949--963"
}
@InProceedings{wu2024,
title = "IST-editing: Infinite spatial transcriptomic editing in a generated gigapixel mouse pup",
author = "Jiqing Wu and Ingrid Berg and Viktor Koelzer",
abstract = "Advanced spatial transcriptomics (ST) techniques provide comprehensive insights into complex organisms across multiple scales, while simultaneously posing challenges in biomedical image analysis. The spatial co-profiling of biological tissues by gigapixel whole slide images (WSI) and gene expression arrays motivates the development of innovative and efficient algorithmic approaches. Using Generative Adversarial Nets (GAN), we introduce **I**nfinite **S**patial **T**ranscriptomic **e**diting (IST-editing) and establish gene expression-guided editing in a generated gigapixel mouse pup. Trained with patch-wise high-plex gene expression (input) and matched image data (output), IST-editing enables the seamless synthesis of arbitrarily large bioimages at inference, *e.g.*, with a $106496 \times 53248$ resolution. After feeding edited gene expression values to the trained model, we simulate cell-, tissue- and animal-level morphological transitions in the generated mouse pup. Lastly, we discuss and evaluate editing effects on interpretable morphological features. The code and generated WSIs are publicly accessible via https://github.com/CTPLab/IST-editing.",
pages = "1707--1724"
}
@InProceedings{dahan2024b,
title = "Spatio-Temporal Encoding of Brain Dynamics with Surface Masked Autoencoders",
author = "Simon Dahan and Logan Zane John Williams and Yourong Guo and Daniel Rueckert and Emma Claire Robinson",
abstract = "The development of robust and generalisable models for encoding the spatio-temporal dynamics of human brain activity is crucial for advancing neuroscientific discoveries. However, significant individual variation in the organisation of the human cerebral cortex makes it difficult to identify population-level trends in these signals. Recently, Surface Vision Transformers (SiTs) have emerged as a promising approach for modelling cortical signals, yet they face some limitations in low-data scenarios due to the lack of inductive biases in their architecture. To address these challenges, this paper proposes the surface Masked AutoEncoder (sMAE) and video surface Masked AutoEncoder (vsMAE) - for multivariate and spatio-temporal pre-training of cortical signals over regular icosahedral grids. These models are trained to reconstruct cortical feature maps from masked versions of the input by learning strong latent representations of cortical structure and function. Such representations translate into better modelling of individual phenotypes and enhanced performance in downstream tasks. The proposed approach was evaluated on cortical phenotype regression using data from the young adult Human Connectome Project (HCP) and developing HCP (dHCP). Results show that (v)sMAE pre-trained models improve phenotyping prediction performance on multiple tasks by $\ge 26\%$, and offer faster convergence relative to models trained from scratch. Finally, we show that pre-training vision transformers on large datasets, such as the UK Biobank (UKB), supports transfer learning to low-data regimes. Our code and pre-trained models are publicly available at https://github.com/metrics-lab/surface-masked-autoencoders.",
pages = "306--325"
}
@InProceedings{valanarasu2024,
title = "Disruptive Autoencoders: Leveraging Low-level features for 3D Medical Image Pre-training",
author = "Jeya Maria Jose Valanarasu and Yucheng Tang and Dong Yang and Ziyue Xu and Can Zhao and Wenqi Li and Vishal M. Patel and Bennett Allan Landman and Daguang Xu and Yufan He and Vishwesh Nath",
abstract = "Harnessing the power of pre-training on large-scale datasets like ImageNet forms a funda- mental building block for the progress of representation learning-driven solutions in com- puter vision. Medical images are inherently different from natural images as they are acquired in the form of many modalities (CT, MR, PET, Ultrasound etc.) and contain granulated information like tissue, lesion, organs etc. These characteristics of medical im- ages require special attention towards learning features representative of local context. In this work, we focus on designing an effective pre-training framework for 3D radiology im- ages. First, we propose a new masking strategy called local masking where the masking is performed across channel embeddings instead of tokens to improve the learning of local feature representations. We combine this with classical low-level perturbations like adding noise and downsampling to further enable low-level representation learning. To this end, we introduce Disruptive Autoencoders, a pre-training framework that attempts to re- construct the original image from disruptions created by a combination of local masking and low-level perturbations. We curate a large-scale dataset to enable pre-training of 3D medical radiology images (MRI and CT). The proposed pre-training framework is tested across multiple downstream tasks and achieves state-of-the-art performance. Notably, our proposed method tops the public test leaderboard of BTCV multi-organ segmentation chal- lenge. Our code can be found here: https://github.com/Project-MONAI/research-contributions/tree/main/DAE.",
pages = "1553--1570"
}
@InProceedings{wang2024b,
title = "OFELIA: Optical Flow-based Electrode LocalIzAtion",
author = "Xinyi Wang and Zikang Xu and Qingsong Yao and Yiyong Sun and S Kevin Zhou",
abstract = "Catheter ablation is one of the most common cardiac ablation procedures for atrial fibrillation, which is mainly based on catheters with electrodes collecting electrophysiology signals.Catheter electrode localization facilitates intraoperative catheter positioning, surgical planning, and other applications such as 3D model reconstruction.In this paper, we propose a novel deep network for automatic electrode localization in an X-ray sequence, which integrates spatiotemporal features between adjacent frames, aided by optical flow maps.To improve the utility and robustness of the proposed method, we first design a saturation-based optical flow dataset construction pipeline, then finetune the optical flow estimation to obtain more realistic and contrasting optical flow maps for electrode localization.The extensive results on clinical-challenging test sequences reveal the effectiveness of our method, with a mean radial error (MRE) of 0.95 mm for radiofrequency catheters and an MRE of 0.71 mm for coronary sinus catheters, outperforming several state-of-the-art landmark detection methods.",
pages = "1652--1669"
}
@InProceedings{susmitha2024,
title = "Analysis of Transformers for Medical Image Retrieval",
author = "Arvapalli Sai Susmitha and Vinay P. Namboodiri",
abstract = "This paper investigates the application of transformers to medical image retrieval. Although various methods have been attempted in this domain, transformers have not been extensively explored. Leveraging vision transformers, we consider co-attention between image tokens. Two main aspects are investigated: the analysis of various architectures and parameters for transformers and the evaluation of explanation techniques. Specifically, we employ contrastive learning to retrieve attention-based images that consider the relationships between query and database images. Our experiments on diverse medical datasets, such as ISIC 2017, COVID-19 chest X-ray, and Kvasir, using multiple transformer architectures, demonstrate superior performance compared to convolution-based methods and transformers using cross-entropy losses. Further, we conducted a quantitative evaluation of various state-of-the-art explanation techniques using insertion-deletion metrics, in addition to basic qualitative assessments. Among these methods, Transformer Input Sampling (TIS) stands out, showcasing superior performance and enhancing interpretability, thus distinguishing it from black-box models.",
pages = "1497--1512"
}
@InProceedings{li2024,
title = "From Barlow Twins to Triplet Training: Differentiating Dementia with Limited Data",
author = {Yitong Li and Tom Nuno Wolf and Sebastian P\"olsterl and Igor Yakushev and Dennis M. Hedderich and Christian Wachinger},
abstract = "Differential diagnosis of dementia is challenging due to overlapping symptoms, with structural magnetic resonance imaging (MRI) being the primary method for diagnosis. Despite the clinical value of computer-aided differential diagnosis, research has been limited, mainly due to the absence of public datasets that contain diverse types of dementia. This leaves researchers with small in-house datasets that are insufficient for training deep neural networks (DNNs). Self-supervised learning shows promise for utilizing unlabeled MRI scans in training, but small batch sizes for volumetric brain scans make its application challenging. To address these issues, we propose Triplet Training for differential diagnosis with limited target data. It consists of three key stages: (i) self-supervised pre-training on unlabeled data with Barlow Twins, (ii) self-distillation on task-related data, and (iii) fine-tuning on the target dataset. Our approach significantly outperforms traditional training strategies, achieving a balanced accuracy of 75.6\%. We further provide insights into the training process by visualizing changes in the latent space after each step. Finally, we validate the robustness of Triplet Training in terms of its individual components in a comprehensive ablation study. Our code is available at https://github.com/ai-med/TripletTraining.",
pages = "888--902"
}
@InProceedings{dahan2024c,
title = "Video Polyp Segmentation using Implicit Networks",
author = "Aviad Dahan and Tal Shaharabany and Raja Giryes and Lior Wolf",
abstract = "Polyp segmentation in endoscopic videos is an essential task in medical image and video analysis, requiring pixel-level accuracy to accurately identify and localize polyps within the video sequences. Addressing this task unveils the intricate interplay of dynamic changes in the video and the complexities involved in tracking polyps across frames. Our research presents an innovative approach to effectively meet these challenges that integrates, at test time, a pre-trained image (2D) model with a new form of implicit representation. By leveraging the temporal understanding provided by implicit networks and enhancing it with optical flow-based temporal losses, we significantly enhance the precision and consistency of polyp segmentation across sequential frames. Our proposed framework demonstrates excellent performance across various medical benchmarks and datasets, setting a new standard in video polyp segmentation with high spatial and temporal consistency. Our code is publicly available at https://github.com/AviadDahan/VPS-implicit.",
pages = "326--337"
}
@InProceedings{krusen2024,
title = "Real-time MR-based 3D motion monitoring using raw k-space data",
author = "Marius Krusen and Floris Ernst",
abstract = "Due to its great soft-tissue contrast and non-invasive nature, magnetic resonance imaging (MRI) is uniquely qualified for motion monitoring during radiotherapy.However, real-time capabilities are limited by its long acquisition times, particularly in 3D, and require highly undersampling k-space resulting in lower image resolution and image artifacts.In this paper, we propose a simple recurrent neural network (RNN) architecture to continually estimate target motion from single k-space spokes.By directly using the incoming k-space data, additional image reconstruction steps are avoided and less data is required between estimations achieving a latency of only a few milliseconds.The 4D XCAT phantom was used to generate realistic data of the abdomen affected by respiratory and cardiac motion and a simulated lesion inserted into the liver acted as the target.We show that using a Kooshball trajectory to sample 3D k-space gives superior results compared to a stack-of-stars (SoS) trajectory.The RNN quickly learns the motion pattern and can give new motion estimations at a frequency of more than 230 Hz, demonstrating the feasibility of drastically improving latency of MR-based motion monitoring systems.",
pages = "768--781"
}
@InProceedings{wang2024c,
title = "Skin Malignancy Classification Using Patients’ Skin Images and Meta-data: Multimodal Fusion for Improving Fairness",
author = "Ke Wang and Ningyuan Shan and Henry Gouk and Iris Szu-Szu Ho",
abstract = "Skin cancer image classification across skin tones is a challenging problem due to the fact that skin cancer can present differently on different skin tones. This study evaluates the performance of image only models and fusion models in skin malignancy classification. The fusion models we consider are able to take in additional patient data, such as an indicator of their skin tone, and merge this information with the features provided by the image-only model. Results from the experiment show that fusion models perform substantially better than image-only models. In particular, we find that a form of multiplicative fusion results in the best performing models. This finding suggests that skin tones add predictive value in skin malignancy prediction problems. We further demonstrate that feature fusion methods reduce, but do not entirely eliminate, the disparity in performance of the model on patients with different skin tones.",
pages = "1670--1686"
}
@InProceedings{scholz2024,
title = "Imbalance-aware loss functions improve medical image classification",
author = "Daniel Scholz and Ayhan Can Erdur and Josef A Buchner and Jan C Peeken and Daniel Rueckert and Benedikt Wiestler",
abstract = "Deep learning models offer unprecedented opportunities for diagnosis, prognosis, and treatment planning. However, conventional deep learning pipelines often encounter challenges in learning unbiased classifiers within imbalanced data settings, frequently exhibiting bias towards minority classes. In this study, we aim to improve medical image classification by effectively addressing class imbalance. To this end, we employ differentiable loss functions derived from classification metrics commonly used in imbalanced data settings: Matthews correlation coefficient (MCC) and the F1 score. We explore the efficacy of these loss functions both independently and in combination with cross-entropy loss and various batch sampling strategies on diverse medical datasets of 2D fundoscopy and 3D magnetic resonance images. Our findings demonstrate that, compared to conventional loss functions, we achieve notable improvements in overall classification performance, with increases of up to +12\% in balanced accuracy and up to +51\% in class-wise F1 score for minority classes when utilizing cross-entropy coupled with metrics-derived loss. Additionally, we conduct feature visualization to gain insights into the behavior of these features during training with imbalance-aware loss functions. Our visualization reveals a more pronounced clustering of minority classes in the feature space, consistent with our classification results. Our results underscore the effectiveness of combining cross-entropy loss with class-imbalance-aware loss functions in training more accurate classifiers, particularly for minority classes.",
pages = "1341--1356"
}
@InProceedings{shen2024,
title = "Med-Tuning: A New Parameter-Efficient Tuning Framework for Medical Volumetric Segmentation",
author = "Jiachen Shen and Wenxuan Wang and Chen Chen and Jianbo Jiao and Jing Liu and Yan Zhang and Shanshan Song and Jiangyun Li",
abstract = {The \"pre-training then fine-tuning (FT)\" paradigm is widely adopted to boost the model performance of deep learning-based methods for medical volumetric segmentation. However, conventional full FT incurs high computational and memory costs. Thus, it is of increasing importance to fine-tune pre-trained models for medical volumetric segmentation tasks in a both effective and parameter-efficient manner. In this paper, we introduce a new framework named Med-Tuning to realize parameter-efficient tuning (PET) for medical volumetric segmentation task and an efficient plug-and-play module named Med-Adapter for task-specific feature extraction. With a small number of tuned parameters, our framework enhances the 2D baselines\'s precision on segmentation tasks, which are pre-trained on natural images. Extensive experiments on three benchmark datasets (CT and MRI modalities) show that our method achieves better results than previous PET methods on volumetric segmentation tasks. Compared to full FT, Med-Tuning reduces the fine-tuned model parameters by up to 4x, with even better segmentation performance. Our project webpage is at \url{https://rubics-xuan.github.io/Med-Tuning/}.},
pages = "1412--1433"
}
@InProceedings{rashmi2024,
title = "Ano-swinMAE: Unsupervised Anomaly Detection in Brain MRI using swin Transformer based Masked Auto Encoder",
author = "Kumari Rashmi and Ayantika Das and NagaGayathri Matcha and Keerthi Ram and Mohanasankar Sivaprakasam",
abstract = "The advanced deep learning-based Autoencoding techniques have enabled the introduction of efficient Unsupervised Anomaly Detection (UAD) approaches. Several autoencoder-based approaches have been used to solve UAD tasks. However, most of these approaches do not have any constraints to ensure the removal of pathological features while restoring the healthy regions in the pseudo-healthy image reconstruction. To remove the occurrence of pathological features, we propose to utilize an Autoencoder which deploys a masking strategy to reconstruct images. Additionally, the masked regions need to be meaningfully inpainted to enforce global and local consistency in the generated images which makes transformer-based masked autoencoder a potential approach. Although the transformer models can incorporate global contextual information, they are often computationally expensive and dependent on a large amount of data for training. Hence we propose to employ a Swin transformer-based Masked Autoencoder (MAE) for anomaly detection (Ano-swinMAE) in brain MRI. Our proposed method Ano-swinMAE is trained on a healthy cohort by masking a certain percentage of information from the input images. While inferring, a pathological image is given to the model and different segments of the brain MRI slice are sequentially masked and their corresponding generation is accumulated to create a map indicating potential locations of pathologies. We have quantitatively and qualitatively validated the performance increment of our method on the following publicly available datasets: BraTS (Glioma), MSLUB (Multiple Sclerosis) and White Matter Hyperintensities (WMH). We have also empirically evaluated the generalisation capability of the method in a cross modality data setup.",
pages = "1221--1236"
}
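% Editor's note (illustrative only, not the paper's implementation): a rough sketch of
% the inference idea described above, where segments of a slice are masked in turn and
% the reconstruction error in the masked regions is accumulated into an anomaly map.
% The mae(image, mask) interface and the strip-shaped masks are assumptions.
import torch

def anomaly_map(mae, image, num_segments=4):
    # image: (1, C, H, W); mae is a masked autoencoder trained on healthy scans
    _, _, H, W = image.shape
    amap = torch.zeros_like(image)
    step = W // num_segments
    for i in range(num_segments):
        mask = torch.zeros_like(image, dtype=torch.bool)
        mask[..., :, i * step:(i + 1) * step] = True      # mask one segment
        recon = mae(image, mask)                           # inpaint the masked region
        amap[mask] = (image[mask] - recon[mask]).abs()     # keep error where masked
    return amap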
@InProceedings{baumann2024,
title = "HoVer-NeXt: A Fast Nuclei Segmentation and Classification Pipeline for Next Generation Histopathology",
author = "Elias Baumann and Bastian Dislich and Josef Lorenz Rumberger and Iris D. Nagtegaal and Maria Rodriguez Martinez and Inti Zlobec",
abstract = "In cancer, a variety of cell types, along with their local density and spatial organization within tissues, play a key role in driving cancer progression and modulating patient outcomes. At the basis of cancer diagnosis is the histopathological assessment of tissues, stained by hematoxylin \& eosin (H\&E), which gives the nuclei of cells a dark purple appearance, making them particularly distinguishable and quantifiable. The identification of individual nuclei, whether in a proliferating (mitosis) or resting state, and their further phenotyping (e.g. immune cells) is the foundation on which histopathology images can be used for further investigations into cellular interaction, prognosis or response prediction. To this end, we develop a H\&E based nuclei segmentation and classification model that is both fast (1.8s/mm2 at 0.5mpp, 3.2s/mm2 at 0.25mpp) and accurate (0.84 binary F1, 0.758 mean balanced Accuracy) which allows us to investigate the cellular composition of large-scale colorectal cancer (CRC) cohorts. We extend the publicly available Lizard CRC nuclei dataset with a mitosis class and publish further validation data for the rarest classes: mitosis and eosinophils. Moreover, our pipeline is 5× faster than the CellViT pipeline, 17× faster than the HoVer-Net pipeline, and performs competitively on the PanNuke pan-cancer nuclei dataset (47.7 mPQTiss, +3\% over HoVer-Net). Our work paves the way towards extensive single-cell information directly from H\&E slides, leading to a quantitative view of whole slide images. Code, model weights as well as all additional training and validation data, are publicly available on github.",
pages = "61--86"
}
@InProceedings{boland2024,
title = "There Are No Shortcuts to Anywhere Worth Going: Identifying Shortcuts in Deep Learning Models for Medical Image Analysis",
author = "Christopher Boland and Keith A Goatman and Sotirios A. Tsaftaris and Sonia Dahdouh",
abstract = "Many studies have reported human-level accuracy (or better) for AI-powered algorithms performing a specific clinical task, such as detecting pathology. However, these results often fail to generalize to other scanners or populations. Several mechanisms have been identified that confound generalization. One such is shortcut learning, where a network erroneously learns to depend on a fragile spurious feature, such as a text label added to the image, rather than scrutinizing the genuinely useful regions of the image. In this way, systems can exhibit misleadingly high test-set results while the labels are present but fail badly elsewhere where the relationship between the label and the spurious feature breaks down. In this paper, we investigate whether it is possible to detect shortcut learning and locate where the shortcut is happening in a neural network. We propose a novel methodology utilizing the sample difficulty metric Prediction Depth (PD) and KL divergence to identify specific layers of a neural network model where the learned features of a shortcut manifest. We demonstrate that our approach can effectively isolate these layers across several shortcuts, model architectures, and datasets. Using this, we show a correlation between the visual complexity of a shortcut, the depth of its feature manifestation within the model, and the extent to which a model relies on it. Finally, we highlight the nuanced relationship between learning rate and shortcut learning.",
pages = "131--150"
}
@InProceedings{foellmer2024,
title = "Active Learning with the nnUNet and Sample Selection with Uncertainty-Aware Submodular Mutual Information Measure",
author = {Bernhard F\"ollmer and Kenrick Schulze and Christian Wald and Sebastian Stober and Wojciech Samek and Marc Dewey},
abstract = "Annotating medical images for segmentation tasks is a time-consuming process that requiresexpert knowledge. Active learning can reduce this annotation cost and achieve optimalmodel performance by selecting only the most informative samples for annotation. However, the eectiveness of active learning sample selection strategies depends on the modelarchitecture and training procedure used. The nnUNet has achieved impressive results invarious automated medical image segmentation tasks due to its self-configuring pipelinefor automated model design and training. This raises the question of whether the nnUNetis applicable in an active learning setting to avoid cumbersome manual configuration ofthe training process and improve accessibility for non-experts in deep learning-based segmentation. This paper compares various sample selection strategies in an active learningsetting in which the self-configuring nnUNet is used as the segmentation model. Additionally, we propose a new sample selection strategy for UNet-like architectures: USIM - Uncertainty-Aware Submodular Mutual Information Measure. The method combinesuncertainty and submodular mutual information to select batches of uncertain, diverse,and representative samples. We evaluate the performance gain and labeling costs on threemedical image segmentation tasks with different segmentation challenges. Our findingsdemonstrate that utilizing nnUNet as the segmentation model in an active learning setting is feasible, and most sampling strategies outperform random sampling. Furthermore,we demonstrate that our proposed method yields a significant improvement compared toexisting baseline methods.",
pages = "480--503"
}
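% Editor's note (illustrative only): a minimal sketch of the uncertainty half of an
% active-learning selection step, scoring unlabeled cases by mean voxel-wise entropy of
% the segmentation softmax and picking the top-k. The submodular mutual information part
% of USIM (for diversity and representativeness) is not shown; names are hypothetical.
import torch

def entropy_scores(prob_maps):
    # prob_maps: (N, C, ...) softmax outputs for N unlabeled samples
    ent = -(prob_maps * prob_maps.clamp_min(1e-8).log()).sum(dim=1)  # voxel-wise entropy
    return ent.flatten(1).mean(dim=1)        # one uncertainty score per sample

def select_most_uncertain(prob_maps, k):
    return entropy_scores(prob_maps).topk(k).indices   # indices to send for annotation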
@InProceedings{kulkarni2024b,
title = "Registration Quality Evaluation Metric with Self-Supervised Siamese Networks",
author = "Tanvi Kulkarni and Sriprabha Ramanarayanan and Keerthi Ram and Mohanasankar Sivaprakasam",
abstract = "Registration is one of the most preliminary steps in many medical imaging downstream tasks. The registration quality determines the quality of the downstream task. Traditionally, registration quality evaluation is performed with pixel-wise metrics like Mean Squared Error (MSE) and Structural Similarity Index (SSIM). These pixel-wise measures are sometimes susceptible to local minima, providing sub-optimal and inconsistent quality evaluation. Moreover, it might be essential to incorporate semantic features crucial for human visual perception of the registration quality. Towards this end, we propose a data-driven approach to learn the semantic similarity between the registered and target images to ensure a perceptual and consistent evaluation of the registration quality. In this work, we train a Siamese network to classify registered and synthetically misaligned pairs of images. We leverage the latent Siamese encodings to formulate a semantic registration evaluation metric, SiamRegQC. We analyze SiamRegQC from different perspectives: robustness to local minima or smoothness of evaluation metric, sensitivity to smaller misalignment errors, consistency with visual inspection, and statistically significant evaluation of registration algorithms with a p-value $<$ 0.05. We demonstrate the effectiveness of SiamRegQC on two downstream tasks; (i) Rigid registration of 2D histological serial sections, where evaluating sub-pixel misalignment errors is critical for accurate 3D volume reconstruction. SiamRegQC provides a more realistic quality evaluation sensitive to smaller errors and consistent with visual inspection illustrated with more perceptual semantic feature maps rather than pixel-wise MSE maps. (ii) Unsupervised multimodal non-rigid registration, where the registration framework trained with SiamRegQC as a loss function exhibits a maximum average SSIM value of 0.825 over previously proposed deep similarity metrics.",
pages = "822--840"
}
@InProceedings{jarimijafarbigloo2024,
title = "Reducing Uncertainty in 3D Medical Image Segmentation under Limited Annotations through Contrastive Learning",
author = "Sanaz Jarimijafarbigloo and Reza Azad and Amirhossein Kazerouni and Dorit Merhof",
abstract = "Despite recent successes in semi-supervised learning for natural image segmentation, applying these methods to medical images presents challenges in obtaining discriminative representations from limited annotations. While contrastive learning frameworks excel in similarity measures for classification, their transferability to precise pixel-level segmentation in medical images is hindered, particularly when confronted with inherent prediction uncertainty.To overcome this issue, our approach incorporates two subnetworks to rectify erroneous predictions. The first network identifies uncertain predictions, generating an uncertainty attention map. The second network employs an uncertainty-aware descriptor to refine the representation of uncertain regions, enhancing the accuracy of predictions. Additionally, to adaptively recalibrate the representation of uncertain candidates, we define class prototypes based on reliable predictions. We then aim to minimize the discrepancy between class prototypes and uncertain predictions through a deep contrastive learning strategy.Our experimental results on organ segmentation from clinical MRI and CT scans demonstrate the effectiveness of our approach compared to state-of-the-art methods.",
pages = "694--707"
}
@InProceedings{xin2024,
title = "Deformation-aware GAN for Medical Image Synthesis with Substantially Misaligned Pairs",
author = "Bowen Xin and Tony Young and Claire Wainwright and Tamara Blake and Leo Lebrat and Thomas Gaass and Thomas Benkert and Alto Stemmer and David Coman and Jason Dowling",
abstract = "Medical image synthesis generates additional imaging modalities that are costly, invasive or harmful to acquire, which helps to facilitate the clinical workflow. When training pairs are substantially misaligned (e.g., lung MRI-CT pairs with respiratory motion), accurate image synthesis remains a critical challenge. Recent works explored the directional registration module to adjust misalignment in generative adversarial networks (GANs); however, substantial misalignment will lead to 1) suboptimal data mapping caused by correspondence ambiguity, and 2) degraded image fidelity caused by morphology influence on discriminators. To address the challenges, we propose a novel Deformation-aware GAN (DA-GAN) to dynamically correct the misalignment during the image synthesis based on multi-objective inverse consistency. Specifically, in the generative process, three levels of inverse consistency cohesively optimise symmetric registration and image generation for improved correspondence. In the adversarial process, to further improve image fidelity under misalignment, we design deformation-aware discriminators to disentangle the mismatched spatial morphology from the judgement of image fidelity. Experimental results show that DA-GAN achieved superior performance on a public dataset with simulated misalignments and a real-world lung MRI-CT dataset with respiratory motion misalignment. The results indicate the potential for a wide range of medical image synthesis tasks such as radiotherapy planning.",
pages = "1754--1770"
}
@InProceedings{monedero2024,
title = "RADR: A Robust Domain-Adversarial-based Framework for Automated Diabetic Retinopathy Severity Classification",
author = "Sara M{\'\i}nguez Monedero and Fabian Westhaeusser and Ehsan Yaghoubi and Simone Frintrop and Marina Zimmermann",
abstract = "Diabetic retinopathy (DR), a potentially vision-threatening condition, necessitates accurate diagnosis and staging, which deep-learning models can facilitate. However, these models often struggle with robustness in clinical practice due to distribution shifts caused by variations in data acquisition protocols and hardware. We propose RADR, a novel deep-learning framework for DR severity classification, aimed at generalization across diverse datasets and clinic cameras. Our work builds upon existing research: we combine several ideas to perform extensive dataset curation, preprocessing, and enrichment with camera information. We then use a domain adversarial training regime, which encourages our model to extract features that are both task-relevant and invariant to domain shifts. We explore our framework in its various levels of complexity in combination with multiple data augmentations policies in an ablative fashion. Experimental results demonstrate the effectiveness of our proposed method, achieving competitive performance to multiple state-of-the-art models on three unseen external datasets.",
pages = "1026--1039"
}
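% Editor's note (illustrative only, not the paper's exact regime): domain-adversarial
% training is commonly implemented with a gradient-reversal layer feeding a domain
% (camera) classifier, so the encoder is pushed towards domain-invariant features.
% Names, layer sizes, and the reversal trick below are assumptions; PyTorch is assumed.
import torch.nn as nn

def grad_reverse(x, lam=1.0):
    # identity in the forward pass; gradient scaled by -lam in the backward pass
    return x.detach() - lam * (x - x.detach())

class DomainHead(nn.Module):
    def __init__(self, feature_dim, num_domains):
        super().__init__()
        self.classifier = nn.Sequential(
            nn.Linear(feature_dim, 128), nn.ReLU(), nn.Linear(128, num_domains))

    def forward(self, features, lam=1.0):
        return self.classifier(grad_reverse(features, lam))

# Total loss (sketch): task cross-entropy on DR grades plus domain cross-entropy
# computed through the reversed gradient, weighted by lam.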
@InProceedings{kuipers2024,
title = "Generating Cerebral Vessel Trees of Acute Ischemic Stroke Patients using Conditional Set-Diffusion",
author = "Thijs P. Kuipers and Praneeta R. Konduri and Henk Marquering and Erik J Bekkers",
abstract = "The advancements in computational modeling and simulations have facilitated the emergence of in-silico clinical trials (ISCTs). ISCTs are valuable in developing and evaluating novel treatments targeting acute ischemic stroke (AIS), a prominent contributor to both mortality and disability rates. However, obtaining large populations of accurate anatomical structures that are required as input to ISCTs is labor-intensive and time-consuming. In this work, we propose and evaluate diffusion-based generative modeling and set transformers to generate a population of synthetic intracranial vessel tree centerlines with associated radii and vessel types. We condition our model on the presence of an occlusion in the middle cerebral artery, a frequently occurring occlusion location in AIS patients. Our analysis of generated synthetic populations shows that our model accurately produces diverse and realistic cerebral vessel trees that represent the geometric characteristics of the real population.",
pages = "782--792"
}
@InProceedings{swain2024,
title = "Nuclei Segmentation in Histopathological Images with Enhanced U-Net3+",
author = "Bishal Ranjan Swain and Kyung Joo Cheoi and Jaepil Ko",
abstract = "In the rapidly evolving field of nuclei segmentation, there is an increasing trend towards developing a universal segmentation model capable of delivering top-tier results across diverse datasets. While achieving this is the ultimate goal, we argue that such a model should also outperform dataset-specific specialized models. To this end, we propose a task-specific feature sensitive U-Net model, that sets a baseline standard in segmentation of nuclei in histopathological images. We meticulously select and optimize the underlying U-Net3+ model, using adaptive feature selection to capture both short- and long-range dependencies. Max blur pooling is included to achieve scale and position invariance, while DropBlock is utilized to mitigate overfitting by selectively obscuring feature map regions. Additionally, a Guided Filter Block is employed to delineate fine-grained details in nuclei structures. Furthermore, we apply various data augmentation techniques, along with stain normalization, to reduce inconsistencies and thus resulting in significantly outperforming the state-of-the-art performance and paving the way for precise nuclear segmentation essential for cancer diagnosis and possible treatment strategies.",
pages = "1513--1530"
}
@InProceedings{zhou2024b,
title = "DDA: Dimensionality Driven Augmentation Search for Contrastive Learning in Laparoscopic Surgery",
author = "Yuning Zhou and Henry Badgery and Matthew Read and James Bailey and Catherine Davey",
abstract = "Self-supervised learning (SSL) has the potential for effective representation learning in medical imaging, but the choice of data augmentation is critical and domain-specific. It remains uncertain if general augmentation policies suit surgical applications. In this work, we automate the search for suitable augmentation policies through a new method called Dimensionality Driven Augmentation Search (DDA). DDA leverages the local dimensionality of deep representations as a proxy target, and differentiably searches for suitable data augmentation policies in contrastive learning. We demonstrate the effectiveness and efficiency of DDA in navigating a large search space and successfully identifying an appropriate data augmentation policy for laparoscopic surgery. We systematically evaluate DDA across three laparoscopic image classification and segmentation tasks, where it significantly improves over existing baselines. Furthermore, DDA\'s optimised set of augmentations provides insight into domain-specific dependencies when applying contrastive learning in medical applications. For example, while hue is an effective augmentation for natural images, it is not advantageous for laparoscopic images.",
pages = "1898--1926"
}
@InProceedings{sanderson2024,
title = "Diffusion X-ray image denoising",
author = "Daniel Sanderson and Pablo M. Olmos and Carlos Fern\'andez Del Cerro and Manuel Desco and M\'onica Abella",
abstract = "X-ray imaging is a cornerstone in medical diagnosis, constituting a significant portion of the radiation dose encountered by patients. Excessive radiation poses health risks, particularly for pediatric patients, but despite the imperative to reduce radiation doses, conventional image processing methods for X-ray denoising often struggle with heuristic parameter calibration and prolonged execution times. Deep Learning solutions have emerged as promising alternatives, but their effectiveness varies, and challenges persist in preserving image quality.This paper presents an exploration of diffusion models for planar X-ray image denoising, a novel approach that to our knowledge has not been yet investigated in this domain. We perform real time denoising of Poisson noise while preserving image resolution and structural similarity. The results indicate that diffusion models show promise for planar X-ray image denoising, offering a potential improvement in the optimization of diagnostic utility amid dose reduction efforts.",
pages = "1328--1340"
}
@InProceedings{qiehe2024,
title = "NcIEMIL: Rethinking Decoupled Multiple Instance Learning Framework for Histopathological Slide Classification",
author = "Sun Qiehe and Doukou Jiang and Jiawen Li and Renao Yan and Yonghong He and Tian Guan and Zhiqiang Cheng",
abstract = "On account of superiority in annotation efficiency, multiple instance learning (MIL) has proved to be a promising framework for the whole slide image (WSI) classification in pathological diagnosis. However, current methods employ fully- or semi-decoupled frameworks to address the trade-off between billions of pixels and limited computational resources. This exacerbates the information bottleneck, leading to instance representations in a high-rank space that contains semantic redundancy compared to the potential low-rank category space of instances. Additionally, most negative instances are also independent of the positive properties of the bag. To address this, we introduce a weakly annotation-supervised filtering network, aiming to restore the low-rank nature of the slide-level representations. We then design a parallel aggregation structure that utilizes spatial attention mechanisms to model inter-correlation between instances and simultaneously assigns corresponding weights to channel dimensions to alleviate the redundant information introduced by feature extraction. Extensive experiments on the private gastrointestinal chemotaxis dataset and CAMELYON16 breast dataset show that our proposed framework is capable of handling both binary and multivariate classification problems and outperforms state-of-the-art MIL-based methods. The code is available at: https://github.com/polyethylene16/NcIEMIL.",
pages = "1166--1178"
}
@InProceedings{trombetta2024,
title = "Weakly supervised deep learning model with size constraint for prostate cancer detection in multiparametric MRI and generalization to unseen domains",
author = "Robin Trombetta and Olivier Rouvi\`ere and Carole Lartizien",
abstract = "Fully supervised deep models have shown promising performance for many medical segmentation tasks. Still, the deployment of these tools in clinics is limited by the very time-consuming collection of manually expert-annotated data. Moreover, most of the state-of-the-art models have been trained and validated on moderately homogeneous datasets. It is known that deep learning methods are often greatly degraded by domain or label shifts and are yet to be built in such a way as to be robust to unseen data or label distributions. In the clinical setting, this problematic is particularly relevant as the deployment institutions may have different scanners or acquisition protocols than those from which the data has been collected to train the model. In this work, we propose to address these two challenges on the detection of clinically significant prostate cancer (csPCa) from bi-parametric MRI. We evaluate the method proposed by (Kervadec et al., 2018) , which introduces a size constaint loss to produce fine semantic cancer lesions segmentations from weak circle scribbles annotations. Performance of the model is based on two public (PI-CAI and Prostate158) and one private databases. First, we show that the model achieves on-par performance with strong fully supervised baseline models, both on in-distribution validation data and unseen test images. Second, we observe a performance decrease for both fully supervised and weakly supervised models when tested on unseen data domains. This confirms the crucial need for efficient domain adaptation methods if deep learning models are aimed to be deployed in a clinical environment. Finally, we show that ensemble predictions from multiple trainings increase generalization performance.",
pages = "1531--1552"
}
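% Editor's note (illustrative only): a sketch of a quadratic size-constraint penalty in
% the spirit of Kervadec et al. (2018), cited above: the soft size of the predicted lesion
% is pushed into an admissible range [lower, upper] derived from the weak annotations,
% used alongside a partial cross-entropy on the scribbles. Names and shapes are assumptions.
import torch

def size_constraint_loss(fg_probs, lower, upper):
    # fg_probs: (N, H, W) predicted foreground probabilities for the lesion class
    size = fg_probs.flatten(1).sum(dim=1)                     # soft size per image
    too_small = (size < lower).float() * (size - lower) ** 2  # penalty below the range
    too_large = (size > upper).float() * (size - upper) ** 2  # penalty above the range
    return (too_small + too_large).mean()                     # zero inside [lower, upper]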
@InProceedings{fick2024,
title = "Improving CNN-Based Mitosis Detection through Rescanning Annotated Glass Slides and Atypical Mitosis Subtyping",
author = "Rutger RH Fick and Christof Bertram and Marc Aubreville",
abstract = "The identification of mitotic figures (MFs) is a routine task in the histopathological assessment of tumor malignancy with known limitations for human observers. For a machine learning pipeline to robustly detect MFs, it must overcome a variety of conditions such as different scanners, staining protocols, tissue configurations, and organ types. In order to develop a deep learning-based algorithm that can cope with these challenges, there are two obstacles that need to be overcome: obtaining a large-scale dataset of MF annotations spread across different domains of interest, including whole slide images (WSIs) exhaustively annotated for MFs, and using the annotated MFs in an efficient training process to extract the most relevant features for classification.Our work attempts to address both of these challenges and establishes an MF detection pipeline trained solely on animal data, yet competitive on the mixed human/animal MIDOG22 dataset, and, in particular, on human breast cancer.First, we propose a processing pipeline that allows us to strengthen the true scanner robustness of our dataset by physically rescanning the glass slides of annotated WSIs and registering MF positions. To enable the use of such rescans for training, we propose a novel learning paradigm tailored for labels that match partially, which allows to account for ambiguous MF positions in the rescans caused by spurious, suboptimal fine-focus on potential MFs by the scanner. Second, we demonstrate how a multi-task learning approach for MF subtypes, including the prediction of atypical mitotic figures (AMFs), can significantly enhance a model\'s ability to distinguish MFs from imposters. Our algorithm, using a standard object detection pipeline, performs very competitively with an average test set F1 value across five runs of 0.80 on the MIDOG22 training set. We also demonstrate its ability to stratify overall survival on the TCGA-BRCA dataset based on mitotic density, though it falls short of reaching significance in stratifying survival based on AMFs.",
pages = "452--464"
}