From 3b1a3ec5d1aac8e5e15e694be709530fd343d8a3 Mon Sep 17 00:00:00 2001 From: Josh <5685731+marcantony@users.noreply.github.com> Date: Sat, 31 Aug 2024 12:40:11 -0400 Subject: [PATCH] Fix performance of building row-level results (#577) * Generate row-level results with withColumns Iteratively using withColumn (singular) causes performance issues when iterating over a large sequence of columns. * Add back UNIQUENESS_ID --- src/main/scala/com/amazon/deequ/VerificationResult.scala | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/VerificationResult.scala b/src/main/scala/com/amazon/deequ/VerificationResult.scala index 6390db821..418a622e6 100644 --- a/src/main/scala/com/amazon/deequ/VerificationResult.scala +++ b/src/main/scala/com/amazon/deequ/VerificationResult.scala @@ -98,9 +98,7 @@ object VerificationResult { val columnNamesToMetrics: Map[String, Column] = verificationResultToColumn(verificationResult) val dataWithID = data.withColumn(UNIQUENESS_ID, monotonically_increasing_id()) - columnNamesToMetrics.foldLeft(dataWithID)( - (dataWithID, newColumn: (String, Column)) => - dataWithID.withColumn(newColumn._1, newColumn._2)).drop(UNIQUENESS_ID) + dataWithID.withColumns(columnNamesToMetrics).drop(UNIQUENESS_ID) } def checkResultsAsJson(verificationResult: VerificationResult,