From 3b41e4c46a23bc44f15f76b2c087e65ae6bf471e Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Fri, 19 Apr 2024 11:06:06 +1000 Subject: [PATCH 1/3] Configurable RetainCompletenessRule --- .../rules/RetainCompletenessRule.scala | 5 ++-- .../rules/ConstraintRulesTest.scala | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index 67ae61f92..f632b7d47 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -27,10 +27,9 @@ import scala.math.BigDecimal.RoundingMode * If a column is incomplete in the sample, we model its completeness as a binomial variable, * estimate a confidence interval and use this to define a lower bound for the completeness */ -case class RetainCompletenessRule() extends ConstraintRule[ColumnProfile] { - +case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0, sensitivity: Double = 1.96) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { - profile.completeness > 0.2 && profile.completeness < 1.0 + profile.completeness > minCompleteness && profile.completeness < maxCompleteness } override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = { diff --git a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala index 075247932..701a5d983 100644 --- a/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala +++ b/src/test/scala/com/amazon/deequ/suggestions/rules/ConstraintRulesTest.scala @@ -130,9 +130,14 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext "be applied correctly" in { val complete = StandardColumnProfile("col1", 1.0, 100, String, false, Map.empty, None) + val tenPercent = StandardColumnProfile("col1", 0.1, 100, String, false, Map.empty, None) val incomplete = StandardColumnProfile("col1", .25, 100, String, false, Map.empty, None) assert(!RetainCompletenessRule().shouldBeApplied(complete, 1000)) + assert(!RetainCompletenessRule(0.05, 0.9).shouldBeApplied(complete, 1000)) + assert(RetainCompletenessRule(0.05, 0.9).shouldBeApplied(tenPercent, 1000)) + assert(RetainCompletenessRule(0.0).shouldBeApplied(tenPercent, 1000)) + assert(RetainCompletenessRule(0.0).shouldBeApplied(incomplete, 1000)) assert(RetainCompletenessRule().shouldBeApplied(incomplete, 1000)) } @@ -183,6 +188,26 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext assert(metricResult.value.isSuccess) } + + "return evaluable constraint candidates with custom min/max completeness" in + withSparkSession { session => + + val dfWithColumnCandidate = getDfFull(session) + + val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5) + + val check = Check(CheckLevel.Warning, "some") + .addConstraint(RetainCompletenessRule(0.4, 0.6).candidate(fakeColumnProfile, 100).constraint) + + val verificationResult = VerificationSuite() + .onData(dfWithColumnCandidate) + .addCheck(check) + .run() + + val metricResult = verificationResult.metrics.head._2 + + assert(metricResult.value.isSuccess) + } } "UniqueIfApproximatelyUniqueRule" should { From ac337eae95ea830938d1dd8a8870dddf16205019 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Fri, 19 Apr 2024 11:12:50 +1000 Subject: [PATCH 2/3] Add doc string --- .../deequ/suggestions/rules/RetainCompletenessRule.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index f632b7d47..df2f49308 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -26,8 +26,11 @@ import scala.math.BigDecimal.RoundingMode /** * If a column is incomplete in the sample, we model its completeness as a binomial variable, * estimate a confidence interval and use this to define a lower bound for the completeness + * + * @param minCompleteness : minimum completeness threshold to determine if rule should be applied + * @param maxCompleteness : maximum completeness threshold to determine if rule should be applied */ -case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0, sensitivity: Double = 1.96) extends ConstraintRule[ColumnProfile] { +case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { profile.completeness > minCompleteness && profile.completeness < maxCompleteness } From db9b7646b58260495e4c6e40ad605b8ba9c78f93 Mon Sep 17 00:00:00 2001 From: Tuan Pham Date: Wed, 1 May 2024 11:19:08 +1000 Subject: [PATCH 3/3] Add default completeness const --- .../suggestions/rules/RetainCompletenessRule.scala | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala index df2f49308..9f995a112 100644 --- a/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala +++ b/src/main/scala/com/amazon/deequ/suggestions/rules/RetainCompletenessRule.scala @@ -20,6 +20,7 @@ import com.amazon.deequ.constraints.Constraint.completenessConstraint import com.amazon.deequ.profiles.ColumnProfile import com.amazon.deequ.suggestions.CommonConstraintSuggestion import com.amazon.deequ.suggestions.ConstraintSuggestion +import com.amazon.deequ.suggestions.rules.RetainCompletenessRule._ import scala.math.BigDecimal.RoundingMode @@ -30,7 +31,10 @@ import scala.math.BigDecimal.RoundingMode * @param minCompleteness : minimum completeness threshold to determine if rule should be applied * @param maxCompleteness : maximum completeness threshold to determine if rule should be applied */ -case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness: Double = 1.0) extends ConstraintRule[ColumnProfile] { +case class RetainCompletenessRule( + minCompleteness: Double = defaultMinCompleteness, + maxCompleteness: Double = defaultMaxCompleteness +) extends ConstraintRule[ColumnProfile] { override def shouldBeApplied(profile: ColumnProfile, numRecords: Long): Boolean = { profile.completeness > minCompleteness && profile.completeness < maxCompleteness } @@ -67,3 +71,8 @@ case class RetainCompletenessRule(minCompleteness: Double = 0.2, maxCompleteness "we model its completeness as a binomial variable, estimate a confidence interval " + "and use this to define a lower bound for the completeness" } + +object RetainCompletenessRule { + private val defaultMinCompleteness: Double = 0.2 + private val defaultMaxCompleteness: Double = 1.0 +}