Skip to content

Commit

Permalink
Add License information, Fix formatting
Browse files Browse the repository at this point in the history
  • Loading branch information
zeotuan committed May 16, 2024
1 parent 8cbffcd commit 3a9916f
Show file tree
Hide file tree
Showing 5 changed files with 69 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@ case class RetainCompletenessRule(
}

override def candidate(profile: ColumnProfile, numRecords: Long): ConstraintSuggestion = {
val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval(profile.completeness, numRecords).lowerBound
val targetCompleteness = intervalStrategy.calculateTargetConfidenceInterval(
profile.completeness,
numRecords
).lowerBound

val constraint = completenessConstraint(profile.column, _ >= targetCompleteness)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/**
* Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules.interval

import breeze.stats.distributions.{Gaussian, Rand}
Expand All @@ -15,7 +31,11 @@ trait ConfidenceIntervalStrategy {
* @param confidence confidence level of method used to estimate the interval.
* @return the confidence interval estimated for `pHat` at the given confidence level.
*/
def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval
def calculateTargetConfidenceInterval(
pHat: Double,
numRecords: Long,
confidence: Double = defaultConfidence
): ConfidenceInterval

def validateInput(pHat: Double, confidence: Double): Unit = {
require(0.0 <= pHat && pHat <= 1.0, "pHat must be between 0.0 and 1.0")
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/**
* Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules.interval

import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence}
Expand All @@ -12,7 +28,11 @@ import scala.math.BigDecimal.RoundingMode
* Normal approximation interval (Wikipedia)</a>
*/
case class WaldIntervalStrategy() extends ConfidenceIntervalStrategy {
def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = {
def calculateTargetConfidenceInterval(
pHat: Double,
numRecords: Long,
confidence: Double = defaultConfidence
): ConfidenceInterval = {
validateInput(pHat, confidence)
val successRatio = BigDecimal(pHat)
val marginOfError = BigDecimal(calculateZScore(confidence) * math.sqrt(pHat * (1 - pHat) / numRecords))
Expand Down
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
/**
* Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"). You may not
* use this file except in compliance with the License. A copy of the License
* is located at
*
* http://aws.amazon.com/apache2.0/
*
* or in the "license" file accompanying this file. This file is distributed on
* an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
* express or implied. See the License for the specific language governing
* permissions and limitations under the License.
*
*/

package com.amazon.deequ.suggestions.rules.interval

import com.amazon.deequ.suggestions.rules.interval.ConfidenceIntervalStrategy.{ConfidenceInterval, defaultConfidence}
Expand All @@ -13,7 +29,10 @@ import scala.math.BigDecimal.RoundingMode
*/
case class WilsonScoreIntervalStrategy() extends ConfidenceIntervalStrategy {

def calculateTargetConfidenceInterval(pHat: Double, numRecords: Long, confidence: Double = defaultConfidence): ConfidenceInterval = {
def calculateTargetConfidenceInterval(
pHat: Double, numRecords: Long,
confidence: Double = defaultConfidence
): ConfidenceInterval = {
validateInput(pHat, confidence)
val zScore = calculateZScore(confidence)
val zSquareOverN = math.pow(zScore, 2) / numRecords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,9 @@ class ConstraintRulesTest extends WordSpec with FixtureSupport with SparkContext
val fakeColumnProfile = getFakeColumnProfileWithNameAndCompleteness("att1", 0.5)

val check = Check(CheckLevel.Warning, "some")
.addConstraint(RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint)
.addConstraint(
RetainCompletenessRule(intervalStrategy = strategy).candidate(fakeColumnProfile, 100).constraint
)

val verificationResult = VerificationSuite()
.onData(dfWithColumnCandidate)
Expand Down

0 comments on commit 3a9916f

Please sign in to comment.