From ac6362e602977df6718c48a7befb9aef0610ddeb Mon Sep 17 00:00:00 2001 From: Mamadou S Diallo Date: Tue, 4 Feb 2025 10:35:53 -0500 Subject: [PATCH] added basic regression functionalities --- pyproject.toml | 2 +- src/samplics/apis/predict.py | 4 +- src/samplics/categorical/comparison.py | 93 ++++++----- src/samplics/categorical/tabulation.py | 121 +++++++++++---- src/samplics/categorical/tabulation_old.py | 95 ++++++++---- src/samplics/estimation/calibration.py | 4 +- src/samplics/estimation/expansion.py | 144 +++++++++++++----- src/samplics/estimation/replication.py | 61 ++++++-- src/samplics/regression/glm.py | 35 ++++- src/samplics/sae/eb_unit_model.py | 44 ++++-- src/samplics/sae/eblup_area_model.py | 4 +- src/samplics/sae/eblup_unit_model.py | 32 ++-- src/samplics/sae/robust_unit_model.py | 16 +- src/samplics/sae/sae_core_functions.py | 9 +- src/samplics/sampling/power_functions.py | 87 ++++++++--- src/samplics/sampling/selection.py | 65 +++++--- src/samplics/sampling/size.py | 60 ++++++-- src/samplics/sampling/size_functions.py | 4 +- src/samplics/utils/basic_functions.py | 4 +- src/samplics/utils/checks.py | 9 +- src/samplics/utils/formats.py | 14 +- src/samplics/utils/types.py | 45 ++++++ src/samplics/weighting/adjustment.py | 42 +++-- src/samplics/weighting/replicates.py | 36 +++-- tests/apis/sae/test_area_eblup.py | 4 +- tests/apis/sae/test_data_types.py | 12 +- tests/categorical/test_comparison.py | 56 +++++-- tests/categorical/test_crosstabulation.py | 44 ++++-- tests/datasets/test_datasets.py | 10 +- tests/estimation/test_expansion.py | 80 +++++++--- tests/estimation/test_expansion_fpc.py | 12 +- .../test_replication_conservative.py | 20 ++- .../test_replication_not_conservative.py | 24 ++- tests/regression/test_glm.py | 119 ++++++++++++++- tests/sae/test_eb_unit_model.py | 4 +- tests/sae/test_eblup_area_model2.py | 4 +- tests/sae/test_eblup_unit_model.py | 12 +- tests/sae/test_sae_core_functions.py | 16 +- tests/sampling/test_sample_size_mean.py | 54 +++++-- tests/sampling/test_sample_size_one_sample.py | 44 ++++-- tests/sampling/test_sample_size_proportion.py | 81 +++++++--- tests/sampling/test_sample_size_two_sample.py | 8 +- tests/sampling/test_selection.py | 51 +++++-- tests/simulated_population.py | 4 +- tests/types/test_container_directest.py | 12 +- tests/utils/test_basic_functions.py | 4 +- tests/utils/test_checks.py | 32 +++- tests/utils/test_formats.py | 4 +- tests/weighting/data_weights.py | 8 +- tests/weighting/test_adjustment.py | 29 +++- tests/weighting/test_calibration.py | 8 +- tests/weighting/test_replicates.py | 12 +- 52 files changed, 1370 insertions(+), 428 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index bf87a20c..3c4056c3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -67,7 +67,7 @@ exclude = ''' # target-version = ["py310", "py311"] [tool.pytest.ini_options] -addopts = "--ignore=tests/apis --ignore=tests/sae --ignore=tests/types --ignore=tests/categorical" +addopts = "--ignore=tests/apis --ignore=tests/sae --ignore=tests/types" # testpaths = ["tests"] diff --git a/src/samplics/apis/predict.py b/src/samplics/apis/predict.py index 2fa6ec2e..52e7e5ce 100644 --- a/src/samplics/apis/predict.py +++ b/src/samplics/apis/predict.py @@ -9,4 +9,6 @@ def predict( intercept: bool = True, # if True, it adds an intercept of 1 b_const: DictStrNum | Number = 1.0, ): - return _predict_eblup(x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const) + return _predict_eblup( + x=x, fit_eblup=fit_stats, y=y, intercept=intercept, b_const=b_const 
+ ) diff --git a/src/samplics/categorical/comparison.py b/src/samplics/categorical/comparison.py index d921f085..cbf3c23a 100644 --- a/src/samplics/categorical/comparison.py +++ b/src/samplics/categorical/comparison.py @@ -30,9 +30,13 @@ class Ttest: - def __init__(self, samp_type: str, paired: bool = False, alpha: float = 0.05) -> None: + def __init__( + self, samp_type: str, paired: bool = False, alpha: float = 0.05 + ) -> None: if samp_type.lower() not in ("one-sample", "two-sample"): - raise ValueError("Parameter 'type' must be equal to 'one-sample', 'two-sample'!") + raise ValueError( + "Parameter 'type' must be equal to 'one-sample', 'two-sample'!" + ) assert_probabilities(x=alpha) self.samp_type = samp_type.lower() @@ -58,17 +62,27 @@ def __str__(self) -> str: return "No table to display" else: tbl_head = f"Design-based {self.samp_type.title()} T-test" - if (self.samp_type == "one-sample" and self.group_names == []) or self.paired: + if ( + self.samp_type == "one-sample" and self.group_names == [] + ) or self.paired: if self.samp_type == "one-sample": - tbl_subhead1 = f" Null hypothesis (Ho): mean = {self.stats['known_mean']}" + tbl_subhead1 = ( + f" Null hypothesis (Ho): mean = {self.stats['known_mean']}" + ) else: tbl_subhead1 = f" Null hypothesis (Ho): mean(Diff = {self.vars_names[0]} - {self.vars_names[1]}) = 0" tbl_subhead2 = f" t statistics: {self.stats['t']:.4f}" tbl_subhead3 = f" Degrees of freedom: {self.stats['df']:.2f}" tbl_subhead4 = " Alternative hypothesis (Ha):" - tbl_subhead4a = f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}" - tbl_subhead4b = f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}" - tbl_subhead4c = f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}" + tbl_subhead4a = ( + f" Prob(T < t) = {self.stats['p_value']['less_than']:.4f}" + ) + tbl_subhead4b = ( + f" Prob(|T| > |t|) = {self.stats['p_value']['not_equal']:.4f}" + ) + tbl_subhead4c = ( + f" Prob(T > t) = {self.stats['p_value']['greater_than']:.4f}" + ) return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead4c} \n\n{self.to_dataframe().to_string(index=False)}\n" @@ -78,30 +92,22 @@ def __str__(self) -> str: tbl_subhead1 = f" Null hypothesis (Ho): mean({self.group_names[0]}) = mean({self.group_names[1]}) " tbl_subhead2 = " Equal variance assumption:" tbl_subhead2a = f" t statistics: {self.stats['t_eq_variance']:.4f}" - tbl_subhead2b = f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}" - tbl_subhead3 = " Alternative hypothesis (Ha):" - tbl_subhead3a = ( - f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}" - ) - tbl_subhead3b = ( - f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}" - ) - tbl_subhead3c = ( - f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}" + tbl_subhead2b = ( + f" Degrees of freedom: {self.stats['df_eq_variance']:.2f}" ) + tbl_subhead3 = " Alternative hypothesis (Ha):" + tbl_subhead3a = f" Prob(T < t) = {self.stats['p_value_eq_variance']['less_than']:.4f}" + tbl_subhead3b = f" Prob(|T| > |t|) = {self.stats['p_value_eq_variance']['not_equal']:.4f}" + tbl_subhead3c = f" Prob(T > t) = {self.stats['p_value_eq_variance']['greater_than']:.4f}" tbl_subhead4 = " Unequal variance assumption:" tbl_subhead4a = f" t statistics: {self.stats['t_uneq_variance']:.4f}" - tbl_subhead4b = f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}" - tbl_subhead5 = " Alternative hypothesis (Ha):" - tbl_subhead5a = ( - f" 
Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}" - ) - tbl_subhead5b = ( - f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}" - ) - tbl_subhead5c = ( - f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}" + tbl_subhead4b = ( + f" Degrees of freedom: {self.stats['df_uneq_variance']:.2f}" ) + tbl_subhead5 = " Alternative hypothesis (Ha):" + tbl_subhead5a = f" Prob(T < t) = {self.stats['p_value_uneq_variance']['less_than']:.4f}" + tbl_subhead5b = f" Prob(|T| > |t|) = {self.stats['p_value_uneq_variance']['not_equal']:.4f}" + tbl_subhead5c = f" Prob(T > t) = {self.stats['p_value_uneq_variance']['greater_than']:.4f}" return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead2a}\n{tbl_subhead2b}\n{tbl_subhead3}\n{tbl_subhead3a}\n{tbl_subhead3b}\n{tbl_subhead3c}\n{tbl_subhead4}\n{tbl_subhead4a}\n{tbl_subhead4b}\n{tbl_subhead5}\n{tbl_subhead5a}\n{tbl_subhead5b}\n{tbl_subhead5c} \n\n{self.to_dataframe().to_string(index=False)}\n" else: @@ -117,7 +123,9 @@ def _one_sample_one_group( ssu: Array, fpc: Union[Dict, float] = 1, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, ) -> None: one_sample = TaylorEstimator(param=PopParam.mean, alpha=self.alpha) @@ -186,7 +194,10 @@ def _two_groups_unpaired( t_equal_variance = (mean_group1 - mean_group2) / ( math.sqrt( - ((nb_obs_group1 - 1) * stddev_group1**2 + (nb_obs_group2 - 1) * stddev_group2**2) + ( + (nb_obs_group1 - 1) * stddev_group1**2 + + (nb_obs_group2 - 1) * stddev_group2**2 + ) / (nb_obs_group1 + nb_obs_group2 - 2) ) * math.sqrt(1 / nb_obs_group1 + 1 / nb_obs_group2) @@ -206,10 +217,14 @@ def _two_groups_unpaired( ) left_p_value_equal_variance = t.cdf(t_equal_variance, t_df_equal_variance) - both_p_value_equal_variance = 2 * t.cdf(-abs(t_equal_variance), t_df_equal_variance) + both_p_value_equal_variance = 2 * t.cdf( + -abs(t_equal_variance), t_df_equal_variance + ) left_p_value_unequal_variance = t.cdf(t_unequal_variance, t_df_unequal_variance) - both_p_value_unequal_variance = 2 * t.cdf(-abs(t_unequal_variance), t_df_unequal_variance) + both_p_value_unequal_variance = 2 * t.cdf( + -abs(t_unequal_variance), t_df_unequal_variance + ) stats = { "number_obs": {group1: nb_obs_group1, group2: nb_obs_group2}, @@ -262,7 +277,9 @@ def _two_samples_unpaired( ssu: Optional[Array] = None, fpc: Union[Dict, float] = 1, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, ) -> None: two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha) @@ -292,7 +309,9 @@ def compare( ssu: Optional[Array] = None, fpc: Union[Dict, float] = 1, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> None: @@ -301,7 +320,9 @@ def compare( if known_mean is None and group is None: raise AssertionError("Parameters 'known_mean' or 'group' must be provided!") if known_mean is not None and group is not None: - raise AssertionError("Only one 
parameter 'known_mean' or 'group' should be provided!") + raise AssertionError( + "Only one parameter 'known_mean' or 'group' should be provided!" + ) if varnames is None: self.vars_names = set_variables_names(y, None, "var") @@ -369,7 +390,9 @@ def compare( strata_comb=strata_comb, ) - two_samples_unpaired = TaylorEstimator(param=PopParam.mean, alpha=self.alpha) + two_samples_unpaired = TaylorEstimator( + param=PopParam.mean, alpha=self.alpha + ) two_samples_unpaired.estimate( y=_y, by=_group, diff --git a/src/samplics/categorical/tabulation.py b/src/samplics/categorical/tabulation.py index 5a28d46f..a7c6af9b 100644 --- a/src/samplics/categorical/tabulation.py +++ b/src/samplics/categorical/tabulation.py @@ -58,7 +58,9 @@ def __str__(self) -> str: tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}" tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}" tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}" - tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + tbl_subhead4 = ( + f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + ) return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n" @@ -73,7 +75,9 @@ def _estimate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> tuple[TaylorEstimator, list, int]: @@ -87,7 +91,9 @@ def _estimate( ssu, ) if var.ndim == 1: # Series - to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel()) + to_keep = to_keep & remove_nans( + var.values.ravel().shape[0], var.values.ravel() + ) elif var.ndim == 2: # DataFrame for col in var.columns: to_keep = to_keep & remove_nans( @@ -155,7 +161,9 @@ def tabulate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> None: @@ -188,7 +196,9 @@ def tabulate( _samp_weight = numpy_array(samp_weight) _samp_weight = ( - np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight + np.ones(vars_df.shape[0]) + if _samp_weight.shape in ((), (0,)) + else _samp_weight ) _samp_weight = ( np.repeat(_samp_weight, vars_df.shape[0]) @@ -282,7 +292,9 @@ def to_dataframe( oneway_df = pd.DataFrame([]) for var in self.vars_names: - var_df = pd.DataFrame(np.repeat(var, len(self.vars_levels[var])), columns=["variable"]) + var_df = pd.DataFrame( + np.repeat(var, len(self.vars_levels[var])), columns=["variable"] + ) var_df["category"] = self.vars_levels[var] var_df[self.param] = list(self.point_est[var].values()) var_df["stderror"] = list(self.stderror[var].values()) @@ -341,20 +353,22 @@ def __str__(self) -> str: if self.vars_names == []: return "No categorical variables to tabulate" else: - tbl_head = f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}" + tbl_head = ( + f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}" + ) tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}" tbl_subhead2 = f" Number of PSUs: 
{self.design_info['nb_psus']}" tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}" - tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + tbl_subhead4 = ( + f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + ) chisq_dist = f"chi2({self.stats['Pearson-Unadj']['df']})" f_dist = f"F({self.stats['Pearson-Adj']['df_num']:.2f}, {self.stats['Pearson-Adj']['df_den']:.2f}" pearson_unadj = f"Unadjusted - {chisq_dist}: {self.stats['Pearson-Unadj']['chisq_value']:.4f} with p-value of {self.stats['Pearson-Unadj']['p_value']:.4f}" pearson_adj = f"Adjusted - {f_dist}): {self.stats['Pearson-Adj']['f_value']:.4f} with p-value of {self.stats['Pearson-Adj']['p_value']:.4f}" - pearson_test = ( - f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}" - ) + pearson_test = f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}" lr_unadj = f" Unadjusted - {chisq_dist}: {self.stats['LR-Unadj']['chisq_value']:.4f} with p-value of {self.stats['LR-Unadj']['p_value']:.4f}" lr_adj = f" Adjusted - {f_dist}): {self.stats['LR-Adj']['f_value']:.4f} with p-value of {self.stats['LR-Adj']['p_value']:.4f}" @@ -381,7 +395,9 @@ def _extract_estimates( else: for ll in missing_levels: tbl_est.covariance[level][ll] = 0.0 - tbl_est.covariance[level] = dict(sorted(tbl_est.covariance[level].items())) + tbl_est.covariance[level] = dict( + sorted(tbl_est.covariance[level].items()) + ) _tbl_est_point_est = dict(sorted(tbl_est.point_est.items())) _tbl_est_covariance = dict(sorted(tbl_est.covariance.items())) @@ -403,7 +419,9 @@ def tabulate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> None: @@ -442,7 +460,9 @@ def tabulate( df = vars.with_columns( samp_weight=numpy_array(samp_weight), stratum=( - np.repeat("__none__", vars.shape[0]) if stratum is None else numpy_array(stratum) + np.repeat("__none__", vars.shape[0]) + if stratum is None + else numpy_array(stratum) ), psu=( np.linspace(1, vars.shape[0], num=vars.shape[0]) @@ -466,10 +486,22 @@ def tabulate( if remove_nan: df = ( df.filter( - (pl.col(vars_names[0]).is_not_null() & ~pl.col(vars_names[1]).eq("NaN")) - & (pl.col(vars_names[1]).is_not_null() & ~pl.col(vars_names[1]).eq("NaN")) - & (pl.col("samp_weight").is_not_null() & pl.col("samp_weight").is_not_nan()) - & (pl.col("stratum").is_not_null() & ~pl.col(vars_names[1]).eq("NaN")) + ( + pl.col(vars_names[0]).is_not_null() + & ~pl.col(vars_names[1]).eq("NaN") + ) + & ( + pl.col(vars_names[1]).is_not_null() + & ~pl.col(vars_names[1]).eq("NaN") + ) + & ( + pl.col("samp_weight").is_not_null() + & pl.col("samp_weight").is_not_nan() + ) + & ( + pl.col("stratum").is_not_null() + & ~pl.col(vars_names[1]).eq("NaN") + ) & (pl.col("psu").is_not_null() & ~pl.col(vars_names[1]).eq("NaN")) & (pl.col("ssu").is_not_null() & ~pl.col(vars_names[1]).eq("NaN")) ) @@ -539,7 +571,8 @@ def tabulate( cov_est_srs = ( np.diag(cell_est) - - cell_est.reshape((cell_est.shape[0], 1)) @ cell_est.reshape((1, cell_est.shape[0])) + - cell_est.reshape((cell_est.shape[0], 1)) + @ cell_est.reshape((1, cell_est.shape[0])) ) / df.shape[0] # cov_est_srs = cov_est_srs * ((df.shape[0] - 1) / df.shape[0]) @@ -567,10 +600,12 @@ def tabulate( try: x1_t = np.transpose(x1) - 
x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ (x1_t @ cov_est_srs @ x2) - delta_est = np.linalg.inv(np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde) @ ( - np.transpose(x2_tilde) @ cov_est @ x2_tilde + x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_est_srs @ x1) @ ( + x1_t @ cov_est_srs @ x2 ) + delta_est = np.linalg.inv( + np.transpose(x2_tilde) @ cov_est_srs @ x2_tilde + ) @ (np.transpose(x2_tilde) @ cov_est @ x2_tilde) except np.linalg.LinAlgError: delta_est = np.zeros((nrows * ncols, nrows * ncols)) @@ -601,10 +636,18 @@ def tabulate( .drop("key") ) - poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key(key=vars_names[0]) - stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key(key=vars_names[0]) - lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key(key=vars_names[0]) - upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key(key=vars_names[0]) + poin_est_dict = tbl_df.select(vars_names + ["point_est"]).rows_by_key( + key=vars_names[0] + ) + stderror_dict = tbl_df.select(vars_names + ["stderror"]).rows_by_key( + key=vars_names[0] + ) + lower_ci_dict = tbl_df.select(vars_names + ["lower_ci"]).rows_by_key( + key=vars_names[0] + ) + upper_ci_dict = tbl_df.select(vars_names + ["upper_ci"]).rows_by_key( + key=vars_names[0] + ) for var1 in poin_est_dict: point_est = {} @@ -654,12 +697,18 @@ def tabulate( chisq_p = ( df.shape[0] - * ((tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2 / tbl_df["est_prop_null"]).sum() + * ( + (tbl_df["est_prop"] - tbl_df["est_prop_null"]) ** 2 + / tbl_df["est_prop_null"] + ).sum() ) chisq_lr = ( 2 * df.shape[0] - * (tbl_df["est_prop"] * (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log()) + * ( + tbl_df["est_prop"] + * (tbl_df["est_prop"] / tbl_df["est_prop_null"]).log() + ) .fill_nan(0) .sum() ) @@ -722,19 +771,27 @@ def to_dataframe( for _ in range(len(self.row_levels)): for _ in range(len(self.col_levels)): twoway_df[self.param] = sum( - pd.DataFrame.from_dict(self.point_est, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.point_est, orient="index" + ).values.tolist(), [], ) twoway_df["stderror"] = sum( - pd.DataFrame.from_dict(self.stderror, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.stderror, orient="index" + ).values.tolist(), [], ) twoway_df["lower_ci"] = sum( - pd.DataFrame.from_dict(self.lower_ci, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.lower_ci, orient="index" + ).values.tolist(), [], ) twoway_df["upper_ci"] = sum( - pd.DataFrame.from_dict(self.upper_ci, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.upper_ci, orient="index" + ).values.tolist(), [], ) # twoway_df.sort_values(by=self.vars_names, inplace=True) diff --git a/src/samplics/categorical/tabulation_old.py b/src/samplics/categorical/tabulation_old.py index 6570e0fe..b8c709b9 100644 --- a/src/samplics/categorical/tabulation_old.py +++ b/src/samplics/categorical/tabulation_old.py @@ -57,7 +57,9 @@ def __str__(self) -> str: tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}" tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}" tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}" - tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + tbl_subhead4 = ( + f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + ) return f"\n{tbl_head}\n{tbl_subhead1}\n{tbl_subhead2}\n{tbl_subhead3}\n{tbl_subhead4}\n\n {self.to_dataframe().to_string(index=False)}\n" 
@@ -72,7 +74,9 @@ def _estimate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> tuple[TaylorEstimator, list, int]: @@ -86,7 +90,9 @@ def _estimate( ssu, ) if var.ndim == 1: # Series - to_keep = to_keep & remove_nans(var.values.ravel().shape[0], var.values.ravel()) + to_keep = to_keep & remove_nans( + var.values.ravel().shape[0], var.values.ravel() + ) elif var.ndim == 2: # DataFrame for col in var.columns: to_keep = to_keep & remove_nans( @@ -154,7 +160,9 @@ def tabulate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> None: @@ -187,7 +195,9 @@ def tabulate( _samp_weight = numpy_array(samp_weight) _samp_weight = ( - np.ones(vars_df.shape[0]) if _samp_weight.shape in ((), (0,)) else _samp_weight + np.ones(vars_df.shape[0]) + if _samp_weight.shape in ((), (0,)) + else _samp_weight ) _samp_weight = ( np.repeat(_samp_weight, vars_df.shape[0]) @@ -281,7 +291,9 @@ def to_dataframe( oneway_df = pd.DataFrame([]) for var in self.vars_names: - var_df = pd.DataFrame(np.repeat(var, len(self.vars_levels[var])), columns=["variable"]) + var_df = pd.DataFrame( + np.repeat(var, len(self.vars_levels[var])), columns=["variable"] + ) var_df["category"] = self.vars_levels[var] var_df[self.param] = list(self.point_est[var].values()) var_df["stderror"] = list(self.stderror[var].values()) @@ -340,20 +352,22 @@ def __str__(self) -> str: if self.vars_names == []: return "No categorical variables to tabulate" else: - tbl_head = f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}" + tbl_head = ( + f"Cross-tabulation of {self.vars_names[0]} and {self.vars_names[1]}" + ) tbl_subhead1 = f" Number of strata: {self.design_info['nb_strata']}" tbl_subhead2 = f" Number of PSUs: {self.design_info['nb_psus']}" tbl_subhead3 = f" Number of observations: {self.design_info['nb_obs']}" - tbl_subhead4 = f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + tbl_subhead4 = ( + f" Degrees of freedom: {self.design_info['degrees_of_freedom']:.2f}" + ) chisq_dist = f"chi2({self.stats['Pearson-Unadj']['df']})" f_dist = f"F({self.stats['Pearson-Adj']['df_num']:.2f}, {self.stats['Pearson-Adj']['df_den']:.2f}" pearson_unadj = f"Unadjusted - {chisq_dist}: {self.stats['Pearson-Unadj']['chisq_value']:.4f} with p-value of {self.stats['Pearson-Unadj']['p_value']:.4f}" pearson_adj = f"Adjusted - {f_dist}): {self.stats['Pearson-Adj']['f_value']:.4f} with p-value of {self.stats['Pearson-Adj']['p_value']:.4f}" - pearson_test = ( - f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}" - ) + pearson_test = f"Pearson (with Rao-Scott adjustment):\n\t{pearson_unadj}\n\t{pearson_adj}" lr_unadj = f" Unadjusted - {chisq_dist}: {self.stats['LR-Unadj']['chisq_value']:.4f} with p-value of {self.stats['LR-Unadj']['p_value']:.4f}" lr_adj = f" Adjusted - {f_dist}): {self.stats['LR-Adj']['f_value']:.4f} with p-value of {self.stats['LR-Adj']['p_value']:.4f}" @@ -380,7 +394,9 @@ def _extract_estimates( else: for ll in missing_levels: 
tbl_est.covariance[level][ll] = 0.0 - tbl_est.covariance[level] = dict(sorted(tbl_est.covariance[level].items())) + tbl_est.covariance[level] = dict( + sorted(tbl_est.covariance[level].items()) + ) _tbl_est_point_est = dict(sorted(tbl_est.point_est.items())) _tbl_est_covariance = dict(sorted(tbl_est.covariance.items())) @@ -402,7 +418,9 @@ def tabulate( fpc: Union[dict, float] = 1, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, remove_nan: bool = False, ) -> None: @@ -448,9 +466,13 @@ def tabulate( if remove_nan: # vars_nans = vars.isna() # excluded_units = vars_nans.iloc[:, 0] | vars_nans.iloc[:, 1] - to_keep = remove_nans(vars.shape[0], vars.iloc[:, 0].values, vars.iloc[:, 1].values) + to_keep = remove_nans( + vars.shape[0], vars.iloc[:, 0].values, vars.iloc[:, 1].values + ) samp_weight = ( - samp_weight[to_keep] if samp_weight.shape not in ((), (0,)) else samp_weight + samp_weight[to_keep] + if samp_weight.shape not in ((), (0,)) + else samp_weight ) stratum = stratum[to_keep] if stratum.shape not in ((), (0,)) else stratum psu = psu[to_keep] if psu.shape not in ((), (0,)) else psu @@ -482,7 +504,9 @@ def tabulate( vars_levels = pd.DataFrame([ll for ll in itertools.product(*both_levels)]) vars_levels.columns = vars_names_str - vars_dummies = np.asarray(dmatrix(two_way_full_model, vars_levels, NA_action="raise")) + vars_dummies = np.asarray( + dmatrix(two_way_full_model, vars_levels, NA_action="raise") + ) # vars_dummies = np.delete(vars_dummies, obj=1, axis=0) # vars_dummies = np.delete(vars_dummies, obj=2, axis=1) @@ -560,7 +584,9 @@ def tabulate( # we have that inv(x' V x) = inv(x' L L' x) = z'z where z = inv(L) x # L is the Cholesky factor i.e. 
L = np.linalg.cholesky(V) x1_t = np.transpose(x1) - x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_prop_srs @ x1) @ (x1_t @ cov_prop_srs @ x2) + x2_tilde = x2 - x1 @ np.linalg.inv(x1_t @ cov_prop_srs @ x1) @ ( + x1_t @ cov_prop_srs @ x2 + ) breakpoint() delta_est = np.linalg.inv(np.transpose(x2_tilde) @ cov_prop_srs @ x2_tilde) @ ( @@ -593,17 +619,23 @@ def tabulate( ) stderror.update( { - vars_levels.iloc[r * ncols + c, 1]: cell_stderror[r * ncols + c], + vars_levels.iloc[r * ncols + c, 1]: cell_stderror[ + r * ncols + c + ], } ) lower_ci.update( { - vars_levels.iloc[r * ncols + c, 1]: cell_lower_ci[r * ncols + c], + vars_levels.iloc[r * ncols + c, 1]: cell_lower_ci[ + r * ncols + c + ], } ) upper_ci.update( { - vars_levels.iloc[r * ncols + c, 1]: cell_upper_ci[r * ncols + c], + vars_levels.iloc[r * ncols + c, 1]: cell_upper_ci[ + r * ncols + c + ], } ) self.point_est.update({vars_levels.iloc[r * ncols, 0]: point_est}) @@ -621,14 +653,17 @@ def tabulate( ).reshape(1, ncols) chisq_p = float( - vars.shape[0] * np.sum((point_est_df - point_est_null) ** 2 / point_est_null) + vars.shape[0] + * np.sum((point_est_df - point_est_null) ** 2 / point_est_null) ) # valid indexes (i,j) correspond to n_ij > 0 valid_indx = (point_est_df != 0) & (point_est_null != 0) log_mat = np.zeros(point_est_null.shape) - log_mat[valid_indx] = np.log(point_est_df[valid_indx] / point_est_null[valid_indx]) + log_mat[valid_indx] = np.log( + point_est_df[valid_indx] / point_est_null[valid_indx] + ) chisq_lr = float(2 * vars.shape[0] * np.sum(point_est_df * log_mat)) @@ -691,19 +726,27 @@ def to_dataframe( for _ in range(len(self.row_levels)): for _ in range(len(self.col_levels)): twoway_df[self.param] = sum( - pd.DataFrame.from_dict(self.point_est, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.point_est, orient="index" + ).values.tolist(), [], ) twoway_df["stderror"] = sum( - pd.DataFrame.from_dict(self.stderror, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.stderror, orient="index" + ).values.tolist(), [], ) twoway_df["lower_ci"] = sum( - pd.DataFrame.from_dict(self.lower_ci, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.lower_ci, orient="index" + ).values.tolist(), [], ) twoway_df["upper_ci"] = sum( - pd.DataFrame.from_dict(self.upper_ci, orient="index").values.tolist(), + pd.DataFrame.from_dict( + self.upper_ci, orient="index" + ).values.tolist(), [], ) # twoway_df.sort_values(by=self.vars_names, inplace=True) diff --git a/src/samplics/estimation/calibration.py b/src/samplics/estimation/calibration.py index 562d26c7..3f03b34a 100644 --- a/src/samplics/estimation/calibration.py +++ b/src/samplics/estimation/calibration.py @@ -12,7 +12,9 @@ class CalibrateEstimator: """Provides calibration based estimation""" - def __init__(self, param: str, alpha: float = 0.05, rand_seed: Optional[int] = None): + def __init__( + self, param: str, alpha: float = 0.05, rand_seed: Optional[int] = None + ): pass def estimate( diff --git a/src/samplics/estimation/expansion.py b/src/samplics/estimation/expansion.py index 93481063..129ae67e 100755 --- a/src/samplics/estimation/expansion.py +++ b/src/samplics/estimation/expansion.py @@ -43,7 +43,9 @@ class _SurveyEstimator: """General approach for sample estimation of linear parameters.""" - def __init__(self, param: PopParam, alpha: float = 0.05, rand_seed: Optional[int] = None): + def __init__( + self, param: PopParam, alpha: float = 0.05, rand_seed: Optional[int] = None + ): """Initializes the instance""" self.rand_seed: 
Optional[int] @@ -61,7 +63,9 @@ def __init__(self, param: PopParam, alpha: float = 0.05, rand_seed: Optional[int self.alpha = alpha - self.point_est: Any = {} # Union[dict[StringNumber, DictStrNum], DictStrNum, Number] + self.point_est: Any = ( + {} + ) # Union[dict[StringNumber, DictStrNum], DictStrNum, Number] self.variance: Any = {} self.covariance: Any = {} self.stderror: Any = {} @@ -105,7 +109,9 @@ def __str__(self) -> Any: and isinstance(self.upper_ci, dict) and isinstance(self.coef_var, dict) ): - if self.domains is not None and (self.param == PopParam.prop or self.as_factor): + if self.domains is not None and ( + self.param == PopParam.prop or self.as_factor + ): domains = list() levels = list() point_est = list() @@ -181,7 +187,9 @@ def _degree_of_freedom( self.degree_of_freedom = self.nb_psus - self.nb_strata - def _get_point_d(self, y: np.ndarray, samp_weight: np.ndarray, x: np.ndarray) -> float: + def _get_point_d( + self, y: np.ndarray, samp_weight: np.ndarray, x: np.ndarray + ) -> float: if self.param in (PopParam.prop, PopParam.mean): return float(np.sum(samp_weight * y) / np.sum(samp_weight)) elif self.param == PopParam.total: @@ -239,7 +247,11 @@ def _get_point( for k in range(categories.size): y_k = y_dummies[:, k] cat_dict_k = dict( - {categories[k]: self._get_point_d(y=y_k, samp_weight=samp_weight, x=x)} + { + categories[k]: self._get_point_d( + y=y_k, samp_weight=samp_weight, x=x + ) + } ) cat_dict.update(cat_dict_k) return cat_dict @@ -255,7 +267,11 @@ def _get_point( for k in range(categories.size): y_d_k = y_dummies[domain == d, k] cat_dict_d_k = dict( - {categories[k]: self._get_point_d(y=y_d_k, samp_weight=weight_d, x=x)} + { + categories[k]: self._get_point_d( + y=y_d_k, samp_weight=weight_d, x=x + ) + } ) cat_dict_d.update(cat_dict_d_k) estimate1[d] = cat_dict_d @@ -327,12 +343,16 @@ def _score_variable( if self.param in (PopParam.prop, PopParam.mean): scale_weights = np.sum(samp_weight) location_weights = np.sum(y_weighted, axis=0) / scale_weights - return np.asarray((y - location_weights) * samp_weight[:, None] / scale_weights) + return np.asarray( + (y - location_weights) * samp_weight[:, None] / scale_weights + ) elif self.param == PopParam.ratio: weighted_sum_x = np.sum(x * samp_weight) weighted_ratio = np.sum(y_weighted, axis=0) / weighted_sum_x return np.asarray( - samp_weight[:, None] * (y - x[:, None] * weighted_ratio) / weighted_sum_x + samp_weight[:, None] + * (y - x[:, None] * weighted_ratio) + / weighted_sum_x ) elif self.param == PopParam.total: return np.asarray(y_weighted) @@ -427,7 +447,9 @@ def _taylor_variance( y_score_s = y_score[stratum == s] samp_weight_s = samp_weight[stratum == s] psu_s = psu[stratum == s] if psu.shape not in ((), (0,)) else psu - nb_psus_in_s = np.size(np.unique(psu_s)) if psu_s.shape not in ((), (0,)) else 0 + nb_psus_in_s = ( + np.size(np.unique(psu_s)) if psu_s.shape not in ((), (0,)) else 0 + ) ssu[stratum == s] if ssu.shape not in ((), (0,)) else ssu covariance += fpc[s] * self._variance_stratum_between( y_score_s=y_score_s, @@ -484,7 +506,9 @@ def _get_variance( fpc=fpc, skipped_strata=skipped_strata, ) # new - if (self.param == PopParam.prop or as_factor) and isinstance(cov_score, np.ndarray): + if (self.param == PopParam.prop or as_factor) and isinstance( + cov_score, np.ndarray + ): variance1: dict[StringNumber, dict] = {} covariance1: dict[StringNumber, dict] = {} variance1 = dict(zip(categories, np.diag(cov_score))) @@ -589,7 +613,11 @@ def _estimate( if domain.shape in ((), (0,)): if ( - (self.param == 
PopParam.prop or as_factor and self.param == PopParam.mean) + ( + self.param == PopParam.prop + or as_factor + and self.param == PopParam.mean + ) and isinstance(self.point_est, dict) and isinstance(self.variance, dict) ): @@ -622,7 +650,9 @@ def _estimate( self.lower_ci = lower_ci self.upper_ci = upper_ci elif ( - as_factor and isinstance(self.point_est, dict) and isinstance(self.variance, dict) + as_factor + and isinstance(self.point_est, dict) + and isinstance(self.variance, dict) ): stderror = {} lower_ci = {} @@ -630,8 +660,12 @@ def _estimate( coef_var = {} for level in self.variance: stderror[level] = math.sqrt(self.variance[level]) - lower_ci[level] = self.point_est[level] - t_quantile * stderror[level] - upper_ci[level] = self.point_est[level] + t_quantile * stderror[level] + lower_ci[level] = ( + self.point_est[level] - t_quantile * stderror[level] + ) + upper_ci[level] = ( + self.point_est[level] + t_quantile * stderror[level] + ) coef_var[level] = stderror[level] / self.point_est[level] self.stderror = stderror @@ -648,7 +682,11 @@ def _estimate( elif isinstance(self.variance, dict): for key in self.variance: if ( - (self.param == PopParam.prop or as_factor and self.param == PopParam.mean) + ( + self.param == PopParam.prop + or as_factor + and self.param == PopParam.mean + ) and isinstance(self.point_est, dict) and isinstance(self.variance, dict) ): @@ -670,7 +708,9 @@ def _estimate( upper_ci[level] = 1 coef_var[level] = 0 else: - location_ci = math.log(point_est1[level] / (1 - point_est1[level])) + location_ci = math.log( + point_est1[level] / (1 - point_est1[level]) + ) scale_ci = stderror[level] / ( point_est1[level] * (1 - point_est1[level]) ) @@ -695,8 +735,12 @@ def _estimate( coef_var = {} for level in self.variance[key]: stderror[level] = math.sqrt(self.variance[key][level]) - lower_ci[level] = self.point_est[key][level] - t_quantile * stderror[level] - upper_ci[level] = self.point_est[key][level] + t_quantile * stderror[level] + lower_ci[level] = ( + self.point_est[key][level] - t_quantile * stderror[level] + ) + upper_ci[level] = ( + self.point_est[key][level] + t_quantile * stderror[level] + ) coef_var[level] = stderror[level] / self.point_est[key][level] self.stderror[key] = stderror @@ -705,12 +749,20 @@ def _estimate( self.upper_ci[key] = upper_ci elif isinstance(self.point_est, dict): self.stderror[key] = math.sqrt(self.variance[key]) - self.lower_ci[key] = self.point_est[key] - t_quantile * self.stderror[key] - self.upper_ci[key] = self.point_est[key] + t_quantile * self.stderror[key] - self.coef_var[key] = math.sqrt(self.variance[key]) / self.point_est[key] + self.lower_ci[key] = ( + self.point_est[key] - t_quantile * self.stderror[key] + ) + self.upper_ci[key] = ( + self.point_est[key] + t_quantile * self.stderror[key] + ) + self.coef_var[key] = ( + math.sqrt(self.variance[key]) / self.point_est[key] + ) def _raise_singleton_error(self): - raise ValueError(f"Only one PSU in the following strata: {self.single_psu_strata}") + raise ValueError( + f"Only one PSU in the following strata: {self.single_psu_strata}" + ) def _skip_singleton(self, skipped_strata: Array) -> Array: skipped_str = np.isin(self.single_psu_strata, skipped_strata) @@ -740,7 +792,9 @@ def _certainty_singleton( @staticmethod def _combine_strata(comb_strata: Array, _stratum: Array) -> Array: if comb_strata is None: - raise ValueError("The parameter 'strata_comb' must be provided to combine strata") + raise ValueError( + "The parameter 'strata_comb' must be provided to combine strata" + ) else: 
for s in comb_strata: _stratum[_stratum == s] = comb_strata[s] @@ -759,13 +813,17 @@ def estimate( fpc: Union[dict[StringNumber, Number], Series, Number] = 1.0, deff: bool = False, coef_var: bool = False, - single_psu: Union[SinglePSUEst, dict[StringNumber, SinglePSUEst]] = SinglePSUEst.error, + single_psu: Union[ + SinglePSUEst, dict[StringNumber, SinglePSUEst] + ] = SinglePSUEst.error, strata_comb: Optional[dict[Array, Array]] = None, as_factor: bool = False, remove_nan: bool = False, ) -> None: if as_factor and self.param not in (PopParam.mean, PopParam.total): - raise AssertionError("When as_factor is True, parameter must be mean or total!") + raise AssertionError( + "When as_factor is True, parameter must be mean or total!" + ) if self.param == PopParam.ratio and x.shape in ((), (0,)): raise AssertionError("x must be provided for ratio estimation.") @@ -790,19 +848,27 @@ def estimate( _y = _y[to_keep] if _y.shape not in ((), (0,)) else _y _x = _x[to_keep] if _x.shape not in ((), (0,)) else _x - _stratum = _stratum[to_keep] if _stratum.shape not in ((), (0,)) else _stratum + _stratum = ( + _stratum[to_keep] if _stratum.shape not in ((), (0,)) else _stratum + ) _psu = _psu[to_keep] if _psu.shape not in ((), (0,)) else _psu _ssu = _ssu[to_keep] if _ssu.shape not in ((), (0,)) else _ssu _domain = _domain[to_keep] if _domain.shape not in ((), (0,)) else _domain _by = _by[to_keep] if _by.shape not in ((), (0,)) else _by _samp_weight = ( - _samp_weight[to_keep] if _samp_weight.shape not in ((), (0,)) else _samp_weight + _samp_weight[to_keep] + if _samp_weight.shape not in ((), (0,)) + else _samp_weight ) self.by = np.unique(_by).tolist() if _by.shape not in ((), (0,)) else _by - self.strata = np.unique(_stratum) if _stratum.shape not in ((), (0,)) else _stratum + self.strata = ( + np.unique(_stratum) if _stratum.shape not in ((), (0,)) else _stratum + ) - self.domains = np.unique(_domain) if _domain.shape not in ((), (0,)) else _domain + self.domains = ( + np.unique(_domain) if _domain.shape not in ((), (0,)) else _domain + ) if _stratum.shape not in ((), (0,)): # TODO: we could improve efficiency by creating the pair [stratum,psu, ssu] ounce and @@ -814,7 +880,9 @@ def estimate( if single_psu == SinglePSUEst.error: self._raise_singleton_error() if single_psu == SinglePSUEst.skip: - skipped_strata = self._skip_singleton(skipped_strata=self.single_psu_strata) + skipped_strata = self._skip_singleton( + skipped_strata=self.single_psu_strata + ) if single_psu == SinglePSUEst.certainty: _psu = self._certainty_singleton( singletons=self.single_psu_strata, @@ -830,7 +898,9 @@ def estimate( if single_psu[s] == SinglePSUEst.error: self._raise_singleton_error() if single_psu[s] == SinglePSUEst.skip: - skipped_strata = self._skip_singleton(skipped_strata=numpy_array(s)) + skipped_strata = self._skip_singleton( + skipped_strata=numpy_array(s) + ) if single_psu[s] == SinglePSUEst.certainty: _psu = self._certainty_singleton( singletons=numpy_array(s), @@ -852,7 +922,9 @@ def estimate( self.fpc = fpc_as_dict(_stratum, fpc) else: if list(np.unique(_stratum)) != list(fpc.keys()): - raise AssertionError("fpc dictionary keys must be the same as the strata!") + raise AssertionError( + "fpc dictionary keys must be the same as the strata!" 
+ ) else: self.fpc = fpc @@ -880,10 +952,14 @@ def estimate( _y_b = _y[group_b] _samp_weight_b = _samp_weight[group_b] _x_b = _x[group_b] if _x.shape not in ((), (0,)) else _x - _stratum_b = _stratum[group_b] if _stratum.shape not in ((), (0,)) else _stratum + _stratum_b = ( + _stratum[group_b] if _stratum.shape not in ((), (0,)) else _stratum + ) _psu_b = _psu[group_b] if _psu.shape not in ((), (0,)) else _psu _ssu_b = _ssu[group_b] if _ssu.shape not in ((), (0,)) else _ssu - _domain_b = _domain[group_b] if _domain.shape not in ((), (0,)) else _domain + _domain_b = ( + _domain[group_b] if _domain.shape not in ((), (0,)) else _domain + ) by_est = TaylorEstimator( param=self.param, diff --git a/src/samplics/estimation/replication.py b/src/samplics/estimation/replication.py index 10da8123..c7cbf62c 100644 --- a/src/samplics/estimation/replication.py +++ b/src/samplics/estimation/replication.py @@ -72,7 +72,9 @@ def __init__( self.nb_reps = rep_weight_cls.nb_reps self.rep_coefs = rep_weight_cls.rep_coefs self.degree_of_freedom = rep_weight_cls.degree_of_freedom - self.fay_coef = rep_weight_cls.fay_coef if self.method == RepMethod.brr else None + self.fay_coef = ( + rep_weight_cls.fay_coef if self.method == RepMethod.brr else None + ) def _rep_point( self, y: np.ndarray, rep_weights: np.ndarray, x: Optional[np.ndarray] @@ -85,7 +87,8 @@ def _rep_point( return np.asarray(np.sum(rep_weights * y[:, None], axis=0)) elif self.param == PopParam.ratio and x is not None: return np.asarray( - np.sum(rep_weights * y[:, None], axis=0) / np.sum(rep_weights * x[:, None], axis=0) + np.sum(rep_weights * y[:, None], axis=0) + / np.sum(rep_weights * x[:, None], axis=0) ) else: raise AssertionError("Parameter not valid!") @@ -130,19 +133,25 @@ def _variance( ) -> Number: variance = 0.0 rep_estimates = self._rep_point(y, rep_weights, x) - if self.method == RepMethod.jackknife: # page 155 (4.2.3 and 4.2.5) - Wolter(2003) + if ( + self.method == RepMethod.jackknife + ): # page 155 (4.2.3 and 4.2.5) - Wolter(2003) jk_factor = np.array(1 / (1 - rep_coefs)) pseudo_estimates = jk_factor * estimate - (jk_factor - 1) * rep_estimates if conservative: variance = float( - np.sum(rep_coefs * pow((pseudo_estimates - estimate) / (jk_factor - 1), 2)) + np.sum( + rep_coefs + * pow((pseudo_estimates - estimate) / (jk_factor - 1), 2) + ) ) elif not conservative: variance = float( np.sum( rep_coefs * pow( - (pseudo_estimates - np.mean(pseudo_estimates)) / (jk_factor - 1), + (pseudo_estimates - np.mean(pseudo_estimates)) + / (jk_factor - 1), 2, ) ) @@ -268,8 +277,12 @@ def _get_variance( cat_dict.update(cat_dict_k) return cat_dict else: - estimate = self._get_point(y=y, samp_weight=samp_weight, x=x, domain=domain) - return self._variance(y, rep_weights, rep_coefs, x, estimate, conservative) + estimate = self._get_point( + y=y, samp_weight=samp_weight, x=x, domain=domain + ) + return self._variance( + y, rep_weights, rep_coefs, x, estimate, conservative + ) else: variance_else1 = {} variance_else2 = {} @@ -344,9 +357,9 @@ def _get_confint( upper_ci[level] = math.exp(uu) / (1 + math.exp(uu)) return lower_ci, upper_ci else: - return estimate - quantile * pow(variance, 0.5), estimate + quantile * pow( + return estimate - quantile * pow( variance, 0.5 - ) + ), estimate + quantile * pow(variance, 0.5) else: lower_ci_else1 = {} @@ -369,8 +382,12 @@ def _get_confint( lower_ci_else1[key] = lower_ci_k upper_ci_else1[key] = upper_ci_k else: - lower_ci_else2[key] = estimate[key] - quantile * pow(variance[key], 0.5) - upper_ci_else2[key] = 
estimate[key] + quantile * pow(variance[key], 0.5)
+                    lower_ci_else2[key] = estimate[key] - quantile * pow(
+                        variance[key], 0.5
+                    )
+                    upper_ci_else2[key] = estimate[key] + quantile * pow(
+                        variance[key], 0.5
+                    )
 
         if self.param == PopParam.prop:
             return lower_ci_else1, upper_ci_else1
@@ -396,7 +413,9 @@ def _get_coefvar(
                 if param == PopParam.prop:
                     coef_var_k = {}
                     for level in variance[key]:
-                        coef_var_k[level] = pow(variance[key][level], 0.5) / estimate[key][level]
+                        coef_var_k[level] = (
+                            pow(variance[key][level], 0.5) / estimate[key][level]
+                        )
                     coef_var[key] = coef_var_k
                 else:
                     coef_var[key] = pow(variance[key], 0.5) / estimate[key]
@@ -450,9 +469,15 @@ def estimate(
             _y = _y[to_keep]
             _x = _x[to_keep] if _x.shape not in ((), (0,)) else _x
             _samp_weight = _samp_weight[to_keep]
-            _rep_coefs = _rep_coefs[to_keep] if _rep_coefs.shape not in ((), (0,)) else _rep_coefs
+            _rep_coefs = (
+                _rep_coefs[to_keep]
+                if _rep_coefs.shape not in ((), (0,))
+                else _rep_coefs
+            )
             _rep_weights = (
-                _rep_weights[to_keep] if _rep_weights.shape not in ((), (0,)) else _rep_weights
+                _rep_weights[to_keep]
+                if _rep_weights.shape not in ((), (0,))
+                else _rep_weights
             )
             _domain = _domain[to_keep] if _domain.shape not in ((), (0,)) else _domain
@@ -463,9 +488,13 @@ def estimate(
 
         self._rep_coefs(rep_coefs)
 
-        self.domains = np.unique(_domain) if _domain.shape not in ((), (0,)) else _domain
+        self.domains = (
+            np.unique(_domain) if _domain.shape not in ((), (0,)) else _domain
+        )
 
-        self.point_est = self._get_point(y=_y, samp_weight=_samp_weight, x=_x, domain=_domain)
+        self.point_est = self._get_point(
+            y=_y, samp_weight=_samp_weight, x=_x, domain=_domain
+        )
         self.variance = self._get_variance(
             y=_y,
             samp_weight=_samp_weight,
diff --git a/src/samplics/regression/glm.py b/src/samplics/regression/glm.py
index d36a1873..d054952e 100644
--- a/src/samplics/regression/glm.py
+++ b/src/samplics/regression/glm.py
@@ -11,7 +11,7 @@
     fpc_as_dict,
     numpy_array,
 )
-from samplics.utils.types import Array, Number, Series, StringNumber
+from samplics.utils.types import Array, Number, Series, StringNumber, ModelType
 
 
 class SurveyGLM:
@@ -21,7 +21,9 @@ def __init__(self):
         self.beta: np.ndarray
 
     @staticmethod
-    def _residuals(e: np.ndarray, psu: np.ndarray, nb_vars: Number) -> tuple(np.ndarray, Number):
+    def _residuals(
+        e: np.ndarray, psu: np.ndarray, nb_vars: Number
+    ) -> tuple[np.ndarray, Number]:
         psus = np.unique(psu)
         if psus.shape[0] == 1 and e.shape[0] == 1:
             raise AssertionError("Only one observation in the stratum")
@@ -63,12 +65,16 @@ def _calculate_g(
 
     def estimate(
         self,
+        model: ModelType,
         y: Array,
         x: Optional[Array] = None,
         samp_weight: Optional[Array] = None,
         stratum: Optional[Series] = None,
         psu: Optional[Series] = None,
         fpc: Union[dict[StringNumber, Number], Series, Number] = 1.0,
+        add_intercept: bool = False,
+        tol: float = 1e-8,
+        maxiter: int = 100,
         remove_nan: bool = False,
     ) -> None:
         _y = numpy_array(y)
@@ -77,6 +83,12 @@ def estimate(
         _stratum = numpy_array(stratum)
         _samp_weight = numpy_array(samp_weight)
 
+        if _x.shape[0] > 0 and _x.ndim == 1:
+            _x = _x.reshape(_x.shape[0], 1)
+
+        if add_intercept:
+            _x = np.insert(_x, 0, 1, axis=1)
+
         if _samp_weight.shape in ((), (0,)):
             _samp_weight = np.ones(y.shape[0])
         if _samp_weight.shape[0] == 1:
@@ -86,11 +98,26 @@ def estimate(
             self.fpc = fpc_as_dict(_stratum, fpc)
         else:
             if np.unique(_stratum).tolist() != list(fpc.keys()):
-                raise AssertionError("fpc dictionary keys must be the same as the strata!")
+                raise AssertionError(
+                    "fpc dictionary keys must be the same as the 
strata!" + ) else: self.fpc = fpc - glm_model = sm.GLM(endog=_y, exog=_x, var_weights=_samp_weight) + match model: + case ModelType.LINEAR: + glm_model = sm.GLM(endog=_y, exog=_x, var_weights=_samp_weight) + case ModelType.LOGISTIC: + glm_model = sm.GLM( + endog=_y, + exog=_x, + var_weights=_samp_weight, + family=sm.families.Binomial(), + ) + case _: + raise NotImplementedError(f"Model {model} is not implemented yet") + + # glm_model = sm.GLM(endog=_y, exog=_x, var_weights=_samp_weight) glm_results = glm_model.fit() g = self._calculate_g( diff --git a/src/samplics/sae/eb_unit_model.py b/src/samplics/sae/eb_unit_model.py index d14d3629..fa0f53a3 100644 --- a/src/samplics/sae/eb_unit_model.py +++ b/src/samplics/sae/eb_unit_model.py @@ -160,10 +160,14 @@ def _transformation(self, y: np.ndarray, inverse: bool) -> np.ndarray: elif self.boxcox["lambda"] != 0.0: if inverse: return np.asarray( - np.exp(np.log(1 + y * self.boxcox["lambda"]) / self.boxcox["lambda"]) + np.exp( + np.log(1 + y * self.boxcox["lambda"]) / self.boxcox["lambda"] + ) ) else: - return np.asarray(np.power(y, self.boxcox["lambda"]) / self.boxcox["lambda"]) + return np.asarray( + np.power(y, self.boxcox["lambda"]) / self.boxcox["lambda"] + ) else: raise AssertionError @@ -300,7 +304,9 @@ def _predict_indicator( scale=(sigma2u * (1 - self.gamma[d])) ** 0.5, size=cycle_size, ) - errors = np.random.normal(scale=scale_dr * (sigma2e**0.5), size=(cycle_size, N_dr)) + errors = np.random.normal( + scale=scale_dr * (sigma2e**0.5), size=(cycle_size, N_dr) + ) y_dr_j = mu_dr[None, :] + mu_bias_dr + re_effects[:, None] + errors if j == 0: y_dr = y_dr_j @@ -316,7 +322,9 @@ def _predict_indicator( end="", ) - y_d = np.append(y_dr, np.tile(y_s[area_s == d], [number_samples, 1]), axis=1) + y_d = np.append( + y_dr, np.tile(y_s[area_s == d], [number_samples, 1]), axis=1 + ) z_d = basic_functions.transform( y_d, llambda=self.boxcox["lambda"], @@ -509,7 +517,9 @@ def bootstrap_mse( eta_pop_boot = np.zeros((number_reps, nb_areas_ps)) eta_samp_boot = np.zeros((number_reps, nb_areas_ps)) - y_samp_boot = np.zeros((number_reps, int(np.sum(list(sample_size_dict.values()))))) + y_samp_boot = np.zeros( + (number_reps, int(np.sum(list(sample_size_dict.values())))) + ) print(f"Generating the {number_reps} bootstrap replicate populations") for b in range(number_cycles): start = b * cycle_size @@ -530,7 +540,9 @@ def bootstrap_mse( scale=self.error_std * scale_dict[d], size=(cycle_size, np.sum(indice_dict[d])), ) - yboot_d = (X_dict[d] @ self.fixed_effects)[None, :] + re_d[:, None] + err_d + yboot_d = ( + (X_dict[d] @ self.fixed_effects)[None, :] + re_d[:, None] + err_d + ) zboot_d = basic_functions.transform( yboot_d, llambda=self.boxcox["lambda"], @@ -542,7 +554,9 @@ def bootstrap_mse( if i == 0: yboot_s = yboot_d[:, -int(sample_size_dict[d]) :] else: - yboot_s = np.append(yboot_s, yboot_d[:, -int(sample_size_dict[d]) :], axis=1) + yboot_s = np.append( + yboot_s, yboot_d[:, -int(sample_size_dict[d]) :], axis=1 + ) if show_progress: run_id = b * nb_areas_ps + i @@ -577,7 +591,9 @@ def bootstrap_mse( # "pgtol": tol, "maxiter": maxiter, } # TODO: to improve in the future. 
Check: statsmodels.LikelihoodModel.fit() - print(f"Fitting and predicting using each of the {number_reps} bootstrap populations") + print( + f"Fitting and predicting using each of the {number_reps} bootstrap populations" + ) for b in range(number_reps): with warnings.catch_warnings(): warnings.filterwarnings("ignore") @@ -624,7 +640,9 @@ def bootstrap_mse( print("\n") - mse_boot = np.asarray(np.mean(np.power(eta_samp_boot - eta_pop_boot, 2), axis=0)) + mse_boot = np.asarray( + np.mean(np.power(eta_samp_boot - eta_pop_boot, 2), axis=0) + ) self.area_mse_boot = dict(zip(self.arear_list, mse_boot)) def to_dataframe( @@ -644,7 +662,9 @@ def to_dataframe( ncols = len(col_names) if self.area_est is None: - raise AssertionError("No prediction yet. Must predict the area level estimates.") + raise AssertionError( + "No prediction yet. Must predict the area level estimates." + ) elif self.area_mse_boot is None and ncols not in (2, 3): raise AssertionError("col_names must have 2 or 3 values") elif self.area_mse_boot is None and ncols == 3: @@ -653,6 +673,8 @@ def to_dataframe( if self.area_mse_boot is None: area_df = formats.dict_to_dataframe(col_names, self.area_est) else: - area_df = formats.dict_to_dataframe(col_names, self.area_est, self.area_mse_boot) + area_df = formats.dict_to_dataframe( + col_names, self.area_est, self.area_mse_boot + ) return area_df diff --git a/src/samplics/sae/eblup_area_model.py b/src/samplics/sae/eblup_area_model.py index c485ea4d..9674e9e9 100644 --- a/src/samplics/sae/eblup_area_model.py +++ b/src/samplics/sae/eblup_area_model.py @@ -195,7 +195,9 @@ def _partial_derivatives( V = np.diag(v_i) v_inv = np.linalg.inv(V) x_vinv_x = np.matmul(np.matmul(np.transpose(X), v_inv), X) - x_xvinvx_x = np.matmul(np.matmul(X, np.linalg.inv(x_vinv_x)), np.transpose(X)) + x_xvinvx_x = np.matmul( + np.matmul(X, np.linalg.inv(x_vinv_x)), np.transpose(X) + ) P = v_inv - np.matmul(np.matmul(v_inv, x_xvinvx_x), v_inv) P_B = np.matmul(P, B) P_B_P = np.matmul(P_B, P) diff --git a/src/samplics/sae/eblup_unit_model.py b/src/samplics/sae/eblup_unit_model.py index 893bd511..d790c534 100644 --- a/src/samplics/sae/eblup_unit_model.py +++ b/src/samplics/sae/eblup_unit_model.py @@ -188,7 +188,11 @@ def _mse( g3_afactor = (1 / afactor**2) * (1 / (sigma2u + sigma2e / afactor) ** 3) g3 = ( g3_afactor - * ((sigma2e**2) * i_ee + (sigma2u**2) * i_vv - 2 * (sigma2e * sigma2u) * (-i_ve)) + * ( + (sigma2e**2) * i_ee + + (sigma2u**2) * i_vv + - 2 * (sigma2e * sigma2u) * (-i_ve) + ) / i_determinant ) @@ -277,7 +281,9 @@ def fit( + np.log(nb_obs - self.fixed_effects.shape[0]) * nb_variance_params ) elif self.method == FitMethod.ml: - aic = -2 * basic_fit.llf + 2 * (self.fixed_effects.shape[0] + nb_variance_params) + aic = -2 * basic_fit.llf + 2 * ( + self.fixed_effects.shape[0] + nb_variance_params + ) bic = -2 * basic_fit.llf + np.log(nb_obs) * ( self.fixed_effects.shape[0] + nb_variance_params ) @@ -369,7 +375,8 @@ def predict( samp_rate[d] = 0 if d in self.areas_list: area_est[d] = ( - mu[d] + (samp_rate[d] + (1 - samp_rate[d]) * self.gamma[d]) * resid[d] + mu[d] + + (samp_rate[d] + (1 - samp_rate[d]) * self.gamma[d]) * resid[d] ) else: area_est[d] = mu[d] @@ -377,7 +384,11 @@ def predict( self.samp_rate = samp_rate self.area_est = area_est - A_ps = np.diag(np.zeros(Xp_mean.shape[1])) if Xp_mean.ndim >= 2 else np.asarray([0]) + A_ps = ( + np.diag(np.zeros(Xp_mean.shape[1])) + if Xp_mean.ndim >= 2 + else np.asarray([0]) + ) ps_area_list = self.areap[ps] for d in ps_area_list: @@ -385,9 +396,9 @@ def predict( 
n_ps_d = np.sum(areadps) X_ps_d = Xs[areadps] scale_ps_d = self.scales[areadps] - V_ps_d = (self.error_std**2) * np.diag(scale_ps_d) + (self.re_std**2) * np.ones( - [n_ps_d, n_ps_d] - ) + V_ps_d = (self.error_std**2) * np.diag(scale_ps_d) + ( + self.re_std**2 + ) * np.ones([n_ps_d, n_ps_d]) A_ps = A_ps + np.transpose(X_ps_d) @ np.linalg.inv(V_ps_d) @ X_ps_d ps_area_indices = np.isin(self.areas_list, ps_area_list) @@ -515,13 +526,16 @@ def bootstrap_mse( if b in steps: i += 1 print( - f"\r[%-{bar_length - 1}s] %d%%" % ("=" * i, 2 + (100 / bar_length) * i), + f"\r[%-{bar_length - 1}s] %d%%" + % ("=" * i, 2 + (100 / bar_length) * i), end="", ) if show_progress: print("\n") - self.area_mse_boot = dict(zip(ps_area_list, np.asarray(np.mean(boot_mse, axis=0)))) + self.area_mse_boot = dict( + zip(ps_area_list, np.asarray(np.mean(boot_mse, axis=0))) + ) # TODO: nonnegligeable sampling fractions, section 7.2.4, Rao and Molina (2015) diff --git a/src/samplics/sae/robust_unit_model.py b/src/samplics/sae/robust_unit_model.py index 9302c251..a208a7d0 100644 --- a/src/samplics/sae/robust_unit_model.py +++ b/src/samplics/sae/robust_unit_model.py @@ -197,7 +197,9 @@ def fit( boxcox=self.boxcox["lambda"], constant=self.boxcox["constant"], ) - eb_ul.fit(ys, Xs, areas, samp_weight, scales, False, tol=tol, maxiter=maxiter) + eb_ul.fit( + ys, Xs, areas, samp_weight, scales, False, tol=tol, maxiter=maxiter + ) self.scales = eb_ul.scales self.afactors = eb_ul.afactors self.ys = eb_ul.ys @@ -231,7 +233,9 @@ def fit( self.Xs = Xs self.areas = areas self.areas_list = np.unique(areas) - self.afactors = dict(zip(self.areas_list, basic_functions.sumby(areas, scales))) + self.afactors = dict( + zip(self.areas_list, basic_functions.sumby(areas, scales)) + ) self.ys_mean, self.Xs_mean, _, samp_size = area_stats( ys, Xs, areas, 0, 1, self.afactors, samp_weight ) @@ -273,7 +277,9 @@ def _predict_indicator_parametric( if j == number_cycles: cycle_size = last_cycle_size re_effects = np.random.normal(scale=sigma2u**0.5, size=cycle_size) - errors = np.random.normal(scale=scale_d * (sigma2e**0.5), size=(cycle_size, N_d)) + errors = np.random.normal( + scale=scale_d * (sigma2e**0.5), size=(cycle_size, N_d) + ) y_d_j = mu_d[None, :] + re_effects[:, None] + errors if j == 0: y_d = y_d_j @@ -331,7 +337,9 @@ def _predict_indicator_nonparametric( if i == 0: unit_errors = total_residuals_d - area_effects[i] else: - unit_errors = np.append(unit_errors, total_residuals_d - area_effects[i]) + unit_errors = np.append( + unit_errors, total_residuals_d - area_effects[i] + ) eta = np.zeros((number_samples, nb_areas)) * np.nan for i, d in enumerate(areas): diff --git a/src/samplics/sae/sae_core_functions.py b/src/samplics/sae/sae_core_functions.py index 79bfc72e..4ad3d00a 100644 --- a/src/samplics/sae/sae_core_functions.py +++ b/src/samplics/sae/sae_core_functions.py @@ -160,7 +160,8 @@ def inverse_covariance( gamma_d = sigma2u / (sigma2u + sigma2e / sum_scale_d) V_inv[start:end, start:end] = (1 / sigma2e) * ( np.diag(a_d) - - (gamma_d / sum_scale_d) * np.matmul(a_d[:, None], np.transpose(a_d[:, None])) + - (gamma_d / sum_scale_d) + * np.matmul(a_d[:, None], np.transpose(a_d[:, None])) ) return V_inv @@ -186,7 +187,11 @@ def log_det_covariance( det = 0 for d in np.unique(area): nd = np.sum(area == d) - det += np.sum(np.log(scale)) + nd * np.log(sigma2e) + np.log(1 + nd * sigma2u / sigma2e) + det += ( + np.sum(np.log(scale)) + + nd * np.log(sigma2e) + + np.log(1 + nd * sigma2u / sigma2e) + ) return det diff --git 
a/src/samplics/sampling/power_functions.py b/src/samplics/sampling/power_functions.py index f1c89251..9c7625cf 100644 --- a/src/samplics/sampling/power_functions.py +++ b/src/samplics/sampling/power_functions.py @@ -28,16 +28,24 @@ def calculate_power_prop( ): z_value = normal().ppf(1 - alpha / 2) - if isinstance(prop_0, dict) and isinstance(prop_1, dict) and isinstance(samp_size, dict): + if ( + isinstance(prop_0, dict) + and isinstance(prop_1, dict) + and isinstance(samp_size, dict) + ): if two_sides: powerr: dict = {} for s in prop_0: - z = (prop_1[s] - prop_0[s]) / math.sqrt(prop_1[s] * (1 - prop_1[s]) / samp_size[s]) + z = (prop_1[s] - prop_0[s]) / math.sqrt( + prop_1[s] * (1 - prop_1[s]) / samp_size[s] + ) powerr[s] = normal().cdf(z - z_value) + normal().cdf(-z - z_value) else: powerr: dict = {} for s in prop_0: - z = (prop_1[s] - prop_0[s]) / math.sqrt(prop_1[s] * (1 - prop_1[s]) / samp_size[s]) + z = (prop_1[s] - prop_0[s]) / math.sqrt( + prop_1[s] * (1 - prop_1[s]) / samp_size[s] + ) powerr[s] = normal().cdf(z - z_value) + normal().cdf(-z - z_value) elif ( isinstance(prop_0, (int, float)) @@ -54,7 +62,8 @@ def calculate_power_prop( ) else: return ( - 1 - normal().cdf(z_value - h * math.sqrt(samp_size)) + 1 + - normal().cdf(z_value - h * math.sqrt(samp_size)) # + normal().cdf(-z_value - h * math.sqrt(samp_size)) ) else: @@ -88,15 +97,21 @@ def calculate_power( samp_size: Union[DictStrNum, Number, Array], alpha: float, ): - if isinstance(delta, dict) and isinstance(sigma, dict) and isinstance(samp_size, dict): + if ( + isinstance(delta, dict) + and isinstance(sigma, dict) + and isinstance(samp_size, dict) + ): if two_sides: return { s: 1 - normal().cdf( - normal().ppf(1 - alpha / 2) - delta[s] / (sigma[s] / math.sqrt(samp_size[s])) + normal().ppf(1 - alpha / 2) + - delta[s] / (sigma[s] / math.sqrt(samp_size[s])) ) + normal().cdf( - -normal().ppf(1 - alpha / 2) - delta[s] / (sigma[s] / math.sqrt(samp_size[s])) + -normal().ppf(1 - alpha / 2) + - delta[s] / (sigma[s] / math.sqrt(samp_size[s])) ) for s in delta } @@ -116,7 +131,8 @@ def calculate_power( normal().ppf(1 - alpha / 2) - delta / (sigma / math.sqrt(samp_size)) ) + normal().cdf( - -normal().ppf(1 - alpha / 2) - delta / (sigma / math.sqrt(samp_size)) + -normal().ppf(1 - alpha / 2) + - delta / (sigma / math.sqrt(samp_size)) ) ) else: @@ -146,7 +162,8 @@ def calculate_power( ) else: power[k] = 1 - normal().cdf( - normal().ppf(1 - alpha) - delta[k] / (sigma[k] / math.sqrt(samp_size[k])) + normal().ppf(1 - alpha) + - delta[k] / (sigma[k] / math.sqrt(samp_size[k])) ) return power @@ -166,20 +183,35 @@ def power_for_one_proportion( assert_proportions(prop_0=prop_0, prop_1=prop_1, alpha=alpha) if isinstance(alpha, (int, float)): - z_value = normal().ppf(1 - alpha / 2) if type == "two-sided" else normal().ppf(1 - alpha) + z_value = ( + normal().ppf(1 - alpha / 2) + if type == "two-sided" + else normal().ppf(1 - alpha) + ) if isinstance(alpha, (np.ndarray, pd.Series, list, tuple)): alpha = numpy_array(alpha) - z_value = normal().ppf(1 - alpha / 2) if type == "two-sided" else normal().ppf(1 - alpha) + z_value = ( + normal().ppf(1 - alpha / 2) + if type == "two-sided" + else normal().ppf(1 - alpha) + ) - if isinstance(prop_0, dict) and isinstance(prop_1, dict) and isinstance(samp_size, dict): + if ( + isinstance(prop_0, dict) + and isinstance(prop_1, dict) + and isinstance(samp_size, dict) + ): power: dict = {} for s in prop_0: if arcsin: z = ( - 2 * math.asin(math.sqrt(prop_1[s])) - 2 * math.asin(math.sqrt(prop_0[s])) + 2 * 
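The reformatted calculate_power_prop above implements the usual normal-approximation power for a proportion: z = (p1 - p0)/sqrt(p1*(1-p1)/n) and, two-sided, power = Phi(z - z_{1-alpha/2}) + Phi(-z - z_{1-alpha/2}). A standalone sketch (not part of the patch; the inputs are made up):

    import math
    from scipy.stats import norm

    prop_0, prop_1, n, alpha = 0.50, 0.60, 250, 0.05
    z_value = norm.ppf(1 - alpha / 2)
    z = (prop_1 - prop_0) / math.sqrt(prop_1 * (1 - prop_1) / n)
    power = norm.cdf(z - z_value) + norm.cdf(-z - z_value)  # about 0.90 here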
math.asin(math.sqrt(prop_1[s])) + - 2 * math.asin(math.sqrt(prop_0[s])) ) * math.sqrt(samp_size[s]) else: - z = (prop_1[s] - prop_0[s]) / math.sqrt(prop_1[s] * (1 - prop_1[s]) / samp_size[s]) + z = (prop_1[s] - prop_0[s]) / math.sqrt( + prop_1[s] * (1 - prop_1[s]) / samp_size[s] + ) if isinstance(alpha, dict): z_value = ( @@ -200,9 +232,9 @@ def power_for_one_proportion( and isinstance(samp_size, (int, float)) ): if arcsin: - z = (2 * math.asin(math.sqrt(prop_1)) - 2 * math.asin(math.sqrt(prop_0))) * math.sqrt( - samp_size - ) + z = ( + 2 * math.asin(math.sqrt(prop_1)) - 2 * math.asin(math.sqrt(prop_0)) + ) * math.sqrt(samp_size) else: z = (prop_1 - prop_0) / math.sqrt(prop_1 * (1 - prop_1) / samp_size) @@ -223,9 +255,9 @@ def power_for_one_proportion( for s in prop_0: if arcsin: - z = (2 * np.arcsin(np.sqrt(prop_1)) - 2 * np.arcsin(np.sqrt(prop_0))) * np.sqrt( - samp_size - ) + z = ( + 2 * np.arcsin(np.sqrt(prop_1)) - 2 * np.arcsin(np.sqrt(prop_0)) + ) * np.sqrt(samp_size) else: z = (prop_1 - prop_0) / np.sqrt(prop_1 * (1 - prop_1) / samp_size) @@ -286,11 +318,13 @@ def power_for_one_mean( } elif type == "greater": return normal().cdf( - (mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) - normal().ppf(1 - alpha) + (mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) + - normal().ppf(1 - alpha) ) else: return normal().cdf( - -(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) - normal().ppf(1 - alpha) + -(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) + - normal().ppf(1 - alpha) ) elif ( isinstance(mean_0, (int, float)) @@ -300,15 +334,18 @@ def power_for_one_mean( ): if type == "two-sided": return normal().cdf( - abs(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) - normal().ppf(1 - alpha / 2) + abs(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) + - normal().ppf(1 - alpha / 2) ) elif type == "greater": return normal().cdf( - (mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) - normal().ppf(1 - alpha) + (mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) + - normal().ppf(1 - alpha) ) else: return normal().cdf( - -(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) - normal().ppf(1 - alpha) + -(mean_0 - mean_1) / (sigma / math.sqrt(samp_size)) + - normal().ppf(1 - alpha) ) elif ( diff --git a/src/samplics/sampling/selection.py b/src/samplics/sampling/selection.py index 0aaba0a0..a4650718 100644 --- a/src/samplics/sampling/selection.py +++ b/src/samplics/sampling/selection.py @@ -270,7 +270,9 @@ def _pps_sys_select_core( cumsize = np.append(0, np.cumsum(mos)) samp_interval = cumsize[-1] / samp_size random_start = np.random.random_sample() * samp_interval - random_picks = random_start + samp_interval * np.linspace(0, samp_size - 1, samp_size) + random_picks = random_start + samp_interval * np.linspace( + 0, samp_size - 1, samp_size + ) hits = np.zeros(samp_unit.size).astype(int) for k in range(cumsize.size - 1): @@ -328,12 +330,17 @@ def _pps_hv_select_core( initial_probs_selection = ( samp_size * diff_probs - * (1 + np.linspace(1, samp_size, samp_size) * probs_sorted[pop_size - samp_size] / s) + * ( + 1 + + np.linspace(1, samp_size, samp_size) + * probs_sorted[pop_size - samp_size] + / s + ) ) probs_sorted = np.delete(probs_sorted, -1) - selected_i = np.random.choice(np.arange(0, samp_size), size=1, p=initial_probs_selection)[ - 0 - ] + selected_i = np.random.choice( + np.arange(0, samp_size), size=1, p=initial_probs_selection + )[0] sampled_indices = all_indices_sorted[selected_i + 1 : samp_size] notsampled_indices = np.delete(all_indices_sorted, sampled_indices) @@ -349,7 +356,9 @@ 
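power_for_one_proportion above also offers an arcsine-stabilized statistic, z = (2*asin(sqrt(p1)) - 2*asin(sqrt(p0)))*sqrt(n), which avoids the p1*(1-p1) variance term. A sketch with the same made-up inputs as the previous one (not part of the patch):

    import math
    from scipy.stats import norm

    prop_0, prop_1, n, alpha = 0.50, 0.60, 250, 0.05
    z = (2 * math.asin(math.sqrt(prop_1)) - 2 * math.asin(math.sqrt(prop_0))) * math.sqrt(n)
    power = norm.cdf(z - norm.ppf(1 - alpha / 2)) + norm.cdf(-z - norm.ppf(1 - alpha / 2))  # about 0.89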
def _pps_hv_select_core( ) p_starts = notsampled_probs / p_denominator range_part2 = range(pop_size - samp_size, pop_size - samp_size + selected_i) - p_starts[range_part2] = probs_sorted[pop_size - samp_size] / p_denominator[range_part2] + p_starts[range_part2] = ( + probs_sorted[pop_size - samp_size] / p_denominator[range_part2] + ) p_starts_sum = np.cumsum(np.flip(p_starts)[range(p_starts.size - 1)]) p_starts_sum = np.append(np.flip(p_starts_sum), 1) p_double_starts = p_starts / p_starts_sum @@ -363,7 +372,9 @@ def _pps_hv_select_core( p_double_space = 1 - (selected_i + 1 - ll) * np.append(0, p_double_space) p_double_space = np.delete(p_double_space, -1) a_j = ( - (samp_size - ll + 1) * p_starts[range(start_j, end_j)] * np.cumprod(p_double_space) + (samp_size - ll + 1) + * p_starts[range(start_j, end_j)] + * np.cumprod(p_double_space) ) indice_j = np.random.choice(sampling_space, size=1, p=a_j / np.sum(a_j))[0] selected_j = notsampled_indices[indice_j] @@ -422,10 +433,14 @@ def _pps_brewer_select_core( remaining_indices = np.delete(all_indices, sampled_indices) remaining_probs = np.delete(all_probs, sampled_indices) remaining_probs = ( - remaining_probs * (1 - remaining_probs) / (1 - (samp_size - s) * remaining_probs) + remaining_probs + * (1 - remaining_probs) + / (1 - (samp_size - s) * remaining_probs) ) remaining_probs = remaining_probs / sum(remaining_probs) - current_selection = np.random.choice(remaining_indices, 1, p=remaining_probs) + current_selection = np.random.choice( + remaining_indices, 1, p=remaining_probs + ) sampled_indices = np.append(sampled_indices, current_selection) sample[sampled_indices] = True @@ -637,11 +652,15 @@ def _sys_select( ( sample[stratum_units], hits[stratum_units], - ) = self._sys_selection_method(samp_unit[stratum_units], samp_size_s, samp_rate_s) + ) = self._sys_selection_method( + samp_unit[stratum_units], samp_size_s, samp_rate_s + ) elif isinstance(samp_size, int) or isinstance(samp_rate, float): samp_size_n = None if samp_size is None else samp_size samp_rate_n = None if samp_rate is None else samp_rate - sample, hits = self._sys_selection_method(samp_unit, samp_size_n, samp_rate_n) + sample, hits = self._sys_selection_method( + samp_unit, samp_size_n, samp_rate_n + ) return sample, hits @@ -660,7 +679,9 @@ def inclusion_probs( stratum = numpy_array(stratum) if isinstance(samp_size, (int, float)): strata = np.unique(stratum) - samp_size_temp = dict(zip(strata, np.repeat(int(samp_size), strata.shape[0]))) + samp_size_temp = dict( + zip(strata, np.repeat(int(samp_size), strata.shape[0])) + ) elif isinstance(samp_size, dict): samp_size_temp = samp_size.copy() else: @@ -686,9 +707,13 @@ def inclusion_probs( ): if self._anycertainty(samp_size_temp, stratum, mos): raise AssertionError("Some clusters are certainties.") - incl_probs = self._pps_inclusion_probs(samp_unit, samp_size_temp, mos, stratum) + incl_probs = self._pps_inclusion_probs( + samp_unit, samp_size_temp, mos, stratum + ) elif self.method == SelectMethod.sys: - incl_probs = self._sys_inclusion_probs(samp_unit, samp_size_temp, stratum, samp_rate) + incl_probs = self._sys_inclusion_probs( + samp_unit, samp_size_temp, stratum, samp_rate + ) else: raise ValueError("method not valid!") @@ -730,12 +755,16 @@ def select( self.strata = [] self.pop_size = _samp_unit.shape[0] if samp_rate is None: - self.samp_size = data_to_dict(data=samp_size, strat=self.strat, stratum=_stratum) + self.samp_size = data_to_dict( + data=samp_size, strat=self.strat, stratum=_stratum + ) self.samp_rate = 
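_pps_sys_select_core, reformatted earlier in this file's diff, is textbook systematic PPS: lay the units along the cumulated measure of size, draw a random start, then step by a fixed interval. A sketch (not part of the patch; the final hit-counting line is not shown in the hunk, so the standard counting rule is assumed here):

    import numpy as np

    mos = np.array([30, 10, 25, 15, 20])  # measures of size (assumed)
    samp_size = 2
    cumsize = np.append(0, np.cumsum(mos))
    samp_interval = cumsize[-1] / samp_size
    random_start = np.random.random_sample() * samp_interval
    random_picks = random_start + samp_interval * np.linspace(0, samp_size - 1, samp_size)
    hits = np.zeros(mos.size, dtype=int)
    for k in range(cumsize.size - 1):
        # a unit is hit once per pick landing in its cumulated-size segment
        hits[k] = np.sum((cumsize[k] < random_picks) & (random_picks <= cumsize[k + 1]))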
self._calculate_samp_rate( strat=self.strat, pop_size=self.pop_size, samp_size=self.samp_size ) else: - self.samp_rate = data_to_dict(data=samp_rate, strat=self.strat, stratum=_stratum) + self.samp_rate = data_to_dict( + data=samp_rate, strat=self.strat, stratum=_stratum + ) self.samp_size = self._calculate_samp_size( strat=self.strat, pop_size=self.pop_size, samp_rate=self.samp_rate ) @@ -775,7 +804,9 @@ def select( ) if remove_nan: - items_to_keep = remove_nans(_samp_ids.shape[0], _samp_ids, _stratum, _mos, _probs) + items_to_keep = remove_nans( + _samp_ids.shape[0], _samp_ids, _stratum, _mos, _probs + ) _samp_ids = _samp_ids[items_to_keep] _stratum = _stratum[items_to_keep] _mos = _mos[items_to_keep] diff --git a/src/samplics/sampling/size.py b/src/samplics/sampling/size.py index b976adb3..bf54a6a6 100644 --- a/src/samplics/sampling/size.py +++ b/src/samplics/sampling/size.py @@ -48,16 +48,23 @@ def allocate( elif method.lower() == "propal": if isinstance(pop_size, dict) and stddev is None and samp_size is not None: total_pop = sum(list(pop_size.values())) - samp_size_h = [math.ceil((samp_size / total_pop) * pop_size[k]) for k in stratum] + samp_size_h = [ + math.ceil((samp_size / total_pop) * pop_size[k]) for k in stratum + ] sample_sizes = dict(zip(stratum, samp_size_h)) - elif isinstance(pop_size, dict) and stddev is not None and samp_size is not None: + elif ( + isinstance(pop_size, dict) and stddev is not None and samp_size is not None + ): total_pop = sum(list(pop_size.values())) samp_size_h = [ - math.ceil((samp_size / total_pop) * pop_size[k] * stddev[k]) for k in stratum + math.ceil((samp_size / total_pop) * pop_size[k] * stddev[k]) + for k in stratum ] sample_sizes = dict(zip(stratum, samp_size_h)) else: - raise ValueError("param 'pop_size' must be a dictionary and 'samp_size' an integer!") + raise ValueError( + "param 'pop_size' must be a dictionary and 'samp_size' an integer!" + ) elif method.lower() == "fixed_rate": if isinstance(rate, (int, float)) and pop_size is not None: samp_size_h = [math.ceil(rate * pop_size[k]) for k in stratum] @@ -81,7 +88,11 @@ def allocate( ) sample_sizes = dict(zip(stratum, samp_size_h)) elif method.lower() == "optimum_mean": - if isinstance(rate, (int, float)) and pop_size is not None and stddev is not None: + if ( + isinstance(rate, (int, float)) + and pop_size is not None + and stddev is not None + ): samp_size_h = [math.ceil(rate * pop_size[k] * stddev[k]) for k in stratum] else: raise ValueError( @@ -96,7 +107,11 @@ def allocate( "param 'stddev' and 'rate' must be a dictionary and number respectively!" ) sample_sizes = dict(zip(stratum, samp_size_h)) - elif method.lower() == "variable_rate" and isinstance(rate, dict) and pop_size is not None: + elif ( + method.lower() == "variable_rate" + and isinstance(rate, dict) + and pop_size is not None + ): samp_size_h = [math.ceil(rate[k] * pop_size[k]) for k in stratum] sample_sizes = dict(zip(stratum, samp_size_h)) else: @@ -206,10 +221,14 @@ def calculate( alpha: float = 0.05, ) -> None: if self.param == PopParam.prop and target is None: - raise AssertionError("target must be provided to calculate sample size for prop.") + raise AssertionError( + "target must be provided to calculate sample size for prop." + ) if self.param == PopParam.mean and sigma is None: - raise AssertionError("sigma must be provided to calculate sample size for mean.") + raise AssertionError( + "sigma must be provided to calculate sample size for mean." 
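The "propal" branch of allocate() above is plain proportional allocation, n_h = ceil(n * N_h / N). A worked sketch (not part of the patch; the strata are made up):

    import math

    pop_size = {"h1": 5000, "h2": 3000, "h3": 2000}
    samp_size = 500
    total_pop = sum(pop_size.values())
    sample_sizes = {k: math.ceil((samp_size / total_pop) * pop_size[k]) for k in pop_size}
    # {'h1': 250, 'h2': 150, 'h3': 100}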
+ ) if self.param == PopParam.prop: if isinstance(target, (int, float)) and not 0 <= target <= 1: @@ -270,7 +289,10 @@ def calculate( alpha=self.alpha, strat=self.strat, ) - elif self.param in (PopParam.mean, PopParam.total) and self.method == SizeMethod.wald: + elif ( + self.param in (PopParam.mean, PopParam.total) + and self.method == SizeMethod.wald + ): self.samp_size = calculate_ss_wald_mean( half_ci=self.half_ci, sigma=self.sigma, @@ -370,7 +392,9 @@ class SampleSizeMeanOneSample: deff_c: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) deff_w: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) resp_rate: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) - pop_size: Optional[Union[DictStrNum, Array, Number]] = field(init=False, default=None) + pop_size: Optional[Union[DictStrNum, Array, Number]] = field( + init=False, default=None + ) alpha: Union[DictStrNum, Array, Number] = field(init=False, default=0.05) beta: Union[DictStrNum, Array, Number] = field(init=False, default=0.20) @@ -529,7 +553,9 @@ class SampleSizePropOneSample: deff_c: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) deff_w: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) resp_rate: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) - pop_size: Optional[Union[DictStrNum, Array, Number]] = field(init=False, default=None) + pop_size: Optional[Union[DictStrNum, Array, Number]] = field( + init=False, default=None + ) alpha: Union[DictStrNum, Array, Number] = field(init=False, default=0.05) beta: Union[DictStrNum, Array, Number] = field(init=False, default=0.20) @@ -691,14 +717,18 @@ class SampleSizeMeanTwoSample: delta: Union[DictStrNum, Array, Number] = field(init=False, default_factory=dict) sigma_1: Union[DictStrNum, Array, Number] = field(init=False, default_factory=dict) sigma_2: Union[DictStrNum, Array, Number] = field(init=False, default_factory=dict) - equal_var: Union[DictStrNum, Array, Number] = field(init=False, default_factory=dict) + equal_var: Union[DictStrNum, Array, Number] = field( + init=False, default_factory=dict + ) samp_size: Union[DictStrNum, Array, Number] = field(init=False, default=0) actual_power: Union[DictStrNum, Array, Number] = field(init=False, default=0) deff_c: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) deff_w: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) resp_rate: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) - pop_size: Optional[Union[DictStrNum, Array, Number]] = field(init=False, default=None) + pop_size: Optional[Union[DictStrNum, Array, Number]] = field( + init=False, default=None + ) alpha: Union[DictStrNum, Array, Number] = field(init=False, default=0.05) beta: Union[DictStrNum, Array, Number] = field(init=False, default=0.20) @@ -874,7 +904,9 @@ class SampleSizePropTwoSample: deff_c: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) deff_w: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) resp_rate: Union[DictStrNum, Array, Number] = field(init=False, default=1.0) - pop_size: Optional[Union[DictStrNum, Array, Number]] = field(init=False, default=None) + pop_size: Optional[Union[DictStrNum, Array, Number]] = field( + init=False, default=None + ) alpha: Union[DictStrNum, Array, Number] = field(init=False, default=0.05) beta: Union[DictStrNum, Array, Number] = field(init=False, default=0.20) diff --git a/src/samplics/sampling/size_functions.py b/src/samplics/sampling/size_functions.py 
index 81692953..162e8095 100644 --- a/src/samplics/sampling/size_functions.py +++ b/src/samplics/sampling/size_functions.py @@ -309,7 +309,9 @@ def _calculate_ss_wald_mean_one_sample( z_beta = normal().ppf(power) return math.ceil( - (1 / resp_rate) * deff_c * ((z_alpha + z_beta) * sigma / (delta - abs(epsilon))) ** 2 + (1 / resp_rate) + * deff_c + * ((z_alpha + z_beta) * sigma / (delta - abs(epsilon))) ** 2 ) diff --git a/src/samplics/utils/basic_functions.py b/src/samplics/utils/basic_functions.py index 31c2a8aa..e6018a69 100644 --- a/src/samplics/utils/basic_functions.py +++ b/src/samplics/utils/basic_functions.py @@ -37,7 +37,9 @@ def set_variables_names( if len(vars.shape) == 2: return [prefix + "_" + str(k) for k in range(1, vars.shape[1] + 1)] else: - return [prefix + "_" + str(k) for k in range(1, len(vars.shape) + 1)] + return [ + prefix + "_" + str(k) for k in range(1, len(vars.shape) + 1) + ] elif isinstance(vars, (tuple, list)): return [prefix + "_" + str(k) for k in range(1, len(vars) + 1)] else: diff --git a/src/samplics/utils/checks.py b/src/samplics/utils/checks.py index a005fa09..48d7a497 100644 --- a/src/samplics/utils/checks.py +++ b/src/samplics/utils/checks.py @@ -70,7 +70,10 @@ def assert_response_status( ) -> None: if response_status is None: raise AssertionError("response_status is not provided") - elif not np.isin(response_status, ("in", "rr", "nr", "uk")).all() and response_dict is None: + elif ( + not np.isin(response_status, ("in", "rr", "nr", "uk")).all() + and response_dict is None + ): raise AssertionError( "The response status must only contains values in ('in', 'rr', 'nr', 'uk') or the mapping should be provided using response_dict parameter" ) @@ -85,4 +88,6 @@ def assert_response_status( def assert_brr_number_psus(psu: np.ndarray) -> None: if psu.size % 2 != 0: - raise AssertionError("For the BRR method, the number of PSUs must be a multiple of two.") + raise AssertionError( + "For the BRR method, the number of PSUs must be a multiple of two." + ) diff --git a/src/samplics/utils/formats.py b/src/samplics/utils/formats.py index 7e760c71..808799b5 100644 --- a/src/samplics/utils/formats.py +++ b/src/samplics/utils/formats.py @@ -67,7 +67,9 @@ def dataframe_to_array(df: pd.DataFrame) -> np.ndarray: if nb_vars > 1: for k in range(1, nb_vars): x_array = ( - x_array + "_&_" + df.select(col_names[k]).to_series().cast(pl.Utf8).to_numpy() + x_array + + "_&_" + + df.select(col_names[k]).to_series().cast(pl.Utf8).to_numpy() ) return x_array else: @@ -104,7 +106,11 @@ def dict_to_dataframe(col_names: list[str], *args: Any) -> pd.DataFrame: number_keys = len(keys) for k, arg in enumerate(args): args_keys = list(args[k].keys()) - if not isinstance(arg, dict) or (keys != args_keys) or number_keys != len(args_keys): + if ( + not isinstance(arg, dict) + or (keys != args_keys) + or number_keys != len(args_keys) + ): raise AssertionError( "All input parameters must be dictionaries with the same keys." 
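_calculate_ss_wald_mean_one_sample, reformatted at the top of this hunk, computes n = ceil((1/resp_rate) * deff_c * ((z_alpha + z_beta) * sigma / (delta - |epsilon|))**2). A sketch with assumed inputs (not part of the patch; z_alpha is taken as the two-sided quantile, which the excerpt does not show):

    import math
    from scipy.stats import norm

    sigma, delta, epsilon = 2.0, 1.0, 0.0   # sd, detectable shift, margin (assumed)
    alpha, power = 0.05, 0.80
    deff_c, resp_rate = 1.0, 1.0
    z_alpha, z_beta = norm.ppf(1 - alpha / 2), norm.ppf(power)
    n = math.ceil((1 / resp_rate) * deff_c * ((z_alpha + z_beta) * sigma / (delta - abs(epsilon))) ** 2)
    # n == 32 for these values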
            )
@@ -163,7 +169,9 @@ def remove_nans(n: Number, *args: np.ndarray) -> list:
     return ~excluded_units


-def fpc_as_dict(stratum: np.ndarray, fpc: Union[Array, Number]) -> Union[DictStrNum, Number]:
+def fpc_as_dict(
+    stratum: np.ndarray, fpc: Union[Array, Number]
+) -> Union[DictStrNum, Number]:
     if stratum.shape in ((), (0,)) and isinstance(fpc, (int, float)):
         return fpc
     elif stratum.shape not in ((), (0,)) and isinstance(fpc, (int, float)):
diff --git a/src/samplics/utils/types.py b/src/samplics/utils/types.py
index 414a9330..a2653544 100644
--- a/src/samplics/utils/types.py
+++ b/src/samplics/utils/types.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pandas as pd
 import polars as pl
+from scipy.stats.distributions import cauchy
+from statsmodels.genmod.families.links import inverse_power

 DF = pl.DataFrame | pd.DataFrame

@@ -30,6 +32,46 @@ class FitMethod(Enum):
     reml = "REML"


+@unique
+class ModelType(Enum):
+    LINEAR = "Linear"
+    LOGISTIC = "Logistic"
+    # poisson = "Poisson"
+    # negative_binomial = "Negative Binomial"
+    # gamma = "Gamma"
+    # beta = "Beta"
+    # ordinal = "Ordinal"
+    # multinomial = "Multinomial"
+    # probit = "Probit"
+    # cloglog = "Cloglog"
+    # loglog = "Loglog"
+    # logit = "Logit"
+
+
+@unique
+class DistFamily(Enum):
+    GAUSSIAN = "Gaussian"
+    BINOMIAL = "Binomial"
+    NEG_BINOMIAL = "Negative Binomial"
+    # poisson = "Poisson"
+    # gamma = "Gamma"
+    # beta = "Beta"
+
+
+@unique
+class LinkFunction(Enum):
+    IDENTITY = "Identity"
+    LOGIT = "Logit"
+    PROBIT = "Probit"
+    CAUCHY = "Cauchy"
+    CLOGLOG = "Cloglog"
+    LOGLOG = "Loglog"
+    LOG = "Log"
+    INVERSE = "Inverse"
+    INVERSE_SQUARED = "Inverse Squared"
+    INVERSE_POWER = "Inverse Power"
+
+
 # Population parameters
 @unique
 class PopParam(Enum):
diff --git a/src/samplics/weighting/adjustment.py b/src/samplics/weighting/adjustment.py
index 532c8505..efc7706c 100644
--- a/src/samplics/weighting/adjustment.py
+++ b/src/samplics/weighting/adjustment.py
@@ -130,7 +130,10 @@ def _response(
         resp_status = formats.numpy_array(resp_status)
         checks.assert_response_status(resp_status, resp_dict)

-        if not np.isin(resp_status, ("in", "rr", "nr", "uk")).any() and resp_dict is not None:
+        if (
+            not np.isin(resp_status, ("in", "rr", "nr", "uk")).any()
+            and resp_dict is not None
+        ):
             if "rr" not in resp_dict:
                 raise ValueError("The response dictionary must contain the key 'rr'!")

@@ -164,15 +167,17 @@ def _adj_factor(
         uk_weights_sum = float(np.sum(samp_weight[uk_sample]))

         if unknown_to_inelig:
-            adj_uk = (in_weights_sum + rr_weights_sum + nr_weights_sum + uk_weights_sum) / (
-                in_weights_sum + rr_weights_sum + nr_weights_sum
-            )
+            adj_uk = (
+                in_weights_sum + rr_weights_sum + nr_weights_sum + uk_weights_sum
+            ) / (in_weights_sum + rr_weights_sum + nr_weights_sum)
             adj_rr = (rr_weights_sum + nr_weights_sum) / rr_weights_sum
         else:
             adj_uk = 1
             adj_rr = (rr_weights_sum + nr_weights_sum + uk_weights_sum) / rr_weights_sum

-        adj_factor = np.zeros(samp_weight.size)  # unknown and nonresponse will get 0 by default
+        adj_factor = np.zeros(
+            samp_weight.size
+        )  # unknown and nonresponse will get 0 by default
         adj_factor[rr_sample] = adj_rr * adj_uk
         adj_factor[in_sample] = adj_uk

@@ -225,7 +230,9 @@ def adjust(
             adj_class = pd.DataFrame(np.column_stack(adj_class))
         elif isinstance(adj_class, np.ndarray):
             adj_class = pd.DataFrame(adj_class)
-        elif not isinstance(adj_class, (pd.Series, pd.DataFrame, pl.Series, pl.DataFrame)):
+        elif not isinstance(
+            adj_class, (pd.Series, pd.DataFrame, pl.Series,
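The _adj_factor hunk above first spreads the weight of unknown-eligibility cases ('uk') over the known statuses, then spreads the nonrespondents' ('nr') weight over respondents ('rr'). With assumed weight sums (an editorial sketch, not part of the patch):

    w_in, w_rr, w_nr, w_uk = 100.0, 500.0, 150.0, 50.0  # inelig., resp., nonresp., unknown
    adj_uk = (w_in + w_rr + w_nr + w_uk) / (w_in + w_rr + w_nr)  # = 1.0667
    adj_rr = (w_rr + w_nr) / w_rr                                # = 1.30
    # respondents get adj_rr * adj_uk, ineligibles get adj_uk, 'nr' and 'uk' get 0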
pl.DataFrame) + ): raise AssertionError( "adj_class must be an numpy ndarray, a list of numpy ndarray or a pandas dataframe." ) @@ -316,9 +323,13 @@ def normalize( else: if control is None: self.control = np.sum(samp_weight.size) - norm_weight, self.adj_factor = self._norm_adjustment(samp_weight, self.control) + norm_weight, self.adj_factor = self._norm_adjustment( + samp_weight, self.control + ) elif isinstance(control, (int, float)): - norm_weight, self.adj_factor = self._norm_adjustment(samp_weight, control) + norm_weight, self.adj_factor = self._norm_adjustment( + samp_weight, control + ) self.control = control self.adj_method = "normalization" @@ -458,10 +469,13 @@ def rake( / sum_prev_wgt[margin][d] ) diff_ctrl_margin[d] = ( - np.abs(sum_wgt[margin][d] - control[margin][d]) / control[margin][d] + np.abs(sum_wgt[margin][d] - control[margin][d]) + / control[margin][d] ) if display_iter: - print(f" Difference against previous value for '{d}': {diff_margin[d]}") + print( + f" Difference against previous value for '{d}': {diff_margin[d]}" + ) print( f" Difference against control value for '{d}': {diff_ctrl_margin[d]}" ) @@ -603,7 +617,9 @@ def _core_vector(x_i: np.ndarray, core_factor: np.ndarray) -> np.ndarray: if x.shape == (x.size,): adj_factor = _core_vector(x, core_factor) else: - adj_factor = np.apply_along_axis(_core_vector, axis=1, arr=x, core_factor=core_factor) + adj_factor = np.apply_along_axis( + _core_vector, axis=1, arr=x, core_factor=core_factor + ) return adj_factor @@ -711,7 +727,9 @@ def calibrate( x_control=np.array(control_d_values), scale=scale_d, ) - adj_factor[domain == d] = 1 + self._calib_wgt(x_d, core_factor_d) / scale_d + adj_factor[domain == d] = ( + 1 + self._calib_wgt(x_d, core_factor_d) / scale_d + ) if additive: calib_weight = np.transpose(np.transpose(adj_factor) * samp_weight) diff --git a/src/samplics/weighting/replicates.py b/src/samplics/weighting/replicates.py index 7147c1cb..1411cb5b 100644 --- a/src/samplics/weighting/replicates.py +++ b/src/samplics/weighting/replicates.py @@ -138,7 +138,9 @@ def _boot_psus_replicates( ratio_sqrt = np.sqrt((1 - samp_rate) * sample_size / (nb_psus - 1)) - return np.asarray(1 - ratio_sqrt + ratio_sqrt * (nb_psus / sample_size) * psu_replicates) + return np.asarray( + 1 - ratio_sqrt + ratio_sqrt * (nb_psus / sample_size) * psu_replicates + ) def _boot_replicates( self, @@ -168,7 +170,9 @@ def _boot_replicates( return boot_coefs # BRR methods - def _brr_nb_reps(self, psu: np.ndarray, stratum: Optional[np.ndarray] = None) -> None: + def _brr_nb_reps( + self, psu: np.ndarray, stratum: Optional[np.ndarray] = None + ) -> None: if stratum is None: self.nb_psus = np.unique(psu).size self.nb_strata = self.nb_psus // 2 + self.nb_psus % 2 @@ -176,7 +180,9 @@ def _brr_nb_reps(self, psu: np.ndarray, stratum: Optional[np.ndarray] = None) -> self.nb_psus = np.unique(np.array(list(zip(stratum, psu))), axis=0).shape[0] self.nb_strata = np.unique(stratum).size if 2 * self.nb_strata != self.nb_psus: - raise AssertionError("Number of psus must be twice the number of strata!") + raise AssertionError( + "Number of psus must be twice the number of strata!" 
+ ) if self.nb_reps < self.nb_strata: self.nb_reps = self.nb_strata @@ -189,11 +195,15 @@ def _brr_nb_reps(self, psu: np.ndarray, stratum: Optional[np.ndarray] = None) -> if math.pow(2, nb_reps_log2) != self.nb_reps: self.nb_reps = int(math.pow(2, nb_reps_log2)) - def _brr_replicates(self, psu: np.ndarray, stratum: Optional[np.ndarray]) -> np.ndarray: + def _brr_replicates( + self, psu: np.ndarray, stratum: Optional[np.ndarray] + ) -> np.ndarray: """Creates the brr replicate structure""" if not (0 <= self.fay_coef < 1): - raise ValueError("The Fay coefficient must be greater or equal to 0 and lower than 1.") + raise ValueError( + "The Fay coefficient must be greater or equal to 0 and lower than 1." + ) self._brr_nb_reps(psu, stratum) self.rep_coefs = list( @@ -225,11 +235,15 @@ def _brr_replicates(self, psu: np.ndarray, stratum: Optional[np.ndarray]) -> np. def _jkn_psus_replicates(nb_psus: int) -> np.ndarray: """Creates the jackknife delete-1 replicate structure""" - jk_coefs = (nb_psus / (nb_psus - 1)) * (np.ones((nb_psus, nb_psus)) - np.identity(nb_psus)) + jk_coefs = (nb_psus / (nb_psus - 1)) * ( + np.ones((nb_psus, nb_psus)) - np.identity(nb_psus) + ) return np.asarray(jk_coefs) - def _jkn_replicates(self, psu: np.ndarray, stratum: Optional[np.ndarray]) -> np.ndarray: + def _jkn_replicates( + self, psu: np.ndarray, stratum: Optional[np.ndarray] + ) -> np.ndarray: self.rep_coefs = ((self.nb_reps - 1) / self.nb_reps) * np.ones(self.nb_reps) if stratum is None: @@ -308,7 +322,9 @@ def replicate( strata = np.repeat(range(1, psus.size // 2 + 1), 2) stratum_psu = pd.DataFrame({str_varname: strata, psu_varname: psus}) psu_pd = pd.DataFrame({psu_varname: psu}) - stratum_psu = pd.merge(psu_pd, stratum_psu, on=psu_varname, how="left", sort=False) + stratum_psu = pd.merge( + psu_pd, stratum_psu, on=psu_varname, how="left", sort=False + ) stratum_psu = stratum_psu[[str_varname, psu_varname]] key = [str_varname, psu_varname] else: @@ -342,7 +358,9 @@ def replicate( if not rep_coefs: rep_cols = [col for col in full_sample if col.startswith(rep_prefix)] - full_sample[rep_cols] = full_sample[rep_cols].mul(samp_weight.values, axis=0) + full_sample[rep_cols] = full_sample[rep_cols].mul( + samp_weight.values, axis=0 + ) return full_sample diff --git a/tests/apis/sae/test_area_eblup.py b/tests/apis/sae/test_area_eblup.py index 329684b8..2b19f712 100644 --- a/tests/apis/sae/test_area_eblup.py +++ b/tests/apis/sae/test_area_eblup.py @@ -63,6 +63,8 @@ # breakpoint() -@pytest.mark.skipif(sys.platform == "linux", reason="Skip dev version on Github (Linux)") +@pytest.mark.skipif( + sys.platform == "linux", reason="Skip dev version on Github (Linux)" +) def test_sae1(): assert 1 == 2 diff --git a/tests/apis/sae/test_data_types.py b/tests/apis/sae/test_data_types.py index 68aaf94a..10dd315f 100644 --- a/tests/apis/sae/test_data_types.py +++ b/tests/apis/sae/test_data_types.py @@ -130,9 +130,15 @@ def test_directest_polars(self): est_pl = est.to_polars() assert (est.domains == est_pl.select("__domain").to_numpy().flatten()).all() - assert est.est == dict(zip(est.domains, est_pl.select("est").to_numpy().flatten())) - assert est.stderr == dict(zip(est.domains, est_pl.select("stderr").to_numpy().flatten())) - assert est.ssize == dict(zip(est.domains, est_pl.select("ssize").to_numpy().flatten())) + assert est.est == dict( + zip(est.domains, est_pl.select("est").to_numpy().flatten()) + ) + assert est.stderr == dict( + zip(est.domains, est_pl.select("stderr").to_numpy().flatten()) + ) + assert est.ssize == dict( + 
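_jkn_psus_replicates above builds the delete-1 jackknife coefficient matrix: row r zeroes out PSU r and inflates the remaining PSUs by n/(n-1), so each row still sums to n. A sketch (not part of the patch):

    import numpy as np

    nb_psus = 4
    jk_coefs = (nb_psus / (nb_psus - 1)) * (np.ones((nb_psus, nb_psus)) - np.identity(nb_psus))
    assert np.allclose(jk_coefs.sum(axis=1), nb_psus)  # weight totals are preserved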
zip(est.domains, est_pl.select("ssize").to_numpy().flatten()) + ) def test_directest_pandas(self): est = DirectEst( diff --git a/tests/categorical/test_comparison.py b/tests/categorical/test_comparison.py index 31044fde..8a1d4c41 100644 --- a/tests/categorical/test_comparison.py +++ b/tests/categorical/test_comparison.py @@ -17,13 +17,17 @@ one_sample_known_mean = Ttest(samp_type="one-sample") -@pytest.mark.xfail(strict=True, reason="Parameters 'known_mean' or 'group' must be provided!") +@pytest.mark.xfail( + strict=True, reason="Parameters 'known_mean' or 'group' must be provided!" +) def test_one_sample_wrong_specifications1(): one_sample_wrong = Ttest() one_sample_wrong.compare(y) -@pytest.mark.xfail(strict=True, reason="Parameters 'known_mean' or 'group' must be provided!") +@pytest.mark.xfail( + strict=True, reason="Parameters 'known_mean' or 'group' must be provided!" +) def test_one_sample_wrong_specifications2(): one_sample_wrong = Ttest("one-sample") one_sample_wrong.compare(y) @@ -114,17 +118,29 @@ def test_one_sample_two_groups_number_obs(): def test_one_sample_two_groups_t_eq_variance(): assert np.isclose(one_sample_stats["t_eq_variance"], -3.66326, 1e-4) assert np.isclose(one_sample_stats["df_eq_variance"], 72, 1e-4) - assert np.isclose(one_sample_stats["p_value_eq_variance"]["less_than"], 0.0002362, 1e-4) - assert np.isclose(one_sample_stats["p_value_eq_variance"]["greater_than"], 0.9997638, 1e-4) - assert np.isclose(one_sample_stats["p_value_eq_variance"]["not_equal"], 0.0004725, 1e-4) + assert np.isclose( + one_sample_stats["p_value_eq_variance"]["less_than"], 0.0002362, 1e-4 + ) + assert np.isclose( + one_sample_stats["p_value_eq_variance"]["greater_than"], 0.9997638, 1e-4 + ) + assert np.isclose( + one_sample_stats["p_value_eq_variance"]["not_equal"], 0.0004725, 1e-4 + ) def test_one_sample_two_groups_t_uneq_variance(): assert np.isclose(one_sample_stats["t_uneq_variance"], -3.22454, 1e-4) assert np.isclose(one_sample_stats["df_uneq_variance"], 30.81429, 1e-4) - assert np.isclose(one_sample_stats["p_value_uneq_variance"]["less_than"], 0.0014909, 1e-4) - assert np.isclose(one_sample_stats["p_value_uneq_variance"]["greater_than"], 0.9985091, 1e-4) - assert np.isclose(one_sample_stats["p_value_uneq_variance"]["not_equal"], 0.0029818, 1e-4) + assert np.isclose( + one_sample_stats["p_value_uneq_variance"]["less_than"], 0.0014909, 1e-4 + ) + assert np.isclose( + one_sample_stats["p_value_uneq_variance"]["greater_than"], 0.9985091, 1e-4 + ) + assert np.isclose( + one_sample_stats["p_value_uneq_variance"]["not_equal"], 0.0029818, 1e-4 + ) ## Two-sample comparisons - UNPAIRED @@ -169,17 +185,29 @@ def test_two_samples_unpaired_number_obs(): def test_two_samples_unpaired_t_eq_variance(): assert np.isclose(two_samples_stats["t_eq_variance"], -3.630848, 1e-4) assert np.isclose(two_samples_stats["df_eq_variance"], 72, 1e-4) - assert np.isclose(two_samples_stats["p_value_eq_variance"]["less_than"], 0.0002627, 1e-4) - assert np.isclose(two_samples_stats["p_value_eq_variance"]["greater_than"], 0.9997372, 1e-4) - assert np.isclose(two_samples_stats["p_value_eq_variance"]["not_equal"], 0.0005254, 1e-4) + assert np.isclose( + two_samples_stats["p_value_eq_variance"]["less_than"], 0.0002627, 1e-4 + ) + assert np.isclose( + two_samples_stats["p_value_eq_variance"]["greater_than"], 0.9997372, 1e-4 + ) + assert np.isclose( + two_samples_stats["p_value_eq_variance"]["not_equal"], 0.0005254, 1e-4 + ) def test_two_samples_unpaired_t_uneq_variance(): assert 
np.isclose(two_samples_stats["t_uneq_variance"], -3.179685, 1e-4) assert np.isclose(two_samples_stats["df_uneq_variance"], 30.546278, 1e-4) - assert np.isclose(two_samples_stats["p_value_uneq_variance"]["less_than"], 0.0016850, 1e-4) - assert np.isclose(two_samples_stats["p_value_uneq_variance"]["greater_than"], 0.9983150, 1e-4) - assert np.isclose(two_samples_stats["p_value_uneq_variance"]["not_equal"], 0.0033701, 1e-4) + assert np.isclose( + two_samples_stats["p_value_uneq_variance"]["less_than"], 0.0016850, 1e-4 + ) + assert np.isclose( + two_samples_stats["p_value_uneq_variance"]["greater_than"], 0.9983150, 1e-4 + ) + assert np.isclose( + two_samples_stats["p_value_uneq_variance"]["not_equal"], 0.0033701, 1e-4 + ) ## two-sample with paired observations diff --git a/tests/categorical/test_crosstabulation.py b/tests/categorical/test_crosstabulation.py index 48294bce..4e1caa54 100644 --- a/tests/categorical/test_crosstabulation.py +++ b/tests/categorical/test_crosstabulation.py @@ -126,15 +126,21 @@ def test_empty_cells_3_prop(): np.testing.assert_almost_equal( crosstab_temp3.point_est["Man"]["American"], 0.167963, decimal=6 ) - np.testing.assert_almost_equal(crosstab_temp3.point_est["Man"]["European"], 0.32967, decimal=6) - np.testing.assert_almost_equal(crosstab_temp3.point_est["Man"]["Other"], 0.00000, decimal=6) + np.testing.assert_almost_equal( + crosstab_temp3.point_est["Man"]["European"], 0.32967, decimal=6 + ) + np.testing.assert_almost_equal( + crosstab_temp3.point_est["Man"]["Other"], 0.00000, decimal=6 + ) np.testing.assert_almost_equal( crosstab_temp3.point_est["Woman"]["American"], 0.0769231, decimal=6 ) np.testing.assert_almost_equal( crosstab_temp3.point_est["Woman"]["European"], 0.21978, decimal=6 ) - np.testing.assert_almost_equal(crosstab_temp3.point_est["Woman"]["Other"], 0.205664, decimal=6) + np.testing.assert_almost_equal( + crosstab_temp3.point_est["Woman"]["Other"], 0.205664, decimal=6 + ) def test_empty_cells_3_count(): @@ -209,14 +215,20 @@ def test_twoway_count_one_var_count(): tbl.tabulate(region, remove_nan=True) -@pytest.mark.xfail(strict=True, reason="For now, the method will fail if there are missing values") +@pytest.mark.xfail( + strict=True, reason="For now, the method will fail if there are missing values" +) def test_for_missing_values_in_the_design_matrix(): tbl_prop = CrossTabulation(PopParam.prop) - tbl_prop.tabulate([region, birth_cat], varnames=["region", "birth_cat"], remove_nan=False) + tbl_prop.tabulate( + [region, birth_cat], varnames=["region", "birth_cat"], remove_nan=False + ) tbl_count = CrossTabulation(PopParam.count) -tbl_count.tabulate([age_cat, birth_cat], varnames=["age_cat", "birth_cat"], remove_nan=True) +tbl_count.tabulate( + [age_cat, birth_cat], varnames=["age_cat", "birth_cat"], remove_nan=True +) def test_twoway_count_to_dataframe(): @@ -273,7 +285,9 @@ def test_twoway_count_upper_ci(): def test_twoway_count_stats_pearson(): assert np.isclose(tbl_count.stats["Pearson-Unadj"]["df"], 4, atol=1e-2) - assert np.isclose(tbl_count.stats["Pearson-Unadj"]["chisq_value"], 324.2777, atol=1e-4) + assert np.isclose( + tbl_count.stats["Pearson-Unadj"]["chisq_value"], 324.2777, atol=1e-4 + ) assert np.isclose(tbl_count.stats["Pearson-Unadj"]["p_value"], 0.0000, atol=1e-4) assert np.isclose(tbl_count.stats["Pearson-Adj"]["df_num"], 4, atol=1e-2) @@ -301,7 +315,9 @@ def test_twoway_count_design_info(): tbl_prop = CrossTabulation(PopParam.prop) -tbl_prop.tabulate([age_cat, birth_cat], varnames=["age_cat", "birth_cat"], remove_nan=True) 
+tbl_prop.tabulate( + [age_cat, birth_cat], varnames=["age_cat", "birth_cat"], remove_nan=True +) def test_twoway_prop_to_dataframe(): @@ -358,7 +374,9 @@ def test_twoway_prop_upper_ci(): def test_twoway_prop_stats_pearson(): assert np.isclose(tbl_prop.stats["Pearson-Unadj"]["df"], 4, atol=1e-2) - assert np.isclose(tbl_prop.stats["Pearson-Unadj"]["chisq_value"], 324.2777, atol=1e-4) + assert np.isclose( + tbl_prop.stats["Pearson-Unadj"]["chisq_value"], 324.2777, atol=1e-4 + ) assert np.isclose(tbl_prop.stats["Pearson-Unadj"]["p_value"], 0.0000, atol=1e-4) assert np.isclose(tbl_prop.stats["Pearson-Adj"]["df_num"], 4, atol=1e-2) @@ -450,7 +468,9 @@ def test_nhanes_twoway_prop_upper_ci(): @pytest.mark.skip("Not implemented yet") def test_nhanes_twoway_prop_stats_pearson(): assert np.isclose(tbl1_nhanes.stats["Pearson-Unadj"]["df"], 3, atol=1e-4) - assert np.isclose(tbl1_nhanes.stats["Pearson-Unadj"]["chisq_value"], 16.9728, atol=1e-4) + assert np.isclose( + tbl1_nhanes.stats["Pearson-Unadj"]["chisq_value"], 16.9728, atol=1e-4 + ) assert np.isclose(tbl1_nhanes.stats["Pearson-Unadj"]["p_value"], 0.0007, atol=1e-4) assert np.isclose(tbl1_nhanes.stats["Pearson-Adj"]["df_num"], 1.9230, atol=1e-4) @@ -536,7 +556,9 @@ def test_nhanes_twoway_count_upper_ci(): @pytest.mark.skip("Not implemented yet") def test_nhanes_twoway_count_stats_pearson(): assert np.isclose(tbl2_nhanes.stats["Pearson-Unadj"]["df"], 3, atol=1e-4) - assert np.isclose(tbl2_nhanes.stats["Pearson-Unadj"]["chisq_value"], 16.9728, atol=1e-4) + assert np.isclose( + tbl2_nhanes.stats["Pearson-Unadj"]["chisq_value"], 16.9728, atol=1e-4 + ) assert np.isclose(tbl2_nhanes.stats["Pearson-Unadj"]["p_value"], 0.0007, atol=1e-4) assert np.isclose(tbl2_nhanes.stats["Pearson-Adj"]["df_num"], 1.9230, atol=1e-4) diff --git a/tests/datasets/test_datasets.py b/tests/datasets/test_datasets.py index 43ecd472..154e225f 100644 --- a/tests/datasets/test_datasets.py +++ b/tests/datasets/test_datasets.py @@ -35,7 +35,10 @@ def test_loading_psu_sample(): psu_sample = load_psu_sample() assert psu_sample["name"] == "PSU Sample" - assert psu_sample["description"] == "The PSU sample obtained from the simulated PSU frame." + assert ( + psu_sample["description"] + == "The PSU sample obtained from the simulated PSU frame." + ) assert psu_sample["nrows"] == 10 assert psu_sample["ncols"] == 3 assert list(psu_sample["data"].columns) == ["cluster", "region", "psu_prob"] @@ -45,7 +48,10 @@ def test_loading_ssu_sample(): ssu_sample = load_ssu_sample() assert ssu_sample["name"] == "SSU Sample" - assert ssu_sample["description"] == "The SSU sample obtained from the simulated SSU frame." + assert ( + ssu_sample["description"] + == "The SSU sample obtained from the simulated SSU frame." 
+ ) assert ssu_sample["nrows"] == 150 assert ssu_sample["ncols"] == 3 assert list(ssu_sample["data"].columns) == ["cluster", "household", "ssu_prob"] diff --git a/tests/estimation/test_expansion.py b/tests/estimation/test_expansion.py index b433f034..b94134fd 100644 --- a/tests/estimation/test_expansion.py +++ b/tests/estimation/test_expansion.py @@ -11,7 +11,9 @@ yrbs["y"] = yrbs["qn8"].replace({2: 0}) yrbs["x"] = 0.8 * yrbs["y"] + 0.5 -yrbs["domain"] = np.random.choice(["d1", "d2", "d3"], size=yrbs.shape[0], p=[0.1, 0.3, 0.6]) +yrbs["domain"] = np.random.choice( + ["d1", "d2", "d3"], size=yrbs.shape[0], p=[0.1, 0.3, 0.6] +) yrbs["by"] = np.random.choice(["b1", "b2"], size=yrbs.shape[0], p=[0.4, 0.6]) # yrbs["fpc"] = 1.0 @@ -47,7 +49,9 @@ def test_total_estimator_without_str_to_dataframe(): svy_total_without_str.estimate(y=y, samp_weight=weight, psu=psu, remove_nan=True) est_df = svy_total_without_str.to_dataframe() - assert (est_df.columns == ["_param", "_estimate", "_stderror", "_lci", "_uci", "_cv"]).all() + assert ( + est_df.columns == ["_param", "_estimate", "_stderror", "_lci", "_uci", "_cv"] + ).all() assert est_df._param[0] == PopParam.total assert np.isclose(est_df._estimate[0], 7938.333) assert np.isclose(est_df._stderror[0], 560.0856) @@ -82,7 +86,9 @@ def test_total_estimator_without_str_nor_psu(): def test_total_estimator_without_str_domain(): - svy_total_without_str_domain.estimate(y, weight, psu=psu, domain=domain, remove_nan=True) + svy_total_without_str_domain.estimate( + y, weight, psu=psu, domain=domain, remove_nan=True + ) assert np.isclose(svy_total_without_str_domain.point_est["d1"], 759.8535) assert np.isclose(svy_total_without_str_domain.point_est["d2"], 2335.8145) @@ -164,7 +170,9 @@ def test_total_estimator_with_str_domain(): def test_total_estimator_with_str_without_psu_domain(): - svy_total_with_str_domain.estimate(y, weight, stratum=stratum, domain=domain, remove_nan=True) + svy_total_with_str_domain.estimate( + y, weight, stratum=stratum, domain=domain, remove_nan=True + ) assert np.isclose(svy_total_with_str_domain.point_est["d1"], 759.8535) assert np.isclose(svy_total_with_str_domain.point_est["d2"], 2335.8145) @@ -210,7 +218,9 @@ def test_mean_estimator_without_str_nor_psu(): def test_mean_estimator_without_str_domain(): - svy_mean_without_str_domain.estimate(y, weight, psu=psu, domain=domain, remove_nan=True) + svy_mean_without_str_domain.estimate( + y, weight, psu=psu, domain=domain, remove_nan=True + ) assert np.isclose(svy_mean_without_str_domain.point_est["d1"], 0.8311598) assert np.isclose(svy_mean_without_str_domain.point_est["d2"], 0.797226) @@ -292,7 +302,9 @@ def test_mean_estimator_with_str_domain(): def test_mean_estimator_with_str_nor_psu_domain(): - svy_mean_with_str_domain.estimate(y, weight, stratum=stratum, domain=domain, remove_nan=True) + svy_mean_with_str_domain.estimate( + y, weight, stratum=stratum, domain=domain, remove_nan=True + ) assert np.isclose(svy_mean_with_str_domain.point_est["d1"], 0.8311598) assert np.isclose(svy_mean_with_str_domain.point_est["d2"], 0.797226) @@ -338,7 +350,9 @@ def test_ratio_estimator_without_str_nor_psu(): def test_ratio_estimator_without_str_domain(): - svy_ratio_without_str_domain.estimate(y, weight, x, psu=psu, domain=domain, remove_nan=True) + svy_ratio_without_str_domain.estimate( + y, weight, x, psu=psu, domain=domain, remove_nan=True + ) assert np.isclose(svy_ratio_without_str_domain.point_est["d1"], 0.7134861) assert np.isclose(svy_ratio_without_str_domain.point_est["d2"], 0.7006851) @@ -482,7 
+496,9 @@ def test_prop_estimator_without_str_nor_psu(): def test_prop_estimator_without_str_domain(): - svy_prop_without_str_domain.estimate(y, weight, psu=psu, domain=domain, remove_nan=True) + svy_prop_without_str_domain.estimate( + y, weight, psu=psu, domain=domain, remove_nan=True + ) assert np.isclose(svy_prop_without_str_domain.point_est["d1"][0.0], 0.1688402) assert np.isclose(svy_prop_without_str_domain.point_est["d1"][1.0], 0.8311598) @@ -612,7 +628,9 @@ def test_prop_estimator_with_str_domain(): def test_prop_estimator_with_str_nor_psu_domain(): - svy_prop_with_str_domain.estimate(y, weight, stratum=stratum, domain=domain, remove_nan=True) + svy_prop_with_str_domain.estimate( + y, weight, stratum=stratum, domain=domain, remove_nan=True + ) assert np.isclose(svy_prop_with_str_domain.point_est["d1"][0.0], 0.1688402) assert np.isclose(svy_prop_with_str_domain.point_est["d1"][1.0], 0.8311598) @@ -645,7 +663,9 @@ def test_prop_estimator_with_str_nor_psu_domain(): def test_factor_mean_estimator_without_str(): - svy_factor_mean_without_str.estimate(y, weight, psu=psu, as_factor=True, remove_nan=True) + svy_factor_mean_without_str.estimate( + y, weight, psu=psu, as_factor=True, remove_nan=True + ) assert np.isclose(svy_factor_mean_without_str.point_est[0.0], 0.186_377_5) assert np.isclose(svy_factor_mean_without_str.point_est[1.0], 0.813_622_5) @@ -678,12 +698,20 @@ def test_factor_mean_estimator_without_str_domain(): y, weight, psu=psu, domain=domain, as_factor=True, remove_nan=True ) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d1"][0.0], 0.1688402) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d1"][1.0], 0.8311598) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d1"][0.0], 0.1688402 + ) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d1"][1.0], 0.8311598 + ) assert np.isclose(svy_factor_mean_without_str_domain.point_est["d2"][0.0], 0.202774) assert np.isclose(svy_factor_mean_without_str_domain.point_est["d2"][1.0], 0.797226) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d3"][0.0], 0.1809641) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d3"][1.0], 0.8190359) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d3"][0.0], 0.1809641 + ) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d3"][1.0], 0.8190359 + ) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d1"][0.0], 0.0203778) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d1"][1.0], 0.0203778) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d2"][0.0], 0.0260659) @@ -709,12 +737,20 @@ def test_factor_mean_estimator_without_str_nor_psu_domain(): y, weight, domain=domain, as_factor=True, remove_nan=True ) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d1"][0.0], 0.1688402) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d1"][1.0], 0.8311598) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d1"][0.0], 0.1688402 + ) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d1"][1.0], 0.8311598 + ) assert np.isclose(svy_factor_mean_without_str_domain.point_est["d2"][0.0], 0.202774) assert np.isclose(svy_factor_mean_without_str_domain.point_est["d2"][1.0], 0.797226) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d3"][0.0], 0.1809641) - assert np.isclose(svy_factor_mean_without_str_domain.point_est["d3"][1.0], 0.8190359) + assert np.isclose( + 
svy_factor_mean_without_str_domain.point_est["d3"][0.0], 0.1809641 + ) + assert np.isclose( + svy_factor_mean_without_str_domain.point_est["d3"][1.0], 0.8190359 + ) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d1"][0.0], 0.0200196) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d1"][1.0], 0.0200196) assert np.isclose(svy_factor_mean_without_str_domain.stderror["d2"][0.0], 0.0125303) @@ -755,7 +791,9 @@ def test_factor_mean_estimator_with_str(): def test_factor_estimator_with_str_without_psu(): - svy_factor_mean_with_str.estimate(y, weight, stratum=stratum, as_factor=True, remove_nan=True) + svy_factor_mean_with_str.estimate( + y, weight, stratum=stratum, as_factor=True, remove_nan=True + ) assert np.isclose(svy_factor_mean_with_str.point_est[0.0], 0.1863775) assert np.isclose(svy_factor_mean_with_str.point_est[1.0], 0.8136225) @@ -1035,7 +1073,9 @@ def test_factor_mean_estimator_with_str_nor_psu_domain(): def test_factor_mean_estimator_with_str_dataframe(): - svy_est.estimate(y, weight, stratum=stratum, psu=psu, domain=domain, remove_nan=True) + svy_est.estimate( + y, weight, stratum=stratum, psu=psu, domain=domain, remove_nan=True + ) svy_est_df = svy_est.to_dataframe() assert svy_est_df.columns.tolist() == [ diff --git a/tests/estimation/test_expansion_fpc.py b/tests/estimation/test_expansion_fpc.py index 0e6b4404..e7c1a83f 100644 --- a/tests/estimation/test_expansion_fpc.py +++ b/tests/estimation/test_expansion_fpc.py @@ -47,7 +47,9 @@ def test_total_estimator_without_str_nor_psu(): def test_total_estimator_without_str_domain(): - svy_total_without_str_domain.estimate(y, weight, psu=psu, fpc=fpc_psu, domain=domain) + svy_total_without_str_domain.estimate( + y, weight, psu=psu, fpc=fpc_psu, domain=domain + ) assert np.isclose(svy_total_without_str_domain.point_est[1], 528.52) assert np.isclose(svy_total_without_str_domain.point_est[2], 324.6) @@ -259,7 +261,9 @@ def test_ratio_estimator_without_str_nor_psu(): def test_ratio_estimator_without_str_domain(): - svy_ratio_without_str_domain.estimate(y, weight, x, psu=psu, fpc=fpc_psu, domain=domain) + svy_ratio_without_str_domain.estimate( + y, weight, x, psu=psu, fpc=fpc_psu, domain=domain + ) assert np.isclose(svy_ratio_without_str_domain.point_est[1], 1.8950016) assert np.isclose(svy_ratio_without_str_domain.point_est[2], 1.887209) @@ -289,7 +293,9 @@ def test_ratio_estimator_without_str_nor_psu_domain(): def test_ratio_estimator_with_str(): - svy_ratio_with_str.estimate(y, weight, x, stratum=stratum, psu=psu, fpc=fpc_dict_psu) + svy_ratio_with_str.estimate( + y, weight, x, stratum=stratum, psu=psu, fpc=fpc_dict_psu + ) assert np.isclose(svy_ratio_with_str.point_est, 1.892038) assert np.isclose(svy_ratio_with_str.stderror, 0.0072954) diff --git a/tests/estimation/test_replication_conservative.py b/tests/estimation/test_replication_conservative.py index 421ff9a3..efd81182 100644 --- a/tests/estimation/test_replication_conservative.py +++ b/tests/estimation/test_replication_conservative.py @@ -95,7 +95,9 @@ def test_jkn_total_d(): def test_jkn_prop(): jkn_prop = ReplicateEstimator(RepMethod.jackknife, PopParam.prop) - jkn_prop.estimate(z_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=True, remove_nan=True) + jkn_prop.estimate( + z_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=True, remove_nan=True + ) jkn_var = jkn_prop.variance jkn_stderr_1 = pow(jkn_var[1], 0.5) jkn_stderr_2 = pow(jkn_var[2], 0.5) @@ -197,7 +199,9 @@ def test_jkn_ratio_d(): def test_brr_mean(): brr_mean = ReplicateEstimator(RepMethod.brr, 
PopParam.mean) - brr_mean.estimate(y_brr, sample_wgt_brr, rep_wgt_brr, conservative=True, remove_nan=True) + brr_mean.estimate( + y_brr, sample_wgt_brr, rep_wgt_brr, conservative=True, remove_nan=True + ) brr_var = brr_mean.variance brr_stderr = pow(brr_var, 0.5) assert np.isclose(brr_stderr, 0.1656454, atol=1e-7) @@ -263,7 +267,9 @@ def test_brr_total_d(): def test_brr_prop(): brr_prop = ReplicateEstimator(RepMethod.brr, PopParam.prop) - brr_prop.estimate(z_brr, sample_wgt_brr, rep_wgt_brr, conservative=True, remove_nan=True) + brr_prop.estimate( + z_brr, sample_wgt_brr, rep_wgt_brr, conservative=True, remove_nan=True + ) brr_var = brr_prop.variance brr_stderr_0 = pow(brr_var[0.0], 0.5) brr_stderr_1 = pow(brr_var[1.0], 0.5) @@ -356,7 +362,9 @@ def test_brr_ratio_d(): def test_fay_mean(): fay_mean = ReplicateEstimator(RepMethod.brr, PopParam.mean, fay_coef=fay_coef) - fay_mean.estimate(y_fay, sample_wgt_fay, rep_wgt_fay, conservative=True, remove_nan=True) + fay_mean.estimate( + y_fay, sample_wgt_fay, rep_wgt_fay, conservative=True, remove_nan=True + ) fay_var = fay_mean.variance fay_stderr = pow(fay_var, 0.5) assert np.isclose(fay_stderr, 0.1655724, atol=1e-7) @@ -420,7 +428,9 @@ def test_fay_total_d(): def test_fay_prop(): fay_prop = ReplicateEstimator(RepMethod.brr, PopParam.prop, fay_coef=fay_coef) - fay_prop.estimate(z_fay, sample_wgt_fay, rep_wgt_fay, conservative=True, remove_nan=True) + fay_prop.estimate( + z_fay, sample_wgt_fay, rep_wgt_fay, conservative=True, remove_nan=True + ) fay_var = fay_prop.variance fay_stderr_0 = pow(fay_var[0.0], 0.5) fay_stderr_1 = pow(fay_var[1.0], 0.5) diff --git a/tests/estimation/test_replication_not_conservative.py b/tests/estimation/test_replication_not_conservative.py index 92842f36..364c1b0a 100644 --- a/tests/estimation/test_replication_not_conservative.py +++ b/tests/estimation/test_replication_not_conservative.py @@ -22,7 +22,9 @@ def test_jkn_mean(): jkn_mean = ReplicateEstimator(RepMethod.jackknife, PopParam.mean) - jkn_mean.estimate(y_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=False, remove_nan=True) + jkn_mean.estimate( + y_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=False, remove_nan=True + ) jkn_var = jkn_mean.variance jkn_stderr = pow(jkn_var, 0.5) assert np.isclose(jkn_stderr, 0.2320822, atol=1e-7) @@ -89,7 +91,9 @@ def test_jkn_total_d(): def test_jkn_prop(): jkn_prop = ReplicateEstimator(RepMethod.jackknife, PopParam.prop) - jkn_prop.estimate(z_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=False, remove_nan=True) + jkn_prop.estimate( + z_jkn, sample_wgt_jkn, rep_wgt_jkn, conservative=False, remove_nan=True + ) jkn_var = jkn_prop.variance jkn_stderr_1 = pow(jkn_var[1], 0.5) jkn_stderr_2 = pow(jkn_var[2], 0.5) @@ -191,7 +195,9 @@ def test_jkn_ratio_d(): def test_brr_mean(): brr_mean = ReplicateEstimator(RepMethod.brr, PopParam.mean) - brr_mean.estimate(y_brr, sample_wgt_brr, rep_wgt_brr, conservative=False, remove_nan=True) + brr_mean.estimate( + y_brr, sample_wgt_brr, rep_wgt_brr, conservative=False, remove_nan=True + ) brr_var = brr_mean.variance brr_stderr = pow(brr_var, 0.5) assert np.isclose(brr_stderr, 0.1656454, atol=1e-7) @@ -257,7 +263,9 @@ def test_brr_total_d(): def test_brr_prop(): brr_prop = ReplicateEstimator(RepMethod.brr, PopParam.prop) - brr_prop.estimate(z_brr, sample_wgt_brr, rep_wgt_brr, conservative=False, remove_nan=True) + brr_prop.estimate( + z_brr, sample_wgt_brr, rep_wgt_brr, conservative=False, remove_nan=True + ) brr_var = brr_prop.variance brr_stderr_0 = pow(brr_var[0.0], 0.5) brr_stderr_1 = 
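These tests exercise BRR with a Fay coefficient; the textbook Fay variance is V = sum_r (theta_r - theta)^2 / (R * (1 - k)^2). The sketch below states that formula only, with assumed replicate estimates; it is not samplics' code, and options such as conservative=... are not modeled here:

    import numpy as np

    fay_coef = 0.3
    theta_full = 10.0
    theta_reps = np.array([9.8, 10.3, 10.1, 9.9])  # assumed replicate estimates
    var_fay = np.sum((theta_reps - theta_full) ** 2) / (theta_reps.size * (1 - fay_coef) ** 2)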
pow(brr_var[1.0], 0.5) @@ -350,7 +358,9 @@ def test_brr_ratio_d(): def test_fay_mean(): fay_mean = ReplicateEstimator(RepMethod.brr, PopParam.mean, fay_coef=fay_coef) - fay_mean.estimate(y_fay, sample_wgt_fay, rep_wgt_fay, conservative=False, remove_nan=True) + fay_mean.estimate( + y_fay, sample_wgt_fay, rep_wgt_fay, conservative=False, remove_nan=True + ) fay_var = fay_mean.variance fay_stderr = pow(fay_var, 0.5) assert np.isclose(fay_stderr, 0.1655723, atol=1e-6) @@ -414,7 +424,9 @@ def test_fay_total_d(): def test_fay_prop(): fay_prop = ReplicateEstimator(RepMethod.brr, PopParam.prop, fay_coef=fay_coef) - fay_prop.estimate(z_fay, sample_wgt_fay, rep_wgt_fay, conservative=False, remove_nan=True) + fay_prop.estimate( + z_fay, sample_wgt_fay, rep_wgt_fay, conservative=False, remove_nan=True + ) fay_var = fay_prop.variance fay_stderr_0 = pow(fay_var[0.0], 0.5) fay_stderr_1 = pow(fay_var[1.0], 0.5) diff --git a/tests/regression/test_glm.py b/tests/regression/test_glm.py index 0ca2e137..de200289 100644 --- a/tests/regression/test_glm.py +++ b/tests/regression/test_glm.py @@ -2,11 +2,13 @@ import pandas as pd from samplics.regression import SurveyGLM +from samplics.utils.types import ModelType api_strat = pd.read_csv("./tests/regression/api_strat.csv") y = api_strat["api00"] +y_bin = api_strat["api00"] > 743 # api_strat["api00"].quantile(0.75) x = api_strat[["ell", "meals", "mobility"]] x.insert(0, "intercept", 1) stratum = api_strat["stype"] @@ -14,9 +16,84 @@ weight = api_strat["pw"] -def test_survey_glm_not_stratified_srs(): +## Logistic regression + + +def test_reg_logistic_not_stratified(): + svyglm = SurveyGLM() + svyglm.estimate(model=ModelType.LOGISTIC, y=y_bin, x=x, samp_weight=weight) + + assert np.isclose(svyglm.beta[0], 2.665756) + assert np.isclose(svyglm.beta[1], -0.0370325) + assert np.isclose(svyglm.beta[2], -0.0846879) + assert np.isclose(svyglm.beta[3], -0.0044286) + + assert np.isclose(svyglm.cov_beta[0, 0], 0.4718306**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.0336339**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.0146019**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.0147885**2) + + +def test_reg_logistic_psu_not_stratified(): + svyglm = SurveyGLM() + svyglm.estimate(model=ModelType.LOGISTIC, y=y_bin, x=x, samp_weight=weight, psu=psu) + + assert np.isclose(svyglm.beta[0], 2.665756) + assert np.isclose(svyglm.beta[1], -0.0370325) + assert np.isclose(svyglm.beta[2], -0.0846879) + assert np.isclose(svyglm.beta[3], -0.0044286) + + assert np.isclose(svyglm.cov_beta[0, 0], 0.4949581**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.0318235**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.0143854**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.0153727**2) + + +def test_reg_logistic_stratified(): + svyglm = SurveyGLM() + svyglm.estimate( + model=ModelType.LOGISTIC, y=y_bin, x=x, samp_weight=weight, stratum=stratum + ) + + assert np.isclose(svyglm.beta[0], 2.665756) + assert np.isclose(svyglm.beta[1], -0.0370325) + assert np.isclose(svyglm.beta[2], -0.0846879) + assert np.isclose(svyglm.beta[3], -0.0044286) + + assert np.isclose(svyglm.cov_beta[0, 0], 0.4394433**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.0336371**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.0143307**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.0148366**2) + + +def test_reg_logistic_psu_stratified(): + svyglm = SurveyGLM() + svyglm.estimate( + model=ModelType.LOGISTIC, + y=y_bin, + x=x, + samp_weight=weight, + psu=psu, + stratum=stratum, + ) + + assert np.isclose(svyglm.beta[0], 2.665756) + 
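The new logistic tests around this point pin down the estimation API. Distilled usage (a sketch based only on what these tests assert; it assumes the api_strat fixture shipped with the tests):

    import numpy as np
    import pandas as pd
    from samplics.regression import SurveyGLM
    from samplics.utils.types import ModelType

    api_strat = pd.read_csv("./tests/regression/api_strat.csv")
    x = api_strat[["ell", "meals", "mobility"]].copy()
    x.insert(0, "intercept", 1)

    svyglm = SurveyGLM()
    svyglm.estimate(
        model=ModelType.LOGISTIC,
        y=api_strat["api00"] > 743,
        x=x,
        samp_weight=api_strat["pw"],
        psu=api_strat["dnum"],
        stratum=api_strat["stype"],
    )
    se = np.sqrt(np.diag(svyglm.cov_beta))  # design-based standard errors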
assert np.isclose(svyglm.beta[1], -0.0370325) + assert np.isclose(svyglm.beta[2], -0.0846879) + assert np.isclose(svyglm.beta[3], -0.0044286) + + assert np.isclose(svyglm.cov_beta[0, 0], 0.4723953**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.0324191**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.0142838**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.015463**2) + + +## Linear regression + + +def test_reg_linear_not_stratified(): svyglm = SurveyGLM() - svyglm.estimate(y=y, x=x, samp_weight=weight) + svyglm.estimate(model=ModelType.LINEAR, y=y, x=x, samp_weight=weight) assert np.isclose(svyglm.beta[0], 820.8873) assert np.isclose(svyglm.beta[1], -0.4805866) @@ -29,9 +106,26 @@ def test_survey_glm_not_stratified_srs(): assert np.isclose(svyglm.cov_beta[3, 3], 0.4012498**2) -def test_survey_glm_stratified_srs(): +def test_reg_linear_psu_not_stratified(): svyglm = SurveyGLM() - svyglm.estimate(y=y, x=x, samp_weight=weight, stratum=stratum) + svyglm.estimate(model=ModelType.LINEAR, y=y, x=x, samp_weight=weight, psu=psu) + + assert np.isclose(svyglm.beta[0], 820.8873) + assert np.isclose(svyglm.beta[1], -0.4805866) + assert np.isclose(svyglm.beta[2], -3.141535) + assert np.isclose(svyglm.beta[3], 0.2257132) + + assert np.isclose(svyglm.cov_beta[0, 0], 11.65382**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.4292165**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.285711**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.4159033**2) + + +def test_reg_linear_stratified(): + svyglm = SurveyGLM() + svyglm.estimate( + model=ModelType.LINEAR, y=y, x=x, samp_weight=weight, stratum=stratum + ) assert np.isclose(svyglm.beta[0], 820.8873) assert np.isclose(svyglm.beta[1], -0.4805866) @@ -42,3 +136,20 @@ def test_survey_glm_stratified_srs(): assert np.isclose(svyglm.cov_beta[1, 1], 0.3977075**2) assert np.isclose(svyglm.cov_beta[2, 2], 0.2883001**2) assert np.isclose(svyglm.cov_beta[3, 3], 0.4026908**2) + + +def test_reg_linear_psu_stratified(): + svyglm = SurveyGLM() + svyglm.estimate( + model=ModelType.LINEAR, y=y, x=x, samp_weight=weight, psu=psu, stratum=stratum + ) + + assert np.isclose(svyglm.beta[0], 820.8873) + assert np.isclose(svyglm.beta[1], -0.4805866) + assert np.isclose(svyglm.beta[2], -3.141535) + assert np.isclose(svyglm.beta[3], 0.2257132) + + assert np.isclose(svyglm.cov_beta[0, 0], 10.10296**2) + assert np.isclose(svyglm.cov_beta[1, 1], 0.4109377**2) + assert np.isclose(svyglm.cov_beta[2, 2], 0.2743919**2) + assert np.isclose(svyglm.cov_beta[3, 3], 0.3873394**2) diff --git a/tests/sae/test_eb_unit_model.py b/tests/sae/test_eb_unit_model.py index 43e9cc16..0fb02579 100644 --- a/tests/sae/test_eb_unit_model.py +++ b/tests/sae/test_eb_unit_model.py @@ -39,7 +39,9 @@ def pov_gap(y, pov_line): eb_bhf_reml.predict(Xr, arear, pov_gap, 10, show_progress=False, pov_line=6477.484) -eb_bhf_reml.bootstrap_mse(Xr, arear, pov_gap, 10, show_progress=False, pov_line=6477.484) +eb_bhf_reml.bootstrap_mse( + Xr, arear, pov_gap, 10, show_progress=False, pov_line=6477.484 +) def test_eb_bhf_reml(): diff --git a/tests/sae/test_eblup_area_model2.py b/tests/sae/test_eblup_area_model2.py index f01a1b30..92df7b95 100644 --- a/tests/sae/test_eblup_area_model2.py +++ b/tests/sae/test_eblup_area_model2.py @@ -629,7 +629,9 @@ error_std = df.AWEEKS_SE -@pytest.mark.xfail(strict=True, reason="At least one standard error is NOT strickly positive!") +@pytest.mark.xfail( + strict=True, reason="At least one standard error is NOT strictly positive!"
+) def test_standard_errors_non_positive(): fh_model_reml = EblupAreaModel(method="REML") fh_model_reml.fit( diff --git a/tests/sae/test_eblup_unit_model.py b/tests/sae/test_eblup_unit_model.py index f507dd7e..c53942fc 100644 --- a/tests/sae/test_eblup_unit_model.py +++ b/tests/sae/test_eblup_unit_model.py @@ -205,7 +205,9 @@ def test_bhf_reml_to_dataframe_not_default(): col_names=["parameter", "small_area", "modelled_estimate", "taylor_mse"] ) assert df.shape[1] == 4 - assert (df.columns == ["parameter", "small_area", "modelled_estimate", "taylor_mse"]).all() + assert ( + df.columns == ["parameter", "small_area", "modelled_estimate", "taylor_mse"] + ).all() # Bootstrap with REML @@ -223,7 +225,9 @@ def test_bhf_reml_to_dataframe_not_default(): def test_bhf_reml_to_dataframe_boot_default(): assert df1_reml.shape[1] == 5 - assert (df1_reml.columns == ["_parameter", "_area", "_estimate", "_mse", "_mse_boot"]).all() + assert ( + df1_reml.columns == ["_parameter", "_area", "_estimate", "_mse", "_mse_boot"] + ).all() df2_reml = eblup_bhf_reml_boot.to_dataframe( @@ -471,7 +475,9 @@ def test_area_est_bhf_ml_fpc(): def test_bhf_ml_to_dataframe_boot_default(): assert df1_ml.shape[1] == 5 - assert (df1_ml.columns == ["_parameter", "_area", "_estimate", "_mse", "_mse_boot"]).all() + assert ( + df1_ml.columns == ["_parameter", "_area", "_estimate", "_mse", "_mse_boot"] + ).all() df2_ml = eblup_bhf_ml_boot.to_dataframe( diff --git a/tests/sae/test_sae_core_functions.py b/tests/sae/test_sae_core_functions.py index 270f8243..0cd33d32 100644 --- a/tests/sae/test_sae_core_functions.py +++ b/tests/sae/test_sae_core_functions.py @@ -79,8 +79,12 @@ def test_log_det_covariance(): ## REML method -est_log_likelihod = log_likelihood("REML", y, X, beta, est_inv_covariance, est_log_det_covariance) -est_partial_deriv, est_info_matrix = partial_derivatives("REML", area, y, X, 1, 0.25, scale) +est_log_likelihod = log_likelihood( + "REML", y, X, beta, est_inv_covariance, est_log_det_covariance +) +est_partial_deriv, est_info_matrix = partial_derivatives( + "REML", area, y, X, 1, 0.25, scale +) def test_log_likehood(): @@ -106,8 +110,12 @@ def test_iterative_fisher(): ## REML method -est_log_likelihod_ml = log_likelihood("ML", y, X, beta, est_inv_covariance, est_log_det_covariance) -est_partial_deriv_ml, est_info_matrix_ml = partial_derivatives("ML", area, y, X, 1, 0.25, scale) +est_log_likelihod_ml = log_likelihood( + "ML", y, X, beta, est_inv_covariance, est_log_det_covariance +) +est_partial_deriv_ml, est_info_matrix_ml = partial_derivatives( + "ML", area, y, X, 1, 0.25, scale +) def test_log_likehood2(): diff --git a/tests/sampling/test_sample_size_mean.py b/tests/sampling/test_sample_size_mean.py index ffce7f88..c75ce6a2 100644 --- a/tests/sampling/test_sample_size_mean.py +++ b/tests/sampling/test_sample_size_mean.py @@ -113,18 +113,24 @@ def test_size_nat_wald_df3(): sigma=2, ) size_df = size_mean_nat.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_wald_df3_with_resp_rate(): size_mean_nat.calculate(half_ci=1, target=1, sigma=2, resp_rate=0.3) size_df = size_mean_nat.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_wald_df4(): size_mean_nat.calculate(half_ci=1, 
target=1, sigma=2, deff=1.5) - size_df = size_mean_nat.to_dataframe(["param", "mean", "sigma", "half_ci", "samp_size"]) + size_df = size_mean_nat.to_dataframe( + ["param", "mean", "sigma", "half_ci", "samp_size"] + ) assert (size_df.columns == ["param", "mean", "sigma", "half_ci", "samp_size"]).all() @@ -164,7 +170,9 @@ def test_size_mean_str_wald_size1_with_resp_rate1(): def test_size_mean_str_wald_size1_with_resp_rate2(): - size_str_mean_wald.calculate(half_ci=half_ci, target=1, sigma=2, resp_rate=resp_rate) + size_str_mean_wald.calculate( + half_ci=half_ci, target=1, sigma=2, resp_rate=resp_rate + ) assert size_str_mean_wald.samp_size["stratum1"] == 20 assert size_str_mean_wald.samp_size["stratum2"] == 52 assert size_str_mean_wald.samp_size["stratum3"] == 4 @@ -184,7 +192,9 @@ def test_size_mean_str_wald_size2(): def test_size_mean_str_wald_size2_with_resp_rate1(): - size_str_mean_wald.calculate(half_ci=half_ci, target=2, sigma=2, deff=deff, resp_rate=0.5) + size_str_mean_wald.calculate( + half_ci=half_ci, target=2, sigma=2, deff=deff, resp_rate=0.5 + ) assert size_str_mean_wald.samp_size["stratum1"] == 31 assert size_str_mean_wald.samp_size["stratum2"] == 40 assert size_str_mean_wald.samp_size["stratum3"] == 11 @@ -216,7 +226,9 @@ def test_size_str_mean_wald_size3(): def test_size_str_mean_wald_size3_with_resp_rate1(): - size_str_mean_wald.calculate(half_ci=half_ci, target=1, sigma=sigma, deff=deff, resp_rate=0.7) + size_str_mean_wald.calculate( + half_ci=half_ci, target=1, sigma=sigma, deff=deff, resp_rate=0.7 + ) assert size_str_mean_wald.samp_size["stratum1"] == 22 assert size_str_mean_wald.samp_size["stratum2"] == 8 assert size_str_mean_wald.samp_size["stratum3"] == 46 @@ -245,7 +257,9 @@ def test_size_str_mean_wald_size4(): def test_size_str_mean_wald_size4_with_resp_rate(): - size_str_mean_wald.calculate(half_ci=half_ci, target=2, sigma=sigma, deff=2, resp_rate=0.6) + size_str_mean_wald.calculate( + half_ci=half_ci, target=2, sigma=sigma, deff=2, resp_rate=0.6 + ) assert size_str_mean_wald.samp_size["stratum1"] == 52 assert size_str_mean_wald.samp_size["stratum2"] == 13 assert size_str_mean_wald.samp_size["stratum3"] == 36 @@ -256,29 +270,39 @@ def test_size_mean_str_wald_df1(): size_df = size_str_mean_wald.to_dataframe() assert size_df.shape[0] == 3 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() def test_size_mean_str_wald_df1_with_resp_rate(): - size_str_mean_wald.calculate(half_ci=half_ci, target=1, sigma=sigma, deff=2, resp_rate=1) + size_str_mean_wald.calculate( + half_ci=half_ci, target=1, sigma=sigma, deff=2, resp_rate=1 + ) size_df = size_str_mean_wald.to_dataframe() assert size_df.shape[0] == 3 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() def test_size_mean_str_wald_df2(): size_str_mean_wald.calculate(half_ci=half_ci, target=1, sigma=sigma, deff=2) - size_df = size_str_mean_wald.to_dataframe(["param", "str", "mean", "sigma", "E", "size"]) + size_df = size_str_mean_wald.to_dataframe( + ["param", "str", "mean", "sigma", "E", "size"] + ) assert size_df.shape[0] == 3 assert (size_df.columns == ["param", "str", "mean", "sigma", "E", "size"]).all() def test_size_mean_str_wald_df2_with_resp_rate(): - size_str_mean_wald.calculate(half_ci=half_ci, target=1, sigma=sigma, 
deff=2, resp_rate=0.65) - size_df = size_str_mean_wald.to_dataframe(["param", "str", "mean", "sigma", "E", "size"]) + size_str_mean_wald.calculate( + half_ci=half_ci, target=1, sigma=sigma, deff=2, resp_rate=0.65 + ) + size_df = size_str_mean_wald.to_dataframe( + ["param", "str", "mean", "sigma", "E", "size"] + ) assert size_df.shape[0] == 3 assert (size_df.columns == ["param", "str", "mean", "sigma", "E", "size"]).all() @@ -325,7 +349,9 @@ def test_size_mean_str_wald_fpc2(): def test_size_mean_str_wald_fpc2_with_resp_rate1(): - size_str_mean_wald_fpc.calculate(half_ci=half_ci2, sigma=sigma2, pop_size=1000, resp_rate=0.45) + size_str_mean_wald_fpc.calculate( + half_ci=half_ci2, sigma=sigma2, pop_size=1000, resp_rate=0.45 + ) assert size_str_mean_wald_fpc.samp_size["stratum1"] == 129 assert size_str_mean_wald_fpc.samp_size["stratum2"] == 129 assert size_str_mean_wald_fpc.samp_size["stratum3"] == 129 diff --git a/tests/sampling/test_sample_size_one_sample.py b/tests/sampling/test_sample_size_one_sample.py index a8931436..bc1d6c2e 100644 --- a/tests/sampling/test_sample_size_one_sample.py +++ b/tests/sampling/test_sample_size_one_sample.py @@ -47,7 +47,9 @@ def test_calculate_power_one_side(): 0.001, ) assert np.isclose( - calculate_power(two_sides=False, delta=0.2, sigma=3, samp_size=1392, alpha=0.05), + calculate_power( + two_sides=False, delta=0.2, sigma=3, samp_size=1392, alpha=0.05 + ), 0.801, 0.001, ) @@ -57,7 +59,9 @@ def test_calculate_power_one_side(): def test_ss_diff_mean_one_side1(): - size_diff_mean_one_side.calculate(mean_0=50, mean_1=52, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_one_side.calculate( + mean_0=50, mean_1=52, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_one_side.samp_size == 14 # assert np.isclose(size_diff_mean_one_side.actual_power, 0.802, atol=0.001) @@ -72,7 +76,9 @@ def test_ss_diff_mean_one_side1_with_resp_rate(): def test_ss_diff_mean_one_side2(): - size_diff_mean_one_side.calculate(mean_0=50, mean_1=50.8, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_one_side.calculate( + mean_0=50, mean_1=50.8, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_one_side.samp_size == 87 assert np.isclose(size_diff_mean_one_side.actual_power, 0.800, atol=0.001) @@ -87,13 +93,17 @@ def test_ss_diff_mean_one_side2_with_resp_rate(): def test_ss_diff_mean_one_side3(): - size_diff_mean_one_side.calculate(mean_0=50, mean_1=50.6, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_one_side.calculate( + mean_0=50, mean_1=50.6, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_one_side.samp_size == 155 assert np.isclose(size_diff_mean_one_side.actual_power, 0.801, atol=0.001) def test_ss_diff_mean_one_side4(): - size_diff_mean_one_side.calculate(mean_0=50, mean_1=50.2, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_one_side.calculate( + mean_0=50, mean_1=50.2, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_one_side.samp_size == 1392 assert np.isclose(size_diff_mean_one_side.actual_power, 0.801, atol=0.001) @@ -102,25 +112,33 @@ def test_ss_diff_mean_one_side4(): def test_ss_diff_mean_two_sides1(): - size_diff_mean_two_sides.calculate(mean_0=50, mean_1=52, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_two_sides.calculate( + mean_0=50, mean_1=52, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_two_sides.samp_size == 18 assert np.isclose(size_diff_mean_two_sides.actual_power, 0.807, atol=0.001) def test_ss_diff_mean_two_sides2(): - size_diff_mean_two_sides.calculate(mean_0=50, mean_1=50.8, sigma=3, alpha=0.05, power=0.80) 
+ size_diff_mean_two_sides.calculate( + mean_0=50, mean_1=50.8, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_two_sides.samp_size == 111 assert np.isclose(size_diff_mean_two_sides.actual_power, 0.802, atol=0.001) def test_ss_diff_mean_two_sides3(): - size_diff_mean_two_sides.calculate(mean_0=50, mean_1=50.6, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_two_sides.calculate( + mean_0=50, mean_1=50.6, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_two_sides.samp_size == 197 assert np.isclose(size_diff_mean_two_sides.actual_power, 0.802, atol=0.001) def test_ss_diff_mean_two_sides4(): - size_diff_mean_two_sides.calculate(mean_0=50, mean_1=50.2, sigma=3, alpha=0.05, power=0.80) + size_diff_mean_two_sides.calculate( + mean_0=50, mean_1=50.2, sigma=3, alpha=0.05, power=0.80 + ) assert size_diff_mean_two_sides.samp_size == 1766 assert np.isclose(size_diff_mean_two_sides.actual_power, 0.800, atol=0.001) @@ -129,7 +147,9 @@ def test_ss_diff_mean_two_sides4(): def test_ss_diff_prop_one_side1(): - size_diff_prop_one_side.calculate(prop_0=0.3, prop_1=0.5, delta=-0.10, alpha=0.05, power=0.80) + size_diff_prop_one_side.calculate( + prop_0=0.3, prop_1=0.5, delta=-0.10, alpha=0.05, power=0.80 + ) assert size_diff_prop_one_side.samp_size == 18 assert np.isclose(size_diff_prop_one_side.actual_power, 0.5208, atol=0.001) @@ -151,6 +171,8 @@ def test_ss_diff_prop_two_sides1(): def test_ss_diff_prop_two_sides2(): - size_diff_prop_two_sides.calculate(prop_0=0.6, prop_1=0.6, delta=-0.20, alpha=0.05, power=0.80) + size_diff_prop_two_sides.calculate( + prop_0=0.6, prop_1=0.6, delta=-0.20, alpha=0.05, power=0.80 + ) assert size_diff_prop_two_sides.samp_size == 52 assert np.isclose(size_diff_prop_two_sides.actual_power, 0.0500, atol=0.001) diff --git a/tests/sampling/test_sample_size_proportion.py b/tests/sampling/test_sample_size_proportion.py index 71596eed..21c731a8 100644 --- a/tests/sampling/test_sample_size_proportion.py +++ b/tests/sampling/test_sample_size_proportion.py @@ -14,7 +14,9 @@ def test_equal_stratum_error(): def test_equal(): - sizes, rates = allocate(method="equal", stratum=region, pop_size=pop_size, constant=15) + sizes, rates = allocate( + method="equal", stratum=region, pop_size=pop_size, constant=15 + ) assert sizes["Dakar"] == 15 assert sizes["Kaolack"] == 15 assert sizes["Ziguinchor"] == 15 @@ -32,7 +34,9 @@ def test_equal_error(): def test_propal(): - sizes, rates = allocate(method="propal", stratum=region, samp_size=100, pop_size=pop_size) + sizes, rates = allocate( + method="propal", stratum=region, samp_size=100, pop_size=pop_size + ) assert sizes["Dakar"] == 50 assert sizes["Kaolack"] == 30 assert sizes["Ziguinchor"] == 20 @@ -47,7 +51,9 @@ def test_propal_error(): def test_fixed_rate_number(): - sizes, rates = allocate(method="fixed_rate", stratum=region, pop_size=pop_size, rate=0.05) + sizes, rates = allocate( + method="fixed_rate", stratum=region, pop_size=pop_size, rate=0.05 + ) assert sizes["Dakar"] == 25 assert sizes["Kaolack"] == 15 assert sizes["Ziguinchor"] == 10 @@ -201,7 +207,10 @@ def test_deff_equal_errors(): assert sizes["Kaolack"] == 5 assert sizes["Ziguinchor"] == 20 assert rates["Dakar"] == 5 * stddev["Dakar"] * stddev["Dakar"] / pop_size["Dakar"] - assert rates["Kaolack"] == 5 * stddev["Kaolack"] * stddev["Kaolack"] / pop_size["Kaolack"] + assert ( + rates["Kaolack"] + == 5 * stddev["Kaolack"] * stddev["Kaolack"] / pop_size["Kaolack"] + ) assert ( rates["Ziguinchor"] == 5 * stddev["Ziguinchor"] * stddev["Ziguinchor"] / pop_size["Ziguinchor"] 
@@ -225,7 +234,9 @@ def test_equal_errors_error1(): def test_optimum_comparison_error2(): with pytest.raises(ValueError): - allocate(method="optimum_comparison", stratum=region, pop_size=pop_size, stddev=[5]) + allocate( + method="optimum_comparison", stratum=region, pop_size=pop_size, stddev=[5] + ) ## Design effects @@ -299,13 +310,17 @@ def test_size_nat_wald_size_with_deff_and_resp_rate(): def test_size_nat_wald_df(): size_nat_wald.calculate(target=0.80, half_ci=0.10) size_df = size_nat_wald.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_wald_df_with_resp_rate(): size_nat_wald.calculate(target=0.80, half_ci=0.10, resp_rate=0.5) size_df = size_nat_wald.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_wald_pop_size1(): @@ -442,7 +457,9 @@ def test_size_str_wald_size4_with_resp_rate1(): def test_size_str_wald_size4_with_resp_rate2(): - size_str_wald.calculate(target=target, half_ci=half_ci, deff=deff, resp_rate=resp_rate) + size_str_wald.calculate( + target=target, half_ci=half_ci, deff=deff, resp_rate=resp_rate + ) assert size_str_wald.samp_size["stratum1"] == 3 assert size_str_wald.samp_size["stratum2"] == 173 assert size_str_wald.samp_size["stratum3"] == 299 @@ -461,7 +478,9 @@ def test_size_str_wald_size5(): def test_size_str_wald_size5_with_resp_rate(): - size_str_wald.calculate(target=0.8, half_ci=0.1, deff=1.5, nb_strata=5, resp_rate=0.5) + size_str_wald.calculate( + target=0.8, half_ci=0.1, deff=1.5, nb_strata=5, resp_rate=0.5 + ) assert size_str_wald.samp_size["_stratum_1"] == 185 assert size_str_wald.samp_size["_stratum_2"] == 185 assert size_str_wald.samp_size["_stratum_3"] == 185 @@ -479,7 +498,8 @@ def test_size_str_wald_df(): size_df = size_str_wald.to_dataframe() assert size_df.shape[0] == 5 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() @@ -488,7 +508,8 @@ def test_size_str_wald_df_with_resp_rate(): size_df = size_str_wald.to_dataframe() assert size_df.shape[0] == 5 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() @@ -641,24 +662,32 @@ def test_size_nat_fleiss_size_with_deff4(): def test_size_nat_fleiss_df1(): size_nat_fleiss.calculate(target=0.80, half_ci=0.10) size_df = size_nat_fleiss.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_fleiss_df1_with_resp_rate(): size_nat_fleiss.calculate(target=0.80, half_ci=0.10, resp_rate=0.33) size_df = size_nat_fleiss.to_dataframe() - assert (size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"]).all() + assert ( + size_df.columns == ["_param", "_target", "_sigma", "_half_ci", "_samp_size"] + ).all() def test_size_nat_fleiss_df2(): size_nat_fleiss.calculate(target=0.80, half_ci=0.10) - size_df = size_nat_fleiss.to_dataframe(["param", "prop", "sigma", "half_ci", "size"]) + size_df = 
size_nat_fleiss.to_dataframe( + ["param", "prop", "sigma", "half_ci", "size"] + ) assert (size_df.columns == ["param", "prop", "sigma", "half_ci", "size"]).all() def test_size_nat_fleiss_df2_with_resp_rate(): size_nat_fleiss.calculate(target=0.80, half_ci=0.10, resp_rate=0.44) - size_df = size_nat_fleiss.to_dataframe(["param", "prop", "sigma", "half_ci", "size"]) + size_df = size_nat_fleiss.to_dataframe( + ["param", "prop", "sigma", "half_ci", "size"] + ) assert (size_df.columns == ["param", "prop", "sigma", "half_ci", "size"]).all() @@ -762,7 +791,8 @@ def test_size_str_fleiss_df1(): size_df = size_str_fleiss.to_dataframe() assert size_df.shape[0] == 5 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() @@ -771,19 +801,28 @@ def test_size_str_fleiss_df1_with_resp_rate(): size_df = size_str_fleiss.to_dataframe() assert size_df.shape[0] == 5 assert ( - size_df.columns == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] + size_df.columns + == ["_param", "_stratum", "_target", "_sigma", "_half_ci", "_samp_size"] ).all() def test_size_str_fleiss_df2(): size_str_fleiss.calculate(target=0.80, half_ci=0.10, nb_strata=5) - size_df = size_str_fleiss.to_dataframe(["param", "str", "prop", "sigma", "half_ci", "size"]) + size_df = size_str_fleiss.to_dataframe( + ["param", "str", "prop", "sigma", "half_ci", "size"] + ) assert size_df.shape[0] == 5 - assert (size_df.columns == ["param", "str", "prop", "sigma", "half_ci", "size"]).all() + assert ( + size_df.columns == ["param", "str", "prop", "sigma", "half_ci", "size"] + ).all() def test_size_str_fleiss_df2_with_resp_rate(): size_str_fleiss.calculate(target=0.80, half_ci=0.10, nb_strata=5, resp_rate=0.6) - size_df = size_str_fleiss.to_dataframe(["param", "str", "prop", "sigma", "half_ci", "size"]) + size_df = size_str_fleiss.to_dataframe( + ["param", "str", "prop", "sigma", "half_ci", "size"] + ) assert size_df.shape[0] == 5 - assert (size_df.columns == ["param", "str", "prop", "sigma", "half_ci", "size"]).all() + assert ( + size_df.columns == ["param", "str", "prop", "sigma", "half_ci", "size"] + ).all() diff --git a/tests/sampling/test_sample_size_two_sample.py b/tests/sampling/test_sample_size_two_sample.py index 209bb264..83da397e 100644 --- a/tests/sampling/test_sample_size_two_sample.py +++ b/tests/sampling/test_sample_size_two_sample.py @@ -43,7 +43,9 @@ def test_ss_diff_prop_one_side1(): def test_ss_diff_prop_one_side2(): - size_diff_prop_one_side.calculate(prop_1=0.65, prop_2=0.85, delta=0.05, alpha=0.05, power=0.80) + size_diff_prop_one_side.calculate( + prop_1=0.65, prop_2=0.85, delta=0.05, alpha=0.05, power=0.80 + ) assert size_diff_prop_one_side.samp_size == (98, 98) # assert np.isclose(size_diff_prop_one_side.actual_power, 0.802, atol=0.001) @@ -58,6 +60,8 @@ def test_ss_diff_prop_two_side1(): def test_ss_diff_prop_two_side2(): - size_diff_prop_two_side.calculate(prop_1=0.75, prop_2=0.80, delta=0.20, alpha=0.05, power=0.80) + size_diff_prop_two_side.calculate( + prop_1=0.75, prop_2=0.80, delta=0.20, alpha=0.05, power=0.80 + ) assert size_diff_prop_two_side.samp_size == (133, 133) # assert np.isclose(size_diff_prop_two_side.actual_power, 0.802, atol=0.001) diff --git a/tests/sampling/test_selection.py b/tests/sampling/test_selection.py index cbaaea57..85ceb2db 100644 --- a/tests/sampling/test_selection.py +++ b/tests/sampling/test_selection.py @@ -34,7 +34,9 @@ def 
test_grs_select(): def test_srswr_select(): - srs_sample, srs_number_hits, srs_probs = srs_design_wr.select(countries, sample_size) + srs_sample, srs_number_hits, srs_probs = srs_design_wr.select( + countries, sample_size + ) assert np.sum(srs_sample) <= sample_size assert np.sum(srs_number_hits) == sample_size assert (np.isclose(srs_probs, sample_size / countries.size)).all() @@ -44,7 +46,9 @@ def test_srswr_select(): def test_srswor_select(): - srs_sample, srs_number_hits, srs_probs = srs_design_wor.select(countries, sample_size) + srs_sample, srs_number_hits, srs_probs = srs_design_wor.select( + countries, sample_size + ) assert np.sum(srs_sample) == sample_size assert np.sum(srs_number_hits) == sample_size assert (np.unique(srs_number_hits) == (0, 1)).all() @@ -83,7 +87,9 @@ def test_stratified_srswr_select_same_size(): # obtained_sample_sizes = dict() for s in strata: assert size == np.sum(str_srswr_number_hits[continent == s]) - assert size / np.sum(continent == s) == np.unique(str_srswr_probs[continent == s]) + assert size / np.sum(continent == s) == np.unique( + str_srswr_probs[continent == s] + ) def test_stratified_srswr_select(): @@ -111,7 +117,9 @@ def test_stratified_srswor_select_same_size(): strata = np.unique(continent) for s in strata: assert size == np.sum(str_srswor_number_hits[continent == s]) - assert size / np.sum(continent == s) == np.unique(str_srswor_probs[continent == s]) + assert size / np.sum(continent == s) == np.unique( + str_srswor_probs[continent == s] + ) assert (np.unique(str_srswor_number_hits) == (0, 1)).all() @@ -175,7 +183,9 @@ def test_ppswr_sys_select_same_size(): def test_ppswr_sys_select(): - pps_sample, pps_hits, pps_probs = pps_sys_design_wr.select(countries, sample_size, mos=mos) + pps_sample, pps_hits, pps_probs = pps_sys_design_wr.select( + countries, sample_size, mos=mos + ) assert np.sum(pps_sample) <= sample_size assert np.sum(pps_hits) == sample_size assert np.isclose(pps_probs, sample_size * mos / np.sum(mos)).all() @@ -196,7 +206,9 @@ def test_ppswr_sys_select_shuffle(): def test_ppswor_hv_select(): - pps_sample, pps_hits, pps_probs = pps_hv_design_wor.select(countries, sample_size, mos=mos) + pps_sample, pps_hits, pps_probs = pps_hv_design_wor.select( + countries, sample_size, mos=mos + ) assert np.sum(pps_sample) == sample_size assert np.sum(pps_hits) == sample_size assert (np.unique(pps_hits) == (0, 1)).all() @@ -207,7 +219,9 @@ def test_ppswor_hv_select(): def test_ppswor_brewer_select(): - pps_sample, pps_hits, pps_probs = pps_brewer_design_wor.select(countries, sample_size, mos=mos) + pps_sample, pps_hits, pps_probs = pps_brewer_design_wor.select( + countries, sample_size, mos=mos + ) assert np.sum(pps_sample) == sample_size assert np.sum(pps_hits) == sample_size assert (np.unique(pps_hits) == (0, 1)).all() @@ -218,7 +232,9 @@ def test_ppswor_brewer_select(): def test_ppswor_murphy_select(): - pps_sample, pps_hits, pps_probs = pps_murphy_design_wor.select(countries, 2, mos=mos) + pps_sample, pps_hits, pps_probs = pps_murphy_design_wor.select( + countries, 2, mos=mos + ) assert np.sum(pps_sample) == 2 assert np.sum(pps_hits) == 2 assert (np.unique(pps_hits) == (0, 1)).all() @@ -298,7 +314,9 @@ def test_stratified_ppswr_brewer_select(): ) = str_ppswr_brewer_design.select(countries, sample_sizes_pps, continent, mos=mos) strata = np.unique(continent) for s in strata: - assert sample_sizes_pps[s] == np.sum(str_ppswr_brewer_number_hits[continent == s]) + assert sample_sizes_pps[s] == np.sum( + str_ppswr_brewer_number_hits[continent == s] 
+ ) assert ( sample_sizes_pps[s] * mos[continent == s] / np.sum(mos[continent == s]) == str_ppswr_brewer_probs[continent == s] @@ -311,7 +329,9 @@ def test_stratified_ppswr_brewer_select(): countries_murphy = countries_population_murphy["country"] mos_murphy = countries_population_murphy["population_2019"].to_numpy() -mos_murphy[mos_murphy > np.mean(mos_murphy)] = mos_murphy[mos_murphy > np.mean(mos_murphy)] * 0.1 +mos_murphy[mos_murphy > np.mean(mos_murphy)] = ( + mos_murphy[mos_murphy > np.mean(mos_murphy)] * 0.1 +) sample_size_murphy = int(np.rint(countries_murphy.size / 40)) continent_murphy = countries_population_murphy["continent"] sample_sizes_murphy = dict( @@ -357,7 +377,8 @@ def test_stratified_ppswr_to_dataframet(): to_dataframe=True, ) assert ( - sample_df.columns == ["_samp_unit", "_stratum", "_mos", "_sample", "_hits", "_probs"] + sample_df.columns + == ["_samp_unit", "_stratum", "_mos", "_sample", "_hits", "_probs"] ).all() @@ -366,7 +387,9 @@ def test_stratified_ppswr_to_dataframet(): def test__calculate_samp_size_int(): some_design = SampleSelection(method=SelectMethod.pps_murphy, strat=False) - samp_size = some_design._calculate_samp_size(strat=False, pop_size=100, samp_rate=0.1) + samp_size = some_design._calculate_samp_size( + strat=False, pop_size=100, samp_rate=0.1 + ) assert samp_size == 10 @@ -383,7 +406,9 @@ def test__calculate_samp_size_dict(): def test__calculate_samp_rate_int(): some_design = SampleSelection(method=SelectMethod.pps_murphy, strat=False) - samp_rate = some_design._calculate_samp_rate(strat=False, pop_size=100, samp_size=35) + samp_rate = some_design._calculate_samp_rate( + strat=False, pop_size=100, samp_size=35 + ) assert samp_rate == 35 / 100 diff --git a/tests/simulated_population.py b/tests/simulated_population.py index 8ac01fc3..6e26e1b4 100644 --- a/tests/simulated_population.py +++ b/tests/simulated_population.py @@ -4,7 +4,9 @@ population_size = 35000000 admin1_nb = 10 -admin1_share = np.array([0.005, 0.020, 0.045, 0.075, 0.095, 0.105, 0.125, 0.130, 0.150, 0.250]) +admin1_share = np.array( + [0.005, 0.020, 0.045, 0.075, 0.095, 0.105, 0.125, 0.130, 0.150, 0.250] +) admin1_size = admin1_share * population_size if sum(admin1_share) != 1.000: diff --git a/tests/types/test_container_directest.py b/tests/types/test_container_directest.py index 89c1dd38..ab34eb7c 100644 --- a/tests/types/test_container_directest.py +++ b/tests/types/test_container_directest.py @@ -110,9 +110,15 @@ def test_directest_polars(self): est_pl = est.to_polars() assert (est.domains == est_pl.select("__domain").to_numpy().flatten()).all() - assert est.est == dict(zip(est.domains, est_pl.select("est").to_numpy().flatten())) - assert est.stderr == dict(zip(est.domains, est_pl.select("stderr").to_numpy().flatten())) - assert est.ssize == dict(zip(est.domains, est_pl.select("ssize").to_numpy().flatten())) + assert est.est == dict( + zip(est.domains, est_pl.select("est").to_numpy().flatten()) + ) + assert est.stderr == dict( + zip(est.domains, est_pl.select("stderr").to_numpy().flatten()) + ) + assert est.ssize == dict( + zip(est.domains, est_pl.select("ssize").to_numpy().flatten()) + ) def test_directest_pandas(self): est = DirectEst( diff --git a/tests/utils/test_basic_functions.py b/tests/utils/test_basic_functions.py index 84d95087..c0cc8764 100644 --- a/tests/utils/test_basic_functions.py +++ b/tests/utils/test_basic_functions.py @@ -143,7 +143,9 @@ def test_transform_exp_inverse(y): constant = -min(y) + 10 llambda = 2 y_transformed = transform(y + constant, llambda, 
constant, inverse=True) - assert np.isclose(y_transformed, np.exp(np.log(1 + (y + constant) * llambda) / llambda)).all() + assert np.isclose( + y_transformed, np.exp(np.log(1 + (y + constant) * llambda) / llambda) + ).all() # # @pytest.mark.xfail(strict=True) diff --git a/tests/utils/test_checks.py b/tests/utils/test_checks.py index 21cc91a0..a67f18aa 100644 --- a/tests/utils/test_checks.py +++ b/tests/utils/test_checks.py @@ -69,8 +69,12 @@ def test_in_range_np_pd_successes(x1, x2): def test_in_range_for_np_pd_fails(): assert not assert_in_range(low=11, high=23, x=np.array([20, 17, 111, 20, 23])) assert not assert_in_range(low=11, high=23, x=pd.Series([20, -107, 11, 20, 23])) - assert not assert_in_range(low=-101, high=0, x=np.array([-2, -17, -11, -20, 0.0023])) - assert not assert_in_range(low=-11, high=23, x=pd.Series([-10, 17, -11.0001, 20, 23])) + assert not assert_in_range( + low=-101, high=0, x=np.array([-2, -17, -11, -20, 0.0023]) + ) + assert not assert_in_range( + low=-11, high=23, x=pd.Series([-10, 17, -11.0001, 20, 23]) + ) def test_in_range_for_lists_successes(): @@ -82,8 +86,12 @@ def test_in_range_for_lists_successes(): def test_in_range_for_lists_fails(): assert not assert_in_range(low=11, high=23, x=np.array([20, 17, 111, 20, 23])) assert not assert_in_range(low=11, high=23, x=pd.Series([20, -107, 11, 20, 23])) - assert not assert_in_range(low=-101, high=0, x=np.array([-2, -17, -11, -20, 0.0023])) - assert not assert_in_range(low=-11, high=23, x=pd.Series([-10, 17, -11.0001, 20, 23])) + assert not assert_in_range( + low=-101, high=0, x=np.array([-2, -17, -11, -20, 0.0023]) + ) + assert not assert_in_range( + low=-11, high=23, x=pd.Series([-10, 17, -11.0001, 20, 23]) + ) @pytest.mark.parametrize( @@ -121,7 +129,9 @@ def test_assert_proportions_for_numbers_fails(x): assert assert_proportions(x=x) -@pytest.mark.xfail(strict=True, reason="At least one number in the lists is not between 0 and 1") +@pytest.mark.xfail( + strict=True, reason="At least one number in the lists is not between 0 and 1" +) @pytest.mark.parametrize("x", [[-0.0, 0.1, 0, 3], [-0.00001], [-1.000001, -1.1]]) def test_assert_proportions_for_list_fails(x): assert assert_proportions(x=x) @@ -153,7 +163,9 @@ def test_assert_proportions_for_pandas_Series_fails(x): @pytest.mark.xfail( strict=True, reason="At least one number in the dictionaries is not between 0 and 1" ) -@pytest.mark.parametrize("x", [{"one": 0.1, "two": 1.1}, {"one": -1, 3: 0.5}, {1: -0.1, 2: 1}]) +@pytest.mark.parametrize( + "x", [{"one": 0.1, "two": 1.1}, {"one": -1, 3: 0.5}, {1: -0.1, 2: 1}] +) def test_assert_proportions_for_dicts_fails(x): assert assert_proportions(x=x) is None @@ -217,7 +229,9 @@ def test_assert_response_status21(): assert assert_response_status("in", {"iN": 1}) is None assert assert_response_status("nr", {"in": "ineligible"}) is None assert assert_response_status("nr", {"in": 1}) is None - assert assert_response_status("nr", {"in": "ineligible", "nr": "nonresponse"}) is None + assert ( + assert_response_status("nr", {"in": "ineligible", "nr": "nonresponse"}) is None + ) @pytest.mark.xfail(strict=True, reason="Not in the standard dictionary") @@ -226,7 +240,9 @@ def test_assert_response_status22(): assert assert_response_status("in", {"in2": 1}) is None assert assert_response_status("nr", {"ineligible": "ineligible"}) is None assert assert_response_status("nr", {"Nonresp": 1}) is None - assert assert_response_status("nr", {"nr2": "ineligible", "nr": "nonresponse"}) is None + assert ( + assert_response_status("nr", {"nr2": 
"ineligible", "nr": "nonresponse"}) is None + ) def test_assert_brr_number_psus_sucesses(): diff --git a/tests/utils/test_formats.py b/tests/utils/test_formats.py index 432a51cb..7eaf2ed5 100644 --- a/tests/utils/test_formats.py +++ b/tests/utils/test_formats.py @@ -78,7 +78,9 @@ def test_sample_size_dict_str1(sample_size=5, stratification=True, stratum=strat assert samp_dict_str1[5] == 5 -def test_sample_size_dict_str2(sample_size=samp_size, stratification=True, stratum=stratum): +def test_sample_size_dict_str2( + sample_size=samp_size, stratification=True, stratum=stratum +): samp_dict_str2 = data_to_dict(sample_size, stratification, stratum) assert isinstance(samp_dict_str2, dict) assert samp_dict_str2["one"] == 11 diff --git a/tests/weighting/data_weights.py b/tests/weighting/data_weights.py index 306bc23b..15262c6f 100644 --- a/tests/weighting/data_weights.py +++ b/tests/weighting/data_weights.py @@ -10,7 +10,9 @@ number_strata = 10 number_units = 5000 -units = np.linspace(0, number_units - 1, number_units, dtype="int16") + 10 * number_units +units = ( + np.linspace(0, number_units - 1, number_units, dtype="int16") + 10 * number_units +) units = units.astype("str") sample = pd.DataFrame(units) @@ -26,7 +28,9 @@ area_type = pd.DataFrame(np.unique(sample["cluster_id"])) area_type.rename(columns={0: "cluster_id"}, inplace=True) -area_type["area_type"] = np.random.choice(("urban", "rural"), area_type.shape[0], p=(0.4, 0.6)) +area_type["area_type"] = np.random.choice( + ("urban", "rural"), area_type.shape[0], p=(0.4, 0.6) +) sample = pd.merge(sample, area_type, on="cluster_id") sample["response_status"] = np.random.choice( diff --git a/tests/weighting/test_adjustment.py b/tests/weighting/test_adjustment.py index 1d74b0bf..7e0b4d56 100644 --- a/tests/weighting/test_adjustment.py +++ b/tests/weighting/test_adjustment.py @@ -161,7 +161,9 @@ def test_nr_adjustment_without_adj_class(): def test_in_adjustment_without_adj_class(): ineligibles = response_code == np.repeat(0, response_code.size) - assert (nr_wgt_without_adj_class[ineligibles] == design_wgt.to_numpy()[ineligibles]).all() + assert ( + nr_wgt_without_adj_class[ineligibles] == design_wgt.to_numpy()[ineligibles] + ).all() def test_uk_adjustment_without_adj_class(): @@ -171,8 +173,13 @@ def test_uk_adjustment_without_adj_class(): def test_deff_weight_without_domain(): mean_wgt = np.mean(nr_wgt_without_adj_class) - deff_weight = 1 + np.mean(np.power(nr_wgt_without_adj_class - mean_wgt, 2) / mean_wgt**2) - assert sample_wgt_nr_without.calculate_deff_weight(nr_wgt_without_adj_class) == deff_weight + deff_weight = 1 + np.mean( + np.power(nr_wgt_without_adj_class - mean_wgt, 2) / mean_wgt**2 + ) + assert ( + sample_wgt_nr_without.calculate_deff_weight(nr_wgt_without_adj_class) + == deff_weight + ) assert sample_wgt_nr_without.deff_weight == deff_weight @@ -195,7 +202,9 @@ def test_nr_adjustment_with_adj_class(): def test_in_adjustment_with_adj_class(): ineligibles = response_code == np.repeat(0, response_code.size) - assert (nr_wgt_with_adj_class[ineligibles] == design_wgt.to_numpy()[ineligibles]).all() + assert ( + nr_wgt_with_adj_class[ineligibles] == design_wgt.to_numpy()[ineligibles] + ).all() def test_uk_adjustment_with_adj_class(): @@ -204,7 +213,9 @@ def test_uk_adjustment_with_adj_class(): def test_deff_weight_with_domain(): - deff_weight_region = sample_wgt_nr_with.calculate_deff_weight(nr_wgt_with_adj_class, region_id) + deff_weight_region = sample_wgt_nr_with.calculate_deff_weight( + nr_wgt_with_adj_class, region_id + ) for region 
in np.unique(region_id): nr_wgt_r = nr_wgt_with_adj_class[region_id == region] mean_wgt_r = np.mean(nr_wgt_r) @@ -259,7 +270,9 @@ def test_norm_wgt_with_class1(): assert np.isclose(np.sum(norm_wgt_wih_class_r), level1_with[region]) -norm_wgt_wih_class2 = sample_wgt_norm_with.normalize(nr_wgt_without_adj_class, domain=region_id) +norm_wgt_wih_class2 = sample_wgt_norm_with.normalize( + nr_wgt_without_adj_class, domain=region_id +) def test_norm_wgt_with_class2(): @@ -273,7 +286,9 @@ def test_norm_wgt_with_class2(): control_without = 50000 sample_wgt_ps_without = SampleWeight() -ps_wgt_wihout_class = sample_wgt_ps_without.poststratify(nr_wgt_without_adj_class, control_without) +ps_wgt_wihout_class = sample_wgt_ps_without.poststratify( + nr_wgt_without_adj_class, control_without +) def test_ps_without_adj_method(): diff --git a/tests/weighting/test_calibration.py b/tests/weighting/test_calibration.py index 215165e2..0930fa38 100644 --- a/tests/weighting/test_calibration.py +++ b/tests/weighting/test_calibration.py @@ -41,7 +41,9 @@ aux_dict["A4_&_B2"] = 165 aux_dict["A4_&_B3"] = 125 -sample_cat["_calib_wgt"] = SampleWeight().calibrate(sample_cat["wgt"], aux_array, control=aux_dict) +sample_cat["_calib_wgt"] = SampleWeight().calibrate( + sample_cat["wgt"], aux_array, control=aux_dict +) sample_cat["_calib_adjust_fct"] = sample_cat["_calib_wgt"] / sample_cat["wgt"] # print(sample_cat.drop_duplicates()) @@ -311,7 +313,9 @@ def test_calibration_num_domain(): aux_dict["B"] = 3355 -sample_mix["_calib_wgt"] = SampleWeight().calibrate(sample_mix["wgt"], aux_array, control=aux_dict) +sample_mix["_calib_wgt"] = SampleWeight().calibrate( + sample_mix["wgt"], aux_array, control=aux_dict +) sample_mix["_calib_adjust_fct"] = sample_mix["_calib_wgt"] / sample_mix["wgt"] # print(sample_mix.drop_duplicates()) diff --git a/tests/weighting/test_replicates.py b/tests/weighting/test_replicates.py index 358416db..97e869a6 100644 --- a/tests/weighting/test_replicates.py +++ b/tests/weighting/test_replicates.py @@ -4,7 +4,9 @@ from samplics.weighting import ReplicateWeight -test_data = pd.read_csv("./tests/weighting/small_data_for_testing_replicate_weights.csv") +test_data = pd.read_csv( + "./tests/weighting/small_data_for_testing_replicate_weights.csv" +) cluster_id = test_data["psu"].astype(str) stratum_id = test_data["stratum"].astype(str) @@ -13,14 +15,18 @@ nb_reps = 5 """Balanced Repeated Replicates (BRR) WITHOUT stratification""" -no_str_brr = ReplicateWeight(method=RepMethod.brr, strat=False, nb_reps=nb_reps, fay_coef=0.05) +no_str_brr = ReplicateWeight( + method=RepMethod.brr, strat=False, nb_reps=nb_reps, fay_coef=0.05 +) no_str_brr_wgt = no_str_brr.replicate(sample_wgt, cluster_id) # print(f"The BRR weights are: \n {no_str_brr_wgt.head(20)} \n") """Balanced Repeated Replicates (BRR) WITH stratification""" -str_brr = ReplicateWeight(method=RepMethod.brr, strat=True, nb_reps=nb_reps, fay_coef=0.05) +str_brr = ReplicateWeight( + method=RepMethod.brr, strat=True, nb_reps=nb_reps, fay_coef=0.05 +) str_brr_wgt = str_brr.replicate(sample_wgt, cluster_id, stratum_id) # print(f"The BRR weights are: \n {str_brr_wgt.head(20)} \n")
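
A usage sketch of the basic regression functionalities exercised by tests/regression/test_glm.py above: SurveyGLM.estimate() now takes a ModelType (LINEAR or LOGISTIC) alongside the design variables (samp_weight, psu, stratum) and exposes the fitted coefficients as beta and their design-based covariance as cov_beta. This is a minimal sketch assembled from the test code, not reference documentation; the PSU column name "dnum" is an assumption, since the tests' psu definition falls outside the visible hunks.

import numpy as np
import pandas as pd

from samplics.regression import SurveyGLM
from samplics.utils.types import ModelType

# Same fixture the tests read; column names come from tests/regression/test_glm.py.
api_strat = pd.read_csv("./tests/regression/api_strat.csv")

x = api_strat[["ell", "meals", "mobility"]]
x.insert(0, "intercept", 1)  # explicit intercept column, as in the tests

# Linear regression with stratification and clustering.
# ASSUMPTION: "dnum" as the PSU column; the tests' psu definition is not shown above.
svyglm = SurveyGLM()
svyglm.estimate(
    model=ModelType.LINEAR,
    y=api_strat["api00"],
    x=x,
    samp_weight=api_strat["pw"],
    psu=api_strat["dnum"],
    stratum=api_strat["stype"],
)
print(svyglm.beta)  # point estimates
print(np.sqrt(np.diag(svyglm.cov_beta)))  # design-based standard errors

# Logistic regression on a binarized outcome (api00 > 743, roughly the 75th
# percentile), mirroring the y_bin variable defined in the tests.
svyglm_bin = SurveyGLM()
svyglm_bin.estimate(
    model=ModelType.LOGISTIC,
    y=api_strat["api00"] > 743,
    x=x,
    samp_weight=api_strat["pw"],
)
print(svyglm_bin.beta)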