Update parametric assumption tests with multiple-comparison correction

axelwalter · axelwalter · commit 4cb3e2a1a596 · 2024-03-13T10:57:49.000+01:00
diff --git a/pages/6_Parametric_assumptions_evaluation.py b/pages/6_Parametric_assumptions_evaluation.py
@@ -33,20 +33,20 @@
         help="Select two options.",
     )
     if st.session_state.test_attribute and len(st.session_state.test_options) == 2:
-        tabs = st.tabs(["📊 Normal distribution", "📊 Equal variance"])
+        tabs = st.tabs(["📊 Normal distribution (Shapiro-Wilk test)", "📊 Equal variance (Levene test)"])
         with tabs[0]:
-            fig = test_normal_distribution(st.session_state.test_attribute, st.session_state.test_options)
+            fig = test_normal_distribution(st.session_state.test_attribute, st.session_state.test_options, corrections_map[st.session_state.p_value_correction])
             if fig:
                 show_fig(fig, "test-normal-distribution")
         with tabs[1]:
-            fig = test_equal_variance(st.session_state.test_attribute, st.session_state.test_options)
+            fig = test_equal_variance(st.session_state.test_attribute, st.session_state.test_options, corrections_map[st.session_state.p_value_correction])
             show_fig(fig, "test-equal-variance")
 
     st.info(
         """💡 **Interpretation**
 
 In both tests low p-values indicate that data points for a feature are **NOT** normal distributed or have similar variance.
-To meet **parametric** criteria the p-values in the histograms should be equally distributed between 0 and 1.
+To meet **parametric** criteria the p-values in the histograms should not be smaller than 0.05.
 When a larger number of data points indicate low p-values, it would be advisable to opt for a **non-parametric** statistical test.
 """
     )
diff --git a/src/testparametric.py b/src/testparametric.py
@@ -2,15 +2,16 @@
 import pandas as pd
 import plotly.express as px
 import scipy.stats as stats
+import pingouin as pg
 
 
 @st.cache_data
-def test_equal_variance(attribute, between):
+def test_equal_variance(attribute, between, correction):
     # test for equal variance
     data = pd.concat([st.session_state.data, st.session_state.md], axis=1)
     variance = pd.DataFrame(
         {
-            f"{between[0]} - {between[1]}": [
+            f"{between[0]} - {between[1]}": pg.multicomp([
                 stats.levene(
                     data.loc[
                         (data[attribute] == between[0]),
@@ -22,19 +23,19 @@ def test_equal_variance(attribute, between):
                     ],
                 )[1]
                 for f in st.session_state.data.columns
-            ]
+            ], method=correction)[1]
         }
     )
     fig = px.histogram(
         variance,
-        nbins=100,
+        nbins=20,
         template="plotly_white",
+        range_x=[-0.025, 1.025],
     )
-    fig.update_traces(marker_color="#696880")
     fig.update_layout(
         bargap=0.2,
         font={"color": "grey", "size": 12, "family": "Sans"},
-        title={"text": f"TEST FOR EQUAL VARIANCE", "font_color": "#3E3D53"},
+        title={"text": f"TEST FOR EQUAL VARIANCE (LEVENE)", "font_color": "#3E3D53"},
         xaxis_title="p-value",
         yaxis_title="count",
         showlegend=False
@@ -43,7 +44,7 @@ def test_equal_variance(attribute, between):
 
 
 @st.cache_data
-def test_normal_distribution(attribute, between):
+def test_normal_distribution(attribute, between, correction):
     # test for normal distribution
     data = pd.concat([st.session_state.data, st.session_state.md], axis=1)
     for b in between:
@@ -52,34 +53,33 @@ def test_normal_distribution(attribute, between):
             return None
     normality = pd.DataFrame(
         {
-            f"{b}": [
+            f"{b}": pg.multicomp([
                 stats.shapiro(
                     data.loc[
-                        (data[attribute] == between[0]),
+                        (data[attribute] == b),
                         f,
                     ]
                 )[1]
                 for f in st.session_state.data.columns
-            ]
+            ], method = correction)[1]
             for b in between
         }
     )
 
     fig = px.histogram(
-        normality.iloc[:, 1],
-        nbins=100,
+        normality,
+        nbins=20,
         template="plotly_white",
-        color_discrete_sequence=["#696880", "#ef553b"],
-        opacity=0.8,
+        range_x=[-0.025, 1.025],
+        barmode="group",
     )
-    fig.update_traces(marker_color="#696880")
 
     fig.update_layout(
         bargap=0.2,
         font={"color": "grey", "size": 12, "family": "Sans"},
-        title={"text": f"TEST FOR NORMALITY", "font_color": "#3E3D53"},
+        title={"text": f"TEST FOR NORMALITY (SHAPIRO-WILK)", "font_color": "#3E3D53"},
         xaxis_title="p-value",
         yaxis_title="count",
-        showlegend=False
+        showlegend=True
     )
     return fig