diff --git a/_build/html/v0.0.10/.buildinfo b/_build/html/v0.0.10/.buildinfo
new file mode 100644
index 000000000..a5a3eecc6
--- /dev/null
+++ b/_build/html/v0.0.10/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: b75ee6c592f4416888987109785aeaa6
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_build/html/v0.0.10/.doctrees/acknowledgements.doctree b/_build/html/v0.0.10/.doctrees/acknowledgements.doctree
new file mode 100644
index 000000000..c7813a3dd
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/acknowledgements.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/changelog.doctree b/_build/html/v0.0.10/.doctrees/changelog.doctree
new file mode 100644
index 000000000..8f84795c3
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/changelog.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/citations.doctree b/_build/html/v0.0.10/.doctrees/citations.doctree
new file mode 100644
index 000000000..9f9719d6a
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/citations.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/contributors.doctree b/_build/html/v0.0.10/.doctrees/contributors.doctree
new file mode 100644
index 000000000..a0da0ed69
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/contributors.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/data_management.doctree b/_build/html/v0.0.10/.doctrees/data_management.doctree
new file mode 100644
index 000000000..51c9e992d
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/data_management.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/eda_plots.doctree b/_build/html/v0.0.10/.doctrees/eda_plots.doctree
new file mode 100644
index 000000000..9fa23699f
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/eda_plots.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/environment.pickle b/_build/html/v0.0.10/.doctrees/environment.pickle
new file mode 100644
index 000000000..096e3c9b3
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/environment.pickle differ
diff --git a/_build/html/v0.0.10/.doctrees/getting_started.doctree b/_build/html/v0.0.10/.doctrees/getting_started.doctree
new file mode 100644
index 000000000..aac12b374
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/getting_started.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/index.doctree b/_build/html/v0.0.10/.doctrees/index.doctree
new file mode 100644
index 000000000..a50bdc3ec
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/index.doctree differ
diff --git a/_build/html/v0.0.10/.doctrees/references.doctree b/_build/html/v0.0.10/.doctrees/references.doctree
new file mode 100644
index 000000000..12285ed06
Binary files /dev/null and b/_build/html/v0.0.10/.doctrees/references.doctree differ
diff --git a/_build/html/v0.0.10/_images/2d_pdp_grid.svg b/_build/html/v0.0.10/_images/2d_pdp_grid.svg
new file mode 100644
index 000000000..641db4ba6
--- /dev/null
+++ b/_build/html/v0.0.10/_images/2d_pdp_grid.svg
@@ -0,0 +1,4405 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/3d_pdp.svg b/_build/html/v0.0.10/_images/3d_pdp.svg
new file mode 100644
index 000000000..535371233
--- /dev/null
+++ b/_build/html/v0.0.10/_images/3d_pdp.svg
@@ -0,0 +1,8326 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/Bar_Age_regular_income.svg b/_build/html/v0.0.10/_images/Bar_Age_regular_income.svg
new file mode 100644
index 000000000..6f8aa40d4
--- /dev/null
+++ b/_build/html/v0.0.10/_images/Bar_Age_regular_income.svg
@@ -0,0 +1,1201 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/Stacked_Bar_Age_income.svg b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income.svg
new file mode 100644
index 000000000..d5510308b
--- /dev/null
+++ b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income.svg
@@ -0,0 +1,1943 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_pivoted.svg b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_pivoted.svg
new file mode 100644
index 000000000..2147fce1a
--- /dev/null
+++ b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_pivoted.svg
@@ -0,0 +1,2043 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_regular.svg b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_regular.svg
new file mode 100644
index 000000000..04478581f
--- /dev/null
+++ b/_build/html/v0.0.10/_images/Stacked_Bar_Age_income_regular.svg
@@ -0,0 +1,1347 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/Stacked_Bar_Age_sex.svg b/_build/html/v0.0.10/_images/Stacked_Bar_Age_sex.svg
new file mode 100644
index 000000000..7b2bcb137
--- /dev/null
+++ b/_build/html/v0.0.10/_images/Stacked_Bar_Age_sex.svg
@@ -0,0 +1,1970 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/all_plots_comparisons_boxplot.png b/_build/html/v0.0.10/_images/all_plots_comparisons_boxplot.png
new file mode 100644
index 000000000..c4f54b520
Binary files /dev/null and b/_build/html/v0.0.10/_images/all_plots_comparisons_boxplot.png differ
diff --git a/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot.png b/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot.png
new file mode 100644
index 000000000..cc236e21c
Binary files /dev/null and b/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot.png differ
diff --git a/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot_pivoted.png b/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot_pivoted.png
new file mode 100644
index 000000000..b05150e06
Binary files /dev/null and b/_build/html/v0.0.10/_images/all_plots_comparisons_violinplot_pivoted.png differ
diff --git a/_build/html/v0.0.10/_images/count_hist_distributions.svg b/_build/html/v0.0.10/_images/count_hist_distributions.svg
new file mode 100644
index 000000000..f08328f86
--- /dev/null
+++ b/_build/html/v0.0.10/_images/count_hist_distributions.svg
@@ -0,0 +1,1719 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/density_hist_dist_age.svg b/_build/html/v0.0.10/_images/density_hist_dist_age.svg
new file mode 100644
index 000000000..717ca6bf8
--- /dev/null
+++ b/_build/html/v0.0.10/_images/density_hist_dist_age.svg
@@ -0,0 +1,1375 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/density_hist_dist_mean_median.svg b/_build/html/v0.0.10/_images/density_hist_dist_mean_median.svg
new file mode 100644
index 000000000..cd480f5ed
--- /dev/null
+++ b/_build/html/v0.0.10/_images/density_hist_dist_mean_median.svg
@@ -0,0 +1,1935 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/eda_toolkit_logo.svg b/_build/html/v0.0.10/_images/eda_toolkit_logo.svg
new file mode 100644
index 000000000..d039d6f79
--- /dev/null
+++ b/_build/html/v0.0.10/_images/eda_toolkit_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/_build/html/v0.0.10/_images/hist_density_distributions.svg b/_build/html/v0.0.10/_images/hist_density_distributions.svg
new file mode 100644
index 000000000..5ce1a7191
--- /dev/null
+++ b/_build/html/v0.0.10/_images/hist_density_distributions.svg
@@ -0,0 +1,1744 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/kde_density_distributions.svg b/_build/html/v0.0.10/_images/kde_density_distributions.svg
new file mode 100644
index 000000000..b42cfeb18
--- /dev/null
+++ b/_build/html/v0.0.10/_images/kde_density_distributions.svg
@@ -0,0 +1,2301 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/normal_distribution.png b/_build/html/v0.0.10/_images/normal_distribution.png
new file mode 100644
index 000000000..837c60e0c
Binary files /dev/null and b/_build/html/v0.0.10/_images/normal_distribution.png differ
diff --git a/_build/html/v0.0.10/_images/scatter_plots_all_grid.png b/_build/html/v0.0.10/_images/scatter_plots_all_grid.png
new file mode 100644
index 000000000..78652ac74
Binary files /dev/null and b/_build/html/v0.0.10/_images/scatter_plots_all_grid.png differ
diff --git a/_build/html/v0.0.10/_images/scatter_plots_grid.png b/_build/html/v0.0.10/_images/scatter_plots_grid.png
new file mode 100644
index 000000000..5a51facd8
Binary files /dev/null and b/_build/html/v0.0.10/_images/scatter_plots_grid.png differ
diff --git a/_build/html/v0.0.10/_images/scatter_plots_grid_grouped.png b/_build/html/v0.0.10/_images/scatter_plots_grid_grouped.png
new file mode 100644
index 000000000..02a3b3916
Binary files /dev/null and b/_build/html/v0.0.10/_images/scatter_plots_grid_grouped.png differ
diff --git a/_build/html/v0.0.10/_images/summarize_combos.gif b/_build/html/v0.0.10/_images/summarize_combos.gif
new file mode 100644
index 000000000..402ee1efc
Binary files /dev/null and b/_build/html/v0.0.10/_images/summarize_combos.gif differ
diff --git a/_build/html/v0.0.10/_images/us_census_correlation_matrix.svg b/_build/html/v0.0.10/_images/us_census_correlation_matrix.svg
new file mode 100644
index 000000000..2a41e1afa
--- /dev/null
+++ b/_build/html/v0.0.10/_images/us_census_correlation_matrix.svg
@@ -0,0 +1,1766 @@
+
+
+
diff --git a/_build/html/v0.0.10/_images/us_census_correlation_matrix_full.svg b/_build/html/v0.0.10/_images/us_census_correlation_matrix_full.svg
new file mode 100644
index 000000000..d0df5da46
--- /dev/null
+++ b/_build/html/v0.0.10/_images/us_census_correlation_matrix_full.svg
@@ -0,0 +1,1907 @@
+
+
+
diff --git a/_build/html/v0.0.10/_sources/acknowledgements.rst.txt b/_build/html/v0.0.10/_sources/acknowledgements.rst.txt
new file mode 100644
index 000000000..e62da5a10
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/acknowledgements.rst.txt
@@ -0,0 +1,30 @@
+.. _acknowledgements:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.10/_sources/changelog.rst.txt b/_build/html/v0.0.10/_sources/changelog.rst.txt
new file mode 100644
index 000000000..72a6f4449
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/changelog.rst.txt
@@ -0,0 +1,621 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+`Version 0.0.10`_
+----------------------
+
+.. _Version 0.0.10: https://lshpaner.github.io/eda_toolkit/v0.0.10/index.html
+
+**Legend Handling**
+
+- The legend is now displayed only if there are valid legend handles (``len(handles) > 0``) and if ``show_legend`` is set to ``True``.
+
+- The check ``ax.get_legend().remove()`` ensures that unnecessary legends are removed if they are empty or if ``show_legend`` is set to ``False``.
+
+**Error Handling**
+- Error handling in the ``except`` block has been enhanced to ensure that any exceptions related to legends or labels are managed properly. The legend handling logic still respects the ``show_legend`` flag even in cases where exceptions occur.
+
+This update prevents empty legend squares from appearing and maintains the intended default behavior of showing legends only when they contain relevant content.
+
+
+`Version 0.0.9`_
+----------------------
+
+.. _Version 0.0.9: https://lshpaner.github.io/eda_toolkit/v0.0.9/index.html
+
+**Bug Fixes and Minor Improvements**
+
+Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
+**Visualization Enhancements**
+
+**DataFrame Columns:** Added a ``background_color`` variable to ``dataframe_columns``,
+allowing the user to enter a string representing a color name, or hex value.
+A try/except block wraps the output in case the end user has a deprecated version of Pandas,
+where the styler would use ``hide()`` instead of ``hide_index()``. The highlighted
+columns allow for easier null versus unique value analysis.
+
+The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
+**Args:**
+
+- The ``df`` argument is specified as a ``pandas.DataFrame``.
+
+- The ``background_color`` argument is marked as optional, with a brief description of its role.
+
+- The ``return_df`` argument is also marked as optional, explaining what it controls.
+
+
+**Returns:** The return type is specified as ``pandas.DataFrame``, with a clear explanation of the difference based on the ``return_df`` flag.
+
+**KDE Distribution Plots:** Improved ``kde_distributions()`` with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
+**Scatter Plots:** Enhanced ``scatter_fit_plot()`` with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
+
+
+`Version 0.0.8`_
+----------------------
+
+.. _Version 0.0.8: https://lshpaner.github.io/eda_toolkit/v0.0.8/index.html
+
+
+:class:`stacked_crosstab_plot`
+
+- **Flexible `save_formats` Input**:
+ - `save_formats` now accepts a string, tuple, or list for specifying formats (e.g., `"png"`, `("png", "svg")`, or `["png", "svg"]`).
+ - Single strings or tuples are automatically converted to lists for consistent processing.
+
+- **Dynamic Error Handling**:
+ - Added checks to ensure a valid path is provided for each format in `save_formats`.
+ - Raises a `ValueError` if a format is specified without a corresponding path, with a clear, dynamic error message.
+
+- **Improved Plot Saving Logic**:
+ - Updated logic allows saving plots in one format (e.g., only `"png"` or `"svg"`) without requiring the other.
+ - Simplified and more intuitive path handling for saving plots.
+
+
+:class:`plot_3d_pdp`
+
+This update introduces several key changes to the `plot_3d_pdp` function, simplifying the function's interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
+**1. Parameter Changes**
+
+
+- **Removed Parameters:**
+
+ - The parameters ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general ``x_label``, ``y_label``, and ``z_label``. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+- **Default Values for Labels:**
+
+ - The parameters ``x_label``, ``y_label``, and ``z_label`` are now optional, with ``None`` as the default. If not provided, these labels will automatically default to the names of the features in the ``feature_names_list``. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+- **Changes in Default Values for View Angles:**
+
+ - The default values for camera positioning parameters have been updated: ``horizontal`` is now ``-1.25``, ``depth`` is now ``1.25``, and ``vertical`` is now ``1.25``. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+**2. Plot Generation Logic**
+
+- **Conditionally Checking Labels:**
+
+ - The function now checks whether ``x_label``, ``y_label``, and ``z_label`` are provided. If these are ``None``, the function will automatically assign default labels based on the ``feature_names_list``. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+- **Camera Position Adjustments:**
+
+ - The camera positions for the Plotly plot are now adjusted by multiplying ``horizontal``, ``depth``, and ``vertical`` by ``zoom_out_factor``. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+- **Surface Plot Coordinates Adjustments:**
+
+ - The order of the coordinates for the Plotly plot’s surface has been changed from ``ZZ, XX, YY[::-1]`` to ``ZZ, XX, YY``. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+**3. Code Simplifications**
+
+- **Removed Complexity:**
+
+ - By removing the ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+- **Fallback Mechanism for Grid Values:**
+
+ - The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+**4. Style Adjustments**
+
+- **Label Formatting:**
+
+ - The new version consistently uses ``y_label``, ``x_label``, and ``z_label`` for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+- **Color Bar Adjustments:**
+
+ - The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of ``0.6`` and a pad value of ``0.02``. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+**5. Potential Use Case Differences**
+
+- **Simplified Interface:**
+
+ - The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+- **Less Granular Control:**
+
+ - Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the ``*_plotly`` label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+**6. Matplotlib Plot Adjustments**
+
+- **Wireframe and Surface Plot Enhancements:**
+
+ - The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+**Summary**
+
+- Version ``0.0.8d`` of the `plot_3d_pdp` function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+- Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+**Decision Point**
+
+- This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
+
+
+Version 0.0.8c
+------------------------
+
+Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
+**Key Updates in 0.0.8c:**
+
+- **Bug Fixes:** Addressed minor issues identified in version ``0.0.8b`` to ensure smoother performance and better user experience.
+- **Additional Testing:** Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+- **Refinements:** Made small enhancements to existing features based on user feedback and internal testing results.
+
+**Summary of Changes**
+
+1. New Features & Enhancements
+
+- ``plot_3d_pdp`` Function:
+
+ - Added ``show_modebar`` Parameter: Introduced a new boolean parameter, ``show_modebar``, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Added parameters for ``left_margin``, ``right_margin``, and ``top_margin`` to provide users with more control over the plot layout in Plotly.
+
+ - Adjusted default values and added options for better customization of the Plotly color bar (``cbar_x``, ``cbar_thickness``) and title positioning (``title_x``, ``title_y``).
+
+ - Plotly Configuration:
+
+ - Enhanced the configuration options to allow users to enable or disable zoom functionality (``enable_zoom``) in the interactive Plotly plots.
+
+ - Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+ - Error Handling:
+
+ - Added input validation for ``html_file_path`` and ``html_file_name`` to ensure these are provided when necessary based on the selected ``plot_type``.
+
+- ``plot_2d_pdp`` Function:
+
+ - Introduced ``file_prefix`` Parameter:
+
+ - Added a new ``file_prefix`` parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+ - Enhanced Plot Type Flexibility:
+
+ - The ``plot_type`` parameter now includes an option to generate both grid and individual plots (``both``). This feature allows users to create a combination of both layout styles in one function call.
+
+ - Updated input validation and logic to handle this new option effectively.
+
+ - Added ``save_plots`` Parameter:
+
+ - Introduced a new parameter, ``save_plots``, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Included the ``save_plots`` parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+2. Documentation Updates
+
+- Docstrings:
+
+ - Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
+ - Detailed the use of new parameters such as ``show_modebar``, ``file_prefix``, ``save_plots``, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+3. Refactoring & Code Cleanup
+
+- Code Structure:
+
+ - Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
+ - Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
+
+
+Version 0.0.8b
+--------------------------------
+
+Version 0.0.8b is an exact replica of version ``0.0.8a``. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
+
+
+Version 0.0.8a
+--------------------------------
+
+Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
+**New Features:**
+
+1. Optional ``file_prefix`` in ``stacked_crosstab_plot`` Function
+
+ - The ``stacked_crosstab_plot`` function has been updated to make the ``file_prefix`` argument optional. If the user does not provide a ``file_prefix``, the function will now automatically generate a default prefix based on the ``col`` and ``func_col`` parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
+ - **Key Improvement:**
+
+ - Users can now omit the ``file_prefix`` argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
+ - Backward compatibility is maintained, allowing users who prefer to specify a custom ``file_prefix`` to continue doing so without any issues.
+
+2. **Introduction of 3D and 2D Partial Dependence Plot Functions**
+
+ - Two new functions, ``plot_3d_pdp`` and ``plot_2d_pdp``, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+ - ``plot_3d_pdp``: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
+ - ``plot_2d_pdp``: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+ - **Key Features:**
+
+ - **Compatibility:** Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
+ - **Customization:** Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
+ - **Interactive 3D Plots:** The ``plot_3d_pdp`` function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+**Impact:**
+
+- These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+- The optional ``file_prefix`` enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+- The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
+
+
+
+`Version 0.0.7`_
+----------------------
+
+.. _Version 0.0.7: https://lshpaner.github.io/eda_toolkit/v0.0.7/index.html
+
+**Added Function for Customizable Correlation Matrix Visualization**
+
+This release introduces a new function, ``flex_corr_matrix``, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to ``seaborn.heatmap()``.
+
+**Summary of Changes**
+
+- **New Function**: ``flex_corr_matrix``.
+
+ - **Functionality**:
+ - Generates a correlation heatmap for a given DataFrame.
+ - Supports both full and upper triangular correlation matrices based on the ``triangular`` parameter.
+ - Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+ - Accepts additional keyword arguments via ``**kwargs`` to pass directly to ``seaborn.heatmap()``.
+ - Includes validation to ensure the ``triangular``, ``annot``, and ``save_plots`` parameters are boolean values.
+ - Raises an exception if ``save_plots=True`` but neither ``image_path_png`` nor ``image_path_svg`` is specified.
+
+**Usage**
+
+.. code-block:: python
+
+ # Full correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+ # Upper triangular correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+**Contingency table df to object type**
+
+Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
+
+.. code-block:: python
+
+ df = df.astype(str).fillna("")
+
+
+`Version 0.0.6`_
+----------------------
+
+.. _Version 0.0.6: https://lshpaner.github.io/eda_toolkit/v0.0.6/index.html
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+`Version 0.0.5`_
+----------------------
+
+.. _Version 0.0.5: https://lshpaner.github.io/eda_toolkit/v0.0.5/index.html
+
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+ - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+ - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+ - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+ - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.10/_sources/citations.rst.txt b/_build/html/v0.0.10/_sources/citations.rst.txt
new file mode 100644
index 000000000..402d27d99
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.10). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+ version = {0.0.10},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.10/_sources/contributors.rst.txt b/_build/html/v0.0.10/_sources/contributors.rst.txt
new file mode 100644
index 000000000..4da2fa18b
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/contributors.rst.txt
@@ -0,0 +1,59 @@
+.. _contributors:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Contributors/Maintainers
+=========================
+
+.. raw:: html
+
+
+
+.. image:: https://www.leonshpaner.com/author/leon-shpaner/avatar_hu48de79c369d5f7d4ff8056a297b2c4c5_1681850_270x270_fill_q90_lanczos_center.jpg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Leonid Shpaner `_ is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego's M.S. Applied Artificial Intelligence program.
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: https://oscargildata.com/portfolio_content/images/Oscar_LinkedIn_Pic.jpeg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Oscar Gil `_ is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
diff --git a/_build/html/v0.0.10/_sources/data_management.rst.txt b/_build/html/v0.0.10/_sources/data_management.rst.txt
new file mode 100644
index 000000000..3ee514b66
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/data_management.rst.txt
@@ -0,0 +1,1384 @@
+.. _data_management:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Data Management Overview
+===========================
+
+In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
+
+Data Management Techniques
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+ # Use the function to ensure the 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs. Defaults to ``"ID"``.
+ :type id_colname: str, optional
+ :param num_digits: The number of digits for the unique IDs. Defaults to ``9``.
+ :type num_digits: int, optional
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+ :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+.. note::
+ - If the dataframe index is not unique, a warning is printed.
+ - The function does not check if the number of rows exceeds the number of
+ unique IDs that can be generated with the specified number of digits.
+ - The first digit of the generated IDs is ensured to be non-zero.
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` In the last row, the value ``6.`` (a number with a trailing period) is converted to the `float` ``6.0``.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either `day/month/year` or
+ `month/day/year` format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a `day/month/year` format. If the first part is 12 or less, the function
+ attempts to parse the date as `month/day/year`, falling back to `day/month/year`
+ if the former raises a ``ValueError`` due to an impossible date (e.g., month
+ being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df, background_color=None, return_df=False)
+
+ Analyze DataFrame columns to provide summary statistics such as data type,
+ null counts, unique values, and most frequent values.
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param background_color: Hex color code or color name for background styling in the output
+ DataFrame. Defaults to ``None``.
+ :type background_color: str, optional
+ :param return_df: If ``True``, returns the plain DataFrame with the summary statistics. If
+ ``False``, returns a styled DataFrame for visual presentation. Defaults to ``False``.
+ :type return_df: bool, optional
+
+ :returns: If ``return_df`` is ``True``, returns the plain DataFrame containing column summary
+ statistics. If ``return_df`` is ``False``, returns a styled DataFrame with optional
+ background color for specific columns.
+ :rtype: pandas.DataFrame
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of column names from the DataFrame to generate combinations.
+ :type variables: list of str
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum size of the combinations to generate. Defaults to ``2``.
+ :type min_length: int, optional
+
+ :returns: A tuple containing a dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+.. note::
+ - The function will create an Excel file with a sheet for each combination
+ of the specified variables, as well as a "Table of Contents" sheet with
+ hyperlinks to each summary table.
+ - The sheet names are limited to 31 characters due to Excel's constraints.
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+.. note::
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and the filtered DataFrame
+(ages between `18` and `40`), each on its own sheet with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list of str, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order. Defaults to ``0``.
+ :type sort_by: int, optional
+ :raises ValueError: If no columns are specified or if ``sort_by`` is not ``0`` or ``1``.
+ :returns: A DataFrame containing the contingency table with the specified columns, a ``'Total'`` column representing the count of occurrences, and a ``'Percentage'`` column representing the percentage of the total count.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.to_html()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or "bins." This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or "bins," and
+then assigning each value to one of these intervals.
+
+.. note::
+
+ The code snippets below create age bins and assign a corresponding age group
+ label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+ categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+ and labels as needed for your specific data.
+
+
+Below, we use the ``age`` column of the census data [1]_ from the UCI Machine Learning Repository as an example:
+
+1. **Bins Definition**:
+ The bins are defined by specifying the boundaries of each interval. For example,
+ in the code snippet below, the ``bin_ages`` list specifies the boundaries for age groups:
+
+ .. code-block:: python
+
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+
+ Each pair of consecutive elements in ``bin_ages`` defines a bin. For example:
+
+ - The first bin is ``[0, 18)``,
+ - The second bin is ``[18, 30)``,
+ - and so on.
+
+\
+
+2. **Labels for Bins**:
+ The `label_ages` list provides labels corresponding to each bin:
+
+ .. code-block:: python
+
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ These labels are used to categorize the numerical values into meaningful groups.
+
+3. **Applying the Binning**:
+ The `pd.cut <https://pandas.pydata.org/docs/reference/api/pandas.cut.html>`_ function
+ from Pandas is used to apply the binning process. For each value in the ``age``
+ column of the DataFrame, it assigns a corresponding label based on which bin the
+ value falls into. Here, ``right=False`` indicates that each bin includes the
+ left endpoint but excludes the right endpoint. For example, if ``bin_ages =
+ [0, 10, 20, 30]``, then a value of ``10`` will fall into the bin ``[10, 20)`` and
+ be labeled accordingly.
+
+ .. code-block:: python
+
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+ **Mathematically**, for a given value `x` in the ``age`` column:
+
+ .. math::
+
+ \text{age\_group} =
+ \begin{cases}
+ < 18 & \text{if } 0 \leq x < 18 \\
+ 18-29 & \text{if } 18 \leq x < 30 \\
+ \vdots \\
+ 100 + & \text{if } x \geq 100
+ \end{cases}
+
+ The parameter ``right=False`` in ``pd.cut`` means that the bins are left-inclusive
+ and right-exclusive, except for the last bin, which is always right-inclusive
+ when the upper bound is infinity (``float("inf")``).
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.10/_sources/eda_plots.rst.txt b/_build/html/v0.0.10/_sources/eda_plots.rst.txt
new file mode 100644
index 000000000..59ae2c981
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/eda_plots.rst.txt
@@ -0,0 +1,2666 @@
+.. _eda_plots:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Plotting and Theoretical Overview
+=======================================
+
+Gaussian Assumption for Normality
+----------------------------------
+
+The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
+
+.. math::
+
+ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
+
+where:
+
+- :math:`\mu` is the mean
+- :math:`\sigma^2` is the variance
+
+In a normally distributed dataset:
+
+- 68% of data falls within :math:`\mu \pm \sigma`
+- 95% within :math:`\mu \pm 2\sigma`
+- 99.7% within :math:`\mu \pm 3\sigma`
+
+.. raw:: html
+
+
+
+.. image:: ../assets/normal_distribution.png
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histograms and Kernel Density Estimation (KDE)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Histograms**:
+
+- Visualize data distribution by binning values and counting frequencies.
+- If data is Gaussian, the histogram approximates a bell curve.
+
+**KDE**:
+
+- A non-parametric way to estimate the PDF by smoothing individual data points with a kernel function.
+- The KDE for a dataset :math:`X = \{x_1, x_2, \ldots, x_n\}` is given by:
+
+.. math::
+
+ \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)
+
+where:
+
+- :math:`K` is the kernel function (often Gaussian)
+- :math:`h` is the bandwidth (smoothing parameter)
+
+.. raw:: html
+
+ Combined Use of Histograms and KDE
+
+\
+
+- **Histograms** offer a discrete, binned view of the data.
+- **KDE** provides a smooth, continuous estimate of the underlying distribution.
+- Together, they effectively illustrate how well the data aligns with the Gaussian assumption, highlighting any deviations from normality.
+
+
+Pearson Correlation Coefficient
+--------------------------------
+
+The Pearson correlation coefficient, often denoted as :math:`r`, is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from :math:`-1` to :math:`1`, where:
+
+- :math:`r = 1` indicates a perfect positive linear relationship.
+- :math:`r = -1` indicates a perfect negative linear relationship.
+- :math:`r = 0` indicates no linear relationship.
+
+The Pearson correlation coefficient between two variables :math:`X` and :math:`Y` is defined as:
+
+.. math::
+
+ r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
+
+where:
+
+- :math:`\text{Cov}(X, Y)` is the covariance of :math:`X` and :math:`Y`.
+- :math:`\sigma_X` is the standard deviation of :math:`X`.
+- :math:`\sigma_Y` is the standard deviation of :math:`Y`.
+
+Covariance measures how much two variables change together. It is defined as:
+
+.. math::
+
+ \text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)
+
+where:
+
+- :math:`n` is the number of data points.
+- :math:`X_i` and :math:`Y_i` are the individual data points.
+- :math:`\mu_X` and :math:`\mu_Y` are the means of :math:`X` and :math:`Y`.
+
+The standard deviation measures the dispersion or spread of a set of values. For
+a variable :math:`X`, the standard deviation :math:`\sigma_X` is:
+
+.. math::
+
+ \sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)^2}
+
+Substituting the covariance and standard deviation into the Pearson correlation formula:
+
+.. math::
+
+ r_{XY} = \frac{\sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n} (X_i - \mu_X)^2} \sqrt{\sum_{i=1}^{n} (Y_i - \mu_Y)^2}}
+
+This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between :math:`X` and :math:`Y`.
+
+- :math:`r > 0`: Positive correlation. As :math:`X` increases, :math:`Y` tends to increase.
+- :math:`r < 0`: Negative correlation. As :math:`X` increases, :math:`Y` tends to decrease.
+- :math:`r = 0`: No linear correlation. There is no consistent linear relationship between :math:`X` and :math:`Y`.
+
+The closer the value of :math:`r` is to :math:`\pm 1`, the stronger the linear relationship between the two variables.
+
+
+Partial Dependence Foundations
+--------------------------------
+
+Let :math:`\mathbf{X}` represent the complete set of input features for a machine
+learning model, where :math:`\mathbf{X} = \{X_1, X_2, \dots, X_p\}`. Suppose we're
+particularly interested in a subset of these features, denoted by :math:`\mathbf{X}_S`.
+The complementary set, :math:`\mathbf{X}_C`, contains all the features in :math:`\mathbf{X}`
+that are not in :math:`\mathbf{X}_S`. Mathematically, this relationship is expressed as:
+
+.. math::
+
+ \mathbf{X}_C = \mathbf{X} \setminus \mathbf{X}_S
+
+where :math:`\mathbf{X}_C` is the set of features in :math:`\mathbf{X}` after
+removing the features in :math:`\mathbf{X}_S`.
+
+Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in :math:`\mathbf{X}_S` on the model's predictions, while averaging out the
+influence of the features in :math:`\mathbf{X}_C`. This is mathematically defined as:
+
+.. math::
+ \begin{align*}
+ \text{PD}_{\mathbf{X}_S}(x_S) &= \mathbb{E}_{\mathbf{X}_C} \left[ f(x_S, \mathbf{X}_C) \right] \\
+ &= \int f(x_S, x_C) \, p(x_C) \, dx_C
+ \end{align*}
+
+
+where:
+
+- :math:`\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]` indicates that we are taking the expected value over the possible values of the features in the set :math:`\mathbf{X}_C`.
+- :math:`p(x_C)` represents the probability density function of the features in :math:`\mathbf{X}_C`.
+
+This operation effectively summarizes the model's output over all potential values of the complementary features, providing a clear view of how the features in :math:`\mathbf{X}_S` alone impact the model's predictions.
+
+
+**2D Partial Dependence Plots**
+
+Consider a trained machine learning model :math:`f(\mathbf{X})`, where :math:`\mathbf{X} = (X_1, X_2, \dots, X_p)` represents the vector of input features. The partial dependence of the predicted response :math:`\hat{y}` on a single feature :math:`X_j` is defined as:
+
+.. math::
+
+ \text{PD}(X_j) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, \mathbf{X}_{C_i})
+
+where:
+
+- :math:`X_j` is the feature of interest.
+- :math:`\mathbf{X}_{C_i}` represents the complement set of :math:`X_j`, meaning the remaining features in :math:`\mathbf{X}` not included in :math:`X_j` for the :math:`i`-th instance.
+- :math:`n` is the number of observations in the dataset.
+
+For two features, :math:`X_j` and :math:`X_k`, the partial dependence is given by:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of :math:`X_j` and :math:`X_k` vary, while the effects of the other features are averaged out.
+
+- **Single Feature PDP:** When plotting :math:`\text{PD}(X_j)`, the result is a 2D line plot showing the marginal effect of feature :math:`X_j` on the predicted outcome, averaged over all possible values of the other features.
+- **Two Features PDP:** When plotting :math:`\text{PD}(X_j, X_k)`, the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of :math:`X_j` and :math:`X_k` on the predicted outcome. The surface represents the expected value of the prediction as :math:`X_j` and :math:`X_k` vary, while all other features are averaged out.
+
+
+**3D Partial Dependence Plots**
+
+For a more comprehensive analysis, especially when exploring interactions between two features, `3D Partial Dependence Plots`_ are invaluable. The partial dependence function for two features in a 3D context is:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+Here, the function :math:`f(X_j, X_k, \mathbf{X}_{C_i})` is evaluated across a grid of values for :math:`X_j` and :math:`X_k`. The resulting 3D surface plot represents how the model's prediction changes over the joint range of these two features.
+
+The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model's predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+- **Feature Interaction Visualization:** The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+- **Enhanced Interpretation:** 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+.. raw:: html
+
+
+
+KDE Distribution Function
+-----------------------------
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, figsize=(5, 5), grid_figsize=None, hist_color="#0000FF", kde_color="#FF0000", mean_color="#000000", median_color="#000000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=None, n_cols=None, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None, plot_mean=False, plot_median=False, std_dev_levels=None, std_color="#808080", label_names=None, show_legend=True, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots. If 'all', plots will be generated for all numeric columns.
+ :type vars_of_interest: list of str, optional
+ :param figsize: Size of each individual plot, default is ``(5, 5)``. Used when only one plot is being generated or when saving individual plots.
+ :type figsize: tuple of int, optional
+ :param grid_figsize: Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on ``figsize``, ``n_rows``, and ``n_cols``.
+ :type grid_figsize: tuple of int, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param mean_color: Color of the mean line if ``plot_mean`` is True, default is ``'#000000'``.
+ :type mean_color: str, optional
+ :param median_color: Color of the median line if ``plot_median`` is True, default is ``'#000000'``.
+ :type median_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses ``figsize`` for determining the plot size, ignoring ``grid_figsize``.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+ :type log_scale_vars: str or list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: float, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``, ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param plot_mean: Whether to plot the mean as a vertical line, default is ``False``.
+ :type plot_mean: bool, optional
+ :param plot_median: Whether to plot the median as a vertical line, default is ``False``.
+ :type plot_median: bool, optional
+ :param std_dev_levels: Levels of standard deviation to plot around the mean.
+ :type std_dev_levels: list of int, optional
+ :param std_color: Color(s) for the standard deviation lines, default is ``'#808080'``.
+ :type std_color: str or list of str, optional
+ :param label_names: Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+ :type label_names: dict, optional
+ :param show_legend: Whether to show the legend on the plots, default is ``True``.
+ :type show_legend: bool, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+ - If ``grid_figsize`` is provided when only one plot is being created.
+
+ :raises UserWarning:
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``plot_type="both"`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+.. note::
+ If you do not set ``n_rows`` or ``n_cols`` to any values, the function will
+ automatically calculate and create a grid based on the number of variables being
+ plotted, ensuring an optimal arrangement of the plots.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ```y-axis```
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both", # Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``plot_type="hist"``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to ``"Count"`` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Histogram Example - (Mean and Median)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+histograms that include mean and median lines. The ``mean_color`` is set to ``"blue"``
+and the ``median_color`` is set to ``"black"``, allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted ``(plot_mean=True, plot_median=True)``.
+The ``y_axis_label`` remains ``"Density"``, indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using ``hist_color="brown"``, with a ``fill_alpha=0.60``, while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns ``(n_rows=1, n_cols=3)``, and
+the grid size is set to `14x4 inches` (``grid_figsize=(14, 4)``). This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.60,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_mean_median.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Histogram Example - (Mean, Median, and Std. Deviation)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+a histogram that includes mean, median, and 3 standard deviation lines. The
+``mean_color`` is set to ``"blue"`` and the median_color is set to ``"black"``,
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted ``(plot_mean=True, plot_median=True)``. The ``y_axis_label`` remains
+``"Density"``, indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using ``hist_color="brown"``,
+with a ``fill_alpha=0.40``, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors ``"purple"``,
+``"green"``, and ``"silver"`` for one, two, and three standard deviations, respectively.
+
+The figure size is set to `10x6 inches` (``figsize=(10, 6)``), producing a single
+plot for the ``age`` variable. This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+.. note::
+
+ You have the freedom to choose whether to plot the mean, median, and
+ standard deviation lines. You can display one, none, or all of these simultaneously.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ ]
+
+ kde_distributions(
+ df=df,
+ figsize=(10, 6),
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.40,
+ plot_type="both",
+ stat="Density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ image_path_svg=image_path_svg,
+ image_path_png=image_path_png,
+ std_dev_levels=[
+ 1,
+ 2,
+ 3,
+ ],
+ std_color=[
+ "purple",
+ "green",
+ "silver",
+ ],
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_age.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. note::
+
+ The ``legend_labels_list`` parameter should be a list of lists, where each
+ inner list corresponds to the ground truth labels for the respective item in
+ the ``func_col`` list. Each element in the ``func_col`` list represents a
+ column in your DataFrame that you wish to analyze, and the corresponding
+ inner list in ``legend_labels_list`` should contain the labels that will be
+ used in the legend of your plots.
+
+For example:
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"], # Corresponds to "sex"
+ ["<=50K", ">50K"], # Corresponds to "income"
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. important::
+
+ Ensure that the number of elements in ``func_col``, ``legend_labels_list``,
+ and ``title`` are the same. Each item in ``func_col`` must have a corresponding
+ list of labels in ``legend_labels_list`` and a title in ``title``. This
+ consistency is essential for the function to correctly generate the plots
+ with the appropriate labels and titles.
+
+
+In this example:
+
+- ``func_col`` contains two elements: ``"sex"`` and ``"income"``. Each corresponds to a specific column in your DataFrame.
+- ``legend_labels_list`` is a nested list containing two inner lists:
+
+ - The first inner list, ``["Male", "Female"]``, corresponds to the ``"sex"`` column in ``func_col``.
+ - The second inner list, ``["<=50K", ">50K"]``, corresponds to the ``"income"`` column in ``func_col``.
+
+- ``title`` contains two elements: ``"Sex"`` and ``"Income"``, which will be used as the titles for the respective plots.
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are rendered with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+ :alt: KDE Distributions
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+ crosstab_age_sex = stacked_crosstabs["sex"]
+ crosstab_age_income = stacked_crosstabs["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_comp, n_rows=None, n_cols=None, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None, label_names=None, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_comp: List of comparison categories (columns in df).
+ :type metrics_comp: list of str
+ :param n_rows: Number of rows in the subplot grid. Calculated automatically if not provided.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. Calculated automatically if not provided.
+ :type n_cols: int, optional
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots. Default is ``True``.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param label_names: Dictionary mapping original column names to custom labels. Default is ``None``.
+ :type label_names: dict, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is provided and is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the ``box_violin_plot`` function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_comp=metrics_comp,
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+Scatter Fit Plot
+------------------
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars=None, y_vars=None, n_rows=None, n_cols=None, max_cols=4, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None, all_vars=None, label_names=None, **kwargs)
+
+ Create and save scatter plots or a grid of scatter plots for given ``x_vars``
+ and ``y_vars``, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the ``x-axis``.
+ :type x_vars: list of str, optional
+
+ :param y_vars: List of variable names to plot on the ``y-axis``.
+ :type y_vars: list of str, optional
+
+ :param n_rows: Number of rows in the subplot grid. Calculated based on the number of plots and ``n_cols`` if not specified.
+ :type n_rows: int, optional
+
+ :param n_cols: Number of columns in the subplot grid. Calculated based on the number of plots and ``max_cols`` if not specified.
+ :type n_cols: int, optional
+
+ :param max_cols: Maximum number of columns in the subplot grid. Default is ``4``.
+ :type max_cols: int, optional
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``. If None, plots will not be saved.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is 12.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is 10.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the ``hue`` parameter to be set.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+
+ :param all_vars: If provided, automatically generates scatter plots for all combinations of variables in this list, overriding ``x_vars`` and ``y_vars``.
+ :type all_vars: list of str, optional
+
+ :param label_names: A dictionary to rename columns for display in the plot titles and labels.
+ :type label_names: dict, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``sns.scatterplot``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``all_vars`` is provided and either ``x_vars`` or ``y_vars`` is also provided.
+ - If neither ``all_vars`` nor both ``x_vars`` and ``y_vars`` are provided.
+ - If ``hue_palette`` is specified without ``hue``.
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``. This function does not return any value but generates and optionally saves scatter plots for the specified ``x_vars`` and ``y_vars``, or for all combinations of variables in ``all_vars`` if it is provided.
+
+
+
+Regression-Centric Scatter Plots Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots can be saved as
+images by supplying the image path and ``save_plots`` parameters.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots (All Combinations Example)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the ``df`` DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+1. **All Variables Combination**: The ``all_vars`` parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don't need to manually specify ``x_vars`` and ``y_vars``, as the function will iterate through each possible pair.
+
+2. **Grid Display**: The ``show_plot`` parameter is set to ``"grid"``, so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
+3. **Font Sizes**: The ``label_fontsize`` and ``tick_fontsize`` parameters are set to ``14`` and ``12``, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
+4. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``True``, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
+5. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
+6. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``True``, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are displayed in a grid format, with added best fit lines and correlation coefficients for deeper analysis. The resulting images can be saved to a specified directory by supplying the image path and ``save_plots`` parameters.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ all_vars=df.select_dtypes(np.number).columns.to_list(),
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Correlation Matrices
+=====================
+
+**Generate and Save Customizable Correlation Heatmaps**
+
+The ``flex_corr_matrix`` function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
+**Customizable Plot Appearance**
+
+The function provides extensive customization options for the heatmap's appearance:
+
+- **Colormap Selection**: Choose from a variety of colormaps to represent the strength of correlations. The default is ``"coolwarm"``, but this can be adjusted to fit the needs of the analysis.
+
+- **Annotation**: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
+- **Figure Size and Layout**: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+**Triangular vs. Full Correlation Matrix**
+
+
+A key feature of the ``flex_corr_matrix`` function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
+**Label and Axis Configuration**
+
+
+The function offers flexibility in configuring axis labels and titles:
+
+- **Label Rotation**: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+- **Font Sizes**: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+- **Title Wrapping**: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+**Plot Display and Saving Options**
+
+
+The ``flex_corr_matrix`` function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
+
+.. function:: flex_corr_matrix(df, cols=None, annot=True, cmap="coolwarm", save_plots=False, image_path_png=None, image_path_svg=None, figsize=(10, 10), title="Cervical Cancer Data: Correlation Matrix", label_fontsize=12, tick_fontsize=10, xlabel_rot=45, ylabel_rot=0, xlabel_alignment="right", ylabel_alignment="center_baseline", text_wrap=50, vmin=-1, vmax=1, cbar_label="Correlation Index", triangular=True, **kwargs)
+
+ Create a customizable correlation heatmap with options for annotation, color mapping, figure size, and saving the plot.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param cols: List of column names to include in the correlation matrix. If None, all columns are included.
+ :type cols: list of str, optional
+
+ :param annot: Whether to annotate the heatmap with correlation coefficients. Default is ``True``.
+ :type annot: bool, optional
+
+ :param cmap: The colormap to use for the heatmap. Default is ``"coolwarm"``.
+ :type cmap: str, optional
+
+ :param save_plots: Controls whether to save the plots. Default is ``False``.
+ :type save_plots: bool, optional
+
+ :param image_path_png: Directory path to save PNG images of the heatmap.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the heatmap.
+ :type image_path_svg: str, optional
+
+ :param figsize: Width and height of the figure for the heatmap. Default is ``(10, 10)``.
+ :type figsize: tuple, optional
+
+ :param title: Title of the heatmap. Default is ``"Cervical Cancer Data: Correlation Matrix"``.
+ :type title: str, optional
+
+ :param label_fontsize: Font size for tick labels and colorbar label. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param xlabel_rot: Rotation angle for x-axis labels. Default is ``45``.
+ :type xlabel_rot: int, optional
+
+ :param ylabel_rot: Rotation angle for y-axis labels. Default is ``0``.
+ :type ylabel_rot: int, optional
+
+ :param xlabel_alignment: Horizontal alignment for x-axis labels. Default is ``"right"``.
+ :type xlabel_alignment: str, optional
+
+ :param ylabel_alignment: Vertical alignment for y-axis labels. Default is ``"center_baseline"``.
+ :type ylabel_alignment: str, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param vmin: Minimum value for the heatmap color scale. Default is ``-1``.
+ :type vmin: float, optional
+
+ :param vmax: Maximum value for the heatmap color scale. Default is ``1``.
+ :type vmax: float, optional
+
+ :param cbar_label: Label for the colorbar. Default is ``"Correlation Index"``.
+ :type cbar_label: str, optional
+
+ :param triangular: Whether to show only the upper triangle of the correlation matrix. Default is ``True``.
+ :type triangular: bool, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``seaborn.heatmap()``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``annot`` is not a boolean.
+ - If ``cols`` is not a list.
+ - If ``save_plots`` is not a boolean.
+ - If ``triangular`` is not a boolean.
+ - If ``save_plots`` is True but no image paths are provided.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves a correlation heatmap.
+
+Triangular Correlation Matrix Example
+--------------------------------------
+
+The provided code filters the census [1]_ DataFrame ``df`` to include only numeric columns using
+``select_dtypes(np.number)``. It then utilizes the ``flex_corr_matrix()`` function
+to generate an upper triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+.. note::
+
+ This triangular matrix format is particularly useful for avoiding
+ redundancy in correlation matrices, as it excludes the lower half,
+ making it easier to focus on unique pairwise correlations.
+
+The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
+.. code-block:: python
+
+ # Select only numeric data to pass into the function
+ df_num = df.select_dtypes(np.number)
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="coolwarm",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Full Correlation Matrix Example
+----------------------------------
+
+In this modified census [1]_ example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting ``cmap="viridis"``, the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting ``triangular=False``, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="viridis",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Partial Dependence Plots
+=========================
+
+**Partial Dependence Plots (PDPs)** are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model's predictions.
+
+2D Partial Dependence Plots
+-----------------------------
+
+The ``plot_2d_pdp`` function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+- **Grid and Individual Plots**: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+- **Customization Options**: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+- **Saving Plots**: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
+
+.. function:: plot_2d_pdp(model, X_train, feature_names, features, title="PDP of house value on CA non-location features", grid_resolution=50, plot_type="grid", grid_figsize=(12, 8), individual_figsize=(6, 4), label_fontsize=12, tick_fontsize=10, text_wrap=50, image_path_png=None, image_path_svg=None, save_plots=None, file_prefix="partial_dependence")
+
+ Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param X_train: The training data used to compute partial dependence. Should correspond to the features used to train the model.
+ :type X_train: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names: A list of feature names corresponding to the columns in ``X_train``.
+ :type feature_names: list of str
+
+ :param features: A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+ :type features: list of int or tuple of int
+
+ :param title: The title for the entire plot. Default is ``"PDP of house value on CA non-location features"``.
+ :type title: str, optional
+
+ :param grid_resolution: The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is ``50``.
+ :type grid_resolution: int, optional
+
+ :param plot_type: The type of plot to generate. Choose ``"grid"`` for a grid layout, ``"individual"`` for separate plots, or ``"both"`` to generate both layouts. Default is ``"grid"``.
+ :type plot_type: str, optional
+
+ :param grid_figsize: Tuple specifying the width and height of the figure for the grid layout. Default is ``(12, 8)``.
+ :type grid_figsize: tuple, optional
+
+ :param individual_figsize: Tuple specifying the width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple, optional
+
+ :param label_fontsize: Font size for the axis labels and titles. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for the axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param image_path_png: The directory path where PNG images of the plots will be saved, if saving is enabled.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: The directory path where SVG images of the plots will be saved, if saving is enabled.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls whether to save the plots. Options include ``"all"``, ``"individual"``, ``"grid"``, or ``None`` (default). If saving is enabled, ensure ``image_path_png`` or ``image_path_svg`` are provided.
+ :type save_plots: str, optional
+
+ :param file_prefix: Prefix for the filenames of the saved grid plots. Default is ``"partial_dependence"``.
+ :type file_prefix: str, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``"grid"``, ``"individual"``, or ``"both"``.
+ - If ``save_plots`` is enabled but neither ``image_path_png`` nor ``image_path_svg`` is provided.
+
+ :returns: ``None``
+ This function generates partial dependence plots and displays them. It does not return any values.
+
+
+2D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
+For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
+If you extend this to two interacting features, such as ``AveOccup`` and ``HouseAge``,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
+Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import GradientBoostingRegressor
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=42,
+ )
+ model.fit(X_train, y_train)
+
+
+**Create 2D Partial Dependence Plot Grid**
+
+.. code-block:: python
+
+ # import the plot_2d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_2d_pdp
+
+ # Feature names
+ names = data.feature_names
+
+ # Generate 2D partial dependence plots
+ plot_2d_pdp(
+ model=model,
+ X_train=X_train,
+ feature_names=names,
+ features=[
+ "MedInc",
+ "AveOccup",
+ "HouseAge",
+ "AveRooms",
+ "Population",
+ ("AveOccup", "HouseAge"),
+ ],
+ title="PDP of house value on CA non-location features",
+ grid_figsize=(14, 10),
+ individual_figsize=(12, 4),
+ label_fontsize=14,
+ tick_fontsize=12,
+ text_wrap=120,
+ plot_type="grid",
+ image_path_png="path/to/save/png",
+ save_plots="all",
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+3D Partial Dependence Plots
+-----------------------------
+
+The ``plot_3d_pdp`` function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+- **Interactive and Static 3D Plots**: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+- **Colormap and Layout Customization**: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+- **Axis and Title Configuration**: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
+
+.. function:: plot_3d_pdp(model, dataframe, feature_names_list, x_label=None, y_label=None, z_label=None, title, html_file_path=None, html_file_name=None, image_filename=None, plot_type="both", matplotlib_colormap=None, plotly_colormap="Viridis", zoom_out_factor=None, wireframe_color=None, view_angle=(22, 70), figsize=(7, 4.5), text_wrap=50, horizontal=-1.25, depth=1.25, vertical=1.25, cbar_x=1.05, cbar_thickness=25, title_x=0.5, title_y=0.95, top_margin=100, image_path_png=None, image_path_svg=None, show_cbar=True, grid_resolution=20, left_margin=20, right_margin=65, label_fontsize=8, tick_fontsize=6, enable_zoom=True, show_modebar=True)
+
+ Generate 3D partial dependence plots for two features of a machine learning model.
+
+ This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param dataframe: The dataset on which the model was trained or a representative sample. If a DataFrame is provided, ``feature_names_list`` should correspond to the column names. If a NumPy array is provided, ``feature_names_list`` should correspond to the indices of the columns.
+ :type dataframe: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names_list: A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+ :type feature_names_list: list of str or int
+
+ :param x_label: Label for the x-axis in the plots. Default is ``None``.
+ :type x_label: str, optional
+
+ :param y_label: Label for the y-axis in the plots. Default is ``None``.
+ :type y_label: str, optional
+
+ :param z_label: Label for the z-axis in the plots. Default is ``None``.
+ :type z_label: str, optional
+
+ :param title: The title for the plots.
+ :type title: str
+
+ :param html_file_path: Path to save the interactive Plotly HTML file. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_path: str, optional
+
+ :param html_file_name: Name of the HTML file to save the interactive Plotly plot. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_name: str, optional
+
+ :param image_filename: Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is ``None``.
+ :type image_filename: str, optional
+
+ :param plot_type: The type of plots to generate. Options are:
+ - ``"static"``: Generate only static Matplotlib plots.
+ - ``"interactive"``: Generate only interactive Plotly plots.
+ - ``"both"``: Generate both static and interactive plots. Default is ``"both"``.
+ :type plot_type: str, optional
+
+ :param matplotlib_colormap: Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+ :type matplotlib_colormap: matplotlib.colors.Colormap, optional
+
+ :param plotly_colormap: Colormap for the Plotly plot. Default is ``"Viridis"``.
+ :type plotly_colormap: str, optional
+
+ :param zoom_out_factor: Factor to adjust the zoom level of the Plotly plot. Default is ``None``.
+ :type zoom_out_factor: float, optional
+
+ :param wireframe_color: Color for the wireframe in the Matplotlib plot. If ``None``, no wireframe is plotted. Default is ``None``.
+ :type wireframe_color: str, optional
+
+ :param view_angle: Elevation and azimuthal angles for the Matplotlib plot view. Default is ``(22, 70)``.
+ :type view_angle: tuple, optional
+
+ :param figsize: Figure size for the Matplotlib plot. Default is ``(7, 4.5)``.
+ :type figsize: tuple, optional
+
+ :param text_wrap: Maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param horizontal: Horizontal camera position for the Plotly plot. Default is ``-1.25``.
+ :type horizontal: float, optional
+
+ :param depth: Depth camera position for the Plotly plot. Default is ``1.25``.
+ :type depth: float, optional
+
+ :param vertical: Vertical camera position for the Plotly plot. Default is ``1.25``.
+ :type vertical: float, optional
+
+ :param cbar_x: Position of the color bar along the x-axis in the Plotly plot. Default is ``1.05``.
+ :type cbar_x: float, optional
+
+ :param cbar_thickness: Thickness of the color bar in the Plotly plot. Default is ``25``.
+ :type cbar_thickness: int, optional
+
+ :param title_x: Horizontal position of the title in the Plotly plot. Default is ``0.5``.
+ :type title_x: float, optional
+
+ :param title_y: Vertical position of the title in the Plotly plot. Default is ``0.95``.
+ :type title_y: float, optional
+
+ :param top_margin: Top margin for the Plotly plot layout. Default is ``100``.
+ :type top_margin: int, optional
+
+ :param image_path_png: Directory path to save the PNG file of the Matplotlib plot. Default is ``None``.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save the SVG file of the Matplotlib plot. Default is ``None``.
+ :type image_path_svg: str, optional
+
+ :param show_cbar: Whether to display the color bar in the Matplotlib plot. Default is ``True``.
+ :type show_cbar: bool, optional
+
+ :param grid_resolution: The resolution of the grid for computing partial dependence. Default is ``20``.
+ :type grid_resolution: int, optional
+
+ :param left_margin: Left margin for the Plotly plot layout. Default is ``20``.
+ :type left_margin: int, optional
+
+ :param right_margin: Right margin for the Plotly plot layout. Default is ``65``.
+ :type right_margin: int, optional
+
+ :param label_fontsize: Font size for axis labels in the Matplotlib plot. Default is ``8``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for tick labels in the Matplotlib plot. Default is ``6``.
+ :type tick_fontsize: int, optional
+
+ :param enable_zoom: Whether to enable zooming in the Plotly plot. Default is ``True``.
+ :type enable_zoom: bool, optional
+
+ :param show_modebar: Whether to display the mode bar in the Plotly plot. Default is ``True``.
+ :type show_modebar: bool, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``"static"``, ``"interactive"``, or ``"both"``.
+ - If ``plot_type`` is ``"interactive"`` or ``"both"`` and ``html_file_path`` or ``html_file_name`` are not provided.
+
+ :returns: ``None``
+ This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+ :notes:
+ - This function handles warnings related to scikit-learn's ``partial_dependence`` function, specifically a ``FutureWarning`` related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+ - To maintain compatibility with different versions of scikit-learn, the function attempts to use ``"values"`` for grid extraction in newer versions and falls back to ``"grid_values"`` for older versions.
+
+
+3D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California [4]_. Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
+For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between ``AveOccup`` and ``HouseAge`` in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
+This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
+Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
+
+Static Plot
+^^^^^^^^^^^^^^^^^
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=1,
+ )
+ model.fit(X_train, y_train)
+
+**Create Static 3D Partial Dependence Plot**
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8, 5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Interactive Plot
+^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+ )
+
+.. warning::
+
+ **Scrolling Notice:**
+
+ While interacting with the interactive Plotly plot below, scrolling down the
+ page using the mouse wheel may be blocked when the mouse pointer is hovering
+ over the plot. To continue scrolling, either move the mouse pointer outside
+ the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+.. raw:: html
+
+
+
+
+
+
+This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here's how it differs from
+generating a static plot using Matplotlib.
+
+**Key Differences**
+
+**Plot Type**:
+
+- The ``plot_type`` is set to ``"interactive"`` for the Plotly plot and ``"static"`` for the Matplotlib plot.
+
+**Interactive-Specific Parameters**:
+
+- **HTML File Path and Name**: The ``html_file_path`` and ``html_file_name`` parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
+- **Zoom and Positioning**: The interactive plot includes parameters like ``zoom_out_factor``, ``title_x``, ``cbar_x``, and ``cbar_thickness`` to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
+- **Mode Bar and Zoom**: The ``show_modebar`` and ``enable_zoom`` parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+**Static-Specific Parameters**:
+
+- **Figure Size and Wireframe Color**: The static plot uses parameters like ``figsize`` to control the size of the Matplotlib plot and ``wireframe_color`` to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
+.. [2] Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 <https://doi.org/10.21105/joss.03021>`_.
+
+.. [3] Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 <https://doi.org/10.1109/MCSE.2007.55>`_.
+
+.. [4] Pace, R. K., & Barry, R. (1997). *Sparse Spatial Autoregressions*. *Statistics & Probability Letters*, 33(3), 291-297. `https://doi.org/10.1016/S0167-7152(96)00140-X <https://doi.org/10.1016/S0167-7152(96)00140-X>`_.
+
diff --git a/_build/html/v0.0.10/_sources/getting_started.rst.txt b/_build/html/v0.0.10/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..68e4cd440
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/getting_started.rst.txt
@@ -0,0 +1,136 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.10``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``jinja2``: version ``3.1.4`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``nbformat``: version ``4.2.0`` or higher
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``plotly``: version ``5.18.0`` or higher
+- ``scikit-learn``: version ``1.0.2`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
+Description
+===============
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+.. [#] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.10/_sources/index.rst.txt b/_build/html/v0.0.10/_sources/index.rst.txt
new file mode 100644
index 000000000..5f0dc6c56
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/index.rst.txt
@@ -0,0 +1,57 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Data Management
+
+ data_management
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Plotting Heuristics
+
+ eda_plots
+
+.. toctree::
+ :maxdepth: 4
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ contributors
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.10/_sources/references.rst.txt b/_build/html/v0.0.10/_sources/references.rst.txt
new file mode 100644
index 000000000..335337c3a
--- /dev/null
+++ b/_build/html/v0.0.10/_sources/references.rst.txt
@@ -0,0 +1,33 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
The legend is now displayed only if there are valid legend handles (len(handles)>0) and if show_legend is set to True.
+
The call to ``ax.get_legend().remove()`` ensures that unnecessary legends are removed if they are empty or if ``show_legend`` is set to ``False``.
+
+
Error Handling
+- Error handling in the except block has been enhanced to ensure that any exceptions related to legends or labels are managed properly. The legend handling logic still respects the show_legend flag even in cases where exceptions occur.
+
This update prevents empty legend squares from appearing and maintains the intended default behavior of showing legends only when they contain relevant content.
Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
Visualization Enhancements
+
DataFrame Columns: Added a ``background_color`` variable to ``dataframe_columns``,
+allowing the user to enter a string representing a color name, or hex value.
+Try/Except on the output, in case the end user has a deprecated version of Pandas,
+where the styler would use hide() instead of hide_index(). The highlighted
+columns allow for easier null versus unique value analysis.
+
The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
Args:
+
+
The df argument is specified as a pandas.DataFrame.
+
The background_color argument is marked as optional, with a brief description of its role.
+
The return_df argument is also marked as optional, explaining what it controls.
+
+
Returns: The return type is specified as pandas.DataFrame, with a clear explanation of the difference based on the return_df flag.
+
KDE Distribution Plots: Improved kde_distributions() with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
Scatter Plots: Enhanced scatter_fit_plot() with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
Flexible `save_formats` Input:
+- ``save_formats`` now accepts a string, tuple, or list for specifying formats (e.g., ``"png"``, ``("png", "svg")``, or ``["png", "svg"]``).
+- Single strings or tuples are automatically converted to lists for consistent processing.
+
Dynamic Error Handling:
+- Added checks to ensure a valid path is provided for each format in save_formats.
+- Raises a ValueError if a format is specified without a corresponding path, with a clear, dynamic error message.
+
Improved Plot Saving Logic:
+- Updated logic allows saving plots in one format (e.g., only “png” or “svg”) without requiring the other.
+- Simplified and more intuitive path handling for saving plots.
This update introduces several key changes to the plot_3d_pdp function, simplifying the function’s interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
1. Parameter Changes
+
+
Removed Parameters:
+
+
The parameters x_label_plotly, y_label_plotly, and z_label_plotly have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general x_label, y_label, and z_label. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+
+
Default Values for Labels:
+
+
The parameters x_label, y_label, and z_label are now optional, with None as the default. If not provided, these labels will automatically default to the names of the features in the feature_names_list. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+
+
Changes in Default Values for View Angles:
+
+
The default values for camera positioning parameters have been updated: horizontal is now -1.25, depth is now 1.25, and vertical is now 1.25. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+
+
+
2. Plot Generation Logic
+
+
Conditionally Checking Labels:
+
+
The function now checks whether x_label, y_label, and z_label are provided. If these are None, the function will automatically assign default labels based on the feature_names_list. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+
+
Camera Position Adjustments:
+
+
The camera positions for the Plotly plot are now adjusted by multiplying horizontal, depth, and vertical by zoom_out_factor. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+
+
Surface Plot Coordinates Adjustments:
+
+
The order of the coordinates for the Plotly plot’s surface has been changed from ZZ,XX,YY[::-1] to ZZ,XX,YY. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+
+
+
3. Code Simplifications
+
+
Removed Complexity:
+
+
By removing the x_label_plotly, y_label_plotly, and z_label_plotly parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+
+
Fallback Mechanism for Grid Values:
+
+
The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+
+
+
4. Style Adjustments
+
+
Label Formatting:
+
+
The new version consistently uses y_label, x_label, and z_label for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+
+
Color Bar Adjustments:
+
+
The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of 0.6 and a pad value of 0.02. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+
+
+
5. Potential Use Case Differences
+
+
Simplified Interface:
+
+
The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+
+
Less Granular Control:
+
+
Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the *_plotly label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+
+
+
6. Matplotlib Plot Adjustments
+
+
Wireframe and Surface Plot Enhancements:
+
+
The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+
+
+
Summary
+
+
Version 0.0.8d of the plot_3d_pdp function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+
Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+
Decision Point
+
+
This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
Key Updates in 0.0.8c:
+
+
Bug Fixes: Addressed minor issues identified in version 0.0.8b to ensure smoother performance and better user experience.
+
Additional Testing: Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+
Refinements: Made small enhancements to existing features based on user feedback and internal testing results.
+
+
Summary of Changes
+
+
New Features & Enhancements
+
+
+
plot_3d_pdp Function:
+
+
Added show_modebar Parameter: Introduced a new boolean parameter, show_modebar, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
Custom Margins and Layout Adjustments:
+
+
Added parameters for left_margin, right_margin, and top_margin to provide users with more control over the plot layout in Plotly.
+
Adjusted default values and added options for better customization of the Plotly color bar (cbar_x, cbar_thickness) and title positioning (title_x, title_y).
+
+
+
Plotly Configuration:
+
+
Enhanced the configuration options to allow users to enable or disable zoom functionality (enable_zoom) in the interactive Plotly plots.
+
Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+
+
Error Handling:
+
+
Added input validation for html_file_path and html_file_name to ensure these are provided when necessary based on the selected plot_type.
+
+
+
+
+
plot_2d_pdp Function:
+
+
Introduced file_prefix Parameter:
+
+
Added a new file_prefix parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+
+
Enhanced Plot Type Flexibility:
+
+
The plot_type parameter now includes an option to generate both grid and individual plots (both). This feature allows users to create a combination of both layout styles in one function call.
+
Updated input validation and logic to handle this new option effectively.
+
+
+
Added save_plots Parameter:
+
+
Introduced a new parameter, save_plots, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+
+
Custom Margins and Layout Adjustments:
+
+
Included the save_plots parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+
+
+
+
+
+
Documentation Updates
+
+
+
Docstrings:
+
+
Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
Detailed the use of new parameters such as show_modebar, file_prefix, save_plots, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+
+
+
+
Refactoring & Code Cleanup
+
+
+
Code Structure:
+
+
Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
Version 0.0.8b is an exact replica of version 0.0.8a. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
New Features:
+
+
Optional file_prefix in stacked_crosstab_plot Function
+
+
The stacked_crosstab_plot function has been updated to make the file_prefix argument optional. If the user does not provide a file_prefix, the function will now automatically generate a default prefix based on the col and func_col parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
Key Improvement:
+
+
Users can now omit the file_prefix argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
Backward compatibility is maintained, allowing users who prefer to specify a custom file_prefix to continue doing so without any issues.
+
+
+
+
+
Introduction of 3D and 2D Partial Dependence Plot Functions
+
+
Two new functions, plot_3d_pdp and plot_2d_pdp, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+
plot_3d_pdp: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
plot_2d_pdp: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+
+
Key Features:
+
+
Compatibility: Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
Customization: Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
Interactive 3D Plots: The plot_3d_pdp function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+
+
+
+
+
Impact:
+
+
These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+
The optional file_prefix enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+
The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
Added Function for Customizable Correlation Matrix Visualization
+
This release introduces a new function, flex_corr_matrix, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to seaborn.heatmap().
+
Summary of Changes
+
+
New Function: flex_corr_matrix.
+
+
Functionality:
+- Generates a correlation heatmap for a given DataFrame.
+- Supports both full and upper triangular correlation matrices based on the triangular parameter.
+- Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+- Accepts additional keyword arguments via **kwargs to pass directly to seaborn.heatmap().
+- Includes validation to ensure the triangular, annot, and save_plots parameters are boolean values.
+- Raises an exception if save_plots=True but neither image_path_png nor image_path_svg is specified.
+
+
+
+
Usage
+
# Full correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=False,cmap="coolwarm",annot=True)
+
+# Upper triangular correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=True,cmap="coolwarm",annot=True)
+
+
+
Contingency table df to object type
+
Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python 3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python 3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+import os  # import operating system for dir
+
+
+base_path=os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path=os.path.join(os.pardir,"data")
+data_output=os.path.join(os.pardir,"data_output")
+
+# create image paths
+image_path_png=os.path.join(base_path,"images","png_images")
+image_path_svg=os.path.join(base_path,"images","svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str, optional) – The name of the new column for the IDs. Defaults to "ID".
+
num_digits (int, optional) – The number of digits for the unique IDs. Defaults to 9.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
+
Note
+
+
If the dataframe index is not unique, a warning is printed.
+
+
The function does not check if the number of rows exceeds the number of
unique IDs that can be generated with the specified number of digits.
+
+
+
+
The first digit of the generated IDs is ensured to be non-zero.
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df=add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data={
+ "values":[1.0,2.0,3.0,4.0,5.0,6.],
+}
+df=pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df=strip_trailing_period(df=df,column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: In the last row, the value 6. (an integer followed by a trailing period) is converted to the float 6.0.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings=["15/04/2021","04/15/2021","01/12/2020","12/01/2020"]
+
+# Standardize the date strings
+standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data={
+ "date_column":[
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name":["Alice","Bob","Charlie","David","Eve"],
+ "amount":[100.0,150.5,200.75,250.25,300.0],
+}
+
+df=pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"]=df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
Analyze DataFrame columns to provide summary statistics such as data type,
+null counts, unique values, and most frequent values.
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
background_color (str, optional) – Hex color code or color name for background styling in the output
+DataFrame. Defaults to None.
+
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
+False, returns a styled DataFrame for visual presentation. Defaults to False.
+
+
+
Returns:
+
If return_df is True, returns the plain DataFrame containing column summary
+statistics. If return_df is False, returns a styled DataFrame with optional
+background color for specific columns.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
The function will create an Excel file with a sheet for each combination
of the specified variables, as well as a “Table of Contents” sheet with
+hyperlinks to each summary table.
+
+
+
+
The sheet names are limited to 31 characters due to Excel’s constraints.
+
+
+
The function returns two outputs:
+
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars=[
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables,all_combinations=summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
+
+
+
+
Note
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name="df_census.xlsx"# Name of the output Excel file
+file_path=os.path.join(data_path,file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df=df[(df["age"]>18)&(df["age"]<40)]
+
+df_dict={
+ "original_df":df,
+ "ages_18_to_40":filtered_df,
+}
+
+save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
Create a contingency table from one or more columns in a DataFrame, with sorting options.
+
This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the contingency_table function.
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame containing the contingency table with the specified columns, a 'Total' column representing the count of occurrences, and a 'Percentage' column representing the percentage of the total count.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df=highlight_columns(
+ df=df,
+ columns=["age","education"],
+ color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or “bins.” This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or “bins,” and
+then assigning each value to one of these intervals.
+
+
Note
+
The code snippets below create age bins and assign a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
+
+
Below, we use the age column of the census data [1] from the UCI Machine Learning Repository as an example:
+
+
Bins Definition:
+The bins are defined by specifying the boundaries of each interval. For example,
+in the code snippet below, the bin_ages list specifies the boundaries for age groups:
These labels are used to categorize the numerical values into meaningful groups.
+
+
Applying the Binning:
+The pd.cut function
+from Pandas is used to apply the binning process. For each value in the age
+column of the DataFrame, it assigns a corresponding label based on which bin the
+value falls into. Here, right=False indicates that each bin includes the
+left endpoint but excludes the right endpoint. For example, if bin_ages=
+[0,10,20,30], then a value of 10 will fall into the bin [10,20) and
+be labeled accordingly.
The parameter right=False in pd.cut means that every bin is left-inclusive
+and right-exclusive. When the upper bound of the last bin is infinity
+(float("inf")), that final bin still captures all remaining values.
The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
The Pearson correlation coefficient, often denoted as \(r\), is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from \(-1\) to \(1\), where:
+
+
\(r = 1\) indicates a perfect positive linear relationship.
+
\(r = -1\) indicates a perfect negative linear relationship.
+
\(r = 0\) indicates no linear relationship.
+
+
The Pearson correlation coefficient between two variables \(X\) and \(Y\) is defined as:
This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between \(X\) and \(Y\).
+
+
\(r > 0\): Positive correlation. As \(X\) increases, \(Y\) tends to increase.
+
\(r < 0\): Negative correlation. As \(X\) increases, \(Y\) tends to decrease.
+
\(r = 0\): No linear correlation. There is no consistent linear relationship between \(X\) and \(Y\).
+
+
The closer the value of \(r\) is to \(\pm 1\), the stronger the linear relationship between the two variables.
Let \(\mathbf{X}\) represent the complete set of input features for a machine
+learning model, where \(\mathbf{X} = \{X_1, X_2, \dots, X_p\}\). Suppose we’re
+particularly interested in a subset of these features, denoted by \(\mathbf{X}_S\).
+The complementary set, \(\mathbf{X}_C\), contains all the features in \(\mathbf{X}\)
+that are not in \(\mathbf{X}_S\). Mathematically, this relationship is expressed as:
where \(\mathbf{X}_C\) is the set of features in \(\mathbf{X}\) after
+removing the features in \(\mathbf{X}_S\).
+
Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in \(\mathbf{X}_S\) on the model’s predictions, while averaging out the
+influence of the features in \(\mathbf{X}_C\). This is mathematically defined as:
\(\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]\) indicates that we are taking the expected value over the possible values of the features in the set \(\mathbf{X}_C\).
+
\(p(x_C)\) represents the probability density function of the features in \(\mathbf{X}_C\).
+
+
This operation effectively summarizes the model’s output over all potential values of the complementary features, providing a clear view of how the features in \(\mathbf{X}_S\) alone impact the model’s predictions.
+
2D Partial Dependence Plots
+
Consider a trained machine learning model \(f(\mathbf{X})\), where \(\mathbf{X} = (X_1, X_2, \dots, X_p)\) represents the vector of input features. The partial dependence of the predicted response \(\hat{y}\) on a single feature \(X_j\) is defined as:
\(\mathbf{X}_{C_i}\) represents the complement set of \(X_j\), meaning the remaining features in \(\mathbf{X}\) not included in \(X_j\) for the \(i\)-th instance.
+
\(n\) is the number of observations in the dataset.
+
+
For two features, \(X_j\) and \(X_k\), the partial dependence is given by:
This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of \(X_j\) and \(X_k\) vary, while the effects of the other features are averaged out.
+
+
Single Feature PDP: When plotting \(\text{PD}(X_j)\), the result is a 2D line plot showing the marginal effect of feature \(X_j\) on the predicted outcome, averaged over all possible values of the other features.
+
Two Features PDP: When plotting \(\text{PD}(X_j, X_k)\), the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of \(X_j\) and \(X_k\) on the predicted outcome. The surface represents the expected value of the prediction as \(X_j\) and \(X_k\) vary, while all other features are averaged out.
+
+
3D Partial Dependence Plots
+
For a more comprehensive analysis, especially when exploring interactions between two features, 3D Partial Dependence Plots are invaluable. The partial dependence function for two features in a 3D context is:
Here, the function \(f(X_j, X_k, \mathbf{X}_{C_i})\) is evaluated across a grid of values for \(X_j\) and \(X_k\). The resulting 3D surface plot represents how the model’s prediction changes over the joint range of these two features.
+
The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model’s predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+
Feature Interaction Visualization: The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+
Enhanced Interpretation: 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
+
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
+
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
+
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
+
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
+
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
+
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
+
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
+
kwargs (additional keyword arguments) – Additional keyword arguments passed to the Seaborn plotting function.
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The plot_type="both" parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
+
Note
+
If you do not set n_rows or n_cols to any values, the function will
+automatically calculate and create a grid based on the number of variables being
+plotted, ensuring an optimal arrangement of the plots.
+
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both",# Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+plot_type="hist", meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1,n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The y-axis label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to "Count" to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1,n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is customized to generate
+histograms that include mean and median lines. The mean_color is set to "blue"
+and the median_color is set to "black", allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted (plot_mean=True,plot_median=True).
+The y_axis_label remains "Density", indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using hist_color="brown", with a fill_alpha=0.60, while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns (n_rows=1,n_cols=3), and
+the grid size is set to 15x5 inches. This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
Histogram Example - (Mean, Median, and Std. Deviation)
+
In this example, the kde_distributions() function is customized to generate
+a histogram that includes mean, median, and 3 standard deviation lines. The
+mean_color is set to "blue" and the median_color is set to "black",
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted (plot_mean=True,plot_median=True). The y_axis_label remains
+"Density", indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using hist_color="brown",
+with a fill_alpha=0.40, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors "purple",
+"green", and "silver" for one, two, and three standard deviations, respectively.
+
The layout is configured with a single row and multiple columns (n_rows=1,n_cols=3),
+and the grid size is set to 15x5 inches. This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+
Note
+
You have the freedom to choose whether to plot the mean, median, and
+standard deviation lines. You can display one, none, or all of these simultaneously.
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
+
Note
+
The legend_labels_list parameter should be a list of lists, where each
+inner list corresponds to the ground truth labels for the respective item in
+the func_col list. Each element in the func_col list represents a
+column in your DataFrame that you wish to analyze, and the corresponding
+inner list in legend_labels_list should contain the labels that will be
+used in the legend of your plots.
+
+
For example:
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],# Corresponds to "sex"
+ ["<=50K",">50K"],# Corresponds to "income"
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Important
+
Ensure that the number of elements in func_col, legend_labels_list,
+and title are the same. Each item in func_col must have a corresponding
+list of labels in legend_labels_list and a title in title. This
+consistency is essential for the function to correctly generate the plots
+with the appropriate labels and titles.
+
+
In this example:
+
+
func_col contains two elements: "sex" and "income". Each corresponds to a specific column in your DataFrame.
+
legend_labels_list is a nested list containing two inner lists:
+
+
+
The first inner list, ["Male","Female"], corresponds to the "sex" column in func_col.
+
The second inner list, ["<=50K",">50K"], corresponds to the "income" column in func_col.
+
+
+
+
title contains two elements: "Sex" and "Income", which will be used as the titles for the respective plots.
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plot function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is provided and is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the x-axis labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
+
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
+
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
+
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
+
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
+
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
+
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
If all_vars is provided and either x_vars or y_vars is also provided.
+
If neither all_vars nor both x_vars and y_vars are provided.
+
If hue_palette is specified without hue.
+
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set but no image paths are provided.
+
If rotate_plot is not a boolean value.
+
If individual_figsize or grid_figsize are not tuples/lists with two numeric values.
+
+
+
+
Returns:
+
None. This function does not return any value but generates and optionally saves scatter plots for the specified x_vars and y_vars, or for all combinations of variables in all_vars if it is provided.
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
In this example, the scatter_fit_plot function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the df DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+
All Variables Combination: The all_vars parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don’t need to manually specify x_vars and y_vars, as the function will iterate through each possible pair.
+
Grid Display: The show_plot parameter is set to "grid", so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
Font Sizes: The label_fontsize and tick_fontsize parameters are set to 14 and 12, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
Best Fit Line: The add_best_fit_line parameter is set to True, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
Scatter Color: The scatter_color parameter is set to "#808080", applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
Correlation Coefficient: The show_correlation parameter is set to True, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+
These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are saved in a grid format, with added best fit lines and correlation coefficients for deeper analysis. The resulting images can be stored in the specified directory for future reference.
Generate and Save Customizable Correlation Heatmaps
+
The flex_corr_matrix function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
Customizable Plot Appearance
+
The function provides extensive customization options for the heatmap’s appearance:
+
+
Colormap Selection: Choose from a variety of colormaps to represent the strength of correlations. The default is "coolwarm", but this can be adjusted to fit the needs of the analysis.
+
Annotation: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
Figure Size and Layout: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+
Triangular vs. Full Correlation Matrix
+
A key feature of the flex_corr_matrix function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
Label and Axis Configuration
+
The function offers flexibility in configuring axis labels and titles:
+
+
Label Rotation: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+
Font Sizes: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+
Title Wrapping: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+
Plot Display and Saving Options
+
The flex_corr_matrix function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
The provided code filters the census [1] DataFrame df to include only numeric columns using
+select_dtypes(np.number). It then utilizes the flex_corr_matrix() function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+
Note
+
This triangular matrix format is particularly useful for avoiding
+redundancy in correlation matrices, as it excludes the lower half,
+making it easier to focus on unique pairwise correlations.
+
+
The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
# Select only numeric data to pass into the function
+df_num = df.select_dtypes(np.number)
+
In this modified census [1] example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting cmap="viridis", the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting triangular=False, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
Partial Dependence Plots (PDPs) are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model’s predictions.
The plot_2d_pdp function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+
Grid and Individual Plots: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+
Customization Options: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+
Saving Plots: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
X_train (pandas.DataFrame or numpy.ndarray) – The training data used to compute partial dependence. Should correspond to the features used to train the model.
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDP of house value on CA non-location features".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
If you extend this to two interacting features, such as AveOccup and HouseAge,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
Fetch The CA Housing Dataset and Prepare The DataFrame
The plot_3d_pdp function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+
Interactive and Static 3D Plots: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+
Colormap and Layout Customization: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+
Axis and Title Configuration: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
Generate 3D partial dependence plots for two features of a machine learning model.
+
This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
+- "static": Generate only static Matplotlib plots.
+- "interactive": Generate only interactive Plotly plots.
+- "both": Generate both static and interactive plots. Default is "both".
+
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
If plot_type is not one of "static", "interactive", or "both".
+
If plot_type is "interactive" or "both" and html_file_path or html_file_name are not provided.
+
+
+
+
Returns:
+
None
+This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+
Notes:
+
+
This function handles warnings related to scikit-learn’s partial_dependence function, specifically a FutureWarning related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+
To maintain compatibility with different versions of scikit-learn, the function attempts to use "values" for grid extraction in newer versions and falls back to "grid_values" for older versions.
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between AveOccup and HouseAge in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8,5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+)
+
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+)
+
+
+
+
Warning
+
Scrolling Notice:
+
While interacting with the interactive Plotly plot below, scrolling down the
+page using the mouse wheel may be blocked when the mouse pointer is hovering
+over the plot. To continue scrolling, either move the mouse pointer outside
+the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+
+
This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here’s how it differs from
+generating a static plot using Matplotlib.
+
Key Differences
+
Plot Type:
+
+
The plot_type is set to "interactive" for the Plotly plot and "static" for the Matplotlib plot.
+
+
Interactive-Specific Parameters:
+
+
HTML File Path and Name: The html_file_path and html_file_name parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
Zoom and Positioning: The interactive plot includes parameters like zoom_out_factor, title_x, cbar_x, and cbar_thickness to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
Mode Bar and Zoom: The show_modebar and enable_zoom parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+
Static-Specific Parameters:
+
+
Figure Size and Wireframe Color: The static plot uses parameters like figsize to control the size of the Matplotlib plot and wireframe_color to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+
By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.10/genindex.html b/_build/html/v0.0.10/genindex.html
new file mode 100644
index 000000000..a7178c7cc
--- /dev/null
+++ b/_build/html/v0.0.10/genindex.html
@@ -0,0 +1,359 @@
+
+
+
+
+
+
+
+ Index — EDA Toolkit 0.0.10 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
+
+
+
+
+
+
+
Index
+
+
+ A
+ | B
+ | C
+ | D
+ | E
+ | F
+ | H
+ | K
+ | P
+ | S
+
+
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.10.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.11/_sources/changelog.rst.txt b/_build/html/v0.0.11/_sources/changelog.rst.txt
new file mode 100644
index 000000000..1a113cbfc
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/changelog.rst.txt
@@ -0,0 +1,642 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+`Version 0.0.11`_
+----------------------
+
+.. _Version 0.0.11: https://lshpaner.github.io/eda_toolkit/v0.0.11/index.html
+
+
+**Description**
+
+Fixes a ``TypeError`` in the ``stacked_crosstab_plot`` function when ``save_formats`` is ``None``. The update ensures that ``save_formats`` defaults to an empty list, preventing iteration over a ``NoneType`` object.
+
+**Changes**
+
+- Initializes ``save_formats`` as an empty list if not provided.
+- Adds handling for string and tuple input types for ``save_formats``.
+
+**Issue Fixed**
+
+- Resolves ``TypeError`` when ``save_formats`` is ``None``.
+
+
+
+`Version 0.0.10`_
+----------------------
+
+.. _Version 0.0.10: https://lshpaner.github.io/eda_toolkit/v0.0.10/index.html
+
+**Legend Handling**
+
+- The legend is now displayed only if there are valid legend handles (``len(handles) > 0``) and if ``show_legend`` is set to ``True``.
+
+- The check ``ax.get_legend().remove()`` ensures that unnecessary legends are removed if they are empty or if ``show_legend`` is set to ``False``.
+
+**Error Handling**
+- Error handling in the ``except`` block has been enhanced to ensure that any exceptions related to legends or labels are managed properly. The legend handling logic still respects the ``show_legend`` flag even in cases where exceptions occur.
+
+This update prevents empty legend squares from appearing and maintains the intended default behavior of showing legends only when they contain relevant content.
+
+
+`Version 0.0.9`_
+----------------------
+
+.. _Version 0.0.9: https://lshpaner.github.io/eda_toolkit/v0.0.9/index.html
+
+**Bug Fixes and Minor Improvements**
+
+Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
+**Visualization Enhancements**
+
+**DataFrame Columns:** Added a ``background_color`` variable to ``dataframe_columns``,
+allowing the user to enter a string representing a color name, or hex value.
+Try/Except on the output, in case the end user has a deprecated version of Pandas,
+where the styler would use ``hide()`` instead of ``hide_index()``. The highlighted
+columns allow for easier null versus unique value analysis.
+
+The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
+**Args:**
+
+- The ``df`` argument is specified as a ``pandas.DataFrame``.
+
+- The ``background_color`` argument is marked as optional, with a brief description of its role.
+
+- The ``return_df`` argument is also marked as optional, explaining what it controls.
+
+
+**Returns:** The return type is specified as ``pandas.DataFrame``, with a clear explanation of the difference based on the ``return_df`` flag.
+
+**KDE Distribution Plots:** Improved ``kde_distributions()`` with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
+**Scatter Plots:** Enhanced ``scatter_fit_plot()`` with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
+
+
+`Version 0.0.8`_
+----------------------
+
+.. _Version 0.0.8: https://lshpaner.github.io/eda_toolkit/v0.0.8/index.html
+
+
+:class:`stacked_crosstab_plot`
+
+- **Flexible `save_formats` Input**:
+ - `save_formats` now accepts a string, tuple, or list for specifying formats (e.g., `"png"`, `("png", "svg")`, or `["png", "svg"]`).
+ - Single strings or tuples are automatically converted to lists for consistent processing.
+
+- **Dynamic Error Handling**:
+ - Added checks to ensure a valid path is provided for each format in `save_formats`.
+ - Raises a `ValueError` if a format is specified without a corresponding path, with a clear, dynamic error message.
+
+- **Improved Plot Saving Logic**:
+ - Updated logic allows saving plots in one format (e.g., only `"png"` or `"svg"`) without requiring the other.
+ - Simplified and more intuitive path handling for saving plots.
+
+
+:class:`plot_3d_pdp`
+
+This update introduces several key changes to the `plot_3d_pdp` function, simplifying the function's interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
+**1. Parameter Changes**
+
+
+- **Removed Parameters:**
+
+ - The parameters ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general ``x_label``, ``y_label``, and ``z_label``. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+- **Default Values for Labels:**
+
+ - The parameters ``x_label``, ``y_label``, and ``z_label`` are now optional, with ``None`` as the default. If not provided, these labels will automatically default to the names of the features in the ``feature_names_list``. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+- **Changes in Default Values for View Angles:**
+
+ - The default values for camera positioning parameters have been updated: ``horizontal`` is now ``-1.25``, ``depth`` is now ``1.25``, and ``vertical`` is now ``1.25``. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+**2. Plot Generation Logic**
+
+- **Conditionally Checking Labels:**
+
+ - The function now checks whether ``x_label``, ``y_label``, and ``z_label`` are provided. If these are ``None``, the function will automatically assign default labels based on the ``feature_names_list``. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+- **Camera Position Adjustments:**
+
+ - The camera positions for the Plotly plot are now adjusted by multiplying ``horizontal``, ``depth``, and ``vertical`` by ``zoom_out_factor``. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+- **Surface Plot Coordinates Adjustments:**
+
+ - The order of the coordinates for the Plotly plot’s surface has been changed from ``ZZ, XX, YY[::-1]`` to ``ZZ, XX, YY``. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+**3. Code Simplifications**
+
+- **Removed Complexity:**
+
+ - By removing the ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+- **Fallback Mechanism for Grid Values:**
+
+ - The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+**4. Style Adjustments**
+
+- **Label Formatting:**
+
+ - The new version consistently uses ``y_label``, ``x_label``, and ``z_label`` for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+- **Color Bar Adjustments:**
+
+ - The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of ``0.6`` and a pad value of ``0.02``. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+**5. Potential Use Case Differences**
+
+- **Simplified Interface:**
+
+ - The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+- **Less Granular Control:**
+
+ - Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the ``*_plotly`` label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+**6. Matplotlib Plot Adjustments**
+
+- **Wireframe and Surface Plot Enhancements:**
+
+ - The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+**Summary**
+
+- Version ``0.0.8d`` of the `plot_3d_pdp` function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+- Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+**Decision Point**
+
+- This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
+
+
+Version 0.0.8c
+------------------------
+
+Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
+**Key Updates in 0.0.8c:**
+
+- **Bug Fixes:** Addressed minor issues identified in version ``0.0.8b`` to ensure smoother performance and better user experience.
+- **Additional Testing:** Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+- **Refinements:** Made small enhancements to existing features based on user feedback and internal testing results.
+
+**Summary of Changes**
+
+1. New Features & Enhancements
+
+- ``plot_3d_pdp`` Function:
+
+ - Added ``show_modebar`` Parameter: Introduced a new boolean parameter, ``show_modebar``, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Added parameters for ``left_margin``, ``right_margin``, and ``top_margin`` to provide users with more control over the plot layout in Plotly.
+
+ - Adjusted default values and added options for better customization of the Plotly color bar (``cbar_x``, ``cbar_thickness``) and title positioning (``title_x``, ``title_y``).
+
+ - Plotly Configuration:
+
+ - Enhanced the configuration options to allow users to enable or disable zoom functionality (``enable_zoom``) in the interactive Plotly plots.
+
+ - Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+ - Error Handling:
+
+ - Added input validation for ``html_file_path`` and ``html_file_name`` to ensure these are provided when necessary based on the selected ``plot_type``.
+
+- ``plot_2d_pdp`` Function:
+
+ - Introduced ``file_prefix`` Parameter:
+
+ - Added a new ``file_prefix`` parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+ - Enhanced Plot Type Flexibility:
+
+ - The ``plot_type`` parameter now includes an option to generate both grid and individual plots (``both``). This feature allows users to create a combination of both layout styles in one function call.
+
+ - Updated input validation and logic to handle this new option effectively.
+
+ - Added ``save_plots`` Parameter:
+
+ - Introduced a new parameter, ``save_plots``, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Included the ``save_plots`` parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+2. Documentation Updates
+
+- Docstrings:
+
+ - Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
+ - Detailed the use of new parameters such as ``show_modebar``, ``file_prefix``, ``save_plots``, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+3. Refactoring & Code Cleanup
+
+- Code Structure:
+
+ - Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
+ - Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
+
+
+Version 0.0.8b
+--------------------------------
+
+Version 0.0.8b is an exact replica of version ``0.0.8a``. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
+
+
+Version 0.0.8a
+--------------------------------
+
+Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
+**New Features:**
+
+1. Optional ``file_prefix`` in ``stacked_crosstab_plot`` Function
+
+ - The ``stacked_crosstab_plot`` function has been updated to make the ``file_prefix`` argument optional. If the user does not provide a ``file_prefix``, the function will now automatically generate a default prefix based on the ``col`` and ``func_col`` parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
+ - **Key Improvement:**
+
+ - Users can now omit the ``file_prefix`` argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
+ - Backward compatibility is maintained, allowing users who prefer to specify a custom ``file_prefix`` to continue doing so without any issues.
+
+2. **Introduction of 3D and 2D Partial Dependence Plot Functions**
+
+ - Two new functions, ``plot_3d_pdp`` and ``plot_2d_pdp``, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+ - ``plot_3d_pdp``: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
+ - ``plot_2d_pdp``: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+ - **Key Features:**
+
+ - **Compatibility:** Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
+ - **Customization:** Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
+ - **Interactive 3D Plots:** The ``plot_3d_pdp`` function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+**Impact:**
+
+- These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+- The optional ``file_prefix`` enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+- The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
+
+
+
+`Version 0.0.7`_
+----------------------
+
+.. _Version 0.0.7: https://lshpaner.github.io/eda_toolkit/v0.0.7/index.html
+
+**Added Function for Customizable Correlation Matrix Visualization**
+
+This release introduces a new function, ``flex_corr_matrix``, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to ``seaborn.heatmap()``.
+
+**Summary of Changes**
+
+- **New Function**: ``flex_corr_matrix``.
+
+ - **Functionality**:
+ - Generates a correlation heatmap for a given DataFrame.
+ - Supports both full and upper triangular correlation matrices based on the ``triangular`` parameter.
+ - Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+ - Accepts additional keyword arguments via ``**kwargs`` to pass directly to ``seaborn.heatmap()``.
+ - Includes validation to ensure the ``triangular``, ``annot``, and ``save_plots`` parameters are boolean values.
+ - Raises an exception if ``save_plots=True`` but neither ``image_path_png`` nor ``image_path_svg`` is specified.
+
+**Usage**
+
+.. code-block:: python
+
+ # Full correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+ # Upper triangular correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+**Contingency table df to object type**
+
+Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
+
+.. code-block:: python
+
+ df = df.astype(str).fillna("")
+
+
+`Version 0.0.6`_
+----------------------
+
+.. _Version 0.0.6: https://lshpaner.github.io/eda_toolkit/v0.0.6/index.html
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+`Version 0.0.5`_
+----------------------
+
+.. _Version 0.0.5: https://lshpaner.github.io/eda_toolkit/v0.0.5/index.html
+
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+ - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.11/_sources/citations.rst.txt b/_build/html/v0.0.11/_sources/citations.rst.txt
new file mode 100644
index 000000000..51ef0423d
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.11). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+ version = {0.0.11},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.11/_sources/contributors.rst.txt b/_build/html/v0.0.11/_sources/contributors.rst.txt
new file mode 100644
index 000000000..4da2fa18b
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/contributors.rst.txt
@@ -0,0 +1,59 @@
+.. _contributors:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Contributors/Maintainers
+=========================
+
+.. raw:: html
+
+
+
+.. image:: https://www.leonshpaner.com/author/leon-shpaner/avatar_hu48de79c369d5f7d4ff8056a297b2c4c5_1681850_270x270_fill_q90_lanczos_center.jpg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Leonid Shpaner <https://www.leonshpaner.com>`_ is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego's M.S. Applied Artificial Intelligence program.
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: https://oscargildata.com/portfolio_content/images/Oscar_LinkedIn_Pic.jpeg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Oscar Gil <https://www.oscargildata.com>`_ is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
diff --git a/_build/html/v0.0.11/_sources/data_management.rst.txt b/_build/html/v0.0.11/_sources/data_management.rst.txt
new file mode 100644
index 000000000..056ce061c
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/data_management.rst.txt
@@ -0,0 +1,1409 @@
+.. _data_management:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Data Management Overview
+===========================
+
+In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
+
+Data Management Techniques
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+    # Use the function to ensure the 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs. Defaults to ``"ID"``.
+ :type id_colname: str, optional
+ :param num_digits: The number of digits for the unique IDs. Defaults to ``9``.
+ :type num_digits: int, optional
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+    :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+.. note::
+ - If the dataframe index is not unique, a warning is printed.
+ - The function does not check if the number of rows exceeds the number of
+ unique IDs that can be generated with the specified number of digits.
+ - The first digit of the generated IDs is ensured to be non-zero.
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` The last row shows the value entered as ``6.`` (an `int` with a trailing period) before processing and its conversion to the `float` ``6.0`` after.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either `day/month/year` or
+ `month/day/year` format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+    a `day/month/year` format. If the first part is 12 or less, the function
+    attempts to parse the date as `day/month/year`, falling back to `month/day/year`
+    if the former raises a ``ValueError`` due to an impossible date (e.g., the
+    second component, read as a month, being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df, background_color=None, return_df=False)
+
+ Analyze DataFrame columns to provide summary statistics such as data type,
+ null counts, unique values, and most frequent values.
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param background_color: Hex color code or color name for background styling in the output
+ DataFrame. Defaults to ``None``.
+ :type background_color: str, optional
+ :param return_df: If ``True``, returns the plain DataFrame with the summary statistics. If
+ ``False``, returns a styled DataFrame for visual presentation. Defaults to ``False``.
+ :type return_df: bool, optional
+
+ :returns: If ``return_df`` is ``True``, returns the plain DataFrame containing column summary
+ statistics. If ``return_df`` is ``False``, returns a styled DataFrame with optional
+ background color for specific columns.
+ :rtype: pandas.DataFrame
+
+
+Census Income Example
+""""""""""""""""""""""""""""""
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+DataFrame Column Names
+""""""""""""""""""""""""""""""
+
+``unique_values_total``
+ This column indicates the total number of unique values present in each column of the DataFrame. It measures the distinct values that a column holds. For example, in the ``age`` column, there are 74 unique values, meaning the ages vary across 74 distinct entries.
+
+``max_unique_value``
+ This column shows the most frequently occurring value in each column. For example, in the ``workclass`` column, the most common value is ``Private``, indicating that this employment type is the most represented in the dataset. For numeric columns like ``capital-gain`` and ``capital-loss``, the most common value is ``0``, which suggests that the majority of individuals have no capital gain or loss.
+
+``max_unique_value_total``
+ This represents the count of the most frequently occurring value in each column. For instance, in the ``native-country`` column, the value ``United-States`` appears ``43,832`` times, indicating that the majority of individuals in the dataset are from the United States.
+
+``max_unique_value_pct``
+ This column shows the percentage that the most frequent value constitutes of the total number of rows. For example, in the ``race`` column, the value ``White`` makes up ``85.5%`` of the data, suggesting a significant majority of the dataset belongs to this racial group.
+
+Calculation Details
+""""""""""""""""""""""""""""""
+- ``unique_values_total`` is calculated using the ``nunique()`` function, which counts the number of unique values in a column.
+- ``max_unique_value`` is determined by finding the value with the highest frequency using ``value_counts()``. For string columns, any missing values (if present) are replaced with the string ``"null"`` before computing the frequency.
+- ``max_unique_value_total`` is the frequency count of the ``max_unique_value``.
+- ``max_unique_value_pct`` is the percentage of ``max_unique_value_total`` divided by the total number of rows in the DataFrame, providing an idea of how dominant the most frequent value is.
+
+This analysis helps in identifying columns with a high proportion of dominant values, like ``<=50K`` in the ``income`` column, which appears ``24,720`` times, making up ``50.61%`` of the entries. This insight can be useful for understanding data distributions, identifying potential data imbalances, or even spotting opportunities for feature engineering in further data processing steps.
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of column names from the DataFrame to generate combinations.
+ :type variables: list of str
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum size of the combinations to generate. Defaults to ``2``.
+ :type min_length: int, optional
+
+ :returns: A tuple containing a dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+.. note::
+ - The function will create an Excel file with a sheet for each combination
+ of the specified variables, as well as a "Table of Contents" sheet with
+ hyperlinks to each summary table.
+ - The sheet names are limited to 31 characters due to Excel's constraints.
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+.. note::
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list of str, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order. Defaults to ``0``.
+ :type sort_by: int, optional
+ :raises ValueError: If no columns are specified or if ``sort_by`` is not ``0`` or ``1``.
+ :returns: A DataFrame containing the contingency table with the specified columns, a ``'Total'`` column representing the count of occurrences, and a ``'Percentage'`` column representing the percentage of the total count.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.to_html()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or "bins." This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or "bins," and
+then assigning each value to one of these intervals.
+
+.. note::
+
+ The code snippets below create age bins and assign a corresponding age group
+ label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+ categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+ and labels as needed for your specific data.
+
+
+Below, we use the ``age`` column of the census data [1]_ from the UCI Machine Learning Repository as an example:
+
+1. **Bins Definition**:
+ The bins are defined by specifying the boundaries of each interval. For example,
+ in the code snippet below, the ``bin_ages`` list specifies the boundaries for age groups:
+
+ .. code-block:: python
+
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+
+ Each pair of consecutive elements in ``bin_ages`` defines a bin. For example:
+
+ - The first bin is ``[0, 18)``,
+ - The second bin is ``[18, 30)``,
+ - and so on.
+
+\
+
+2. **Labels for Bins**:
+ The ``label_ages`` list provides labels corresponding to each bin:
+
+ .. code-block:: python
+
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ These labels are used to categorize the numerical values into meaningful groups.
+
+3. **Applying the Binning**:
+ The `pd.cut <https://pandas.pydata.org/docs/reference/api/pandas.cut.html>`_ function
+ from Pandas is used to apply the binning process. For each value in the ``age``
+ column of the DataFrame, it assigns a corresponding label based on which bin the
+ value falls into. Here, ``right=False`` indicates that each bin includes the
+ left endpoint but excludes the right endpoint. For example, if ``bin_ages =
+ [0, 10, 20, 30]``, then a value of ``10`` will fall into the bin ``[10, 20)`` and
+ be labeled accordingly.
+
+ .. code-block:: python
+
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+ **Mathematically**, for a given value `x` in the ``age`` column:
+
+ .. math::
+
+ \text{age\_group} =
+ \begin{cases}
+ < 18 & \text{if } 0 \leq x < 18 \\
+ 18-29 & \text{if } 18 \leq x < 30 \\
+ \vdots \\
+ 100 + & \text{if } x \geq 100
+ \end{cases}
+
+ The parameter ``right=False`` in ``pd.cut`` means that the bins are left-inclusive
+ and right-exclusive, except for the last bin, which is always right-inclusive
+ when the upper bound is infinity (``float("inf")``).
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.11/_sources/eda_plots.rst.txt b/_build/html/v0.0.11/_sources/eda_plots.rst.txt
new file mode 100644
index 000000000..a4741f127
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/eda_plots.rst.txt
@@ -0,0 +1,2665 @@
+.. _eda_plots:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Plotting and Theoretical Overview
+=======================================
+
+Gaussian Assumption for Normality
+----------------------------------
+
+The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
+
+.. math::
+
+ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
+
+where:
+
+- :math:`\mu` is the mean
+- :math:`\sigma^2` is the variance
+
+In a normally distributed dataset:
+
+- 68% of data falls within :math:`\mu \pm \sigma`
+- 95% within :math:`\mu \pm 2\sigma`
+- 99.7% within :math:`\mu \pm 3\sigma`
+
+.. raw:: html
+
+
+
+.. image:: ../assets/normal_distribution.png
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histograms and Kernel Density Estimation (KDE)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Histograms**:
+
+- Visualize data distribution by binning values and counting frequencies.
+- If data is Gaussian, the histogram approximates a bell curve.
+
+**KDE**:
+
+- A non-parametric way to estimate the PDF by smoothing individual data points with a kernel function.
+- The KDE for a dataset :math:`X = \{x_1, x_2, \ldots, x_n\}` is given by:
+
+.. math::
+
+ \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)
+
+where:
+
+- :math:`K` is the kernel function (often Gaussian)
+- :math:`h` is the bandwidth (smoothing parameter)
+
+.. raw:: html
+
+ Combined Use of Histograms and KDE
+
+\
+
+- **Histograms** offer a discrete, binned view of the data.
+- **KDE** provides a smooth, continuous estimate of the underlying distribution.
+- Together, they effectively illustrate how well the data aligns with the Gaussian assumption, highlighting any deviations from normality.
+
+
+Pearson Correlation Coefficient
+--------------------------------
+
+The Pearson correlation coefficient, often denoted as :math:`r`, is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from :math:`-1` to :math:`1`, where:
+
+- :math:`r = 1` indicates a perfect positive linear relationship.
+- :math:`r = -1` indicates a perfect negative linear relationship.
+- :math:`r = 0` indicates no linear relationship.
+
+The Pearson correlation coefficient between two variables :math:`X` and :math:`Y` is defined as:
+
+.. math::
+
+ r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
+
+where:
+
+- :math:`\text{Cov}(X, Y)` is the covariance of :math:`X` and :math:`Y`.
+- :math:`\sigma_X` is the standard deviation of :math:`X`.
+- :math:`\sigma_Y` is the standard deviation of :math:`Y`.
+
+Covariance measures how much two variables change together. It is defined as:
+
+.. math::
+
+ \text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)
+
+where:
+
+- :math:`n` is the number of data points.
+- :math:`X_i` and :math:`Y_i` are the individual data points.
+- :math:`\mu_X` and :math:`\mu_Y` are the means of :math:`X` and :math:`Y`.
+
+The standard deviation measures the dispersion or spread of a set of values. For
+a variable :math:`X`, the standard deviation :math:`\sigma_X` is:
+
+.. math::
+
+ \sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)^2}
+
+Substituting the covariance and standard deviation into the Pearson correlation formula:
+
+.. math::
+
+ r_{XY} = \frac{\sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n} (X_i - \mu_X)^2} \sqrt{\sum_{i=1}^{n} (Y_i - \mu_Y)^2}}
+
+This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between :math:`X` and :math:`Y`.
+
+- :math:`r > 0`: Positive correlation. As :math:`X` increases, :math:`Y` tends to increase.
+- :math:`r < 0`: Negative correlation. As :math:`X` increases, :math:`Y` tends to decrease.
+- :math:`r = 0`: No linear correlation. There is no consistent linear relationship between :math:`X` and :math:`Y`.
+
+The closer the value of :math:`r` is to :math:`\pm 1`, the stronger the linear relationship between the two variables.
+
+
+Partial Dependence Foundations
+--------------------------------
+
+Let :math:`\mathbf{X}` represent the complete set of input features for a machine
+learning model, where :math:`\mathbf{X} = \{X_1, X_2, \dots, X_p\}`. Suppose we're
+particularly interested in a subset of these features, denoted by :math:`\mathbf{X}_S`.
+The complementary set, :math:`\mathbf{X}_C`, contains all the features in :math:`\mathbf{X}`
+that are not in :math:`\mathbf{X}_S`. Mathematically, this relationship is expressed as:
+
+.. math::
+
+ \mathbf{X}_C = \mathbf{X} \setminus \mathbf{X}_S
+
+where :math:`\mathbf{X}_C` is the set of features in :math:`\mathbf{X}` after
+removing the features in :math:`\mathbf{X}_S`.
+
+Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in :math:`\mathbf{X}_S` on the model's predictions, while averaging out the
+influence of the features in :math:`\mathbf{X}_C`. This is mathematically defined as:
+
+.. math::
+ \begin{align*}
+ \text{PD}_{\mathbf{X}_S}(x_S) &= \mathbb{E}_{\mathbf{X}_C} \left[ f(x_S, \mathbf{X}_C) \right] \\
+ &= \int f(x_S, x_C) \, p(x_C) \, dx_C \\
+ &\approx \frac{1}{n} \sum_{i=1}^{n} f(x_S, x_{C_i})
+ \end{align*}
+
+
+where:
+
+- :math:`\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]` indicates that we are taking the expected value over the possible values of the features in the set :math:`\mathbf{X}_C`.
+- :math:`p(x_C)` represents the probability density function of the features in :math:`\mathbf{X}_C`.
+
+This operation effectively summarizes the model's output over all potential values of the complementary features, providing a clear view of how the features in :math:`\mathbf{X}_S` alone impact the model's predictions.
+
+
+**2D Partial Dependence Plots**
+
+Consider a trained machine learning model :math:`f(\mathbf{X})` (see `2D Partial Dependence Plots`_), where :math:`\mathbf{X} = (X_1, X_2, \dots, X_p)` represents the vector of input features. The partial dependence of the predicted response :math:`\hat{y}` on a single feature :math:`X_j` is defined as:
+
+.. math::
+
+ \text{PD}(X_j) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, \mathbf{X}_{C_i})
+
+where:
+
+- :math:`X_j` is the feature of interest.
+- :math:`\mathbf{X}_{C_i}` represents the complement set of :math:`X_j`, meaning the remaining features in :math:`\mathbf{X}` not included in :math:`X_j` for the :math:`i`-th instance.
+- :math:`n` is the number of observations in the dataset.
+
+For two features, :math:`X_j` and :math:`X_k`, the partial dependence is given by:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of :math:`X_j` and :math:`X_k` vary, while the effects of the other features are averaged out.
+
+- **Single Feature PDP:** When plotting :math:`\text{PD}(X_j)`, the result is a 2D line plot showing the marginal effect of feature :math:`X_j` on the predicted outcome, averaged over all possible values of the other features.
+- **Two Features PDP:** When plotting :math:`\text{PD}(X_j, X_k)`, the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of :math:`X_j` and :math:`X_k` on the predicted outcome. The surface represents the expected value of the prediction as :math:`X_j` and :math:`X_k` vary, while all other features are averaged out.
+
+
+**3D Partial Dependence Plots**
+
+For a more comprehensive analysis, especially when exploring interactions between two features, `3D Partial Dependence Plots`_ are invaluable. The partial dependence function for two features in a 3D context is:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+Here, the function :math:`f(X_j, X_k, \mathbf{X}_{C_i})` is evaluated across a grid of values for :math:`X_j` and :math:`X_k`. The resulting 3D surface plot represents how the model's prediction changes over the joint range of these two features.
+
+The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model's predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+- **Feature Interaction Visualization:** The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+- **Enhanced Interpretation:** 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+.. raw:: html
+
+
+
+KDE Distribution Function
+-----------------------------
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, figsize=(5, 5), grid_figsize=None, hist_color="#0000FF", kde_color="#FF0000", mean_color="#000000", median_color="#000000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=None, n_cols=None, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None, plot_mean=False, plot_median=False, std_dev_levels=None, std_color="#808080", label_names=None, show_legend=True, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots. If 'all', plots will be generated for all numeric columns.
+ :type vars_of_interest: list of str, optional
+ :param figsize: Size of each individual plot, default is ``(5, 5)``. Used when only one plot is being generated or when saving individual plots.
+ :type figsize: tuple of int, optional
+ :param grid_figsize: Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on ``figsize``, ``n_rows``, and ``n_cols``.
+ :type grid_figsize: tuple of int, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param mean_color: Color of the mean line if ``plot_mean`` is True, default is ``'#000000'``.
+ :type mean_color: str, optional
+ :param median_color: Color of the median line if ``plot_median`` is True, default is ``'#000000'``.
+ :type median_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses ``figsize`` for determining the plot size, ignoring ``grid_figsize``.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+ :type log_scale_vars: str or list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: float, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``, ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param plot_mean: Whether to plot the mean as a vertical line, default is ``False``.
+ :type plot_mean: bool, optional
+ :param plot_median: Whether to plot the median as a vertical line, default is ``False``.
+ :type plot_median: bool, optional
+ :param std_dev_levels: Levels of standard deviation to plot around the mean.
+ :type std_dev_levels: list of int, optional
+ :param std_color: Color(s) for the standard deviation lines, default is ``'#808080'``.
+ :type std_color: str or list of str, optional
+ :param label_names: Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+ :type label_names: dict, optional
+ :param show_legend: Whether to show the legend on the plots, default is ``True``.
+ :type show_legend: bool, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+ - If ``grid_figsize`` is provided when only one plot is being created.
+
+ :raises UserWarning:
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``plot_type="both"`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+.. note::
+ If you do not set ``n_rows`` or ``n_cols`` to any values, the function will
+ automatically calculate and create a grid based on the number of variables being
+ plotted, ensuring an optimal arrangement of the plots.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both", # Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``plot_type="hist"``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to ``"Count"`` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Histogram Example - (Mean and Median)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+histograms that include mean and median lines. The ``mean_color`` is set to ``"blue"``
+and the ``median_color`` is set to ``"black"``, allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted ``(plot_mean=True, plot_median=True)``.
+The ``y_axis_label`` remains ``"Density"``, indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using ``hist_color="brown"``, with a ``fill_alpha=0.60``, while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns ``(n_rows=1, n_cols=3)``, and
+the grid size is set to `14x4 inches`. This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.60,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_mean_median.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Histogram Example - (Mean, Median, and Std. Deviation)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+a histogram that includes mean, median, and 3 standard deviation lines. The
+``mean_color`` is set to ``"blue"`` and the median_color is set to ``"black"``,
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted ``(plot_mean=True, plot_median=True)``. The ``y_axis_label`` remains
+``"Density"``, indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using ``hist_color="brown"``,
+with a ``fill_alpha=0.40``, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors ``"purple"``,
+``"green"``, and ``"silver"`` for one, two, and three standard deviations, respectively.
+
+The layout is configured as a single plot with a figure size of `10x6 inches`
+(``figsize=(10, 6)``). This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+.. note::
+
+ You have the freedom to choose whether to plot the mean, median, and
+ standard deviation lines. You can display one, none, or all of these simultaneously.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ ]
+
+ kde_distributions(
+ df=df,
+ figsize=(10, 6),
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.40,
+ plot_type="both",
+ stat="Density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ image_path_svg=image_path_svg,
+ image_path_png=image_path_png,
+ std_dev_levels=[
+ 1,
+ 2,
+ 3,
+ ],
+ std_color=[
+ "purple",
+ "green",
+ "silver",
+ ],
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_age.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in. Valid formats are ``'png'`` and ``'svg'``. If not provided, defaults to an empty list and no images will be saved.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. note::
+
+ The ``legend_labels_list`` parameter should be a list of lists, where each
+ inner list corresponds to the ground truth labels for the respective item in
+ the ``func_col`` list. Each element in the ``func_col`` list represents a
+ column in your DataFrame that you wish to analyze, and the corresponding
+ inner list in ``legend_labels_list`` should contain the labels that will be
+ used in the legend of your plots.
+
+For example:
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"], # Corresponds to "sex"
+ ["<=50K", ">50K"], # Corresponds to "income"
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. important::
+
+ Ensure that the number of elements in ``func_col``, ``legend_labels_list``,
+ and ``title`` are the same. Each item in ``func_col`` must have a corresponding
+ list of labels in ``legend_labels_list`` and a title in ``title``. This
+ consistency is essential for the function to correctly generate the plots
+ with the appropriate labels and titles.
+
+
+In this example:
+
+- ``func_col`` contains two elements: ``"sex"`` and ``"income"``. Each corresponds to a specific column in your DataFrame.
+- ``legend_labels_list`` is a nested list containing two inner lists:
+
+ - The first inner list, ``["Male", "Female"]``, corresponds to the ``"sex"`` column in ``func_col``.
+ - The second inner list, ``["<=50K", ">50K"]``, corresponds to the ``"income"`` column in ``func_col``.
+
+- ``title`` contains two elements: ``"Sex"`` and ``"Income"``, which will be used as the titles for the respective plots.
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=0, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+ :alt: KDE Distributions
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+    crosstab_age_sex = stacked_crosstabs["sex"]
+    crosstab_age_income = stacked_crosstabs["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_comp, n_rows=None, n_cols=None, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None, label_names=None, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_comp: List of comparison categories (columns in df).
+ :type metrics_comp: list of str
+ :param n_rows: Number of rows in the subplot grid. Calculated automatically if not provided.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. Calculated automatically if not provided.
+ :type n_cols: int, optional
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots. Default is ``True``.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param label_names: Dictionary mapping original column names to custom labels. Default is ``None``.
+ :type label_names: dict, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is provided and is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the ``box_violin_plot`` function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the ``save_plots`` option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+        metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+        metrics_comp=metrics_comp,
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+Scatter Fit Plot
+------------------
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars=None, y_vars=None, n_rows=None, n_cols=None, max_cols=4, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None, all_vars=None, label_names=None, **kwargs)
+
+ Create and save scatter plots or a grid of scatter plots for given ``x_vars``
+ and ``y_vars``, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the ``x-axis``.
+ :type x_vars: list of str, optional
+
+ :param y_vars: List of variable names to plot on the ``y-axis``.
+ :type y_vars: list of str, optional
+
+ :param n_rows: Number of rows in the subplot grid. Calculated based on the number of plots and ``n_cols`` if not specified.
+ :type n_rows: int, optional
+
+ :param n_cols: Number of columns in the subplot grid. Calculated based on the number of plots and ``max_cols`` if not specified.
+ :type n_cols: int, optional
+
+ :param max_cols: Maximum number of columns in the subplot grid. Default is ``4``.
+ :type max_cols: int, optional
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``. If None, plots will not be saved.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is 12.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is 10.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the ``hue`` parameter to be set.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+
+    :param all_vars: If provided, automatically generates scatter plots for all combinations of variables in this list, overriding ``x_vars`` and ``y_vars``.
+ :type all_vars: list of str, optional
+
+ :param label_names: A dictionary to rename columns for display in the plot titles and labels.
+ :type label_names: dict, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``sns.scatterplot``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``all_vars`` is provided and either ``x_vars`` or ``y_vars`` is also provided.
+ - If neither ``all_vars`` nor both ``x_vars`` and ``y_vars`` are provided.
+ - If ``hue_palette`` is specified without ``hue``.
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``. This function does not return any value but generates and optionally saves scatter plots for the specified ``x_vars`` and ``y_vars``, or for all combinations of variables in ``all_vars`` if it is provided.
+
+
+
+Regression-Centric Scatter Plots Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots (All Combinations Example)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the ``df`` DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+1. **All Variables Combination**: The ``all_vars`` parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don't need to manually specify ``x_vars`` and ``y_vars``, as the function will iterate through each possible pair.
+
+2. **Grid Display**: The ``show_plot`` parameter is set to ``"grid"``, so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
+3. **Font Sizes**: The ``label_fontsize`` and ``tick_fontsize`` parameters are set to ``14`` and ``12``, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
+4. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``True``, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
+5. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
+6. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``True``, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are saved in a grid format, with added best fit lines and correlation coefficients for deeper analysis. The resulting images can be stored in the specified directory for future reference.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ all_vars=df.select_dtypes(np.number).columns.to_list(),
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Correlation Matrices
+=====================
+
+**Generate and Save Customizable Correlation Heatmaps**
+
+The ``flex_corr_matrix`` function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
+**Customizable Plot Appearance**
+
+The function provides extensive customization options for the heatmap's appearance:
+
+- **Colormap Selection**: Choose from a variety of colormaps to represent the strength of correlations. The default is ``"coolwarm"``, but this can be adjusted to fit the needs of the analysis.
+
+- **Annotation**: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
+- **Figure Size and Layout**: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+**Triangular vs. Full Correlation Matrix**
+
+
+A key feature of the ``flex_corr_matrix`` function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
+**Label and Axis Configuration**
+
+
+The function offers flexibility in configuring axis labels and titles:
+
+- **Label Rotation**: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+- **Font Sizes**: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+- **Title Wrapping**: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+**Plot Display and Saving Options**
+
+
+The ``flex_corr_matrix`` function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
+
+.. function:: flex_corr_matrix(df, cols=None, annot=True, cmap="coolwarm", save_plots=False, image_path_png=None, image_path_svg=None, figsize=(10, 10), title="Cervical Cancer Data: Correlation Matrix", label_fontsize=12, tick_fontsize=10, xlabel_rot=45, ylabel_rot=0, xlabel_alignment="right", ylabel_alignment="center_baseline", text_wrap=50, vmin=-1, vmax=1, cbar_label="Correlation Index", triangular=True, **kwargs)
+
+ Create a customizable correlation heatmap with options for annotation, color mapping, figure size, and saving the plot.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param cols: List of column names to include in the correlation matrix. If None, all columns are included.
+ :type cols: list of str, optional
+
+ :param annot: Whether to annotate the heatmap with correlation coefficients. Default is ``True``.
+ :type annot: bool, optional
+
+ :param cmap: The colormap to use for the heatmap. Default is ``"coolwarm"``.
+ :type cmap: str, optional
+
+ :param save_plots: Controls whether to save the plots. Default is ``False``.
+ :type save_plots: bool, optional
+
+ :param image_path_png: Directory path to save PNG images of the heatmap.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the heatmap.
+ :type image_path_svg: str, optional
+
+ :param figsize: Width and height of the figure for the heatmap. Default is ``(10, 10)``.
+ :type figsize: tuple, optional
+
+ :param title: Title of the heatmap. Default is ``"Cervical Cancer Data: Correlation Matrix"``.
+ :type title: str, optional
+
+ :param label_fontsize: Font size for tick labels and colorbar label. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param xlabel_rot: Rotation angle for x-axis labels. Default is ``45``.
+ :type xlabel_rot: int, optional
+
+ :param ylabel_rot: Rotation angle for y-axis labels. Default is ``0``.
+ :type ylabel_rot: int, optional
+
+ :param xlabel_alignment: Horizontal alignment for x-axis labels. Default is ``"right"``.
+ :type xlabel_alignment: str, optional
+
+ :param ylabel_alignment: Vertical alignment for y-axis labels. Default is ``"center_baseline"``.
+ :type ylabel_alignment: str, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param vmin: Minimum value for the heatmap color scale. Default is ``-1``.
+ :type vmin: float, optional
+
+ :param vmax: Maximum value for the heatmap color scale. Default is ``1``.
+ :type vmax: float, optional
+
+ :param cbar_label: Label for the colorbar. Default is ``"Correlation Index"``.
+ :type cbar_label: str, optional
+
+ :param triangular: Whether to show only the upper triangle of the correlation matrix. Default is ``True``.
+ :type triangular: bool, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``seaborn.heatmap()``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``annot`` is not a boolean.
+ - If ``cols`` is not a list.
+ - If ``save_plots`` is not a boolean.
+ - If ``triangular`` is not a boolean.
+ - If ``save_plots`` is True but no image paths are provided.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves a correlation heatmap.
+
+Triangular Correlation Matrix Example
+--------------------------------------
+
+The provided code filters the census [1]_ DataFrame ``df`` to include only numeric columns using
+``select_dtypes(np.number)``. It then utilizes the ``flex_corr_matrix()`` function
+to generate an upper triangular correlation matrix, which displays only the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+.. note::
+
+ This triangular matrix format is particularly useful for avoiding
+ redundancy in correlation matrices, as it excludes the lower half,
+ making it easier to focus on unique pairwise correlations.
+
+The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
+.. code-block:: python
+
+ # Select only numeric data to pass into the function
+ df_num = df.select_dtypes(np.number)
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="coolwarm",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Full Correlation Matrix Example
+----------------------------------
+
+In this modified census [1]_ example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting ``cmap="viridis"``, the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting ``triangular=False``, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="viridis",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Partial Dependence Plots
+=========================
+
+**Partial Dependence Plots (PDPs)** are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model's predictions.
+
+2D Partial Dependence Plots
+-----------------------------
+
+The ``plot_2d_pdp`` function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+- **Grid and Individual Plots**: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+- **Customization Options**: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+- **Saving Plots**: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
+
+.. function:: plot_2d_pdp(model, X_train, feature_names, features, title="PDP of house value on CA non-location features", grid_resolution=50, plot_type="grid", grid_figsize=(12, 8), individual_figsize=(6, 4), label_fontsize=12, tick_fontsize=10, text_wrap=50, image_path_png=None, image_path_svg=None, save_plots=None, file_prefix="partial_dependence")
+
+ Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param X_train: The training data used to compute partial dependence. Should correspond to the features used to train the model.
+ :type X_train: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names: A list of feature names corresponding to the columns in ``X_train``.
+ :type feature_names: list of str
+
+ :param features: A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+ :type features: list of int or tuple of int
+
+ :param title: The title for the entire plot. Default is ``"PDP of house value on CA non-location features"``.
+ :type title: str, optional
+
+ :param grid_resolution: The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is ``50``.
+ :type grid_resolution: int, optional
+
+ :param plot_type: The type of plot to generate. Choose ``"grid"`` for a grid layout, ``"individual"`` for separate plots, or ``"both"`` to generate both layouts. Default is ``"grid"``.
+ :type plot_type: str, optional
+
+ :param grid_figsize: Tuple specifying the width and height of the figure for the grid layout. Default is ``(12, 8)``.
+ :type grid_figsize: tuple, optional
+
+ :param individual_figsize: Tuple specifying the width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple, optional
+
+ :param label_fontsize: Font size for the axis labels and titles. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for the axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param image_path_png: The directory path where PNG images of the plots will be saved, if saving is enabled.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: The directory path where SVG images of the plots will be saved, if saving is enabled.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls whether to save the plots. Options include ``"all"``, ``"individual"``, ``"grid"``, or ``None`` (default). If saving is enabled, ensure ``image_path_png`` or ``image_path_svg`` are provided.
+ :type save_plots: str, optional
+
+ :param file_prefix: Prefix for the filenames of the saved grid plots. Default is ``"partial_dependence"``.
+ :type file_prefix: str, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``"grid"``, ``"individual"``, or ``"both"``.
+ - If ``save_plots`` is enabled but neither ``image_path_png`` nor ``image_path_svg`` is provided.
+
+ :returns: ``None``
+ This function generates partial dependence plots and displays them. It does not return any values.
+
+
+2D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
+For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
+If you extend this to two interacting features, such as ``AveOccup`` and ``HouseAge``,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
+Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import GradientBoostingRegressor
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=42,
+ )
+ model.fit(X_train, y_train)
+
+
+**Create 2D Partial Dependence Plot Grid**
+
+.. code-block:: python
+
+ # import the plot_2d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_2d_pdp
+
+ # Feature names
+ names = data.feature_names
+
+ # Generate 2D partial dependence plots
+ plot_2d_pdp(
+ model=model,
+ X_train=X_train,
+ feature_names=names,
+ features=[
+ "MedInc",
+ "AveOccup",
+ "HouseAge",
+ "AveRooms",
+ "Population",
+ ("AveOccup", "HouseAge"),
+ ],
+ title="PDP of house value on CA non-location features",
+ grid_figsize=(14, 10),
+ individual_figsize=(12, 4),
+ label_fontsize=14,
+ tick_fontsize=12,
+ text_wrap=120,
+ plot_type="grid",
+ image_path_png="path/to/save/png",
+ save_plots="all",
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+3D Partial Dependence Plots
+-----------------------------
+
+The ``plot_3d_pdp`` function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+- **Interactive and Static 3D Plots**: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+- **Colormap and Layout Customization**: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+- **Axis and Title Configuration**: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
+
+.. function:: plot_3d_pdp(model, dataframe, feature_names_list, x_label=None, y_label=None, z_label=None, title, html_file_path=None, html_file_name=None, image_filename=None, plot_type="both", matplotlib_colormap=None, plotly_colormap="Viridis", zoom_out_factor=None, wireframe_color=None, view_angle=(22, 70), figsize=(7, 4.5), text_wrap=50, horizontal=-1.25, depth=1.25, vertical=1.25, cbar_x=1.05, cbar_thickness=25, title_x=0.5, title_y=0.95, top_margin=100, image_path_png=None, image_path_svg=None, show_cbar=True, grid_resolution=20, left_margin=20, right_margin=65, label_fontsize=8, tick_fontsize=6, enable_zoom=True, show_modebar=True)
+
+ Generate 3D partial dependence plots for two features of a machine learning model.
+
+ This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param dataframe: The dataset on which the model was trained or a representative sample. If a DataFrame is provided, ``feature_names_list`` should correspond to the column names. If a NumPy array is provided, ``feature_names_list`` should correspond to the indices of the columns.
+ :type dataframe: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names_list: A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+ :type feature_names_list: list of str
+
+ :param x_label: Label for the x-axis in the plots. Default is ``None``.
+ :type x_label: str, optional
+
+ :param y_label: Label for the y-axis in the plots. Default is ``None``.
+ :type y_label: str, optional
+
+ :param z_label: Label for the z-axis in the plots. Default is ``None``.
+ :type z_label: str, optional
+
+ :param title: The title for the plots.
+ :type title: str
+
+ :param html_file_path: Path to save the interactive Plotly HTML file. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_path: str, optional
+
+ :param html_file_name: Name of the HTML file to save the interactive Plotly plot. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_name: str, optional
+
+ :param image_filename: Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is ``None``.
+ :type image_filename: str, optional
+
+ :param plot_type: The type of plots to generate. Options are:
+ - ``"static"``: Generate only static Matplotlib plots.
+ - ``"interactive"``: Generate only interactive Plotly plots.
+ - ``"both"``: Generate both static and interactive plots. Default is ``"both"``.
+ :type plot_type: str, optional
+
+ :param matplotlib_colormap: Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+ :type matplotlib_colormap: matplotlib.colors.Colormap, optional
+
+ :param plotly_colormap: Colormap for the Plotly plot. Default is ``"Viridis"``.
+ :type plotly_colormap: str, optional
+
+ :param zoom_out_factor: Factor to adjust the zoom level of the Plotly plot. Default is ``None``.
+ :type zoom_out_factor: float, optional
+
+ :param wireframe_color: Color for the wireframe in the Matplotlib plot. If ``None``, no wireframe is plotted. Default is ``None``.
+ :type wireframe_color: str, optional
+
+ :param view_angle: Elevation and azimuthal angles for the Matplotlib plot view. Default is ``(22, 70)``.
+ :type view_angle: tuple, optional
+
+ :param figsize: Figure size for the Matplotlib plot. Default is ``(7, 4.5)``.
+ :type figsize: tuple, optional
+
+ :param text_wrap: Maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param horizontal: Horizontal camera position for the Plotly plot. Default is ``-1.25``.
+ :type horizontal: float, optional
+
+ :param depth: Depth camera position for the Plotly plot. Default is ``1.25``.
+ :type depth: float, optional
+
+ :param vertical: Vertical camera position for the Plotly plot. Default is ``1.25``.
+ :type vertical: float, optional
+
+ :param cbar_x: Position of the color bar along the x-axis in the Plotly plot. Default is ``1.05``.
+ :type cbar_x: float, optional
+
+ :param cbar_thickness: Thickness of the color bar in the Plotly plot. Default is ``25``.
+ :type cbar_thickness: int, optional
+
+ :param title_x: Horizontal position of the title in the Plotly plot. Default is ``0.5``.
+ :type title_x: float, optional
+
+ :param title_y: Vertical position of the title in the Plotly plot. Default is ``0.95``.
+ :type title_y: float, optional
+
+ :param top_margin: Top margin for the Plotly plot layout. Default is ``100``.
+ :type top_margin: int, optional
+
+    :param image_path_png: Directory path to save the PNG file of the Matplotlib plot. Default is ``None``.
+    :type image_path_png: str, optional
+
+    :param image_path_svg: Directory path to save the SVG file of the Matplotlib plot. Default is ``None``.
+    :type image_path_svg: str, optional
+
+ :param show_cbar: Whether to display the color bar in the Matplotlib plot. Default is ``True``.
+ :type show_cbar: bool, optional
+
+ :param grid_resolution: The resolution of the grid for computing partial dependence. Default is ``20``.
+ :type grid_resolution: int, optional
+
+ :param left_margin: Left margin for the Plotly plot layout. Default is ``20``.
+ :type left_margin: int, optional
+
+ :param right_margin: Right margin for the Plotly plot layout. Default is ``65``.
+ :type right_margin: int, optional
+
+ :param label_fontsize: Font size for axis labels in the Matplotlib plot. Default is ``8``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for tick labels in the Matplotlib plot. Default is ``6``.
+ :type tick_fontsize: int, optional
+
+ :param enable_zoom: Whether to enable zooming in the Plotly plot. Default is ``True``.
+ :type enable_zoom: bool, optional
+
+ :param show_modebar: Whether to display the mode bar in the Plotly plot. Default is ``True``.
+ :type show_modebar: bool, optional
+
+ :raises ValueError:
+ - If `plot_type` is not one of ``"static"``, ``"interactive"``, or ``"both"``.
+ - If `plot_type` is ``"interactive"`` or ``"both"`` and ``html_file_path`` or ``html_file_name`` are not provided.
+
+ :returns: ``None``
+ This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+ :notes:
+ - This function handles warnings related to scikit-learn's ``partial_dependence`` function, specifically a ``FutureWarning`` related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+ - To maintain compatibility with different versions of scikit-learn, the function attempts to use ``"values"`` for grid extraction in newer versions and falls back to ``"grid_values"`` for older versions.
+
+
+3D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California [4]_. Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
+For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between ``AveOccup`` and ``HouseAge`` in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
+This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
+Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
+
+Static Plot
+^^^^^^^^^^^^^^^^^
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=1,
+ )
+ model.fit(X_train, y_train)
+
+**Create Static 3D Partial Dependence Plot**
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8, 5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Interactive Plot
+^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+ )
+
+.. warning::
+
+ **Scrolling Notice:**
+
+ While interacting with the interactive Plotly plot below, scrolling down the
+ page using the mouse wheel may be blocked when the mouse pointer is hovering
+ over the plot. To continue scrolling, either move the mouse pointer outside
+ the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+.. raw:: html
+
+
+
+
+
+
+This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here's how it differs from
+generating a static plot using Matplotlib.
+
+**Key Differences**
+
+**Plot Type**:
+
+- The ``plot_type`` is set to ``"interactive"`` for the Plotly plot and ``"static"`` for the Matplotlib plot.
+
+**Interactive-Specific Parameters**:
+
+- **HTML File Path and Name**: The ``html_file_path`` and ``html_file_name`` parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
+- **Zoom and Positioning**: The interactive plot includes parameters like ``zoom_out_factor``, ``title_x``, ``cbar_x``, and ``cbar_thickness`` to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
+- **Mode Bar and Zoom**: The ``show_modebar`` and ``enable_zoom`` parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+**Static-Specific Parameters**:
+
+- **Figure Size and Wireframe Color**: The static plot uses parameters like ``figsize`` to control the size of the Matplotlib plot and ``wireframe_color`` to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
+.. [2] Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 <https://doi.org/10.21105/joss.03021>`_.
+
+.. [3] Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 <https://doi.org/10.1109/MCSE.2007.55>`_.
+
+.. [4] Pace, R. K., & Barry, R. (1997). *Sparse Spatial Autoregressions*. *Statistics & Probability Letters*, 33(3), 291-297. `https://doi.org/10.1016/S0167-7152(96)00140-X <https://doi.org/10.1016/S0167-7152(96)00140-X>`_.
+
diff --git a/_build/html/v0.0.11/_sources/getting_started.rst.txt b/_build/html/v0.0.11/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..bf69e01b4
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/getting_started.rst.txt
@@ -0,0 +1,136 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.11``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``jinja2``: version ``3.1.4`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``nbformat``: version ``4.2.0`` or higher
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``plotly``: version ``5.18.0`` or higher
+- ``scikit-learn``: version ``1.0.2`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
+Description
+===============
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+.. [#] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.11/_sources/index.rst.txt b/_build/html/v0.0.11/_sources/index.rst.txt
new file mode 100644
index 000000000..5f0dc6c56
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/index.rst.txt
@@ -0,0 +1,57 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Data Management
+
+ data_management
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Plotting Heuristics
+
+ eda_plots
+
+.. toctree::
+ :maxdepth: 4
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ contributors
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.11/_sources/references.rst.txt b/_build/html/v0.0.11/_sources/references.rst.txt
new file mode 100644
index 000000000..335337c3a
--- /dev/null
+++ b/_build/html/v0.0.11/_sources/references.rst.txt
@@ -0,0 +1,33 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Fixes a TypeError in the stacked_crosstab_plot function when save_formats is None. The update ensures that save_formats defaults to an empty list, preventing iteration over a NoneType object.
+
Changes
+
+
Initializes save_formats as an empty list if not provided.
+
Adds handling for string and tuple input types for save_formats.
The legend is now displayed only if there are valid legend handles (len(handles)>0) and if show_legend is set to True.
+
The check ax.get_legend().remove() ensures that unnecessary legends are removed if they are empty or if show_legend is set to False.
+
+
Error Handling
+- Error handling in the except block has been enhanced to ensure that any exceptions related to legends or labels are managed properly. The legend handling logic still respects the show_legend flag even in cases where exceptions occur.
+
This update prevents empty legend squares from appearing and maintains the intended default behavior of showing legends only when they contain relevant content.
Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
Visualization Enhancements
+
DataFrame Columns: Added a ``background_color`` variable to ``dataframe_columns``,
+allowing the user to enter a string representing a color name or hex value.
+Try/except on the output, in case the end user has a deprecated version of Pandas,
+where the styler would use ``hide_index()`` instead of ``hide()``. The highlighted
+columns allow for easier null versus unique value analysis.
+
The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
Args:
+
+
The df argument is specified as a pandas.DataFrame.
+
The background_color argument is marked as optional, with a brief description of its role.
+
The return_df argument is also marked as optional, explaining what it controls.
+
+
Returns: The return type is specified as pandas.DataFrame, with a clear explanation of the difference based on the return_df flag.
+
KDE Distribution Plots: Improved kde_distributions() with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
Scatter Plots: Enhanced scatter_fit_plot() with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
Flexible ``save_formats`` Input:
+- ``save_formats`` now accepts a string, tuple, or list for specifying formats (e.g., ``"png"``, ``("png", "svg")``, or ``["png", "svg"]``).
+- Single strings or tuples are automatically converted to lists for consistent processing.
+
Dynamic Error Handling:
+- Added checks to ensure a valid path is provided for each format in save_formats.
+- Raises a ValueError if a format is specified without a corresponding path, with a clear, dynamic error message.
+
Improved Plot Saving Logic:
+- Updated logic allows saving plots in one format (e.g., only “png” or “svg”) without requiring the other.
+- Simplified and more intuitive path handling for saving plots.
This update introduces several key changes to the plot_3d_pdp function, simplifying the function’s interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
1. Parameter Changes
+
+
Removed Parameters:
+
+
The parameters x_label_plotly, y_label_plotly, and z_label_plotly have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general x_label, y_label, and z_label. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+
+
Default Values for Labels:
+
+
The parameters x_label, y_label, and z_label are now optional, with None as the default. If not provided, these labels will automatically default to the names of the features in the feature_names_list. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+
+
Changes in Default Values for View Angles:
+
+
The default values for camera positioning parameters have been updated: horizontal is now -1.25, depth is now 1.25, and vertical is now 1.25. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+
+
+
2. Plot Generation Logic
+
+
Conditionally Checking Labels:
+
+
The function now checks whether x_label, y_label, and z_label are provided. If these are None, the function will automatically assign default labels based on the feature_names_list. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+
+
Camera Position Adjustments:
+
+
The camera positions for the Plotly plot are now adjusted by multiplying horizontal, depth, and vertical by zoom_out_factor. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+
+
Surface Plot Coordinates Adjustments:
+
+
The order of the coordinates for the Plotly plot’s surface has been changed from ZZ,XX,YY[::-1] to ZZ,XX,YY. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+
+
+
3. Code Simplifications
+
+
Removed Complexity:
+
+
By removing the x_label_plotly, y_label_plotly, and z_label_plotly parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+
+
Fallback Mechanism for Grid Values:
+
+
The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+
+
+
4. Style Adjustments
+
+
Label Formatting:
+
+
The new version consistently uses y_label, x_label, and z_label for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+
+
Color Bar Adjustments:
+
+
The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of 0.6 and a pad value of 0.02. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+
+
+
5. Potential Use Case Differences
+
+
Simplified Interface:
+
+
The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+
+
Less Granular Control:
+
+
Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the *_plotly label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+
+
+
6. Matplotlib Plot Adjustments
+
+
Wireframe and Surface Plot Enhancements:
+
+
The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+
+
+
Summary
+
+
Version 0.0.8d of the plot_3d_pdp function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+
Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+
Decision Point
+
+
This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
Key Updates in 0.0.8c:
+
+
Bug Fixes: Addressed minor issues identified in version 0.0.8b to ensure smoother performance and better user experience.
+
Additional Testing: Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+
Refinements: Made small enhancements to existing features based on user feedback and internal testing results.
+
+
Summary of Changes
+
+
New Features & Enhancements
+
+
+
plot_3d_pdp Function:
+
+
Added show_modebar Parameter: Introduced a new boolean parameter, show_modebar, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
Custom Margins and Layout Adjustments:
+
+
Added parameters for left_margin, right_margin, and top_margin to provide users with more control over the plot layout in Plotly.
+
Adjusted default values and added options for better customization of the Plotly color bar (cbar_x, cbar_thickness) and title positioning (title_x, title_y).
+
+
+
Plotly Configuration:
+
+
Enhanced the configuration options to allow users to enable or disable zoom functionality (enable_zoom) in the interactive Plotly plots.
+
Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+
+
Error Handling:
+
+
Added input validation for html_file_path and html_file_name to ensure these are provided when necessary based on the selected plot_type.
+
+
+
+
+
plot_2d_pdp Function:
+
+
Introduced file_prefix Parameter:
+
+
Added a new file_prefix parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+
+
Enhanced Plot Type Flexibility:
+
+
The plot_type parameter now includes an option to generate both grid and individual plots (both). This feature allows users to create a combination of both layout styles in one function call.
+
Updated input validation and logic to handle this new option effectively.
+
+
+
Added save_plots Parameter:
+
+
Introduced a new parameter, save_plots, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+
+
Custom Margins and Layout Adjustments:
+
+
Included the save_plots parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+
+
+
+
+
+
Documentation Updates
+
+
+
Docstrings:
+
+
Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
Detailed the use of new parameters such as show_modebar, file_prefix, save_plots, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+
+
+
+
Refactoring & Code Cleanup
+
+
+
Code Structure:
+
+
Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
Version 0.0.8b is an exact replica of version 0.0.8a. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
New Features:
+
+
Optional file_prefix in stacked_crosstab_plot Function
+
+
The stacked_crosstab_plot function has been updated to make the file_prefix argument optional. If the user does not provide a file_prefix, the function will now automatically generate a default prefix based on the col and func_col parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
Key Improvement:
+
+
Users can now omit the file_prefix argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
Backward compatibility is maintained, allowing users who prefer to specify a custom file_prefix to continue doing so without any issues.
+
+
+
+
+
Introduction of 3D and 2D Partial Dependence Plot Functions
+
+
Two new functions, plot_3d_pdp and plot_2d_pdp, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+
plot_3d_pdp: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
plot_2d_pdp: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+
+
Key Features:
+
+
Compatibility: Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
Customization: Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
Interactive 3D Plots: The plot_3d_pdp function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+
+
+
+
+
Impact:
+
+
These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+
The optional file_prefix enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+
The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
Added Function for Customizable Correlation Matrix Visualization
+
This release introduces a new function, flex_corr_matrix, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to seaborn.heatmap().
+
Summary of Changes
+
+
New Function: flex_corr_matrix.
+
+
Functionality:
+- Generates a correlation heatmap for a given DataFrame.
+- Supports both full and upper triangular correlation matrices based on the triangular parameter.
+- Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+- Accepts additional keyword arguments via **kwargs to pass directly to seaborn.heatmap().
+- Includes validation to ensure the triangular, annot, and save_plots parameters are boolean values.
+- Raises an exception if save_plots=True but neither image_path_png nor image_path_svg is specified.
+
+
+
+
Usage
+
# Full correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=False,cmap="coolwarm",annot=True)
+
+# Upper triangular correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=True,cmap="coolwarm",annot=True)
+
+
+
Contingency table df to object type
+
Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python 3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python 3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+import os  # import operating system for dir
+
+
+base_path = os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path = os.path.join(os.pardir, "data")
+data_output = os.path.join(os.pardir, "data_output")
+
+# create image paths
+image_path_png = os.path.join(base_path, "images", "png_images")
+image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str, optional) – The name of the new column for the IDs. Defaults to "ID".
+
num_digits (int, optional) – The number of digits for the unique IDs. Defaults to 9.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
+
Note
+
+
If the dataframe index is not unique, a warning is printed.
+
+
The function does not check if the number of rows exceeds the number of
unique IDs that can be generated with the specified number of digits.
+
+
+
+
The first digit of the generated IDs is ensured to be non-zero.
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df = add_ids(
+    df=df,
+    id_colname="census_id",
+    num_digits=9,
+    seed=111,
+    set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data = {
+    "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+}
+df = pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df = strip_trailing_period(df=df, column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: The last row shows 6. (an integer followed by a trailing period) before cleaning, and 6.0 after its conversion to a properly formatted float.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+# Standardize the date strings
+standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data = {
+    "date_column": [
+        "31/12/2021",
+        "01/01/2022",
+        "12/31/2021",
+        "13/02/2022",
+        "07/04/2022",
+    ],
+    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+    "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+}
+
+df = pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
Analyze DataFrame columns to provide summary statistics such as data type,
+null counts, unique values, and most frequent values.
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
background_color (str, optional) – Hex color code or color name for background styling in the output
+DataFrame. Defaults to None.
+
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
+False, returns a styled DataFrame for visual presentation. Defaults to False.
+
+
+
Returns:
+
If return_df is True, returns the plain DataFrame containing column summary
+statistics. If return_df is False, returns a styled DataFrame with optional
+background color for specific columns.
This column indicates the total number of unique values present in each column of the DataFrame. It measures the distinct values that a column holds. For example, in the age column, there are 74 unique values, meaning the ages vary across 74 distinct entries.
+
+
max_unique_value
This column shows the most frequently occurring value in each column. For example, in the workclass column, the most common value is Private, indicating that this employment type is the most represented in the dataset. For numeric columns like capital-gain and capital-loss, the most common value is 0, which suggests that the majority of individuals have no capital gain or loss.
+
+
max_unique_value_total
This represents the count of the most frequently occurring value in each column. For instance, in the native-country column, the value United-States appears 43,832 times, indicating that the majority of individuals in the dataset are from the United States.
+
+
max_unique_value_pct
This column shows the percentage that the most frequent value constitutes of the total number of rows. For example, in the race column, the value White makes up 85.5% of the data, suggesting a significant majority of the dataset belongs to this racial group.
unique_values_total is calculated using the nunique() function, which counts the number of unique values in a column.
+
max_unique_value is determined by finding the value with the highest frequency using value_counts(). For string columns, any missing values (if present) are replaced with the string "null" before computing the frequency.
+
max_unique_value_total is the frequency count of the max_unique_value.
+
max_unique_value_pct is the percentage of max_unique_value_total divided by the total number of rows in the DataFrame, providing an idea of how dominant the most frequent value is.
+
+
This analysis helps in identifying columns with a high proportion of dominant values, like <=50K in the income column, which appears 24,720 times, making up 50.61% of the entries. This insight can be useful for understanding data distributions, identifying potential data imbalances, or even spotting opportunities for feature engineering in further data processing steps.
+
+
+
+
Generating Summary Tables for Variable Combinations
+
This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.
The function will create an Excel file with a sheet for each combination
of the specified variables, as well as a “Table of Contents” sheet with
+hyperlinks to each summary table.
+
+
+
+
The sheet names are limited to 31 characters due to Excel’s constraints.
+
+
+
The function returns two outputs:
+
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars = [
+    "age_group",
+    "workclass",
+    "education",
+    "occupation",
+    "race",
+    "sex",
+    "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables, all_combinations = summarize_all_combinations(
+    df=df,
+    data_path=data_output,
+    variables=unique_vars,
+    data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
+
+
+
+
Note
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name = "df_census.xlsx"  # Name of the output Excel file
+file_path = os.path.join(data_path, file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+df_dict = {
+    "original_df": df,
+    "ages_18_to_40": filtered_df,
+}
+
+save_dataframes_to_excel(
+    file_path=file_path,
+    df_dict=df_dict,
+    decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
Create a contingency table from one or more columns in a DataFrame, with sorting options.
+
This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the contingency_table function.
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame containing the contingency table with the specified columns, a 'Total' column representing the count of occurrences, and a 'Percentage' column representing the percentage of the total count.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df = highlight_columns(
+    df=df,
+    columns=["age", "education"],
+    color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or “bins.” This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or “bins,” and
+then assigning each value to one of these intervals.
+
+
Note
+
The code snippets below create age bins and assign a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
+
+
Below, we use the age column of the census data [1] from the UCI Machine Learning Repository as an example:
+
+
Bins Definition:
+The bins are defined by specifying the boundaries of each interval. For example,
+in the code snippet below, the bin_ages list specifies the boundaries for age groups:
These labels are used to categorize the numerical values into meaningful groups.
+
+
Applying the Binning:
+The pd.cut function
+from Pandas is used to apply the binning process. For each value in the age
+column of the DataFrame, it assigns a corresponding label based on which bin the
+value falls into. Here, right=False indicates that each bin includes the
+left endpoint but excludes the right endpoint. For example, if bin_ages =
+[0, 10, 20, 30], then a value of 10 will fall into the bin [10, 20) and
+be labeled accordingly.
The parameter right=False in pd.cut means that the bins are left-inclusive
+and right-exclusive, except for the last bin, which is always right-inclusive
+when the upper bound is infinity (float("inf")).
The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
The Pearson correlation coefficient, often denoted as \(r\), is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from \(-1\) to \(1\), where:
+
+
\(r = 1\) indicates a perfect positive linear relationship.
+
\(r = -1\) indicates a perfect negative linear relationship.
+
\(r = 0\) indicates no linear relationship.
+
+
The Pearson correlation coefficient between two variables \(X\) and \(Y\) is defined as:
This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between \(X\) and \(Y\).
+
+
\(r > 0\): Positive correlation. As \(X\) increases, \(Y\) tends to increase.
+
\(r < 0\): Negative correlation. As \(X\) increases, \(Y\) tends to decrease.
+
\(r = 0\): No linear correlation. There is no consistent linear relationship between \(X\) and \(Y\).
+
+
The closer the value of \(r\) is to \(\pm 1\), the stronger the linear relationship between the two variables.
Let \(\mathbf{X}\) represent the complete set of input features for a machine
+learning model, where \(\mathbf{X} = \{X_1, X_2, \dots, X_p\}\). Suppose we’re
+particularly interested in a subset of these features, denoted by \(\mathbf{X}_S\).
+The complementary set, \(\mathbf{X}_C\), contains all the features in \(\mathbf{X}\)
+that are not in \(\mathbf{X}_S\). Mathematically, this relationship is expressed as:
where \(\mathbf{X}_C\) is the set of features in \(\mathbf{X}\) after
+removing the features in \(\mathbf{X}_S\).
+
Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in \(\mathbf{X}_S\) on the model’s predictions, while averaging out the
+influence of the features in \(\mathbf{X}_C\). This is mathematically defined as:
\(\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]\) indicates that we are taking the expected value over the possible values of the features in the set \(\mathbf{X}_C\).
+
\(p(x_C)\) represents the probability density function of the features in \(\mathbf{X}_C\).
+
+
This operation effectively summarizes the model’s output over all potential values of the complementary features, providing a clear view of how the features in \(\mathbf{X}_S\) alone impact the model’s predictions.
+
2D Partial Dependence Plots
+
Consider a trained machine learning model \(f(\mathbf{X})\), where \(\mathbf{X} = (X_1, X_2, \dots, X_p)\) represents the vector of input features. The partial dependence of the predicted response \(\hat{y}\) on a single feature \(X_j\) is defined as:
\(\mathbf{X}_{C_i}\) represents the complement set of \(X_j\), meaning the remaining features in \(\mathbf{X}\) not included in \(X_j\) for the \(i\)-th instance.
+
\(n\) is the number of observations in the dataset.
+
+
For two features, \(X_j\) and \(X_k\), the partial dependence is given by:
This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of \(X_j\) and \(X_k\) vary, while the effects of the other features are averaged out.
+
+
Single Feature PDP: When plotting \(\text{PD}(X_j)\), the result is a 2D line plot showing the marginal effect of feature \(X_j\) on the predicted outcome, averaged over all possible values of the other features.
+
Two Features PDP: When plotting \(\text{PD}(X_j, X_k)\), the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of \(X_j\) and \(X_k\) on the predicted outcome. The surface represents the expected value of the prediction as \(X_j\) and \(X_k\) vary, while all other features are averaged out.
+
+
3D Partial Dependence Plots
+
For a more comprehensive analysis, especially when exploring interactions between two features, 3D Partial Dependence Plots are invaluable. The partial dependence function for two features in a 3D context is:
Here, the function \(f(X_j, X_k, \mathbf{X}_{C_i})\) is evaluated across a grid of values for \(X_j\) and \(X_k\). The resulting 3D surface plot represents how the model’s prediction changes over the joint range of these two features.
+
The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model’s predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+
Feature Interaction Visualization: The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+
Enhanced Interpretation: 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
+
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
+
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
+
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
+
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
+
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
+
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
+
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
+
kwargs (additional keyword arguments) – Additional keyword arguments passed to the Seaborn plotting function.
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The plot_type="both" parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
+
Note
+
If you do not set n_rows or n_cols to any values, the function will
+automatically calculate and create a grid based on the number of variables being
+plotted, ensuring an optimal arrangement of the plots.
+
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both",# Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+plot_type="hist", meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1, n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The y-axis label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to "Count" to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1, n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is customized to generate
+histograms that include mean and median lines. The mean_color is set to "blue"
+and the median_color is set to "black", allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted (plot_mean=True,plot_median=True).
+The y_axis_label remains "Density", indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using hist_color="brown", with a fill_alpha=0.60, while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns (n_rows=1,n_cols=3), and
+the grid size is set to 15x5 inches. This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
Histogram Example - (Mean, Median, and Std. Deviation)
+
In this example, the kde_distributions() function is customized to generate
+a histogram that includes mean, median, and 3 standard deviation lines. The
+mean_color is set to "blue" and the median_color is set to "black",
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted (plot_mean=True,plot_median=True). The y_axis_label remains
+"Density", indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using hist_color="brown",
+with a fill_alpha=0.40, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors "purple",
+"green", and "silver" for one, two, and three standard deviations, respectively.
+
The layout is configured with a single row and multiple columns (n_rows=1,n_cols=3),
+and the grid size is set to 15x5 inches. This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+
Note
+
You have the freedom to choose whether to plot the mean, median, and
+standard deviation lines. You can display one, none, or all of these simultaneously.
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
kind (str, optional) – The kind of plot to generate ('bar' or 'barh' for horizontal bars), default is 'bar'.
+
width (float, optional) – The width of the bars in the bar plot, default is 0.9.
+
rot (int, optional) – The rotation angle of the x-axis labels, default is 0.
+
custom_order (list, optional) – Specifies a custom order for the categories in the col.
+
image_path_png (str, optional) – Directory path where generated PNG plot images will be saved.
+
image_path_svg (str, optional) – Directory path where generated SVG plot images will be saved.
+
save_formats (list, optional) – List of file formats to save the plot images in. Valid formats are 'png' and 'svg'. If not provided, defaults to an empty list and no images will be saved.
+
color (list, optional) – List of colors to use for the plots. If not provided, a default color scheme is used.
+
output (str, optional) – Specify the output type: "plots_only", "crosstabs_only", or "both". Default is "both".
+
return_dict (bool, optional) – Specify whether to return the crosstabs dictionary, default is False.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
+
Note
+
The legend_labels_list parameter should be a list of lists, where each
+inner list corresponds to the ground truth labels for the respective item in
+the func_col list. Each element in the func_col list represents a
+column in your DataFrame that you wish to analyze, and the corresponding
+inner list in legend_labels_list should contain the labels that will be
+used in the legend of your plots.
+
+
For example:
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],# Corresponds to "sex"
+ ["<=50K",">50K"],# Corresponds to "income"
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Important
+
Ensure that the number of elements in func_col, legend_labels_list,
+and title are the same. Each item in func_col must have a corresponding
+list of labels in legend_labels_list and a title in title. This
+consistency is essential for the function to correctly generate the plots
+with the appropriate labels and titles.
+
+
In this example:
+
+
func_col contains two elements: "sex" and "income". Each corresponds to a specific column in your DataFrame.
+
legend_labels_list is a nested list containing two inner lists:
+
+
+
The first inner list, ["Male","Female"], corresponds to the "sex" column in func_col.
+
The second inner list, ["<=50K",">50K"], corresponds to the "income" column in func_col.
+
+
+
+
title contains two elements: "Sex" and "Income", which will be used as the titles for the respective plots.
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plot function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is provided and is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the x-axis labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
+
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
+
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
+
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
+
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
+
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
+
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
If all_vars is provided and either x_vars or y_vars is also provided.
+
If neither all_vars nor both x_vars and y_vars are provided.
+
If hue_palette is specified without hue.
+
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set but no image paths are provided.
+
If rotate_plot is not a boolean value.
+
If individual_figsize or grid_figsize are not tuples/lists with two numeric values.
+
+
+
+
Returns:
+
None. This function does not return any value but generates and optionally saves scatter plots for the specified x_vars and y_vars, or for all combinations of variables in all_vars if it is provided.
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
In this example, the scatter_fit_plot function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the df DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+
All Variables Combination: The all_vars parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don’t need to manually specify x_vars and y_vars, as the function will iterate through each possible pair.
+
Grid Display: The show_plot parameter is set to "grid", so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
Font Sizes: The label_fontsize and tick_fontsize parameters are set to 14 and 12, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
Best Fit Line: The add_best_fit_line parameter is set to True, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
Scatter Color: The scatter_color parameter is set to "#808080", applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
Correlation Coefficient: The show_correlation parameter is set to True, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+
These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are saved in a grid format, with added best fit lines and correlation coefficients for deeper analysis. The resulting images can be stored in the specified directory for future reference.
Generate and Save Customizable Correlation Heatmaps
+
The flex_corr_matrix function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
Customizable Plot Appearance
+
The function provides extensive customization options for the heatmap’s appearance:
+
+
Colormap Selection: Choose from a variety of colormaps to represent the strength of correlations. The default is "coolwarm", but this can be adjusted to fit the needs of the analysis.
+
Annotation: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
Figure Size and Layout: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+
Triangular vs. Full Correlation Matrix
+
A key feature of the flex_corr_matrix function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
Label and Axis Configuration
+
The function offers flexibility in configuring axis labels and titles:
+
+
Label Rotation: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+
Font Sizes: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+
Title Wrapping: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+
Plot Display and Saving Options
+
The flex_corr_matrix function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
The provided code filters the census [1] DataFrame df to include only numeric columns using
+select_dtypes(np.number). It then utilizes the flex_corr_matrix() function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+
Note
+
This triangular matrix format is particularly useful for avoiding
+redundancy in correlation matrices, as it excludes the lower half,
+making it easier to focus on unique pairwise correlations.
+
+
The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
# Select only numeric data to pass into the function
+df_num = df.select_dtypes(np.number)
+
In this modified census [1] example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting cmap="viridis", the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting triangular=False, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
Partial Dependence Plots (PDPs) are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model’s predictions.
The plot_2d_pdp function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+
Grid and Individual Plots: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+
Customization Options: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+
Saving Plots: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
X_train (pandas.DataFrame or numpy.ndarray) – The training data used to compute partial dependence. Should correspond to the features used to train the model.
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDP of house value on CA non-location features".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
If you extend this to two interacting features, such as AveOccup and HouseAge,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
Fetch The CA Housing Dataset and Prepare The DataFrame
The plot_3d_pdp function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+
Interactive and Static 3D Plots: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+
Colormap and Layout Customization: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+
Axis and Title Configuration: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
Generate 3D partial dependence plots for two features of a machine learning model.
+
This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
+- "static": Generate only static Matplotlib plots.
+- "interactive": Generate only interactive Plotly plots.
+- "both": Generate both static and interactive plots. Default is "both".
+
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
If plot_type is not one of "static", "interactive", or "both".
+
If plot_type is "interactive" or "both" and html_file_path or html_file_name are not provided.
+
+
+
+
Returns:
+
None
+This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+
Notes:
+
+
This function handles warnings related to scikit-learn’s partial_dependence function, specifically a FutureWarning related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+
To maintain compatibility with different versions of scikit-learn, the function attempts to use "values" for grid extraction in newer versions and falls back to "grid_values" for older versions.
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between AveOccup and HouseAge in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8,5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+)
+
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+)
+
+
+
+
Warning
+
Scrolling Notice:
+
While interacting with the interactive Plotly plot below, scrolling down the
+page using the mouse wheel may be blocked when the mouse pointer is hovering
+over the plot. To continue scrolling, either move the mouse pointer outside
+the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+
+
This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here’s how it differs from
+generating a static plot using Matplotlib.
+
Key Differences
+
Plot Type:
+
+
The plot_type is set to "interactive" for the Plotly plot and "static" for the Matplotlib plot.
+
+
Interactive-Specific Parameters:
+
+
HTML File Path and Name: The html_file_path and html_file_name parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
Zoom and Positioning: The interactive plot includes parameters like zoom_out_factor, title_x, cbar_x, and cbar_thickness to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
Mode Bar and Zoom: The show_modebar and enable_zoom parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+
Static-Specific Parameters:
+
+
Figure Size and Wireframe Color: The static plot uses parameters like figsize to control the size of the Matplotlib plot and wireframe_color to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+
By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.11/genindex.html b/_build/html/v0.0.11/genindex.html
new file mode 100644
index 000000000..66031b85a
--- /dev/null
+++ b/_build/html/v0.0.11/genindex.html
@@ -0,0 +1,359 @@
+
+
+
+
+
+
+
+ Index — EDA Toolkit 0.0.11 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
+
+
+
+
+
+
+
Index
+
+
+ A
+ | B
+ | C
+ | D
+ | E
+ | F
+ | H
+ | K
+ | P
+ | S
+
+
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.11.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.5/_sources/changelog.rst.txt b/_build/html/v0.0.5/_sources/changelog.rst.txt
new file mode 100644
index 000000000..f1ab527bc
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/changelog.rst.txt
@@ -0,0 +1,279 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+Version 0.0.5
+---------------------------
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+   - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.5/_sources/citations.rst.txt b/_build/html/v0.0.5/_sources/citations.rst.txt
new file mode 100644
index 000000000..47f562314
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.4). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+ version = {0.0.4},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.5/_sources/getting_started.rst.txt b/_build/html/v0.0.5/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..0026890c9
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/getting_started.rst.txt
@@ -0,0 +1,121 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.5``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``jinja2``: version ``3.1.4`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
diff --git a/_build/html/v0.0.5/_sources/index.rst.txt b/_build/html/v0.0.5/_sources/index.rst.txt
new file mode 100644
index 000000000..2a42e87c4
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/index.rst.txt
@@ -0,0 +1,50 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 3
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 3
+ :caption: Usage Guide
+
+ usage_guide
+
+.. toctree::
+ :maxdepth: 3
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.5/_sources/references.rst.txt b/_build/html/v0.0.5/_sources/references.rst.txt
new file mode 100644
index 000000000..7254cfba5
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/references.rst.txt
@@ -0,0 +1,36 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+The ``eda_toolkit`` library
+
+References
+==========
+
+1. Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 `_.
+
+2. Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S `_.
+
+3. Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 `_.
+
+
diff --git a/_build/html/v0.0.5/_sources/usage_guide.rst.txt b/_build/html/v0.0.5/_sources/usage_guide.rst.txt
new file mode 100644
index 000000000..f60bdb193
--- /dev/null
+++ b/_build/html/v0.0.5/_sources/usage_guide.rst.txt
@@ -0,0 +1,2755 @@
+.. _usage_guide:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Description
+===========
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+
+Data Preparation and Management
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+ # Use the function to ensure 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs.
+ :type id_colname: str
+ :param num_digits: The number of digits for the unique IDs.
+ :type num_digits: int
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+ :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` The last row shows ``6.`` (a value entered with a trailing period) and its conversion to the `float` value ``6.0``.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either day/month/year or
+ month/day/year format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a day/month/year format. If the first part is 12 or less, the function
+ attempts to parse the date as month/day/year, falling back to day/month/year
+ if the former raises a ``ValueError`` due to an impossible date (e.g., month
+ being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df)
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+
+ :returns: A DataFrame with the analysis results for each column.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of unique variables to generate combinations.
+ :type variables: list
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum length of combinations to generate. Defaults to ``2``.
+ :type min_length: int
+
+ :returns: A dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+ :notes:
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order.
+ :type sort_by: int
+ :raises ValueError: If no columns are specified or if sort_by is not ``0`` or ``1``.
+ :returns: A DataFrame with the specified columns, ``'Total'``, and ``'Percentage'``.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.to_html()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+If your DataFrame (e.g., the census data [1]_)
+does not have age or any other numerical column of interest binned, you can
+apply the following binning logic to categorize the data. Below, we use the age
+column from the UCI Machine Learning Repository as an example:
+
+.. code-block:: python
+
+ # Create age bins so that the ages can be categorized
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+ # Create labels for the bins
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ # Categorize the ages and assign to a new variable
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+`Note:` This code snippet creates age bins and assigns a corresponding age group
+label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+and labels as needed for your specific data.
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, grid_figsize=(10, 8), single_figsize=(6, 4), kde=True, hist_color="#0000FF", kde_color="#FF0000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=1, n_cols=1, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_path_png=None, single_var_image_path_svg=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots.
+ :type vars_of_interest: list of str, optional
+ :param grid_figsize: Size of the overall grid figure, default is ``(10, 8)``.
+ :type grid_figsize: tuple, optional
+ :param single_figsize: Size of individual figures for each variable, default is ``(6, 4)``.
+ :type single_figsize: tuple, optional
+ :param kde: Whether to include KDE plots on the histograms, default is ``True``.
+ :type kde: bool, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where
+ ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid, default is ``1``.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid, default is ``1``.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_path_png: Directory path to save the PNG images of the separate distribution plots.
+ :type single_var_image_path_png: str, optional
+ :param single_var_image_path_svg: Directory path to save the SVG images of the separate distribution plots.
+ :type single_var_image_path_svg: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots.
+ The variable name will be appended to this filename.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: List of variable names to apply log scaling.
+ :type log_scale_vars: list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: number or pair of numbers, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``,
+ ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+
+ :raises UserWarning:
+ - If ``stat`` is set to 'count' while ``kde`` is ``True``, as it may produce misleading plots.
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+---------------------------
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``kde=True`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``single_figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+The ``plot_type="hist"`` parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+----------------------------
+
+In this example, the kde_distributions function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``kde=False``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ stat="density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+--------------------------
+
+In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to `"Count"` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of four columns (``n_rows=1, n_cols=4``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"],
+ ["<=50K", ">50K"],
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are rendered with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+ :alt: Stacked Bar Plot Age vs. Sex
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+ crosstab_age_sex = crosstabs_dict["sex"]
+ crosstab_age_income = crosstabs_dict["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_boxplot_comp, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_boxplot_comp: List of comparison categories (columns in df).
+ :type metrics_boxplot_comp: list of str
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is (``6, 4``).
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of None, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is specified but is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_boxplot_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with `3` rows and `4`
+columns, using the ``n_rows=3`` and ``n_cols=4`` parameters. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the ``save_plots`` option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+**Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.**
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars, y_vars, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None)
+
+ Create and save scatter plots or a grid of scatter plots for given x_vars
+ and y_vars, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the `x-axis`.
+ :type x_vars: list of str
+
+ :param y_vars: List of variable names to plot on the `y-axis`.
+ :type y_vars: list of str
+
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for `x-axis` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the `x-axis` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the `y-axis` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves scatter plots for the specified `x_vars` and `y_vars`.
+
+
+Regression-Centric Scatter Plots Example
+-----------------------------------------
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+-------------------------------------------
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.5.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+import os  # import operating system for dir
+
+
+base_path=os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path=os.path.join(os.pardir,"data")
+data_output=os.path.join(os.pardir,"data_output")
+
+# create image paths
+image_path_png=os.path.join(base_path,"images","png_images")
+image_path_svg=os.path.join(base_path,"images","svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str) – The name of the new column for the IDs.
+
num_digits (int) – The number of digits for the unique IDs.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df=add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data={
+ "values":[1.0,2.0,3.0,4.0,5.0,6.],
+}
+df=pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df=strip_trailing_period(df=df,column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: The last row shows the value 6. (an integer written with a trailing period) before cleaning, and its conversion to the float 6.0 after.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings=["15/04/2021","04/15/2021","01/12/2020","12/01/2020"]
+
+# Standardize the date strings
+standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data={
+ "date_column":[
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name":["Alice","Bob","Charlie","David","Eve"],
+ "amount":[100.0,150.5,200.75,250.25,300.0],
+}
+
+df=pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"]=df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
+
Returns:
+
A DataFrame with the analysis results for each column.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars=[
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables,all_combinations=summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
Notes:
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name="df_census.xlsx"# Name of the output Excel file
+file_path=os.path.join(data_path,file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df=df[(df["age"]>18)&(df["age"]<40)]
+
+df_dict={
+ "original_df":df,
+ "ages_18_to_40":filtered_df,
+}
+
+save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame with the specified columns, 'Total', and 'Percentage'.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df=highlight_columns(
+ df=df,
+ columns=["age","education"],
+ color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
If your DataFrame (e.g., the census data [1])
+does not have age or any other numerical column of interest binned, you can
+apply the following binning logic to categorize the data. Below, we use the age
+column from the UCI Machine Learning Repository as an example:
+
# Create age bins so that the ages can be categorized
+bin_ages=[
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+]
+
+# Create labels for the bins
+label_ages=[
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+]
+
+# Categorize the ages and assign to a new variable
+df["age_group"]=pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+)
+
+
+
Note: This code snippet creates age bins and assigns a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
+
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
+
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
+
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
+0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
+
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
+
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
+The variable name will be appended to this filename.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (number or pair of numbers, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
+'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The kde=True parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
The plot_type="hist" parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+kde=False, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1,n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The `y-axis` label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to “Count” to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1,n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],
+ ["<=50K",">50K"],
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plot function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is specified but is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_boxplot_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with 3 rows and 4
+columns, using the n_rows=3 and n_cols=4 parameters. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the `x-axis` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str) – List of variable names to plot on the x-axis.
+
y_vars (list of str) – List of variable names to plot on the y-axis.
+
n_rows (int) – Number of rows in the subplot grid.
+
n_cols (int) – Number of columns in the subplot grid.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.6/.buildinfo b/_build/html/v0.0.6/.buildinfo
new file mode 100644
index 000000000..7a677d526
--- /dev/null
+++ b/_build/html/v0.0.6/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: d2babcebc2afef2ccdc37520664ce5ca
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_build/html/v0.0.6/.doctrees/acknowledgements.doctree b/_build/html/v0.0.6/.doctrees/acknowledgements.doctree
new file mode 100644
index 000000000..36ce5baff
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/acknowledgements.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/changelog.doctree b/_build/html/v0.0.6/.doctrees/changelog.doctree
new file mode 100644
index 000000000..b3e4e1c74
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/changelog.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/citations.doctree b/_build/html/v0.0.6/.doctrees/citations.doctree
new file mode 100644
index 000000000..14098beba
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/citations.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/environment.pickle b/_build/html/v0.0.6/.doctrees/environment.pickle
new file mode 100644
index 000000000..d538a55e3
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/environment.pickle differ
diff --git a/_build/html/v0.0.6/.doctrees/getting_started.doctree b/_build/html/v0.0.6/.doctrees/getting_started.doctree
new file mode 100644
index 000000000..efbbe8da9
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/getting_started.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/index.doctree b/_build/html/v0.0.6/.doctrees/index.doctree
new file mode 100644
index 000000000..c56862182
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/index.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/references.doctree b/_build/html/v0.0.6/.doctrees/references.doctree
new file mode 100644
index 000000000..f907a7cca
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/references.doctree differ
diff --git a/_build/html/v0.0.6/.doctrees/usage_guide.doctree b/_build/html/v0.0.6/.doctrees/usage_guide.doctree
new file mode 100644
index 000000000..c9798bbd1
Binary files /dev/null and b/_build/html/v0.0.6/.doctrees/usage_guide.doctree differ
diff --git a/_build/html/v0.0.6/_images/Bar_Age_regular_income.svg b/_build/html/v0.0.6/_images/Bar_Age_regular_income.svg
new file mode 100644
index 000000000..6f8aa40d4
--- /dev/null
+++ b/_build/html/v0.0.6/_images/Bar_Age_regular_income.svg
@@ -0,0 +1,1201 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/Stacked_Bar_Age_income.svg b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income.svg
new file mode 100644
index 000000000..d5510308b
--- /dev/null
+++ b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income.svg
@@ -0,0 +1,1943 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_pivoted.svg b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_pivoted.svg
new file mode 100644
index 000000000..2147fce1a
--- /dev/null
+++ b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_pivoted.svg
@@ -0,0 +1,2043 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_regular.svg b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_regular.svg
new file mode 100644
index 000000000..04478581f
--- /dev/null
+++ b/_build/html/v0.0.6/_images/Stacked_Bar_Age_income_regular.svg
@@ -0,0 +1,1347 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/Stacked_Bar_Age_sex.svg b/_build/html/v0.0.6/_images/Stacked_Bar_Age_sex.svg
new file mode 100644
index 000000000..7b2bcb137
--- /dev/null
+++ b/_build/html/v0.0.6/_images/Stacked_Bar_Age_sex.svg
@@ -0,0 +1,1970 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/all_plots_comparisons_boxplot.svg b/_build/html/v0.0.6/_images/all_plots_comparisons_boxplot.svg
new file mode 100644
index 000000000..6f3173cff
--- /dev/null
+++ b/_build/html/v0.0.6/_images/all_plots_comparisons_boxplot.svg
@@ -0,0 +1,11585 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot.svg b/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot.svg
new file mode 100644
index 000000000..571a084b8
--- /dev/null
+++ b/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot.svg
@@ -0,0 +1,5868 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot_pivoted.svg b/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot_pivoted.svg
new file mode 100644
index 000000000..0e317ba63
--- /dev/null
+++ b/_build/html/v0.0.6/_images/all_plots_comparisons_violinplot_pivoted.svg
@@ -0,0 +1,5764 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/count_hist_distributions.svg b/_build/html/v0.0.6/_images/count_hist_distributions.svg
new file mode 100644
index 000000000..521cd5a95
--- /dev/null
+++ b/_build/html/v0.0.6/_images/count_hist_distributions.svg
@@ -0,0 +1,1719 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/eda_toolkit_logo.svg b/_build/html/v0.0.6/_images/eda_toolkit_logo.svg
new file mode 100644
index 000000000..d039d6f79
--- /dev/null
+++ b/_build/html/v0.0.6/_images/eda_toolkit_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/_build/html/v0.0.6/_images/hist_density_distributions.svg b/_build/html/v0.0.6/_images/hist_density_distributions.svg
new file mode 100644
index 000000000..8bf1787a6
--- /dev/null
+++ b/_build/html/v0.0.6/_images/hist_density_distributions.svg
@@ -0,0 +1,1744 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/kde_density_distributions.svg b/_build/html/v0.0.6/_images/kde_density_distributions.svg
new file mode 100644
index 000000000..7564724e1
--- /dev/null
+++ b/_build/html/v0.0.6/_images/kde_density_distributions.svg
@@ -0,0 +1,2571 @@
+
+
+
diff --git a/_build/html/v0.0.6/_images/scatter_plots_grid.png b/_build/html/v0.0.6/_images/scatter_plots_grid.png
new file mode 100644
index 000000000..5a51facd8
Binary files /dev/null and b/_build/html/v0.0.6/_images/scatter_plots_grid.png differ
diff --git a/_build/html/v0.0.6/_images/scatter_plots_grid_grouped.png b/_build/html/v0.0.6/_images/scatter_plots_grid_grouped.png
new file mode 100644
index 000000000..02a3b3916
Binary files /dev/null and b/_build/html/v0.0.6/_images/scatter_plots_grid_grouped.png differ
diff --git a/_build/html/v0.0.6/_images/summarize_combos.gif b/_build/html/v0.0.6/_images/summarize_combos.gif
new file mode 100644
index 000000000..402ee1efc
Binary files /dev/null and b/_build/html/v0.0.6/_images/summarize_combos.gif differ
diff --git a/_build/html/v0.0.6/_sources/acknowledgements.rst.txt b/_build/html/v0.0.6/_sources/acknowledgements.rst.txt
new file mode 100644
index 000000000..e62da5a10
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/acknowledgements.rst.txt
@@ -0,0 +1,30 @@
+.. _acknowledgements:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.6/_sources/changelog.rst.txt b/_build/html/v0.0.6/_sources/changelog.rst.txt
new file mode 100644
index 000000000..d8fa48bef
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/changelog.rst.txt
@@ -0,0 +1,296 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+Version 0.0.6
+---------------------------
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+Version 0.0.5
+---------------------------
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+ - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.6/_sources/citations.rst.txt b/_build/html/v0.0.6/_sources/citations.rst.txt
new file mode 100644
index 000000000..47f562314
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.4). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+      version = {0.0.4},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.6/_sources/getting_started.rst.txt b/_build/html/v0.0.6/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..cb0ee3d94
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/getting_started.rst.txt
@@ -0,0 +1,121 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.6``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``jinja2``: version ``3.1.4`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
diff --git a/_build/html/v0.0.6/_sources/index.rst.txt b/_build/html/v0.0.6/_sources/index.rst.txt
new file mode 100644
index 000000000..2a42e87c4
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/index.rst.txt
@@ -0,0 +1,50 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 3
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 3
+ :caption: Usage Guide
+
+ usage_guide
+
+.. toctree::
+ :maxdepth: 3
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.6/_sources/references.rst.txt b/_build/html/v0.0.6/_sources/references.rst.txt
new file mode 100644
index 000000000..7254cfba5
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/references.rst.txt
@@ -0,0 +1,36 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+The ``eda_toolkit`` library
+
+References
+==========
+
+1. Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 <https://doi.org/10.1109/MCSE.2007.55>`_.
+
+2. Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
+3. Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 <https://doi.org/10.21105/joss.03021>`_.
+
+
diff --git a/_build/html/v0.0.6/_sources/usage_guide.rst.txt b/_build/html/v0.0.6/_sources/usage_guide.rst.txt
new file mode 100644
index 000000000..f60bdb193
--- /dev/null
+++ b/_build/html/v0.0.6/_sources/usage_guide.rst.txt
@@ -0,0 +1,2755 @@
+.. _usage_guide:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Description
+===========
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+
+Data Preparation and Management
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+    # Use the function to ensure 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs.
+ :type id_colname: str
+ :param num_digits: The number of digits for the unique IDs.
+ :type num_digits: int
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+    :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` The last row shows the value ``6.`` (an `int` written with a trailing period) before cleaning, and its conversion to the `float` ``6.0`` after.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either day/month/year or
+ month/day/year format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a day/month/year format. If the first part is 12 or less, the function
+ attempts to parse the date as month/day/year, falling back to day/month/year
+ if the former raises a ``ValueError`` due to an impossible date (e.g., month
+ being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df)
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+
+ :returns: A DataFrame with the analysis results for each column.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of unique variables to generate combinations.
+ :type variables: list
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum length of combinations to generate. Defaults to ``2``.
+ :type min_length: int
+
+ :returns: A dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+ :notes:
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order.
+ :type sort_by: int
+ :raises ValueError: If no columns are specified or if sort_by is not ``0`` or ``1``.
+ :returns: A DataFrame with the specified columns, ``'Total'``, and ``'Percentage'``.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.to_html()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+If your DataFrame (e.g., the census data [1]_)
+does not have age or any other numerical column of interest binned, you can
+apply the following binning logic to categorize the data. Below, we use the age
+column from the UCI Machine Learning Repository as an example:
+
+.. code-block:: python
+
+ # Create age bins so that the ages can be categorized
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+ # Create labels for the bins
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ # Categorize the ages and assign to a new variable
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+`Note:` This code snippet creates age bins and assigns a corresponding age group
+label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+and labels as needed for your specific data.
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, grid_figsize=(10, 8), single_figsize=(6, 4), kde=True, hist_color="#0000FF", kde_color="#FF0000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=1, n_cols=1, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_path_png=None, single_var_image_path_svg=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots.
+ :type vars_of_interest: list of str, optional
+ :param grid_figsize: Size of the overall grid figure, default is ``(10, 8)``.
+ :type grid_figsize: tuple, optional
+ :param single_figsize: Size of individual figures for each variable, default is ``(6, 4)``.
+ :type single_figsize: tuple, optional
+ :param kde: Whether to include KDE plots on the histograms, default is ``True``.
+ :type kde: bool, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where
+ ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid, default is ``1``.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid, default is ``1``.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_path_png: Directory path to save the PNG images of the separate distribution plots.
+ :type single_var_image_path_png: str, optional
+ :param single_var_image_path_svg: Directory path to save the SVG images of the separate distribution plots.
+ :type single_var_image_path_svg: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots.
+ The variable name will be appended to this filename.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: List of variable names to apply log scaling.
+ :type log_scale_vars: list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: number or pair of numbers, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``,
+ ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+
+ :raises UserWarning:
+ - If ``stat`` is set to 'count' while ``kde`` is ``True``, as it may produce misleading plots.
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+---------------------------
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``kde=True`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``single_figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+The ``plot_type="hist"`` parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+----------------------------
+
+In this example, the ``kde_distributions`` function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``kde=False``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+        stat="density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+--------------------------
+
+In this example, the ``kde_distributions`` function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to ``"count"`` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+        stat="count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"],
+ ["<=50K", ">50K"],
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+   :alt: Stacked Bar Plot Age vs. Sex
+   :align: center
+   :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+    crosstab_age_sex = stacked_crosstabs["sex"]
+    crosstab_age_income = stacked_crosstabs["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_boxplot_comp, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_boxplot_comp: List of comparison categories (columns in df).
+ :type metrics_boxplot_comp: list of str
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is (``6, 4``).
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of None, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is specified but is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_boxplot_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with `3` rows and `4`
+columns, using the ``n_rows=3`` and ``n_cols=4`` parameters. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+    from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+**Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.**
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars, y_vars, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None)
+
+ Create and save scatter plots or a grid of scatter plots for given x_vars
+ and y_vars, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the `x-axis`.
+ :type x_vars: list of str
+
+ :param y_vars: List of variable names to plot on the `y-axis`.
+ :type y_vars: list of str
+
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for `x-axis` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the `x-axis` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the `y-axis` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves scatter plots for the specified `x_vars` and `y_vars`.
+
+
+Regression-Centric Scatter Plots Example
+-----------------------------------------
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+-------------------------------------------
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python 3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python 3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.6.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+import os  # import operating system for dir
+
+
+base_path = os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path = os.path.join(os.pardir, "data")
+data_output = os.path.join(os.pardir, "data_output")
+
+# create image paths
+image_path_png = os.path.join(base_path, "images", "png_images")
+image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str) – The name of the new column for the IDs.
+
num_digits (int) – The number of digits for the unique IDs.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df = add_ids(
+    df=df,
+    id_colname="census_id",
+    num_digits=9,
+    seed=111,
+    set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data = {
+    "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+}
+df = pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df = strip_trailing_period(df=df, column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: The last row shows the value 6. (a number with a trailing period) before processing, and its conversion to the float 6.0 after processing.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+# Standardize the date strings
+standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data = {
+    "date_column": [
+        "31/12/2021",
+        "01/01/2022",
+        "12/31/2021",
+        "13/02/2022",
+        "07/04/2022",
+    ],
+    "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+    "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+}
+
+df = pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
+
Returns:
+
A DataFrame with the analysis results for each column.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars = [
+    "age_group",
+    "workclass",
+    "education",
+    "occupation",
+    "race",
+    "sex",
+    "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables, all_combinations = summarize_all_combinations(
+    df=df,
+    data_path=data_output,
+    variables=unique_vars,
+    data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
Notes:
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name = "df_census.xlsx"  # Name of the output Excel file
+file_path = os.path.join(data_path, file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+df_dict = {
+    "original_df": df,
+    "ages_18_to_40": filtered_df,
+}
+
+save_dataframes_to_excel(
+    file_path=file_path,
+    df_dict=df_dict,
+    decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame with the specified columns, 'Total', and 'Percentage'.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df = highlight_columns(
+    df=df,
+    columns=["age", "education"],
+    color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
If your DataFrame (e.g., the census data [1])
+does not have age or any other numerical column of interest binned, you can
+apply the following binning logic to categorize the data. Below, we use the age
+column from the UCI Machine Learning Repository as an example:
+
# Create age bins so that the ages can be categorized
+bin_ages = [
+    0,
+    18,
+    30,
+    40,
+    50,
+    60,
+    70,
+    80,
+    90,
+    100,
+    float("inf"),
+]
+
+# Create labels for the bins
+label_ages = [
+    "< 18",
+    "18-29",
+    "30-39",
+    "40-49",
+    "50-59",
+    "60-69",
+    "70-79",
+    "80-89",
+    "90-99",
+    "100 +",
+]
+
+# Categorize the ages and assign to a new variable
+df["age_group"] = pd.cut(
+    df["age"],
+    bins=bin_ages,
+    labels=label_ages,
+    right=False,
+)
+
+
+
Note: This code snippet creates age bins and assigns a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
+
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
+
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
+
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
+0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
+
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
+
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
+The variable name will be appended to this filename.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (number or pair of numbers, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
+'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The kde=True parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
The visualizations are arranged in a single row of four columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
The plot_type="hist" parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+kde=False, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of four columns (n_rows=1,n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The `y-axis` label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to “Count” to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of four columns (n_rows=1,n_cols=4),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],
+ ["<=50K",">50K"],
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plotfunction. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is specified but is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_boxplot_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with 3 rows and 4
+columns, using the n_rows=3 and n_cols=4 parameters. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the `x-axis` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.s
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str) – List of variable names to plot on the x-axis.
+
y_vars (list of str) – List of variable names to plot on the y-axis.
+
n_rows (int) – Number of rows in the subplot grid.
+
n_cols (int) – Number of columns in the subplot grid.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.7/.buildinfo b/_build/html/v0.0.7/.buildinfo
new file mode 100644
index 000000000..3935fecaa
--- /dev/null
+++ b/_build/html/v0.0.7/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 71b627a2c469b41c409ab10b07f4ed3e
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_build/html/v0.0.7/.doctrees/acknowledgements.doctree b/_build/html/v0.0.7/.doctrees/acknowledgements.doctree
new file mode 100644
index 000000000..74bc543a3
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/acknowledgements.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/changelog.doctree b/_build/html/v0.0.7/.doctrees/changelog.doctree
new file mode 100644
index 000000000..4d2a2bbc8
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/changelog.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/citations.doctree b/_build/html/v0.0.7/.doctrees/citations.doctree
new file mode 100644
index 000000000..a4cbeb7c1
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/citations.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/contributors.doctree b/_build/html/v0.0.7/.doctrees/contributors.doctree
new file mode 100644
index 000000000..ef218785e
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/contributors.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/environment.pickle b/_build/html/v0.0.7/.doctrees/environment.pickle
new file mode 100644
index 000000000..22cbdfb1e
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/environment.pickle differ
diff --git a/_build/html/v0.0.7/.doctrees/getting_started.doctree b/_build/html/v0.0.7/.doctrees/getting_started.doctree
new file mode 100644
index 000000000..9649cf6b2
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/getting_started.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/index.doctree b/_build/html/v0.0.7/.doctrees/index.doctree
new file mode 100644
index 000000000..15a59a929
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/index.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/references.doctree b/_build/html/v0.0.7/.doctrees/references.doctree
new file mode 100644
index 000000000..c3e00e723
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/references.doctree differ
diff --git a/_build/html/v0.0.7/.doctrees/usage_guide.doctree b/_build/html/v0.0.7/.doctrees/usage_guide.doctree
new file mode 100644
index 000000000..006912c7d
Binary files /dev/null and b/_build/html/v0.0.7/.doctrees/usage_guide.doctree differ
diff --git a/_build/html/v0.0.7/_images/Bar_Age_regular_income.svg b/_build/html/v0.0.7/_images/Bar_Age_regular_income.svg
new file mode 100644
index 000000000..6f8aa40d4
--- /dev/null
+++ b/_build/html/v0.0.7/_images/Bar_Age_regular_income.svg
@@ -0,0 +1,1201 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/Stacked_Bar_Age_income.svg b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income.svg
new file mode 100644
index 000000000..d5510308b
--- /dev/null
+++ b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income.svg
@@ -0,0 +1,1943 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_pivoted.svg b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_pivoted.svg
new file mode 100644
index 000000000..2147fce1a
--- /dev/null
+++ b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_pivoted.svg
@@ -0,0 +1,2043 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_regular.svg b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_regular.svg
new file mode 100644
index 000000000..04478581f
--- /dev/null
+++ b/_build/html/v0.0.7/_images/Stacked_Bar_Age_income_regular.svg
@@ -0,0 +1,1347 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/Stacked_Bar_Age_sex.svg b/_build/html/v0.0.7/_images/Stacked_Bar_Age_sex.svg
new file mode 100644
index 000000000..7b2bcb137
--- /dev/null
+++ b/_build/html/v0.0.7/_images/Stacked_Bar_Age_sex.svg
@@ -0,0 +1,1970 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/all_plots_comparisons_boxplot.png b/_build/html/v0.0.7/_images/all_plots_comparisons_boxplot.png
new file mode 100644
index 000000000..c4f54b520
Binary files /dev/null and b/_build/html/v0.0.7/_images/all_plots_comparisons_boxplot.png differ
diff --git a/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot.png b/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot.png
new file mode 100644
index 000000000..cc236e21c
Binary files /dev/null and b/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot.png differ
diff --git a/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot_pivoted.png b/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot_pivoted.png
new file mode 100644
index 000000000..b05150e06
Binary files /dev/null and b/_build/html/v0.0.7/_images/all_plots_comparisons_violinplot_pivoted.png differ
diff --git a/_build/html/v0.0.7/_images/count_hist_distributions.svg b/_build/html/v0.0.7/_images/count_hist_distributions.svg
new file mode 100644
index 000000000..521cd5a95
--- /dev/null
+++ b/_build/html/v0.0.7/_images/count_hist_distributions.svg
@@ -0,0 +1,1719 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/eda_toolkit_logo.svg b/_build/html/v0.0.7/_images/eda_toolkit_logo.svg
new file mode 100644
index 000000000..d039d6f79
--- /dev/null
+++ b/_build/html/v0.0.7/_images/eda_toolkit_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/_build/html/v0.0.7/_images/hist_density_distributions.svg b/_build/html/v0.0.7/_images/hist_density_distributions.svg
new file mode 100644
index 000000000..8bf1787a6
--- /dev/null
+++ b/_build/html/v0.0.7/_images/hist_density_distributions.svg
@@ -0,0 +1,1744 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/kde_density_distributions.svg b/_build/html/v0.0.7/_images/kde_density_distributions.svg
new file mode 100644
index 000000000..7564724e1
--- /dev/null
+++ b/_build/html/v0.0.7/_images/kde_density_distributions.svg
@@ -0,0 +1,2571 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/normal_distribution.png b/_build/html/v0.0.7/_images/normal_distribution.png
new file mode 100644
index 000000000..837c60e0c
Binary files /dev/null and b/_build/html/v0.0.7/_images/normal_distribution.png differ
diff --git a/_build/html/v0.0.7/_images/scatter_plots_grid.png b/_build/html/v0.0.7/_images/scatter_plots_grid.png
new file mode 100644
index 000000000..5a51facd8
Binary files /dev/null and b/_build/html/v0.0.7/_images/scatter_plots_grid.png differ
diff --git a/_build/html/v0.0.7/_images/scatter_plots_grid_grouped.png b/_build/html/v0.0.7/_images/scatter_plots_grid_grouped.png
new file mode 100644
index 000000000..02a3b3916
Binary files /dev/null and b/_build/html/v0.0.7/_images/scatter_plots_grid_grouped.png differ
diff --git a/_build/html/v0.0.7/_images/summarize_combos.gif b/_build/html/v0.0.7/_images/summarize_combos.gif
new file mode 100644
index 000000000..402ee1efc
Binary files /dev/null and b/_build/html/v0.0.7/_images/summarize_combos.gif differ
diff --git a/_build/html/v0.0.7/_images/us_census_correlation_matrix.svg b/_build/html/v0.0.7/_images/us_census_correlation_matrix.svg
new file mode 100644
index 000000000..2a41e1afa
--- /dev/null
+++ b/_build/html/v0.0.7/_images/us_census_correlation_matrix.svg
@@ -0,0 +1,1766 @@
+
+
+
diff --git a/_build/html/v0.0.7/_images/us_census_correlation_matrix_full.svg b/_build/html/v0.0.7/_images/us_census_correlation_matrix_full.svg
new file mode 100644
index 000000000..d0df5da46
--- /dev/null
+++ b/_build/html/v0.0.7/_images/us_census_correlation_matrix_full.svg
@@ -0,0 +1,1907 @@
+
+
+
diff --git a/_build/html/v0.0.7/_sources/acknowledgements.rst.txt b/_build/html/v0.0.7/_sources/acknowledgements.rst.txt
new file mode 100644
index 000000000..e62da5a10
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/acknowledgements.rst.txt
@@ -0,0 +1,30 @@
+.. _acknowledgements:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.7/_sources/changelog.rst.txt b/_build/html/v0.0.7/_sources/changelog.rst.txt
new file mode 100644
index 000000000..828e1aebd
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/changelog.rst.txt
@@ -0,0 +1,338 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+Version 0.0.7
+---------------------------
+
+**Added Function for Customizable Correlation Matrix Visualization**
+
+This release introduces a new function, ``flex_corr_matrix``, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to ``seaborn.heatmap()``.
+
+**Summary of Changes**
+
+- **New Function**: ``flex_corr_matrix``.
+
+ - **Functionality**:
+ - Generates a correlation heatmap for a given DataFrame.
+ - Supports both full and upper triangular correlation matrices based on the ``triangular`` parameter.
+ - Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+ - Accepts additional keyword arguments via ``**kwargs`` to pass directly to ``seaborn.heatmap()``.
+ - Includes validation to ensure the ``triangular``, ``annot``, and ``save_plots`` parameters are boolean values.
+ - Raises an exception if ``save_plots=True`` but neither ``image_path_png`` nor ``image_path_svg`` is specified.
+
+**Usage**
+
+.. code-block:: python
+
+ # Full correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+ # Upper triangular correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+**Contingency table df to object type**
+
+Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
+
+.. code-block:: python
+
+ df = df.astype(str).fillna("")
+
+
+Version 0.0.6
+---------------------------
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+Version 0.0.5
+---------------------------
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+ - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.7/_sources/citations.rst.txt b/_build/html/v0.0.7/_sources/citations.rst.txt
new file mode 100644
index 000000000..5c8dc7744
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.7). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+   version = {0.0.7},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.7/_sources/contributors.rst.txt b/_build/html/v0.0.7/_sources/contributors.rst.txt
new file mode 100644
index 000000000..48874021f
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/contributors.rst.txt
@@ -0,0 +1,53 @@
+.. _contributors:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Contributors/Maintainers
+=========================
+
+.. raw:: html
+
+
+
+.. image:: https://www.leonshpaner.com/author/leon-shpaner/avatar_hu48de79c369d5f7d4ff8056a297b2c4c5_1681850_270x270_fill_q90_lanczos_center.jpg
+ :align: left
+ :width: 150
+ :height: 150
+
+`Leonid Shpaner `_ is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego's M.S. Applied Artificial Intelligence program.
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: https://oscargildata.com/portfolio_content/images/Oscar_LinkedIn_Pic.jpeg
+ :align: left
+ :width: 150
+ :height: 150
+
+`Oscar Gil `_ is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
+
+
diff --git a/_build/html/v0.0.7/_sources/getting_started.rst.txt b/_build/html/v0.0.7/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..5962e685c
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/getting_started.rst.txt
@@ -0,0 +1,121 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.7``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``jinja2``: version ``3.1.4`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
diff --git a/_build/html/v0.0.7/_sources/index.rst.txt b/_build/html/v0.0.7/_sources/index.rst.txt
new file mode 100644
index 000000000..77d24a0f7
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/index.rst.txt
@@ -0,0 +1,51 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Usage Guide
+
+ usage_guide
+
+.. toctree::
+ :maxdepth: 4
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ contributors
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.7/_sources/references.rst.txt b/_build/html/v0.0.7/_sources/references.rst.txt
new file mode 100644
index 000000000..cf7c135da
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/references.rst.txt
@@ -0,0 +1,33 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+References
+===========
+
+1. Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 `_.
+
+2. Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S `_.
+
+3. Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 `_.
+
+
diff --git a/_build/html/v0.0.7/_sources/usage_guide.rst.txt b/_build/html/v0.0.7/_sources/usage_guide.rst.txt
new file mode 100644
index 000000000..1e0148d5b
--- /dev/null
+++ b/_build/html/v0.0.7/_sources/usage_guide.rst.txt
@@ -0,0 +1,3190 @@
+.. _usage_guide:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Description
+===========
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+
+Data Preparation and Management
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+    # Use the function to ensure 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs.
+ :type id_colname: str
+ :param num_digits: The number of digits for the unique IDs.
+ :type num_digits: int
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+   :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` The last row shows ``6.`` (an `int` written with a trailing period) before cleaning, and its conversion to ``6.0`` (a `float`) after.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either `day/month/year` or
+ `month/day/year` format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a `day/month/year` format. If the first part is 12 or less, the function
+ attempts to parse the date as `month/day/year`, falling back to `day/month/year`
+ if the former raises a ``ValueError`` due to an impossible date (e.g., month
+ being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df)
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+
+ :returns: A DataFrame with the analysis results for each column.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of unique variables to generate combinations.
+ :type variables: list
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum length of combinations to generate. Defaults to ``2``.
+ :type min_length: int
+
+ :returns: A dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+   :alt: Animated preview of the generated Excel summary tables and Table of Contents
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+ :notes:
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order.
+ :type sort_by: int
+ :raises ValueError: If no columns are specified or if sort_by is not ``0`` or ``1``.
+ :returns: A DataFrame with the specified columns, ``'Total'``, and ``'Percentage'``.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.render()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or "bins." This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or "bins," and
+then assigning each value to one of these intervals.
+
+.. note::
+
+ The code snippets below create age bins and assign a corresponding age group
+ label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+ categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+ and labels as needed for your specific data.
+
+
+Below, we use the ``age`` column of the census data [1]_ from the UCI Machine Learning Repository as an example:
+
+1. **Bins Definition**:
+ The bins are defined by specifying the boundaries of each interval. For example,
+ in the code snippet below, the ``bin_ages`` list specifies the boundaries for age groups:
+
+ .. code-block:: python
+
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+
+ Each pair of consecutive elements in ``bin_ages`` defines a bin. For example:
+
+ - The first bin is ``[0, 18)``,
+ - The second bin is ``[18, 30)``,
+ - and so on.
+
+\
+
+2. **Labels for Bins**:
+ The `label_ages` list provides labels corresponding to each bin:
+
+ .. code-block:: python
+
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ These labels are used to categorize the numerical values into meaningful groups.
+
+3. **Applying the Binning**:
+   The `pd.cut <https://pandas.pydata.org/docs/reference/api/pandas.cut.html>`_ function
+ from Pandas is used to apply the binning process. For each value in the ``age``
+ column of the DataFrame, it assigns a corresponding label based on which bin the
+ value falls into. Here, ``right=False`` indicates that each bin includes the
+ left endpoint but excludes the right endpoint. For example, if ``bin_ages =
+ [0, 10, 20, 30]``, then a value of ``10`` will fall into the bin ``[10, 20)`` and
+ be labeled accordingly.
+
+ .. code-block:: python
+
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+ **Mathematically**, for a given value `x` in the ``age`` column:
+
+ .. math::
+
+      \text{age\_group} =
+ \begin{cases}
+ < 18 & \text{if } 0 \leq x < 18 \\
+ 18-29 & \text{if } 18 \leq x < 30 \\
+ \vdots \\
+ 100 + & \text{if } x \geq 100
+ \end{cases}
+
+ The parameter `right=False` in `pd.cut` means that the bins are left-inclusive
+ and right-exclusive, except for the last bin, which is always right-inclusive
+ when the upper bound is infinity (`float("inf")`).
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+Gaussian Assumption for Normality
+----------------------------------
+
+The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
+
+.. math::
+
+ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
+
+where:
+
+- :math:`\mu` is the mean
+- :math:`\sigma^2` is the variance
+
+In a normally distributed dataset:
+
+- 68% of data falls within :math:`\mu \pm \sigma`
+- 95% within :math:`\mu \pm 2\sigma`
+- 99.7% within :math:`\mu \pm 3\sigma`
+
+.. raw:: html
+
+
+
+.. image:: ../assets/normal_distribution.png
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histograms and KDE
+^^^^^^^^^^^^^^^^^^^^^^
+
+**Histograms**:
+
+- Visualize data distribution by binning values and counting frequencies.
+- If data is Gaussian, the histogram approximates a bell curve.
+
+**Kernel Density Estimation (KDE)**:
+
+- A non-parametric way to estimate the PDF by smoothing individual data points with a kernel function.
+- The KDE for a dataset :math:`X = \{x_1, x_2, \ldots, x_n\}` is given by:
+
+.. math::
+
+ \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)
+
+where:
+
+- :math:`K` is the kernel function (often Gaussian)
+- :math:`h` is the bandwidth (smoothing parameter)
+
+**Combined Use of Histograms and KDE**
+
+- **Histograms** offer a discrete, binned view of the data.
+- **KDE** provides a smooth, continuous estimate of the underlying distribution.
+- Together, they effectively illustrate how well the data aligns with the Gaussian assumption, highlighting any deviations from normality.
+
+KDE Distribution Function
+-----------------------------
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, grid_figsize=(10, 8), single_figsize=(6, 4), kde=True, hist_color="#0000FF", kde_color="#FF0000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=1, n_cols=1, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_path_png=None, single_var_image_path_svg=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots.
+ :type vars_of_interest: list of str, optional
+ :param grid_figsize: Size of the overall grid figure, default is ``(10, 8)``.
+ :type grid_figsize: tuple, optional
+ :param single_figsize: Size of individual figures for each variable, default is ``(6, 4)``.
+ :type single_figsize: tuple, optional
+ :param kde: Whether to include KDE plots on the histograms, default is ``True``.
+ :type kde: bool, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where
+ ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid, default is ``1``.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid, default is ``1``.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_path_png: Directory path to save the PNG images of the separate distribution plots.
+ :type single_var_image_path_png: str, optional
+ :param single_var_image_path_svg: Directory path to save the SVG images of the separate distribution plots.
+ :type single_var_image_path_svg: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots.
+ The variable name will be appended to this filename.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: List of variable names to apply log scaling.
+ :type log_scale_vars: list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: number or pair of numbers, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``,
+ ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+
+ :raises UserWarning:
+ - If ``stat`` is set to 'count' while ``kde`` is ``True``, as it may produce misleading plots.
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``kde=True`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``single_figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+The ``plot_type="hist"`` parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions`` function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``kde=False``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+        stat="density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions`` function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the ``stat`` parameter is set to `"count"` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+        stat="count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. note::
+
+ The ``legend_labels_list`` parameter should be a list of lists, where each
+ inner list corresponds to the ground truth labels for the respective item in
+ the ``func_col`` list. Each element in the ``func_col`` list represents a
+ column in your DataFrame that you wish to analyze, and the corresponding
+ inner list in ``legend_labels_list`` should contain the labels that will be
+ used in the legend of your plots.
+
+For example:
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"], # Corresponds to "sex"
+ ["<=50K", ">50K"], # Corresponds to "income"
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. important::
+
+ Ensure that the number of elements in ``func_col``, ``legend_labels_list``,
+ and ``title`` are the same. Each item in ``func_col`` must have a corresponding
+ list of labels in ``legend_labels_list`` and a title in ``title``. This
+ consistency is essential for the function to correctly generate the plots
+ with the appropriate labels and titles.
+
+
+In this example:
+
+- ``func_col`` contains two elements: ``"sex"`` and ``"income"``. Each corresponds to a specific column in your DataFrame.
+- ``legend_labels_list`` is a nested list containing two inner lists:
+
+ - The first inner list, ``["Male", "Female"]``, corresponds to the ``"sex"`` column in ``func_col``.
+ - The second inner list, ``["<=50K", ">50K"]``, corresponds to the ``"income"`` column in ``func_col``.
+
+- ``title`` contains two elements: ``"Sex"`` and ``"Income"``, which will be used as the titles for the respective plots.
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+ :alt: KDE Distributions
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+ crosstab_age_sex = crosstabs_dict["sex"]
+ crosstab_age_income = crosstabs_dict["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_boxplot_comp, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_boxplot_comp: List of comparison categories (columns in df).
+ :type metrics_boxplot_comp: list of str
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is (``6, 4``).
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of None, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is specified but is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_boxplot_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with `3` rows and `4`
+columns, using the ``n_rows=3`` and ``n_cols=4`` parameters. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+Pearson Correlation Coefficient
+--------------------------------
+
+The Pearson correlation coefficient, often denoted as :math:`r`, is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from :math:`-1` to :math:`1`, where:
+
+- :math:`r = 1` indicates a perfect positive linear relationship.
+- :math:`r = -1` indicates a perfect negative linear relationship.
+- :math:`r = 0` indicates no linear relationship.
+
+The Pearson correlation coefficient between two variables :math:`X` and :math:`Y` is defined as:
+
+.. math::
+
+ r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
+
+where:
+
+- :math:`\text{Cov}(X, Y)` is the covariance of :math:`X` and :math:`Y`.
+- :math:`\sigma_X` is the standard deviation of :math:`X`.
+- :math:`\sigma_Y` is the standard deviation of :math:`Y`.
+
+Covariance measures how much two variables change together. It is defined as:
+
+.. math::
+
+ \text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)
+
+where:
+
+- :math:`n` is the number of data points.
+- :math:`X_i` and :math:`Y_i` are the individual data points.
+- :math:`\mu_X` and :math:`\mu_Y` are the means of :math:`X` and :math:`Y`.
+
+The standard deviation measures the dispersion or spread of a set of values. For
+a variable :math:`X`, the standard deviation :math:`\sigma_X` is:
+
+.. math::
+
+ \sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)^2}
+
+Substituting the covariance and standard deviation into the Pearson correlation formula:
+
+.. math::
+
+ r_{XY} = \frac{\sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n} (X_i - \mu_X)^2} \sqrt{\sum_{i=1}^{n} (Y_i - \mu_Y)^2}}
+
+This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between :math:`X` and :math:`Y`.
+
+- :math:`r > 0`: Positive correlation. As :math:`X` increases, :math:`Y` tends to increase.
+- :math:`r < 0`: Negative correlation. As :math:`X` increases, :math:`Y` tends to decrease.
+- :math:`r = 0`: No linear correlation. There is no consistent linear relationship between :math:`X` and :math:`Y`.
+
+The closer the value of :math:`r` is to :math:`\pm 1`, the stronger the linear relationship between the two variables.
+
+Scatter Fit Plot
+------------------
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars, y_vars, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None)
+
+ Create and save scatter plots or a grid of scatter plots for given x_vars
+ and y_vars, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the `x-axis`.
+ :type x_vars: list of str
+
+ :param y_vars: List of variable names to plot on the `y-axis`.
+ :type y_vars: list of str
+
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for `x-axis` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the `x-axis` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the `y-axis` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves scatter plots for the specified `x_vars` and `y_vars`.
+
+
+Regression-Centric Scatter Plots Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Correlation Matrices
+=====================
+
+**Generate and Save Customizable Correlation Heatmaps**
+
+The ``flex_corr_matrix`` function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
+**Customizable Plot Appearance**
+
+The function provides extensive customization options for the heatmap's appearance:
+
+- **Colormap Selection**: Choose from a variety of colormaps to represent the strength of correlations. The default is ``"coolwarm"``, but this can be adjusted to fit the needs of the analysis.
+
+- **Annotation**: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
+- **Figure Size and Layout**: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+**Triangular vs. Full Correlation Matrix**
+
+
+A key feature of the ``flex_corr_matrix`` function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
+**Label and Axis Configuration**
+
+
+The function offers flexibility in configuring axis labels and titles:
+
+- **Label Rotation**: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+- **Font Sizes**: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+- **Title Wrapping**: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+**Plot Display and Saving Options**
+
+
+The ``flex_corr_matrix`` function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
+
+.. function:: flex_corr_matrix(df, cols=None, annot=True, cmap="coolwarm", save_plots=False, image_path_png=None, image_path_svg=None, figsize=(10, 10), title="Cervical Cancer Data: Correlation Matrix", label_fontsize=12, tick_fontsize=10, xlabel_rot=45, ylabel_rot=0, xlabel_alignment="right", ylabel_alignment="center_baseline", text_wrap=50, vmin=-1, vmax=1, cbar_label="Correlation Index", triangular=True, **kwargs)
+
+ Create a customizable correlation heatmap with options for annotation, color mapping, figure size, and saving the plot.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param cols: List of column names to include in the correlation matrix. If None, all columns are included.
+ :type cols: list of str, optional
+
+ :param annot: Whether to annotate the heatmap with correlation coefficients. Default is ``True``.
+ :type annot: bool, optional
+
+ :param cmap: The colormap to use for the heatmap. Default is ``"coolwarm"``.
+ :type cmap: str, optional
+
+ :param save_plots: Controls whether to save the plots. Default is ``False``.
+ :type save_plots: bool, optional
+
+ :param image_path_png: Directory path to save PNG images of the heatmap.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the heatmap.
+ :type image_path_svg: str, optional
+
+ :param figsize: Width and height of the figure for the heatmap. Default is ``(10, 10)``.
+ :type figsize: tuple, optional
+
+ :param title: Title of the heatmap. Default is ``"Cervical Cancer Data: Correlation Matrix"``.
+ :type title: str, optional
+
+ :param label_fontsize: Font size for tick labels and colorbar label. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param xlabel_rot: Rotation angle for x-axis labels. Default is ``45``.
+ :type xlabel_rot: int, optional
+
+ :param ylabel_rot: Rotation angle for y-axis labels. Default is ``0``.
+ :type ylabel_rot: int, optional
+
+ :param xlabel_alignment: Horizontal alignment for x-axis labels. Default is ``"right"``.
+ :type xlabel_alignment: str, optional
+
+ :param ylabel_alignment: Vertical alignment for y-axis labels. Default is ``"center_baseline"``.
+ :type ylabel_alignment: str, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param vmin: Minimum value for the heatmap color scale. Default is ``-1``.
+ :type vmin: float, optional
+
+ :param vmax: Maximum value for the heatmap color scale. Default is ``1``.
+ :type vmax: float, optional
+
+ :param cbar_label: Label for the colorbar. Default is ``"Correlation Index"``.
+ :type cbar_label: str, optional
+
+ :param triangular: Whether to show only the upper triangle of the correlation matrix. Default is ``True``.
+ :type triangular: bool, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``seaborn.heatmap()``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``annot`` is not a boolean.
+ - If ``cols`` is not a list.
+ - If ``save_plots`` is not a boolean.
+ - If ``triangular`` is not a boolean.
+ - If ``save_plots`` is True but no image paths are provided.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves a correlation heatmap.
+
+Triangular Correlation Matrix Example
+--------------------------------------
+
+The provided code filters the census [1]_ DataFrame ``df`` to include only numeric columns using
+``select_dtypes(np.number)``. It then utilizes the ``flex_corr_matrix()`` function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+.. note::
+
+ This triangular matrix format is particularly useful for avoiding
+ redundancy in correlation matrices, as it excludes the lower half,
+ making it easier to focus on unique pairwise correlations.
+
+The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
+.. code-block:: python
+
+ # Select only numeric data to pass into the function
+ df_num = df.select_dtypes(np.number)
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="coolwarm",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Full Correlation Matrix Example
+----------------------------------
+
+In this modified census [1]_ example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting ``cmap="viridis"``, the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting ``triangular=False``, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="viridis",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=False,
+ )
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Added Function for Customizable Correlation Matrix Visualization
+
This release introduces a new function, flex_corr_matrix, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to seaborn.heatmap().
+
Summary of Changes
+
+
New Function: flex_corr_matrix.
+
+
Functionality:
+- Generates a correlation heatmap for a given DataFrame.
+- Supports both full and upper triangular correlation matrices based on the triangular parameter.
+- Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+- Accepts additional keyword arguments via **kwargs to pass directly to seaborn.heatmap().
+- Includes validation to ensure the triangular, annot, and save_plots parameters are boolean values.
+- Raises an exception if save_plots=True but neither image_path_png nor image_path_svg is specified.
+
+
+
+
Usage
+
# Full correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=False,cmap="coolwarm",annot=True)
+
+# Upper triangular correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=True,cmap="coolwarm",annot=True)
+
+
+
Contingency table df to object type
+
Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.7/genindex.html b/_build/html/v0.0.7/genindex.html
new file mode 100644
index 000000000..d65b37b5f
--- /dev/null
+++ b/_build/html/v0.0.7/genindex.html
@@ -0,0 +1,333 @@
+
+
+
+
+
+
+
+ Index — EDA Toolkit 0.0.7 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
+
+
+
+
+
+
+
Index
+
+
+ A
+ | B
+ | C
+ | D
+ | E
+ | F
+ | H
+ | K
+ | P
+ | S
+
+
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.7.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
fromeda_toolkitimportensure_directory
+
+importos# import operating system for dir
+
+
+base_path=os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path=os.path.join(os.pardir,"data")
+data_output=os.path.join(os.pardir,"data_output")
+
+# create image paths
+image_path_png=os.path.join(base_path,"images","png_images")
+image_path_svg=os.path.join(base_path,"images","svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str) – The name of the new column for the IDs.
+
num_digits (int) – The number of digits for the unique IDs.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
fromeda_toolkitimportadd_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df=add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
fromeda_toolkitimportstrip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data={
+ "values":[1.0,2.0,3.0,4.0,5.0,6.],
+}
+df=pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df=strip_trailing_period(df=df,column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: The last row shows the value 6 with a trailing period (6.) before cleaning, and its conversion to a properly formatted float (6.0) after cleaning.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
fromeda_toolkitimportparse_date_with_rule
+
+# Sample date strings
+date_strings=["15/04/2021","04/15/2021","01/12/2020","12/01/2020"]
+
+# Standardize the date strings
+standardized_dates=[parse_date_with_rule(date)fordateindate_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data={
+ "date_column":[
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name":["Alice","Bob","Charlie","David","Eve"],
+ "amount":[100.0,150.5,200.75,250.25,300.0],
+}
+
+df=pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"]=df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
+
Returns:
+
A DataFrame with the analysis results for each column.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
fromeda_toolkitimportsummarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars=[
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables,all_combinations=summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
Notes:
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
fromeda_toolkitimportsave_dataframes_to_excel
+
+# Example usage
+file_name="df_census.xlsx"# Name of the output Excel file
+file_path=os.path.join(data_path,file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df=df[(df["age"]>18)&(df["age"]<40)]
+
+df_dict={
+ "original_df":df,
+ "ages_18_to_40":filtered_df,
+}
+
+save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame with the specified columns, 'Total', and 'Percentage'.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
fromeda_toolkitimporthighlight_columns
+
+# Applying the highlight function
+highlighted_df=highlight_columns(
+ df=df,
+ columns=["age","education"],
+ color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or “bins.” This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or “bins,” and
+then assigning each value to one of these intervals.
+
+
Note
+
The code snippets below create age bins and assign a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
+
+
Below, we use the age column of the census data [1] from the UCI Machine Learning Repository as an example:
+
+
Bins Definition:
+The bins are defined by specifying the boundaries of each interval. For example,
+in the code snippet below, the bin_ages list specifies the boundaries for age groups:
These labels are used to categorize the numerical values into meaningful groups.
+
+
Applying the Binning:
+The pd.cut function
+from Pandas is used to apply the binning process. For each value in the age
+column of the DataFrame, it assigns a corresponding label based on which bin the
+value falls into. Here, right=False indicates that each bin includes the
+left endpoint but excludes the right endpoint. For example, if bin_ages=
+[0,10,20,30], then a value of 10 will fall into the bin [10,20) and
+be labeled accordingly.
The parameter right=False in pd.cut means that the bins are left-inclusive
+and right-exclusive, except for the last bin, which is always right-inclusive
+when the upper bound is infinity (float(“inf”)).
The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
+
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
+
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
+
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
+0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
+
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
+
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
+The variable name will be appended to this filename.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (number or pair of numbers, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
+'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The kde=True parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
The plot_type="hist" parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+kde=False, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1,n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The `y-axis` label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to “Count” to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1,n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
fromeda_toolkitimportkde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
+
Note
+
The legend_labels_list parameter should be a list of lists, where each
+inner list corresponds to the ground truth labels for the respective item in
+the func_col list. Each element in the func_col list represents a
+column in your DataFrame that you wish to analyze, and the corresponding
+inner list in legend_labels_list should contain the labels that will be
+used in the legend of your plots.
+
+
For example:
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],# Corresponds to "sex"
+ ["<=50K",">50K"],# Corresponds to "income"
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Important
+
Ensure that the number of elements in func_col, legend_labels_list,
+and title are the same. Each item in func_col must have a corresponding
+list of labels in legend_labels_list and a title in title. This
+consistency is essential for the function to correctly generate the plots
+with the appropriate labels and titles.
+
+
In this example:
+
+
func_col contains two elements: "sex" and "income". Each corresponds to a specific column in your DataFrame.
+
legend_labels_list is a nested list containing two inner lists:
+
+
+
The first inner list, ["Male","Female"], corresponds to the "sex" column in func_col.
+
The second inner list, ["<=50K",">50K"], corresponds to the "income" column in func_col.
+
+
+
+
title contains two elements: "Sex" and "Income", which will be used as the titles for the respective plots.
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plot function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is specified but is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_boxplot_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with 3 rows and 4
+columns, using the n_rows=3 and n_cols=4 parameters. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the `x-axis` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
The Pearson correlation coefficient, often denoted as \(r\), is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from \(-1\) to \(1\), where:
+
+
\(r = 1\) indicates a perfect positive linear relationship.
+
\(r = -1\) indicates a perfect negative linear relationship.
+
\(r = 0\) indicates no linear relationship.
+
+
The Pearson correlation coefficient between two variables \(X\) and \(Y\) is defined as:
This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between \(X\) and \(Y\).
+
+
\(r > 0\): Positive correlation. As \(X\) increases, \(Y\) tends to increase.
+
\(r < 0\): Negative correlation. As \(X\) increases, \(Y\) tends to decrease.
+
\(r = 0\): No linear correlation. There is no consistent linear relationship between \(X\) and \(Y\).
+
+
The closer the value of \(r\) is to \(\pm 1\), the stronger the linear relationship between the two variables.
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str) – List of variable names to plot on the x-axis.
+
y_vars (list of str) – List of variable names to plot on the y-axis.
+
n_rows (int) – Number of rows in the subplot grid.
+
n_cols (int) – Number of columns in the subplot grid.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
Generate and Save Customizable Correlation Heatmaps
+
The flex_corr_matrix function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
Customizable Plot Appearance
+
The function provides extensive customization options for the heatmap’s appearance:
+
+
Colormap Selection: Choose from a variety of colormaps to represent the strength of correlations. The default is "coolwarm", but this can be adjusted to fit the needs of the analysis.
+
Annotation: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
Figure Size and Layout: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+
Triangular vs. Full Correlation Matrix
+
A key feature of the flex_corr_matrix function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
Label and Axis Configuration
+
The function offers flexibility in configuring axis labels and titles:
+
+
Label Rotation: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+
Font Sizes: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+
Title Wrapping: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+
Plot Display and Saving Options
+
The flex_corr_matrix function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
The provided code filters the census [1] DataFrame df to include only numeric columns using
+select_dtypes(np.number). It then utilizes the flex_corr_matrix() function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+
Note
+
This triangular matrix format is particularly useful for avoiding
+redundancy in correlation matrices, as it excludes the lower half,
+making it easier to focus on unique pairwise correlations.
+
+
The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
# Select only numeric data to pass into the function
+df_num=df.select_dtypes(np.number)
+
In this modified census [1] example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting cmap="viridis", the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting triangular=False, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.8/.buildinfo b/_build/html/v0.0.8/.buildinfo
new file mode 100644
index 000000000..74e366f34
--- /dev/null
+++ b/_build/html/v0.0.8/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 0cc49252ae4a13912e09d6bd38b18b52
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_build/html/v0.0.8/.doctrees/acknowledgements.doctree b/_build/html/v0.0.8/.doctrees/acknowledgements.doctree
new file mode 100644
index 000000000..da086b700
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/acknowledgements.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/changelog.doctree b/_build/html/v0.0.8/.doctrees/changelog.doctree
new file mode 100644
index 000000000..c39c63fa3
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/changelog.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/citations.doctree b/_build/html/v0.0.8/.doctrees/citations.doctree
new file mode 100644
index 000000000..eaca8ade2
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/citations.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/contributors.doctree b/_build/html/v0.0.8/.doctrees/contributors.doctree
new file mode 100644
index 000000000..246645d32
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/contributors.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/environment.pickle b/_build/html/v0.0.8/.doctrees/environment.pickle
new file mode 100644
index 000000000..9b3be3d92
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/environment.pickle differ
diff --git a/_build/html/v0.0.8/.doctrees/getting_started.doctree b/_build/html/v0.0.8/.doctrees/getting_started.doctree
new file mode 100644
index 000000000..6f34bdec3
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/getting_started.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/index.doctree b/_build/html/v0.0.8/.doctrees/index.doctree
new file mode 100644
index 000000000..fd4d378cb
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/index.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/references.doctree b/_build/html/v0.0.8/.doctrees/references.doctree
new file mode 100644
index 000000000..5f3c625c7
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/references.doctree differ
diff --git a/_build/html/v0.0.8/.doctrees/usage_guide.doctree b/_build/html/v0.0.8/.doctrees/usage_guide.doctree
new file mode 100644
index 000000000..0c96d7195
Binary files /dev/null and b/_build/html/v0.0.8/.doctrees/usage_guide.doctree differ
diff --git a/_build/html/v0.0.8/_images/2d_pdp_grid.svg b/_build/html/v0.0.8/_images/2d_pdp_grid.svg
new file mode 100644
index 000000000..641db4ba6
--- /dev/null
+++ b/_build/html/v0.0.8/_images/2d_pdp_grid.svg
@@ -0,0 +1,4405 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/3d_pdp.svg b/_build/html/v0.0.8/_images/3d_pdp.svg
new file mode 100644
index 000000000..535371233
--- /dev/null
+++ b/_build/html/v0.0.8/_images/3d_pdp.svg
@@ -0,0 +1,8326 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/Bar_Age_regular_income.svg b/_build/html/v0.0.8/_images/Bar_Age_regular_income.svg
new file mode 100644
index 000000000..6f8aa40d4
--- /dev/null
+++ b/_build/html/v0.0.8/_images/Bar_Age_regular_income.svg
@@ -0,0 +1,1201 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/Stacked_Bar_Age_income.svg b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income.svg
new file mode 100644
index 000000000..d5510308b
--- /dev/null
+++ b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income.svg
@@ -0,0 +1,1943 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_pivoted.svg b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_pivoted.svg
new file mode 100644
index 000000000..2147fce1a
--- /dev/null
+++ b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_pivoted.svg
@@ -0,0 +1,2043 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_regular.svg b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_regular.svg
new file mode 100644
index 000000000..04478581f
--- /dev/null
+++ b/_build/html/v0.0.8/_images/Stacked_Bar_Age_income_regular.svg
@@ -0,0 +1,1347 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/Stacked_Bar_Age_sex.svg b/_build/html/v0.0.8/_images/Stacked_Bar_Age_sex.svg
new file mode 100644
index 000000000..7b2bcb137
--- /dev/null
+++ b/_build/html/v0.0.8/_images/Stacked_Bar_Age_sex.svg
@@ -0,0 +1,1970 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/all_plots_comparisons_boxplot.png b/_build/html/v0.0.8/_images/all_plots_comparisons_boxplot.png
new file mode 100644
index 000000000..c4f54b520
Binary files /dev/null and b/_build/html/v0.0.8/_images/all_plots_comparisons_boxplot.png differ
diff --git a/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot.png b/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot.png
new file mode 100644
index 000000000..cc236e21c
Binary files /dev/null and b/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot.png differ
diff --git a/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot_pivoted.png b/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot_pivoted.png
new file mode 100644
index 000000000..b05150e06
Binary files /dev/null and b/_build/html/v0.0.8/_images/all_plots_comparisons_violinplot_pivoted.png differ
diff --git a/_build/html/v0.0.8/_images/count_hist_distributions.svg b/_build/html/v0.0.8/_images/count_hist_distributions.svg
new file mode 100644
index 000000000..521cd5a95
--- /dev/null
+++ b/_build/html/v0.0.8/_images/count_hist_distributions.svg
@@ -0,0 +1,1719 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/eda_toolkit_logo.svg b/_build/html/v0.0.8/_images/eda_toolkit_logo.svg
new file mode 100644
index 000000000..d039d6f79
--- /dev/null
+++ b/_build/html/v0.0.8/_images/eda_toolkit_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/_build/html/v0.0.8/_images/hist_density_distributions.svg b/_build/html/v0.0.8/_images/hist_density_distributions.svg
new file mode 100644
index 000000000..8bf1787a6
--- /dev/null
+++ b/_build/html/v0.0.8/_images/hist_density_distributions.svg
@@ -0,0 +1,1744 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/kde_density_distributions.svg b/_build/html/v0.0.8/_images/kde_density_distributions.svg
new file mode 100644
index 000000000..7564724e1
--- /dev/null
+++ b/_build/html/v0.0.8/_images/kde_density_distributions.svg
@@ -0,0 +1,2571 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/normal_distribution.png b/_build/html/v0.0.8/_images/normal_distribution.png
new file mode 100644
index 000000000..837c60e0c
Binary files /dev/null and b/_build/html/v0.0.8/_images/normal_distribution.png differ
diff --git a/_build/html/v0.0.8/_images/scatter_plots_grid.png b/_build/html/v0.0.8/_images/scatter_plots_grid.png
new file mode 100644
index 000000000..5a51facd8
Binary files /dev/null and b/_build/html/v0.0.8/_images/scatter_plots_grid.png differ
diff --git a/_build/html/v0.0.8/_images/scatter_plots_grid_grouped.png b/_build/html/v0.0.8/_images/scatter_plots_grid_grouped.png
new file mode 100644
index 000000000..02a3b3916
Binary files /dev/null and b/_build/html/v0.0.8/_images/scatter_plots_grid_grouped.png differ
diff --git a/_build/html/v0.0.8/_images/summarize_combos.gif b/_build/html/v0.0.8/_images/summarize_combos.gif
new file mode 100644
index 000000000..402ee1efc
Binary files /dev/null and b/_build/html/v0.0.8/_images/summarize_combos.gif differ
diff --git a/_build/html/v0.0.8/_images/us_census_correlation_matrix.svg b/_build/html/v0.0.8/_images/us_census_correlation_matrix.svg
new file mode 100644
index 000000000..2a41e1afa
--- /dev/null
+++ b/_build/html/v0.0.8/_images/us_census_correlation_matrix.svg
@@ -0,0 +1,1766 @@
+
+
+
diff --git a/_build/html/v0.0.8/_images/us_census_correlation_matrix_full.svg b/_build/html/v0.0.8/_images/us_census_correlation_matrix_full.svg
new file mode 100644
index 000000000..d0df5da46
--- /dev/null
+++ b/_build/html/v0.0.8/_images/us_census_correlation_matrix_full.svg
@@ -0,0 +1,1907 @@
+
+
+
diff --git a/_build/html/v0.0.8/_sources/acknowledgements.rst.txt b/_build/html/v0.0.8/_sources/acknowledgements.rst.txt
new file mode 100644
index 000000000..e62da5a10
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/acknowledgements.rst.txt
@@ -0,0 +1,30 @@
+.. _acknowledgements:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.8/_sources/changelog.rst.txt b/_build/html/v0.0.8/_sources/changelog.rst.txt
new file mode 100644
index 000000000..7fb600f84
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/changelog.rst.txt
@@ -0,0 +1,558 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+Version 0.0.8
+--------------------
+
+:class:`stacked_crosstab_plot`
+
+- **Flexible `save_formats` Input**:
+ - `save_formats` now accepts a string, tuple, or list for specifying formats (e.g., `"png"`, `("png", "svg")`, or `["png", "svg"]`).
+ - Single strings or tuples are automatically converted to lists for consistent processing.
+
+- **Dynamic Error Handling**:
+ - Added checks to ensure a valid path is provided for each format in `save_formats`.
+ - Raises a `ValueError` if a format is specified without a corresponding path, with a clear, dynamic error message.
+
+- **Improved Plot Saving Logic**:
+ - Updated logic allows saving plots in one format (e.g., only `"png"` or `"svg"`) without requiring the other.
+ - Simplified and more intuitive path handling for saving plots.
+
+
+:class:`plot_3d_pdp`
+
+This update introduces several key changes to the `plot_3d_pdp` function, simplifying the function's interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
+**1. Parameter Changes**
+
+
+- **Removed Parameters:**
+
+ - The parameters ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general ``x_label``, ``y_label``, and ``z_label``. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+- **Default Values for Labels:**
+
+ - The parameters ``x_label``, ``y_label``, and ``z_label`` are now optional, with ``None`` as the default. If not provided, these labels will automatically default to the names of the features in the ``feature_names_list``. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+- **Changes in Default Values for View Angles:**
+
+ - The default values for camera positioning parameters have been updated: ``horizontal`` is now ``-1.25``, ``depth`` is now ``1.25``, and ``vertical`` is now ``1.25``. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+**2. Plot Generation Logic**
+
+- **Conditionally Checking Labels:**
+
+ - The function now checks whether ``x_label``, ``y_label``, and ``z_label`` are provided. If these are ``None``, the function will automatically assign default labels based on the ``feature_names_list``. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+- **Camera Position Adjustments:**
+
+ - The camera positions for the Plotly plot are now adjusted by multiplying ``horizontal``, ``depth``, and ``vertical`` by ``zoom_out_factor``. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+- **Surface Plot Coordinates Adjustments:**
+
+ - The order of the coordinates for the Plotly plot’s surface has been changed from ``ZZ, XX, YY[::-1]`` to ``ZZ, XX, YY``. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+**3. Code Simplifications**
+
+- **Removed Complexity:**
+
+ - By removing the ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+- **Fallback Mechanism for Grid Values:**
+
+ - The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+**4. Style Adjustments**
+
+- **Label Formatting:**
+
+ - The new version consistently uses ``y_label``, ``x_label``, and ``z_label`` for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+- **Color Bar Adjustments:**
+
+ - The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of ``0.6`` and a pad value of ``0.02``. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+**5. Potential Use Case Differences**
+
+- **Simplified Interface:**
+
+ - The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+- **Less Granular Control:**
+
+ - Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the ``*_plotly`` label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+**6. Matplotlib Plot Adjustments**
+
+- **Wireframe and Surface Plot Enhancements:**
+
+ - The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+**Summary**
+
+- Version ``0.0.8d`` of the `plot_3d_pdp` function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+- Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+**Decision Point**
+
+- This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
+
+
+Version 0.0.8c
+------------------------
+
+Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
+**Key Updates in 0.0.8c:**
+
+- **Bug Fixes:** Addressed minor issues identified in version ``0.0.8b`` to ensure smoother performance and better user experience.
+- **Additional Testing:** Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+- **Refinements:** Made small enhancements to existing features based on user feedback and internal testing results.
+
+**Summary of Changes**
+
+1. New Features & Enhancements
+
+- ``plot_3d_pdp`` Function:
+
+ - Added ``show_modebar`` Parameter: Introduced a new boolean parameter, ``show_modebar``, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Added parameters for ``left_margin``, ``right_margin``, and ``top_margin`` to provide users with more control over the plot layout in Plotly.
+
+ - Adjusted default values and added options for better customization of the Plotly color bar (``cbar_x``, ``cbar_thickness``) and title positioning (``title_x``, ``title_y``).
+
+ - Plotly Configuration:
+
+ - Enhanced the configuration options to allow users to enable or disable zoom functionality (``enable_zoom``) in the interactive Plotly plots.
+
+ - Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+ - Error Handling:
+
+ - Added input validation for ``html_file_path`` and ``html_file_name`` to ensure these are provided when necessary based on the selected ``plot_type``.
+
+- ``plot_2d_pdp`` Function:
+
+ - Introduced ``file_prefix`` Parameter:
+
+ - Added a new ``file_prefix`` parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+ - Enhanced Plot Type Flexibility:
+
+ - The ``plot_type`` parameter now includes an option to generate both grid and individual plots (``both``). This feature allows users to create a combination of both layout styles in one function call.
+
+ - Updated input validation and logic to handle this new option effectively.
+
+ - Added ``save_plots`` Parameter:
+
+ - Introduced a new parameter, ``save_plots``, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Included the ``save_plots`` parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+2. Documentation Updates
+
+- Docstrings:
+
+ - Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
+ - Detailed the use of new parameters such as ``show_modebar``, ``file_prefix``, ``save_plots``, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+3. Refactoring & Code Cleanup
+
+- Code Structure:
+
+ - Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
+ - Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
+
+
+Version 0.0.8b
+--------------------------------
+
+Version 0.0.8b is an exact replica of version ``0.0.8a``. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
+
+
+Version 0.0.8a
+--------------------------------
+
+Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
+**New Features:**
+
+1. Optional ``file_prefix`` in ``stacked_crosstab_plot`` Function
+
+ - The ``stacked_crosstab_plot`` function has been updated to make the ``file_prefix`` argument optional. If the user does not provide a ``file_prefix``, the function will now automatically generate a default prefix based on the ``col`` and ``func_col`` parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
+ - **Key Improvement:**
+
+ - Users can now omit the ``file_prefix`` argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
+ - Backward compatibility is maintained, allowing users who prefer to specify a custom ``file_prefix`` to continue doing so without any issues.
+
+2. **Introduction of 3D and 2D Partial Dependence Plot Functions**
+
+ - Two new functions, ``plot_3d_pdp`` and ``plot_2d_pdp``, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+ - ``plot_3d_pdp``: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
+ - ``plot_2d_pdp``: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+ - **Key Features:**
+
+ - **Compatibility:** Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
+ - **Customization:** Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
+ - **Interactive 3D Plots:** The ``plot_3d_pdp`` function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+**Impact:**
+
+- These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+- The optional ``file_prefix`` enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+- The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
+
+
+
+Version 0.0.7
+---------------------------
+
+**Added Function for Customizable Correlation Matrix Visualization**
+
+This release introduces a new function, ``flex_corr_matrix``, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to ``seaborn.heatmap()``.
+
+**Summary of Changes**
+
+- **New Function**: ``flex_corr_matrix``.
+
+ - **Functionality**:
+ - Generates a correlation heatmap for a given DataFrame.
+ - Supports both full and upper triangular correlation matrices based on the ``triangular`` parameter.
+ - Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+ - Accepts additional keyword arguments via ``**kwargs`` to pass directly to ``seaborn.heatmap()``.
+ - Includes validation to ensure the ``triangular``, ``annot``, and ``save_plots`` parameters are boolean values.
+ - Raises an exception if ``save_plots=True`` but neither ``image_path_png`` nor ``image_path_svg`` is specified.
+
+**Usage**
+
+.. code-block:: python
+
+ # Full correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+ # Upper triangular correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+**Contingency table df to object type**
+
+Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
+
+.. code-block:: python
+
+ df = df.astype(str).fillna("")
+
+
+Version 0.0.6
+---------------------------
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+Version 0.0.5
+---------------------------
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+ - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.8/_sources/citations.rst.txt b/_build/html/v0.0.8/_sources/citations.rst.txt
new file mode 100644
index 000000000..634dbd3b2
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.8). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+ version = {0.0.8},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.8/_sources/contributors.rst.txt b/_build/html/v0.0.8/_sources/contributors.rst.txt
new file mode 100644
index 000000000..4da2fa18b
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/contributors.rst.txt
@@ -0,0 +1,59 @@
+.. _contributors:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Contributors/Maintainers
+=========================
+
+.. raw:: html
+
+
+
+.. image:: https://www.leonshpaner.com/author/leon-shpaner/avatar_hu48de79c369d5f7d4ff8056a297b2c4c5_1681850_270x270_fill_q90_lanczos_center.jpg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Leonid Shpaner `_ is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego's M.S. Applied Artificial Intelligence program.
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: https://oscargildata.com/portfolio_content/images/Oscar_LinkedIn_Pic.jpeg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Oscar Gil `_ is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
diff --git a/_build/html/v0.0.8/_sources/getting_started.rst.txt b/_build/html/v0.0.8/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..19d3687d7
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/getting_started.rst.txt
@@ -0,0 +1,124 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.8``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``jinja2``: version ``3.1.4`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``nbformat``: version ``4.2.0`` or higher
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``plotly``: version ``5.18.0`` or higher
+- ``scikit-learn``: version ``1.0.2`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
diff --git a/_build/html/v0.0.8/_sources/index.rst.txt b/_build/html/v0.0.8/_sources/index.rst.txt
new file mode 100644
index 000000000..77d24a0f7
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/index.rst.txt
@@ -0,0 +1,51 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Usage Guide
+
+ usage_guide
+
+.. toctree::
+ :maxdepth: 4
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ contributors
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.8/_sources/references.rst.txt b/_build/html/v0.0.8/_sources/references.rst.txt
new file mode 100644
index 000000000..335337c3a
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/references.rst.txt
@@ -0,0 +1,33 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+References
+===========
+
+1. Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 <https://doi.org/10.1109/MCSE.2007.55>`_.
+
+2. Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
+3. Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 <https://doi.org/10.21105/joss.03021>`_.
+
+4. Pace, R. K., & Barry, R. (1997). *Sparse Spatial Autoregressions*. *Statistics & Probability Letters*, 33(3), 291-297. `https://doi.org/10.1016/S0167-7152(96)00140-X <https://doi.org/10.1016/S0167-7152(96)00140-X>`_.
diff --git a/_build/html/v0.0.8/_sources/usage_guide.rst.txt b/_build/html/v0.0.8/_sources/usage_guide.rst.txt
new file mode 100644
index 000000000..ee66b96ac
--- /dev/null
+++ b/_build/html/v0.0.8/_sources/usage_guide.rst.txt
@@ -0,0 +1,3789 @@
+.. _usage_guide:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Description
+===========
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+
+Data Preparation and Management
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+ # Use the function to ensure 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs.
+ :type id_colname: str
+ :param num_digits: The number of digits for the unique IDs.
+ :type num_digits: int
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+ :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` In the last row, the value ``6.`` (a number written with a trailing period) is converted to the float ``6.0``.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either `day/month/year` or
+ `month/day/year` format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a `day/month/year` format. If the first part is 12 or less, the function
+ attempts to parse the date as `month/day/year`, falling back to `day/month/year`
+ if the former raises a ``ValueError`` due to an impossible date (e.g., month
+ being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df)
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+
+ :returns: A DataFrame with the analysis results for each column.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and save them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of unique variables to generate combinations.
+ :type variables: list
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum length of combinations to generate. Defaults to ``2``.
+ :type min_length: int
+
+ :returns: A dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+ :notes:
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order.
+ :type sort_by: int
+ :raises ValueError: If no columns are specified or if sort_by is not ``0`` or ``1``.
+ :returns: A DataFrame with the specified columns, ``'Total'``, and ``'Percentage'``.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.render()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or "bins." This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or "bins," and
+then assigning each value to one of these intervals.
+
+.. note::
+
+ The code snippets below create age bins and assign a corresponding age group
+ label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+ categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+ and labels as needed for your specific data.
+
+
+Below, we use the ``age`` column of the census data [1]_ from the UCI Machine Learning Repository as an example:
+
+1. **Bins Definition**:
+ The bins are defined by specifying the boundaries of each interval. For example,
+ in the code snippet below, the ``bin_ages`` list specifies the boundaries for age groups:
+
+ .. code-block:: python
+
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+
+ Each pair of consecutive elements in ``bin_ages`` defines a bin. For example:
+
+ - The first bin is ``[0, 18)``,
+ - The second bin is ``[18, 30)``,
+ - and so on.
+
+\
+
+2. **Labels for Bins**:
+ The `label_ages` list provides labels corresponding to each bin:
+
+ .. code-block:: python
+
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ These labels are used to categorize the numerical values into meaningful groups.
+
+3. **Applying the Binning**:
+    The `pd.cut <https://pandas.pydata.org/docs/reference/api/pandas.cut.html>`_ function
+ from Pandas is used to apply the binning process. For each value in the ``age``
+ column of the DataFrame, it assigns a corresponding label based on which bin the
+ value falls into. Here, ``right=False`` indicates that each bin includes the
+ left endpoint but excludes the right endpoint. For example, if ``bin_ages =
+ [0, 10, 20, 30]``, then a value of ``10`` will fall into the bin ``[10, 20)`` and
+ be labeled accordingly.
+
+ .. code-block:: python
+
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+ **Mathematically**, for a given value `x` in the ``age`` column:
+
+ .. math::
+
+        \text{age\_group} =
+ \begin{cases}
+ < 18 & \text{if } 0 \leq x < 18 \\
+ 18-29 & \text{if } 18 \leq x < 30 \\
+ \vdots \\
+ 100 + & \text{if } x \geq 100
+ \end{cases}
+
+ The parameter `right=False` in `pd.cut` means that the bins are left-inclusive
+ and right-exclusive, except for the last bin, which is always right-inclusive
+ when the upper bound is infinity (`float("inf")`).
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+Gaussian Assumption for Normality
+----------------------------------
+
+The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
+
+.. math::
+
+ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
+
+where:
+
+- :math:`\mu` is the mean
+- :math:`\sigma^2` is the variance
+
+In a normally distributed dataset:
+
+- 68% of data falls within :math:`\mu \pm \sigma`
+- 95% within :math:`\mu \pm 2\sigma`
+- 99.7% within :math:`\mu \pm 3\sigma`
+
+.. raw:: html
+
+
+
+.. image:: ../assets/normal_distribution.png
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histograms and KDE
+^^^^^^^^^^^^^^^^^^^^^^
+
+**Histograms**:
+
+- Visualize data distribution by binning values and counting frequencies.
+- If data is Gaussian, the histogram approximates a bell curve.
+
+**Kernel Density Estimation (KDE)**:
+
+- A non-parametric way to estimate the PDF by smoothing individual data points with a kernel function.
+- The KDE for a dataset :math:`X = \{x_1, x_2, \ldots, x_n\}` is given by:
+
+.. math::
+
+ \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)
+
+where:
+
+- :math:`K` is the kernel function (often Gaussian)
+- :math:`h` is the bandwidth (smoothing parameter)
+
+**Combined Use of Histograms and KDE**
+
+- **Histograms** offer a discrete, binned view of the data.
+- **KDE** provides a smooth, continuous estimate of the underlying distribution.
+- Together, they effectively illustrate how well the data aligns with the Gaussian assumption, highlighting any deviations from normality.
+
+KDE Distribution Function
+-----------------------------
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, grid_figsize=(10, 8), single_figsize=(6, 4), kde=True, hist_color="#0000FF", kde_color="#FF0000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=1, n_cols=1, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_path_png=None, single_var_image_path_svg=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots.
+ :type vars_of_interest: list of str, optional
+ :param grid_figsize: Size of the overall grid figure, default is ``(10, 8)``.
+ :type grid_figsize: tuple, optional
+ :param single_figsize: Size of individual figures for each variable, default is ``(6, 4)``.
+ :type single_figsize: tuple, optional
+ :param kde: Whether to include KDE plots on the histograms, default is ``True``.
+ :type kde: bool, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where
+ ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid, default is ``1``.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid, default is ``1``.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_path_png: Directory path to save the PNG images of the separate distribution plots.
+ :type single_var_image_path_png: str, optional
+ :param single_var_image_path_svg: Directory path to save the SVG images of the separate distribution plots.
+ :type single_var_image_path_svg: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots.
+ The variable name will be appended to this filename.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: List of variable names to apply log scaling.
+ :type log_scale_vars: list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: number or pair of numbers, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``,
+ ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+
+ :raises UserWarning:
+ - If ``stat`` is set to 'count' while ``kde`` is ``True``, as it may produce misleading plots.
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``kde=True`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``single_figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+The ``plot_type="hist"`` parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the kde_distributions function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``kde=False``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+        stat="density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the ``stat`` parameter is set to ``"count"`` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ single_figsize=(4, 4), # Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+        stat="count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. note::
+
+ The ``legend_labels_list`` parameter should be a list of lists, where each
+ inner list corresponds to the ground truth labels for the respective item in
+ the ``func_col`` list. Each element in the ``func_col`` list represents a
+ column in your DataFrame that you wish to analyze, and the corresponding
+ inner list in ``legend_labels_list`` should contain the labels that will be
+ used in the legend of your plots.
+
+For example:
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"], # Corresponds to "sex"
+ ["<=50K", ">50K"], # Corresponds to "income"
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. important::
+
+ Ensure that the number of elements in ``func_col``, ``legend_labels_list``,
+ and ``title`` are the same. Each item in ``func_col`` must have a corresponding
+ list of labels in ``legend_labels_list`` and a title in ``title``. This
+ consistency is essential for the function to correctly generate the plots
+ with the appropriate labels and titles.
+
+
+In this example:
+
+- ``func_col`` contains two elements: ``"sex"`` and ``"income"``. Each corresponds to a specific column in your DataFrame.
+- ``legend_labels_list`` is a nested list containing two inner lists:
+
+ - The first inner list, ``["Male", "Female"]``, corresponds to the ``"sex"`` column in ``func_col``.
+ - The second inner list, ``["<=50K", ">50K"]``, corresponds to the ``"income"`` column in ``func_col``.
+
+- ``title`` contains two elements: ``"Sex"`` and ``"Income"``, which will be used as the titles for the respective plots.
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are rendered with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+ :alt: KDE Distributions
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+    crosstab_age_sex = stacked_crosstabs["sex"]
+    crosstab_age_income = stacked_crosstabs["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_boxplot_comp, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_boxplot_comp: List of comparison categories (columns in df).
+ :type metrics_boxplot_comp: list of str
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is (``6, 4``).
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of None, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is specified but is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the ``box_violin_plot`` function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_boxplot_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with `3` rows and `4`
+columns, using the ``n_rows=3`` and ``n_cols=4`` parameters. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the ``save_plots`` option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed both individually and in a grid format, as indicated by the
+``show_plot="both"`` parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_boxplot_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_boxplot_comp=metrics_boxplot_comp,
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+Pearson Correlation Coefficient
+--------------------------------
+
+The Pearson correlation coefficient, often denoted as :math:`r`, is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from :math:`-1` to :math:`1`, where:
+
+- :math:`r = 1` indicates a perfect positive linear relationship.
+- :math:`r = -1` indicates a perfect negative linear relationship.
+- :math:`r = 0` indicates no linear relationship.
+
+The Pearson correlation coefficient between two variables :math:`X` and :math:`Y` is defined as:
+
+.. math::
+
+ r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
+
+where:
+
+- :math:`\text{Cov}(X, Y)` is the covariance of :math:`X` and :math:`Y`.
+- :math:`\sigma_X` is the standard deviation of :math:`X`.
+- :math:`\sigma_Y` is the standard deviation of :math:`Y`.
+
+Covariance measures how much two variables change together. It is defined as:
+
+.. math::
+
+ \text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)
+
+where:
+
+- :math:`n` is the number of data points.
+- :math:`X_i` and :math:`Y_i` are the individual data points.
+- :math:`\mu_X` and :math:`\mu_Y` are the means of :math:`X` and :math:`Y`.
+
+The standard deviation measures the dispersion or spread of a set of values. For
+a variable :math:`X`, the standard deviation :math:`\sigma_X` is:
+
+.. math::
+
+ \sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)^2}
+
+Substituting the covariance and standard deviation into the Pearson correlation formula:
+
+.. math::
+
+ r_{XY} = \frac{\sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n} (X_i - \mu_X)^2} \sqrt{\sum_{i=1}^{n} (Y_i - \mu_Y)^2}}
+
+This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between :math:`X` and :math:`Y`.
+
+- :math:`r > 0`: Positive correlation. As :math:`X` increases, :math:`Y` tends to increase.
+- :math:`r < 0`: Negative correlation. As :math:`X` increases, :math:`Y` tends to decrease.
+- :math:`r = 0`: No linear correlation. There is no consistent linear relationship between :math:`X` and :math:`Y`.
+
+The closer the value of :math:`r` is to :math:`\pm 1`, the stronger the linear relationship between the two variables.
+
+Scatter Fit Plot
+------------------
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars, y_vars, n_rows, n_cols, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None)
+
+ Create and save scatter plots or a grid of scatter plots for given x_vars
+ and y_vars, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the `x-axis`.
+ :type x_vars: list of str
+
+ :param y_vars: List of variable names to plot on the `y-axis`.
+ :type y_vars: list of str
+
+ :param n_rows: Number of rows in the subplot grid.
+ :type n_rows: int
+
+ :param n_cols: Number of columns in the subplot grid.
+ :type n_cols: int
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for `x-axis` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the `x-axis` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the `y-axis` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves scatter plots for the specified `x_vars` and `y_vars`.
+
+
+Regression-Centric Scatter Plots Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ n_rows=3,
+ n_cols=4,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="grid",
+ show_legend=True,
+ xlabel_rot=0,
+ show_plot="grid",
+ rotate_plot=False,
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Correlation Matrices
+=====================
+
+**Generate and Save Customizable Correlation Heatmaps**
+
+The ``flex_corr_matrix`` function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
+**Customizable Plot Appearance**
+
+The function provides extensive customization options for the heatmap's appearance:
+
+- **Colormap Selection**: Choose from a variety of colormaps to represent the strength of correlations. The default is ``"coolwarm"``, but this can be adjusted to fit the needs of the analysis.
+
+- **Annotation**: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
+- **Figure Size and Layout**: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+**Triangular vs. Full Correlation Matrix**
+
+
+A key feature of the ``flex_corr_matrix`` function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
+**Label and Axis Configuration**
+
+
+The function offers flexibility in configuring axis labels and titles:
+
+- **Label Rotation**: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+- **Font Sizes**: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+- **Title Wrapping**: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+**Plot Display and Saving Options**
+
+
+The ``flex_corr_matrix`` function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
+
+.. function:: flex_corr_matrix(df, cols=None, annot=True, cmap="coolwarm", save_plots=False, image_path_png=None, image_path_svg=None, figsize=(10, 10), title="Cervical Cancer Data: Correlation Matrix", label_fontsize=12, tick_fontsize=10, xlabel_rot=45, ylabel_rot=0, xlabel_alignment="right", ylabel_alignment="center_baseline", text_wrap=50, vmin=-1, vmax=1, cbar_label="Correlation Index", triangular=True, **kwargs)
+
+ Create a customizable correlation heatmap with options for annotation, color mapping, figure size, and saving the plot.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param cols: List of column names to include in the correlation matrix. If None, all columns are included.
+ :type cols: list of str, optional
+
+ :param annot: Whether to annotate the heatmap with correlation coefficients. Default is ``True``.
+ :type annot: bool, optional
+
+ :param cmap: The colormap to use for the heatmap. Default is ``"coolwarm"``.
+ :type cmap: str, optional
+
+ :param save_plots: Controls whether to save the plots. Default is ``False``.
+ :type save_plots: bool, optional
+
+ :param image_path_png: Directory path to save PNG images of the heatmap.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the heatmap.
+ :type image_path_svg: str, optional
+
+ :param figsize: Width and height of the figure for the heatmap. Default is ``(10, 10)``.
+ :type figsize: tuple, optional
+
+ :param title: Title of the heatmap. Default is ``"Cervical Cancer Data: Correlation Matrix"``.
+ :type title: str, optional
+
+ :param label_fontsize: Font size for tick labels and colorbar label. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param xlabel_rot: Rotation angle for x-axis labels. Default is ``45``.
+ :type xlabel_rot: int, optional
+
+ :param ylabel_rot: Rotation angle for y-axis labels. Default is ``0``.
+ :type ylabel_rot: int, optional
+
+ :param xlabel_alignment: Horizontal alignment for x-axis labels. Default is ``"right"``.
+ :type xlabel_alignment: str, optional
+
+ :param ylabel_alignment: Vertical alignment for y-axis labels. Default is ``"center_baseline"``.
+ :type ylabel_alignment: str, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param vmin: Minimum value for the heatmap color scale. Default is ``-1``.
+ :type vmin: float, optional
+
+ :param vmax: Maximum value for the heatmap color scale. Default is ``1``.
+ :type vmax: float, optional
+
+ :param cbar_label: Label for the colorbar. Default is ``"Correlation Index"``.
+ :type cbar_label: str, optional
+
+ :param triangular: Whether to show only the upper triangle of the correlation matrix. Default is ``True``.
+ :type triangular: bool, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``seaborn.heatmap()``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``annot`` is not a boolean.
+ - If ``cols`` is not a list.
+ - If ``save_plots`` is not a boolean.
+ - If ``triangular`` is not a boolean.
+ - If ``save_plots`` is True but no image paths are provided.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves a correlation heatmap.
+
+Triangular Correlation Matrix Example
+--------------------------------------
+
+The provided code filters the census [1]_ DataFrame ``df`` to include only numeric columns using
+``select_dtypes(np.number)``. It then utilizes the ``flex_corr_matrix()`` function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+.. note::
+
+ This triangular matrix format is particularly useful for avoiding
+ redundancy in correlation matrices, as it excludes the lower half,
+ making it easier to focus on unique pairwise correlations.
+
+The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
+.. code-block:: python
+
+ # Select only numeric data to pass into the function
+ df_num = df.select_dtypes(np.number)
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="coolwarm",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Full Correlation Matrix Example
+----------------------------------
+
+In this modified census [1]_ example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting ``cmap="viridis"``, the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting ``triangular=False``, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="viridis",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Partial Dependence Plots
+=========================
+
+**Partial Dependence Plots (PDPs)** are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model's predictions.
+
+Theoretical Foundation of PDPs
+--------------------------------
+
+Let :math:`\mathbf{X}` represent the complete set of input features for a machine
+learning model, where :math:`\mathbf{X} = \{X_1, X_2, \dots, X_p\}`. Suppose we're
+particularly interested in a subset of these features, denoted by :math:`\mathbf{X}_S`.
+The complementary set, :math:`\mathbf{X}_C`, contains all the features in :math:`\mathbf{X}`
+that are not in :math:`\mathbf{X}_S`. Mathematically, this relationship is expressed as:
+
+.. math::
+
+ \mathbf{X}_C = \mathbf{X} \setminus \mathbf{X}_S
+
+where :math:`\mathbf{X}_C` is the set of features in :math:`\mathbf{X}` after
+removing the features in :math:`\mathbf{X}_S`.
+
+Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in :math:`\mathbf{X}_S` on the model's predictions, while averaging out the
+influence of the features in :math:`\mathbf{X}_C`. This is mathematically defined as:
+
+.. math::
+ \begin{align*}
+ \text{PD}_{\mathbf{X}_S}(x_S) &= \mathbb{E}_{\mathbf{X}_C} \left[ f(x_S, \mathbf{X}_C) \right] \\
+ &= \int f(x_S, x_C) \, p(x_C) \, dx_C
+ \end{align*}
+
+
+where:
+
+- :math:`\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]` indicates that we are taking the expected value over the possible values of the features in the set :math:`\mathbf{X}_C`.
+- :math:`p(x_C)` represents the probability density function of the features in :math:`\mathbf{X}_C`.
+
+This operation effectively summarizes the model's output over all potential values of the complementary features, providing a clear view of how the features in :math:`\mathbf{X}_S` alone impact the model's predictions.
+
+
+**2D Partial Dependence Plots**
+
+Consider a trained machine learning model :math:`f(\mathbf{X})`, where :math:`\mathbf{X} = (X_1, X_2, \dots, X_p)` represents the vector of input features. The partial dependence of the predicted response :math:`\hat{y}` on a single feature :math:`X_j` is defined as:
+
+.. math::
+
+ \text{PD}(X_j) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, \mathbf{X}_{C_i})
+
+where:
+
+- :math:`X_j` is the feature of interest.
+- :math:`\mathbf{X}_{C_i}` represents the complement set of :math:`X_j`, meaning the remaining features in :math:`\mathbf{X}` not included in :math:`X_j` for the :math:`i`-th instance.
+- :math:`n` is the number of observations in the dataset.
+
+For two features, :math:`X_j` and :math:`X_k`, the partial dependence is given by:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of :math:`X_j` and :math:`X_k` vary, while the effects of the other features are averaged out.
+
+- **Single Feature PDP:** When plotting :math:`\text{PD}(X_j)`, the result is a 2D line plot showing the marginal effect of feature :math:`X_j` on the predicted outcome, averaged over all possible values of the other features.
+- **Two Features PDP:** When plotting :math:`\text{PD}(X_j, X_k)`, the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of :math:`X_j` and :math:`X_k` on the predicted outcome. The surface represents the expected value of the prediction as :math:`X_j` and :math:`X_k` vary, while all other features are averaged out.
+
+
+**3D Partial Dependence Plots**
+
+For a more comprehensive analysis, especially when exploring interactions between two features, 3D Partial Dependence Plots are invaluable. The partial dependence function for two features in a 3D context is:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+Here, the function :math:`f(X_j, X_k, \mathbf{X}_{C_i})` is evaluated across a grid of values for :math:`X_j` and :math:`X_k`. The resulting 3D surface plot represents how the model's prediction changes over the joint range of these two features.
+
+The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model's predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+- **Feature Interaction Visualization:** The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+- **Enhanced Interpretation:** 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
+
+
+2D Partial Dependence Plots
+-----------------------------
+
+The ``plot_2d_pdp`` function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+- **Grid and Individual Plots**: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+- **Customization Options**: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+- **Saving Plots**: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
+
+.. function:: plot_2d_pdp(model, X_train, feature_names, features, title="PDP of house value on CA non-location features", grid_resolution=50, plot_type="grid", grid_figsize=(12, 8), individual_figsize=(6, 4), label_fontsize=12, tick_fontsize=10, text_wrap=50, image_path_png=None, image_path_svg=None, save_plots=None, file_prefix="partial_dependence")
+
+ Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param X_train: The training data used to compute partial dependence. Should correspond to the features used to train the model.
+ :type X_train: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names: A list of feature names corresponding to the columns in ``X_train``.
+ :type feature_names: list of str
+
+ :param features: A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+ :type features: list of int or tuple of int
+
+ :param title: The title for the entire plot. Default is ``"PDP of house value on CA non-location features"``.
+ :type title: str, optional
+
+ :param grid_resolution: The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is ``50``.
+ :type grid_resolution: int, optional
+
+ :param plot_type: The type of plot to generate. Choose ``"grid"`` for a grid layout, ``"individual"`` for separate plots, or ``"both"`` to generate both layouts. Default is ``"grid"``.
+ :type plot_type: str, optional
+
+ :param grid_figsize: Tuple specifying the width and height of the figure for the grid layout. Default is ``(12, 8)``.
+ :type grid_figsize: tuple, optional
+
+ :param individual_figsize: Tuple specifying the width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple, optional
+
+ :param label_fontsize: Font size for the axis labels and titles. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for the axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param image_path_png: The directory path where PNG images of the plots will be saved, if saving is enabled.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: The directory path where SVG images of the plots will be saved, if saving is enabled.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls whether to save the plots. Options include ``"all"``, ``"individual"``, ``"grid"``, or ``None`` (default). If saving is enabled, ensure ``image_path_png`` or ``image_path_svg`` are provided.
+ :type save_plots: str, optional
+
+ :param file_prefix: Prefix for the filenames of the saved grid plots. Default is ``"partial_dependence"``.
+ :type file_prefix: str, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``"grid"``, ``"individual"``, or ``"both"``.
+ - If ``save_plots`` is enabled but neither ``image_path_png`` nor ``image_path_svg`` is provided.
+
+ :returns: ``None``
+ This function generates partial dependence plots and displays them. It does not return any values.
+
+
+2D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
+For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
+If you extend this to two interacting features, such as ``AveOccup`` and ``HouseAge``,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
+Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import GradientBoostingRegressor
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=42,
+ )
+ model.fit(X_train, y_train)
+
+
+**Create 2D Partial Dependence Plot Grid**
+
+.. code-block:: python
+
+ # import the plot_2d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_2d_pdp
+
+ # Feature names
+ names = data.feature_names
+
+ # Generate 2D partial dependence plots
+ plot_2d_pdp(
+ model=model,
+ X_train=X_train,
+ feature_names=names,
+ features=[
+ "MedInc",
+ "AveOccup",
+ "HouseAge",
+ "AveRooms",
+ "Population",
+ ("AveOccup", "HouseAge"),
+ ],
+ title="PDP of house value on CA non-location features",
+ grid_figsize=(14, 10),
+ individual_figsize=(12, 4),
+ label_fontsize=14,
+ tick_fontsize=12,
+ text_wrap=120,
+ plot_type="grid",
+ image_path_png="path/to/save/png",
+ save_plots="all",
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+3D Partial Dependence Plots
+-----------------------------
+
+The ``plot_3d_pdp`` function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+- **Interactive and Static 3D Plots**: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+- **Colormap and Layout Customization**: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+- **Axis and Title Configuration**: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
+
+.. function:: plot_3d_pdp(model, dataframe, feature_names_list, x_label=None, y_label=None, z_label=None, title=None, html_file_path=None, html_file_name=None, image_filename=None, plot_type="both", matplotlib_colormap=None, plotly_colormap="Viridis", zoom_out_factor=None, wireframe_color=None, view_angle=(22, 70), figsize=(7, 4.5), text_wrap=50, horizontal=-1.25, depth=1.25, vertical=1.25, cbar_x=1.05, cbar_thickness=25, title_x=0.5, title_y=0.95, top_margin=100, image_path_png=None, image_path_svg=None, show_cbar=True, grid_resolution=20, left_margin=20, right_margin=65, label_fontsize=8, tick_fontsize=6, enable_zoom=True, show_modebar=True)
+
+ Generate 3D partial dependence plots for two features of a machine learning model.
+
+ This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param dataframe: The dataset on which the model was trained or a representative sample. If a DataFrame is provided, ``feature_names_list`` should correspond to the column names. If a NumPy array is provided, ``feature_names_list`` should correspond to the indices of the columns.
+ :type dataframe: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names_list: A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+ :type feature_names_list: list of str
+
+ :param x_label: Label for the x-axis in the plots. Default is ``None``.
+ :type x_label: str, optional
+
+ :param y_label: Label for the y-axis in the plots. Default is ``None``.
+ :type y_label: str, optional
+
+ :param z_label: Label for the z-axis in the plots. Default is ``None``.
+ :type z_label: str, optional
+
+ :param title: The title for the plots.
+ :type title: str
+
+ :param html_file_path: Path to save the interactive Plotly HTML file. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_path: str, optional
+
+ :param html_file_name: Name of the HTML file to save the interactive Plotly plot. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_name: str, optional
+
+ :param image_filename: Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is ``None``.
+ :type image_filename: str, optional
+
+ :param plot_type: The type of plots to generate. Options are:
+ - ``"static"``: Generate only static Matplotlib plots.
+ - ``"interactive"``: Generate only interactive Plotly plots.
+ - ``"both"``: Generate both static and interactive plots. Default is ``"both"``.
+ :type plot_type: str, optional
+
+ :param matplotlib_colormap: Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+ :type matplotlib_colormap: matplotlib.colors.Colormap, optional
+
+ :param plotly_colormap: Colormap for the Plotly plot. Default is ``"Viridis"``.
+ :type plotly_colormap: str, optional
+
+ :param zoom_out_factor: Factor to adjust the zoom level of the Plotly plot. Default is ``None``.
+ :type zoom_out_factor: float, optional
+
+ :param wireframe_color: Color for the wireframe in the Matplotlib plot. If ``None``, no wireframe is plotted. Default is ``None``.
+ :type wireframe_color: str, optional
+
+ :param view_angle: Elevation and azimuthal angles for the Matplotlib plot view. Default is ``(22, 70)``.
+ :type view_angle: tuple, optional
+
+ :param figsize: Figure size for the Matplotlib plot. Default is ``(7, 4.5)``.
+ :type figsize: tuple, optional
+
+ :param text_wrap: Maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param horizontal: Horizontal camera position for the Plotly plot. Default is ``-1.25``.
+ :type horizontal: float, optional
+
+ :param depth: Depth camera position for the Plotly plot. Default is ``1.25``.
+ :type depth: float, optional
+
+ :param vertical: Vertical camera position for the Plotly plot. Default is ``1.25``.
+ :type vertical: float, optional
+
+ :param cbar_x: Position of the color bar along the x-axis in the Plotly plot. Default is ``1.05``.
+ :type cbar_x: float, optional
+
+ :param cbar_thickness: Thickness of the color bar in the Plotly plot. Default is ``25``.
+ :type cbar_thickness: int, optional
+
+ :param title_x: Horizontal position of the title in the Plotly plot. Default is ``0.5``.
+ :type title_x: float, optional
+
+ :param title_y: Vertical position of the title in the Plotly plot. Default is ``0.95``.
+ :type title_y: float, optional
+
+ :param top_margin: Top margin for the Plotly plot layout. Default is ``100``.
+ :type top_margin: int, optional
+
+ :param image_path_png: Directory path to save the PNG file of the Matplotlib plot. Default is None.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save the SVG file of the Matplotlib plot. Default is None.
+ :type image_path_svg: str, optional
+
+ :param show_cbar: Whether to display the color bar in the Matplotlib plot. Default is ``True``.
+ :type show_cbar: bool, optional
+
+ :param grid_resolution: The resolution of the grid for computing partial dependence. Default is ``20``.
+ :type grid_resolution: int, optional
+
+ :param left_margin: Left margin for the Plotly plot layout. Default is ``20``.
+ :type left_margin: int, optional
+
+ :param right_margin: Right margin for the Plotly plot layout. Default is ``65``.
+ :type right_margin: int, optional
+
+ :param label_fontsize: Font size for axis labels in the Matplotlib plot. Default is ``8``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for tick labels in the Matplotlib plot. Default is ``6``.
+ :type tick_fontsize: int, optional
+
+ :param enable_zoom: Whether to enable zooming in the Plotly plot. Default is ``True``.
+ :type enable_zoom: bool, optional
+
+ :param show_modebar: Whether to display the mode bar in the Plotly plot. Default is ``True``.
+ :type show_modebar: bool, optional
+
+ :raises ValueError:
+ - If `plot_type` is not one of ``"static"``, ``"interactive"``, or ``"both"``.
+ - If `plot_type` is ``"interactive"`` or ``"both"`` and ``html_file_path`` or ``html_file_name`` are not provided.
+
+ :returns: ``None``
+ This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+ :notes:
+ - This function handles warnings related to scikit-learn's ``partial_dependence`` function, specifically a ``FutureWarning`` related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+ - To maintain compatibility with different versions of scikit-learn, the function attempts to use ``"values"`` for grid extraction in newer versions and falls back to ``"grid_values"`` for older versions.
+
+
+3D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
+For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between ``AveOccup`` and ``HouseAge`` in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
+This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
+Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
+
+Static Plot
+^^^^^^^^^^^^^^^^^
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=1,
+ )
+ model.fit(X_train, y_train)
+
+**Create Static 3D Partial Dependence Plot**
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8, 5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png="path/to/save/png",
+ grid_resolution=30,
+ )
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Flexible `save_formats` Input:
+- save_formats now accepts a string, tuple, or list for specifying formats (e.g., “png”, (“png”, “svg”), or [“png”, “svg”]).
+- Single strings or tuples are automatically converted to lists for consistent processing.
+
Dynamic Error Handling:
+- Added checks to ensure a valid path is provided for each format in save_formats.
+- Raises a ValueError if a format is specified without a corresponding path, with a clear, dynamic error message.
+
Improved Plot Saving Logic:
+- Updated logic allows saving plots in one format (e.g., only “png” or “svg”) without requiring the other.
+- Simplified and more intuitive path handling for saving plots.
This update introduces several key changes to the plot_3d_pdp function, simplifying the function’s interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
1. Parameter Changes
+
+
Removed Parameters:
+
+
The parameters x_label_plotly, y_label_plotly, and z_label_plotly have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general x_label, y_label, and z_label. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+
+
Default Values for Labels:
+
+
The parameters x_label, y_label, and z_label are now optional, with None as the default. If not provided, these labels will automatically default to the names of the features in the feature_names_list. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+
+
Changes in Default Values for View Angles:
+
+
The default values for camera positioning parameters have been updated: horizontal is now -1.25, depth is now 1.25, and vertical is now 1.25. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+
+
+
2. Plot Generation Logic
+
+
Conditionally Checking Labels:
+
+
The function now checks whether x_label, y_label, and z_label are provided. If these are None, the function will automatically assign default labels based on the feature_names_list. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+
+
Camera Position Adjustments:
+
+
The camera positions for the Plotly plot are now adjusted by multiplying horizontal, depth, and vertical by zoom_out_factor. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+
+
Surface Plot Coordinates Adjustments:
+
+
The order of the coordinates for the Plotly plot’s surface has been changed from ZZ,XX,YY[::-1] to ZZ,XX,YY. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+
+
+
3. Code Simplifications
+
+
Removed Complexity:
+
+
By removing the x_label_plotly, y_label_plotly, and z_label_plotly parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+
+
Fallback Mechanism for Grid Values:
+
+
The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+
+
+
4. Style Adjustments
+
+
Label Formatting:
+
+
The new version consistently uses y_label, x_label, and z_label for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+
+
Color Bar Adjustments:
+
+
The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of 0.6 and a pad value of 0.02. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+
+
+
5. Potential Use Case Differences
+
+
Simplified Interface:
+
+
The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+
+
Less Granular Control:
+
+
Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the *_plotly label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+
+
+
6. Matplotlib Plot Adjustments
+
+
Wireframe and Surface Plot Enhancements:
+
+
The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+
+
+
Summary
+
+
Version 0.0.8d of the plot_3d_pdp function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+
Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+
Decision Point
+
+
This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
Key Updates in 0.0.8c:
+
+
Bug Fixes: Addressed minor issues identified in version 0.0.8b to ensure smoother performance and better user experience.
+
Additional Testing: Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+
Refinements: Made small enhancements to existing features based on user feedback and internal testing results.
+
+
Summary of Changes
+
+
New Features & Enhancements
+
+
+
plot_3d_pdp Function:
+
+
Added show_modebar Parameter: Introduced a new boolean parameter, show_modebar, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
Custom Margins and Layout Adjustments:
+
+
Added parameters for left_margin, right_margin, and top_margin to provide users with more control over the plot layout in Plotly.
+
Adjusted default values and added options for better customization of the Plotly color bar (cbar_x, cbar_thickness) and title positioning (title_x, title_y).
+
+
+
Plotly Configuration:
+
+
Enhanced the configuration options to allow users to enable or disable zoom functionality (enable_zoom) in the interactive Plotly plots.
+
Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+
+
Error Handling:
+
+
Added input validation for html_file_path and html_file_name to ensure these are provided when necessary based on the selected plot_type.
+
+
+
+
+
plot_2d_pdp Function:
+
+
Introduced file_prefix Parameter:
+
+
Added a new file_prefix parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+
+
Enhanced Plot Type Flexibility:
+
+
The plot_type parameter now includes an option to generate both grid and individual plots (both). This feature allows users to create a combination of both layout styles in one function call.
+
Updated input validation and logic to handle this new option effectively.
+
+
+
Added save_plots Parameter:
+
+
Introduced a new parameter, save_plots, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+
+
Custom Margins and Layout Adjustments:
+
+
Included the save_plots parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+
+
+
+
+
+
Documentation Updates
+
+
+
Docstrings:
+
+
Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
Detailed the use of new parameters such as show_modebar, file_prefix, save_plots, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+
+
+
+
Refactoring & Code Cleanup
+
+
+
Code Structure:
+
+
Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
Version 0.0.8b is an exact replica of version 0.0.8a. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
New Features:
+
+
Optional file_prefix in stacked_crosstab_plot Function
+
+
The stacked_crosstab_plot function has been updated to make the file_prefix argument optional. If the user does not provide a file_prefix, the function will now automatically generate a default prefix based on the col and func_col parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
Key Improvement:
+
+
Users can now omit the file_prefix argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
Backward compatibility is maintained, allowing users who prefer to specify a custom file_prefix to continue doing so without any issues.
+
+
+
+
+
Introduction of 3D and 2D Partial Dependence Plot Functions
+
+
Two new functions, plot_3d_pdp and plot_2d_pdp, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+
plot_3d_pdp: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
plot_2d_pdp: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+
+
Key Features:
+
+
Compatibility: Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
Customization: Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
Interactive 3D Plots: The plot_3d_pdp function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+
+
+
+
+
Impact:
+
+
These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+
The optional file_prefix enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+
The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
Added Function for Customizable Correlation Matrix Visualization
+
This release introduces a new function, flex_corr_matrix, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to seaborn.heatmap().
+
Summary of Changes
+
+
New Function: flex_corr_matrix.
+
+
Functionality:
+- Generates a correlation heatmap for a given DataFrame.
+- Supports both full and upper triangular correlation matrices based on the triangular parameter.
+- Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+- Accepts additional keyword arguments via **kwargs to pass directly to seaborn.heatmap().
+- Includes validation to ensure the triangular, annot, and save_plots parameters are boolean values.
+- Raises an exception if save_plots=True but neither image_path_png nor image_path_svg is specified.
+
+
+
+
Usage
+
# Full correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=False,cmap="coolwarm",annot=True)
+
+# Upper triangular correlation matrix example
+flex_corr_matrix(df=my_dataframe,triangular=True,cmap="coolwarm",annot=True)
+
+
+
Contingency table df to object type
+
Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(),fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distributions():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distributions().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.8/genindex.html b/_build/html/v0.0.8/genindex.html
new file mode 100644
index 000000000..f094db961
--- /dev/null
+++ b/_build/html/v0.0.8/genindex.html
@@ -0,0 +1,354 @@
+
+
+
+
+
+
+
+ Index — EDA Toolkit 0.0.8 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
+
+
+
+
+
+
+
Index
+
+
+ A
+ | B
+ | C
+ | D
+ | E
+ | F
+ | H
+ | K
+ | P
+ | S
+
+
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.8.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+import os  # import operating system for dir
+
+
+base_path=os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path=os.path.join(os.pardir,"data")
+data_output=os.path.join(os.pardir,"data_output")
+
+# create image paths
+image_path_png=os.path.join(base_path,"images","png_images")
+image_path_svg=os.path.join(base_path,"images","svg_images")
+
+# Use the function to ensure 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str) – The name of the new column for the IDs.
+
num_digits (int) – The number of digits for the unique IDs.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df=add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data={
+ "values":[1.0,2.0,3.0,4.0,5.0,6.],
+}
+df=pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df=strip_trailing_period(df=df,column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: The last row shows the value 6 with a trailing period (6.) before conversion, and its float representation (6.0) after conversion.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings=["15/04/2021","04/15/2021","01/12/2020","12/01/2020"]
+
+# Standardize the date strings
+standardized_dates=[parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data={
+ "date_column":[
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name":["Alice","Bob","Charlie","David","Eve"],
+ "amount":[100.0,150.5,200.75,250.25,300.0],
+}
+
+df=pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"]=df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
+
Returns:
+
A DataFrame with the analysis results for each column.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars=[
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables,all_combinations=summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
Notes:
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name="df_census.xlsx"# Name of the output Excel file
+file_path=os.path.join(data_path,file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df=df[(df["age"]>18)&(df["age"]<40)]
+
+df_dict={
+ "original_df":df,
+ "ages_18_to_40":filtered_df,
+}
+
+save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame with the specified columns, 'Total', and 'Percentage'.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df=highlight_columns(
+ df=df,
+ columns=["age","education"],
+ color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or “bins.” This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or “bins,” and
+then assigning each value to one of these intervals.
+
+
Note
+
The code snippets below create age bins and assign a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
+
+
Below, we use the age column of the census data [1] from the UCI Machine Learning Repository as an example:
+
+
Bins Definition:
+The bins are defined by specifying the boundaries of each interval. For example,
+in the code snippet below, the bin_ages list specifies the boundaries for age groups:
These labels are used to categorize the numerical values into meaningful groups.
+
+
Applying the Binning:
+The pd.cut function
+from Pandas is used to apply the binning process. For each value in the age
+column of the DataFrame, it assigns a corresponding label based on which bin the
+value falls into. Here, right=False indicates that each bin includes the
+left endpoint but excludes the right endpoint. For example, if bin_ages=
+[0,10,20,30], then a value of 10 will fall into the bin [10,20) and
+be labeled accordingly.
The parameter right=False in pd.cut means that the bins are left-inclusive
+and right-exclusive, except for the last bin, which is always right-inclusive
+when the upper bound is infinity (float("inf")).
The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
+
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
+
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
+
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
+0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
+
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
+
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
+The variable name will be appended to this filename.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (number or pair of numbers, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
+'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The kde=True parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
The plot_type="hist" parameter indicates that only histograms will be generated
+for each variable. Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=True,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ fill=True,
+ fill_alpha=0.60,
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+kde=False, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1, n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ stat="Density",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The `y-axis` label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to “Count” to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1, n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ kde=False,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ single_figsize=(4,4),# Size of individual figures
+ w_pad=1,
+ h_pad=1,
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
+
Note
+
The legend_labels_list parameter should be a list of lists, where each
+inner list corresponds to the ground truth labels for the respective item in
+the func_col list. Each element in the func_col list represents a
+column in your DataFrame that you wish to analyze, and the corresponding
+inner list in legend_labels_list should contain the labels that will be
+used in the legend of your plots.
+
+
For example:
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],# Corresponds to "sex"
+ ["<=50K",">50K"],# Corresponds to "income"
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Important
+
Ensure that the number of elements in func_col, legend_labels_list,
+and title are the same. Each item in func_col must have a corresponding
+list of labels in legend_labels_list and a title in title. This
+consistency is essential for the function to correctly generate the plots
+with the appropriate labels and titles.
+
+
In this example:
+
+
func_col contains two elements: "sex" and "income". Each corresponds to a specific column in your DataFrame.
+
legend_labels_list is a nested list containing two inner lists:
+
+
+
The first inner list, ["Male","Female"], corresponds to the "sex" column in func_col.
+
The second inner list, ["<=50K",">50K"], corresponds to the "income" column in func_col.
+
+
+
+
title contains two elements: "Sex" and "Income", which will be used as the titles for the respective plots.
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plotfunction. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is specified but is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_boxplot_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format with 3 rows and 4
+columns, using the n_rows=3 and n_cols=4 parameters. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the `x-axis` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
The Pearson correlation coefficient, often denoted as \(r\), is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from \(-1\) to \(1\), where:
+
+
\(r = 1\) indicates a perfect positive linear relationship.
+
\(r = -1\) indicates a perfect negative linear relationship.
+
\(r = 0\) indicates no linear relationship.
+
+
The Pearson correlation coefficient between two variables \(X\) and \(Y\) is defined as:
This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between \(X\) and \(Y\).
+
+
\(r > 0\): Positive correlation. As \(X\) increases, \(Y\) tends to increase.
+
\(r < 0\): Negative correlation. As \(X\) increases, \(Y\) tends to decrease.
+
\(r = 0\): No linear correlation. There is no consistent linear relationship between \(X\) and \(Y\).
+
+
The closer the value of \(r\) is to \(\pm 1\), the stronger the linear relationship between the two variables.
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str) – List of variable names to plot on the x-axis.
+
y_vars (list of str) – List of variable names to plot on the y-axis.
+
n_rows (int) – Number of rows in the subplot grid.
+
n_cols (int) – Number of columns in the subplot grid.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
Generate and Save Customizable Correlation Heatmaps
+
The flex_corr_matrix function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
Customizable Plot Appearance
+
The function provides extensive customization options for the heatmap’s appearance:
+
+
Colormap Selection: Choose from a variety of colormaps to represent the strength of correlations. The default is "coolwarm", but this can be adjusted to fit the needs of the analysis.
+
Annotation: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
Figure Size and Layout: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+
Triangular vs. Full Correlation Matrix
+
A key feature of the flex_corr_matrix function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
Label and Axis Configuration
+
The function offers flexibility in configuring axis labels and titles:
+
+
Label Rotation: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+
Font Sizes: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+
Title Wrapping: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+
Plot Display and Saving Options
+
The flex_corr_matrix function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
The provided code filters the census [1] DataFrame df to include only numeric columns using
+select_dtypes(np.number). It then utilizes the flex_corr_matrix() function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+
Note
+
This triangular matrix format is particularly useful for avoiding
+redundancy in correlation matrices, as it excludes the lower half,
+making it easier to focus on unique pairwise correlations.
+
+
The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
# Select only numeric data to pass into the function
+df_num = df.select_dtypes(np.number)
+
In this modified census [1] example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting cmap="viridis", the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting triangular=False, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
Partial Dependence Plots (PDPs) are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model’s predictions.
Let \(\mathbf{X}\) represent the complete set of input features for a machine
+learning model, where \(\mathbf{X} = \{X_1, X_2, \dots, X_p\}\). Suppose we’re
+particularly interested in a subset of these features, denoted by \(\mathbf{X}_S\).
+The complementary set, \(\mathbf{X}_C\), contains all the features in \(\mathbf{X}\)
+that are not in \(\mathbf{X}_S\). Mathematically, this relationship is expressed as:
where \(\mathbf{X}_C\) is the set of features in \(\mathbf{X}\) after
+removing the features in \(\mathbf{X}_S\).
+
Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in \(\mathbf{X}_S\) on the model’s predictions, while averaging out the
+influence of the features in \(\mathbf{X}_C\). This is mathematically defined as:
\(\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]\) indicates that we are taking the expected value over the possible values of the features in the set \(\mathbf{X}_C\).
+
\(p(x_C)\) represents the probability density function of the features in \(\mathbf{X}_C\).
+
+
This operation effectively summarizes the model’s output over all potential values of the complementary features, providing a clear view of how the features in \(\mathbf{X}_S\) alone impact the model’s predictions.
+
2D Partial Dependence Plots
+
Consider a trained machine learning model \(f(\mathbf{X})\), where \(\mathbf{X} = (X_1, X_2, \dots, X_p)\) represents the vector of input features. The partial dependence of the predicted response \(\hat{y}\) on a single feature \(X_j\) is defined as:
\(\mathbf{X}_{C_i}\) represents the complement set of \(X_j\), meaning the remaining features in \(\mathbf{X}\) not included in \(X_j\) for the \(i\)-th instance.
+
\(n\) is the number of observations in the dataset.
+
+
For two features, \(X_j\) and \(X_k\), the partial dependence is given by:
This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of \(X_j\) and \(X_k\) vary, while the effects of the other features are averaged out.
+
+
Single Feature PDP: When plotting \(\text{PD}(X_j)\), the result is a 2D line plot showing the marginal effect of feature \(X_j\) on the predicted outcome, averaged over all possible values of the other features.
+
Two Features PDP: When plotting \(\text{PD}(X_j, X_k)\), the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of \(X_j\) and \(X_k\) on the predicted outcome. The surface represents the expected value of the prediction as \(X_j\) and \(X_k\) vary, while all other features are averaged out.
+
+
3D Partial Dependence Plots
+
For a more comprehensive analysis, especially when exploring interactions between two features, 3D Partial Dependence Plots are invaluable. The partial dependence function for two features in a 3D context is:
Here, the function \(f(X_j, X_k, \mathbf{X}_{C_i})\) is evaluated across a grid of values for \(X_j\) and \(X_k\). The resulting 3D surface plot represents how the model’s prediction changes over the joint range of these two features.
+
The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model’s predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+
Feature Interaction Visualization: The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+
Enhanced Interpretation: 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
The plot_2d_pdp function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+
Grid and Individual Plots: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+
Customization Options: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+
Saving Plots: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
X_train (pandas.DataFrame or numpy.ndarray) – The training data used to compute partial dependence. Should correspond to the features used to train the model.
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDP of house value on CA non-location features".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
If you extend this to two interacting features, such as AveOccup and HouseAge,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
Fetch The CA Housing Dataset and Prepare The DataFrame
The plot_3d_pdp function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+
Interactive and Static 3D Plots: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+
Colormap and Layout Customization: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+
Axis and Title Configuration: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
Generate 3D partial dependence plots for two features of a machine learning model.
+
This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
+- "static": Generate only static Matplotlib plots.
+- "interactive": Generate only interactive Plotly plots.
+- "both": Generate both static and interactive plots. Default is "both".
+
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
If plot_type is not one of "static", "interactive", or "both".
+
If plot_type is "interactive" or "both" and html_file_path or html_file_name are not provided.
+
+
+
+
Returns:
+
None
+This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+
Notes:
+
+
This function handles warnings related to scikit-learn’s partial_dependence function, specifically a FutureWarning related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+
To maintain compatibility with different versions of scikit-learn, the function attempts to use "values" for grid extraction in newer versions and falls back to "grid_values" for older versions.
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between AveOccup and HouseAge in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8,5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+)
+
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,  # Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+)
+
+
+
+
Warning
+
Scrolling Notice:
+
While interacting with the interactive Plotly plot below, scrolling down the
+page using the mouse wheel may be blocked when the mouse pointer is hovering
+over the plot. To continue scrolling, either move the mouse pointer outside
+the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+
+
This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here’s how it differs from
+generating a static plot using Matplotlib.
+
Key Differences
+
Plot Type:
+
+
The plot_type is set to "interactive" for the Plotly plot and "static" for the Matplotlib plot.
+
+
Interactive-Specific Parameters:
+
+
HTML File Path and Name: The html_file_path and html_file_name parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
Zoom and Positioning: The interactive plot includes parameters like zoom_out_factor, title_x, cbar_x, and cbar_thickness to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
Mode Bar and Zoom: The show_modebar and enable_zoom parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+
Static-Specific Parameters:
+
+
Figure Size and Wireframe Color: The static plot uses parameters like figsize to control the size of the Matplotlib plot and wireframe_color to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+
By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.9/.buildinfo b/_build/html/v0.0.9/.buildinfo
new file mode 100644
index 000000000..68cfefe3c
--- /dev/null
+++ b/_build/html/v0.0.9/.buildinfo
@@ -0,0 +1,4 @@
+# Sphinx build info version 1
+# This file records the configuration used when building these files. When it is not found, a full rebuild will be done.
+config: 53f25e0512cf18061513877a6cf8fc9a
+tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/_build/html/v0.0.9/.doctrees/acknowledgements.doctree b/_build/html/v0.0.9/.doctrees/acknowledgements.doctree
new file mode 100644
index 000000000..53d706e39
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/acknowledgements.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/changelog.doctree b/_build/html/v0.0.9/.doctrees/changelog.doctree
new file mode 100644
index 000000000..9209c73ef
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/changelog.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/citations.doctree b/_build/html/v0.0.9/.doctrees/citations.doctree
new file mode 100644
index 000000000..1bd9835a3
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/citations.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/contributors.doctree b/_build/html/v0.0.9/.doctrees/contributors.doctree
new file mode 100644
index 000000000..d040ba2c6
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/contributors.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/data_management.doctree b/_build/html/v0.0.9/.doctrees/data_management.doctree
new file mode 100644
index 000000000..20d1788c3
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/data_management.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/eda_plots.doctree b/_build/html/v0.0.9/.doctrees/eda_plots.doctree
new file mode 100644
index 000000000..3c84d4491
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/eda_plots.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/environment.pickle b/_build/html/v0.0.9/.doctrees/environment.pickle
new file mode 100644
index 000000000..52c523907
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/environment.pickle differ
diff --git a/_build/html/v0.0.9/.doctrees/getting_started.doctree b/_build/html/v0.0.9/.doctrees/getting_started.doctree
new file mode 100644
index 000000000..991ec521c
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/getting_started.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/index.doctree b/_build/html/v0.0.9/.doctrees/index.doctree
new file mode 100644
index 000000000..52174dbd9
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/index.doctree differ
diff --git a/_build/html/v0.0.9/.doctrees/references.doctree b/_build/html/v0.0.9/.doctrees/references.doctree
new file mode 100644
index 000000000..f8088f301
Binary files /dev/null and b/_build/html/v0.0.9/.doctrees/references.doctree differ
diff --git a/_build/html/v0.0.9/_images/2d_pdp_grid.svg b/_build/html/v0.0.9/_images/2d_pdp_grid.svg
new file mode 100644
index 000000000..641db4ba6
--- /dev/null
+++ b/_build/html/v0.0.9/_images/2d_pdp_grid.svg
@@ -0,0 +1,4405 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/3d_pdp.svg b/_build/html/v0.0.9/_images/3d_pdp.svg
new file mode 100644
index 000000000..535371233
--- /dev/null
+++ b/_build/html/v0.0.9/_images/3d_pdp.svg
@@ -0,0 +1,8326 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/Bar_Age_regular_income.svg b/_build/html/v0.0.9/_images/Bar_Age_regular_income.svg
new file mode 100644
index 000000000..6f8aa40d4
--- /dev/null
+++ b/_build/html/v0.0.9/_images/Bar_Age_regular_income.svg
@@ -0,0 +1,1201 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/Stacked_Bar_Age_income.svg b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income.svg
new file mode 100644
index 000000000..d5510308b
--- /dev/null
+++ b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income.svg
@@ -0,0 +1,1943 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_pivoted.svg b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_pivoted.svg
new file mode 100644
index 000000000..2147fce1a
--- /dev/null
+++ b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_pivoted.svg
@@ -0,0 +1,2043 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_regular.svg b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_regular.svg
new file mode 100644
index 000000000..04478581f
--- /dev/null
+++ b/_build/html/v0.0.9/_images/Stacked_Bar_Age_income_regular.svg
@@ -0,0 +1,1347 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/Stacked_Bar_Age_sex.svg b/_build/html/v0.0.9/_images/Stacked_Bar_Age_sex.svg
new file mode 100644
index 000000000..7b2bcb137
--- /dev/null
+++ b/_build/html/v0.0.9/_images/Stacked_Bar_Age_sex.svg
@@ -0,0 +1,1970 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/all_plots_comparisons_boxplot.png b/_build/html/v0.0.9/_images/all_plots_comparisons_boxplot.png
new file mode 100644
index 000000000..c4f54b520
Binary files /dev/null and b/_build/html/v0.0.9/_images/all_plots_comparisons_boxplot.png differ
diff --git a/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot.png b/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot.png
new file mode 100644
index 000000000..cc236e21c
Binary files /dev/null and b/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot.png differ
diff --git a/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot_pivoted.png b/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot_pivoted.png
new file mode 100644
index 000000000..b05150e06
Binary files /dev/null and b/_build/html/v0.0.9/_images/all_plots_comparisons_violinplot_pivoted.png differ
diff --git a/_build/html/v0.0.9/_images/count_hist_distributions.svg b/_build/html/v0.0.9/_images/count_hist_distributions.svg
new file mode 100644
index 000000000..f08328f86
--- /dev/null
+++ b/_build/html/v0.0.9/_images/count_hist_distributions.svg
@@ -0,0 +1,1719 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/density_hist_dist_age.svg b/_build/html/v0.0.9/_images/density_hist_dist_age.svg
new file mode 100644
index 000000000..717ca6bf8
--- /dev/null
+++ b/_build/html/v0.0.9/_images/density_hist_dist_age.svg
@@ -0,0 +1,1375 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/density_hist_dist_mean_median.svg b/_build/html/v0.0.9/_images/density_hist_dist_mean_median.svg
new file mode 100644
index 000000000..cd480f5ed
--- /dev/null
+++ b/_build/html/v0.0.9/_images/density_hist_dist_mean_median.svg
@@ -0,0 +1,1935 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/eda_toolkit_logo.svg b/_build/html/v0.0.9/_images/eda_toolkit_logo.svg
new file mode 100644
index 000000000..d039d6f79
--- /dev/null
+++ b/_build/html/v0.0.9/_images/eda_toolkit_logo.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/_build/html/v0.0.9/_images/hist_density_distributions.svg b/_build/html/v0.0.9/_images/hist_density_distributions.svg
new file mode 100644
index 000000000..5ce1a7191
--- /dev/null
+++ b/_build/html/v0.0.9/_images/hist_density_distributions.svg
@@ -0,0 +1,1744 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/kde_density_distributions.svg b/_build/html/v0.0.9/_images/kde_density_distributions.svg
new file mode 100644
index 000000000..b42cfeb18
--- /dev/null
+++ b/_build/html/v0.0.9/_images/kde_density_distributions.svg
@@ -0,0 +1,2301 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/normal_distribution.png b/_build/html/v0.0.9/_images/normal_distribution.png
new file mode 100644
index 000000000..837c60e0c
Binary files /dev/null and b/_build/html/v0.0.9/_images/normal_distribution.png differ
diff --git a/_build/html/v0.0.9/_images/scatter_plots_all_grid.png b/_build/html/v0.0.9/_images/scatter_plots_all_grid.png
new file mode 100644
index 000000000..78652ac74
Binary files /dev/null and b/_build/html/v0.0.9/_images/scatter_plots_all_grid.png differ
diff --git a/_build/html/v0.0.9/_images/scatter_plots_grid.png b/_build/html/v0.0.9/_images/scatter_plots_grid.png
new file mode 100644
index 000000000..5a51facd8
Binary files /dev/null and b/_build/html/v0.0.9/_images/scatter_plots_grid.png differ
diff --git a/_build/html/v0.0.9/_images/scatter_plots_grid_grouped.png b/_build/html/v0.0.9/_images/scatter_plots_grid_grouped.png
new file mode 100644
index 000000000..02a3b3916
Binary files /dev/null and b/_build/html/v0.0.9/_images/scatter_plots_grid_grouped.png differ
diff --git a/_build/html/v0.0.9/_images/summarize_combos.gif b/_build/html/v0.0.9/_images/summarize_combos.gif
new file mode 100644
index 000000000..402ee1efc
Binary files /dev/null and b/_build/html/v0.0.9/_images/summarize_combos.gif differ
diff --git a/_build/html/v0.0.9/_images/us_census_correlation_matrix.svg b/_build/html/v0.0.9/_images/us_census_correlation_matrix.svg
new file mode 100644
index 000000000..2a41e1afa
--- /dev/null
+++ b/_build/html/v0.0.9/_images/us_census_correlation_matrix.svg
@@ -0,0 +1,1766 @@
+
+
+
diff --git a/_build/html/v0.0.9/_images/us_census_correlation_matrix_full.svg b/_build/html/v0.0.9/_images/us_census_correlation_matrix_full.svg
new file mode 100644
index 000000000..d0df5da46
--- /dev/null
+++ b/_build/html/v0.0.9/_images/us_census_correlation_matrix_full.svg
@@ -0,0 +1,1907 @@
+
+
+
diff --git a/_build/html/v0.0.9/_sources/acknowledgements.rst.txt b/_build/html/v0.0.9/_sources/acknowledgements.rst.txt
new file mode 100644
index 000000000..e62da5a10
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/acknowledgements.rst.txt
@@ -0,0 +1,30 @@
+.. _acknowledgements:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Acknowledgements
+=================
+
+We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
+We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
diff --git a/_build/html/v0.0.9/_sources/changelog.rst.txt b/_build/html/v0.0.9/_sources/changelog.rst.txt
new file mode 100644
index 000000000..3dc4db485
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/changelog.rst.txt
@@ -0,0 +1,604 @@
+.. _changelog:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Changelog
+=========
+
+`Version 0.0.9`_
+----------------------
+
+.. _Version 0.0.9: https://lshpaner.github.io/eda_toolkit/v0.0.9/index.html
+
+**Bug Fixes and Minor Improvements**
+
+Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
+**Visualization Enhancements**
+
+**DataFrame Columns:** Added a ``background_color`` variable to ``dataframe_columns``,
+allowing the user to enter a string representing a color name, or hex value.
+Try/Except on the output, in case the end user has a deprecated version of Pandas,
+where the styler would use ``hide()`` instead of ``hide_index()``. The highlighted
+columns allow for easier null versus unique value analysis.
+
+The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
+**Args:**
+
+- The ``df`` argument is specified as a ``pandas.DataFrame``.
+
+- The ``background_color`` argument is marked as optional, with a brief description of its role.
+
+- The ``return_df`` argument is also marked as optional, explaining what it controls.
+
+
+**Returns:** The return type is specified as ``pandas.DataFrame``, with a clear explanation of the difference based on the ``return_df`` flag.
+
+**KDE Distribution Plots:** Improved ``kde_distributions()`` with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
+**Scatter Plots:** Enhanced ``scatter_fit_plot()`` with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
+
+
+`Version 0.0.8`_
+----------------------
+
+.. _Version 0.0.8: https://lshpaner.github.io/eda_toolkit/v0.0.8/index.html
+
+
+:class:`stacked_crosstab_plot`
+
+- **Flexible `save_formats` Input**:
+ - `save_formats` now accepts a string, tuple, or list for specifying formats (e.g., `"png"`, `("png", "svg")`, or `["png", "svg"]`).
+ - Single strings or tuples are automatically converted to lists for consistent processing.
+
+- **Dynamic Error Handling**:
+ - Added checks to ensure a valid path is provided for each format in `save_formats`.
+ - Raises a `ValueError` if a format is specified without a corresponding path, with a clear, dynamic error message.
+
+- **Improved Plot Saving Logic**:
+ - Updated logic allows saving plots in one format (e.g., only `"png"` or `"svg"`) without requiring the other.
+ - Simplified and more intuitive path handling for saving plots.
+
+
+:class:`plot_3d_pdp`
+
+This update introduces several key changes to the `plot_3d_pdp` function, simplifying the function's interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
+**1. Parameter Changes**
+
+
+- **Removed Parameters:**
+
+ - The parameters ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general ``x_label``, ``y_label``, and ``z_label``. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+- **Default Values for Labels:**
+
+ - The parameters ``x_label``, ``y_label``, and ``z_label`` are now optional, with ``None`` as the default. If not provided, these labels will automatically default to the names of the features in the ``feature_names_list``. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+- **Changes in Default Values for View Angles:**
+
+ - The default values for camera positioning parameters have been updated: ``horizontal`` is now ``-1.25``, ``depth`` is now ``1.25``, and ``vertical`` is now ``1.25``. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+**2. Plot Generation Logic**
+
+- **Conditionally Checking Labels:**
+
+ - The function now checks whether ``x_label``, ``y_label``, and ``z_label`` are provided. If these are ``None``, the function will automatically assign default labels based on the ``feature_names_list``. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+- **Camera Position Adjustments:**
+
+ - The camera positions for the Plotly plot are now adjusted by multiplying ``horizontal``, ``depth``, and ``vertical`` by ``zoom_out_factor``. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+- **Surface Plot Coordinates Adjustments:**
+
+ - The order of the coordinates for the Plotly plot’s surface has been changed from ``ZZ, XX, YY[::-1]`` to ``ZZ, XX, YY``. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+**3. Code Simplifications**
+
+- **Removed Complexity:**
+
+ - By removing the ``x_label_plotly``, ``y_label_plotly``, and ``z_label_plotly`` parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+- **Fallback Mechanism for Grid Values:**
+
+ - The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+**4. Style Adjustments**
+
+- **Label Formatting:**
+
+ - The new version consistently uses ``y_label``, ``x_label``, and ``z_label`` for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+- **Color Bar Adjustments:**
+
+ - The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of ``0.6`` and a pad value of ``0.02``. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+**5. Potential Use Case Differences**
+
+- **Simplified Interface:**
+
+ - The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+- **Less Granular Control:**
+
+ - Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the ``*_plotly`` label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+**6. Matplotlib Plot Adjustments**
+
+- **Wireframe and Surface Plot Enhancements:**
+
+ - The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+**Summary**
+
+- Version ``0.0.8d`` of the `plot_3d_pdp` function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+- Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+**Decision Point**
+
+- This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
+
+
+Version 0.0.8c
+------------------------
+
+Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
+**Key Updates in 0.0.8c:**
+
+- **Bug Fixes:** Addressed minor issues identified in version ``0.0.8b`` to ensure smoother performance and better user experience.
+- **Additional Testing:** Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+- **Refinements:** Made small enhancements to existing features based on user feedback and internal testing results.
+
+**Summary of Changes**
+
+1. New Features & Enhancements
+
+- ``plot_3d_pdp`` Function:
+
+ - Added ``show_modebar`` Parameter: Introduced a new boolean parameter, ``show_modebar``, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Added parameters for ``left_margin``, ``right_margin``, and ``top_margin`` to provide users with more control over the plot layout in Plotly.
+
+ - Adjusted default values and added options for better customization of the Plotly color bar (``cbar_x``, ``cbar_thickness``) and title positioning (``title_x``, ``title_y``).
+
+ - Plotly Configuration:
+
+ - Enhanced the configuration options to allow users to enable or disable zoom functionality (``enable_zoom``) in the interactive Plotly plots.
+
+ - Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+ - Error Handling:
+
+ - Added input validation for ``html_file_path`` and ``html_file_name`` to ensure these are provided when necessary based on the selected ``plot_type``.
+
+- ``plot_2d_pdp`` Function:
+
+ - Introduced ``file_prefix`` Parameter:
+
+ - Added a new ``file_prefix`` parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+ - Enhanced Plot Type Flexibility:
+
+ - The ``plot_type`` parameter now includes an option to generate both grid and individual plots (``both``). This feature allows users to create a combination of both layout styles in one function call.
+
+ - Updated input validation and logic to handle this new option effectively.
+
+ - Added ``save_plots`` Parameter:
+
+ - Introduced a new parameter, ``save_plots``, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+ - Custom Margins and Layout Adjustments:
+
+ - Included the ``save_plots`` parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+2. Documentation Updates
+
+- Docstrings:
+
+ - Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
+ - Detailed the use of new parameters such as ``show_modebar``, ``file_prefix``, ``save_plots``, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+3. Refactoring & Code Cleanup
+
+- Code Structure:
+
+ - Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
+ - Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
+
+
+Version 0.0.8b
+--------------------------------
+
+Version 0.0.8b is an exact replica of version ``0.0.8a``. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
+
+
+Version 0.0.8a
+--------------------------------
+
+Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
+**New Features:**
+
+1. Optional ``file_prefix`` in ``stacked_crosstab_plot`` Function
+
+ - The ``stacked_crosstab_plot`` function has been updated to make the ``file_prefix`` argument optional. If the user does not provide a ``file_prefix``, the function will now automatically generate a default prefix based on the ``col`` and ``func_col`` parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
+ - **Key Improvement:**
+
+ - Users can now omit the ``file_prefix`` argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
+ - Backward compatibility is maintained, allowing users who prefer to specify a custom ``file_prefix`` to continue doing so without any issues.
+
+2. **Introduction of 3D and 2D Partial Dependence Plot Functions**
+
+ - Two new functions, ``plot_3d_pdp`` and ``plot_2d_pdp``, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+ - ``plot_3d_pdp``: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
+ - ``plot_2d_pdp``: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+ - **Key Features:**
+
+ - **Compatibility:** Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
+ - **Customization:** Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
+ - **Interactive 3D Plots:** The ``plot_3d_pdp`` function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+**Impact:**
+
+- These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+- The optional ``file_prefix`` enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+- The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
+
+
+
+`Version 0.0.7`_
+----------------------
+
+.. _Version 0.0.7: https://lshpaner.github.io/eda_toolkit/v0.0.7/index.html
+
+**Added Function for Customizable Correlation Matrix Visualization**
+
+This release introduces a new function, ``flex_corr_matrix``, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to ``seaborn.heatmap()``.
+
+**Summary of Changes**
+
+- **New Function**: ``flex_corr_matrix``.
+
+ - **Functionality**:
+ - Generates a correlation heatmap for a given DataFrame.
+ - Supports both full and upper triangular correlation matrices based on the ``triangular`` parameter.
+ - Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+ - Accepts additional keyword arguments via ``**kwargs`` to pass directly to ``seaborn.heatmap()``.
+ - Includes validation to ensure the ``triangular``, ``annot``, and ``save_plots`` parameters are boolean values.
+ - Raises an exception if ``save_plots=True`` but neither ``image_path_png`` nor ``image_path_svg`` is specified.
+
+**Usage**
+
+.. code-block:: python
+
+ # Full correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+ # Upper triangular correlation matrix example
+ flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+**Contingency table df to object type**
+
+Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
+
+.. code-block:: python
+
+ df = df.astype(str).fillna("")
+
+
+`Version 0.0.6`_
+----------------------
+
+.. _Version 0.0.6: https://lshpaner.github.io/eda_toolkit/v0.0.6/index.html
+
+**Added validation for Plot Type Parameter in KDE Distributions Function**
+
+This release adds a validation step for the ``plot_type`` parameter in the ``kde_distributions`` function. The allowed values for ``plot_type`` are ``"hist"``, ``"kde"``, and ``"both"``. If an invalid value is provided, the function will now raise a ``ValueError`` with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
+
+.. code-block:: python
+
+ # Validate plot_type parameter
+ valid_plot_types = ["hist", "kde", "both"]
+ if plot_type.lower() not in valid_plot_types:
+ raise ValueError(
+ f"Invalid plot_type value. Expected one of {valid_plot_types}, "
+ f"got '{plot_type}' instead."
+ )
+
+`Version 0.0.5`_
+----------------------
+
+.. _Version 0.0.5: https://lshpaner.github.io/eda_toolkit/v0.0.5/index.html
+
+
+**Ensure Consistent Font Size and Text Wrapping Across Plot Elements**
+
+This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the ``stacked_crosstab_plot`` function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+1. **Title Font Size and Text Wrapping:**
+ - Added a ``text_wrap`` parameter to control the wrapping of plot titles.
+ - Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ``ax.set_title()`` after plot generation.
+
+2. **Legend Font Size Consistency:**
+ - Incorporated ``label_fontsize`` into the legend font size by directly setting the font size of the legend text using ``plt.setp(legend.get_texts(), fontsize=label_fontsize)``.
+ - This ensures that the legend labels are consistent with the title and axis labels.
+
+**Testing**
+
+- Verified that titles now wrap correctly and match the specified ``label_fontsize``.
+- Confirmed that legend text scales according to ``label_fontsize``, ensuring consistent font sizes across all plot elements.
+
+
+Version 0.0.4
+---------------------------
+
+- **Stable release**
+
+ - No new updates to the codebase.
+
+ - Updated the project ``description`` variable in ``setup.py`` to re-emphasize key elements of the library.
+
+ - Minor README cleanup:
+
+ - Added icons for sections that did not have them.
+
+
+Version 0.0.3
+---------------------------
+
+- **Stable release**
+
+ - Updated logo size, fixed citation title, and made minor README cleanup:
+
+ - Added an additional section for documentation, cleaned up verbiage, moved acknowledgments section before licensing and support.
+
+Version 0.0.2
+---------------------------
+
+- **First stable release**
+ - No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+ - Added logo, badges, and Zenodo-certified citation to README.
+
+Version 0.0.1rc0
+-------------------------------
+
+- No new updates to the codebase; minimal documentation updates to README and ``setup.py`` files.
+
+Version 0.0.1b0
+-----------------------------
+
+**New Scatter Fit Plot and Additional Updates**
+
+- Added new ``scatter_fit_plot()``, removed unused ``data_types()``, and added comment section headers.
+
+**Added xlim and ylim Inputs to KDE Distribution**
+
+- ``kde_distribution()``:
+
+ - Added ``xlim`` and ``ylim`` inputs to allow users to customize axes limits in ``kde_distribution()``.
+
+**Added xlim and ylim Params to Stacked Crosstab Plot**
+
+- ``stacked_crosstab_plot()``:
+
+ - Added ``xlim`` and ``ylim`` input parameters to ``stacked_crosstab_plot()`` to give users more flexibility in controlling axes limits.
+
+**Added x and y Limits to Box and Violin Plots**
+
+- ``box_violin_plot()``:
+
+ - Changed function name from ``metrics_box_violin()`` to ``box_violin_plot()``.
+ - Added ``xlim`` and ``ylim`` inputs to control x and y-axis limits of ``box_violin_plot()`` (formerly ``metrics_box_violin``).
+
+**Added Ability to Remove Stacks from Plots, Plot All or One at a Time**
+
+**Key Changes**
+
+1. **Plot Type Parameter**
+ - ``plot_type``: This parameter allows the user to choose between ``"regular"``, ``"normalized"``, or ``"both"`` plot types.
+
+2. **Remove Stacks Parameter**
+ - ``remove_stacks``: This parameter, when set to ``True``, generates a regular bar plot using only the ``col`` parameter instead of a stacked bar plot. It only works when ``plot_type`` is set to "regular". If ``remove_stacks`` is set to ``True`` while ``plot_type`` is anything other than "regular", the function will raise an exception.
+
+**Explanation of Changes**
+
+- **Plot Type Parameter**
+
+ - Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+ - ``"regular"``: Standard bar plot.
+
+ - ``"normalized"``: Normalized bar plot.
+
+ - ``"both"``: Both regular and normalized bar plots.
+
+- **Remove Stacks Parameter**
+ - ``remove_stacks``: Generates a regular bar plot using only the ``col`` parameter, removing the stacking of the bars. Applicable only when ``plot_type`` is set to "regular". An exception is raised if used with any other ``plot_type``.
+
+These changes enhance the flexibility and functionality of the ``stacked_crosstab_plot`` function, allowing for more customizable and specific plot generation based on user requirements.
+
+Version 0.0.1b0
+-----------------------------
+
+**Refined KDE Distributions**
+
+**Key Changes**
+
+1. **Alpha Transparency for Histogram Fill**
+ - Added a ``fill_alpha`` parameter to control the transparency of the histogram bars' fill color.
+ - Default value is ``0.6``. An exception is raised if ``fill=False`` and ``fill_alpha`` is specified.
+
+2. **Custom Font Sizes**
+ - Introduced ``label_fontsize`` and ``tick_fontsize`` parameters to control font size of axis labels and tick marks independently.
+
+3. **Scientific Notation Toggle**
+ - Added a ``disable_sci_notation`` parameter to enable or disable scientific notation on axes.
+
+4. **Improved Error Handling**
+ - Added validation for the ``stat`` parameter to ensure valid options are accepted.
+ - Added checks for proper usage of ``fill_alpha`` and ``hist_edgecolor`` when ``fill`` is set to ``False``.
+
+5. **General Enhancements**
+ - Updated the function's docstring to reflect new parameters and provide comprehensive guidance on usage.
+
+Version 0.0.1b0
+-----------------------------
+
+**Enhanced KDE Distributions Function**
+
+**Added Parameters**
+
+1. **Grid Figsize and Single Figsize**
+ - Control the size of the overall grid figure and individual figures separately.
+
+2. **Hist Color and KDE Color**
+ - Allow customization of histogram and KDE plot colors.
+
+3. **Edge Color**
+ - Allows customization of histogram bar edges.
+
+4. **Hue**
+ - Allows grouping data by a column.
+
+5. **Fill**
+ - Controls whether to fill histogram bars with color.
+
+6. **Y-axis Label**
+ - Customizable y-axis label.
+
+7. **Log-Scaling**
+ - Specifies which variables to apply log scale.
+
+8. **Bins and Bin Width**
+ - Control the number and width of bins.
+
+9. **``stat``:**
+ - Allows different statistics for the histogram (``count``, ``density``, ``frequency``, ``probability``, ``proportion``, ``percent``).
+
+**Improvements**
+
+1. **Validation and Error Handling**
+ - Checks for invalid ``log_scale_vars`` and throws a ``ValueError`` if any are found.
+ - Throws a ``ValueError`` if ``edgecolor`` is changed while ``fill`` is set to ``False``.
+ - Issues a ``PerformanceWarning`` if both ``bins`` and ``binwidth`` are specified, warning of potential performance impacts.
+
+2. **Customizable Y-Axis Label**
+ - Allows users to specify custom y-axis labels.
+
+3. **Warning for KDE with Count**
+ - Issues a warning if KDE is used with ``stat='count'``, as it may produce misleading plots.
+
+**Updated Function to Ensure Unique IDs and Index Check**
+
+- Ensured that each generated ID in ``add_ids`` starts with a non-zero digit.
+- Added a check to verify that the DataFrame index is unique.
+- Printed a warning message if duplicate index entries are found.
+
+These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
+**Check for Unique Indices**
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
+**Generate Non-Zero Starting IDs**
+
+- The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+**Ensure Unique IDs**
+
+- A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+**Fix Int Conversion for Numeric Columns, Reset Decimal Places**
+
+- Fixed integer conversion issue for numeric columns when ``decimal_places=0`` in the ``save_dataframes_to_excel`` function.
+- Reset ``decimal_places`` default value to ``0``.
+
+These changes ensure correct formatting and avoid errors during conversion.
+
+**Contingency Table Updates**
+
+1. **Error Handling for Columns**
+ - Added a check to ensure at least one column is specified.
+ - Updated the function to accept a single column as a string or multiple columns as a list.
+ - Raised a ``ValueError`` if no columns are provided or if ``cols`` is not correctly specified.
+
+2. **Function Parameters**
+ - Changed parameters from ``col1`` and ``col2`` to a single parameter ``cols`` which can be either a string or a list.
+
+3. **Error Handling**
+ - Renamed ``SortBy`` to ``sort_by`` to standardize nomenclature.
+ - Added a check to ensure ``sort_by`` is either 0 or 1.
+ - Raised a ``ValueError`` if ``sort_by`` is not 0 or 1.
+
+4. **Sorting Logic**
+   - Updated the sorting logic to handle the new ``cols`` parameter structure.
+
+5. **Handling Categorical Data**
+   - Modified code to convert categorical columns to strings to avoid issues with ``fillna("")``.
+
+6. **Handling Missing Values**
+   - Added ``df = df.fillna('')`` to fill NA values within the function to account for missing data.
+
+7. **Improved Function Documentation**
+   - Updated function documentation to reflect new parameters and error handling.
+
+Version 0.0.1b0
+-----------------------------
+
+**Contingency Table Updates**
+
+- ``fillna('')`` added to output so that null values come through, removed ``'All'`` column name from output, sort options ``0`` and ``1``, updated docstring documentation. Tested successfully on ``Python 3.7.3``.
+
+**Compatibility Enhancement**
+
+1. Added a version check for ``Python 3.7`` and above.
+
+ - Conditional import of ``datetime`` to handle different Python versions.
+
+.. code-block:: python
+
+ if sys.version_info >= (3, 7):
+ from datetime import datetime
+ else:
+ import datetime
diff --git a/_build/html/v0.0.9/_sources/citations.rst.txt b/_build/html/v0.0.9/_sources/citations.rst.txt
new file mode 100644
index 000000000..33d5aa916
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/citations.rst.txt
@@ -0,0 +1,42 @@
+.. _citations:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Citing EDA Toolkit
+===================
+
+Shpaner, L., & Gil, O. (2024). EDA Toolkit (0.0.9). Zenodo. https://doi.org/10.5281/zenodo.13163208
+
+.. code:: bash
+
+ @software{shpaner_2024_13162633,
+ author = {Shpaner, Leonid and
+ Gil, Oscar},
+ title = {EDA Toolkit},
+ month = aug,
+ year = 2024,
+ publisher = {Zenodo},
+ version = {0.0.9},
+ doi = {10.5281/zenodo.13162633},
+ url = {https://doi.org/10.5281/zenodo.13162633}
+ }
+
diff --git a/_build/html/v0.0.9/_sources/contributors.rst.txt b/_build/html/v0.0.9/_sources/contributors.rst.txt
new file mode 100644
index 000000000..4da2fa18b
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/contributors.rst.txt
@@ -0,0 +1,59 @@
+.. _contributors:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+Contributors/Maintainers
+=========================
+
+.. raw:: html
+
+
+
+.. image:: https://www.leonshpaner.com/author/leon-shpaner/avatar_hu48de79c369d5f7d4ff8056a297b2c4c5_1681850_270x270_fill_q90_lanczos_center.jpg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Leonid Shpaner `_ is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego's M.S. Applied Artificial Intelligence program.
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: https://oscargildata.com/portfolio_content/images/Oscar_LinkedIn_Pic.jpeg
+ :align: left
+ :width: 150
+ :height: 150
+
+.. raw:: html
+
+
+
+`Oscar Gil `_ is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
diff --git a/_build/html/v0.0.9/_sources/data_management.rst.txt b/_build/html/v0.0.9/_sources/data_management.rst.txt
new file mode 100644
index 000000000..3ee514b66
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/data_management.rst.txt
@@ -0,0 +1,1384 @@
+.. _data_management:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Data Management Overview
+===========================
+
+In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
+
+Data Management Techniques
+===============================
+
+Path directories
+----------------
+
+**Ensure that the directory exists. If not, create it.**
+
+.. function:: ensure_directory(path)
+
+ :param path: The path to the directory that needs to be ensured.
+ :type path: str
+
+ :returns: None
+
+
+The ``ensure_directory`` function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``ensure_directory`` function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
+First, we define the base path as the parent directory of the current directory.
+The ``os.pardir`` constant, equivalent to ``".."``, is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
+
+Next, we set paths for the PNG and SVG image directories, located within an
+``images`` folder in the parent directory. Using the ``ensure_directory``
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
+.. code-block:: python
+
+ from eda_toolkit import ensure_directory
+
+ import os # import operating system for dir
+
+
+ base_path = os.path.join(os.pardir)
+
+ # Go up one level from 'notebooks' to parent directory,
+ # then into the 'data' folder
+ data_path = os.path.join(os.pardir, "data")
+ data_output = os.path.join(os.pardir, "data_output")
+
+ # create image paths
+ image_path_png = os.path.join(base_path, "images", "png_images")
+ image_path_svg = os.path.join(base_path, "images", "svg_images")
+
+    # Use the function to ensure 'data' directory exists
+ ensure_directory(data_path)
+ ensure_directory(data_output)
+ ensure_directory(image_path_png)
+ ensure_directory(image_path_svg)
+
+**Output**
+
+.. code-block:: python
+
+ Created directory: ../data
+ Created directory: ../data_output
+ Created directory: ../images/png_images
+ Created directory: ../images/svg_images
+
+
+Adding Unique Identifiers
+--------------------------
+
+**Add a column of unique IDs with a specified number of digits to the dataframe.**
+
+.. function:: add_ids(df, id_colname="ID", num_digits=9, seed=None, set_as_index=True)
+
+ :param df: The dataframe to add IDs to.
+ :type df: pd.DataFrame
+ :param id_colname: The name of the new column for the IDs. Defaults to ``"ID"``.
+ :type id_colname: str, optional
+ :param num_digits: The number of digits for the unique IDs. Defaults to ``9``.
+ :type num_digits: int, optional
+ :param seed: The seed for the random number generator. Defaults to ``None``.
+ :type seed: int, optional
+    :param set_as_index: Whether to set the new ID column as the index. Defaults to ``True``.
+ :type set_as_index: bool, optional
+
+ :returns: The updated dataframe with the new ID column.
+ :rtype: pd.DataFrame
+
+.. note::
+ - If the dataframe index is not unique, a warning is printed.
+ - The function does not check if the number of rows exceeds the number of
+ unique IDs that can be generated with the specified number of digits.
+ - The first digit of the generated IDs is ensured to be non-zero.
+
+The ``add_ids`` function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``add_ids`` function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the ``add_ids`` function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
+First, we import the pandas library and the ``add_ids`` function from the ``eda_toolkit``.
+Then, we create a sample dataframe with some data. We call the ``add_ids`` function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
+.. code-block:: python
+
+ from eda_toolkit import add_ids
+
+ # Add a column of unique IDs with 9 digits and call it "census_id"
+ df = add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+ )
+
+**Output**
+
+`First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: bash
+
+ DataFrame index is unique.
+
+.. raw:: html
+
+
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
74130842
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
97751875
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
12202842
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
96078789
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
35130194
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+
+
+
+\
+
+
+Trailing Period Removal
+-----------------------
+
+**Strip the trailing period from floats in a specified column of a DataFrame, if present.**
+
+.. function:: strip_trailing_period(df, column_name)
+
+ :param df: The DataFrame containing the column to be processed.
+ :type df: pd.DataFrame
+ :param column_name: The name of the column containing floats with potential trailing periods.
+ :type column_name: str
+
+ :returns: The updated DataFrame with the trailing periods removed from the specified column.
+ :rtype: pd.DataFrame
+
+
+ The ``strip_trailing_period`` function is designed to remove trailing periods
+ from float values in a specified column of a DataFrame. This can be particularly
+ useful when dealing with data that has been inconsistently formatted, ensuring
+ that all float values are correctly represented.
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``strip_trailing_period`` function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the ``strip_trailing_period`` function to remove any trailing periods from the specified column.
+
+.. code-block:: python
+
+ from eda_toolkit import strip_trailing_period
+
+ # Create a sample dataframe with trailing periods in some values
+ data = {
+ "values": [1.0, 2.0, 3.0, 4.0, 5.0, 6.],
+ }
+ df = pd.DataFrame(data)
+
+ # Remove trailing periods from the 'values' column
+ df = strip_trailing_period(df=df, column_name="values")
+
+
+**Output**
+
+`First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)`
+
+.. raw:: html
+
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
+
+\
+
+`Note:` The last row shows ``6.`` (an integer followed by a trailing period) before cleaning, and its conversion to the float ``6.0`` after.
+
+
+\
+
+Standardized Dates
+-------------------
+
+**Parse and standardize date strings based on the provided rule.**
+
+.. function:: parse_date_with_rule(date_str)
+
+ This function takes a date string and standardizes it to the ``ISO 8601`` format
+ (``YYYY-MM-DD``). It assumes dates are provided in either `day/month/year` or
+ `month/day/year` format. The function first checks if the first part of the
+ date string (day or month) is greater than 12, which unambiguously indicates
+ a `day/month/year` format. If the first part is 12 or less, the function
+   attempts to parse the date as `day/month/year`, falling back to `month/day/year`
+   if the former raises a ``ValueError`` due to an impossible date (e.g., the
+   month part being greater than 12).
+
+ :param date_str: A date string to be standardized.
+ :type date_str: str
+
+ :returns: A standardized date string in the format ``YYYY-MM-DD``.
+ :rtype: str
+
+ :raises ValueError: If ``date_str`` is in an unrecognized format or if the function
+ cannot parse the date.
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``parse_date_with_rule``
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the ``parse_date_with_rule``
+function to parse and standardize each date string to the ``ISO 8601`` format.
+
+.. code-block:: python
+
+ from eda_toolkit import parse_date_with_rule
+
+ # Sample date strings
+ date_strings = ["15/04/2021", "04/15/2021", "01/12/2020", "12/01/2020"]
+
+ # Standardize the date strings
+ standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+ print(standardized_dates)
+
+**Output**
+
+.. code-block:: python
+
+ ['2021-04-15', '2021-04-15', '2020-12-01', '2020-01-12']
+
+
+
+.. important::
+
+ In the next example, we demonstrate how to apply the ``parse_date_with_rule``
+ function to a DataFrame column containing date strings using the ``.apply()`` method.
+ This is particularly useful when you need to standardize date formats across an
+ entire column in a DataFrame.
+
+.. code-block:: python
+
+ # Creating the DataFrame
+ data = {
+ "date_column": [
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name": ["Alice", "Bob", "Charlie", "David", "Eve"],
+ "amount": [100.0, 150.5, 200.75, 250.25, 300.0],
+ }
+
+ df = pd.DataFrame(data)
+
+ # Apply the function to the DataFrame column
+ df["standardized_date"] = df["date_column"].apply(parse_date_with_rule)
+
+ print(df)
+
+**Output**
+
+.. code-block:: python
+
+ date_column name amount standardized_date
+ 0 31/12/2021 Alice 100.00 2021-12-31
+ 1 01/01/2022 Bob 150.50 2022-01-01
+ 2 12/31/2021 Charlie 200.75 2021-12-31
+ 3 13/02/2022 David 250.25 2022-02-13
+ 4 07/04/2022 Eve 300.00 2022-04-07
+
+
+DataFrame Analysis
+-------------------
+
+**Analyze DataFrame columns, including dtype, null values, and unique value counts.**
+
+.. function:: dataframe_columns(df, background_color=None, return_df=False)
+
+ Analyze DataFrame columns to provide summary statistics such as data type,
+ null counts, unique values, and most frequent values.
+
+ This function analyzes the columns of a DataFrame, providing details about the data type,
+ the number and percentage of ``null`` values, the total number of unique values, and the most
+ frequent unique value along with its count and percentage. It handles special cases such as
+ converting date columns and replacing empty strings with Pandas ``NA`` values.
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param background_color: Hex color code or color name for background styling in the output
+ DataFrame. Defaults to ``None``.
+ :type background_color: str, optional
+ :param return_df: If ``True``, returns the plain DataFrame with the summary statistics. If
+ ``False``, returns a styled DataFrame for visual presentation. Defaults to ``False``.
+ :type return_df: bool, optional
+
+ :returns: If `return_df` is ``True``, returns the plain DataFrame containing column summary
+ statistics. If `return_df` is ``False``, returns a styled DataFrame with optional
+ background color for specific columns.
+ :rtype: pandas.DataFrame
+
+
+**Example Usage**
+
+In the example below, we demonstrate how to use the ``dataframe_columns``
+function to analyze a DataFrame's columns.
+
+.. code-block:: python
+
+ from eda_toolkit import dataframe_columns
+
+ dataframe_columns(df=df)
+
+
+**Output**
+
+`Result on Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository)` [1]_
+
+.. code-block:: python
+
+ Shape: (48842, 16)
+
+ Total seconds of processing time: 0.861555
+
+.. raw:: html
+
+
+
+
+
+
+
+
column
+
dtype
+
null_total
+
null_pct
+
unique_values_total
+
max_unique_value
+
max_unique_value_total
+
max_unique_value_pct
+
+
+
+
+
0
+
age
+
int64
+
0
+
0
+
74
+
36
+
1348
+
2.76
+
+
+
1
+
workclass
+
object
+
963
+
1.97
+
9
+
Private
+
33906
+
69.42
+
+
+
2
+
fnlwgt
+
int64
+
0
+
0
+
28523
+
203488
+
21
+
0.04
+
+
+
3
+
education
+
object
+
0
+
0
+
16
+
HS-grad
+
15784
+
32.32
+
+
+
4
+
education-num
+
int64
+
0
+
0
+
16
+
9
+
15784
+
32.32
+
+
+
5
+
marital-status
+
object
+
0
+
0
+
7
+
Married-civ-spouse
+
22379
+
45.82
+
+
+
6
+
occupation
+
object
+
966
+
1.98
+
15
+
Prof-specialty
+
6172
+
12.64
+
+
+
7
+
relationship
+
object
+
0
+
0
+
6
+
Husband
+
19716
+
40.37
+
+
+
8
+
race
+
object
+
0
+
0
+
5
+
White
+
41762
+
85.5
+
+
+
9
+
sex
+
object
+
0
+
0
+
2
+
Male
+
32650
+
66.85
+
+
+
10
+
capital-gain
+
int64
+
0
+
0
+
123
+
0
+
44807
+
91.74
+
+
+
11
+
capital-loss
+
int64
+
0
+
0
+
99
+
0
+
46560
+
95.33
+
+
+
12
+
hours-per-week
+
int64
+
0
+
0
+
96
+
40
+
22803
+
46.69
+
+
+
13
+
native-country
+
object
+
274
+
0.56
+
42
+
United-States
+
43832
+
89.74
+
+
+
14
+
income
+
object
+
0
+
0
+
4
+
<=50K
+
24720
+
50.61
+
+
+
15
+
age_group
+
category
+
0
+
0
+
9
+
18-29
+
13920
+
28.5
+
+
+
+
+
+
+
+\
+
+Generating Summary Tables for Variable Combinations
+-----------------------------------------------------
+
+**This function generates summary tables for all possible combinations of specified variables
+in a DataFrame and saves them to an Excel file.**
+
+
+.. function:: summarize_all_combinations(df, variables, data_path, data_name, min_length=2)
+
+ :param df: The pandas DataFrame containing the data.
+ :type df: pandas.DataFrame
+ :param variables: List of column names from the DataFrame to generate combinations.
+ :type variables: list of str
+ :param data_path: Path where the output Excel file will be saved.
+ :type data_path: str
+ :param data_name: Name of the output Excel file.
+ :type data_name: str
+ :param min_length: Minimum size of the combinations to generate. Defaults to ``2``.
+ :type min_length: int, optional
+
+ :returns: A tuple containing a dictionary of summary tables and a list of all generated combinations.
+ :rtype: tuple(dict, list)
+
+.. note::
+ - The function will create an Excel file with a sheet for each combination
+ of the specified variables, as well as a "Table of Contents" sheet with
+ hyperlinks to each summary table.
+ - The sheet names are limited to 31 characters due to Excel's constraints.
+
+The function returns two outputs:
+
+1. ``summary_tables``: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
+2. ``all_combinations``: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
+**Example Usage**
+
+Below, we use the ``summarize_all_combinations`` function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1]_.
+
+.. code-block:: python
+
+ from eda_toolkit import summarize_all_combinations
+
+ # Define unique variables for the analysis
+ unique_vars = [
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+ ]
+
+ # Generate summary tables for all combinations of the specified variables
+ summary_tables, all_combinations = summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+ )
+
+ # Print all combinations of variables
+ print(all_combinations)
+
+**Output**
+
+.. code-block:: python
+
+ [('age_group', 'workclass'),
+ ('age_group', 'education'),
+ ('age_group', 'occupation'),
+ ('age_group', 'race'),
+ ('age_group', 'sex'),
+ ('age_group', 'income'),
+ ('workclass', 'education'),
+ ('workclass', 'occupation'),
+ ('workclass', 'race'),
+ ('workclass', 'sex'),
+ ('workclass', 'income'),
+ ('education', 'occupation'),
+ ('education', 'race'),
+ ('education', 'sex'),
+ ('education', 'income'),
+ ('occupation', 'race'),
+ ('occupation', 'sex'),
+ ('occupation', 'income'),
+ ('race', 'sex'),
+ ('race', 'income'),
+ ('sex', 'income'),
+ ('age_group', 'workclass', 'education'),
+ ('age_group', 'workclass', 'occupation'),
+ ('age_group', 'workclass', 'race'),
+ ('age_group', 'workclass', 'sex'),
+ ('age_group', 'workclass', 'income'),
+ ('age_group', 'education', 'occupation'),
+ ('age_group', 'education', 'race'),
+ ('age_group', 'education', 'sex'),
+ ('age_group', 'education', 'income'),
+ ('age_group', 'occupation', 'race'),
+ ('age_group', 'occupation', 'sex'),
+ ('age_group', 'occupation', 'income'),
+ ('age_group', 'race', 'sex'),
+ ('age_group', 'race', 'income'),
+ ('age_group', 'sex', 'income'),
+ ('workclass', 'education', 'occupation'),
+ ('workclass', 'education', 'race'),
+ ('workclass', 'education', 'sex'),
+ ('workclass', 'education', 'income'),
+ ('workclass', 'occupation', 'race'),
+ ('workclass', 'occupation', 'sex'),
+ ('workclass', 'occupation', 'income'),
+ ('workclass', 'race', 'sex'),
+ ('workclass', 'race', 'income'),
+ ('workclass', 'sex', 'income'),
+ ('education', 'occupation', 'race'),
+ ('education', 'occupation', 'sex'),
+ ('education', 'occupation', 'income'),
+ ('education', 'race', 'sex'),
+ ('education', 'race', 'income'),
+ ('education', 'sex', 'income'),
+ ('occupation', 'race', 'sex'),
+ ('occupation', 'race', 'income'),
+ ('occupation', 'sex', 'income'),
+ ('race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation'),
+ ('age_group', 'workclass', 'education', 'race'),
+ ('age_group', 'workclass', 'education', 'sex'),
+ ('age_group', 'workclass', 'education', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race'),
+ ('age_group', 'workclass', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'income'),
+ ('age_group', 'workclass', 'race', 'sex'),
+ ('age_group', 'workclass', 'race', 'income'),
+ ('age_group', 'workclass', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race'),
+ ('age_group', 'education', 'occupation', 'sex'),
+ ('age_group', 'education', 'occupation', 'income'),
+ ('age_group', 'education', 'race', 'sex'),
+ ('age_group', 'education', 'race', 'income'),
+ ('age_group', 'education', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex'),
+ ('age_group', 'occupation', 'race', 'income'),
+ ('age_group', 'occupation', 'sex', 'income'),
+ ('age_group', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race'),
+ ('workclass', 'education', 'occupation', 'sex'),
+ ('workclass', 'education', 'occupation', 'income'),
+ ('workclass', 'education', 'race', 'sex'),
+ ('workclass', 'education', 'race', 'income'),
+ ('workclass', 'education', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex'),
+ ('workclass', 'occupation', 'race', 'income'),
+ ('workclass', 'occupation', 'sex', 'income'),
+ ('workclass', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex'),
+ ('education', 'occupation', 'race', 'income'),
+ ('education', 'occupation', 'sex', 'income'),
+ ('education', 'race', 'sex', 'income'),
+ ('occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex'),
+ ('workclass', 'education', 'occupation', 'race', 'income'),
+ ('workclass', 'education', 'occupation', 'sex', 'income'),
+ ('workclass', 'education', 'race', 'sex', 'income'),
+ ('workclass', 'occupation', 'race', 'sex', 'income'),
+ ('education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'sex'),
+ ('age_group', 'workclass', 'education', 'occupation', 'race', 'income'),
+ ('age_group', 'workclass', 'education', 'occupation', 'sex', 'income'),
+ ('age_group', 'workclass', 'education', 'race', 'sex', 'income'),
+ ('age_group', 'workclass', 'occupation', 'race', 'sex', 'income'),
+ ('age_group', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('workclass', 'education', 'occupation', 'race', 'sex', 'income'),
+ ('age_group',
+ 'workclass',
+ 'education',
+ 'occupation',
+ 'race',
+ 'sex',
+ 'income')]
+
+
+When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/summarize_combos.gif
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Saving DataFrames to Excel with Customized Formatting
+-------------------------------------------------------
+**Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.**
+
+
+This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the ``save_dataframes_to_excel`` function.
+
+
+.. function:: save_dataframes_to_excel(file_path, df_dict, decimal_places=0)
+
+ :param file_path: Full path to the output Excel file.
+ :type file_path: str
+ :param df_dict: Dictionary where keys are sheet names and values are DataFrames to save.
+ :type df_dict: dict
+ :param decimal_places: Number of decimal places to round numeric columns. Default is 0.
+ :type decimal_places: int
+
+.. note::
+ - The function will autofit columns and left-align text.
+ - Numeric columns will be formatted with the specified number of decimal places.
+ - Headers will be bold and left-aligned without borders.
+
+The function performs the following tasks:
+
+- Writes each DataFrame to its respective sheet in the Excel file.
+- Rounds numeric columns to the specified number of decimal places.
+- Applies customized formatting to headers and cells.
+- Autofits columns based on the content length.
+
+**Example Usage**
+
+Below, we use the ``save_dataframes_to_excel`` function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between `18` and `40`.
+
+.. code-block:: python
+
+ from eda_toolkit import save_dataframes_to_excel
+
+ # Example usage
+ file_name = "df_census.xlsx" # Name of the output Excel file
+ file_path = os.path.join(data_path, file_name)
+
+ # filter DataFrame to Ages 18-40
+ filtered_df = df[(df["age"] > 18) & (df["age"] < 40)]
+
+ df_dict = {
+ "original_df": df,
+ "ages_18_to_40": filtered_df,
+ }
+
+ save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+ )
+
+
+**Output**
+
+The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between `18` and `40`, each on separate sheets with customized formatting.
+
+
+Creating Contingency Tables
+----------------------------
+
+**Create a contingency table from one or more columns in a DataFrame, with sorting options.**
+
+This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the ``contingency_table`` function.
+
+.. function:: contingency_table(df, cols=None, sort_by=0)
+
+ :param df: The DataFrame to analyze.
+ :type df: pandas.DataFrame
+ :param cols: Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+ :type cols: str or list of str, optional
+ :param sort_by: Enter ``0`` to sort results by column groups; enter ``1`` to sort results by totals in descending order. Defaults to ``0``.
+ :type sort_by: int, optional
+ :raises ValueError: If no columns are specified or if ``sort_by`` is not ``0`` or ``1``.
+ :returns: A DataFrame containing the contingency table with the specified columns, a ``'Total'`` column representing the count of occurrences, and a ``'Percentage'`` column representing the percentage of the total count.
+ :rtype: pandas.DataFrame
+
+**Example Usage**
+
+Below, we use the ``contingency_table`` function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]_
+
+.. code-block:: python
+
+ from eda_toolkit import contingency_table
+
+ # Example usage
+ contingency_table(
+ df=df,
+ cols=[
+ "age_group",
+ "workclass",
+ "race",
+ "sex",
+ ],
+ sort_by=1,
+ )
+
+**Output**
+
+The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the ``'Total'`` column in descending order because ``sort_by``
+is set to ``1``.
+
+
+.. code-block:: python
+
+
+ age_group workclass race sex Total Percentage
+ 0 30-39 Private White Male 5856 11.99
+ 1 18-29 Private White Male 5623 11.51
+ 2 40-49 Private White Male 4267 8.74
+ 3 18-29 Private White Female 3680 7.53
+ 4 50-59 Private White Male 2565 5.25
+ .. ... ... ... ... ... ...
+ 467 50-59 Federal-gov Other Male 1 0.00
+ 468 50-59 Local-gov Asian-Pac-Islander Female 1 0.00
+ 469 70-79 Self-emp-inc Black Male 1 0.00
+ 470 80-89 Local-gov Asian-Pac-Islander Male 1 0.00
+ 471 48842 100.00
+
+ [472 rows x 6 columns]
+
+
+\
+
+Highlighting Specific Columns in a DataFrame
+---------------------------------------------
+
+This section explains how to highlight specific columns in a DataFrame using the ``highlight_columns`` function.
+
+**Highlight specific columns in a DataFrame with a specified background color.**
+
+.. function:: highlight_columns(df, columns, color="yellow")
+
+ :param df: The DataFrame to be styled.
+ :type df: pandas.DataFrame
+ :param columns: List of column names to be highlighted.
+ :type columns: list of str
+ :param color: The background color to be applied for highlighting (default is `"yellow"`).
+ :type color: str, optional
+
+ :returns: A Styler object with the specified columns highlighted.
+ :rtype: pandas.io.formats.style.Styler
+
+**Example Usage**
+
+Below, we use the ``highlight_columns`` function to highlight the ``age`` and ``education``
+columns in the first 5 rows of the census [1]_ DataFrame with a pink background color.
+
+.. code-block:: python
+
+ from eda_toolkit import highlight_columns
+
+ # Applying the highlight function
+ highlighted_df = highlight_columns(
+ df=df,
+ columns=["age", "education"],
+ color="#F8C5C8",
+ )
+
+ highlighted_df
+
+**Output**
+
+The output will be a DataFrame with the specified columns highlighted in the given background color.
+The ``age`` and ``education`` columns will be highlighted in pink.
+
+The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the ``.to_html()`` method of the Styler object.
+
+
+.. raw:: html
+
+
+
+
+
+
age
+
workclass
+
fnlwgt
+
education
+
education-num
+
marital-status
+
occupation
+
relationship
+
+
+
+
census_id
+
+
+
+
+
+
+
+
+
+
+
82943611
+
39
+
State-gov
+
77516
+
Bachelors
+
13
+
Never-married
+
Adm-clerical
+
Not-in-family
+
+
+
42643227
+
50
+
Self-emp-not-inc
+
83311
+
Bachelors
+
13
+
Married-civ-spouse
+
Exec-managerial
+
Husband
+
+
+
93837254
+
38
+
Private
+
215646
+
HS-grad
+
9
+
Divorced
+
Handlers-cleaners
+
Not-in-family
+
+
+
87104229
+
53
+
Private
+
234721
+
11th
+
7
+
Married-civ-spouse
+
Handlers-cleaners
+
Husband
+
+
+
90069867
+
28
+
Private
+
338409
+
Bachelors
+
13
+
Married-civ-spouse
+
Prof-specialty
+
Wife
+
+
+
+\
+
+Binning Numerical Columns
+---------------------------
+
+Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or "bins." This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or "bins," and
+then assigning each value to one of these intervals.
+
+.. note::
+
+ The code snippets below create age bins and assign a corresponding age group
+ label to each age in the DataFrame. The ``pd.cut`` function from pandas is used to
+ categorize the ages and assign them to a new column, ``age_group``. Adjust the bins
+ and labels as needed for your specific data.
+
+
+Below, we use the ``age`` column of the census data [1]_ from the UCI Machine Learning Repository as an example:
+
+1. **Bins Definition**:
+ The bins are defined by specifying the boundaries of each interval. For example,
+ in the code snippet below, the ``bin_ages`` list specifies the boundaries for age groups:
+
+ .. code-block:: python
+
+ bin_ages = [
+ 0,
+ 18,
+ 30,
+ 40,
+ 50,
+ 60,
+ 70,
+ 80,
+ 90,
+ 100,
+ float("inf"),
+ ]
+
+
+ Each pair of consecutive elements in ``bin_ages`` defines a bin. For example:
+
+ - The first bin is ``[0, 18)``,
+ - The second bin is ``[18, 30)``,
+ - and so on.
+
+\
+
+2. **Labels for Bins**:
+ The `label_ages` list provides labels corresponding to each bin:
+
+ .. code-block:: python
+
+ label_ages = [
+ "< 18",
+ "18-29",
+ "30-39",
+ "40-49",
+ "50-59",
+ "60-69",
+ "70-79",
+ "80-89",
+ "90-99",
+ "100 +",
+ ]
+
+ These labels are used to categorize the numerical values into meaningful groups.
+
+3. **Applying the Binning**:
+   The `pd.cut <https://pandas.pydata.org/docs/reference/api/pandas.cut.html>`_ function
+ from Pandas is used to apply the binning process. For each value in the ``age``
+ column of the DataFrame, it assigns a corresponding label based on which bin the
+ value falls into. Here, ``right=False`` indicates that each bin includes the
+ left endpoint but excludes the right endpoint. For example, if ``bin_ages =
+ [0, 10, 20, 30]``, then a value of ``10`` will fall into the bin ``[10, 20)`` and
+ be labeled accordingly.
+
+ .. code-block:: python
+
+ df["age_group"] = pd.cut(
+ df["age"],
+ bins=bin_ages,
+ labels=label_ages,
+ right=False,
+ )
+
+ **Mathematically**, for a given value `x` in the ``age`` column:
+
+ .. math::
+
+      \text{age\_group} =
+ \begin{cases}
+ < 18 & \text{if } 0 \leq x < 18 \\
+ 18-29 & \text{if } 18 \leq x < 30 \\
+ \vdots \\
+ 100 + & \text{if } x \geq 100
+ \end{cases}
+
+ The parameter ``right=False`` in ``pd.cut`` means that the bins are left-inclusive
+ and right-exclusive, except for the last bin, which is always right-inclusive
+ when the upper bound is infinity (``float("inf")``).
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.9/_sources/eda_plots.rst.txt b/_build/html/v0.0.9/_sources/eda_plots.rst.txt
new file mode 100644
index 000000000..a6dba5284
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/eda_plots.rst.txt
@@ -0,0 +1,2668 @@
+.. _eda_plots:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Plotting and Theoretical Overview
+=======================================
+
+Gaussian Assumption for Normality
+----------------------------------
+
+The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
+
+.. math::
+
+ f(x) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp\left(-\frac{(x-\mu)^2}{2\sigma^2}\right)
+
+where:
+
+- :math:`\mu` is the mean
+- :math:`\sigma^2` is the variance
+
+In a normally distributed dataset:
+
+- 68% of data falls within :math:`\mu \pm \sigma`
+- 95% within :math:`\mu \pm 2\sigma`
+- 99.7% within :math:`\mu \pm 3\sigma`
+
+.. raw:: html
+
+
+
+.. image:: ../assets/normal_distribution.png
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histograms and Kernel Density Estimation (KDE)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+**Histograms**:
+
+- Visualize data distribution by binning values and counting frequencies.
+- If data is Gaussian, the histogram approximates a bell curve.
+
+**KDE**:
+
+- A non-parametric way to estimate the PDF by smoothing individual data points with a kernel function.
+- The KDE for a dataset :math:`X = \{x_1, x_2, \ldots, x_n\}` is given by:
+
+.. math::
+
+ \hat{f}(x) = \frac{1}{nh} \sum_{i=1}^{n} K\left(\frac{x - x_i}{h}\right)
+
+where:
+
+- :math:`K` is the kernel function (often Gaussian)
+- :math:`h` is the bandwidth (smoothing parameter)
+
+.. raw:: html
+
+ Combined Use of Histograms and KDE
+
+\
+
+- **Histograms** offer a discrete, binned view of the data.
+- **KDE** provides a smooth, continuous estimate of the underlying distribution.
+- Together, they effectively illustrate how well the data aligns with the Gaussian assumption, highlighting any deviations from normality.
+
+
+Pearson Correlation Coefficient
+--------------------------------
+
+The Pearson correlation coefficient, often denoted as :math:`r`, is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from :math:`-1` to :math:`1`, where:
+
+- :math:`r = 1` indicates a perfect positive linear relationship.
+- :math:`r = -1` indicates a perfect negative linear relationship.
+- :math:`r = 0` indicates no linear relationship.
+
+The Pearson correlation coefficient between two variables :math:`X` and :math:`Y` is defined as:
+
+.. math::
+
+ r_{XY} = \frac{\text{Cov}(X, Y)}{\sigma_X \sigma_Y}
+
+where:
+
+- :math:`\text{Cov}(X, Y)` is the covariance of :math:`X` and :math:`Y`.
+- :math:`\sigma_X` is the standard deviation of :math:`X`.
+- :math:`\sigma_Y` is the standard deviation of :math:`Y`.
+
+Covariance measures how much two variables change together. It is defined as:
+
+.. math::
+
+ \text{Cov}(X, Y) = \frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)
+
+where:
+
+- :math:`n` is the number of data points.
+- :math:`X_i` and :math:`Y_i` are the individual data points.
+- :math:`\mu_X` and :math:`\mu_Y` are the means of :math:`X` and :math:`Y`.
+
+The standard deviation measures the dispersion or spread of a set of values. For
+a variable :math:`X`, the standard deviation :math:`\sigma_X` is:
+
+.. math::
+
+ \sigma_X = \sqrt{\frac{1}{n} \sum_{i=1}^{n} (X_i - \mu_X)^2}
+
+Substituting the covariance and standard deviation into the Pearson correlation formula:
+
+.. math::
+
+ r_{XY} = \frac{\sum_{i=1}^{n} (X_i - \mu_X)(Y_i - \mu_Y)}{\sqrt{\sum_{i=1}^{n} (X_i - \mu_X)^2} \sqrt{\sum_{i=1}^{n} (Y_i - \mu_Y)^2}}
+
+This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between :math:`X` and :math:`Y`.
+
+- :math:`r > 0`: Positive correlation. As :math:`X` increases, :math:`Y` tends to increase.
+- :math:`r < 0`: Negative correlation. As :math:`X` increases, :math:`Y` tends to decrease.
+- :math:`r = 0`: No linear correlation. There is no consistent linear relationship between :math:`X` and :math:`Y`.
+
+The closer the value of :math:`r` is to :math:`\pm 1`, the stronger the linear relationship between the two variables.
+
+
+Partial Dependence Foundations
+--------------------------------
+
+Let :math:`\mathbf{X}` represent the complete set of input features for a machine
+learning model, where :math:`\mathbf{X} = \{X_1, X_2, \dots, X_p\}`. Suppose we're
+particularly interested in a subset of these features, denoted by :math:`\mathbf{X}_S`.
+The complementary set, :math:`\mathbf{X}_C`, contains all the features in :math:`\mathbf{X}`
+that are not in :math:`\mathbf{X}_S`. Mathematically, this relationship is expressed as:
+
+.. math::
+
+ \mathbf{X}_C = \mathbf{X} \setminus \mathbf{X}_S
+
+where :math:`\mathbf{X}_C` is the set of features in :math:`\mathbf{X}` after
+removing the features in :math:`\mathbf{X}_S`.
+
+Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in :math:`\mathbf{X}_S` on the model's predictions, while averaging out the
+influence of the features in :math:`\mathbf{X}_C`. This is mathematically defined as:
+
+.. math::
+   \begin{align*}
+   \text{PD}_{\mathbf{X}_S}(x_S) &= \mathbb{E}_{\mathbf{X}_C} \left[ f(x_S, \mathbf{X}_C) \right] \\
+   &= \int f(x_S, x_C) \, p(x_C) \, dx_C
+   \end{align*}
+
+
+where:
+
+- :math:`\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]` indicates that we are taking the expected value over the possible values of the features in the set :math:`\mathbf{X}_C`.
+- :math:`p(x_C)` represents the probability density function of the features in :math:`\mathbf{X}_C`.
+
+This operation effectively summarizes the model's output over all potential values of the complementary features, providing a clear view of how the features in :math:`\mathbf{X}_S` alone impact the model's predictions.
+
+
+**2D Partial Dependence Plots**
+
+Consider a trained machine learning model :math:`f(\mathbf{X})`, where :math:`\mathbf{X} = (X_1, X_2, \dots, X_p)` represents the vector of input features. The partial dependence of the predicted response :math:`\hat{y}` on a single feature :math:`X_j` is defined as:
+
+.. math::
+
+ \text{PD}(X_j) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, \mathbf{X}_{C_i})
+
+where:
+
+- :math:`X_j` is the feature of interest.
+- :math:`\mathbf{X}_{C_i}` represents the complement set of :math:`X_j`, meaning the remaining features in :math:`\mathbf{X}` not included in :math:`X_j` for the :math:`i`-th instance.
+- :math:`n` is the number of observations in the dataset.
+
+For two features, :math:`X_j` and :math:`X_k`, the partial dependence is given by:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of :math:`X_j` and :math:`X_k` vary, while the effects of the other features are averaged out.
+
+- **Single Feature PDP:** When plotting :math:`\text{PD}(X_j)`, the result is a 2D line plot showing the marginal effect of feature :math:`X_j` on the predicted outcome, averaged over all possible values of the other features.
+- **Two Features PDP:** When plotting :math:`\text{PD}(X_j, X_k)`, the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of :math:`X_j` and :math:`X_k` on the predicted outcome. The surface represents the expected value of the prediction as :math:`X_j` and :math:`X_k` vary, while all other features are averaged out.
+
+
+**3D Partial Dependence Plots**
+
+For a more comprehensive analysis, especially when exploring interactions between two features, `3D Partial Dependence Plots`_ are invaluable. The partial dependence function for two features in a 3D context is:
+
+.. math::
+
+ \text{PD}(X_j, X_k) = \frac{1}{n} \sum_{i=1}^{n} f(X_j, X_k, \mathbf{X}_{C_i})
+
+Here, the function :math:`f(X_j, X_k, \mathbf{X}_{C_i})` is evaluated across a grid of values for :math:`X_j` and :math:`X_k`. The resulting 3D surface plot represents how the model's prediction changes over the joint range of these two features.
+
+The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model's predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+- **Feature Interaction Visualization:** The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+- **Enhanced Interpretation:** 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
+
+
+KDE and Histogram Distribution Plots
+=======================================
+
+.. raw:: html
+
+
+
+KDE Distribution Function
+-----------------------------
+
+**Generate KDE or histogram distribution plots for specified columns in a DataFrame.**
+
+The ``kde_distributions`` function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2]_ for plotting, which is built on top of
+matplotlib [3]_ and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
+
+**Key Features and Parameters**
+
+- **Flexible Plotting**: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+- **Leverages Seaborn Library**: The function is built on the `seaborn` library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+- **Customization**: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+- **Scientific Notation Control**: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+- **Log Scaling**: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+- **Output Options**: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
+
+.. function:: kde_distributions(df, vars_of_interest=None, figsize=(5, 5), grid_figsize=None, hist_color="#0000FF", kde_color="#FF0000", mean_color="#000000", median_color="#000000", hist_edgecolor="#000000", hue=None, fill=True, fill_alpha=1, n_rows=None, n_cols=None, w_pad=1.0, h_pad=1.0, image_path_png=None, image_path_svg=None, image_filename=None, bbox_inches=None, single_var_image_filename=None, y_axis_label="Density", plot_type="both", log_scale_vars=None, bins="auto", binwidth=None, label_fontsize=10, tick_fontsize=10, text_wrap=50, disable_sci_notation=False, stat="density", xlim=None, ylim=None, plot_mean=False, plot_median=False, std_dev_levels=None, std_color="#808080", label_names=None, show_legend=True, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param vars_of_interest: List of column names for which to generate distribution plots. If 'all', plots will be generated for all numeric columns.
+ :type vars_of_interest: list of str, optional
+ :param figsize: Size of each individual plot, default is ``(5, 5)``. Used when only one plot is being generated or when saving individual plots.
+ :type figsize: tuple of int, optional
+ :param grid_figsize: Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on ``figsize``, ``n_rows``, and ``n_cols``.
+ :type grid_figsize: tuple of int, optional
+ :param hist_color: Color of the histogram bars, default is ``'#0000FF'``.
+ :type hist_color: str, optional
+ :param kde_color: Color of the KDE plot, default is ``'#FF0000'``.
+ :type kde_color: str, optional
+ :param mean_color: Color of the mean line if ``plot_mean`` is True, default is ``'#000000'``.
+ :type mean_color: str, optional
+ :param median_color: Color of the median line if ``plot_median`` is True, default is ``'#000000'``.
+ :type median_color: str, optional
+ :param hist_edgecolor: Color of the histogram bar edges, default is ``'#000000'``.
+ :type hist_edgecolor: str, optional
+ :param hue: Column name to group data by, adding different colors for each group.
+ :type hue: str, optional
+ :param fill: Whether to fill the histogram bars with color, default is ``True``.
+ :type fill: bool, optional
+ :param fill_alpha: Alpha transparency for the fill color of the histogram bars, where ``0`` is fully transparent and ``1`` is fully opaque. Default is ``1``.
+ :type fill_alpha: float, optional
+ :param n_rows: Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+ :type n_cols: int, optional
+ :param w_pad: Width padding between subplots, default is ``1.0``.
+ :type w_pad: float, optional
+ :param h_pad: Height padding between subplots, default is ``1.0``.
+ :type h_pad: float, optional
+ :param image_path_png: Directory path to save the PNG image of the overall distribution plots.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path to save the SVG image of the overall distribution plots.
+ :type image_path_svg: str, optional
+ :param image_filename: Filename to use when saving the overall distribution plots.
+ :type image_filename: str, optional
+ :param bbox_inches: Bounding box to use when saving the figure. For example, ``'tight'``.
+ :type bbox_inches: str, optional
+ :param single_var_image_filename: Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses ``figsize`` for determining the plot size, ignoring ``grid_figsize``.
+ :type single_var_image_filename: str, optional
+ :param y_axis_label: The label to display on the ``y-axis``, default is ``'Density'``.
+ :type y_axis_label: str, optional
+ :param plot_type: The type of plot to generate, options are ``'hist'``, ``'kde'``, or ``'both'``. Default is ``'both'``.
+ :type plot_type: str, optional
+ :param log_scale_vars: Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+ :type log_scale_vars: str or list of str, optional
+ :param bins: Specification of histogram bins, default is ``'auto'``.
+ :type bins: int or sequence, optional
+ :param binwidth: Width of each bin, overrides bins but can be used with binrange.
+ :type binwidth: float, optional
+ :param label_fontsize: Font size for axis labels, including xlabel, ylabel, and tick marks, default is ``10``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: Maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param disable_sci_notation: Toggle to disable scientific notation on axes, default is ``False``.
+ :type disable_sci_notation: bool, optional
+ :param stat: Aggregate statistic to compute in each bin (e.g., ``'count'``, ``'frequency'``, ``'probability'``, ``'percent'``, ``'density'``), default is ``'density'``.
+ :type stat: str, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param plot_mean: Whether to plot the mean as a vertical line, default is ``False``.
+ :type plot_mean: bool, optional
+ :param plot_median: Whether to plot the median as a vertical line, default is ``False``.
+ :type plot_median: bool, optional
+ :param std_dev_levels: Levels of standard deviation to plot around the mean.
+ :type std_dev_levels: list of int, optional
+ :param std_color: Color(s) for the standard deviation lines, default is ``'#808080'``.
+ :type std_color: str or list of str, optional
+ :param label_names: Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+ :type label_names: dict, optional
+ :param show_legend: Whether to show the legend on the plots, default is ``True``.
+ :type show_legend: bool, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``'hist'``, ``'kde'``, or ``'both'``.
+ - If ``stat`` is not one of ``'count'``, ``'density'``, ``'frequency'``, ``'probability'``, ``'proportion'``, ``'percent'``.
+ - If ``log_scale_vars`` contains variables that are not present in the DataFrame.
+ - If ``fill`` is set to ``False`` and ``hist_edgecolor`` is not the default.
+ - If ``grid_figsize`` is provided when only one plot is being created.
+
+ :raises UserWarning:
+ - If both ``bins`` and ``binwidth`` are specified, which may affect performance.
+
+ :returns: ``None``
+
+
+\
+
+.. raw:: html
+
+
+
+
+
+KDE and Histograms Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In the below example, the ``kde_distributions`` function is used to generate
+histograms for several variables of interest: ``"age"``, ``"education-num"``, and
+``"hours-per-week"``. These variables represent different demographic and
+financial attributes from the dataset. The ``plot_type="both"`` parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data's probability density.
+
+The visualizations are arranged in a single row of three columns, as specified
+by ``n_rows=1`` and ``n_cols=3``, respectively. The overall size of the grid
+figure is set to `14 inches` wide and `4 inches tall` (``grid_figsize=(14, 4)``),
+while each individual plot is configured to be `4 inches` by `4 inches`
+(``figsize=(4, 4)``). The ``fill=True`` parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+``w_pad=1`` and ``h_pad=1``, which add `1 inch` of padding both horizontally and
+vertically.
+
+.. note::
+ If you do not set ``n_rows`` or ``n_cols`` to any values, the function will
+ automatically calculate and create a grid based on the number of variables being
+ plotted, ensuring an optimal arrangement of the plots.
+
+To handle longer titles, the ``text_wrap=50`` parameter ensures that the title
+text wraps to a new line after `50 characters`. The ``bbox_inches="tight"`` setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in ``vars_of_interest`` are
+passed directly to the function for visualization.
+
+Each plot is saved individually with filenames that are prefixed by
+``"kde_density_single_distribution"``, followed by the variable name. The ``y-axis``
+for all plots is labeled as "Density" (``y_axis_label="Density"``), reflecting that
+the height of the bars or KDE line represents the data's density. The histograms
+are divided into `10 bins` (``bins=10``), offering a clear view of the distribution
+of each variable.
+
+Additionally, the font sizes for the axis labels and tick labels
+are set to `16 points` (``label_fontsize=16``) and `14 points` (``tick_fontsize=14``),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both", # Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/kde_density_distributions.svg
+ :alt: KDE Distributions - KDE (+) Histograms (Density)
+ :align: center
+ :width: 950px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Density)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is used to generate histograms for
+the variables ``"age"``, ``"education-num"``, and ``"hours-per-week"`` but with
+``plot_type="hist"``, meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+with a grid size of `14x4 inches` (``grid_figsize=(14, 4)``). The histograms are
+divided into `10 bins` (``bins=10``), and the ``y-axis`` is labeled "Density" (``y_axis_label="Density"``).
+Font sizes for the axis labels and tick labels are set to `16` and `14` points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ show_legend=False,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/hist_density_distributions.svg
+ :alt: KDE Distributions - Histograms (Density)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Histogram Example (Count)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is modified to generate histograms
+with a few key changes. The ``hist_color`` is set to `"orange"`, changing the color of the
+histogram bars. The ``y-axis`` label is updated to "Count" (``y_axis_label="Count"``),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the ``stat`` parameter is set to ``"count"`` to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (``n_rows=1, n_cols=3``),
+a grid size of `14x4 inches`, and a bin count of `10`. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+        stat="count",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ show_legend=False,
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/count_hist_distributions.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Histogram Example - (Mean and Median)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+histograms that include mean and median lines. The ``mean_color`` is set to ``"blue"``
+and the ``median_color`` is set to ``"black"``, allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted ``(plot_mean=True, plot_median=True)``.
+The ``y_axis_label`` remains ``"Density"``, indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using ``hist_color="brown"``, with a ``fill_alpha=0.60``, while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns ``(n_rows=1, n_cols=3)``, and
+the grid size is set to `14x4 inches`. This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ "education-num",
+ "hours-per-week",
+ ]
+
+ kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14, 4), # Size of the overall grid figure
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.60,
+ plot_type="hist",
+        stat="density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_mean_median.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Histogram Example - (Mean, Median, and Std. Deviation)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``kde_distributions()`` function is customized to generate
+a histogram that includes mean, median, and three standard deviation lines. The
+``mean_color`` is set to ``"blue"`` and the ``median_color`` is set to ``"black"``,
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted ``(plot_mean=True, plot_median=True)``. The ``y_axis_label`` remains
+``"Density"``, indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using ``hist_color="brown"``,
+with a ``fill_alpha=0.40``, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors ``"purple"``,
+``"green"``, and ``"silver"`` for one, two, and three standard deviations, respectively.
+
+The layout is configured as a single plot with a figure size of `10x6 inches`
+(``figsize=(10, 6)``). This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+.. note::
+
+ You have the freedom to choose whether to plot the mean, median, and
+ standard deviation lines. You can display one, none, or all of these simultaneously.
+
+.. code-block:: python
+
+ from eda_toolkit import kde_distributions
+
+ vars_of_interest = [
+ "age",
+ ]
+
+ kde_distributions(
+ df=df,
+ figsize=(10, 6),
+ text_wrap=50,
+ hist_color="brown",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ fill_alpha=0.40,
+ plot_type="both",
+ stat="Density",
+ label_fontsize=16, # Font size for axis labels
+ tick_fontsize=14, # Font size for tick labels
+ plot_mean=True,
+ plot_median=True,
+ mean_color="blue",
+ image_path_svg=image_path_svg,
+ image_path_png=image_path_png,
+ std_dev_levels=[
+ 1,
+ 2,
+ 3,
+ ],
+ std_color=[
+ "purple",
+ "green",
+ "silver",
+ ],
+ )
+
+.. raw:: html
+
+
+
+.. image:: ../assets/density_hist_dist_age.svg
+ :alt: KDE Distributions - Histograms (Count)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Stacked Crosstab Plots
+=======================
+
+**Generates stacked bar plots and crosstabs for specified columns in a DataFrame.**
+
+The ``stacked_crosstab_plot`` function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
+The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
+
+.. function:: stacked_crosstab_plot(df, col, func_col, legend_labels_list, title, kind="bar", width=0.9, rot=0, custom_order=None, image_path_png=None, image_path_svg=None, save_formats=None, color=None, output="both", return_dict=False, x=None, y=None, p=None, file_prefix=None, logscale=False, plot_type="both", show_legend=True, label_fontsize=12, tick_fontsize=10, text_wrap=50, remove_stacks=False)
+
+ Generates stacked or regular bar plots and crosstabs for specified columns.
+
+ This function allows users to create stacked bar plots (or regular bar plots
+ if stacks are removed) and corresponding crosstabs for specific columns
+ in a DataFrame. It provides options to customize the appearance, including
+ font sizes for axis labels, tick labels, and title text wrapping, and to
+ choose between regular or normalized plots.
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param col: The name of the column in the DataFrame to be analyzed.
+ :type col: str
+ :param func_col: List of ground truth columns to be analyzed.
+ :type func_col: list
+ :param legend_labels_list: List of legend labels for each ground truth column.
+ :type legend_labels_list: list
+ :param title: List of titles for the plots.
+ :type title: list
+ :param kind: The kind of plot to generate (``'bar'`` or ``'barh'`` for horizontal bars), default is ``'bar'``.
+ :type kind: str, optional
+ :param width: The width of the bars in the bar plot, default is ``0.9``.
+ :type width: float, optional
+ :param rot: The rotation angle of the ``x-axis`` labels, default is ``0``.
+ :type rot: int, optional
+ :param custom_order: Specifies a custom order for the categories in the ``col``.
+ :type custom_order: list, optional
+ :param image_path_png: Directory path where generated PNG plot images will be saved.
+ :type image_path_png: str, optional
+ :param image_path_svg: Directory path where generated SVG plot images will be saved.
+ :type image_path_svg: str, optional
+ :param save_formats: List of file formats to save the plot images in.
+ :type save_formats: list, optional
+ :param color: List of colors to use for the plots. If not provided, a default color scheme is used.
+ :type color: list, optional
+ :param output: Specify the output type: ``"plots_only"``, ``"crosstabs_only"``, or ``"both"``. Default is ``"both"``.
+ :type output: str, optional
+ :param return_dict: Specify whether to return the crosstabs dictionary, default is ``False``.
+ :type return_dict: bool, optional
+ :param x: The width of the figure.
+ :type x: int, optional
+ :param y: The height of the figure.
+ :type y: int, optional
+ :param p: The padding between the subplots.
+ :type p: int, optional
+ :param file_prefix: Prefix for the filename when output includes plots.
+ :type file_prefix: str, optional
+ :param logscale: Apply log scale to the ``y-axis``, default is ``False``.
+ :type logscale: bool, optional
+ :param plot_type: Specify the type of plot to generate: ``"both"``, ``"regular"``, ``"normalized"``. Default is ``"both"``.
+ :type plot_type: str, optional
+ :param show_legend: Specify whether to show the legend, default is ``True``.
+ :type show_legend: bool, optional
+ :param label_fontsize: Font size for axis labels, default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for tick labels on the axes, default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping, default is ``50``.
+ :type text_wrap: int, optional
+ :param remove_stacks: If ``True``, removes stacks and creates a regular bar plot using only the ``col`` parameter. Only works when ``plot_type`` is set to ``'regular'``. Default is ``False``.
+ :type remove_stacks: bool, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (`min, max`).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (`min, max`).
+ :type ylim: tuple or list, optional
+
+ :raises ValueError:
+ - If ``output`` is not one of ``"both"``, ``"plots_only"``, or ``"crosstabs_only"``.
+ - If ``plot_type`` is not one of ``"both"``, ``"regular"``, ``"normalized"``.
+ - If ``remove_stacks`` is set to True and ``plot_type`` is not ``"regular"``.
+ - If the lengths of ``title``, ``func_col``, and ``legend_labels_list`` are not equal.
+ :raises KeyError: If any columns specified in ``col`` or ``func_col`` are missing in the DataFrame.
+
+ :returns: Dictionary of crosstabs DataFrames if ``return_dict`` is ``True``. Otherwise, returns ``None``.
+ :rtype: ``dict`` or ``None``
+
+
+
+Stacked Bar Plots With Crosstabs Example
+-----------------------------------------
+
+The provided code snippet demonstrates how to use the ``stacked_crosstab_plot``
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here's a detailed breakdown of the code using the census
+dataset as an example [1]_.
+
+First, the ``func_col`` list is defined, specifying the columns ``["sex", "income"]``
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The ``legend_labels_list`` is then defined, with each entry corresponding to a
+column in ``func_col``. In this case, the labels for the ``sex`` column are
+``["Male", "Female"]``, and for the ``income`` column, they are ``["<=50K", ">50K"]``.
+These labels will be used to annotate the legends of the plots.
+
+Next, the ``title`` list is defined, providing titles for each plot corresponding
+to the columns in ``func_col``. The titles are set to ``["Sex", "Income"]``,
+which will be displayed on top of each respective plot.
+
+.. note::
+
+ The ``legend_labels_list`` parameter should be a list of lists, where each
+ inner list corresponds to the ground truth labels for the respective item in
+ the ``func_col`` list. Each element in the ``func_col`` list represents a
+ column in your DataFrame that you wish to analyze, and the corresponding
+ inner list in ``legend_labels_list`` should contain the labels that will be
+ used in the legend of your plots.
+
+For example:
+
+.. code-block:: python
+
+ # Define the func_col to use in the loop in order of usage
+ func_col = ["sex", "income"]
+
+ # Define the legend_labels to use in the loop
+ legend_labels_list = [
+ ["Male", "Female"], # Corresponds to "sex"
+ ["<=50K", ">50K"], # Corresponds to "income"
+ ]
+
+ # Define titles for the plots
+ title = [
+ "Sex",
+ "Income",
+ ]
+
+.. important::
+
+ Ensure that the number of elements in ``func_col``, ``legend_labels_list``,
+ and ``title`` are the same. Each item in ``func_col`` must have a corresponding
+ list of labels in ``legend_labels_list`` and a title in ``title``. This
+ consistency is essential for the function to correctly generate the plots
+ with the appropriate labels and titles.
+
+
+In this example:
+
+- ``func_col`` contains two elements: ``"sex"`` and ``"income"``. Each corresponds to a specific column in your DataFrame.
+- ``legend_labels_list`` is a nested list containing two inner lists:
+
+ - The first inner list, ``["Male", "Female"]``, corresponds to the ``"sex"`` column in ``func_col``.
+ - The second inner list, ``["<=50K", ">50K"]``, corresponds to the ``"income"`` column in ``func_col``.
+
+- ``title`` contains two elements: ``"Sex"`` and ``"Income"``, which will be used as the titles for the respective plots.
+
+.. note::
+
+ If you assign the function to a variable, the dictionary returned when
+ ``return_dict=True`` will be suppressed in the output. However, the dictionary
+ is still available within the assigned variable for further use.
+
+
+.. code-block:: python
+
+ from eda_toolkit import stacked_crosstab_plot
+
+ # Call the stacked_crosstab_plot function
+ stacked_crosstabs = stacked_crosstab_plot(
+ df=df,
+ col="age_group",
+ func_col=func_col,
+ legend_labels_list=legend_labels_list,
+ title=title,
+ kind="bar",
+ width=0.8,
+ rot=45, # axis rotation angle
+ custom_order=None,
+ color=["#00BFC4", "#F8766D"], # default color schema
+ output="both",
+ return_dict=True,
+ x=14,
+ y=8,
+ p=10,
+ logscale=False,
+ plot_type="both",
+ show_legend=True,
+ label_fontsize=14,
+ tick_fontsize=12,
+ )
+
+The above example generates stacked bar plots for ``"sex"`` and ``"income"``
+grouped by ``"age_group"``. The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+.. important::
+
+ **Importance of Correctly Aligning Labels**
+
+ It is crucial to properly align the elements in the ``legend_labels_list``,
+ ``title``, and ``func_col`` parameters when using the ``stacked_crosstab_plot``
+ function. Each of these lists must be ordered consistently because the function
+ relies on their alignment to correctly assign labels and titles to the
+ corresponding plots and legends.
+
+ **For instance, in the example above:**
+
+ - The first element in ``func_col`` is ``"sex"``, and it is aligned with the first set of labels ``["Male", "Female"]`` in ``legend_labels_list`` and the first title ``"Sex"`` in the ``title`` list.
+ - Similarly, the second element in ``func_col``, ``"income"``, aligns with the labels ``["<=50K", ">50K"]`` and the title ``"Income"``.
+
+ **Misalignment between these lists would result in incorrect labels or titles being
+ applied to the plots, potentially leading to confusion or misinterpretation of the data.
+ Therefore, it's important to ensure that each list is ordered appropriately and
+ consistently to accurately reflect the data being visualized.**
+
+ **Proper Setup of Lists**
+
+ When setting up the ``legend_labels_list``, ``title``, and ``func_col``, ensure
+ that each element in the lists corresponds to the correct variable in the DataFrame.
+ This involves:
+
+ - **Ordering**: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+ - **Consistency**: Double-checking that each label in ``legend_labels_list`` matches the categories present in the corresponding ``func_col``, and that the ``title`` accurately describes the plot.
+
+ By adhering to these guidelines, you can ensure that the ``stacked_crosstab_plot``
+ function produces accurate and meaningful visualizations that are easy to interpret and analyze.
+
+**Output**
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_sex.svg
+    :alt: Stacked Bar Plot Age vs. Sex
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income.svg
+ :alt: Stacked Bar Plot Age vs. Income
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+.. note::
+
+ When you set ``return_dict=True``, you are able to see the crosstabs printed out
+ as shown below.
+
+.. raw:: html
+
+
+
+
+
Crosstab for sex
+
+
+
+
+
+
+
+
sex
+
Female
+
Male
+
Total
+
Female_%
+
Male_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
295
+
300
+
595
+
49.58
+
50.42
+
+
+
18-29
+
5707
+
8213
+
13920
+
41
+
59
+
+
+
30-39
+
3853
+
9076
+
12929
+
29.8
+
70.2
+
+
+
40-49
+
3188
+
7536
+
10724
+
29.73
+
70.27
+
+
+
50-59
+
1873
+
4746
+
6619
+
28.3
+
71.7
+
+
+
60-69
+
939
+
2115
+
3054
+
30.75
+
69.25
+
+
+
70-79
+
280
+
535
+
815
+
34.36
+
65.64
+
+
+
80-89
+
40
+
91
+
131
+
30.53
+
69.47
+
+
+
90-99
+
17
+
38
+
55
+
30.91
+
69.09
+
+
+
Total
+
16192
+
32650
+
48842
+
33.15
+
66.85
+
+
+
+
+
+
Crosstab for income
+
+
+
+
+
+
income
+
<=50K
+
>50K
+
Total
+
<=50K_%
+
>50K_%
+
+
+
age_group
+
+
+
+
+
+
+
+
< 18
+
595
+
0
+
595
+
100
+
0
+
+
+
18-29
+
13174
+
746
+
13920
+
94.64
+
5.36
+
+
+
30-39
+
9468
+
3461
+
12929
+
73.23
+
26.77
+
+
+
40-49
+
6738
+
3986
+
10724
+
62.83
+
37.17
+
+
+
50-59
+
4110
+
2509
+
6619
+
62.09
+
37.91
+
+
+
60-69
+
2245
+
809
+
3054
+
73.51
+
26.49
+
+
+
70-79
+
668
+
147
+
815
+
81.96
+
18.04
+
+
+
80-89
+
115
+
16
+
131
+
87.79
+
12.21
+
+
+
90-99
+
42
+
13
+
55
+
76.36
+
23.64
+
+
+
Total
+
37155
+
11687
+
48842
+
76.07
+
23.93
+
+
+
+\
+
+When you set ``return_dict=True``, you can access these crosstabs as
+DataFrames by assigning them to their own variables. For example:
+
+.. code-block:: python
+
+    crosstab_age_sex = stacked_crosstabs["sex"]
+    crosstab_age_income = stacked_crosstabs["income"]
+
+
+Pivoted Stacked Bar Plots Example
+-----------------------------------
+
+Using the census dataset [1]_, to create horizontal stacked bar plots, set the ``kind`` parameter to
+``"barh"`` in the ``stacked_crosstab_plot`` function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the ``y-axis``.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_pivoted.svg
+ :alt: Stacked Bar Plot Age vs. Income (Pivoted)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Non-Normalized Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to create stacked bar plots without the normalized versions,
+set the ``plot_type`` parameter to ``"regular"`` in the ``stacked_crosstab_plot``
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the ``plot_type`` to ``"normalized"`` will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Stacked_Bar_Age_income_regular.svg
+ :alt: Stacked Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Regular Non-Stacked Bar Plots Example
+----------------------------------------------------
+
+In the census data [1]_, to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the ``plot_type`` parameter to ``"regular"``
+in the ``stacked_crosstab_plot`` function and enable ``remove_stacks`` by setting
+it to ``True``. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+``plot_type`` to ``"normalized"`` will display only the normalized plots.
+
+When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
+In the example below, the color of the bars has been set to a dark grey (``#333333``),
+and the legend has been removed by setting ``show_legend=False``. This illustrates
+regular bar plots for income by age, without stacking.
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/Bar_Age_regular_income.svg
+ :alt: Bar Plot Age vs. Income (Regular)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Box and Violin Plots
+===========================
+
+**Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.**
+
+The ``box_violin_plot`` function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
+With options to customize the plot type (``boxplot`` or ``violinplot``),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
+Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
+The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
+
+.. function:: box_violin_plot(df, metrics_list, metrics_comp, n_rows=None, n_cols=None, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, plot_type="boxplot", xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, xlim=None, ylim=None, label_names=None, **kwargs)
+
+ :param df: The DataFrame containing the data to plot.
+ :type df: pandas.DataFrame
+ :param metrics_list: List of metric names (columns in df) to plot.
+ :type metrics_list: list of str
+ :param metrics_comp: List of comparison categories (columns in df).
+ :type metrics_comp: list of str
+ :param n_rows: Number of rows in the subplot grid. Calculated automatically if not provided.
+ :type n_rows: int, optional
+ :param n_cols: Number of columns in the subplot grid. Calculated automatically if not provided.
+ :type n_cols: int, optional
+ :param image_path_png: Optional directory path to save ``.png`` images.
+ :type image_path_png: str, optional
+ :param image_path_svg: Optional directory path to save ``.svg`` images.
+ :type image_path_svg: str, optional
+ :param save_plots: String, ``"all"``, ``"individual"``, or ``"grid"`` to control saving plots.
+ :type save_plots: str, optional
+ :param show_legend: Boolean, True if showing the legend in the plots. Default is ``True``.
+ :type show_legend: bool, optional
+ :param plot_type: Specify the type of plot, either ``"boxplot"`` or ``"violinplot"``. Default is ``"boxplot"``.
+ :type plot_type: str, optional
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+ :param show_plot: Specify the plot display mode: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+ :param rotate_plot: Boolean, True if rotating (pivoting) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+ :param grid_figsize: Width and height of the figure for grid plots.
+ :type grid_figsize: tuple or list, optional
+ :param label_fontsize: Font size for axis labels. Default is ``12``.
+ :type label_fontsize: int, optional
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+ :param label_names: Dictionary mapping original column names to custom labels. Default is ``None``.
+ :type label_names: dict, optional
+ :param kwargs: Additional keyword arguments passed to the Seaborn plotting function.
+ :type kwargs: additional keyword arguments
+
+ :raises ValueError:
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set without specifying ``image_path_png`` or ``image_path_svg``.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` is not a tuple or list of two numbers.
+ - If ``grid_figsize`` is provided and is not a tuple or list of two numbers.
+
+ :returns: ``None``
+
+
+
+This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
+
+Box Plots Grid Example
+-----------------------
+
+In this example with the US census data [1]_, the ``box_violin_plot`` function is employed to create a grid of
+boxplots, comparing different metrics against the ``"age_group"`` column in the
+DataFrame. The ``metrics_comp`` parameter is set to [``"age_group"``], meaning
+that the comparison will be based on different age groups. The ``metrics_list`` is
+provided as ``age_boxplot_list``, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The ``image_path_png`` and
+``image_path_svg`` parameters are specified to save the plots in both PNG and
+SVG formats, and the ``save_plots`` option is set to ``"all"``, ensuring that both
+individual and grid plots are saved.
+
+The plots are displayed in a grid format, as indicated by the ``show_plot="grid"``
+parameter. The ``plot_type`` is set to ``"boxplot"``, so the function will generate
+boxplots for each metric in the list. Additionally, the ``x-axis`` labels are rotated
+by 90 degrees (``xlabel_rot=90``) to ensure that the labels are legible. The legend is
+hidden by setting ``show_legend=False``, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
+
+
+.. code-block:: python
+
+ age_boxplot_list = df[
+ [
+ "education-num",
+ "hours-per-week",
+ ]
+ ].columns.to_list()
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+        metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="boxplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Violin Plots Grid Example
+--------------------------
+
+In this example with the US census data [1]_, we keep everything the same as the prior example, but change the
+``plot_type`` to ``violinplot``. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
+
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+ metrics_comp=metrics_comp,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ save_plots="all",
+ show_plot="both",
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=90,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Pivoted Violin Plots Grid Example
+------------------------------------
+
+In this example with the US census data [1]_, we set ``xlabel_rot=0`` and ``rotate_plot=True``
+to pivot the plot, changing the orientation of the axes while keeping the ``x-axis`` labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
+
+.. code-block:: python
+
+ from eda_toolkit import box_violin_plot
+
+ metrics_comp = ["age_group"]
+
+ box_violin_plot(
+ df=df,
+ metrics_list=age_boxplot_list,
+        metrics_comp=metrics_comp,
+ show_plot="both",
+ rotate_plot=True,
+ show_legend=False,
+ plot_type="violinplot",
+ xlabel_rot=0,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots and Best Fit Lines
+==================================
+
+Scatter Fit Plot
+------------------
+
+**Create and Save Scatter Plots or a Grid of Scatter Plots**
+
+This function, ``scatter_fit_plot``, is designed to generate scatter plots for
+one or more pairs of variables (``x_vars`` and ``y_vars``) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
+**Optional Best Fit Line**
+
+An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
+**Customizable Plot Aesthetics**
+
+The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+- **Point Color**: You can specify a default color for the scatter points or use a ``hue`` parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
+- **Point Size**: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
+- **Markers**: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+**Axis and Label Configuration**
+
+The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
+**Plot Display and Saving Options**
+
+The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
+**Correlation Coefficient Display**
+
+For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
+
+.. function:: scatter_fit_plot(df, x_vars=None, y_vars=None, n_rows=None, n_cols=None, max_cols=4, image_path_png=None, image_path_svg=None, save_plots=None, show_legend=True, xlabel_rot=0, show_plot="both", rotate_plot=False, individual_figsize=(6, 4), grid_figsize=None, label_fontsize=12, tick_fontsize=10, text_wrap=50, add_best_fit_line=False, scatter_color="C0", best_fit_linecolor="red", best_fit_linestyle="-", hue=None, hue_palette=None, size=None, sizes=None, marker="o", show_correlation=True, xlim=None, ylim=None, all_vars=None, label_names=None, **kwargs)
+
+ Create and save scatter plots or a grid of scatter plots for given ``x_vars``
+ and ``y_vars``, with an optional best fit line and customizable point color,
+ size, and markers.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param x_vars: List of variable names to plot on the ``x-axis``.
+ :type x_vars: list of str, optional
+
+ :param y_vars: List of variable names to plot on the ``y-axis``.
+ :type y_vars: list of str, optional
+
+ :param n_rows: Number of rows in the subplot grid. Calculated based on the number of plots and ``n_cols`` if not specified.
+ :type n_rows: int, optional
+
+ :param n_cols: Number of columns in the subplot grid. Calculated based on the number of plots and ``max_cols`` if not specified.
+ :type n_cols: int, optional
+
+ :param max_cols: Maximum number of columns in the subplot grid. Default is ``4``.
+ :type max_cols: int, optional
+
+ :param image_path_png: Directory path to save PNG images of the scatter plots.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the scatter plots.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls which plots to save: ``"all"``, ``"individual"``, or ``"grid"``. If None, plots will not be saved.
+ :type save_plots: str, optional
+
+ :param show_legend: Whether to display the legend on the plots. Default is ``True``.
+ :type show_legend: bool, optional
+
+ :param xlabel_rot: Rotation angle for ``x-axis`` labels. Default is ``0``.
+ :type xlabel_rot: int, optional
+
+ :param show_plot: Controls plot display: ``"individual"``, ``"grid"``, or ``"both"``. Default is ``"both"``.
+ :type show_plot: str, optional
+
+ :param rotate_plot: Whether to rotate (pivot) the plots. Default is ``False``.
+ :type rotate_plot: bool, optional
+
+ :param individual_figsize: Width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple or list, optional
+
+ :param grid_figsize: Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+ :type grid_figsize: tuple or list, optional
+
+ :param label_fontsize: Font size for axis labels. Default is 12.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is 10.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param add_best_fit_line: Whether to add a best fit line to the scatter plots. Default is ``False``.
+ :type add_best_fit_line: bool, optional
+
+ :param scatter_color: Color code for the scattered points. Default is ``"C0"``.
+ :type scatter_color: str, optional
+
+ :param best_fit_linecolor: Color code for the best fit line. Default is ``"red"``.
+ :type best_fit_linecolor: str, optional
+
+ :param best_fit_linestyle: Linestyle for the best fit line. Default is ``"-"``.
+ :type best_fit_linestyle: str, optional
+
+ :param hue: Column name for the grouping variable that will produce points with different colors.
+ :type hue: str, optional
+
+ :param hue_palette: Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the ``hue`` parameter to be set.
+ :type hue_palette: dict, list, or str, optional
+
+ :param size: Column name for the grouping variable that will produce points with different sizes.
+ :type size: str, optional
+
+ :param sizes: Dictionary mapping sizes (smallest and largest) to min and max values.
+ :type sizes: dict, optional
+
+ :param marker: Marker style used for the scatter points. Default is ``"o"``.
+ :type marker: str, optional
+
+ :param show_correlation: Whether to display the Pearson correlation coefficient in the plot title. Default is ``True``.
+ :type show_correlation: bool, optional
+
+ :param xlim: Limits for the ``x-axis`` as a tuple or list of (``min``, ``max``).
+ :type xlim: tuple or list, optional
+
+ :param ylim: Limits for the ``y-axis`` as a tuple or list of (``min``, ``max``).
+ :type ylim: tuple or list, optional
+
+ :param all_vars: If provided, automatically generates scatter plots for all combinations of variables in this list, overriding `x_vars` and `y_vars`.
+ :type all_vars: list of str, optional
+
+ :param label_names: A dictionary to rename columns for display in the plot titles and labels.
+ :type label_names: dict, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``sns.scatterplot``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``all_vars`` is provided and either ``x_vars`` or ``y_vars`` is also provided.
+ - If neither ``all_vars`` nor both ``x_vars`` and ``y_vars`` are provided.
+ - If ``hue_palette`` is specified without ``hue``.
+ - If ``show_plot`` is not one of ``"individual"``, ``"grid"``, or ``"both"``.
+ - If ``save_plots`` is not one of ``None``, ``"all"``, ``"individual"``, or ``"grid"``.
+ - If ``save_plots`` is set but no image paths are provided.
+ - If ``rotate_plot`` is not a boolean value.
+ - If ``individual_figsize`` or ``grid_figsize`` are not tuples/lists with two numeric values.
+
+ :returns: ``None``. This function does not return any value but generates and optionally saves scatter plots for the specified ``x_vars`` and ``y_vars``, or for all combinations of variables in ``all_vars`` if it is provided.
+
+
+
+Regression-Centric Scatter Plots Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this US census data [1]_ example, the ``scatter_fit_plot`` function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the ``show_correlation=True`` parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the ``add_best_fit_line=True`` parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ grid_figsize=None,
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. image:: ../assets/scatter_plots_grid.png
+ :alt: Scatter Plot Comparisons (with Best Fit Lines)
+ :align: center
+ :width: 900px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Scatter Plots Grouped by Category Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of
+scatter plots that examine the relationships between ``age`` and ``hours-per-week``
+as well as ``education-num`` and ``hours-per-week``. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+1. **Hue and Hue Palette**: The ``hue`` parameter is set to ``"income"``, meaning that the
+ data points in the scatter plots are colored according to the values in the ``income``
+ column. A custom color mapping is provided via the ``hue_palette`` parameter, where the
+ income categories ``"<=50K"`` and ``">50K"`` are assigned the colors ``"brown"`` and
+ ``"green"``, respectively. This change visually distinguishes the data points based on
+ income levels.
+
+2. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, which applies
+ a grey color to the scatter points when no ``hue`` is provided. However, since a ``hue``
+ is specified in this example, the ``hue_palette`` takes precedence and overrides this color setting.
+
+3. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``False``, meaning that
+ no best fit line is added to the scatter plots. This differs from the previous example where
+ a best fit line was included.
+
+4. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``False``, so the
+ Pearson correlation coefficient will not be displayed in the plot titles. This is another
+ change from the previous example where the correlation coefficient was included.
+
+5. **Hue Legend**: The ``show_legend`` parameter remains set to ``True``, ensuring that the
+ legend displaying the hue categories (``"<=50K"`` and ``">50K"``) appears on the plots,
+ helping to interpret the color coding of the data points.
+
+These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots can optionally be
+saved as images by supplying the appropriate image paths.
+
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ hue_dict = {"<=50K": "brown", ">50K": "green"}
+
+ scatter_fit_plot(
+ df=df,
+ x_vars=["age", "education-num"],
+ y_vars=["hours-per-week"],
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=False,
+ scatter_color="#808080",
+ hue="income",
+ hue_palette=hue_dict,
+ show_correlation=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Scatter Plots (All Combinations Example)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In this example, the ``scatter_fit_plot`` function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the ``df`` DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+1. **All Variables Combination**: The ``all_vars`` parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don't need to manually specify ``x_vars`` and ``y_vars``, as the function will iterate through each possible pair.
+
+2. **Grid Display**: The ``show_plot`` parameter is set to ``"grid"``, so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
+3. **Font Sizes**: The ``label_fontsize`` and ``tick_fontsize`` parameters are set to ``14`` and ``12``, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
+4. **Best Fit Line**: The ``add_best_fit_line`` parameter is set to ``True``, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
+5. **Scatter Color**: The ``scatter_color`` parameter is set to ``"#808080"``, applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
+6. **Correlation Coefficient**: The ``show_correlation`` parameter is set to ``True``, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are displayed in a grid format, with added best fit lines and correlation coefficients for deeper analysis. If image paths are provided, the resulting images can be saved for future reference.
+
+.. code-block:: python
+
+ from eda_toolkit import scatter_fit_plot
+
+ scatter_fit_plot(
+ df=df,
+ all_vars=df.select_dtypes(np.number).columns.to_list(),
+ show_legend=True,
+ show_plot="grid",
+ label_fontsize=14,
+ tick_fontsize=12,
+ add_best_fit_line=True,
+ scatter_color="#808080",
+ show_correlation=True,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Correlation Matrices
+=====================
+
+**Generate and Save Customizable Correlation Heatmaps**
+
+The ``flex_corr_matrix`` function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
+**Customizable Plot Appearance**
+
+The function provides extensive customization options for the heatmap's appearance:
+
+- **Colormap Selection**: Choose from a variety of colormaps to represent the strength of correlations. The default is ``"coolwarm"``, but this can be adjusted to fit the needs of the analysis.
+
+- **Annotation**: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
+- **Figure Size and Layout**: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+**Triangular vs. Full Correlation Matrix**
+
+
+A key feature of the ``flex_corr_matrix`` function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
+**Label and Axis Configuration**
+
+
+The function offers flexibility in configuring axis labels and titles:
+
+- **Label Rotation**: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+- **Font Sizes**: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+- **Title Wrapping**: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+**Plot Display and Saving Options**
+
+
+The ``flex_corr_matrix`` function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
+
+.. function:: flex_corr_matrix(df, cols=None, annot=True, cmap="coolwarm", save_plots=False, image_path_png=None, image_path_svg=None, figsize=(10, 10), title="Cervical Cancer Data: Correlation Matrix", label_fontsize=12, tick_fontsize=10, xlabel_rot=45, ylabel_rot=0, xlabel_alignment="right", ylabel_alignment="center_baseline", text_wrap=50, vmin=-1, vmax=1, cbar_label="Correlation Index", triangular=True, **kwargs)
+
+ Create a customizable correlation heatmap with options for annotation, color mapping, figure size, and saving the plot.
+
+ :param df: The DataFrame containing the data.
+ :type df: pandas.DataFrame
+
+ :param cols: List of column names to include in the correlation matrix. If None, all columns are included.
+ :type cols: list of str, optional
+
+ :param annot: Whether to annotate the heatmap with correlation coefficients. Default is ``True``.
+ :type annot: bool, optional
+
+ :param cmap: The colormap to use for the heatmap. Default is ``"coolwarm"``.
+ :type cmap: str, optional
+
+ :param save_plots: Controls whether to save the plots. Default is ``False``.
+ :type save_plots: bool, optional
+
+ :param image_path_png: Directory path to save PNG images of the heatmap.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save SVG images of the heatmap.
+ :type image_path_svg: str, optional
+
+ :param figsize: Width and height of the figure for the heatmap. Default is ``(10, 10)``.
+ :type figsize: tuple, optional
+
+ :param title: Title of the heatmap. Default is ``"Cervical Cancer Data: Correlation Matrix"``.
+ :type title: str, optional
+
+ :param label_fontsize: Font size for tick labels and colorbar label. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param xlabel_rot: Rotation angle for x-axis labels. Default is ``45``.
+ :type xlabel_rot: int, optional
+
+ :param ylabel_rot: Rotation angle for y-axis labels. Default is ``0``.
+ :type ylabel_rot: int, optional
+
+ :param xlabel_alignment: Horizontal alignment for x-axis labels. Default is ``"right"``.
+ :type xlabel_alignment: str, optional
+
+ :param ylabel_alignment: Vertical alignment for y-axis labels. Default is ``"center_baseline"``.
+ :type ylabel_alignment: str, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param vmin: Minimum value for the heatmap color scale. Default is ``-1``.
+ :type vmin: float, optional
+
+ :param vmax: Maximum value for the heatmap color scale. Default is ``1``.
+ :type vmax: float, optional
+
+ :param cbar_label: Label for the colorbar. Default is ``"Correlation Index"``.
+ :type cbar_label: str, optional
+
+ :param triangular: Whether to show only the upper triangle of the correlation matrix. Default is ``True``.
+ :type triangular: bool, optional
+
+ :param kwargs: Additional keyword arguments to pass to ``seaborn.heatmap()``.
+ :type kwargs: dict, optional
+
+ :raises ValueError:
+ - If ``annot`` is not a boolean.
+ - If ``cols`` is not a list.
+ - If ``save_plots`` is not a boolean.
+ - If ``triangular`` is not a boolean.
+      - If ``save_plots`` is ``True`` but no image paths are provided.
+
+ :returns: ``None``
+ This function does not return any value but generates and optionally saves a correlation heatmap.
+
+Triangular Correlation Matrix Example
+--------------------------------------
+
+The provided code filters the census [1]_ DataFrame ``df`` to include only numeric columns using
+``select_dtypes(np.number)``. It then utilizes the ``flex_corr_matrix()`` function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+.. note::
+
+ This triangular matrix format is particularly useful for avoiding
+ redundancy in correlation matrices, as it excludes the lower half,
+ making it easier to focus on unique pairwise correlations.
+
+The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
+.. code-block:: python
+
+ # Select only numeric data to pass into the function
+ df_num = df.select_dtypes(np.number)
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="coolwarm",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=True,
+ )
+
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Full Correlation Matrix Example
+----------------------------------
+
+In this modified census [1]_ example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting ``cmap="viridis"``, the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting ``triangular=False``, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
+
+.. code-block:: python
+
+ from eda_toolkit import flex_corr_matrix
+
+ flex_corr_matrix(
+ df=df,
+ cols=df_num.columns.to_list(),
+ annot=True,
+ cmap="viridis",
+ figsize=(10, 8),
+ title="US Census Correlation Matrix",
+ xlabel_alignment="right",
+ label_fontsize=14,
+ tick_fontsize=12,
+ xlabel_rot=45,
+ ylabel_rot=0,
+ text_wrap=50,
+ vmin=-1,
+ vmax=1,
+ cbar_label="Correlation Index",
+ triangular=False,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+Partial Dependence Plots
+=========================
+
+**Partial Dependence Plots (PDPs)** are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model's predictions.
+
+2D Partial Dependence Plots
+-----------------------------
+
+The ``plot_2d_pdp`` function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+- **Grid and Individual Plots**: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+- **Customization Options**: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+- **Saving Plots**: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
+
+.. function:: plot_2d_pdp(model, X_train, feature_names, features, title="PDP of house value on CA non-location features", grid_resolution=50, plot_type="grid", grid_figsize=(12, 8), individual_figsize=(6, 4), label_fontsize=12, tick_fontsize=10, text_wrap=50, image_path_png=None, image_path_svg=None, save_plots=None, file_prefix="partial_dependence")
+
+ Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param X_train: The training data used to compute partial dependence. Should correspond to the features used to train the model.
+ :type X_train: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names: A list of feature names corresponding to the columns in ``X_train``.
+ :type feature_names: list of str
+
+ :param features: A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+ :type features: list of int or tuple of int
+
+ :param title: The title for the entire plot. Default is ``"PDP of house value on CA non-location features"``.
+ :type title: str, optional
+
+ :param grid_resolution: The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is ``50``.
+ :type grid_resolution: int, optional
+
+ :param plot_type: The type of plot to generate. Choose ``"grid"`` for a grid layout, ``"individual"`` for separate plots, or ``"both"`` to generate both layouts. Default is ``"grid"``.
+ :type plot_type: str, optional
+
+ :param grid_figsize: Tuple specifying the width and height of the figure for the grid layout. Default is ``(12, 8)``.
+ :type grid_figsize: tuple, optional
+
+ :param individual_figsize: Tuple specifying the width and height of the figure for individual plots. Default is ``(6, 4)``.
+ :type individual_figsize: tuple, optional
+
+ :param label_fontsize: Font size for the axis labels and titles. Default is ``12``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for the axis tick labels. Default is ``10``.
+ :type tick_fontsize: int, optional
+
+ :param text_wrap: The maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param image_path_png: The directory path where PNG images of the plots will be saved, if saving is enabled.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: The directory path where SVG images of the plots will be saved, if saving is enabled.
+ :type image_path_svg: str, optional
+
+ :param save_plots: Controls whether to save the plots. Options include ``"all"``, ``"individual"``, ``"grid"``, or ``None`` (default). If saving is enabled, ensure ``image_path_png`` or ``image_path_svg`` are provided.
+ :type save_plots: str, optional
+
+ :param file_prefix: Prefix for the filenames of the saved grid plots. Default is ``"partial_dependence"``.
+ :type file_prefix: str, optional
+
+ :raises ValueError:
+ - If ``plot_type`` is not one of ``"grid"``, ``"individual"``, or ``"both"``.
+ - If ``save_plots`` is enabled but neither ``image_path_png`` nor ``image_path_svg`` is provided.
+
+ :returns: ``None``
+ This function generates partial dependence plots and displays them. It does not return any values.
+
+
+2D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
+For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
+If you extend this to two interacting features, such as ``AveOccup`` and ``HouseAge``,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
+Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ from sklearn.ensemble import GradientBoostingRegressor
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=42,
+ )
+ model.fit(X_train, y_train)
+
+
+**Create 2D Partial Dependence Plot Grid**
+
+.. code-block:: python
+
+ # import the plot_2d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_2d_pdp
+
+ # Feature names
+ names = data.feature_names
+
+ # Generate 2D partial dependence plots
+ plot_2d_pdp(
+ model=model,
+ X_train=X_train,
+ feature_names=names,
+ features=[
+ "MedInc",
+ "AveOccup",
+ "HouseAge",
+ "AveRooms",
+ "Population",
+ ("AveOccup", "HouseAge"),
+ ],
+ title="PDP of house value on CA non-location features",
+ grid_figsize=(14, 10),
+ individual_figsize=(12, 4),
+ label_fontsize=14,
+ tick_fontsize=12,
+ text_wrap=120,
+ plot_type="grid",
+ image_path_png="path/to/save/png",
+ save_plots="all",
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+3D Partial Dependence Plots
+-----------------------------
+
+The ``plot_3d_pdp`` function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+- **Interactive and Static 3D Plots**: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+- **Colormap and Layout Customization**: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+- **Axis and Title Configuration**: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
+
+.. function:: plot_3d_pdp(model, dataframe, feature_names_list, title, x_label=None, y_label=None, z_label=None, html_file_path=None, html_file_name=None, image_filename=None, plot_type="both", matplotlib_colormap=None, plotly_colormap="Viridis", zoom_out_factor=None, wireframe_color=None, view_angle=(22, 70), figsize=(7, 4.5), text_wrap=50, horizontal=-1.25, depth=1.25, vertical=1.25, cbar_x=1.05, cbar_thickness=25, title_x=0.5, title_y=0.95, top_margin=100, image_path_png=None, image_path_svg=None, show_cbar=True, grid_resolution=20, left_margin=20, right_margin=65, label_fontsize=8, tick_fontsize=6, enable_zoom=True, show_modebar=True)
+
+ Generate 3D partial dependence plots for two features of a machine learning model.
+
+ This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+ :param model: The trained machine learning model used to generate partial dependence plots.
+ :type model: estimator object
+
+ :param dataframe: The dataset on which the model was trained or a representative sample. If a DataFrame is provided, ``feature_names_list`` should correspond to the column names. If a NumPy array is provided, ``feature_names_list`` should correspond to the indices of the columns.
+ :type dataframe: pandas.DataFrame or numpy.ndarray
+
+ :param feature_names_list: A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+ :type feature_names_list: list of str
+
+ :param x_label: Label for the x-axis in the plots. Default is ``None``.
+ :type x_label: str, optional
+
+ :param y_label: Label for the y-axis in the plots. Default is ``None``.
+ :type y_label: str, optional
+
+ :param z_label: Label for the z-axis in the plots. Default is ``None``.
+ :type z_label: str, optional
+
+ :param title: The title for the plots.
+ :type title: str
+
+ :param html_file_path: Path to save the interactive Plotly HTML file. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_path: str, optional
+
+ :param html_file_name: Name of the HTML file to save the interactive Plotly plot. Required if ``plot_type`` is ``"interactive"`` or ``"both"``. Default is ``None``.
+ :type html_file_name: str, optional
+
+ :param image_filename: Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is ``None``.
+ :type image_filename: str, optional
+
+   :param plot_type: The type of plots to generate. Options are:
+
+      - ``"static"``: Generate only static Matplotlib plots.
+      - ``"interactive"``: Generate only interactive Plotly plots.
+      - ``"both"``: Generate both static and interactive plots.
+
+      Default is ``"both"``.
+   :type plot_type: str, optional
+
+ :param matplotlib_colormap: Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+ :type matplotlib_colormap: matplotlib.colors.Colormap, optional
+
+ :param plotly_colormap: Colormap for the Plotly plot. Default is ``"Viridis"``.
+ :type plotly_colormap: str, optional
+
+ :param zoom_out_factor: Factor to adjust the zoom level of the Plotly plot. Default is ``None``.
+ :type zoom_out_factor: float, optional
+
+ :param wireframe_color: Color for the wireframe in the Matplotlib plot. If ``None``, no wireframe is plotted. Default is ``None``.
+ :type wireframe_color: str, optional
+
+ :param view_angle: Elevation and azimuthal angles for the Matplotlib plot view. Default is ``(22, 70)``.
+ :type view_angle: tuple, optional
+
+ :param figsize: Figure size for the Matplotlib plot. Default is ``(7, 4.5)``.
+ :type figsize: tuple, optional
+
+ :param text_wrap: Maximum width of the title text before wrapping. Useful for managing long titles. Default is ``50``.
+ :type text_wrap: int, optional
+
+ :param horizontal: Horizontal camera position for the Plotly plot. Default is ``-1.25``.
+ :type horizontal: float, optional
+
+ :param depth: Depth camera position for the Plotly plot. Default is ``1.25``.
+ :type depth: float, optional
+
+ :param vertical: Vertical camera position for the Plotly plot. Default is ``1.25``.
+ :type vertical: float, optional
+
+ :param cbar_x: Position of the color bar along the x-axis in the Plotly plot. Default is ``1.05``.
+ :type cbar_x: float, optional
+
+ :param cbar_thickness: Thickness of the color bar in the Plotly plot. Default is ``25``.
+ :type cbar_thickness: int, optional
+
+ :param title_x: Horizontal position of the title in the Plotly plot. Default is ``0.5``.
+ :type title_x: float, optional
+
+ :param title_y: Vertical position of the title in the Plotly plot. Default is ``0.95``.
+ :type title_y: float, optional
+
+ :param top_margin: Top margin for the Plotly plot layout. Default is ``100``.
+ :type top_margin: int, optional
+
+ :param image_path_png: Directory path to save the PNG file of the Matplotlib plot. Default is None.
+ :type image_path_png: str, optional
+
+ :param image_path_svg: Directory path to save the SVG file of the Matplotlib plot. Default is None.
+ :type image_path_svg: str, optional
+
+ :param show_cbar: Whether to display the color bar in the Matplotlib plot. Default is ``True``.
+ :type show_cbar: bool, optional
+
+ :param grid_resolution: The resolution of the grid for computing partial dependence. Default is ``20``.
+ :type grid_resolution: int, optional
+
+ :param left_margin: Left margin for the Plotly plot layout. Default is ``20``.
+ :type left_margin: int, optional
+
+ :param right_margin: Right margin for the Plotly plot layout. Default is ``65``.
+ :type right_margin: int, optional
+
+ :param label_fontsize: Font size for axis labels in the Matplotlib plot. Default is ``8``.
+ :type label_fontsize: int, optional
+
+ :param tick_fontsize: Font size for tick labels in the Matplotlib plot. Default is ``6``.
+ :type tick_fontsize: int, optional
+
+ :param enable_zoom: Whether to enable zooming in the Plotly plot. Default is ``True``.
+ :type enable_zoom: bool, optional
+
+ :param show_modebar: Whether to display the mode bar in the Plotly plot. Default is ``True``.
+ :type show_modebar: bool, optional
+
+ :raises ValueError:
+      - If ``plot_type`` is not one of ``"static"``, ``"interactive"``, or ``"both"``.
+      - If ``plot_type`` is ``"interactive"`` or ``"both"`` and ``html_file_path`` or ``html_file_name`` are not provided.
+
+ :returns: ``None``
+ This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+ :notes:
+ - This function handles warnings related to scikit-learn's ``partial_dependence`` function, specifically a ``FutureWarning`` related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+ - To maintain compatibility with different versions of scikit-learn, the function attempts to use ``"values"`` for grid extraction in newer versions and falls back to ``"grid_values"`` for older versions.
+
+
+3D Plots - CA Housing Example
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Consider a scenario where you have a machine learning model predicting median
+house values in California. [4]_ Suppose you want to understand how non-location
+features like the average number of occupants per household (``AveOccup``) and the
+age of the house (``HouseAge``) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
+For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between ``AveOccup`` and ``HouseAge`` in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
+This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
+Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
+
+Static Plot
+^^^^^^^^^^^^^^^^^
+
+**Fetch The CA Housing Dataset and Prepare The DataFrame**
+
+.. code-block:: python
+
+ from sklearn.ensemble import GradientBoostingRegressor
+ from sklearn.datasets import fetch_california_housing
+ from sklearn.model_selection import train_test_split
+ import pandas as pd
+
+ # Load the dataset
+ data = fetch_california_housing()
+ df = pd.DataFrame(data.data, columns=data.feature_names)
+
+**Split The Data Into Training and Testing Sets**
+
+.. code-block:: python
+
+ X_train, X_test, y_train, y_test = train_test_split(
+ df, data.target, test_size=0.2, random_state=42
+ )
+
+**Train a GradientBoostingRegressor Model**
+
+.. code-block:: python
+
+ model = GradientBoostingRegressor(
+ n_estimators=100,
+ max_depth=4,
+ learning_rate=0.1,
+ loss="huber",
+ random_state=1,
+ )
+ model.fit(X_train, y_train)
+
+**Create Static 3D Partial Dependence Plot**
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8, 5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+ )
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+
+
+Interactive Plot
+^^^^^^^^^^^^^^^^^
+
+.. code-block:: python
+
+ # import the plot_3d_pdp function from
+ # the eda_toolkit library
+ from eda_toolkit import plot_3d_pdp
+
+ # Call the function to generate the plot
+ plot_3d_pdp(
+ model=model,
+ dataframe=X_test, # Use the test dataset
+ feature_names_list=["HouseAge", "AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+ )
+
+.. warning::
+
+ **Scrolling Notice:**
+
+ While interacting with the interactive Plotly plot below, scrolling down the
+ page using the mouse wheel may be blocked when the mouse pointer is hovering
+ over the plot. To continue scrolling, either move the mouse pointer outside
+ the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+.. raw:: html
+
+
+
+
+
+
+This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here's how it differs from
+generating a static plot using Matplotlib.
+
+**Key Differences**
+
+**Plot Type**:
+
+- The ``plot_type`` is set to ``"interactive"`` for the Plotly plot and ``"static"`` for the Matplotlib plot.
+
+**Interactive-Specific Parameters**:
+
+- **HTML File Path and Name**: The ``html_file_path`` and ``html_file_name`` parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
+- **Zoom and Positioning**: The interactive plot includes parameters like ``zoom_out_factor``, ``title_x``, ``cbar_x``, and ``cbar_thickness`` to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
+- **Mode Bar and Zoom**: The ``show_modebar`` and ``enable_zoom`` parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+**Static-Specific Parameters**:
+
+- **Figure Size and Wireframe Color**: The static plot uses parameters like ``figsize`` to control the size of the Matplotlib plot and ``wireframe_color`` to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+.. [1] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
+.. [2] Waskom, M. (2021). *Seaborn: Statistical Data Visualization*. *Journal of Open Source Software*, 6(60), 3021. `https://doi.org/10.21105/joss.03021 <https://doi.org/10.21105/joss.03021>`_.
+
+.. [3] Hunter, J. D. (2007). *Matplotlib: A 2D Graphics Environment*. *Computing in Science & Engineering*, 9(3), 90-95. `https://doi.org/10.1109/MCSE.2007.55 <https://doi.org/10.1109/MCSE.2007.55>`_.
+
+.. [4] Pace, R. K., & Barry, R. (1997). *Sparse Spatial Autoregressions*. *Statistics & Probability Letters*, 33(3), 291-297. `https://doi.org/10.1016/S0167-7152(96)00140-X <https://doi.org/10.1016/S0167-7152(96)00140-X>`_.
+
diff --git a/_build/html/v0.0.9/_sources/getting_started.rst.txt b/_build/html/v0.0.9/_sources/getting_started.rst.txt
new file mode 100644
index 000000000..17fc2cc84
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/getting_started.rst.txt
@@ -0,0 +1,136 @@
+.. _getting_started:
+
+.. KFRE Python Library Documentation documentation master file, created by
+ sphinx-quickstart on Thu May 2 15:44:56 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+\
+
+
+Welcome to the EDA Toolkit Python Library Documentation!
+========================================================
+.. note::
+ This documentation is for ``eda_toolkit`` version ``0.0.9``.
+
+
+The ``eda_toolkit`` is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
+
+Project Links
+---------------
+
+1. `PyPI Page `_
+
+2. `GitHub Repository `_
+
+
+What is EDA?
+-------------
+
+Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
+
+
+Purpose of EDA Toolkit
+-----------------------
+The ``eda_toolkit`` library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+- **Data Management:** Tools for managing directories, generating unique IDs,
+ standardizing dates, and handling common DataFrame manipulations.
+- **Data Cleaning:** Functions to address missing values, remove outliers, and
+ correct formatting issues, ensuring data is ready for analysis.
+- **Data Visualization:** A variety of plotting functions, including KDE
+ distribution plots, stacked bar plots, scatter plots with optional best fit
+ lines, and box/violin plots, to visually explore data distributions,
+ relationships, and trends.
+- **Descriptive and Summary Statistics:** Methods to generate comprehensive
+ reports on data types, summary statistics (mean, median, standard deviation,
+ etc.), and to summarize all possible combinations of specified variables.
+- **Reporting and Export:** Features to save DataFrames to Excel with
+ customizable formatting, create contingency tables, and export generated
+ plots in multiple formats.
+
+
+
+Key Features
+-------------
+
+- **Ease of Use:** The toolkit is designed with simplicity in mind, offering intuitive and easy-to-use functions.
+- **Customizable:** Users can customize various aspects of the toolkit to fit their specific needs.
+- **Integration:** Seamlessly integrates with popular data science libraries such as ``Pandas``, ``NumPy``, ``Matplotlib``, and ``Seaborn``.
+- **Documentation and Examples:** Comprehensive documentation and examples to help users get started quickly and effectively.
+
+.. _prerequisites:
+
+Prerequisites
+-------------
+Before you install ``eda_toolkit``, ensure your system meets the following requirements:
+
+- **Python**: version ``3.7.4`` or higher is required to run ``eda_toolkit``.
+
+Additionally, ``eda_toolkit`` depends on the following packages, which will be automatically installed when you install ``eda_toolkit``:
+
+- ``jinja2``: version ``3.1.4`` or higher
+- ``matplotlib``: version ``3.5.3`` or higher
+- ``nbformat``: version ``4.2.0`` or higher
+- ``numpy``: version ``1.21.6`` or higher
+- ``pandas``: version ``1.3.5`` or higher
+- ``plotly``: version ``5.18.0`` or higher
+- ``scikit-learn``: version ``1.0.2`` or higher
+- ``seaborn``: version ``0.12.2`` or higher
+- ``xlsxwriter``: version ``3.2.0`` or higher
+
+.. _installation:
+
+Installation
+-------------
+
+You can install ``eda_toolkit`` directly from PyPI:
+
+.. code-block:: bash
+
+ pip install eda_toolkit
+
+
+Description
+===============
+
+This guide provides detailed instructions and examples for using the functions
+provided in the ``eda_toolkit`` library and how to use them effectively in your projects.
+
+For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [#]_. This dataset provides a rich source of
+information for demonstrating the functionalities of the ``eda_toolkit``.
+
+.. [#] Kohavi, R. (1996). *Census Income*. UCI Machine Learning Repository. `https://doi.org/10.24432/C5GP7S <https://doi.org/10.24432/C5GP7S>`_.
+
diff --git a/_build/html/v0.0.9/_sources/index.rst.txt b/_build/html/v0.0.9/_sources/index.rst.txt
new file mode 100644
index 000000000..5f0dc6c56
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/index.rst.txt
@@ -0,0 +1,57 @@
+.. EDA Toolkit documentation master file, created by
+ sphinx-quickstart on Mon Jul 29 08:15:33 2024.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
+
+.. raw:: html
+
+
+
+Table of Contents
+===================
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Getting Started
+
+ getting_started
+
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Data Management
+
+ data_management
+
+.. toctree::
+ :maxdepth: 4
+ :caption: Plotting Heuristics
+
+ eda_plots
+
+.. toctree::
+ :maxdepth: 4
+ :caption: About EDA Toolkit
+
+ acknowledgements
+ contributors
+ citations
+ changelog
+ references
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.9/_sources/references.rst.txt b/_build/html/v0.0.9/_sources/references.rst.txt
new file mode 100644
index 000000000..335337c3a
--- /dev/null
+++ b/_build/html/v0.0.9/_sources/references.rst.txt
@@ -0,0 +1,33 @@
+.. _references:
+
+.. _target-link:
+
+.. raw:: html
+
+
+
+.. image:: ../assets/eda_toolkit_logo.svg
+ :alt: EDA Toolkit Logo
+ :align: left
+ :width: 300px
+
+.. raw:: html
+
+
We would like to express our deepest gratitude to Dr. Ebrahim Tarshizi, our mentor during our time in the University of San Diego M.S. Applied Data Science Program. His unwavering dedication and mentorship played a pivotal role in our academic journey, guiding us to successfully graduate from the program and pursue successful careers as data scientists.
+
We also extend our thanks to the Shiley-Marcos School of Engineering at the University of San Diego for providing an exceptional learning environment and supporting our educational endeavors.
Improved error messages and validation checks across multiple functions to prevent common pitfalls and ensure smoother user experience.
+
Visualization Enhancements
+
DataFrame Columns: Added a background_color variable to dataframe_columns,
+allowing the user to enter a string representing a color name, or hex value.
+Try/Except on the output, in case the end user has a deprecated version of Pandas,
+where the styler would use hide() instead of hide_index(). The highlighted
+columns allow for easier null versus unique value analysis.
+
The docstring now clearly describes the purpose of the function—analyzing
+DataFrame columns to provide summary statistics.
+
Args:
+
+
The df argument is specified as a pandas.DataFrame.
+
The background_color argument is marked as optional, with a brief description of its role.
+
The return_df argument is also marked as optional, explaining what it controls.
+
+
Returns: The return type is specified as pandas.DataFrame, with a clear explanation of the difference based on the return_df flag.
+
KDE Distribution Plots: Improved kde_distributions() with enhanced options for log scaling, mean/median plotting, custom standard deviation lines, and better handling of legends and scientific notation.
+
Scatter Plots: Enhanced scatter_fit_plot() with support for hue-based coloring, best fit lines, correlation display, and flexible grid plotting options.
Flexible `save_formats` Input:
+- save_formats now accepts a string, tuple, or list for specifying formats (e.g., “png”, (“png”, “svg”), or [“png”, “svg”]).
+- Single strings or tuples are automatically converted to lists for consistent processing.
+
Dynamic Error Handling:
+- Added checks to ensure a valid path is provided for each format in save_formats.
+- Raises a ValueError if a format is specified without a corresponding path, with a clear, dynamic error message.
+
Improved Plot Saving Logic:
+- Updated logic allows saving plots in one format (e.g., only “png” or “svg”) without requiring the other.
+- Simplified and more intuitive path handling for saving plots.
This update introduces several key changes to the plot_3d_pdp function, simplifying the function’s interface and improving usability, while maintaining the flexibility needed for diverse visualization needs.
+
1. Parameter Changes
+
+
Removed Parameters:
+
+
The parameters x_label_plotly, y_label_plotly, and z_label_plotly have been removed. These parameters previously allowed custom axis labels specifically for the Plotly plot, defaulting to the general x_label, y_label, and z_label. Removing these parameters simplifies the function signature while maintaining flexibility.
+
+
+
Default Values for Labels:
+
+
The parameters x_label, y_label, and z_label are now optional, with None as the default. If not provided, these labels will automatically default to the names of the features in the feature_names_list. This change makes the function more user-friendly, particularly for cases where default labels are sufficient.
+
+
+
Changes in Default Values for View Angles:
+
+
The default values for camera positioning parameters have been updated: horizontal is now -1.25, depth is now 1.25, and vertical is now 1.25. These adjustments refine the default 3D view perspective for the Plotly plot, providing a more intuitive starting view.
+
+
+
+
2. Plot Generation Logic
+
+
Conditionally Checking Labels:
+
+
The function now checks whether x_label, y_label, and z_label are provided. If these are None, the function will automatically assign default labels based on the feature_names_list. This enhancement reduces the need for users to manually specify labels, making the function more adaptive.
+
+
+
Camera Position Adjustments:
+
+
The camera positions for the Plotly plot are now adjusted by multiplying horizontal, depth, and vertical by zoom_out_factor. This change allows for more granular control over the 3D view, enhancing the interactivity and flexibility of the Plotly visualizations.
+
+
+
Surface Plot Coordinates Adjustments:
+
+
The order of the coordinates for the Plotly plot’s surface has been changed from ZZ,XX,YY[::-1] to ZZ,XX,YY. This adjustment ensures the proper alignment of axes and grids, resulting in more accurate visual representations.
+
+
+
+
3. Code Simplifications
+
+
Removed Complexity:
+
+
By removing the x_label_plotly, y_label_plotly, and z_label_plotly parameters, the code is now simpler and easier to maintain. This change reduces potential confusion and streamlines the function for users who do not need distinct labels for Matplotlib and Plotly plots.
+
+
+
Fallback Mechanism for Grid Values:
+
+
The function continues to implement a fallback mechanism when extracting grid values, ensuring compatibility with various versions of scikit-learn. This makes the function robust across different environments.
+
+
+
+
4. Style Adjustments
+
+
Label Formatting:
+
+
The new version consistently uses y_label, x_label, and z_label for axis labels in the Matplotlib plot, aligning the formatting across different plot types.
+
+
+
Color Bar Adjustments:
+
+
The color bar configuration in the Matplotlib plot has been slightly adjusted with a shrink value of 0.6 and a pad value of 0.02. These adjustments result in a more refined visual appearance, particularly in cases where space is limited.
+
+
+
+
5. Potential Use Case Differences
+
+
Simplified Interface:
+
+
The updated function is more streamlined for users who prefer a simplified interface without the need for separate label customizations for Plotly and Matplotlib plots. This makes it easier to use in common scenarios.
+
+
+
Less Granular Control:
+
+
Users who need more granular control, particularly for presentations or specific formatting, may find the older version more suitable. The removal of the *_plotly label parameters means that all plots now use the same labels across Matplotlib and Plotly.
+
+
+
+
6. Matplotlib Plot Adjustments
+
+
Wireframe and Surface Plot Enhancements:
+
+
The logic for plotting wireframes and surface plots in Matplotlib remains consistent with previous versions, with subtle enhancements to color and layout management to improve overall aesthetics.
+
+
+
+
Summary
+
+
Version 0.0.8d of the plot_3d_pdp function introduces simplifications that reduce the number of parameters and streamline the plotting process. While some customizability has been removed, the function remains flexible enough for most use cases and is easier to use.
+
Key updates include adjusted default camera views for 3D plots, removal of Plotly-specific label parameters, and improved automatic labeling and plotting logic.
+
+
Decision Point
+
+
This update may be especially useful for users who prefer a cleaner and more straightforward interface. However, those requiring detailed customizations may want to continue using the older version, depending on their specific needs.
Version 0.0.8c is a follow-up release to version 0.0.8b. This update includes minor enhancements and refinements based on feedback and additional testing. It serves as an incremental step towards improving the stability and functionality of the toolkit.
+
Key Updates in 0.0.8c:
+
+
Bug Fixes: Addressed minor issues identified in version 0.0.8b to ensure smoother performance and better user experience.
+
Additional Testing: Incorporated further tests to validate the changes introduced in previous versions and to prepare for future stable releases.
+
Refinements: Made small enhancements to existing features based on user feedback and internal testing results.
+
+
Summary of Changes
+
+
New Features & Enhancements
+
+
+
plot_3d_pdp Function:
+
+
Added show_modebar Parameter: Introduced a new boolean parameter, show_modebar, to allow users to toggle the visibility of the mode bar in Plotly interactive plots.
+
Custom Margins and Layout Adjustments:
+
+
Added parameters for left_margin, right_margin, and top_margin to provide users with more control over the plot layout in Plotly.
+
Adjusted default values and added options for better customization of the Plotly color bar (cbar_x, cbar_thickness) and title positioning (title_x, title_y).
+
+
+
Plotly Configuration:
+
+
Enhanced the configuration options to allow users to enable or disable zoom functionality (enable_zoom) in the interactive Plotly plots.
+
Updated the code to reflect these new parameters, allowing for greater flexibility in the appearance and interaction with the Plotly plots.
+
+
+
Error Handling:
+
+
Added input validation for html_file_path and html_file_name to ensure these are provided when necessary based on the selected plot_type.
+
+
+
+
+
plot_2d_pdp Function:
+
+
Introduced file_prefix Parameter:
+
+
Added a new file_prefix parameter to allow users to specify a prefix for filenames when saving grid plots. This change streamlines the naming process for saved plots and improves file organization.
+
+
+
Enhanced Plot Type Flexibility:
+
+
The plot_type parameter now includes an option to generate both grid and individual plots (both). This feature allows users to create a combination of both layout styles in one function call.
+
Updated input validation and logic to handle this new option effectively.
+
+
+
Added save_plots Parameter:
+
+
Introduced a new parameter, save_plots, to control the saving of plots. Users can specify whether to save all plots, only individual plots, only grid plots, or none.
+
+
+
Custom Margins and Layout Adjustments:
+
+
Included the save_plots parameter in the validation process to ensure paths are provided when needed for saving the plots.
+
+
+
+
+
+
+
Documentation Updates
+
+
+
Docstrings:
+
+
Updated docstrings for both functions to reflect the new parameters and enhancements, providing clearer and more comprehensive guidance for users.
+
Detailed the use of new parameters such as show_modebar, file_prefix, save_plots, and others, ensuring that the function documentation is up-to-date with the latest changes.
+
+
+
+
+
Refactoring & Code Cleanup
+
+
+
Code Structure:
+
+
Improved the code structure to maintain clarity and readability, particularly around the new functionality.
+
Consolidated the layout configuration settings for the Plotly plots into a more flexible and user-friendly format, making it easier for users to customize their plots.
Version 0.0.8b is an exact replica of version 0.0.8a. The purpose of this
+beta release was to test whether releasing it as the latest version would update
+its status on PyPI to reflect it as the latest release. However, it continues to
+be identified as a pre-release on PyPI.
Version 0.0.8a introduces significant enhancements and new features to improve
+the usability and functionality of the EDA Toolkit.
+
New Features:
+
+
Optional file_prefix in stacked_crosstab_plot Function
+
+
The stacked_crosstab_plot function has been updated to make the file_prefix argument optional. If the user does not provide a file_prefix, the function will now automatically generate a default prefix based on the col and func_col parameters. This change streamlines the process of generating plots by reducing the number of required arguments.
+
Key Improvement:
+
+
Users can now omit the file_prefix argument, and the function will still produce appropriately named plot files, enhancing ease of use.
+
Backward compatibility is maintained, allowing users who prefer to specify a custom file_prefix to continue doing so without any issues.
+
+
+
+
+
Introduction of 3D and 2D Partial Dependence Plot Functions
+
+
Two new functions, plot_3d_pdp and plot_2d_pdp, have been added to the toolkit, expanding the visualization capabilities for machine learning models.
+
+
plot_3d_pdp: Generates 3D partial dependence plots for two features, supporting both static visualizations (using Matplotlib) and interactive plots (using Plotly). The function offers extensive customization options, including labels, color maps, and saving formats.
+
plot_2d_pdp: Creates 2D partial dependence plots for specified features with flexible layout options (grid or individual plots) and customization of figure size, font size, and saving formats.
+
+
+
Key Features:
+
+
Compatibility: Both functions are compatible with various versions of scikit-learn, ensuring broad usability.
+
Customization: Extensive options for customizing visual elements, including figure size, font size, and color maps.
+
Interactive 3D Plots: The plot_3d_pdp function supports interactive visualizations, providing an enhanced user experience for exploring model predictions in 3D space.
+
+
+
+
+
+
Impact:
+
+
These updates improve the user experience by reducing the complexity of function calls and introducing powerful new tools for model interpretation.
+
The optional file_prefix enhancement simplifies plot generation while maintaining the flexibility to define custom filenames.
+
The new partial dependence plot functions offer robust visualization options, making it easier to analyze and interpret the influence of specific features in machine learning models.
Added Function for Customizable Correlation Matrix Visualization
+
This release introduces a new function, flex_corr_matrix, which allows users to
+generate both full and upper triangular correlation heatmaps with a high degree
+of customization. The function includes options to annotate the heatmap, save the
+plots, and pass additional parameters to seaborn.heatmap().
+
Summary of Changes
+
+
New Function: flex_corr_matrix.
+
+
Functionality:
+- Generates a correlation heatmap for a given DataFrame.
+- Supports both full and upper triangular correlation matrices based on the triangular parameter.
+- Allows users to customize various aspects of the plot, including colormap, figure size, axis label rotation, and more.
+- Accepts additional keyword arguments via **kwargs to pass directly to seaborn.heatmap().
+- Includes validation to ensure the triangular, annot, and save_plots parameters are boolean values.
+- Raises an exception if save_plots=True but neither image_path_png nor image_path_svg is specified.
+
+
+
+
Usage
+
# Full correlation matrix example
+flex_corr_matrix(df=my_dataframe, triangular=False, cmap="coolwarm", annot=True)
+
+# Upper triangular correlation matrix example
+flex_corr_matrix(df=my_dataframe, triangular=True, cmap="coolwarm", annot=True)
+
+
+
Contingency table df to object type
+
Convert all columns in the DataFrame to object type to prevent issues with numerical columns.
Added validation for Plot Type Parameter in KDE Distributions Function
+
This release adds a validation step for the plot_type parameter in the kde_distributions function. The allowed values for plot_type are "hist", "kde", and "both". If an invalid value is provided, the function will now raise a ValueError with a clear message indicating the accepted values. This change improves the robustness of the function and helps prevent potential errors due to incorrect parameter values.
Ensure Consistent Font Size and Text Wrapping Across Plot Elements
+
This PR addresses inconsistencies in font sizes and text wrapping across various plot elements in the stacked_crosstab_plot function. The following updates have been implemented to ensure uniformity and improve the readability of plots:
+
+
Title Font Size and Text Wrapping:
+- Added a text_wrap parameter to control the wrapping of plot titles.
+- Ensured that title font sizes are consistent with axis label font sizes by explicitly setting the font size using ax.set_title() after plot generation.
+
Legend Font Size Consistency:
+- Incorporated label_fontsize into the legend font size by directly setting the font size of the legend text using plt.setp(legend.get_texts(), fontsize=label_fontsize).
+- This ensures that the legend labels are consistent with the title and axis labels.
+
+
Testing
+
+
Verified that titles now wrap correctly and match the specified label_fontsize.
+
Confirmed that legend text scales according to label_fontsize, ensuring consistent font sizes across all plot elements.
Added new scatter_fit_plot(), removed unused data_types(), and added comment section headers.
+
+
Added xlim and ylim Inputs to KDE Distribution
+
+
kde_distribution():
+
+
+
Added xlim and ylim inputs to allow users to customize axes limits in kde_distribution().
+
+
+
+
+
Added xlim and ylim Params to Stacked Crosstab Plot
+
+
stacked_crosstab_plot():
+
+
+
Added xlim and ylim input parameters to stacked_crosstab_plot() to give users more flexibility in controlling axes limits.
+
+
+
+
+
Added x and y Limits to Box and Violin Plots
+
+
box_violin_plot():
+
+
+
Changed function name from metrics_box_violin() to box_violin_plot().
+
Added xlim and ylim inputs to control x and y-axis limits of box_violin_plot() (formerly metrics_box_violin).
+
+
+
+
+
Added Ability to Remove Stacks from Plots, Plot All or One at a Time
+
Key Changes
+
+
Plot Type Parameter
+- plot_type: This parameter allows the user to choose between "regular", "normalized", or "both" plot types.
+
Remove Stacks Parameter
+- remove_stacks: This parameter, when set to True, generates a regular bar plot using only the col parameter instead of a stacked bar plot. It only works when plot_type is set to “regular”. If remove_stacks is set to True while plot_type is anything other than “regular”, the function will raise an exception.
+
+
Explanation of Changes
+
+
Plot Type Parameter
+
+
Provides flexibility to the user, allowing specification of the type of plot to generate:
+
+
"regular": Standard bar plot.
+
"normalized": Normalized bar plot.
+
"both": Both regular and normalized bar plots.
+
+
+
+
+
Remove Stacks Parameter
+- remove_stacks: Generates a regular bar plot using only the col parameter, removing the stacking of the bars. Applicable only when plot_type is set to “regular”. An exception is raised if used with any other plot_type.
+
+
These changes enhance the flexibility and functionality of the stacked_crosstab_plot function, allowing for more customizable and specific plot generation based on user requirements.
Alpha Transparency for Histogram Fill
+- Added a fill_alpha parameter to control the transparency of the histogram bars’ fill color.
+- Default value is 0.6. An exception is raised if fill=False and fill_alpha is specified.
+
Custom Font Sizes
+- Introduced label_fontsize and tick_fontsize parameters to control font size of axis labels and tick marks independently.
+
Scientific Notation Toggle
+- Added a disable_sci_notation parameter to enable or disable scientific notation on axes.
+
Improved Error Handling
+- Added validation for the stat parameter to ensure valid options are accepted.
+- Added checks for proper usage of fill_alpha and hist_edgecolor when fill is set to False.
+
General Enhancements
+- Updated the function’s docstring to reflect new parameters and provide comprehensive guidance on usage.
Grid Figsize and Single Figsize
+- Control the size of the overall grid figure and individual figures separately.
+
Hist Color and KDE Color
+- Allow customization of histogram and KDE plot colors.
+
Edge Color
+- Allows customization of histogram bar edges.
+
Hue
+- Allows grouping data by a column.
+
Fill
+- Controls whether to fill histogram bars with color.
+
Y-axis Label
+- Customizable y-axis label.
+
Log-Scaling
+- Specifies which variables to apply log scale.
+
Bins and Bin Width
+- Control the number and width of bins.
+
``stat``:
+- Allows different statistics for the histogram (count, density, frequency, probability, proportion, percent).
+
+
Improvements
+
+
Validation and Error Handling
+- Checks for invalid log_scale_vars and throws a ValueError if any are found.
+- Throws a ValueError if edgecolor is changed while fill is set to False.
+- Issues a PerformanceWarning if both bins and binwidth are specified, warning of potential performance impacts.
Warning for KDE with Count
+- Issues a warning if KDE is used with stat='count', as it may produce misleading plots.
+
+
Updated Function to Ensure Unique IDs and Index Check
+
+
Ensured that each generated ID in add_ids starts with a non-zero digit.
+
Added a check to verify that the DataFrame index is unique.
+
Printed a warning message if duplicate index entries are found.
+
+
These changes improve the robustness of the function, ensuring that the IDs generated are always unique and valid, and provide necessary feedback when the DataFrame index is not unique.
+
Check for Unique Indices
+- Before generating IDs, the function now checks if the DataFrame index is unique.
+- If duplicates are found, a warning is printed along with the list of duplicate index entries.
+
Generate Non-Zero Starting IDs
+
+
The ID generation process is updated to ensure that the first digit of each ID is always non-zero.
+
+
Ensure Unique IDs
+
+
A set is used to store the generated IDs, ensuring all IDs are unique before adding them to the DataFrame.
+
+
Fix Int Conversion for Numeric Columns, Reset Decimal Places
+
+
Fixed integer conversion issue for numeric columns when decimal_places=0 in the save_dataframes_to_excel function.
+
Reset decimal_places default value to 0.
+
+
These changes ensure correct formatting and avoid errors during conversion.
+
Contingency Table Updates
+
+
Error Handling for Columns
+- Added a check to ensure at least one column is specified.
+- Updated the function to accept a single column as a string or multiple columns as a list.
+- Raised a ValueError if no columns are provided or if cols is not correctly specified.
+
Function Parameters
+- Changed parameters from col1 and col2 to a single parameter cols which can be either a string or a list.
+
Error Handling
+- Renamed SortBy to sort_by to standardize nomenclature.
+- Added a check to ensure sort_by is either 0 or 1.
+- Raised a ValueError if sort_by is not 0 or 1.
+
+
+
Sorting Logic
+- Updated the sorting logic to handle the new cols parameter structure.
+
Handling Categorical Data
+- Modified code to convert categorical columns to strings to avoid issues with fillna("").
+
Handling Missing Values
+- Added df=df.fillna('') to fill NA values within the function to account for missing data.
+
Improved Function Documentation
+- Updated function documentation to reflect new parameters and error handling.
fillna('') added to output so that null values come through, removed 'All' column name from output, sort options 0 and 1, updated docstring documentation. Tested successfully on Python 3.7.3.
+
+
Compatibility Enhancement
+
+
Added a version check for Python 3.7 and above.
+
+
Conditional import of datetime to handle different Python versions.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
In any data-driven project, effective management of data is crucial. This
+section provides essential techniques for handling and preparing data to ensure
+consistency, accuracy, and ease of analysis. From directory setup and data
+cleaning to advanced data processing, these methods form the backbone of reliable
+data management. Dive into the following topics to enhance your data handling
+capabilities and streamline your workflow.
path (str) – The path to the directory that needs to be ensured.
+
+
Returns:
+
None
+
+
+
+
+
The ensure_directory function is a utility designed to facilitate the
+management of directory paths within your project. When working with data
+science projects, it is common to save and load data, images, and other
+artifacts from specific directories. This function helps in making sure that
+these directories exist before any read/write operations are performed. If
+the specified directory does not exist, the function creates it. If it
+already exists, it does nothing, thus preventing any errors related to
+missing directories.
+
Example Usage
+
In the example below, we demonstrate how to use the ensure_directory function
+to verify and create directories as needed. This example sets up paths for data and
+image directories, ensuring they exist before performing any operations that depend on them.
+
First, we define the base path as the parent directory of the current directory.
+The os.pardir constant, equivalent to "..", is used to navigate up one
+directory level. Then, we define paths for the data directory and data output
+directory, both located one level up from the current directory.
+
Next, we set paths for the PNG and SVG image directories, located within an
+images folder in the parent directory. Using the ensure_directory
+function, we then verify that these directories exist. If any of the specified
+directories do not exist, the function creates them.
+
from eda_toolkit import ensure_directory
+
+importos# import operating system for dir
+
+
+base_path=os.path.join(os.pardir)
+
+# Go up one level from 'notebooks' to parent directory,
+# then into the 'data' folder
+data_path=os.path.join(os.pardir,"data")
+data_output=os.path.join(os.pardir,"data_output")
+
+# create image paths
+image_path_png=os.path.join(base_path,"images","png_images")
+image_path_svg=os.path.join(base_path,"images","svg_images")
+
+# Use the function to ensure the 'data' directory exists
+ensure_directory(data_path)
+ensure_directory(data_output)
+ensure_directory(image_path_png)
+ensure_directory(image_path_svg)
+
id_colname (str, optional) – The name of the new column for the IDs. Defaults to "ID".
+
num_digits (int, optional) – The number of digits for the unique IDs. Defaults to 9.
+
seed (int, optional) – The seed for the random number generator. Defaults to None.
+
set_as_index (bool, optional) – Whether to set the new ID column as the index. Defaults to False.
+
+
+
Returns:
+
The updated dataframe with the new ID column.
+
+
Return type:
+
pd.DataFrame
+
+
+
+
+
+
Note
+
+
If the dataframe index is not unique, a warning is printed.
+
+
The function does not check if the number of rows exceeds the number of
unique IDs that can be generated with the specified number of digits.
+
+
+
+
The first digit of the generated IDs is ensured to be non-zero.
+
+
+
The add_ids function is used to append a column of unique identifiers with a
+specified number of digits to a given dataframe. This is particularly useful for
+creating unique patient or record IDs in datasets. The function allows you to
+specify a custom column name for the IDs, the number of digits for each ID, and
+optionally set a seed for the random number generator to ensure reproducibility.
+Additionally, you can choose whether to set the new ID column as the index of the dataframe.
+
Example Usage
+
In the example below, we demonstrate how to use the add_ids function to add a
+column of unique IDs to a dataframe. We start by importing the necessary libraries
+and creating a sample dataframe. We then use the add_ids function to generate
+and append a column of unique IDs with a specified number of digits to the dataframe.
+
First, we import the pandas library and the add_ids function from the eda_toolkit.
+Then, we create a sample dataframe with some data. We call the add_ids function,
+specifying the dataframe, the column name for the IDs, the number of digits for
+each ID, a seed for reproducibility, and whether to set the new ID column as the
+index. The function generates unique IDs for each row and adds them as the first
+column in the dataframe.
+
from eda_toolkit import add_ids
+
+# Add a column of unique IDs with 9 digits and call it "census_id"
+df=add_ids(
+ df=df,
+ id_colname="census_id",
+ num_digits=9,
+ seed=111,
+ set_as_index=True,
+)
+
+
+
Output
+
First 5 Rows of Census Income Data (Adapted from Kohavi, 1996, UCI Machine Learning Repository) [1]
df (pd.DataFrame) – The DataFrame containing the column to be processed.
+
column_name (str) – The name of the column containing floats with potential trailing periods.
+
+
+
Returns:
+
The updated DataFrame with the trailing periods removed from the specified column.
+
+
Return type:
+
pd.DataFrame
+
+
+
The strip_trailing_period function is designed to remove trailing periods
+from float values in a specified column of a DataFrame. This can be particularly
+useful when dealing with data that has been inconsistently formatted, ensuring
+that all float values are correctly represented.
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the strip_trailing_period function to clean a
+column in a DataFrame. We start by importing the necessary libraries and creating a sample DataFrame.
+We then use the strip_trailing_period function to remove any trailing periods from the specified column.
+
from eda_toolkit import strip_trailing_period
+
+# Create a sample dataframe with trailing periods in some values
+data={
+ "values":[1.0,2.0,3.0,4.0,5.0,6.],
+}
+df=pd.DataFrame(data)
+
+# Remove trailing periods from the 'values' column
+df=strip_trailing_period(df=df,column_name="values")
+
+
+
Output
+
First 6 Rows of Data Before and After Removing Trailing Periods (Adapted from Example)
+
+
+
+
+ Before:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.
+
+
+
+
+
+
+ After:
+
+
+
+
Index
+
Value
+
+
+
0
+
1.0
+
+
+
1
+
2.0
+
+
+
2
+
3.0
+
+
+
3
+
4.0
+
+
+
4
+
5.0
+
+
+
5
+
6.0
+
+
+
+
+
+
+
Note: In the last row, the value 6. (an integer followed by a trailing period) before cleaning is converted to the float 6.0 after.
This function takes a date string and standardizes it to the ISO8601 format
+(YYYY-MM-DD). It assumes dates are provided in either day/month/year or
+month/day/year format. The function first checks if the first part of the
+date string (day or month) is greater than 12, which unambiguously indicates
+a day/month/year format. If the first part is 12 or less, the function
+attempts to parse the date as month/day/year, falling back to day/month/year
+if the former raises a ValueError due to an impossible date (e.g., month
+being greater than 12).
+
+
Parameters:
+
date_str (str) – A date string to be standardized.
+
+
Returns:
+
A standardized date string in the format YYYY-MM-DD.
ValueError – If date_str is in an unrecognized format or if the function
+cannot parse the date.
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the parse_date_with_rule
+function to standardize date strings. We start by importing the necessary library
+and creating a sample list of date strings. We then use the parse_date_with_rule
+function to parse and standardize each date string to the ISO8601 format.
+
from eda_toolkit import parse_date_with_rule
+
+# Sample date strings
+date_strings=["15/04/2021","04/15/2021","01/12/2020","12/01/2020"]
+
+# Standardize the date strings
+standardized_dates = [parse_date_with_rule(date) for date in date_strings]
+
+print(standardized_dates)
+
In the next example, we demonstrate how to apply the parse_date_with_rule
+function to a DataFrame column containing date strings using the .apply() method.
+This is particularly useful when you need to standardize date formats across an
+entire column in a DataFrame.
+
+
# Creating the DataFrame
+data={
+ "date_column":[
+ "31/12/2021",
+ "01/01/2022",
+ "12/31/2021",
+ "13/02/2022",
+ "07/04/2022",
+ ],
+ "name":["Alice","Bob","Charlie","David","Eve"],
+ "amount":[100.0,150.5,200.75,250.25,300.0],
+}
+
+df=pd.DataFrame(data)
+
+# Apply the function to the DataFrame column
+df["standardized_date"]=df["date_column"].apply(parse_date_with_rule)
+
+print(df)
+
Analyze DataFrame columns to provide summary statistics such as data type,
+null counts, unique values, and most frequent values.
+
This function analyzes the columns of a DataFrame, providing details about the data type,
+the number and percentage of null values, the total number of unique values, and the most
+frequent unique value along with its count and percentage. It handles special cases such as
+converting date columns and replacing empty strings with Pandas NA values.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame to analyze.
+
background_color (str, optional) – Hex color code or color name for background styling in the output
+DataFrame. Defaults to None.
+
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
+False, returns a styled DataFrame for visual presentation. Defaults to False.
+
+
+
Returns:
+
If return_df is True, returns the plain DataFrame containing column summary
+statistics. If return_df is False, returns a styled DataFrame with optional
+background color for specific columns.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
In the example below, we demonstrate how to use the dataframe_columns
+function to analyze a DataFrame’s columns.
The function will create an Excel file with a sheet for each combination
of the specified variables, as well as a “Table of Contents” sheet with
+hyperlinks to each summary table.
+
+
+
+
The sheet names are limited to 31 characters due to Excel’s constraints.
+
+
+
The function returns two outputs:
+
1. summary_tables: A dictionary where each key is a tuple representing a combination
+of variables, and each value is a DataFrame containing the summary table for that combination.
+Each summary table includes the count and proportion of occurrences for each unique combination of values.
+
2. all_combinations: A list of all generated combinations of the specified variables.
+This is useful for understanding which combinations were analyzed and included in the summary tables.
+
Example Usage
+
Below, we use the summarize_all_combinations function to generate summary tables for the specified
+variables from a DataFrame containing the census data [1].
+
from eda_toolkit import summarize_all_combinations
+
+# Define unique variables for the analysis
+unique_vars=[
+ "age_group",
+ "workclass",
+ "education",
+ "occupation",
+ "race",
+ "sex",
+ "income",
+]
+
+# Generate summary tables for all combinations of the specified variables
+summary_tables,all_combinations=summarize_all_combinations(
+ df=df,
+ data_path=data_output,
+ variables=unique_vars,
+ data_name="census_summary_tables.xlsx",
+)
+
+# Print all combinations of variables
+print(all_combinations)
+
When applied to the US Census data, the output Excel file will contain summary tables for all possible combinations of the specified variables.
+The first sheet will be a Table of Contents with hyperlinks to each summary table.
Saving DataFrames to Excel with Customized Formatting
+
Save multiple DataFrames to separate sheets in an Excel file with customized
+formatting.
+
This section explains how to save multiple DataFrames to separate sheets in an Excel file with customized formatting using the save_dataframes_to_excel function.
file_path (str) – Full path to the output Excel file.
+
df_dict (dict) – Dictionary where keys are sheet names and values are DataFrames to save.
+
decimal_places (int) – Number of decimal places to round numeric columns. Default is 0.
+
+
+
+
+
+
+
Note
+
+
The function will autofit columns and left-align text.
+
Numeric columns will be formatted with the specified number of decimal places.
+
Headers will be bold and left-aligned without borders.
+
+
+
The function performs the following tasks:
+
+
Writes each DataFrame to its respective sheet in the Excel file.
+
Rounds numeric columns to the specified number of decimal places.
+
Applies customized formatting to headers and cells.
+
Autofits columns based on the content length.
+
+
Example Usage
+
Below, we use the save_dataframes_to_excel function to save two DataFrames:
+the original DataFrame and a filtered DataFrame with ages between 18 and 40.
+
from eda_toolkit import save_dataframes_to_excel
+
+# Example usage
+file_name="df_census.xlsx"# Name of the output Excel file
+file_path=os.path.join(data_path,file_name)
+
+# filter DataFrame to Ages 18-40
+filtered_df=df[(df["age"]>18)&(df["age"]<40)]
+
+df_dict={
+ "original_df":df,
+ "ages_18_to_40":filtered_df,
+}
+
+save_dataframes_to_excel(
+ file_path=file_path,
+ df_dict=df_dict,
+ decimal_places=0,
+)
+
+
+
Output
+
The output Excel file will contain the original DataFrame and a filtered DataFrame as a separate tab with ages
+between 18 and 40, each on separate sheets with customized formatting.
Create a contingency table from one or more columns in a DataFrame, with sorting options.
+
This section explains how to create contingency tables from one or more columns in a DataFrame, with options to sort the results using the contingency_table function.
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
+
+
+
Raises:
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
+
Returns:
+
A DataFrame containing the contingency table with the specified columns, a 'Total' column representing the count of occurrences, and a 'Percentage' column representing the percentage of the total count.
+
+
Return type:
+
pandas.DataFrame
+
+
+
+
+
Example Usage
+
Below, we use the contingency_table function to create a contingency table
+from the specified columns in a DataFrame containing census data [1]
The output will be a contingency table with the specified columns, showing the
+total counts and percentages of occurrences for each combination of values. The
+table will be sorted by the 'Total' column in descending order because sort_by
+is set to 1.
df (pandas.DataFrame) – The DataFrame to be styled.
+
columns (list of str) – List of column names to be highlighted.
+
color (str, optional) – The background color to be applied for highlighting (default is “yellow”).
+
+
+
Returns:
+
A Styler object with the specified columns highlighted.
+
+
Return type:
+
pandas.io.formats.style.Styler
+
+
+
+
+
Example Usage
+
Below, we use the highlight_columns function to highlight the age and education
+columns in the first 5 rows of the census [1] DataFrame with a pink background color.
+
from eda_toolkit import highlight_columns
+
+# Applying the highlight function
+highlighted_df=highlight_columns(
+ df=df,
+ columns=["age","education"],
+ color="#F8C5C8",
+)
+
+highlighted_df
+
+
+
Output
+
The output will be a DataFrame with the specified columns highlighted in the given background color.
+The age and education columns will be highlighted in pink.
+
The resulting styled DataFrame can be displayed in a Jupyter Notebook or saved to an
+HTML file using the .render() method of the Styler object.
Binning numerical columns is a technique used to convert continuous numerical
+data into discrete categories or “bins.” This is especially useful for simplifying
+analysis, creating categorical features from numerical data, or visualizing the
+distribution of data within specific ranges. The process of binning involves
+dividing a continuous range of values into a series of intervals, or “bins,” and
+then assigning each value to one of these intervals.
+
+
Note
+
The code snippets below create age bins and assign a corresponding age group
+label to each age in the DataFrame. The pd.cut function from pandas is used to
+categorize the ages and assign them to a new column, age_group. Adjust the bins
+and labels as needed for your specific data.
+
+
Below, we use the age column of the census data [1] from the UCI Machine Learning Repository as an example:
+
+
Bins Definition:
+The bins are defined by specifying the boundaries of each interval. For example,
+in the code snippet below, the bin_ages list specifies the boundaries for age groups:
These labels are used to categorize the numerical values into meaningful groups.
+
+
Applying the Binning:
+The pd.cut function
+from Pandas is used to apply the binning process. For each value in the age
+column of the DataFrame, it assigns a corresponding label based on which bin the
+value falls into. Here, right=False indicates that each bin includes the
+left endpoint but excludes the right endpoint. For example, if bin_ages=
+[0,10,20,30], then a value of 10 will fall into the bin [10,20) and
+be labeled accordingly.
The parameter right=False in pd.cut means that the bins are left-inclusive
+and right-exclusive, except for the last bin, which is always right-inclusive
+when the upper bound is infinity (float("inf")).
The Gaussian (normal) distribution is a key assumption in many statistical methods. It is mathematically represented by the probability density function (PDF):
The Pearson correlation coefficient, often denoted as \(r\), is a measure of
+the linear relationship between two variables. It quantifies the degree to which
+a change in one variable is associated with a change in another variable. The
+Pearson correlation ranges from \(-1\) to \(1\), where:
+
+
\(r = 1\) indicates a perfect positive linear relationship.
+
\(r = -1\) indicates a perfect negative linear relationship.
+
\(r = 0\) indicates no linear relationship.
+
+
The Pearson correlation coefficient between two variables \(X\) and \(Y\) is defined as:
This formula normalizes the covariance by the product of the standard deviations of the two variables, resulting in a dimensionless coefficient that indicates the strength and direction of the linear relationship between \(X\) and \(Y\).
+
+
\(r > 0\): Positive correlation. As \(X\) increases, \(Y\) tends to increase.
+
\(r < 0\): Negative correlation. As \(X\) increases, \(Y\) tends to decrease.
+
\(r = 0\): No linear correlation. There is no consistent linear relationship between \(X\) and \(Y\).
+
+
The closer the value of \(r\) is to \(\pm 1\), the stronger the linear relationship between the two variables.
Let \(\mathbf{X}\) represent the complete set of input features for a machine
+learning model, where \(\mathbf{X} = \{X_1, X_2, \dots, X_p\}\). Suppose we’re
+particularly interested in a subset of these features, denoted by \(\mathbf{X}_S\).
+The complementary set, \(\mathbf{X}_C\), contains all the features in \(\mathbf{X}\)
+that are not in \(\mathbf{X}_S\). Mathematically, this relationship is expressed as:
where \(\mathbf{X}_C\) is the set of features in \(\mathbf{X}\) after
+removing the features in \(\mathbf{X}_S\).
+
Partial Dependence Plots (PDPs) are used to illustrate the effect of the features
+in \(\mathbf{X}_S\) on the model’s predictions, while averaging out the
+influence of the features in \(\mathbf{X}_C\). This is mathematically defined as:
\(\mathbb{E}_{\mathbf{X}_C} \left[ \cdot \right]\) indicates that we are taking the expected value over the possible values of the features in the set \(\mathbf{X}_C\).
+
\(p(x_C)\) represents the probability density function of the features in \(\mathbf{X}_C\).
+
+
This operation effectively summarizes the model’s output over all potential values of the complementary features, providing a clear view of how the features in \(\mathbf{X}_S\) alone impact the model’s predictions.
+
2D Partial Dependence Plots
+
Consider a trained machine learning model \(f(\mathbf{X})\), where \(\mathbf{X} = (X_1, X_2, \dots, X_p)\) represents the vector of input features. The partial dependence of the predicted response \(\hat{y}\) on a single feature \(X_j\) is defined as:
\(\mathbf{X}_{C_i}\) represents the complement set of \(X_j\), meaning the remaining features in \(\mathbf{X}\) not included in \(X_j\) for the \(i\)-th instance.
+
\(n\) is the number of observations in the dataset.
+
+
For two features, \(X_j\) and \(X_k\), the partial dependence is given by:
This results in a 2D surface plot (or contour plot) that shows how the predicted outcome changes as the values of \(X_j\) and \(X_k\) vary, while the effects of the other features are averaged out.
+
+
Single Feature PDP: When plotting \(\text{PD}(X_j)\), the result is a 2D line plot showing the marginal effect of feature \(X_j\) on the predicted outcome, averaged over all possible values of the other features.
+
Two Features PDP: When plotting \(\text{PD}(X_j, X_k)\), the result is a 3D surface plot (or a contour plot) that shows the combined marginal effect of \(X_j\) and \(X_k\) on the predicted outcome. The surface represents the expected value of the prediction as \(X_j\) and \(X_k\) vary, while all other features are averaged out.
+
+
3D Partial Dependence Plots
+
For a more comprehensive analysis, especially when exploring interactions between two features, 3D Partial Dependence Plots are invaluable. The partial dependence function for two features in a 3D context is:
Here, the function \(f(X_j, X_k, \mathbf{X}_{C_i})\) is evaluated across a grid of values for \(X_j\) and \(X_k\). The resulting 3D surface plot represents how the model’s prediction changes over the joint range of these two features.
+
The 3D plot offers a more intuitive visualization of feature interactions compared to 2D contour plots, allowing for a better understanding of the combined effects of features on the model’s predictions. The surface plot is particularly useful when you need to capture complex relationships that might not be apparent in 2D.
+
+
Feature Interaction Visualization: The 3D PDP provides a comprehensive view of the interaction between two features. The resulting surface plot allows for the visualization of how the model’s output changes when the values of two features are varied simultaneously, making it easier to understand complex interactions.
+
Enhanced Interpretation: 3D PDPs offer enhanced interpretability in scenarios where feature interactions are not linear or where the effect of one feature depends on the value of another. The 3D visualization makes these dependencies more apparent.
Generate KDE or histogram distribution plots for specified columns in a DataFrame.
+
The kde_distributions function is a versatile tool designed for generating
+Kernel Density Estimate (KDE) plots, histograms, or a combination of both for
+specified columns within a DataFrame. This function is particularly useful for
+visualizing the distribution of numerical data across various categories or groups.
+It leverages the powerful seaborn library [2] for plotting, which is built on top of
+matplotlib [3] and provides a high-level interface for drawing attractive and informative
+statistical graphics.
+
Key Features and Parameters
+
+
Flexible Plotting: The function supports creating histograms, KDE plots, or a combination of both for specified columns, allowing users to visualize data distributions effectively.
+
Leverages Seaborn Library: The function is built on the seaborn library, which provides high-level, attractive visualizations, making it easy to create complex plots with minimal code.
+
Customization: Users have control over plot aesthetics, such as colors, fill options, grid sizes, axis labels, tick marks, and more, allowing them to tailor the visualizations to their needs.
+
Scientific Notation Control: The function allows disabling scientific notation on the axes, providing better readability for certain types of data.
+
Log Scaling: The function includes an option to apply logarithmic scaling to specific variables, which is useful when dealing with data that spans several orders of magnitude.
+
Output Options: The function supports saving plots as PNG or SVG files, with customizable filenames and output directories, making it easy to integrate the plots into reports or presentations.
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
+
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
+
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
+
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
+
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
+
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
+
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
+
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
+
kwargs (additional keyword arguments) – Additional keyword arguments passed to the Seaborn plotting function.
In the below example, the kde_distributions function is used to generate
+histograms for several variables of interest: "age", "education-num", and
+"hours-per-week". These variables represent different demographic and
+financial attributes from the dataset. The plot_type="both" parameter ensures that a
+Kernel Density Estimate (KDE) plot is overlaid on the histograms, providing a
+smoothed representation of the data’s probability density.
+
The visualizations are arranged in a single row of three columns, as specified
+by n_rows=1 and n_cols=3, respectively. The overall size of the grid
+figure is set to 14 inches wide and 4 inches tall (grid_figsize=(14,4)),
+while each individual plot is configured to be 4 inches by 4 inches
+(single_figsize=(4,4)). The fill=True parameter fills the histogram
+bars with color, and the spacing between the subplots is managed using
+w_pad=1 and h_pad=1, which add 1 inch of padding both horizontally and
+vertically.
+
+
Note
+
If you do not set n_rows or n_cols to any values, the function will
+automatically calculate and create a grid based on the number of variables being
+plotted, ensuring an optimal arrangement of the plots.
+
+
To handle longer titles, the text_wrap=50 parameter ensures that the title
+text wraps to a new line after 50 characters. The bbox_inches="tight" setting
+is used when saving the figure, ensuring that it is cropped to remove any excess
+whitespace around the edges. The variables specified in vars_of_interest are
+passed directly to the function for visualization.
+
Each plot is saved individually with filenames that are prefixed by
+"kde_density_single_distribution", followed by the variable name. The `y-axis`
+for all plots is labeled as “Density” (y_axis_label="Density"), reflecting that
+the height of the bars or KDE line represents the data’s density. The histograms
+are divided into 10 bins (bins=10), offering a clear view of the distribution
+of each variable.
+
Additionally, the font sizes for the axis labels and tick labels
+are set to 16 points (label_fontsize=16) and 14 points (tick_fontsize=14),
+respectively, ensuring that all text within the plots is legible and well-formatted.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ fill_alpha=0.60,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="both",# Can also just plot KDE by itself by passing "kde"
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+)
+
In this example, the kde_distributions() function is used to generate histograms for
+the variables "age", "education-num", and "hours-per-week" but with
+plot_type="hist", meaning no KDE plots are included—only histograms are displayed.
+The plots are arranged in a single row of three columns (n_rows=1,n_cols=3),
+with a grid size of 14x4 inches (grid_figsize=(14,4)). The histograms are
+divided into 10 bins (bins=10), and the y-axis is labeled “Density” (y_axis_label="Density").
+Font sizes for the axis labels and tick labels are set to 16 and 14 points,
+respectively, ensuring clarity in the visualizations. This setup focuses on the
+histogram representation without the KDE overlay.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ fill=True,
+ text_wrap=50,
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Density",
+ bins=10,
+ plot_type="hist",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+ show_legend=False,
+)
+
In this example, the kde_distributions() function is modified to generate histograms
+with a few key changes. The hist_color is set to “orange”, changing the color of the
+histogram bars. The y-axis label is updated to “Count” (y_axis_label="Count"),
+reflecting that the histograms display the count of observations within each bin.
+Additionally, the stat parameter is set to "Count" to show the actual counts instead of
+densities. The rest of the parameters remain the same as in the previous example,
+with the plots arranged in a single row of three columns (n_rows=1,n_cols=3),
+a grid size of 14x4 inches, and a bin count of 10. This setup focuses on
+visualizing the raw counts in the dataset using orange-colored histograms.
+
from eda_toolkit import kde_distributions
+
+vars_of_interest=[
+ "age",
+ "education-num",
+ "hours-per-week",
+]
+
+kde_distributions(
+ df=df,
+ n_rows=1,
+ n_cols=3,
+ grid_figsize=(14,4),# Size of the overall grid figure
+ text_wrap=50,
+ hist_color="orange",
+ bbox_inches="tight",
+ vars_of_interest=vars_of_interest,
+ y_axis_label="Count",
+ bins=10,
+ plot_type="hist",
+ stat="Count",
+ label_fontsize=16,# Font size for axis labels
+ tick_fontsize=14,# Font size for tick labels
+ show_legend=False,
+)
+
In this example, the kde_distributions() function is customized to generate
+histograms that include mean and median lines. The mean_color is set to "blue"
+and the median_color is set to "black", allowing for a clear distinction
+between the two statistical measures. The function parameters are adjusted to
+ensure that both the mean and median lines are plotted (plot_mean=True,plot_median=True).
+The y_axis_label remains "Density", indicating that the histograms
+represent the density of observations within each bin. The histogram bars are
+colored using hist_color="brown", with a fill_alpha=0.60 while the
+statistical overlays enhance the interpretability of the data. The layout is
+configured with a single row and multiple columns (n_rows=1,n_cols=3), and
+the grid size is set to 15x5 inches. This example highlights how to visualize
+central tendencies within the data using a histogram that prominently displays
+the mean and median.
Histogram Example - (Mean, Median, and Std. Deviation)
+
In this example, the kde_distributions() function is customized to generate
+a histogram that includes mean, median, and 3 standard deviation lines. The
+mean_color is set to "blue" and the median_color is set to "black",
+allowing for a clear distinction between these two central tendency measures.
+The function parameters are adjusted to ensure that both the mean and median lines
+are plotted (plot_mean=True,plot_median=True). The y_axis_label remains
+"Density", indicating that the histograms represent the density of observations
+within each bin. The histogram bars are colored using hist_color="brown",
+with a fill_alpha=0.40, which adjusts the transparency of the fill color.
+Additionally, standard deviation bands are plotted using colors "purple",
+"green", and "silver" for one, two, and three standard deviations, respectively.
+
The layout is configured with a single row and multiple columns (n_rows=1,n_cols=3),
+and the grid size is set to 15x5 inches. This setup is particularly useful for
+visualizing the central tendencies within the data while also providing a clear
+view of the distribution and spread through the standard deviation bands. The
+configuration used in this example showcases how histograms can be enhanced with
+statistical overlays to provide deeper insights into the data.
+
+
Note
+
You have the freedom to choose whether to plot the mean, median, and
+standard deviation lines. You can display one, none, or all of these simultaneously.
Generates stacked bar plots and crosstabs for specified columns in a DataFrame.
+
The stacked_crosstab_plot function is a versatile tool for generating stacked bar plots and contingency tables (crosstabs) from a pandas DataFrame. This function is particularly useful for visualizing categorical data across multiple columns, allowing users to easily compare distributions and relationships between variables. It offers extensive customization options, including control over plot appearance, color schemes, and the ability to save plots in multiple formats.
+
The function also supports generating both regular and normalized stacked bar plots, with the option to return the generated crosstabs as a dictionary for further analysis.
Generates stacked or regular bar plots and crosstabs for specified columns.
+
This function allows users to create stacked bar plots (or regular bar plots
+if stacks are removed) and corresponding crosstabs for specific columns
+in a DataFrame. It provides options to customize the appearance, including
+font sizes for axis labels, tick labels, and title text wrapping, and to
+choose between regular or normalized plots.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data to plot.
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
The provided code snippet demonstrates how to use the stacked_crosstab_plot
+function to generate stacked bar plots and corresponding crosstabs for different
+columns in a DataFrame. Here’s a detailed breakdown of the code using the census
+dataset as an example [1].
+
First, the func_col list is defined, specifying the columns ["sex","income"]
+to be analyzed. These columns will be used in the loop to generate separate plots.
+The legend_labels_list is then defined, with each entry corresponding to a
+column in func_col. In this case, the labels for the sex column are
+["Male","Female"], and for the income column, they are ["<=50K",">50K"].
+These labels will be used to annotate the legends of the plots.
+
Next, the title list is defined, providing titles for each plot corresponding
+to the columns in func_col. The titles are set to ["Sex","Income"],
+which will be displayed on top of each respective plot.
+
+
Note
+
The legend_labels_list parameter should be a list of lists, where each
+inner list corresponds to the ground truth labels for the respective item in
+the func_col list. Each element in the func_col list represents a
+column in your DataFrame that you wish to analyze, and the corresponding
+inner list in legend_labels_list should contain the labels that will be
+used in the legend of your plots.
+
+
For example:
+
# Define the func_col to use in the loop in order of usage
+func_col=["sex","income"]
+
+# Define the legend_labels to use in the loop
+legend_labels_list=[
+ ["Male","Female"],# Corresponds to "sex"
+ ["<=50K",">50K"],# Corresponds to "income"
+]
+
+# Define titles for the plots
+title=[
+ "Sex",
+ "Income",
+]
+
+
+
+
Important
+
Ensure that the number of elements in func_col, legend_labels_list,
+and title are the same. Each item in func_col must have a corresponding
+list of labels in legend_labels_list and a title in title. This
+consistency is essential for the function to correctly generate the plots
+with the appropriate labels and titles.
+
+
In this example:
+
+
func_col contains two elements: "sex" and "income". Each corresponds to a specific column in your DataFrame.
+
legend_labels_list is a nested list containing two inner lists:
+
+
+
The first inner list, ["Male","Female"], corresponds to the "sex" column in func_col.
+
The second inner list, ["<=50K",">50K"], corresponds to the "income" column in func_col.
+
+
+
+
title contains two elements: "Sex" and "Income", which will be used as the titles for the respective plots.
+
+
+
Note
+
If you assign the function to a variable, the dictionary returned when
+return_dict=True will be suppressed in the output. However, the dictionary
+is still available within the assigned variable for further use.
The above example generates stacked bar plots for "sex" and "income"
+grouped by "education". The plots are executed with legends, labels, and
+tick sizes customized for clarity. The function returns a dictionary of
+crosstabs for further analysis or export.
+
+
Important
+
Importance of Correctly Aligning Labels
+
It is crucial to properly align the elements in the legend_labels_list,
+title, and func_col parameters when using the stacked_crosstab_plot
+function. Each of these lists must be ordered consistently because the function
+relies on their alignment to correctly assign labels and titles to the
+corresponding plots and legends.
+
For instance, in the example above:
+
+
The first element in func_col is "sex", and it is aligned with the first set of labels ["Male","Female"] in legend_labels_list and the first title "Sex" in the title list.
+
Similarly, the second element in func_col, "income", aligns with the labels ["<=50K",">50K"] and the title "Income".
+
+
Misalignment between these lists would result in incorrect labels or titles being
+applied to the plots, potentially leading to confusion or misinterpretation of the data.
+Therefore, it’s important to ensure that each list is ordered appropriately and
+consistently to accurately reflect the data being visualized.
+
Proper Setup of Lists
+
When setting up the legend_labels_list, title, and func_col, ensure
+that each element in the lists corresponds to the correct variable in the DataFrame.
+This involves:
+
+
Ordering: Maintaining the same order across all three lists to ensure that labels and titles correspond correctly to the data being plotted.
+
Consistency: Double-checking that each label in legend_labels_list matches the categories present in the corresponding func_col, and that the title accurately describes the plot.
+
+
By adhering to these guidelines, you can ensure that the stacked_crosstab_plot
+function produces accurate and meaningful visualizations that are easy to interpret and analyze.
Using the census dataset [1], to create horizontal stacked bar plots, set the kind parameter to
+"barh" in the stacked_crosstab_plot function. This option pivots the
+standard vertical stacked bar plot into a horizontal orientation, making it easier
+to compare categories when there are many labels on the y-axis.
In the census data [1], to create stacked bar plots without the normalized versions,
+set the plot_type parameter to "regular" in the stacked_crosstab_plot
+function. This option removes the display of normalized plots beneath the regular
+versions. Alternatively, setting the plot_type to "normalized" will display
+only the normalized plots. The example below demonstrates regular stacked bar plots
+for income by age.
In the census data [1], to generate regular (non-stacked) bar plots without
+displaying their normalized versions, set the plot_type parameter to "regular"
+in the stacked_crosstab_plot function and enable remove_stacks by setting
+it to True. This configuration removes any stacked elements and prevents the
+display of normalized plots beneath the regular versions. Alternatively, setting
+plot_type to "normalized" will display only the normalized plots.
+
When unstacking bar plots in this fashion, the distribution is aligned in descending
+order, making it easier to visualize the most prevalent categories.
+
In the example below, the color of the bars has been set to a dark grey (#333333),
+and the legend has been removed by setting show_legend=False. This illustrates
+regular bar plots for income by age, without stacking.
Create and save individual boxplots or violin plots, an entire grid of plots,
+or both for given metrics and comparisons.
+
The box_violin_plot function is designed to generate both individual and grid
+plots of boxplots or violin plots for a set of specified metrics against comparison
+categories within a DataFrame. This function offers flexibility in how the plots are
+presented and saved, allowing users to create detailed visualizations that highlight
+the distribution of metrics across different categories.
+
With options to customize the plot type (boxplot or violinplot),
+axis label rotation, figure size, and whether to display or save the plots, this
+function can be adapted for a wide range of data visualization needs. Users can
+choose to display individual plots, a grid of plots, or both, depending on the
+requirements of their analysis.
+
Additionally, the function includes features for rotating the plots, adjusting
+the font sizes of labels, and selectively showing or hiding legends. It also
+supports the automatic saving of plots in either PNG or SVG format, depending on
+the specified paths, making it a powerful tool for producing publication-quality
+figures.
+
The function is particularly useful in scenarios where the user needs to compare
+the distribution of multiple metrics across different categories, enabling a
+clear visual analysis of how these metrics vary within the dataset.
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set without specifying image_path_png or image_path_svg.
+
If rotate_plot is not a boolean value.
+
If individual_figsize is not a tuple or list of two numbers.
+
If grid_figsize is provided and is not a tuple or list of two numbers.
+
+
+
+
Returns:
+
None
+
+
+
+
+
This function provides the ability to create and save boxplots or violin plots for specified metrics and comparison categories. It supports the generation of individual plots, a grid of plots, or both. Users can customize the appearance, save the plots to specified directories, and control the display of legends and labels.
In this example with the US census data [1], the box_violin_plot function is employed to create a grid of
+boxplots, comparing different metrics against the "age_group" column in the
+DataFrame. The metrics_comp parameter is set to ["age_group"], meaning
+that the comparison will be based on different age groups. The metrics_list is
+provided as age_boxplot_list, which contains the specific metrics to be visualized.
+The function is configured to arrange the plots in a grid format. The image_path_png and
+image_path_svg parameters are specified to save the plots in both PNG and
+SVG formats, and the save_plots option is set to "all", ensuring that both
+individual and grid plots are saved.
+
The plots are displayed in a grid format, as indicated by the show_plot="grid"
+parameter. The plot_type is set to "boxplot", so the function will generate
+boxplots for each metric in the list. Additionally, the `x-axis` labels are rotated
+by 90 degrees (xlabel_rot=90) to ensure that the labels are legible. The legend is
+hidden by setting show_legend=False, keeping the plots clean and focused on the data.
+This configuration provides a comprehensive visual comparison of the specified
+metrics across different age groups, with all plots saved for future reference or publication.
In this example with the US census data [1], we keep everything the same as the prior example, but change the
+plot_type to violinplot. This adjustment will generate violin plots instead
+of boxplots while maintaining all other settings.
In this example with the US census data [1], we set xlabel_rot=0 and rotate_plot=True
+to pivot the plot, changing the orientation of the axes while keeping the x-axis labels upright.
+This adjustment flips the axes, providing a different perspective on the data distribution.
Create and Save Scatter Plots or a Grid of Scatter Plots
+
This function, scatter_fit_plot, is designed to generate scatter plots for
+one or more pairs of variables (x_vars and y_vars) from a given DataFrame.
+The function can produce either individual scatter plots or organize multiple
+scatter plots into a grid layout, making it easy to visualize relationships between
+different pairs of variables in one cohesive view.
+
Optional Best Fit Line
+
An optional feature of this function is the ability to add a best fit line to the
+scatter plots. This line, often called a regression line, is calculated using a
+linear regression model and represents the trend in the data. By adding this line,
+you can visually assess the linear relationship between the variables, and the
+function can also display the equation of this line in the plot’s legend.
+
Customizable Plot Aesthetics
+
The function offers a wide range of customization options to tailor the appearance
+of the scatter plots:
+
+
Point Color: You can specify a default color for the scatter points or use a hue parameter to color the points based on a categorical variable. This allows for easy comparison across different groups within the data.
+
Point Size: The size of the scatter points can be controlled and scaled based on another variable, which can help highlight differences or patterns related to that variable.
+
Markers: The shape or style of the scatter points can also be customized. Whether you prefer circles, squares, or other marker types, the function allows you to choose the best representation for your data.
+
+
Axis and Label Configuration
+
The function also provides flexibility in setting axis labels, tick marks, and grid sizes. You can rotate axis labels for better readability, adjust font sizes, and even specify limits for the x and y axes to focus on particular data ranges.
+
Plot Display and Saving Options
+
The function allows you to display plots individually, as a grid, or both. Additionally, you can save the generated plots as PNG or SVG files, making it easy to include them in reports or presentations.
+
Correlation Coefficient Display
+
For users interested in understanding the strength of the relationship between variables, the function can also display the Pearson correlation coefficient directly in the plot title. This numeric value provides a quick reference to the linear correlation between the variables, offering further insight into their relationship.
Create and save scatter plots or a grid of scatter plots for given x_vars
+and y_vars, with an optional best fit line and customizable point color,
+size, and markers.
+
+
Parameters:
+
+
df (pandas.DataFrame) – The DataFrame containing the data.
+
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
+
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
+
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
+
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
+
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
+
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
+
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
If all_vars is provided and either x_vars or y_vars is also provided.
+
If neither all_vars nor both x_vars and y_vars are provided.
+
If hue_palette is specified without hue.
+
If show_plot is not one of "individual", "grid", or "both".
+
If save_plots is not one of None, "all", "individual", or "grid".
+
If save_plots is set but no image paths are provided.
+
If rotate_plot is not a boolean value.
+
If individual_figsize or grid_figsize are not tuples/lists with two numeric values.
+
+
+
+
Returns:
+
None. This function does not return any value but generates and optionally saves scatter plots for the specified x_vars and y_vars, or for all combinations of variables in all_vars if it is provided.
In this US census data [1] example, the scatter_fit_plot function is
+configured to display the Pearson correlation coefficient and a best fit line
+on each scatter plot. The correlation coefficient is shown in the plot title,
+controlled by the show_correlation=True parameter, which provides a measure
+of the strength and direction of the linear relationship between the variables.
+Additionally, the add_best_fit_line=True parameter adds a best fit line to
+each plot, with the equation for the line displayed in the legend. This equation,
+along with the best fit line, helps to visually assess the relationship between
+the variables, making it easier to identify trends and patterns in the data. The
+combination of the correlation coefficient and the best fit line offers both
+a quantitative and visual representation of the relationships, enhancing the
+interpretability of the scatter plots.
In this example, the scatter_fit_plot function is used to generate a grid of
+scatter plots that examine the relationships between age and hours-per-week
+as well as education-num and hours-per-week. Compared to the previous
+example, a few key inputs have been changed to adjust the appearance and functionality
+of the plots:
+
+
Hue and Hue Palette: The hue parameter is set to "income", meaning that the
+data points in the scatter plots are colored according to the values in the income
+column. A custom color mapping is provided via the hue_palette parameter, where the
+income categories "<=50K" and ">50K" are assigned the colors "brown" and
+"green", respectively. This change visually distinguishes the data points based on
+income levels.
+
Scatter Color: The scatter_color parameter is set to "#808080", which applies
+a grey color to the scatter points when no hue is provided. However, since a hue
+is specified in this example, the hue_palette takes precedence and overrides this color setting.
+
Best Fit Line: The add_best_fit_line parameter is set to False, meaning that
+no best fit line is added to the scatter plots. This differs from the previous example where
+a best fit line was included.
+
Correlation Coefficient: The show_correlation parameter is set to False, so the
+Pearson correlation coefficient will not be displayed in the plot titles. This is another
+change from the previous example where the correlation coefficient was included.
+
Hue Legend: The show_legend parameter remains set to True, ensuring that the
+legend displaying the hue categories ("<=50K" and ">50K") appears on the plots,
+helping to interpret the color coding of the data points.
+
+
These changes allow for the creation of scatter plots that highlight the income levels
+of individuals, with custom color coding and without additional elements like a best
+fit line or correlation coefficient. The resulting grid of plots is then saved as
+images in the specified paths.
In this example, the scatter_fit_plot function is used to generate a grid of scatter plots that explore the relationships between all numeric variables in the df DataFrame. The function automatically identifies and plots all possible combinations of these variables. Below are key aspects of this example:
+
+
All Variables Combination: The all_vars parameter is used to automatically generate scatter plots for all possible combinations of numerical variables in the DataFrame. This means you don’t need to manually specify x_vars and y_vars, as the function will iterate through each possible pair.
+
Grid Display: The show_plot parameter is set to "grid", so the scatter plots are displayed in a grid format. This is useful for comparing multiple relationships simultaneously.
+
Font Sizes: The label_fontsize and tick_fontsize parameters are set to 14 and 12, respectively. This increases the readability of axis labels and tick marks, making the plots more visually accessible.
+
Best Fit Line: The add_best_fit_line parameter is set to True, meaning that a best fit line is added to each scatter plot. This helps in visualizing the linear relationship between variables.
+
Scatter Color: The scatter_color parameter is set to "#808080", applying a grey color to the scatter points. This provides a neutral color that does not distract from the data itself.
+
Correlation Coefficient: The show_correlation parameter is set to True, so the Pearson correlation coefficient will be displayed in the plot titles. This helps to quantify the strength of the relationship between the variables.
+
+
These settings allow for the creation of scatter plots that comprehensively explore the relationships between all numeric variables in the DataFrame. The plots are saved in a grid format, with added best fit lines and correlation coefficients for deeper analysis. The resulting images can be stored in the specified directory for future reference.
Generate and Save Customizable Correlation Heatmaps
+
The flex_corr_matrix function is designed to create highly customizable correlation heatmaps for visualizing the relationships between variables in a DataFrame. This function allows users to generate either a full or triangular correlation matrix, with options for annotation, color mapping, and saving the plot in multiple formats.
+
Customizable Plot Appearance
+
The function provides extensive customization options for the heatmap’s appearance:
+
+
Colormap Selection: Choose from a variety of colormaps to represent the strength of correlations. The default is "coolwarm", but this can be adjusted to fit the needs of the analysis.
+
Annotation: Optionally annotate the heatmap with correlation coefficients, making it easier to interpret the strength of relationships at a glance.
+
Figure Size and Layout: Customize the dimensions of the heatmap to ensure it fits well within reports, presentations, or dashboards.
+
+
Triangular vs. Full Correlation Matrix
+
A key feature of the flex_corr_matrix function is the ability to generate either a full correlation matrix or only the upper triangle. This option is particularly useful when the matrix is large, as it reduces visual clutter and focuses attention on the unique correlations.
+
Label and Axis Configuration
+
The function offers flexibility in configuring axis labels and titles:
+
+
Label Rotation: Rotate x-axis and y-axis labels for better readability, especially when working with long variable names.
+
Font Sizes: Adjust the font sizes of labels and tick marks to ensure the plot is clear and readable.
+
Title Wrapping: Control the wrapping of long titles to fit within the plot without overlapping other elements.
+
+
Plot Display and Saving Options
+
The flex_corr_matrix function allows you to display the heatmap directly or save it as PNG or SVG files for use in reports or presentations. If saving is enabled, you can specify file paths and names for the images.
The provided code filters the census [1] DataFrame df to include only numeric columns using
+select_dtypes(np.number). It then utilizes the flex_corr_matrix() function
+to generate a right triangular correlation matrix, which only displays the
+upper half of the correlation matrix. The heatmap is customized with specific
+colormap settings, title, label sizes, axis label rotations, and other formatting
+options.
+
+
Note
+
This triangular matrix format is particularly useful for avoiding
+redundancy in correlation matrices, as it excludes the lower half,
+making it easier to focus on unique pairwise correlations.
+
+
The function also includes a labeled color bar, helping users quickly interpret
+the strength and direction of the correlations.
+
# Select only numeric data to pass into the function
+df_num = df.select_dtypes(np.number)
+
In this modified census [1] example, the key changes are the use of the viridis colormap
+and the decision to plot the full correlation matrix instead of just the upper
+triangle. By setting cmap="viridis", the heatmap will use a different color
+scheme, which can provide better visual contrast or align with specific aesthetic
+preferences. Additionally, by setting triangular=False, the full correlation
+matrix is displayed, allowing users to view all pairwise correlations, including
+both upper and lower halves of the matrix. This approach is beneficial when you
+want a comprehensive view of all correlations in the dataset.
Partial Dependence Plots (PDPs) are a powerful tool in machine learning
+interpretability, providing insights into how features influence the predicted
+outcome of a model. PDPs can be generated in both 2D and 3D, depending on
+whether you want to analyze the effect of one feature or the interaction between
+two features on the model’s predictions.
The plot_2d_pdp function generates 2D partial dependence plots for individual features or pairs of features. These plots are essential for examining the marginal effect of features on the predicted outcome.
+
+
Grid and Individual Plots: Generate all 2D partial dependence plots in a grid layout or as separate individual plots, offering flexibility in presentation.
+
Customization Options: Control the figure size, font sizes for labels and ticks, and the wrapping of long titles to ensure the plots are clear and informative.
+
Saving Plots: The function provides options to save the plots in PNG or SVG formats, and you can specify whether to save all plots, only individual plots, or just the grid plot.
Generate 2D partial dependence plots for specified features using the given machine learning model. The function allows for plotting in grid or individual layouts, with various customization options for figure size, font sizes, and title wrapping. Additionally, the plots can be saved in PNG or SVG formats with a customizable filename prefix.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
X_train (pandas.DataFrame or numpy.ndarray) – The training data used to compute partial dependence. Should correspond to the features used to train the model.
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDP of house value on CA non-location features".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 2D partial
+dependence plot allows you to visualize this relationship in two ways: either as
+individual plots for each feature or as a combined plot showing the interaction
+between two features.
+
For instance, the 2D partial dependence plot can help you analyze how the age of
+the house impacts house values while holding the number of occupants constant, or
+vice versa. This is particularly useful for identifying the most influential
+features and understanding how changes in these features might affect the
+predicted house value.
+
If you extend this to two interacting features, such as AveOccup and HouseAge,
+you can explore their combined effect on house prices. The plot can reveal how
+different combinations of occupancy levels and house age influence the value,
+potentially uncovering non-linear relationships or interactions that might not be
+immediately obvious from a simple 1D analysis.
+
Here’s how you can generate and visualize these 2D partial dependence plots using
+the California housing dataset:
+
Fetch The CA Housing Dataset and Prepare The DataFrame
The plot_3d_pdp function extends the concept of partial dependence to three dimensions, allowing you to visualize the interaction between two features and their combined effect on the model’s predictions.
+
+
Interactive and Static 3D Plots: Generate static 3D plots using Matplotlib or interactive 3D plots using Plotly. The function also allows for generating both types simultaneously.
+
Colormap and Layout Customization: Customize the colormaps for both Matplotlib and Plotly plots. Adjust figure size, camera angles, and zoom levels to create plots that fit perfectly within your presentation or report.
+
Axis and Title Configuration: Customize axis labels for both Matplotlib and Plotly plots. Adjust font sizes and control the wrapping of long titles to maintain readability.
Generate 3D partial dependence plots for two features of a machine learning model.
+
This function supports both static (Matplotlib) and interactive (Plotly) visualizations, allowing for flexible and comprehensive analysis of the relationship between two features and the target variable in a model.
+
+
Parameters:
+
+
model (estimator object) – The trained machine learning model used to generate partial dependence plots.
+
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
+- "static": Generate only static Matplotlib plots.
+- "interactive": Generate only interactive Plotly plots.
+- "both": Generate both static and interactive plots. Default is "both".
+
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
If plot_type is not one of "static", "interactive", or "both".
+
If plot_type is "interactive" or "both" and html_file_path or html_file_name are not provided.
+
+
+
+
Returns:
+
None
+This function generates 3D partial dependence plots and displays or saves them. It does not return any values.
+
+
Notes:
+
+
This function handles warnings related to scikit-learn’s partial_dependence function, specifically a FutureWarning related to non-tuple sequences for multidimensional indexing. This warning is suppressed as it stems from the internal workings of scikit-learn in Python versions like 3.7.4.
+
To maintain compatibility with different versions of scikit-learn, the function attempts to use "values" for grid extraction in newer versions and falls back to "grid_values" for older versions.
Consider a scenario where you have a machine learning model predicting median
+house values in California. [4] Suppose you want to understand how non-location
+features like the average number of occupants per household (AveOccup) and the
+age of the house (HouseAge) jointly influence house values. A 3D partial
+dependence plot allows you to visualize this relationship in a more comprehensive
+manner, providing a detailed view of how these two features interact to affect
+the predicted house value.
+
For instance, the 3D partial dependence plot can help you explore how different
+combinations of house age and occupancy levels influence house values. By
+visualizing the interaction between AveOccup and HouseAge in a 3D space, you can
+uncover complex, non-linear relationships that might not be immediately apparent
+in 2D plots.
+
This type of plot is particularly useful when you need to understand the joint
+effect of two features on the target variable, as it provides a more intuitive
+and detailed view of how changes in both features impact predictions simultaneously.
+
Here’s how you can generate and visualize these 3D partial dependence plots
+using the California housing dataset:
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,# Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ image_filename="3d_pdp",
+ plot_type="static",
+ figsize=[8,5],
+ text_wrap=40,
+ wireframe_color="black",
+ image_path_png=image_path_png,
+ grid_resolution=30,
+)
+
# import the plot_3d_pdp function from
+# the eda_toolkit library
+from eda_toolkit import plot_3d_pdp
+
+# Call the function to generate the plot
+plot_3d_pdp(
+ model=model,
+ dataframe=X_test,# Use the test dataset
+ feature_names_list=["HouseAge","AveOccup"],
+ x_label="House Age",
+ y_label="Average Occupancy",
+ z_label="Partial Dependence",
+ title="3D Partial Dependence Plot of House Age vs. Average Occupancy",
+ html_file_path=image_path_png,
+ image_filename="3d_pdp",
+ html_file_name="3d_pdp.html",
+ plot_type="interactive",
+ text_wrap=40,
+ zoom_out_factor=0.5,
+ image_path_png=image_path_png,
+ image_path_svg=image_path_svg,
+ grid_resolution=30,
+ label_fontsize=8,
+ tick_fontsize=6,
+ title_x=0.38,
+ top_margin=10,
+ right_margin=250,
+ cbar_x=0.9,
+ cbar_thickness=25,
+ show_modebar=False,
+ enable_zoom=True,
+)
+
+
+
+
Warning
+
Scrolling Notice:
+
While interacting with the interactive Plotly plot below, scrolling down the
+page using the mouse wheel may be blocked when the mouse pointer is hovering
+over the plot. To continue scrolling, either move the mouse pointer outside
+the plot area or use the keyboard arrow keys to navigate down the page.
+
+
+
+
This interactive plot was generated using Plotly, which allows for rich,
+interactive visualizations directly in the browser. The plot above is an example
+of an interactive 3D Partial Dependence Plot. Here’s how it differs from
+generating a static plot using Matplotlib.
+
Key Differences
+
Plot Type:
+
+
The plot_type is set to "interactive" for the Plotly plot and "static" for the Matplotlib plot.
+
+
Interactive-Specific Parameters:
+
+
HTML File Path and Name: The html_file_path and html_file_name parameters are required to save the interactive Plotly plot as an HTML file. These parameters are not needed for static plots.
+
Zoom and Positioning: The interactive plot includes parameters like zoom_out_factor, title_x, cbar_x, and cbar_thickness to control the zoom level, title position, and color bar position in the Plotly plot. These parameters do not affect the static plot.
+
Mode Bar and Zoom: The show_modebar and enable_zoom parameters are specific to the interactive Plotly plot, allowing you to toggle the visibility of the mode bar and enable or disable zoom functionality.
+
+
Static-Specific Parameters:
+
+
Figure Size and Wireframe Color: The static plot uses parameters like figsize to control the size of the Matplotlib plot and wireframe_color to define the color of the wireframe in the plot. These parameters are not applicable to the interactive Plotly plot.
+
+
By adjusting these parameters, you can customize the behavior and appearance of your 3D Partial Dependence Plots according to your needs, whether for static or interactive visualization.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_build/html/v0.0.9/genindex.html b/_build/html/v0.0.9/genindex.html
new file mode 100644
index 000000000..2ece69c96
--- /dev/null
+++ b/_build/html/v0.0.9/genindex.html
@@ -0,0 +1,359 @@
+
+
+
+
+
+
+
+ Index — EDA Toolkit 0.0.9 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Index
+
+
+
+
+
+
+
+
+
+
Index
+
+
+ A
+ | B
+ | C
+ | D
+ | E
+ | F
+ | H
+ | K
+ | P
+ | S
+
+
Welcome to the EDA Toolkit Python Library Documentation!
+
+
Note
+
This documentation is for eda_toolkit version 0.0.9.
+
+
The eda_toolkit is a comprehensive library designed to streamline and
+enhance the process of Exploratory Data Analysis (EDA) for data scientists,
+analysts, and researchers. This toolkit provides a suite of functions and
+utilities that facilitate the initial investigation of datasets, enabling users
+to quickly gain insights, identify patterns, and uncover underlying structures
+in their data.
Exploratory Data Analysis (EDA) is a crucial step in the data science workflow.
+It involves various techniques to summarize the main characteristics of the data,
+often with visual methods. EDA helps in understanding the data better, identifying
+anomalies, discovering patterns, and forming hypotheses. This process is essential
+before applying any machine learning models, as it ensures the quality and relevance
+of the data.
The eda_toolkit library is a comprehensive suite of tools designed to
+streamline and automate many of the tasks associated with Exploratory Data
+Analysis (EDA). It offers a broad range of functionalities, including:
+
+
Data Management: Tools for managing directories, generating unique IDs,
+standardizing dates, and handling common DataFrame manipulations.
+
Data Cleaning: Functions to address missing values, remove outliers, and
+correct formatting issues, ensuring data is ready for analysis.
+
Data Visualization: A variety of plotting functions, including KDE
+distribution plots, stacked bar plots, scatter plots with optional best fit
+lines, and box/violin plots, to visually explore data distributions,
+relationships, and trends.
+
Descriptive and Summary Statistics: Methods to generate comprehensive
+reports on data types, summary statistics (mean, median, standard deviation,
+etc.), and to summarize all possible combinations of specified variables.
+
Reporting and Export: Features to save DataFrames to Excel with
+customizable formatting, create contingency tables, and export generated
+plots in multiple formats.
This guide provides detailed instructions and examples for using the functions
+provided in the eda_toolkit library and how to use them effectively in your projects.
+
For most of the ensuing examples, we will leverage the Census Income Data (1994) from
+the UCI Machine Learning Repository [1]. This dataset provides a rich source of
+information for demonstrating the functionalities of the eda_toolkit.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
background_color (str, optional) – Hex color code or color name for background styling in the output
+
background_color (str, optional) – Hex color code or color name for background styling in the output
DataFrame. Defaults to None.
-
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
+
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
False, returns a styled DataFrame for visual presentation. Defaults to False.
@@ -876,17 +875,17 @@
Generating Summary Tables for Variable CombinationsParameters:
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
-
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
+
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
Raises:
-
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
Returns:
A DataFrame containing the contingency table with the specified columns, a 'Total' column representing the count of occurrences, and a 'Percentage' column representing the percentage of the total count.
@@ -1211,8 +1211,8 @@
Highlighting Specific Columns in a DataFrameParameters:
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
-
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
-
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
-
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
-
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
-
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
-
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
-
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
-
hue (str, optional) – Column name to group data by, adding different colors for each group.
-
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
-
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
-
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
-
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
-
w_pad (float, optional) – Width padding between subplots, default is 1.0.
-
h_pad (float, optional) – Height padding between subplots, default is 1.0.
-
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
-
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
-
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
-
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
-
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
-
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
-
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
-
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
-
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
-
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
-
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
-
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
-
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
-
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
-
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
-
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
-
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
-
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
-
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
-
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
-
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
+
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
+
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
+
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
+
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
+
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
+
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
+
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
kwargs (additional keyword arguments) – Additional keyword arguments passed to the Seaborn plotting function.
p (int, optional) – The padding between the subplots.
-
file_prefix (str, optional) – Prefix for the filename when output includes plots.
-
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
-
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
-
show_legend (bool, optional) – Specify whether to show the legend, default is True.
-
label_fontsize (int, optional) – Font size for axis labels, default is 12.
-
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
-
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
-
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
-
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
-
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
-
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
-
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
-
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
-
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
-
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
-
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
-
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
-
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
-
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
-
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
-
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
-
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
-
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
-
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
-
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
-
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
-
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
-
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
-
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
-
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
-
marker (str, optional) – Marker style used for the scatter points. Default is "o".
-
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
-
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
-
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
-
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
+
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
+
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
+
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
+
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
+
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
+
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
+
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
-
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
-
title (str, optional) – The title for the entire plot. Default is "PDPofhousevalueonCAnon-locationfeatures".
-
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
-
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
-
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
-
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
-
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
-
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
-
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
-
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
-
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
-
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDPofhousevalueonCAnon-locationfeatures".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
-
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
-
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
-
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
-
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
-
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
-
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
-
plot_type (str, optional) – The type of plots to generate. Options are:
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
- "static": Generate only static Matplotlib plots.
- "interactive": Generate only interactive Plotly plots.
- "both": Generate both static and interactive plots. Default is "both".
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
-
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
-
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
-
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
-
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
-
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
-
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
-
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
-
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
-
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
-
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
-
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
-
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
-
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
-
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
-
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
-
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
-
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
-
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
-
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
-
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
-
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
-
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
-
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
-
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
Leonid Shpaner is a Data Scientist at UCLA Health. With over a decade of experience in analytics and teaching, he has collaborated on a wide variety of projects within financial services, education, personal development, and healthcare. He serves as a course facilitator for Data Analytics and Applied Statistics at Cornell University and is a lecturer of Statistics in Python for the University of San Diego’s M.S. Applied Artificial Intelligence program.
Oscar Gil is a Data Scientist at the University of California, Riverside, bringing over ten years of professional experience in the education data management industry. An effective data professional, he excels in Data Warehousing, Data Analytics, Data Wrangling, Machine Learning, SQL, Python, R, Data Automation, and Report Authoring. Oscar holds a Master of Science in Applied Data Science from the University of San Diego.
background_color (str, optional) – Hex color code or color name for background styling in the output
+
background_color (str, optional) – Hex color code or color name for background styling in the output
DataFrame. Defaults to None.
-
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
+
return_df (bool, optional) – If True, returns the plain DataFrame with the summary statistics. If
False, returns a styled DataFrame for visual presentation. Defaults to False.
@@ -904,17 +903,17 @@
Generating Summary Tables for Variable CombinationsParameters:
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
-
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
+
cols (str or list of str, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int, optional) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order. Defaults to 0.
Raises:
-
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
Returns:
A DataFrame containing the contingency table with the specified columns, a 'Total' column representing the count of occurrences, and a 'Percentage' column representing the percentage of the total count.
@@ -1239,8 +1239,8 @@
Highlighting Specific Columns in a DataFrameParameters:
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
-
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
-
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
-
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
-
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
-
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
-
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
-
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
-
hue (str, optional) – Column name to group data by, adding different colors for each group.
-
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
-
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
-
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
-
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
-
w_pad (float, optional) – Width padding between subplots, default is 1.0.
-
h_pad (float, optional) – Height padding between subplots, default is 1.0.
-
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
-
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
-
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
-
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
-
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
-
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
-
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
-
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
-
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
-
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
-
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
-
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
-
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
-
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
-
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
-
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
-
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
-
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
-
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
-
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
-
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots. If ‘all’, plots will be generated for all numeric columns.
+
figsize (tuple of int, optional) – Size of each individual plot, default is (5,5). Used when only one plot is being generated or when saving individual plots.
+
grid_figsize (tuple of int, optional) – Size of the overall grid of plots when multiple plots are generated in a grid. Ignored when only one plot is being generated or when saving individual plots. If not specified, it is calculated based on figsize, n_rows, and n_cols.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
mean_color (str, optional) – Color of the mean line if plot_mean is True, default is '#000000'.
+
median_color (str, optional) – Color of the median line if plot_median is True, default is '#000000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where 0 is fully transparent and 1 is fully opaque. Default is 1.
+
n_rows (int, optional) – Number of rows in the subplot grid. If not provided, it will be calculated automatically.
+
n_cols (int, optional) – Number of columns in the subplot grid. If not provided, it will be calculated automatically.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots. The variable name will be appended to this filename. This parameter uses figsize for determining the plot size, ignoring grid_figsize.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (str or list of str, optional) – Variable name(s) to apply log scaling. Can be a single string or a list of strings.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
binwidth (float, optional) – Width of each bin, overrides bins but can be used with binrange.
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency', 'probability', 'percent', 'density'), default is 'density'.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
plot_mean (bool, optional) – Whether to plot the mean as a vertical line, default is False.
+
plot_median (bool, optional) – Whether to plot the median as a vertical line, default is False.
+
std_dev_levels (list of int, optional) – Levels of standard deviation to plot around the mean.
+
std_color (str or list of str, optional) – Color(s) for the standard deviation lines, default is '#808080'.
+
label_names (dict, optional) – Custom labels for the variables of interest. Keys should be column names, and values should be the corresponding labels to display.
+
show_legend (bool, optional) – Whether to show the legend on the plots, default is True.
kwargs (additional keyword arguments) – Additional keyword arguments passed to the Seaborn plotting function.
kind (str, optional) – The kind of plot to generate ('bar' or 'barh' for horizontal bars), default is 'bar'.
-
width (float, optional) – The width of the bars in the bar plot, default is 0.9.
-
rot (int, optional) – The rotation angle of the x-axis labels, default is 0.
-
custom_order (list, optional) – Specifies a custom order for the categories in the col.
-
image_path_png (str, optional) – Directory path where generated PNG plot images will be saved.
-
image_path_svg (str, optional) – Directory path where generated SVG plot images will be saved.
-
save_formats (list, optional) – List of file formats to save the plot images in. Valid formats are 'png' and 'svg'. If not provided, defaults to an empty list and no images will be saved.
-
color (list, optional) – List of colors to use for the plots. If not provided, a default color scheme is used.
-
output (str, optional) – Specify the output type: "plots_only", "crosstabs_only", or "both". Default is "both".
-
return_dict (bool, optional) – Specify whether to return the crosstabs dictionary, default is False.
p (int, optional) – The padding between the subplots.
-
file_prefix (str, optional) – Prefix for the filename when output includes plots.
-
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
-
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
-
show_legend (bool, optional) – Specify whether to show the legend, default is True.
-
label_fontsize (int, optional) – Font size for axis labels, default is 12.
-
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
-
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
kind (str, optional) – The kind of plot to generate ('bar' or 'barh' for horizontal bars), default is 'bar'.
+
width (float, optional) – The width of the bars in the bar plot, default is 0.9.
+
rot (int, optional) – The rotation angle of the x-axis labels, default is 0.
+
custom_order (list, optional) – Specifies a custom order for the categories in the col.
+
image_path_png (str, optional) – Directory path where generated PNG plot images will be saved.
+
image_path_svg (str, optional) – Directory path where generated SVG plot images will be saved.
+
save_formats (list, optional) – List of file formats to save the plot images in. Valid formats are 'png' and 'svg'. If not provided, defaults to an empty list and no images will be saved.
+
color (list, optional) – List of colors to use for the plots. If not provided, a default color scheme is used.
+
output (str, optional) – Specify the output type: "plots_only", "crosstabs_only", or "both". Default is "both".
+
return_dict (bool, optional) – Specify whether to return the crosstabs dictionary, default is False.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
-
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
-
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
-
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
-
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
-
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
-
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
-
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
-
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
-
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
-
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
-
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
-
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
-
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
-
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
-
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
-
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
-
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
-
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
-
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
-
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
-
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
-
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
-
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
-
marker (str, optional) – Marker style used for the scatter points. Default is "o".
-
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
-
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
-
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
-
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
+
x_vars (list of str, optional) – List of variable names to plot on the x-axis.
+
y_vars (list of str, optional) – List of variable names to plot on the y-axis.
+
n_rows (int, optional) – Number of rows in the subplot grid. Calculated based on the number of plots and n_cols if not specified.
+
n_cols (int, optional) – Number of columns in the subplot grid. Calculated based on the number of plots and max_cols if not specified.
+
max_cols (int, optional) – Maximum number of columns in the subplot grid. Default is 4.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid". If None, plots will not be saved.
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots. Calculated based on the number of rows and columns if not specified.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette. This parameter requires the hue parameter to be set.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
all_vars (list of str, optional) – If provided, automatically generates scatter plots for all combinations of variables in this list, overriding x_vars and y_vars.
+
label_names (dict, optional) – A dictionary to rename columns for display in the plot titles and labels.
+
kwargs (dict, optional) – Additional keyword arguments to pass to sns.scatterplot.
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
-
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
-
title (str, optional) – The title for the entire plot. Default is "PDPofhousevalueonCAnon-locationfeatures".
-
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
-
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
-
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
-
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
-
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
-
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
-
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
-
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
-
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
-
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
+
feature_names (list of str) – A list of feature names corresponding to the columns in X_train.
+
features (list of int or tuple of int) – A list of feature indices or tuples of feature indices for which to generate partial dependence plots.
+
title (str, optional) – The title for the entire plot. Default is "PDPofhousevalueonCAnon-locationfeatures".
+
grid_resolution (int, optional) – The number of grid points to use for plotting the partial dependence. Higher values provide smoother curves but may increase computation time. Default is 50.
+
plot_type (str, optional) – The type of plot to generate. Choose "grid" for a grid layout, "individual" for separate plots, or "both" to generate both layouts. Default is "grid".
+
grid_figsize (tuple, optional) – Tuple specifying the width and height of the figure for the grid layout. Default is (12,8).
+
individual_figsize (tuple, optional) – Tuple specifying the width and height of the figure for individual plots. Default is (6,4).
+
label_fontsize (int, optional) – Font size for the axis labels and titles. Default is 12.
+
tick_fontsize (int, optional) – Font size for the axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
image_path_png (str, optional) – The directory path where PNG images of the plots will be saved, if saving is enabled.
+
image_path_svg (str, optional) – The directory path where SVG images of the plots will be saved, if saving is enabled.
+
save_plots (str, optional) – Controls whether to save the plots. Options include "all", "individual", "grid", or None (default). If saving is enabled, ensure image_path_png or image_path_svg are provided.
+
file_prefix (str, optional) – Prefix for the filenames of the saved grid plots. Default is "partial_dependence".
dataframe (pandas.DataFrame or numpy.ndarray) – The dataset on which the model was trained or a representative sample. If a DataFrame is provided, feature_names_list should correspond to the column names. If a NumPy array is provided, feature_names_list should correspond to the indices of the columns.
-
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
-
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
-
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
-
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
-
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
-
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
-
plot_type (str, optional) – The type of plots to generate. Options are:
+
feature_names_list (list of str) – A list of two feature names or indices corresponding to the features for which partial dependence plots are generated.
+
x_label (str, optional) – Label for the x-axis in the plots. Default is None.
+
y_label (str, optional) – Label for the y-axis in the plots. Default is None.
+
z_label (str, optional) – Label for the z-axis in the plots. Default is None.
html_file_path (str, optional) – Path to save the interactive Plotly HTML file. Required if plot_type is "interactive" or "both". Default is None.
+
html_file_name (str, optional) – Name of the HTML file to save the interactive Plotly plot. Required if plot_type is "interactive" or "both". Default is None.
+
image_filename (str, optional) – Base filename for saving static Matplotlib plots as PNG and/or SVG. Default is None.
+
plot_type (str, optional) – The type of plots to generate. Options are:
- "static": Generate only static Matplotlib plots.
- "interactive": Generate only interactive Plotly plots.
- "both": Generate both static and interactive plots. Default is "both".
matplotlib_colormap (matplotlib.colors.Colormap, optional) – Custom colormap for the Matplotlib plot. If not provided, a default colormap is used.
-
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
-
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
-
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
-
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
-
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
-
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
-
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
-
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
-
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
-
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
-
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
-
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
-
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
-
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
-
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
-
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
-
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
-
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
-
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
-
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
-
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
-
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
-
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
-
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
+
plotly_colormap (str, optional) – Colormap for the Plotly plot. Default is "Viridis".
+
zoom_out_factor (float, optional) – Factor to adjust the zoom level of the Plotly plot. Default is None.
+
wireframe_color (str, optional) – Color for the wireframe in the Matplotlib plot. If None, no wireframe is plotted. Default is None.
+
view_angle (tuple, optional) – Elevation and azimuthal angles for the Matplotlib plot view. Default is (22,70).
+
figsize (tuple, optional) – Figure size for the Matplotlib plot. Default is (7,4.5).
+
text_wrap (int, optional) – Maximum width of the title text before wrapping. Useful for managing long titles. Default is 50.
+
horizontal (float, optional) – Horizontal camera position for the Plotly plot. Default is -1.25.
+
depth (float, optional) – Depth camera position for the Plotly plot. Default is 1.25.
+
vertical (float, optional) – Vertical camera position for the Plotly plot. Default is 1.25.
+
cbar_x (float, optional) – Position of the color bar along the x-axis in the Plotly plot. Default is 1.05.
+
cbar_thickness (int, optional) – Thickness of the color bar in the Plotly plot. Default is 25.
+
title_x (float, optional) – Horizontal position of the title in the Plotly plot. Default is 0.5.
+
title_y (float, optional) – Vertical position of the title in the Plotly plot. Default is 0.95.
+
top_margin (int, optional) – Top margin for the Plotly plot layout. Default is 100.
+
image_path_png (str, optional) – Directory path to save the PNG file of the Matplotlib plot. Default is None.
+
image_path_svg (str, optional) – Directory path to save the SVG file of the Matplotlib plot. Default is None.
+
show_cbar (bool, optional) – Whether to display the color bar in the Matplotlib plot. Default is True.
+
grid_resolution (int, optional) – The resolution of the grid for computing partial dependence. Default is 20.
+
left_margin (int, optional) – Left margin for the Plotly plot layout. Default is 20.
+
right_margin (int, optional) – Right margin for the Plotly plot layout. Default is 65.
+
label_fontsize (int, optional) – Font size for axis labels in the Matplotlib plot. Default is 8.
+
tick_fontsize (int, optional) – Font size for tick labels in the Matplotlib plot. Default is 6.
+
enable_zoom (bool, optional) – Whether to enable zooming in the Plotly plot. Default is True.
+
show_modebar (bool, optional) – Whether to display the mode bar in the Plotly plot. Default is True.
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
-
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
+
cols (str or list, optional) – Name of the column (as a string) for a single column or list of column names for multiple columns. Must provide at least one column.
+
sort_by (int) – Enter 0 to sort results by column groups; enter 1 to sort results by totals in descending order.
Raises:
-
ValueError – If no columns are specified or if sort_by is not 0 or 1.
+
ValueError – If no columns are specified or if sort_by is not 0 or 1.
Returns:
A DataFrame with the specified columns, 'Total', and 'Percentage'.
@@ -1190,8 +1190,8 @@
Highlighting Specific Columns in a DataFrameParameters:
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
-
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
-
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
-
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
-
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
-
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
-
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
-
hue (str, optional) – Column name to group data by, adding different colors for each group.
-
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
-
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
+
vars_of_interest (list of str, optional) – List of column names for which to generate distribution plots.
+
grid_figsize (tuple, optional) – Size of the overall grid figure, default is (10,8).
+
single_figsize (tuple, optional) – Size of individual figures for each variable, default is (6,4).
+
kde (bool, optional) – Whether to include KDE plots on the histograms, default is True.
+
hist_color (str, optional) – Color of the histogram bars, default is '#0000FF'.
+
kde_color (str, optional) – Color of the KDE plot, default is '#FF0000'.
+
hist_edgecolor (str, optional) – Color of the histogram bar edges, default is '#000000'.
+
hue (str, optional) – Column name to group data by, adding different colors for each group.
+
fill (bool, optional) – Whether to fill the histogram bars with color, default is True.
+
fill_alpha (float, optional) – Alpha transparency for the fill color of the histogram bars, where
0 is fully transparent and 1 is fully opaque. Default is 1.
-
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
-
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
-
w_pad (float, optional) – Width padding between subplots, default is 1.0.
-
h_pad (float, optional) – Height padding between subplots, default is 1.0.
-
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
-
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
-
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
-
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
-
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
-
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
-
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
+
n_rows (int, optional) – Number of rows in the subplot grid, default is 1.
+
n_cols (int, optional) – Number of columns in the subplot grid, default is 1.
+
w_pad (float, optional) – Width padding between subplots, default is 1.0.
+
h_pad (float, optional) – Height padding between subplots, default is 1.0.
+
image_path_png (str, optional) – Directory path to save the PNG image of the overall distribution plots.
+
image_path_svg (str, optional) – Directory path to save the SVG image of the overall distribution plots.
+
image_filename (str, optional) – Filename to use when saving the overall distribution plots.
+
bbox_inches (str, optional) – Bounding box to use when saving the figure. For example, 'tight'.
+
single_var_image_path_png (str, optional) – Directory path to save the PNG images of the separate distribution plots.
+
single_var_image_path_svg (str, optional) – Directory path to save the SVG images of the separate distribution plots.
+
single_var_image_filename (str, optional) – Filename to use when saving the separate distribution plots.
The variable name will be appended to this filename.
-
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
-
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
-
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
-
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
+
y_axis_label (str, optional) – The label to display on the y-axis, default is 'Density'.
+
plot_type (str, optional) – The type of plot to generate, options are 'hist', 'kde', or 'both'. Default is 'both'.
+
log_scale_vars (list of str, optional) – List of variable names to apply log scaling.
+
bins (int or sequence, optional) – Specification of histogram bins, default is 'auto'.
binwidth (number or pair of numbers, optional) – Width of each bin, overrides bins but can be used with binrange.
-
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
-
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
-
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
-
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
-
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
+
label_fontsize (int, optional) – Font size for axis labels, including xlabel, ylabel, and tick marks, default is 10.
+
tick_fontsize (int, optional) – Font size for axis tick labels, default is 10.
+
text_wrap (int, optional) – Maximum width of the title text before wrapping, default is 50.
+
disable_sci_notation (bool, optional) – Toggle to disable scientific notation on axes, default is False.
+
stat (str, optional) – Aggregate statistic to compute in each bin (e.g., 'count', 'frequency',
'probability', 'percent', 'density'), default is 'density'.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
p (int, optional) – The padding between the subplots.
-
file_prefix (str, optional) – Prefix for the filename when output includes plots.
-
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
-
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
-
show_legend (bool, optional) – Specify whether to show the legend, default is True.
-
label_fontsize (int, optional) – Font size for axis labels, default is 12.
-
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
-
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
col (str) – The name of the column in the DataFrame to be analyzed.
+
func_col (list) – List of ground truth columns to be analyzed.
+
legend_labels_list (list) – List of legend labels for each ground truth column.
p (int, optional) – The padding between the subplots.
+
file_prefix (str, optional) – Prefix for the filename when output includes plots.
+
logscale (bool, optional) – Apply log scale to the y-axis, default is False.
+
plot_type (str, optional) – Specify the type of plot to generate: "both", "regular", "normalized". Default is "both".
+
show_legend (bool, optional) – Specify whether to show the legend, default is True.
+
label_fontsize (int, optional) – Font size for axis labels, default is 12.
+
tick_fontsize (int, optional) – Font size for tick labels on the axes, default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
remove_stacks (bool, optional) – If True, removes stacks and creates a regular bar plot using only the col parameter. Only works when plot_type is set to 'regular'. Default is False.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
x_vars (list of str) – List of variable names to plot on the x-axis.
-
y_vars (list of str) – List of variable names to plot on the y-axis.
-
n_rows (int) – Number of rows in the subplot grid.
-
n_cols (int) – Number of columns in the subplot grid.
-
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
-
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
-
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
-
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
-
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
-
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
-
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
-
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
-
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
-
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
-
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
-
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
-
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
-
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
-
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
-
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
-
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
-
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
-
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
-
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
-
marker (str, optional) – Marker style used for the scatter points. Default is "o".
-
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
-
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
-
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).
+
x_vars (list of str) – List of variable names to plot on the x-axis.
+
y_vars (list of str) – List of variable names to plot on the y-axis.
+
n_rows (int) – Number of rows in the subplot grid.
+
n_cols (int) – Number of columns in the subplot grid.
+
image_path_png (str, optional) – Directory path to save PNG images of the scatter plots.
+
image_path_svg (str, optional) – Directory path to save SVG images of the scatter plots.
+
save_plots (str, optional) – Controls which plots to save: "all", "individual", or "grid".
+
show_legend (bool, optional) – Whether to display the legend on the plots. Default is True.
+
xlabel_rot (int, optional) – Rotation angle for x-axis labels. Default is 0.
+
show_plot (str, optional) – Controls plot display: "individual", "grid", or "both". Default is "both".
+
rotate_plot (bool, optional) – Whether to rotate (pivot) the plots. Default is False.
+
individual_figsize (tuple or list, optional) – Width and height of the figure for individual plots. Default is (6,4).
+
grid_figsize (tuple or list, optional) – Width and height of the figure for grid plots.
+
label_fontsize (int, optional) – Font size for axis labels. Default is 12.
+
tick_fontsize (int, optional) – Font size for axis tick labels. Default is 10.
+
text_wrap (int, optional) – The maximum width of the title text before wrapping, default is 50.
+
add_best_fit_line (bool, optional) – Whether to add a best fit line to the scatter plots. Default is False.
+
scatter_color (str, optional) – Color code for the scattered points. Default is "C0".
+
best_fit_linecolor (str, optional) – Color code for the best fit line. Default is "red".
+
best_fit_linestyle (str, optional) – Linestyle for the best fit line. Default is "-".
+
hue (str, optional) – Column name for the grouping variable that will produce points with different colors.
+
hue_palette (dict, list, or str, optional) – Specifies colors for each hue level. Can be a dictionary mapping hue levels to colors, a list of colors, or the name of a seaborn color palette.
+
size (str, optional) – Column name for the grouping variable that will produce points with different sizes.
+
sizes (dict, optional) – Dictionary mapping sizes (smallest and largest) to min and max values.
+
marker (str, optional) – Marker style used for the scatter points. Default is "o".
+
show_correlation (bool, optional) – Whether to display the Pearson correlation coefficient in the plot title. Default is True.
+
xlim (tuple or list, optional) – Limits for the x-axis as a tuple or list of (min, max).
+
ylim (tuple or list, optional) – Limits for the y-axis as a tuple or list of (min, max).