Source code for datasafari.evaluator.evaluate_contingency_table

# DataSafari - DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
# Copyright (C) 2024 George Dreemer.
#
# Read more about DataSafari's LICENSE here: https://datasafari.dev/docs/other/lic-gpl3
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, specifically under
# version 3 of the License. No later versions are applicable.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see: https://github.com/ETA444/datasafari/blob/main/LICENSE

from typing import Union
import numpy as np
import pandas as pd
from scipy.stats.contingency import expected_freq


def evaluate_contingency_table(
    contingency_table: pd.DataFrame,
    min_sample_size_yates: int = 40,
    pipeline: bool = False,
    quiet: bool = False
) -> Union[dict, tuple]:
    """
    **Evaluate the suitability of statistical tests for a given contingency table by analyzing its characteristics and guiding the selection of appropriate tests.**

    This function assesses the contingency table's suitability for chi-square tests,
    exact tests (Barnard's, Boschloo's, and Fisher's), and the application of Yates'
    correction within the chi-square test. It examines expected and observed
    frequencies, sample size, and table shape to guide the choice of appropriate
    statistical tests for hypothesis testing.

    The rules applied are:

    - ``chi2_contingency`` is viable when both the minimum expected frequency and
      the minimum observed frequency are at least 5.
    - ``yates_correction`` is viable for 2x2 tables whose total sample size is
      below ``min_sample_size_yates``.
    - ``barnard_exact``, ``boschloo_exact`` and ``fisher_exact`` are viable for
      2x2 tables only.

    Parameters:
    -----------
    contingency_table : pd.DataFrame
        A contingency table generated from two categorical variables.
    min_sample_size_yates : int, optional, default: 40
        The minimum sample size below which Yates' correction should be considered.
    pipeline : bool, optional, default: False
        Determines the format of the output.
            - ``True`` Outputs a tuple of boolean values representing the viability of each test.
            - ``False`` Outputs a dictionary with the test names as keys and their viabilities as boolean values.
    quiet : bool, optional, default: False
        Determines if output is printed to the console.
            - ``True`` Output is not printed.
            - ``False`` Output is printed.

    Returns:
    --------
    dict or tuple
        Depending on the 'pipeline' parameter:
            - ``dict`` If pipeline=False, returns a dictionary with keys as test names ('chi2_contingency', 'yates_correction', 'barnard_exact', 'boschloo_exact', 'fisher_exact') and values as boolean indicators of their viability.
            - ``tuple`` If pipeline=True, returns a tuple of boolean values in the order: (chi2_viability, yates_correction_viability, barnard_viability, boschloo_viability, fisher_viability).

    Raises:
    -------
    TypeErrors:
        - If `contingency_table` is not a pandas DataFrame.
        - If `min_sample_size_yates` is not an integer.
        - If `pipeline` or `quiet` is not a boolean.
    ValueErrors:
        - If the `contingency_table` is empty.
        - If `min_sample_size_yates` is not a positive integer.

    Examples:
    ---------
    Creating a contingency table from a small dataset and evaluating it:

    >>> import datasafari
    >>> import pandas as pd
    >>> data = {
    ...     'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
    ...     'Preference': ['Tea', 'Coffee', 'Coffee', 'Tea', 'Tea']
    ... }
    >>> df_small = pd.DataFrame(data)
    >>> contingency_small = pd.crosstab(df_small['Gender'], df_small['Preference'])
    >>> viability_dict_small = evaluate_contingency_table(contingency_small)

    Using a larger dataset to demonstrate the effect of sample size on test viability:

    >>> import datasafari
    >>> import pandas as pd
    >>> import numpy as np
    >>> data_large = {
    ...     'Gender': np.random.choice(['Male', 'Female'], 200),
    ...     'Preference': np.random.choice(['Tea', 'Coffee'], 200)
    ... }
    >>> df_large = pd.DataFrame(data_large)
    >>> contingency_large = pd.crosstab(df_large['Gender'], df_large['Preference'])
    >>> viability_dict_large = evaluate_contingency_table(contingency_large)

    Applying the function in a pipeline to make further decisions:

    >>> contingency_pipeline = pd.crosstab(df_large['Gender'], df_large['Preference'])
    >>> chi2, yates, barnard, boschloo, fisher = evaluate_contingency_table(contingency_pipeline, pipeline=True)
    >>> if chi2:
    ...     print("Chi-square test is viable for this dataset.")
    ... else:
    ...     print("Consider alternative tests such as Fisher's exact test.")
    """
    # Error Handling
    # TypeErrors
    if not isinstance(contingency_table, pd.DataFrame):
        raise TypeError("evaluate_contingency_table(): The 'contingency_table' parameter must be a pandas DataFrame.")

    if not isinstance(min_sample_size_yates, int):
        raise TypeError("evaluate_contingency_table(): The 'min_sample_size_yates' parameter must be an integer.")

    if not isinstance(pipeline, bool):
        raise TypeError("evaluate_contingency_table(): The 'pipeline' parameter must be a boolean.")

    if not isinstance(quiet, bool):
        raise TypeError("evaluate_contingency_table(): The 'quiet' parameter must be a boolean.")

    # ValueErrors
    if contingency_table.empty:
        raise ValueError("evaluate_contingency_table(): The 'contingency_table' parameter must not be empty.")

    if min_sample_size_yates <= 0:
        raise ValueError("evaluate_contingency_table(): The 'min_sample_size_yates' parameter must be a positive integer.")

    # Main Function
    # compute objects for checks
    min_expected_frequency = expected_freq(contingency_table).min()
    min_observed_frequency = contingency_table.min().min()
    sample_size = np.sum(contingency_table.values)
    table_shape = contingency_table.shape
    is_two_by_two = table_shape == (2, 2)

    # assumption check for chi2_contingency test
    # bool(...) converts NumPy comparison results into plain Python booleans
    chi2_viability = bool(min_expected_frequency >= 5 and min_observed_frequency >= 5)

    # assumption check for chi2_contingency Yates' correction
    yates_correction_viability = bool(is_two_by_two and sample_size < min_sample_size_yates)

    # assumption check for all exact tests: they are defined for 2x2 tables only
    barnard_viability = boschloo_viability = fisher_viability = is_two_by_two

    # non-pipeline output
    test_viability = {
        'chi2_contingency': chi2_viability,
        'yates_correction': yates_correction_viability,
        'barnard_exact': barnard_viability,
        'boschloo_exact': boschloo_viability,
        'fisher_exact': fisher_viability,
    }

    # console output (the report is only built when it will actually be printed)
    if not quiet:
        title = "< CONTINGENCY TABLE EVALUATION >\n"
        on_chi2 = f"Based on minimum expected freq. ({min_expected_frequency}) & minimum observed freq. ({min_observed_frequency}):\n ➡ chi2_contingency() viability: {'✔' if chi2_viability else '✘'}\n\n"
        on_yates = f"Based on table shape ({table_shape[0]}x{table_shape[1]}) & sample size ({sample_size}):\n ➡ chi2_contingency() Yates' correction viability: {'✔' if yates_correction_viability else '✘'}\n\n"
        on_exact = f"Based on table shape ({table_shape[0]}x{table_shape[1]}):\n ➡ barnard_exact() viability: {'✔' if barnard_viability else '✘'}\n ➡ boschloo_exact() viability: {'✔' if boschloo_viability else '✘'}\n ➡ fisher_exact() viability: {'✔' if fisher_viability else '✘'}\n\n\n"
        print(title, on_chi2, on_yates, on_exact)

    if pipeline:
        return chi2_viability, yates_correction_viability, barnard_viability, boschloo_viability, fisher_viability
    return test_viability