# Source code for datasafari.explorer.explore_cat

# DataSafari - DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
# Copyright (C) 2024 George Dreemer.
#
# Read more about DataSafari's LICENSE here: https://datasafari.dev/docs/other/lic-gpl3
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, specifically under
# version 3 of the License. No later versions are applicable.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see: https://github.com/ETA444/datasafari/blob/main/LICENSE

from typing import List, Optional
import pandas as pd
from datasafari.utils.calculators import calculate_entropy
from datasafari.evaluator.evaluate_dtype import evaluate_dtype


def explore_cat(
        df: pd.DataFrame,
        categorical_variables: List[str],
        method: str = 'all',
        output: str = 'print'
) -> Optional[str]:
    """
    **Explore categorical variables within a DataFrame and gain insights on unique values, counts and percentages, and the entropy of variables to quantify data diversity.**

    Parameters:
    -----------
    df : pd.DataFrame
        The DataFrame containing the data to be explored.
    categorical_variables : list
        A list of strings specifying the names of the categorical columns to explore.
    method : str, default: 'all'
        Specifies the method of exploration to apply (matched case-insensitively).
            - ``'unique_values'`` Lists unique values for each specified categorical variable.
            - ``'counts_percentage'`` Shows counts and percentages for the unique values of each variable.
            - ``'entropy'`` Calculates the entropy for each variable, providing a measure of data diversity. See the 'calculate_entropy' function for more details on entropy calculation.
            - ``'all'`` Applies all the above methods sequentially.
    output : str, default: 'print'
        Determines the output format (matched case-insensitively).
            - ``'print'`` Prints the results to the console.
            - ``'return'`` Returns the results as a single formatted string.

    Returns:
    --------
    str or None
        - ``str`` If output='return', a string containing the formatted exploration results is returned.
        - ``None`` If output='print', results are printed to the console, and the function returns None.

    Raises:
    -------
    TypeErrors:
        - If `df` is not a pandas DataFrame.
        - If `categorical_variables` is not a list or contains non-string elements.
        - If `method` or `output` is not a string.
    ValueErrors:
        - If the `df` is empty.
        - If `method` is not one of the valid options.
        - If `output` is not one of the valid options.
        - If 'categorical_variables' list is empty.
        - If any of the specified categorical variables are not found in the DataFrame.
        - If variables provided through 'categorical_variables' are not categorical variables.

    Examples:
    ---------
    Create a sample DataFrame to use in the examples:

    >>> import datasafari
    >>> import numpy as np
    >>> import pandas as pd
    >>> data = {
    ...     'Category1': np.random.choice(['Apple', 'Banana', 'Cherry'], size=100),
    ...     'Category2': np.random.choice(['Yes', 'No'], size=100),
    ...     'Category3': np.random.choice(['Low', 'Medium', 'High'], size=100)
    ... }
    >>> df = pd.DataFrame(data)

    The full potential of ``explore_cat()`` is unlocked by simply providing a dataframe and the categorical columns to explore:

    >>> explore_cat(df, ['Category1', 'Category2', 'Category3'])

    Display unique values for 'Category1' and 'Category2':

    >>> explore_cat(df, ['Category1', 'Category2'], method='unique_values', output='print')

    Explore counts and percentages for 'Category1' and 'Category2', then print the results:

    >>> explore_cat(df, ['Category1', 'Category2'], method='counts_percentage', output='print')

    Calculate and return the entropy of 'Category1', 'Category2', and 'Category3':

    >>> result = explore_cat(df, ['Category1', 'Category2', 'Category3'], method='entropy', output='return')
    >>> print(result)

    Comprehensive exploration of all specified methods for 'Category1', 'Category2', and 'Category3', displaying to console:

    >>> explore_cat(df, ['Category1', 'Category2', 'Category3'], method='all', output='print')

    Using 'all' method to explore 'Category1' and 'Category2', returning the results as a string:

    >>> result_str = explore_cat(df, ['Category1', 'Category2'], method='all', output='return')
    >>> print(result_str)

    Notes:
    ------
    The ``'entropy'`` method provides a quantitative measure of the unpredictability or diversity within each specified categorical column, calculated as outlined in the documentation for 'calculate_entropy'. High entropy values indicate a more uniform distribution of categories, suggesting no single category overwhelmingly dominates.
    """

    # Error Handling #
    # TypeErrors
    if not isinstance(df, pd.DataFrame):
        raise TypeError("explore_cat(): The df parameter must be a pandas DataFrame.")

    if not isinstance(categorical_variables, list):
        raise TypeError("explore_cat(): The categorical_variables parameter must be a list of variable names.\n Example: var_list = ['var1', 'var2', 'var3']")
    if not all(isinstance(var, str) for var in categorical_variables):
        raise TypeError("explore_cat(): All items in the categorical_variables list must be strings representing column names.")

    if not isinstance(method, str):
        raise TypeError("explore_cat(): The method parameter must be a string.\n Example: method = 'all'")

    if not isinstance(output, str):
        raise TypeError("explore_cat(): The output parameter must be a string. \n Example: output = 'return'")

    # ValueErrors
    # Check if df is empty
    if df.empty:
        raise ValueError("explore_cat(): The input DataFrame is empty.")

    # Normalise the option strings once; every later comparison uses these.
    method_key = method.lower()
    output_key = output.lower()

    # Check if method is valid
    valid_methods = ['unique_values', 'counts_percentage', 'entropy', 'all']
    if method_key not in valid_methods:
        raise ValueError(f"explore_cat(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}")

    # Check if output is valid
    if output_key not in ('print', 'return'):
        raise ValueError("explore_cat(): Invalid output method. Choose 'print' or 'return'.")

    # Check if list has any members
    if not categorical_variables:
        raise ValueError("explore_cat(): The 'categorical_variables' list must contain at least one column name.")

    # Check if specified variables exist in the DataFrame.
    # This must run before the dtype check below: evaluate_dtype() is handed
    # the column names, so an unknown name has to be reported here first.
    missing_vars = [var for var in categorical_variables if var not in df.columns]
    if missing_vars:
        raise ValueError(f"explore_cat(): The following variables were not found in the DataFrame: {', '.join(missing_vars)}")

    # Check if variables are categorical
    categorical_types = evaluate_dtype(df, categorical_variables, output='list_c')
    if not all(categorical_types):
        raise ValueError("explore_cat(): The 'categorical_variables' list must contain only names of categorical variables.")

    # Main Function #
    result = []

    if method_key in ('unique_values', 'all'):
        # initial append for title of method section
        result.append("<______UNIQUE VALUES PER VARIABLE______>\n")

        # get the unique values per variable in categorical_variables list
        for variable_name in categorical_variables:
            unique_values = df[variable_name].unique()
            result.append(f"< Unique values of ['{variable_name}'] >\n\n{unique_values}\n\n")

    if method_key in ('counts_percentage', 'all'):
        # initial append for title of method section
        result.append("<______COUNTS & PERCENTAGE______>\n")

        # get the counts and percentages per unique value of variable in categorical_variables list
        for variable_name in categorical_variables:
            counts = df[variable_name].value_counts()
            # Derive the percentages from the counts already computed rather
            # than running value_counts() a second time.
            percentages = counts / counts.sum() * 100

            # combine counts and percentages (formatted to 2 decimals) into a DataFrame
            summary_df = pd.DataFrame({
                'Counts': counts,
                'Percentages': percentages.map("{:.2f}%".format),
            })
            result.append(f"< Counts and percentages per unique value of ['{variable_name}'] >\n\n{summary_df}\n\n")

    if method_key in ('entropy', 'all'):
        result.append("<______ENTROPY OF CATEGORICAL VARIABLES______>\n")

        # include a tip on interpretation
        result.append("Tip: Higher entropy indicates greater diversity.*\n")

        for variable_name in categorical_variables:
            entropy_val, interpretation = calculate_entropy(df[variable_name])
            result.append(f"Entropy of ['{variable_name}']: {entropy_val:.3f} {interpretation}\n")

        # include additional info tip
        result.append("* For more details on entropy, run: 'print(calculate_entropy.__doc__)'.\n")

    # Combine all results
    combined_result = "\n".join(result)

    # output_key was validated above, so it is either 'print' or 'return'.
    if output_key == 'print':
        print(combined_result)
        return None
    return combined_result