# DataSafari - DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
# Copyright (C) 2024 George Dreemer.
#
# Read more about DataSafari's LICENSE here: https://datasafari.dev/docs/other/lic-gpl3
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, specifically under
# version 3 of the License. No later versions are applicable.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see: https://github.com/ETA444/datasafari/blob/main/LICENSE
from typing import List, Optional
import pandas as pd
from datasafari.utils.calculators import calculate_entropy
from datasafari.evaluator.evaluate_dtype import evaluate_dtype
[docs]
def explore_cat(
df: pd.DataFrame,
categorical_variables: List[str],
method: str = 'all',
output: str = 'print'
) -> Optional[str]:
"""
**Explore categorical variables within a DataFrame and gain insights on unique values, counts and percentages, and the entropy of variables to quantify data diversity.**
Parameters:
-----------
df : pd.DataFrame
The DataFrame containing the data to be explored.
categorical_variables : list
A list of strings specifying the names of the categorical columns to explore.
method : str, default: 'all'
Specifies the method of exploration to apply.
- ``'unique_values'`` Lists unique values for each specified categorical variable.
- ``'counts_percentage'`` Shows counts and percentages for the unique values of each variable.
- ``'entropy'`` Calculates the entropy for each variable, providing a measure of data diversity. See the 'calculate_entropy' function for more details on entropy calculation.
- ``'all'`` Applies all the above methods sequentially.
output : str, default: 'print'
Determines the output format.
- ``'print'`` Prints the results to the console.
- ``'return'`` Returns the results as a single formatted string.
Returns:
--------
str or None
- ``str`` If output='return', a string containing the formatted exploration results is returned.
- ``None`` If output='print', results are printed to the console, and the function returns None.
Raises:
-------
TypeErrors:
- If `df` is not a pandas DataFrame.
- If `categorical_variables` is not a list or contains non-string elements.
- If `method` or `output` is not a string.
ValueErrors:
- If the `df` is empty.
- If `method` is not one of the valid options.
- If `output` is not one of the valid options.
- If 'categorical_variables' list is empty.
- If variables provided through 'categorical_variables' are not categorical variables.
- If any of the specified categorical variables are not found in the DataFrame.
Examples:
---------
Create a sample DataFrame to use in the examples:
>>> import datasafari
>>> import numpy as np
>>> import pandas as pd
>>> data = {
... 'Category1': np.random.choice(['Apple', 'Banana', 'Cherry'], size=100),
... 'Category2': np.random.choice(['Yes', 'No'], size=100),
... 'Category3': np.random.choice(['Low', 'Medium', 'High'], size=100)
... }
>>> df = pd.DataFrame(data)
The full potential of ``explore_cat()`` is unlocked by simply providing a dataframe and the categorical columns to explore:
>>> explore_cat(df, ['Category1', 'Category2', 'Category3'])
Display unique values for 'Category1' and 'Category2':
>>> explore_cat(df, ['Category1', 'Category2'], method='unique_values', output='print')
Explore counts and percentages for 'Category1' and 'Category2', then print the results:
>>> explore_cat(df, ['Category1', 'Category2'], method='counts_percentage', output='print')
Calculate and return the entropy of 'Category1', 'Category2', and 'Category3':
>>> result = explore_cat(df, ['Category1', 'Category2', 'Category3'], method='entropy', output='return')
>>> print(result)
Comprehensive exploration of all specified methods for 'Category1', 'Category2', and 'Category3', displaying to console:
>>> explore_cat(df, ['Category1', 'Category2', 'Category3'], method='all', output='print')
Using 'all' method to explore 'Category1' and 'Category2', returning the results as a string:
>>> result_str = explore_cat(df, ['Category1', 'Category2'], method='all', output='return')
>>> print(result_str)
Notes:
------
The ``'entropy'`` method provides a quantitative measure of the unpredictability or diversity within each specified categorical column, calculated as outlined in the documentation for 'calculate_entropy'. High entropy values indicate a more uniform distribution of categories, suggesting no single category overwhelmingly dominates.
"""
# Error Handling #
# TypeErrors
if not isinstance(df, pd.DataFrame):
raise TypeError("explore_cat(): The df parameter must be a pandas DataFrame.")
if not isinstance(categorical_variables, list):
raise TypeError("explore_cat(): The categorical_variables parameter must be a list of variable names.\n Example: var_list = ['var1', 'var2', 'var3']")
else:
if not all(isinstance(var, str) for var in categorical_variables):
raise TypeError("explore_cat(): All items in the categorical_variables list must be strings representing column names.")
if not isinstance(method, str):
raise TypeError("explore_cat(): The method parameter must be a string.\n Example: method = 'all'")
if not isinstance(output, str):
raise TypeError("explore_cat(): The output parameter must be a string. \n Example: output = 'return'")
# ValueErrors
# Check if df is empty
if df.empty:
raise ValueError("explore_cat(): The input DataFrame is empty.")
# Check if method is valid
valid_methods = ['unique_values', 'counts_percentage', 'entropy', 'all']
if method.lower() not in valid_methods:
raise ValueError(f"explore_cat(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}")
# Check if output is valid
if output.lower() not in ['print', 'return']:
raise ValueError("explore_cat(): Invalid output method. Choose 'print' or 'return'.")
# Check if list has any members
if len(categorical_variables) == 0:
raise ValueError("explore_cat(): The 'categorical_variables' list must contain at least one column name.")
# Check if variables are categorical
categorical_types = evaluate_dtype(df, categorical_variables, output='list_c')
if not all(categorical_types):
raise ValueError("explore_cat(): The 'categorical_variables' list must contain only names of categorical variables.")
# Check if specified variables exist in the DataFrame
missing_vars = [var for var in categorical_variables if var not in df.columns]
if missing_vars:
raise ValueError(f"explore_cat(): The following variables were not found in the DataFrame: {', '.join(missing_vars)}")
# Main Function #
result = []
if method.lower() in ['unique_values', 'all']:
# initial append for title of method section
result.append("<______UNIQUE VALUES PER VARIABLE______>\n")
# get the unique values per variable in categorical_variables list
for variable_name in categorical_variables:
unique_values = df[variable_name].unique()
result.append(f"< Unique values of ['{variable_name}'] >\n\n{unique_values}\n\n")
if method.lower() in ['counts_percentage', 'all']:
# initial append for title of method section
result.append("<______COUNTS & PERCENTAGE______>\n")
# get the counts and percentages per unique value of variable in categorical_variables list
for variable_name in categorical_variables:
counts = df[variable_name].value_counts()
percentages = df[variable_name].value_counts(normalize=True) * 100
# combine counts and percentages into a DataFrame
summary_df = pd.DataFrame({'Counts': counts, 'Percentages': percentages})
# format percentages to be 2 decimals
summary_df['Percentages'] = summary_df['Percentages'].apply(lambda x: f"{x:.2f}%")
result.append(f"< Counts and percentages per unique value of ['{variable_name}'] >\n\n{summary_df}\n\n")
if method.lower() in ['entropy', 'all']:
result.append("<______ENTROPY OF CATEGORICAL VARIABLES______>\n")
# include a tip on interpretation
result.append("Tip: Higher entropy indicates greater diversity.*\n")
for variable_name in categorical_variables:
entropy_val, interpretation = calculate_entropy(df[variable_name])
result.append(f"Entropy of ['{variable_name}']: {entropy_val:.3f} {interpretation}\n")
# include additional info tip
result.append("* For more details on entropy, run: 'print(calculate_entropy.__doc__)'.\n")
# Combine all results
combined_result = "\n".join(result)
if output.lower() == 'print':
print(combined_result)
elif output.lower() == 'return':
return combined_result