# DataSafari - DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
# Copyright (C) 2024 George Dreemer.
#
# Read more about DataSafari's LICENSE here: https://datasafari.dev/docs/other/lic-gpl3
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details, specifically under
# version 3 of the License. No later versions are applicable.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see: https://github.com/ETA444/datasafari/blob/main/LICENSE
from typing import Optional
import pandas as pd
import io
from datasafari.utils.filters import filter_kwargs
# Keyword arguments that explore_df() hands to filter_kwargs() for each pandas
# method, keyed by the pandas method name. 'buf' is not listed for 'info'
# because explore_df() supplies its own buffer when calling DataFrame.info().
valid_kwargs = dict(
    head=['n'],
    describe=['percentiles', 'include', 'exclude'],
    info=['verbose', 'max_cols', 'memory_usage', 'show_counts'],
)
# (Sphinx viewcode "[docs]" link marker removed -- extraction artifact, not part of the source.)
def explore_df(
        df: pd.DataFrame,
        method: str = 'all',
        output: str = 'print',
        **kwargs
) -> Optional[str]:
    """
    **Explore a DataFrame and gain a birds-eye view of summary statistics, NAs, data types and more.**

    The function combines the most common data exploration functions in one convenient output in your console.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to be explored.
    method : str, optional, default: 'all'
        Specifies the method to apply on the DataFrame (case-insensitive).
            - ``'na'`` Displays counts of NAs per column and percentage of NAs.
            - ``'desc'`` Shows summary statistics using the `describe` method.
            - ``'head'`` Outputs the first few rows using `head`.
            - ``'info'`` Provides concise information about the DataFrame using `info`.
            - ``'all'`` Executes all the above methods sequentially.
    output : str, optional, default: 'print'
        Determines the output of the exploration results (case-insensitive).
            - ``'print'`` Prints the results to the console.
            - ``'return'`` Returns the results as a string.
    ``**kwargs`` : dict
        Additional arguments for pandas methods (e.g., ``'percentiles'`` for ``'desc'``). You can specify arguments applicable when 'method' is set to ``'all'``, which will be appropriately directed to each pandas method used. The routing is done by ``filter_kwargs`` using the module-level ``valid_kwargs`` table. Note that the ``'buf'`` parameter in the ``'info'`` method is disabled and cannot be used.

    Return:
    -------
    str or None
        - ``str`` If output='return', a string containing the formatted exploration results is returned as a uniform string.
        - ``None`` If output='print', results are printed to the console, and the function returns None.

    Raises:
    -------
    TypeErrors:
        - If `df` is not a pandas DataFrame.
        - If `method` is not a string.
        - If `output` is not a string.
    ValueErrors:
        - If `df` is empty.
        - If `method` is not one of the valid options: 'na', 'desc', 'head', 'info', 'all'.
        - If `output` is not 'print' or 'return'.
        - If 'buf' parameter is used in the 'info' method.

    Notes:
    ------
    While rendering the ``'head'`` section, pandas' ``display.max_columns`` option is temporarily set to ``None`` so that every column is shown. The value that was active before the call is restored afterwards, even if rendering fails.

    Examples:
    ---------
    Create a sample DataFrame to use in the examples:

    >>> import datasafari
    >>> import numpy as np
    >>> import pandas as pd
    >>> data = {
    ...     'A': np.random.randn(100),
    ...     'B': np.random.rand(100) * 100,
    ...     'C': np.random.randint(1, 100, size=100),
    ...     'D': np.random.choice(['X', 'Y', 'Z'], size=100)
    ... }
    >>> df = pd.DataFrame(data)

    The full potential of ``explore_df()`` is unlocked by simply providing a dataframe:

    >>> explore_df(df)

    Alternatively, save the output to a string:

    >>> summary = explore_df(df, 'all', output='return')

    Display summary statistics with custom percentiles:

    >>> explore_df(df, 'desc', percentiles=[0.05, 0.95], output='print')

    Show the first 3 rows of the DataFrame:

    >>> explore_df(df, 'head', n=3, output='print')

    Provide detailed DataFrame information:

    >>> explore_df(df, 'info', verbose=True, output='print')

    Calculate and display the count and percentage of missing values:

    >>> explore_df(df, 'na', output='print')

    Execute a comprehensive exploration with custom settings:

    >>> explore_df(df, 'all', n=3, percentiles=[0.25, 0.75], output='print')

    Return comprehensive exploration results as a string:

    >>> result_str = explore_df(df, 'all', n=5, output='return')
    >>> print(result_str)

    Use 'all' with kwargs applicable to specific methods, print the results:

    >>> explore_df(df, 'all', n=5, percentiles=[0.1, 0.9], verbose=False, output='print')
    """
    # Error Handling #
    # TypeErrors for each parameter
    if not isinstance(df, pd.DataFrame):
        raise TypeError("explore_df(): The df parameter must be a pandas DataFrame.")

    if not isinstance(method, str):
        raise TypeError("explore_df(): The method parameter must be a string.\nExample: method = 'all'")

    if not isinstance(output, str):
        raise TypeError("explore_df(): The output parameter must be a string.\nExample: output = 'print'")

    # ValueErrors
    # Check if df is empty
    if df.empty:
        raise ValueError("explore_df(): The input DataFrame is empty.")

    # Normalise the case once; every later comparison uses these values.
    method_key = method.lower()
    output_key = output.lower()

    # Check for correct method
    valid_methods = ['na', 'desc', 'head', 'info', 'all']
    if method_key not in valid_methods:
        raise ValueError(f"explore_df(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}.")

    # Check for 'output'
    if output_key not in ['print', 'return']:
        raise ValueError("explore_df(): Invalid output method. Choose 'print' or 'return'.")

    # Check for unsupported 'info' kwargs: the function passes its own buffer
    # to DataFrame.info(), so a caller-supplied one cannot be honoured.
    if 'buf' in kwargs and method_key == 'info':
        raise ValueError("explore_df(): 'buf' parameter is not supported in the 'info' method within explore_df.")

    # Main Function Logic #
    run_all = method_key == 'all'
    result = []

    if run_all or method_key == 'desc':
        desc_kwargs = filter_kwargs('describe', kwargs, valid_kwargs)
        desc_text = str(df.describe(**desc_kwargs))
        result.append(f"<______DESCRIBE______>\n{desc_text}\n")

    if run_all or method_key == 'head':
        head_kwargs = filter_kwargs('head', kwargs, valid_kwargs)
        # option_context restores whatever 'display.max_columns' the caller had
        # configured (instead of resetting it to the pandas default) and does so
        # even when df.head() raises.
        with pd.option_context('display.max_columns', None):
            head_text = str(df.head(**head_kwargs))
        result.append(f"<______HEAD______>\n{head_text}\n")

    if run_all or method_key == 'info':
        info_kwargs = filter_kwargs('info', kwargs, valid_kwargs)
        # DataFrame.info() writes to a stream rather than returning text, so
        # capture it in an in-memory buffer.
        buffer = io.StringIO()
        df.info(buf=buffer, **info_kwargs)
        result.append(f"<______INFO______>\n{buffer.getvalue()}\n")

    if run_all or method_key == 'na':
        na_count = df.isna().sum()
        # Reuse the counts instead of scanning the frame a second time.
        na_percent = (na_count / df.shape[0]) * 100
        result.append(f"<______NA_COUNT______>\n{na_count}\n")
        result.append(f"<______NA_PERCENT______>\n{na_percent}\n")

    # Combine all results
    combined_result = "\n".join(result)

    if output_key == 'return':
        return combined_result

    print(combined_result)
    return None