# Source code for datasafari.explorer.explore_df

# DataSafari - DataSafari simplifies complex data science tasks into straightforward, powerful one-liners.
# Copyright (C) 2024 George Dreemer.
#
# Read more about DataSafari's LICENSE here: https://datasafari.dev/docs/other/lic-gpl3
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details, specifically under
# version 3 of the License. No later versions are applicable.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see: https://github.com/ETA444/datasafari/blob/main/LICENSE

from typing import Optional
import pandas as pd
import io
from datasafari.utils.filters import filter_kwargs


# Keyword-argument names that explore_df() hands to filter_kwargs() for each
# pandas method it wraps. Keys are the lookup names used in those calls
# ('head', 'describe', 'info'); values list the kwarg names associated with
# that method. 'buf' is intentionally absent from 'info' because explore_df()
# supplies its own buffer to DataFrame.info().
valid_kwargs = {
    'head': [
        'n',
    ],
    'describe': [
        'percentiles',
        'include',
        'exclude',
    ],
    'info': [
        'verbose',
        'max_cols',
        'memory_usage',
        'show_counts',
    ],
}


def explore_df(
        df: pd.DataFrame,
        method: str = 'all',
        output: str = 'print',
        **kwargs
) -> Optional[str]:
    """
    **Explore a DataFrame and gain a birds-eye view of summary statistics, NAs, data types and more.**

    The function combines the most common data exploration functions in one
    convenient output in your console.

    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame to be explored.
    method : str, optional, default: 'all'
        Specifies the method to apply on the DataFrame (case-insensitive).
            - ``'na'`` Displays counts of NAs per column and percentage of NAs.
            - ``'desc'`` Shows summary statistics using the `describe` method.
            - ``'head'`` Outputs the first few rows using `head`.
            - ``'info'`` Provides concise information about the DataFrame using `info`.
            - ``'all'`` Executes all the above methods sequentially.
    output : str, optional, default: 'print'
        Determines the output of the exploration results (case-insensitive).
            - ``'print'`` Prints the results to the console.
            - ``'return'`` Returns the results as a string.
    ``**kwargs`` : dict
        Additional arguments for pandas methods (e.g., ``'percentiles'`` for ``'desc'``).
        You can specify arguments applicable when 'method' is set to ``'all'``,
        which will be appropriately directed to each pandas method used.
        Note that the ``'buf'`` parameter in the ``'info'`` method is disabled
        and cannot be used.

    Return:
    -------
    str or None
        - ``str`` If output='return', a single string containing the formatted exploration results.
        - ``None`` If output='print', results are printed to the console, and the function returns None.

    Raises:
    -------
    TypeErrors:
        - If `df` is not a pandas DataFrame.
        - If `method` is not a string.
        - If `output` is not a string.
    ValueErrors:
        - If `df` is empty.
        - If `method` is not one of the valid options: 'na', 'desc', 'head', 'info', 'all'.
        - If `output` is not 'print' or 'return'.
        - If the 'buf' parameter is passed while `method` is 'info'.

    Examples:
    ---------
    Create a sample DataFrame to use in the examples:

    >>> import datasafari
    >>> import numpy as np
    >>> import pandas as pd
    >>> data = {
    ...     'A': np.random.randn(100),
    ...     'B': np.random.rand(100) * 100,
    ...     'C': np.random.randint(1, 100, size=100),
    ...     'D': np.random.choice(['X', 'Y', 'Z'], size=100)
    ... }
    >>> df = pd.DataFrame(data)

    The full potential of ``explore_df()`` is unlocked by simply providing a dataframe:

    >>> explore_df(df)

    Alternatively, save the output to a string:

    >>> summary = explore_df(df, 'all', output='return')

    Display summary statistics with custom percentiles:

    >>> explore_df(df, 'desc', percentiles=[0.05, 0.95], output='print')

    Show the first 3 rows of the DataFrame:

    >>> explore_df(df, 'head', n=3, output='print')

    Provide detailed DataFrame information:

    >>> explore_df(df, 'info', verbose=True, output='print')

    Calculate and display the count and percentage of missing values:

    >>> explore_df(df, 'na', output='print')

    Execute a comprehensive exploration with custom settings:

    >>> explore_df(df, 'all', n=3, percentiles=[0.25, 0.75], output='print')

    Return comprehensive exploration results as a string:

    >>> result_str = explore_df(df, 'all', n=5, output='return')
    >>> print(result_str)

    Use 'all' with kwargs applicable to specific methods, print the results:

    >>> explore_df(df, 'all', n=5, percentiles=[0.1, 0.9], verbose=False, output='print')
    """
    # Error Handling #
    # TypeErrors for each parameter
    if not isinstance(df, pd.DataFrame):
        raise TypeError("explore_df(): The df parameter must be a pandas DataFrame.")

    if not isinstance(method, str):
        raise TypeError("explore_df(): The method parameter must be a string.\nExample: method = 'all'")

    if not isinstance(output, str):
        raise TypeError("explore_df(): The output parameter must be a string.\nExample: output = 'print'")

    # ValueErrors
    # Check if df is empty
    if df.empty:
        raise ValueError("explore_df(): The input DataFrame is empty.")

    # Normalise once; the original (un-lowered) value is still used in messages.
    method_key = method.lower()
    output_key = output.lower()

    # Check for correct method
    valid_methods = ['na', 'desc', 'head', 'info', 'all']
    if method_key not in valid_methods:
        raise ValueError(f"explore_df(): Invalid method '{method}'. Valid options are: {', '.join(valid_methods)}.")

    # Check for 'output'
    if output_key not in ['print', 'return']:
        raise ValueError("explore_df(): Invalid output method. Choose 'print' or 'return'.")

    # Check for unsupported 'info' kwargs
    if 'buf' in kwargs and method_key == 'info':
        raise ValueError("explore_df(): 'buf' parameter is not supported in the 'info' method within explore_df.")

    # Main Function Logic #
    run_all = method_key == 'all'
    result = []

    if run_all or method_key == 'desc':
        desc_kwargs = filter_kwargs('describe', kwargs, valid_kwargs)
        result.append(f"<______DESCRIBE______>\n{str(df.describe(**desc_kwargs))}\n")

    if run_all or method_key == 'head':
        head_kwargs = filter_kwargs('head', kwargs, valid_kwargs)
        # option_context restores the caller's previous 'display.max_columns'
        # value on exit (even if head() raises) instead of resetting it to the
        # pandas default and clobbering a user-configured setting.
        with pd.option_context('display.max_columns', None):
            head_text = str(df.head(**head_kwargs))
        result.append(f"<______HEAD______>\n{head_text}\n")

    if run_all or method_key == 'info':
        info_kwargs = filter_kwargs('info', kwargs, valid_kwargs)
        # DataFrame.info() writes to a buffer rather than returning text,
        # so capture it in memory.
        buffer = io.StringIO()
        df.info(buf=buffer, **info_kwargs)
        result.append(f"<______INFO______>\n{buffer.getvalue()}\n")

    if run_all or method_key == 'na':
        na_count = df.isna().sum()
        # df is non-empty here, so the row count is never zero.
        na_percent = (na_count / df.shape[0]) * 100
        result.append(f"<______NA_COUNT______>\n{na_count}\n")
        result.append(f"<______NA_PERCENT______>\n{na_percent}\n")

    # Combine all results
    combined_result = "\n".join(result)

    if output_key == 'print':
        print(combined_result)
        return None

    # output_key was validated above, so the only remaining option is 'return'.
    return combined_result