Source code for dataprep.eda.diff.compute

"""Computations for plot_diff([df...])."""

from typing import Optional, Union, List, Dict, Any
import dask.dataframe as dd
import pandas as pd
from ....errors import DataprepError
from ...intermediate import Intermediate
from ...utils import to_dask
from ...dtypes import DTypeDef
from ...configs import Config
from .multiple_df import compare_multiple_df  # type: ignore
from .multiple_column import compare_multiple_col  # type: ignore

# Public API of this module: only ``compute_diff`` is exported by ``import *``.
__all__ = ["compute_diff"]


def compute_diff(
    df: Union[List[Union[pd.DataFrame, dd.DataFrame]], Union[pd.DataFrame, dd.DataFrame]],
    x: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    All in one compute function.

    Parameters
    ----------
    df
        A list of 2 to 5 DataFrames to compare. Any input that is not a
        list is rejected with a TypeError.
    x: Optional[str], default None
        A column name to compare across the DataFrames. Column labels are
        converted to strings before the lookup, and the name must be present
        in at least two of the DataFrames. When omitted (or empty), the
        DataFrames are compared as a whole.
    cfg: Union[Config, Dict[str, Any], None], default None
        Either a ready-made Config object, a dictionary from which a Config
        is built via ``Config.from_dict(display, cfg)``, or None, in which
        case a default ``Config()`` is used. If ``cfg.diff.label`` is empty,
        it is filled in place with ``"df1"``, ``"df2"``, ...
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display.
        Only consulted when ``cfg`` is a dictionary.
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        Only forwarded when whole DataFrames are compared (``x`` not given).
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous"

    Returns
    -------
    Intermediate
        The result of ``compare_multiple_col`` when ``x`` is given,
        otherwise the result of ``compare_multiple_df``.

    Raises
    ------
    TypeError
        If ``df`` is not a list.
    DataprepError
        If fewer than 2 or more than 5 DataFrames are given, or if ``x`` is
        present in fewer than two of them.
    ValueError
        If the number of labels in ``cfg.diff.label`` differs from the number
        of DataFrames, or if ``cfg.diff.baseline`` exceeds the last index.
    """
    # pylint:disable = too-many-branches
    if isinstance(cfg, dict):
        cfg = Config.from_dict(display, cfg)
    elif not cfg:
        cfg = Config()

    if not isinstance(df, list):
        raise TypeError(f"Invalid input type: {type(df)}")

    num_dfs = len(df)
    if num_dfs < 2:
        raise DataprepError("plot_diff needs at least 2 DataFrames.")
    if num_dfs > 5:
        raise DataprepError("Too many DataFrames, max: 5.")

    label = cfg.diff.label
    if not label:
        # No labels supplied: generate df1, df2, ... and store them on cfg.
        cfg.diff.label = [f"df{i}" for i in range(1, num_dfs + 1)]
    elif num_dfs != len(label):
        raise ValueError("Number of the given label doesn't match the number of DataFrames.")

    if cfg.diff.baseline > num_dfs - 1:
        raise ValueError("Baseline is out of the boundary of the input.")

    df_list = [to_dask(frame) for frame in df]
    for frame in df_list:
        frame.columns = frame.columns.astype(str)

    if not x:
        return compare_multiple_df(df_list, cfg, dtype)  # type: ignore

    # Look x up in the stringified columns (the ones actually handed on), and
    # count DataFrames rather than label occurrences so that a label duplicated
    # inside a single DataFrame does not pass for "two DataFrames".
    frames_with_x = sum(x in frame.columns for frame in df_list)
    if frames_with_x < 2:
        raise DataprepError("x must exist in at least two DataFrames")
    return compare_multiple_col(df_list, x, cfg)  # type: ignore