Source code for dataprep.eda.distribution.compute

"""
Computations for plot(df, ...)
"""


import warnings
from typing import Optional, Union, List, Dict, Any, Tuple
import dask.dataframe as dd
import pandas as pd

from ...configs import Config
from ...dtypes_v2 import DTypeDef, LatLong
from ...intermediate import Intermediate
from .bivariate import compute_bivariate
from .overview import compute_overview
from .trivariate import compute_trivariate
from .univariate import compute_univariate

# Names exported by ``from <this module> import *``: only ``compute`` is listed.
__all__ = ["compute"]


def compute(
    df: Union[pd.DataFrame, dd.DataFrame],
    col1: Optional[Union[str, LatLong]] = None,
    col2: Optional[Union[str, LatLong]] = None,
    col3: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    All in one compute function.

    Dispatches to the overview, univariate, bivariate or trivariate
    computation depending on how many of ``col1``, ``col2`` and ``col3``
    are supplied.

    Parameters
    ----------
    df
        DataFrame from which visualizations are generated
    col1: Optional[Union[str, LatLong]], default None
        A valid column name from the dataframe
    col2: Optional[Union[str, LatLong]], default None
        A valid column name from the dataframe
    col3: Optional[str], default None
        A valid column name from the dataframe
    cfg: Union[Config, Dict[str, Any], None], default None
        When a user calls plot(), the created Config object is passed to
        compute(). When a user calls compute() directly and wants to
        customize the output, cfg is a dictionary of configuration values.
        If it is None (or otherwise falsy), a default Config() is used.
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display.
        Only used when cfg is given as a dictionary, i.e. when a user calls
        compute() directly and wants to customize the output.
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()

    Returns
    -------
    Intermediate
        The result of whichever of compute_overview, compute_univariate,
        compute_bivariate or compute_trivariate was selected.

    Raises
    ------
    TypeError
        If three columns are given and any of them is not a string.
    ValueError
        If the combination of columns cannot be dispatched.
    """
    # pylint: disable=too-many-arguments
    suppress_warnings()

    if isinstance(cfg, dict):
        cfg = Config.from_dict(display, cfg)
    elif not cfg:
        cfg = Config()

    x, y, z = col1, col2, col3

    # No (truthy) column given: summarize the whole dataframe.
    if not any([x, y, z]):
        return compute_overview(df, cfg, dtype)

    none_count = sum(v is None for v in (x, y, z))

    if none_count == 2:
        # Exactly one column was supplied, in any of the three positions.
        x = x or y or z
        # Not reachable given the checks above; presumably kept so static
        # type checkers can narrow the Optional type.
        if x is None:
            raise ValueError
        return compute_univariate(df, x, cfg, dtype)

    if none_count == 1:
        # Exactly two columns were supplied; keep them in positional order.
        x, y = (v for v in (x, y, z) if v is not None)
        # Same narrowing guard as in the univariate branch.
        if x is None or y is None:
            raise ValueError
        return compute_bivariate(df, x, y, cfg, dtype)

    if x is not None and y is not None and z is not None:
        if not (isinstance(x, str) and isinstance(y, str) and isinstance(z, str)):
            # The message must be an f-string so the offending column
            # names are actually interpolated into the error text.
            raise TypeError(
                f"Column names should be string. Current column names: {x}, {y}, {z}"
            )
        return compute_trivariate(df, x, y, z, cfg, dtype)

    raise ValueError("The input is not correct.")


def suppress_warnings() -> None:
    """
    Install "ignore" warning filters for two specific warning messages.

    One filter targets a FutureWarning about the default value of ``regex``
    changing, the other a RuntimeWarning about an invalid value encountered
    in ``true_divide``.
    """
    # Registered in this order; warnings.filterwarnings puts each new entry
    # at the front of the filter list.
    ignored = (
        (
            "The default value of regex will change from True to False in a future version",
            FutureWarning,
        ),
        (
            "invalid value encountered in true_divide",
            RuntimeWarning,
        ),
    )
    for message, warning_category in ignored:
        warnings.filterwarnings("ignore", message, category=warning_category)