# Source code for dataprep.eda.distribution.compute

"""
Computations for plot(df, ...)
"""


import warnings
from typing import Optional, Union, List, Dict, Any, Tuple
import dask.dataframe as dd
import pandas as pd

from ...configs import Config
from ...dtypes_v2 import DTypeDef, LatLong
from ...intermediate import Intermediate
from .bivariate import compute_bivariate
from .overview import compute_overview
from .trivariate import compute_trivariate
from .univariate import compute_univariate

__all__ = ["compute"]


def compute(
    df: Union[pd.DataFrame, dd.DataFrame],
    col1: Optional[Union[str, LatLong]] = None,
    col2: Optional[Union[str, LatLong]] = None,
    col3: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    """
    All in one compute function.

    Dispatches to the overview, univariate, bivariate or trivariate
    computation depending on how many of ``col1``, ``col2`` and ``col3``
    are supplied.

    Parameters
    ----------
    df
        DataFrame from which visualizations are generated
    col1: Optional[Union[str, LatLong]], default None
        A valid column name from the dataframe
    col2: Optional[Union[str, LatLong]], default None
        A valid column name from the dataframe
    col3: Optional[str], default None
        A valid column name from the dataframe
    cfg: Union[Config, Dict[str, Any], None], default None
        When a user calls plot(), the created Config object is passed to compute().
        When a user calls compute() directly and wants to customize the output,
        cfg is a dictionary of configuration values. If cfg is None, a default
        Config is used.
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display. Only used
        when cfg is given as a dictionary, i.e. when a user calls compute()
        directly and wants to customize the output.
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous"

    Returns
    -------
    Intermediate
        The result of the selected overview/univariate/bivariate/trivariate
        computation.

    Raises
    ------
    TypeError
        If three columns are given and any of them is not a string.
    ValueError
        If the combination of columns cannot be dispatched.
    """
    # pylint: disable=too-many-arguments

    suppress_warnings()

    if isinstance(cfg, dict):
        cfg = Config.from_dict(display, cfg)
    elif not cfg:
        cfg = Config()

    x, y, z = col1, col2, col3

    # No usable column at all -> overview of the whole dataframe.
    if not any([x, y, z]):
        return compute_overview(df, cfg, dtype)

    # Columns that were actually passed, in their original argument order.
    given = [col for col in (x, y, z) if col is not None]

    if len(given) == 1:
        return compute_univariate(df, given[0], cfg, dtype)

    if len(given) == 2:
        return compute_bivariate(df, given[0], given[1], cfg, dtype)

    if x is not None and y is not None and z is not None:
        if not (isinstance(x, str) and isinstance(y, str) and isinstance(z, str)):
            # f-string so the offending column names really appear in the message.
            raise TypeError(
                f"Column names should be string. Current column names: {x}, {y}, {z}"
            )
        return compute_trivariate(df, x, y, z, cfg, dtype)

    # Defensive fallback; every combination should have been handled above.
    raise ValueError("The input is not correct.")


def suppress_warnings() -> None:
    """
    Register "ignore" filters for the warnings silenced during computation.

    Two filters are installed through ``warnings.filterwarnings``: one for
    the ``FutureWarning`` about the changing default of ``regex`` and one
    for the ``RuntimeWarning`` about an invalid value in ``true_divide``.
    """
    # (message, category) pairs, registered in this order.
    silenced = (
        (
            "The default value of regex will change from True to False in a future version",
            FutureWarning,
        ),
        (
            "invalid value encountered in true_divide",
            RuntimeWarning,
        ),
    )

    for text, kind in silenced:
        warnings.filterwarnings("ignore", text, category=kind)