Source code for dataprep.eda.missing.compute

"""This module implements the plot_missing(df) function's
calculating intermediate part
"""
from typing import Optional, cast, List, Any, Dict, Union
import warnings
from scipy.cluster.hierarchy import ClusterWarning

from ...configs import Config
from ...eda_frame import DataFrame, EDAFrame
from ...dtypes_v2 import DTypeDef
from ...utils import preprocess_dataframe
from ...intermediate import Intermediate
from .bivariate import compute_missing_bivariate
from .nullivariate import compute_missing_nullivariate
from .univariate import compute_missing_univariate

__all__ = ["compute_missing"]


[docs]def compute_missing(
    df: DataFrame,
    col1: Optional[str] = None,
    col2: Optional[str] = None,
    *,
    cfg: Union[Config, Dict[str, Any], None] = None,
    display: Optional[List[str]] = None,
    dtype: Optional[DTypeDef] = None,
) -> Intermediate:
    # pylint: disable=too-many-arguments

    """This function is designed to deal with missing values
    There are three functions: plot_missing(df), plot_missing(df, x)
    plot_missing(df, x, y)

    Parameters
    ----------
    df
        the pandas data_frame for which plots are calculated for each column
    col1
        a valid column name of the data frame
    col2
        a valid column name of the data frame
    cfg: Union[Config, Dict[str, Any], None], default None
        When a user call plot_missing(), the created Config object will be passed to
        compute_missing().
        When a user call compute_missing() directly, if he/she wants to customize the output,
        cfg is a dictionary for configuring. If not, cfg is None and
        default values will be used for parameters.
    display: Optional[List[str]], default None
        A list containing the names of the visualizations to display. Only exist when
        a user call compute_missing() directly and want to customize the output
    dtype: str or DType or dict of str or dict of DType, default None
        Specify Data Types for designated column or all columns.
        E.g.  dtype = {"a": Continuous, "b": "Nominal"} or
        dtype = {"a": Continuous(), "b": "nominal"}
        or dtype = Continuous() or dtype = "Continuous" or dtype = Continuous()

    Examples
    --------
    >>> from dataprep.eda.missing.computation import plot_missing
    >>> import pandas as pd
    >>> df = pd.read_csv("suicide-rate.csv")
    >>> plot_missing(df, "HDI_for_year")
    >>> plot_missing(df, "HDI_for_year", "population")
    """
    suppress_warnings()

    x, y = col1, col2

    eda_frame = EDAFrame(df, dtype=dtype)

    # pylint: disable=no-else-raise
    if isinstance(cfg, dict):
        cfg = Config.from_dict(display, cfg)
    elif not cfg:
        cfg = Config()
    if x is None and y is not None:
        raise ValueError("x cannot be None while y has value")
    elif x is not None and y is None:
        ret = compute_missing_univariate(eda_frame, x, cfg)
    elif x is not None and y is not None:
        ret = compute_missing_bivariate(eda_frame, x, y, cfg)
    else:
        ret = compute_missing_nullivariate(eda_frame, cfg)

    return cast(Intermediate, ret)


def suppress_warnings() -> None:
    """
    suppress warnings for plot_missing
    """
    warnings.filterwarnings(
        "ignore",
        "scipy.cluster: The symmetric non-negative hollow observation matrix looks "
        + "suspiciously like an uncondensed distance matrix",
        category=ClusterWarning,
    )
    warnings.filterwarnings(
        "ignore", "invalid value encountered in double_scalars", category=RuntimeWarning
    )
    warnings.filterwarnings(
        "ignore",
        "invalid value encountered in true_divide",
        category=RuntimeWarning,
    )