# Source code for dataprep.eda.correlation

"""
    This module implements the plot_correlation(df) function.
"""

from typing import Any, Dict, List, Optional, Tuple, Union

import dask.dataframe as dd
import pandas as pd

from ..configs import Config
from ..container import Container
from ...progress_bar import ProgressBar
from .compute import compute_correlation
from .render import render_correlation

# Public API of this package: the compute and render steps imported from the
# sibling modules above, plus the plot_correlation entry point defined below.
__all__ = ["render_correlation", "compute_correlation", "plot_correlation"]


def plot_correlation(
    df: Union[pd.DataFrame, dd.DataFrame],
    col1: Optional[str] = None,
    col2: Optional[str] = None,
    *,
    value_range: Optional[Tuple[float, float]] = None,
    k: Optional[int] = None,
    config: Optional[Dict[str, Any]] = None,
    display: Optional[List[str]] = None,
    progress: bool = True,
) -> Container:
    """
    Compute and render the correlation between columns of a data frame.

    There are three ways to call this function:
    plot_correlation(df), plot_correlation(df, col1) and
    plot_correlation(df, col1, col2).
    The keyword-only parameters such as k and value_range further
    restrict what is computed.

    Parameters
    ----------
    df
        The pandas or dask data frame for which plots are calculated.
    col1
        A valid column name of the data frame.
    col2
        A valid column name of the data frame.
    value_range
        Range of value, given as a (low, high) pair.
    k
        Choose top-k element.
    config
        A dictionary for configuring the visualizations
        E.g. config={"scatter.sample_size": 5000}
    display
        A list containing the names of the visualizations to display
        E.g. display=["Pearson"]
    progress
        Enable the progress bar.

    Returns
    -------
    Container
        Wraps the rendered output together with the visual type of the
        computed intermediate and the configuration that was used.

    Examples
    --------
    >>> from dataprep.eda.correlation import plot_correlation
    >>> import pandas as pd
    >>> df = pd.read_csv("suicide-rate.csv")
    >>> plot_correlation(df)
    >>> plot_correlation(df, k=6)
    >>> plot_correlation(df, "suicides")
    >>> plot_correlation(df, "suicides", k=3)
    >>> plot_correlation(df, "suicides", value_range=(-1, 0.3))
    >>> plot_correlation(df, "suicides", value_range=(-1, 0.3), k=2)
    >>> plot_correlation(df, col1="population", col2="suicides_no")
    >>> plot_correlation(df, col1="population", col2="suicides", k=5)

    Note
    ----
    This function only supports numerical or categorical data,
    and it is better to drop None, Nan and Null value before using it
    """
    # display and config are merged into a single Config object that is
    # shared by the compute and render steps.
    cfg = Config.from_dict(display, config)

    # NOTE(review): the original layout of this block was lost; only the
    # computation is assumed to run under the progress bar -- confirm.
    with ProgressBar(minimum=1, disable=not progress):
        itmdt = compute_correlation(df, col1, col2, cfg=cfg, value_range=value_range, k=k)

    to_render = render_correlation(itmdt, cfg)

    return Container(to_render, itmdt.visual_type, cfg)