Source code for dataprep.eda.utils

"""Miscellaneous functions
"""
import logging
from math import ceil
from typing import Any, Dict, List, Optional, Tuple, Union, cast
from collections import Counter


import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
import pandas._libs.missing as libmissing
from bokeh.models import Legend, FuncTickFormatter
from bokeh.plotting import Figure
from scipy.stats import gaussian_kde as gaussian_kde_
from scipy.stats import ks_2samp as ks_2samp_
from scipy.stats import normaltest as normaltest_
from scipy.stats import skewtest as skewtest_
from .dtypes import (
    drop_null,
    Nominal,
    detect_dtype,
    is_dtype,
)


LOGGER = logging.getLogger(__name__)


def to_dask(df: Union[pd.DataFrame, dd.DataFrame]) -> dd.DataFrame:
    """Convert a dataframe to a dask dataframe."""
    if isinstance(df, dd.DataFrame):
        return df
    elif isinstance(df, dd.Series):
        return df.to_frame()

    if isinstance(df, pd.Series):
        df = df.to_frame()

    df_size = df.memory_usage(deep=True).sum()
    npartitions = ceil(df_size / 128 / 1024 / 1024)  # 128 MB partition size
    return dd.from_pandas(df, npartitions=npartitions)

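# Illustrative usage (a sketch added for documentation, not part of the original
# module): the partition count is derived from the in-memory size of the pandas
# input, targeting roughly 128 MB per partition.
# >>> pdf = pd.DataFrame({"a": range(1000)})
# >>> ddf = to_dask(pdf)
# >>> ddf.npartitions
# 1
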
def preprocess_dataframe(
    org_df: Union[pd.DataFrame, dd.DataFrame],
    used_columns: Optional[Union[List[str], List[object]]] = None,
    excluded_columns: Optional[Union[List[str], List[object]]] = None,
    detect_small_distinct: bool = True,
) -> dd.DataFrame:
    """
    Make a dask dataframe with only used_columns.
    This function will do the following:
        1. keep only used_columns.
        2. transform column names to string (avoid object column names) and rename
           duplicate column names in the form {col}_{id}.
        3. reset the index.
        4. transform object columns to string columns (note that an object column
           can contain cells of different types).
        5. transform to a dask dataframe if the input is a pandas dataframe.

    Parameters
    ----------------
    org_df: dataframe
        the original dataframe
    used_columns: optional list[str], default None
        used columns in org_df
    excluded_columns: optional list[str], default None
        excluded columns from used_columns, mainly used for geo point data processing.
    detect_small_distinct: bool, default True
        whether to detect numerical columns with small distinct values as categorical columns.
    """
    if used_columns is None:
        df = org_df.copy()
    else:
        # Process the case when used_columns are string column names,
        # but org_df column names are objects.
        used_columns_set = set(used_columns)
        used_cols_obj = set()
        for col in org_df.columns:
            if str(col) in used_columns_set or col in used_columns_set:
                used_cols_obj.add(col)
        df = org_df[used_cols_obj]

    columns = list(df.columns)
    # Resolve duplicate names in columns.
    # Duplicate names will be renamed as col_{id}.
    column_count = Counter(columns)
    current_id: Dict[Any, int] = dict()
    for i, col in enumerate(columns):
        if column_count[col] > 1:
            current_id[col] = current_id.get(col, 0) + 1
            new_col_name = f"{col}_{current_id[col]}"
        else:
            new_col_name = f"{col}"
        columns[i] = new_col_name
    df.columns = columns
    df = df.reset_index(drop=True)

    df = to_dask(df)

    # Since an object column could contain multiple types in different cells,
    # transform non-na values in object columns to string.
    # Function `_notna2str` transforms an obj to str if it is not NA.
    # The check for NA is similar to pd.isna, but will treat a list obj as
    # a scalar and return a single boolean, rather than a list of booleans.
    # Otherwise, when a cell is a tuple or list it will throw an error.
    _notna2str = lambda obj: obj if libmissing.checknull(obj) else str(obj)
    for col in df.columns:
        col_dtype = detect_dtype(df[col], detect_small_distinct=detect_small_distinct)
        if (is_dtype(col_dtype, Nominal())) and (
            (excluded_columns is None) or (col not in excluded_columns)
        ):
            df[col] = df[col].apply(_notna2str, meta=(col, "object"))
    return df

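# Illustrative usage (a sketch, not part of the original module): duplicate
# column names are renamed to "{col}_{id}" and the result is a dask dataframe.
# >>> pdf = pd.DataFrame([[1, "a"], [2, None]])
# >>> pdf.columns = ["x", "x"]
# >>> ddf = preprocess_dataframe(pdf)
# >>> list(ddf.columns)
# ['x_1', 'x_2']
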
def sample_n(arr: np.ndarray, n: int) -> np.ndarray:  # pylint: disable=C0103
    """Sample n values uniformly from the range of the `arr`,
    not from the distribution of `arr`'s elems."""
    if len(arr) <= n:
        return arr

    subsel = np.linspace(0, len(arr) - 1, n)
    subsel = np.floor(subsel).astype(int)
    return arr[subsel]

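# Illustrative usage (a sketch, not part of the original module): indices are
# spaced evenly over the array's index range, so the head and tail are kept.
# >>> sample_n(np.arange(10), 3)
# array([0, 4, 9])
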
def relocate_legend(fig: Figure, loc: str) -> Figure:
    """Relocate legend(s) from center to `loc`."""
    remains = []
    targets = []
    for layout in fig.center:
        if isinstance(layout, Legend):
            targets.append(layout)
        else:
            remains.append(layout)
    fig.center = remains

    for layout in targets:
        fig.add_layout(layout, loc)

    return fig

def cut_long_name(name: str, max_len: int = 18) -> str:
    """If the name is longer than `max_len`, keep its first 13 and last 3
    characters around an ellipsis ("...")."""
    # Bug 136 Fixed
    name = str(name)
    cut_name = f"{name[:13]}...{name[len(name)-3:]}" if len(name) > max_len else name
    return cut_name

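# Illustrative usage (a sketch, not part of the original module): long names are
# shortened around an ellipsis, short names pass through unchanged.
# >>> cut_long_name("a_very_long_column_name")
# 'a_very_long_c...ame'
# >>> cut_long_name("short_name")
# 'short_name'
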
def fuse_missing_perc(name: str, perc: float) -> str:
    """Append (x.y%) to the name if `perc` is not 0."""
    if perc == 0:
        return name

    return f"{name} ({perc:.1%})"

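# Illustrative usage (a sketch, not part of the original module): `perc` is a
# fraction, rendered as a percentage with one decimal place.
# >>> fuse_missing_perc("country", 0.025)
# 'country (2.5%)'
# >>> fuse_missing_perc("country", 0)
# 'country'
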
# Dictionary for mapping the time unit to its formatting. Each entry is of the
# form unit: (unit code for pd.Grouper freq parameter, pandas to_period strftime
# formatting for line charts, pandas to_period strftime formatting for box plots,
# label format).
DTMAP = {
    "year": ("Y", "%Y", "%Y", "Year"),
    "quarter": ("Q", "Q%q %Y", "Q%q %Y", "Quarter"),
    "month": ("M", "%B %Y", "%b %Y", "Month"),
    "week": ("W-SAT", "%d %B, %Y", "%d %b, %Y", "Week of"),
    "day": ("D", "%d %B, %Y", "%d %b, %Y", "Date"),
    "hour": ("H", "%d %B, %Y, %I %p", "%d %b, %Y, %I %p", "Hour"),
    "minute": ("T", "%d %B, %Y, %I:%M %p", "%d %b, %Y, %I:%M %p", "Minute"),
    "second": ("S", "%d %B, %Y, %I:%M:%S %p", "%d %b, %Y, %I:%M:%S %p", "Second"),
}


def _get_timeunit(min_time: pd.Timestamp, max_time: pd.Timestamp, dflt: int) -> str:
    """Auxiliary function to find an appropriate time unit. Finds the time unit
    such that the resulting number of time units is closest to dflt."""

    dt_secs = {
        "year": 60 * 60 * 24 * 365,
        "quarter": 60 * 60 * 24 * 91,
        "month": 60 * 60 * 24 * 30,
        "week": 60 * 60 * 24 * 7,
        "day": 60 * 60 * 24,
        "hour": 60 * 60,
        "minute": 60,
        "second": 1,
    }

    time_rng_secs = (max_time - min_time).total_seconds()
    prev_bin_cnt, prev_unit = 0, "year"
    for unit, secs_in_unit in dt_secs.items():
        cur_bin_cnt = time_rng_secs / secs_in_unit
        if abs(prev_bin_cnt - dflt) < abs(cur_bin_cnt - dflt):
            return prev_unit
        prev_bin_cnt = cur_bin_cnt
        prev_unit = unit

    return prev_unit


def _calc_box_stats(grp_srs: dd.Series, grp: str, dlyd: bool = False) -> pd.DataFrame:
    """
    Auxiliary function to calculate the Tukey box plot statistics.
    dlyd indicates whether this function is called while dask is computing
    in parallel (dask.delayed).
    """
    stats: Dict[str, Any] = dict()

    try:  # this is a bad fix for the problem of when there is no data passed to this function
        if dlyd:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]), 3)
        else:
            qntls = np.round(grp_srs.quantile([0.25, 0.50, 0.75]).compute(), 3)
        stats["q1"], stats["q2"], stats["q3"] = qntls[0.25], qntls[0.50], qntls[0.75]
    except ValueError:
        stats["q1"], stats["q2"], stats["q3"] = np.nan, np.nan, np.nan

    iqr = stats["q3"] - stats["q1"]
    stats["lw"] = grp_srs[grp_srs >= stats["q1"] - 1.5 * iqr].min()
    stats["uw"] = grp_srs[grp_srs <= stats["q3"] + 1.5 * iqr].max()
    if not dlyd:
        stats["lw"], stats["uw"] = dask.compute(stats["lw"], stats["uw"])

    otlrs = grp_srs[(grp_srs < stats["lw"]) | (grp_srs > stats["uw"])]
    if len(otlrs) > 100:  # sample 100 outliers
        otlrs = otlrs.sample(frac=100 / len(otlrs))
    stats["otlrs"] = list(otlrs) if dlyd else list(otlrs.compute())

    return pd.DataFrame({grp: stats})


def _calc_box_otlrs(df: dd.DataFrame) -> Tuple[List[str], List[float]]:
    """
    Calculate the outliers for a box plot
    """
    outx: List[str] = []  # list for the outlier groups
    outy: List[float] = []  # list for the outlier values
    for ind in df.index:
        otlrs = df.loc[ind]["otlrs"]
        outx = outx + [df.loc[ind]["grp"]] * len(otlrs)
        outy = outy + otlrs

    return outx, outy


def _calc_line_dt(
    df: dd.DataFrame,
    unit: str,
    agg: Optional[str] = None,
    ngroups: Optional[int] = None,
    largest: Optional[bool] = None,
) -> Union[
    Tuple[pd.DataFrame, Dict[str, int], str],
    Tuple[pd.DataFrame, str, float],
    Tuple[pd.DataFrame, str],
]:
    """
    Calculate a line or multiline chart with date on the x axis. If df contains
    one datetime column, it will make a line chart of the frequency of values.
    If df contains a datetime and categorical column, it will compute the
    frequency of each categorical value in each time group.
    If df contains a datetime and numerical column, it will compute the aggregate
    of the numerical column grouped by the time groups. If df contains a datetime,
    categorical, and numerical column, it will compute the aggregate of the
    numerical column for values in the categorical column grouped by time.

    Parameters
    ----------
    df
        A dataframe
    unit
        The unit of time over which to group the values
    agg
        Aggregate to use for the numerical column
    ngroups
        Number of groups for the categorical column
    largest
        Use the largest or smallest groups in the categorical column
    """
    # pylint: disable=too-many-locals

    x = df.columns[0]  # time column
    unit = _get_timeunit(df[x].min(), df[x].max(), 100) if unit == "auto" else unit
    if unit not in DTMAP.keys():
        raise ValueError
    grouper = pd.Grouper(key=x, freq=DTMAP[unit][0])  # for grouping the time values

    # multiline charts
    if ngroups and largest:
        hist_dict: Dict[str, Tuple[np.ndarray, np.ndarray, List[str]]] = dict()
        hist_lst: List[Tuple[np.ndarray, np.ndarray, List[str]]] = list()
        agg = "freq" if agg is None else agg  # default agg if unspecified, for notational concision

        # categorical column for grouping over; each resulting group is a line in the chart
        grpby_col = df.columns[1] if len(df.columns) == 2 else df.columns[2]
        df, grp_cnt_stats, largest_grps = _calc_groups(df, grpby_col, ngroups, largest)
        groups = df.groupby([grpby_col])

        for grp in largest_grps:
            srs = groups.get_group(grp)
            # calculate the frequencies or aggregate value in each time group
            if len(df.columns) == 3:
                dfr = srs.groupby(grouper)[df.columns[1]].agg(agg).reset_index()
            else:
                dfr = srs[x].to_frame().groupby(grouper).size().reset_index()
            dfr.columns = [x, agg]
            # if grouping by week, make the label for the week the beginning Sunday
            dfr[x] = dfr[x] - pd.to_timedelta(6, unit="d") if unit == "week" else dfr[x]
            # format the label
            dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])
            hist_lst.append((list(dfr[agg]), list(dfr[x]), list(dfr["lbl"])))

        hist_lst = dask.compute(*hist_lst)
        for elem in zip(largest_grps, hist_lst):
            hist_dict[elem[0]] = elem[1]
        return hist_dict, grp_cnt_stats, DTMAP[unit][3]

    # single line charts
    if agg is None:  # frequency of datetime column
        miss_pct = round(df[x].isna().sum() / len(df) * 100, 1)
        dfr = drop_null(df).groupby(grouper).size().reset_index()
        dfr.columns = [x, "freq"]
        dfr["pct"] = dfr["freq"] / len(df) * 100
    else:  # aggregate over a second column
        dfr = df.groupby(grouper)[df.columns[1]].agg(agg).reset_index()
        dfr.columns = [x, agg]
    dfr[x] = dfr[x] - pd.to_timedelta(6, unit="d") if unit == "week" else dfr[x]
    dfr["lbl"] = dfr[x].dt.to_period("S").dt.strftime(DTMAP[unit][1])

    return (dfr, DTMAP[unit][3], miss_pct) if agg is None else (dfr, DTMAP[unit][3])


def _calc_groups(
    df: dd.DataFrame, x: str, ngroups: int, largest: bool = True
) -> Tuple[dd.DataFrame, Dict[str, int], List[str]]:
    """Auxiliary function to parse the dataframe to consist of only the
    groups with the largest counts.
""" # group count statistics to inform the user of the sampled output grp_cnt_stats: Dict[str, int] = dict() srs = df.groupby(x).size() srs_lrgst = srs.nlargest(n=ngroups) if largest else srs.nsmallest(n=ngroups) try: largest_grps = list(srs_lrgst.index.compute()) grp_cnt_stats[f"{x}_ttl"] = len(srs.index.compute()) except AttributeError: largest_grps = list(srs_lrgst.index) grp_cnt_stats[f"{x}_ttl"] = len(srs.index) df = df[df[x].isin(largest_grps)] grp_cnt_stats[f"{x}_shw"] = len(largest_grps) return df, grp_cnt_stats, largest_grps @dask.delayed(name="scipy-normaltest", pure=True, nout=2) # pylint: disable=no-value-for-parameter def normaltest(arr: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: """Delayed version of scipy normaltest. Due to the dask version will trigger a compute.""" return cast(Tuple[np.ndarray, np.ndarray], normaltest_(arr)) @dask.delayed(name="scipy-ks_2samp", pure=True, nout=2) # pylint: disable=no-value-for-parameter def ks_2samp(data1: np.ndarray, data2: np.ndarray) -> Tuple[float, float]: """Delayed version of scipy ks_2samp.""" return cast(Tuple[float, float], ks_2samp_(data1, data2)) @dask.delayed( # pylint: disable=no-value-for-parameter name="scipy-gaussian_kde", pure=True, nout=2 ) def gaussian_kde(arr: np.ndarray) -> Tuple[float, float]: """Delayed version of scipy gaussian_kde.""" return cast(Tuple[np.ndarray, np.ndarray], gaussian_kde_(arr)) @dask.delayed(name="scipy-skewtest", pure=True, nout=2) # pylint: disable=no-value-for-parameter def skewtest(arr: np.ndarray) -> Tuple[float, float]: """Delayed version of scipy skewtest.""" return cast(Tuple[float, float], skewtest_(arr))
def tweak_figure(
    fig: Figure,
    ptype: Optional[str] = None,
    show_yticks: bool = False,
    max_lbl_len: int = 15,
) -> None:
    """
    Set some common attributes for a figure
    """
    fig.axis.major_label_text_font_size = "9pt"
    fig.title.text_font_size = "10pt"
    fig.axis.minor_tick_line_color = "white"
    if ptype in ["pie", "qq", "heatmap"]:
        fig.ygrid.grid_line_color = None
    if ptype in ["bar", "pie", "hist", "kde", "qq", "heatmap", "line"]:
        fig.xgrid.grid_line_color = None
    if ptype in ["bar", "hist", "line"] and not show_yticks:
        fig.ygrid.grid_line_color = None
        fig.yaxis.major_label_text_font_size = "0pt"
        fig.yaxis.major_tick_line_color = None
    if ptype in ["bar", "nested", "stacked", "heatmap", "box"]:
        fig.xaxis.major_label_orientation = np.pi / 3
        fig.xaxis.formatter = FuncTickFormatter(
            code="""
            if (tick.length > %d) return tick.substring(0, %d-2) + '...';
            else return tick;
        """
            % (max_lbl_len, max_lbl_len)
        )
    if ptype in ["nested", "stacked", "box"]:
        fig.xgrid.grid_line_color = None
    if ptype in ["nested", "stacked"]:
        fig.y_range.start = 0
        fig.x_range.range_padding = 0.03
    if ptype in ["line", "boxnum"]:
        fig.min_border_right = 20
        fig.xaxis.major_label_standoff = 7
        fig.xaxis.major_label_orientation = 0
        fig.xaxis.major_tick_line_color = None

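# Illustrative usage (a sketch, not part of the original module; the figure and
# plot type below are made up): styling is applied in place on a bokeh figure.
# >>> from bokeh.plotting import figure
# >>> fig = figure(title="distribution")
# >>> tweak_figure(fig, ptype="hist", show_yticks=False)
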
def _format_ticks(ticks: List[float]) -> List[str]:
    """
    Format the tick values
    """
    formatted_ticks = []
    for tick in ticks:
        # format the tick values
        before, after = f"{tick:e}".split("e")
        if float(after) > 1e15 or abs(tick) < 1e4:
            formatted_ticks.append(str(tick))
            continue
        mod_exp = int(after) % 3
        factor = 1 if mod_exp == 0 else 10 if mod_exp == 1 else 100
        value = np.round(float(before) * factor, len(str(before)))
        value = int(value) if value.is_integer() else value
        if abs(tick) >= 1e12:
            formatted_ticks.append(str(value) + "T")
        elif abs(tick) >= 1e9:
            formatted_ticks.append(str(value) + "B")
        elif abs(tick) >= 1e6:
            formatted_ticks.append(str(value) + "M")
        elif abs(tick) >= 1e4:
            formatted_ticks.append(str(value) + "K")

    return formatted_ticks


def _format_axis(fig: Figure, minv: int, maxv: int, axis: str) -> None:
    """
    Format the axis ticks
    """
    # pylint: disable=too-many-locals
    # divisor for 5 ticks (5 results in ticks that are too close together)
    divisor = 4.5
    # interval
    if np.isinf(minv) or np.isinf(maxv):
        gap = 1.0
    else:
        gap = (maxv - minv) / divisor
    # get exponent from scientific notation
    _, after = f"{gap:.0e}".split("e")
    # round to this amount
    round_to = -1 * int(after)
    # round the first x tick
    minv = np.round(minv, round_to)
    # round value between ticks
    gap = np.round(gap, round_to)

    # make the tick values
    ticks = [float(minv)]
    if not np.isinf(maxv):
        while max(ticks) + gap < maxv:
            ticks.append(max(ticks) + gap)
    ticks = np.round(ticks, round_to)
    ticks = [int(tick) if tick.is_integer() else tick for tick in ticks]
    formatted_ticks = _format_ticks(ticks)

    if axis == "x":
        fig.xgrid.ticker = ticks
        fig.xaxis.ticker = ticks
        fig.xaxis.major_label_overrides = dict(zip(ticks, formatted_ticks))
        fig.xaxis.major_label_text_font_size = "10pt"
        fig.xaxis.major_label_standoff = 7
        # fig.xaxis.major_label_orientation = 0
        fig.xaxis.major_tick_line_color = None
    elif axis == "y":
        fig.ygrid.ticker = ticks
        fig.yaxis.ticker = ticks
        fig.yaxis.major_label_overrides = dict(zip(ticks, formatted_ticks))
        fig.yaxis.major_label_text_font_size = "10pt"
        fig.yaxis.major_label_standoff = 5


def _format_bin_intervals(bins_arr: np.ndarray) -> List[str]:
    """
    Auxiliary function to format bin intervals in a histogram
    """
    bins_arr = np.round(bins_arr, 3)
    bins_arr = [int(val) if float(val).is_integer() else val for val in bins_arr]
    intervals = [f"[{bins_arr[i]}, {bins_arr[i + 1]})" for i in range(len(bins_arr) - 2)]
    intervals.append(f"[{bins_arr[-2]},{bins_arr[-1]}]")
    return intervals

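# Illustrative usage (a sketch, not part of the original module): all intervals
# are half-open except the last, which is closed on both ends.
# >>> _format_bin_intervals(np.array([0, 1.5, 3]))
# ['[0, 1.5)', '[1.5,3]']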