# Source code for dataprep.clean.clean_headers

"""
Clean and standardize column headers for a DataFrame.
"""
import re
from typing import Any, Dict, List, Optional, Union
from unicodedata import normalize

import dask.dataframe as dd
import numpy as np
import pandas as pd

# Header values regarded as missing: _convert_case() swaps them for a
# placeholder name and _replace_values() returns them untouched.
NULL_VALUES = {None, "", np.nan}

# Accepted values for the ``case`` argument of clean_headers().
CASE_STYLES = {
    # Styles whose words are split with _split_strip_string()
    "snake",
    "kebab",
    "const",
    "camel",
    "pascal",
    # Styles whose words are split with _split_string()
    "lower",
    "upper",
    "title",
    "sentence",
}


def clean_headers(
    df: Union[pd.DataFrame, dd.DataFrame],
    case: str = "snake",
    replace: Optional[Dict[str, str]] = None,
    remove_accents: bool = True,
    report: bool = True,
) -> pd.DataFrame:
    """
    Function to clean column headers (column names).

    Read more in the :ref:`User Guide <clean_headers_user_guide>`.

    Parameters
    ----------
    df
        Dataframe from which column names are to be cleaned.
    case
        The desired case style of the column name.
            - 'snake': 'column_name'
            - 'kebab': 'column-name'
            - 'camel': 'columnName'
            - 'pascal': 'ColumnName'
            - 'const': 'COLUMN_NAME'
            - 'sentence': 'Column name'
            - 'title': 'Column Name'
            - 'lower': 'column name'
            - 'upper': 'COLUMN NAME'

        (default: 'snake')
    replace
        Values to replace in the column names.
            - {'old_value': 'new_value'}

        (default: None)
    remove_accents
        If True, strip accents from the column names.

        (default: True)
    report
        If True, output the summary report. Otherwise, no report is outputted.

        (default: True)

    Returns
    -------
    The renamed DataFrame with cleaned, de-duplicated column headers.

    Raises
    ------
    ValueError
        If ``case`` is not one of the supported case styles.

    Examples
    --------
    Clean column names by converting the names to camel case style, removing accents,
    and correcting a misspelling.

    >>> df = pd.DataFrame({'FirstNom': ['Philip', 'Turanga'], 'lastName': ['Fry', 'Leela'], \
'Téléphone': ['555-234-5678', '(604) 111-2335']})
    >>> clean_headers(df, case='camel', replace={'Nom': 'Name'})
    Column Headers Cleaning Report:
        2 values cleaned (66.67%)
      firstName lastName       telephone
    0    Philip      Fry    555-234-5678
    1   Turanga    Leela  (604) 111-2335
    """
    if case not in CASE_STYLES:
        # Sort the styles so the message does not depend on set iteration order
        raise ValueError(
            f"case {case} is invalid, it needs to be one of {', '.join(sorted(CASE_STYLES))}"
        )

    # Store original column names for creating cleaning report
    orig_columns = df.columns.astype(str).tolist()

    # Cleaning pipeline: user replacements -> accent stripping -> case conversion
    if replace:
        df = df.rename(columns=lambda col: _replace_values(col, replace))

    if remove_accents:
        df = df.rename(columns=_remove_accents)

    df = df.rename(columns=lambda col: _convert_case(col, case))

    # Case conversion can map distinct headers onto the same name
    df.columns = _rename_duplicates(df.columns, case)

    # Count the number of changed column names
    new_columns = df.columns.astype(str).tolist()
    stats = {"cleaned": sum(new != old for new, old in zip(new_columns, orig_columns))}

    # Output a report describing the result of clean_headers
    if report:
        _create_report(stats, len(df.columns))

    return df


def _convert_case(name: Any, case: str) -> Any:
    """
    Convert case style of a column name.

    A name found in NULL_VALUES is first replaced by the placeholder "header".
    The name is then converted to a string and split into words: the
    'snake', 'kebab', 'camel', 'pascal' and 'const' styles use
    _split_strip_string(), every other style uses _split_string(). The words
    are finally re-joined according to the requested style.

    Parameters
    ----------
    name
        Column name.
    case
        The desired case style of the column name.

    Returns
    -------
    The converted column name. If ``case`` is not a recognised style the name
    is returned without case conversion.
    """
    if name in NULL_VALUES:
        name = "header"

    if case in {"snake", "kebab", "camel", "pascal", "const"}:
        words = _split_strip_string(str(name))
    else:
        words = _split_string(str(name))

    if case == "snake":
        name = "_".join(words).lower()
    elif case == "kebab":
        name = "-".join(words).lower()
    elif case == "camel":
        # A name without any word characters (e.g. "__") splits into no words;
        # yield an empty name like the other styles instead of raising IndexError.
        if words:
            name = words[0].lower() + "".join(w.capitalize() for w in words[1:])
        else:
            name = ""
    elif case == "pascal":
        # str.capitalize() also lower-cases the rest of each word ("HTTP" -> "Http")
        name = "".join(w.capitalize() for w in words)
    elif case == "const":
        name = "_".join(words).upper()
    elif case == "sentence":
        name = " ".join(words).capitalize()
    elif case == "title":
        name = " ".join(w.capitalize() for w in words)
    elif case == "lower":
        name = " ".join(words).lower()
    elif case == "upper":
        name = " ".join(words).upper()

    return name


def _split_strip_string(string: str) -> List[str]:
    """
    Break a column name into its words, dropping common punctuation.

    The listed punctuation characters (including "_" and "-") act as word
    separators, quote characters are deleted, and a new word is started
    before every run of capitals, run of digits, or run of non-word
    characters. Non-word characters outside the listed punctuation are kept.
    """
    # Listed punctuation separates words
    spaced = re.sub(r"[!()*+\,\-./:;<=>?[\]^_{|}~]", " ", string)
    # Quotes vanish without splitting the surrounding word ("it's" -> "its")
    unquoted = re.sub(r"[\'\"\`]", "", spaced)

    # Open a new word before each run of capitals, digits or non-word characters
    marked = re.sub(r"([A-Z]+|[0-9]+|\W+)", r" \1", unquoted)
    # Detach a capitalised word from a preceding run of capitals ("HTTPResponse")
    marked = re.sub(r"([A-Z][a-z]+)", r" \1", marked)

    return marked.split()


def _split_string(string: str) -> List[str]:
    """
    Split the string into separate words.

    Hyphens and underscores are treated as spaces, and a word break is
    inserted before every capital letter that is followed by lowercase
    letters ("firstName" -> ["first", "Name"]). All other characters,
    including punctuation, are kept. A run of capitals that is not followed
    by lowercase letters is not separated from the text before it
    ("myHTTP" -> ["myHTTP"]).
    """
    spaced = re.sub(r"[\-_]", " ", string)

    # The former inner pass re.sub(r"([A-Z]+)", r"\1", ...) replaced each match
    # with itself and therefore never changed the string; it has been dropped.
    return re.sub(r"([A-Z][a-z]+)", r" \1", spaced).split()


def _replace_values(name: Any, mapping: Dict[str, str]) -> Any:
    """
    Substitute values inside a column name.

    Each key of ``mapping`` is passed to ``re.sub`` as a pattern and matched
    case-insensitively; the substitutions are applied one after another in
    the mapping's iteration order.

    Parameters
    ----------
    name
        Column name. Names found in NULL_VALUES are returned unchanged.
    mapping
        Maps old values in the column name to the new values.

    Returns
    -------
    The column name as a string with the replacements applied, or the
    original object when it is a null value.
    """
    if name in NULL_VALUES:
        return name

    result = str(name)
    for pattern, substitute in mapping.items():
        if pattern.isalnum() and substitute.isalnum():
            replacement = f"{substitute}"
        else:
            # Surround the new value with underscores so that _convert_case()
            # later treats it as a word of its own
            replacement = f"_{substitute}_"
        result = re.sub(f"{pattern}", replacement, result, flags=re.IGNORECASE)

    return result


def _remove_accents(name: Any) -> Any:
    """
    Strip accents from a column name.

    The name is canonically decomposed (NFD), every combining mark is
    dropped, and the result is recomposed (NFC). Characters that are not
    accents -- for example "ß", "Ł" or non-Latin scripts -- are preserved
    rather than deleted.

    Parameters
    ----------
    name
        Column name. Non-string names are returned unchanged.
    """
    from unicodedata import combining

    if not isinstance(name, str):
        return name

    decomposed = normalize("NFD", name)
    # combining() is non-zero for combining marks such as the accents split off by NFD
    stripped = "".join(char for char in decomposed if not combining(char))

    # Recompose so scripts that NFD breaks apart (e.g. Hangul) are not left decomposed
    return normalize("NFC", stripped)


def _rename_duplicates(names: pd.Index, case: str) -> Any:
    """
    Rename duplicated column names to append a number at the end.

    The first occurrence of a name is kept as is; each later occurrence gets
    a numeric suffix, joined with the separator that matches the case style
    ("_" for snake/const, "" for camel/pascal, "-" for kebab, " " otherwise).
    If the suffixed name is already in use, the number is increased until the
    name is unique.

    Parameters
    ----------
    names
        Column names, possibly containing duplicates.
    case
        The case style of the column names.

    Returns
    -------
    A list of column names without duplicates, in the original order.
    """
    separators = {"snake": "_", "const": "_", "camel": "", "pascal": "", "kebab": "-"}
    sep = separators.get(case, " ")

    names = list(names)
    # Names already in use; seeded with the input so generated names never
    # collide with an existing column (e.g. ["a", "a", "a_1"])
    taken = set(names)
    counts: Dict[Any, int] = {}

    for i, col in enumerate(names):
        suffix = counts.get(col, 0)
        if suffix > 0:
            candidate = f"{col}{sep}{suffix}"
            while candidate in taken:
                suffix += 1
                candidate = f"{col}{sep}{suffix}"
            names[i] = candidate
            taken.add(candidate)
        counts[col] = suffix + 1

    return names


def _create_report(stats: Dict[str, int], ncols: int) -> None:
    """
    Print a summary of the header cleaning.

    Parameters
    ----------
    stats
        Cleaning statistics; the "cleaned" entry holds the number of changed headers.
    ncols
        Total number of columns, used to compute the percentage of cleaned headers.
    """
    print("Column Headers Cleaning Report:")

    num_cleaned = stats["cleaned"]
    if not num_cleaned > 0:
        # Nothing was changed, so only the title line is printed
        return

    percent_cleaned = round(num_cleaned / ncols * 100, 2)
    print(f"\t{num_cleaned} values cleaned ({percent_cleaned}%)")