# Source code for dataprep.clean.clean_de_stnr

"""
Clean and validate a DataFrame column containing German tax numbers (STNRs).
"""
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches
from typing import Any, Union
from typing import Optional
from operator import itemgetter

import dask.dataframe as dd
import numpy as np
import pandas as pd

from stdnum.de import stnr
from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask


def clean_de_stnr(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean German tax numbers (STNRs) type data in a DataFrame column.

    The cleaned values are written to a new column named ``<column>_clean``.

    Parameters
    ----------
        df
            A pandas or Dask DataFrame containing the data to be cleaned.
        column
            The name of the column containing data of STNR type.
        output_format
            The output format of standardized number string.
            If output_format = 'compact', return string without any separators or whitespace.
            If output_format = 'standard', return string with proper separators and whitespace.

            (default: "standard")
        inplace
           If True, delete the column containing the data that was cleaned;
           the cleaned values take its place under the name ``<column>_clean``.
           Otherwise, keep the original column.

           (default: False)
        errors
            How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input.
            - 'raise': invalid parsing will raise an exception.

            (default: 'coerce')
        progress
            If True, display a progress bar.

            (default: True)

    Returns
    -------
        The computed DataFrame including the ``<column>_clean`` column.

    Raises
    ------
        ValueError
            If ``output_format`` is neither "compact" nor "standard", or if
            ``errors`` is "raise" and a value cannot be parsed.

    Examples
    --------
    Clean a column of STNR data.

    >>> df = pd.DataFrame({
            "stnr": [
            "181/815/0815 5",
            "136695978"]
            })
    >>> clean_de_stnr(df, 'stnr')
            stnr                 stnr_clean
    0       181/815/0815 5       181/815/08155
    1       136695978            NaN
    """

    if output_format not in {"compact", "standard"}:
        raise ValueError(
            f"output_format {output_format} is invalid. " 'It needs to be "compact" or "standard".'
        )

    # convert to dask
    df = to_dask(df)

    # To clean, create a temporary column "clean_code_tup" in which every
    # cell is the one-element list returned by _format, then unpack the
    # first element of each list into the final "<column>_clean" column.
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(x, output_format, errors) for x in srs],
        meta=object,
    )

    df = df.assign(
        _temp_=df["clean_code_tup"].map(itemgetter(0)),
    )

    df = df.rename(columns={"_temp_": f"{column}_clean"})

    # The intermediate column is no longer needed once it has been unpacked.
    df = df.drop(columns=["clean_code_tup"])

    if inplace:
        # Overwrite the original column with the cleaned values and rename it,
        # so the cleaned column sits where the original column was.
        df[column] = df[f"{column}_clean"]
        df = df.drop(columns=f"{column}_clean")
        df = df.rename(columns={column: f"{column}_clean"})

    # Everything above only builds the dask graph; the work happens here.
    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df


def validate_de_stnr(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
    region: Optional[str] = None,
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate if a data cell is STNR in a DataFrame column. For each cell, return True or False.
    The region can be supplied to verify that the number is assigned in that region.

    Parameters
    ----------
    df
            A pandas or Dask DataFrame or Series containing the data to be
            validated, or a single value to validate on its own.
    column
            The name of the column to be validated. Only used when ``df`` is a
            DataFrame; if left empty, every cell of the DataFrame is validated.

            (default: "")
    region
            Specify the region that the number belongs to.

            (default: None)

    Returns
    -------
    A Series of results for a Series or a selected DataFrame column, a
    DataFrame of results when a whole DataFrame is validated, or the result
    of ``stnr.is_valid`` for a single value.
    """
    # Series: validate element-wise.
    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(stnr.is_valid, args=(region,))

    if isinstance(df, (pd.DataFrame, dd.DataFrame)):
        # A named column is validated on its own; otherwise every cell is.
        if column != "":
            return df[column].apply(stnr.is_valid, args=(region,))
        return df.applymap(lambda x: stnr.is_valid(x, region))

    # Anything else is treated as a single value.
    return stnr.is_valid(df, region)


def _format(val: Any, output_format: str = "standard", errors: str = "coerce") -> Any:
    """
    Reformat a number string with proper separators and whitespace.

    Parameters
    ----------
    val
           The value of number string. It is converted with ``str`` before
           any check is made.
    output_format
           If output_format = 'compact', return string without any separators or whitespace.
           If output_format = 'standard', return string with proper separators and whitespace.
    errors
           How to handle a value that fails validation.
           'raise' raises a ValueError, 'ignore' returns the (stringified)
           value, and any other value, including the default 'coerce',
           yields NaN.

    Returns
    -------
    A one-element list holding the formatted string, NaN, or the stringified
    input. The list is empty if ``output_format`` is not one of the two
    recognised formats.

    Raises
    ------
    ValueError
           If the value is not a valid STNR and ``errors`` is 'raise'.
    """
    val = str(val)

    # Null-like values are never validated; they always map to NaN.
    if val in NULL_VALUES:
        return [np.nan]

    if not validate_de_stnr(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return [val if errors == "ignore" else np.nan]

    if output_format == "compact":
        return [stnr.compact(val)]
    if output_format == "standard":
        return [stnr.format(val)]

    # Unrecognised output_format: keep the historical empty-list result.
    # clean_de_stnr rejects such formats before this function is reached.
    return []