# Source code for dataprep.clean.clean_ip

"""
Clean and validate a DataFrame column containing IP addresses.
"""
from ipaddress import ip_address
from operator import itemgetter
from typing import Any, Union

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd

from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, create_report_new, to_dask


def clean_ip(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    input_format: str = "auto",
    output_format: str = "compressed",
    inplace: bool = False,
    errors: str = "coerce",
    report: bool = True,
    progress: bool = True,
) -> Union[pd.DataFrame, dd.DataFrame]:
    """
    Clean and standardize IP addresses.

    Read more in the :ref:`User Guide <ip_userguide>`.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing IP addresses.
    input_format
        The input format of the IP addresses.
            - 'auto': parse both ipv4 and ipv6 addresses.
            - 'ipv4': only parse ipv4 addresses.
            - 'ipv6': only parse ipv6 addresses.

        (default: 'auto')
    output_format
        The desired output format of the IP addresses.
            - 'compressed': compressed representation ('12.3.4.5')
            - 'full': full representation ('0012.0003.0004.0005')
            - 'binary': binary representation ('00001100000000110000010000000101')
            - 'hexa': hexadecimal representation ('0xc030405')
            - 'integer': integer representation (201524229)
            - 'packed': packed binary representation (big-endian, a bytes object)

        (default: 'compressed')
    inplace
        If True, drop the original ``column`` from the result so that only the
        cleaned ``<column>_clean`` column remains. Otherwise, keep both.

        (default: False)
    errors
        How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input.
            - 'raise': invalid parsing will raise an exception.

        (default: 'coerce')
    report
        If True, output the summary report. Otherwise, no report is outputted.

        (default: True)
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    The computed DataFrame (as returned by ``dask.compute``) holding the cleaned
    values in a new column named ``<column>_clean``.

    Raises
    ------
    ValueError
        If ``df``, ``column``, ``input_format``, ``output_format``, ``inplace``,
        ``report`` or ``errors`` has an invalid type or value, or, when
        ``errors='raise'``, if a value cannot be parsed as an IP address.

    Examples
    --------

    >>> df = pd.DataFrame({'ip': ['2001:0db8:85a3:0000:0000:8a2e:0370:7334', '233.5.6.000']})
    >>> clean_ip(df, 'ip')
    IP Cleaning Report:
        2 values cleaned (100.0%)
    Result contains 2 (100.0%) values in the correct format and 0 null values (0.0%)
                                            ip                      ip_clean
    0  2001:0db8:85a3:0000:0000:8a2e:0370:7334  2001:db8:85a3::8a2e:370:7334
    1                              233.5.6.000                     233.5.6.0

    Note: on Python versions whose ``ipaddress`` module rejects leading zeros in
    IPv4 octets (3.9.5 and later), ``'233.5.6.000'`` does not parse and is
    handled according to ``errors`` instead.
    """
    # pylint: disable=too-many-arguments

    # check if the parameters are of correct processing types and values
    if not isinstance(df, (pd.DataFrame, dd.DataFrame)):
        raise ValueError("df is invalid, it needs to be a pandas or Dask DataFrame")

    if not isinstance(column, str):
        raise ValueError(f"column {column} is invalid")

    if input_format not in {"ipv4", "ipv6", "auto"}:
        raise ValueError(
            f'input_format {input_format} is invalid, it needs to be "ipv4", "ipv6" or "auto"'
        )

    if output_format not in {"compressed", "full", "binary", "hexa", "integer", "packed"}:
        raise ValueError(
            f'output_format {output_format} is invalid, it needs to be "compressed", "full", '
            '"binary", "hexa", "integer" or "packed"'
        )

    if not isinstance(inplace, bool):
        raise ValueError(f"inplace {inplace} is invalid, it needs to be True or False")

    if not isinstance(report, bool):
        raise ValueError(f"report {report} is invalid, it needs to be True or False")

    if errors not in {"coerce", "ignore", "raise"}:
        raise ValueError(f'errors {errors} is invalid, it needs to be "coerce", "ignore", "raise"')

    # convert to dask
    df = to_dask(df)

    # To clean, create a new column "clean_code_tup" which contains
    # the cleaned values and code indicating how the initial value was
    # changed in a tuple. Then split the column of tuples and count the
    # amount of different codes to produce the report.
    #
    # `assign` is used instead of item assignment on purpose: item assignment
    # mutates the Dask DataFrame object in place, so if `to_dask` hands back the
    # caller's own Dask DataFrame (presumably the case for Dask input -- confirm
    # against `to_dask`), the helper column would leak into the caller's frame.
    df = df.assign(
        clean_code_tup=df[column].map_partitions(
            lambda srs: [_format_ip(x, input_format, output_format, errors) for x in srs],
            meta=object,
        )
    )
    df = df.assign(
        _temp_=df["clean_code_tup"].map(itemgetter(0)),
        _code_=df["clean_code_tup"].map(itemgetter(1)),
    )
    df = df.rename(columns={"_temp_": f"{column}_clean"})

    # counts of codes indicating how values were changed
    stats = df["_code_"].value_counts(sort=False)
    df = df.drop(columns=["clean_code_tup", "_code_"])

    if inplace:
        df = df.drop(columns=column)

    # evaluate the cleaned frame and the code counts in a single pass
    with ProgressBar(minimum=1, disable=not progress):
        df, stats = dask.compute(df, stats)

    # output a report describing the result of clean_ip
    if report:
        create_report_new("IP", stats, errors)

    return df


def validate_ip(x: Union[str, pd.Series], input_format: str = "auto") -> Union[bool, pd.Series]:
    """
    Validate IP addresses.

    Read more in the :ref:`User Guide <ip_userguide>`.

    Parameters
    ----------
    x
        pandas Series of IP addresses or a str ip address value
    input_format
        The IP address format to validate.
            - 'auto': validate both ipv4 and ipv6 addresses.
            - 'ipv4': only validate ipv4 addresses.
            - 'ipv6': only validate ipv6 addresses.

        (default: 'auto')

    Returns
    -------
    A single bool when ``x`` is a scalar value, or the result of applying the
    check to every element when ``x`` is a pandas Series.

    Examples
    --------

    >>> validate_ip('fdf8:f53b:82e4::53')
    True
    >>> df = pd.DataFrame({'ip': ['fdf8:f53b:82e4::53', None]})
    >>> validate_ip(df['ip'])
    0     True
    1    False
    Name: ip, dtype: bool
    """
    # scalar input: a single check (clean=False makes _check_ip return a bool)
    if not isinstance(x, pd.Series):
        return _check_ip(x, input_format, False)

    # Series input: run the same check element-wise
    return x.apply(_check_ip, args=(input_format, False))


def _format_ip(val: Any, input_format: str, output_format: str, errors: str) -> Any:
    """
    Convert ``val`` into the requested IP representation when it can be parsed.

    Returns a 2-tuple ``(value, code)``. The code records what happened to the
    input value:
        0 := the value is null
        1 := the value could not be parsed
        2 := the value is cleaned and the cleaned value is DIFFERENT than the input value
        3 := the value is cleaned and is THE SAME as the input value (no transformation)

    Raises ``ValueError`` when the value cannot be parsed and ``errors`` is "raise".
    """
    # pylint: disable=too-many-branches
    parsed, outcome = _check_ip(val, input_format, True)

    # null input: nothing to format
    if outcome == "null":
        return np.nan, 0

    # unparsable input: raise, echo the input back, or coerce to NaN
    if outcome == "unknown":
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        fallback = val if errors == "ignore" else np.nan
        return fallback, 1

    if output_format == "compressed":
        # shortest textual form (for ipv6, runs of zeros collapse to a double colon)
        formatted = parsed.compressed
    elif output_format == "hexa":
        # hexadecimal form of the integer value; contains no dots or colons
        formatted = hex(int(parsed))
    elif output_format == "binary":
        # zero-padded bit string: 32 bits for ipv4, 128 bits otherwise
        bit_width = 32 if parsed.version == 4 else 128
        formatted = format(int(parsed), f"0{bit_width}b")
    elif output_format == "integer":
        # plain integer value of the address
        formatted = int(parsed)
    elif output_format == "packed":
        # packed binary format (big-endian)
        formatted = parsed.packed
    else:
        # full representation: every group left-padded with zeros to 4 characters
        separator = "." if parsed.version == 4 else ":"
        groups = parsed.exploded.split(separator)
        formatted = separator.join(group.rjust(4, "0") for group in groups)

    code = 2 if formatted != val else 3
    return formatted, code


def _check_ip(val: Any, input_format: str, clean: bool) -> Any:
    """
    Function to check whether a value is valid ip address
    """
    try:
        if val in NULL_VALUES:
            return (None, "null") if clean else False

        address = ip_address(val)
        vers = address.version

        if vers == 4 and input_format != "ipv6" or vers == 6 and input_format != "ipv4":
            return (address, "success") if clean else True
        return (None, "unknown") if clean else False

    except (TypeError, ValueError):
        return (None, "unknown") if clean else False


# def report_ip(nrows: int, errors: str, column: str) -> None:
#     """
#     This function displays the stats report
#     """
#     correct_format = (
#         STATS["correct_format"] - 1 if (STATS["first_val"] == 100) else STATS["correct_format"]
#     )
#     correct_format_percentage = (correct_format / nrows) * 100

#     incorrect_format = (
#     STATS["incorrect_format"] - 1 if (STATS["first_val"] == 200) else STATS["incorrect_format"]
#     )
#     incorrect_format_percentage = (incorrect_format / nrows) * 100

#     set_to = "NaN" if (errors == "coerce") else "their original values"
#     result_null = "null values" if (errors == "coerce") else "null / not parsable values"
#     result = (
#         f"Result contains {correct_format} "
#         f"({(correct_format / nrows) * 100 :.2f} %) rows  in correct format(stored in column "\
#         f"`{column}_transformed`) and {incorrect_format} {result_null}"
#         f"({(incorrect_format / nrows) * 100:.2f} %)."
#     )

#     print(
#         f"""
# IP address cleaning report:
#         {correct_format} values parsed ({correct_format_percentage:.2f} %)
#         {incorrect_format} values unable to be parsed ({incorrect_format_percentage:.2f} %), "\
#         f"set to {set_to}
# {result}
#         """
#     )