# Source code for dataprep.clean.clean_address

"""
Clean and validate a DataFrame column containing US street addresses.
"""

import re
from operator import itemgetter
from typing import Any, Dict, List, Tuple, Union

import dask
import dask.dataframe as dd
import numpy as np
import pandas as pd
import usaddress

from ..progress_bar import ProgressBar
from .address_utils import (
    ABBR_STATES,
    FULL_PREFIX,
    FULL_STATES,
    KEYWORDS,
    PREFIXES,
    SUFFIXES,
    TAG_MAPPING,
)
from .utils import NULL_VALUES, create_report_new, to_dask


def clean_address(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "(building) house_number street_prefix_abbr "
    "street_name street_suffix_abbr, apartment, city, state_abbr zipcode",
    must_contain: Tuple[str, ...] = ("house_number", "street_name"),
    split: bool = False,
    inplace: bool = False,
    errors: str = "coerce",
    report: bool = True,
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean and standardize US street addresses.

    Read more in the :ref:`User Guide <address_userguide>`.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame containing the data to be cleaned.
    column
        The name of the column containing addresses.
    output_format
        The output format can be specified using the following keywords.
            - 'house_number': '1234'
            - 'street_prefix_abbr': 'N', 'S', 'E', or 'W'
            - 'street_prefix_full': 'North', 'South', 'East', or 'West'
            - 'street_name': 'Main'
            - 'street_suffix_abbr': 'St', 'Ave'
            - 'street_suffix_full': 'Street', 'Avenue'
            - 'apartment': 'Apt 1'
            - 'building': 'Staples Center'
            - 'city': 'Los Angeles'
            - 'state_abbr': 'CA'
            - 'state_full': 'California'
            - 'zipcode': '57903'

        The output_format can contain '\\\\t' characters to specify how to split the output into
        columns.

        (default: '(building) house_number street_prefix_abbr street_name street_suffix_abbr,
        apartment, city, state_abbr zipcode')
    must_contain
        A tuple containing parts of the address that must be included for the address to be
        successfully cleaned.

            - 'house_number': '1234'
            - 'street_prefix': 'N', 'North'
            - 'street_name': 'Main'
            - 'street_suffix': 'St', 'Avenue'
            - 'apartment': 'Apt 1'
            - 'building': 'Staples Center'
            - 'city': 'Los Angeles'
            - 'state': 'CA', 'California'
            - 'zipcode': '57903'

        (default: ('house_number', 'street_name'))
    split
        If True, each component of the address specified by the output_format parameter will be put
        into its own column.

        For example if output_format = "house_number street_name" and split = True, then there
        will be one column for house_number and another for street_name.

        (default: False)
    inplace
        If True, delete the column containing the data that was cleaned. Otherwise,
        keep the original column.

        (default: False)
    errors
        How to handle addresses that cannot be parsed or that lack a required part.
            - 'coerce': the cleaned value is set to NaN.
            - 'ignore': the components that could be parsed are kept; the rest are NaN.
            - 'raise': a ValueError is raised.

        (default: 'coerce')
    report
        If True, output the summary report. Otherwise, no report is outputted.

        (default: True)
    progress
        If True, display a progress bar.

        (default: True)

    Returns
    -------
    The computed DataFrame with the cleaned column(s) appended (and the original
    column removed when inplace is True).

    Examples
    --------
    Clean addresses and add the house number and street name to separate columns.

    >>> df = pd.DataFrame({'address': ['123 pine avenue', '1234 w main st 57033']})
    >>> clean_address(df, 'address', output_format='house_number \\t street_name')
    Address Cleaning Report:
            2 values cleaned (100.0%)
    Result contains 2 (100.0%) values in the correct format and 0 null values (0.0%)
        address                house_number      street_name
    0    123 pine avenue           123             Pine
    1   1234 w main st 57033       1234            Main
    """
    # pylint: disable=too-many-arguments

    df = to_dask(df)

    # Each element of the temporary column is a tuple: one entry per output column
    # followed by the status code produced by _format_address.
    # `assign` is used instead of item assignment so that the object returned by
    # to_dask (possibly the caller's own Dask DataFrame) is never modified in place.
    df = df.assign(
        clean_code_tup=df[column].map_partitions(
            lambda srs: [
                _format_address(x, output_format, must_contain, split, errors) for x in srs
            ],
            meta=object,
        )
    )

    headers = _get_column_names(output_format, split)

    # if there's only one column in the output, name it f"{column}_clean". Otherwise,
    # get names from the output_format
    if len(headers) == 1:
        df = df.assign(
            _temp_=df["clean_code_tup"].map(itemgetter(0)),
            _code_=df["clean_code_tup"].map(itemgetter(1)),
        )
        df = df.rename(columns={"_temp_": f"{column}_clean"})
    else:
        assignments = {
            header: df["clean_code_tup"].map(itemgetter(i), meta=(header, str))
            for i, header in enumerate(headers)
        }
        # the status code sits right after the last output column in each tuple
        assignments["_code_"] = df["clean_code_tup"].map(
            itemgetter(len(headers)), meta=("_code_", int)
        )
        df = df.assign(**assignments)

    # count the status codes for the report before the helper columns are dropped
    stats = df["_code_"].value_counts(sort=False)
    df = df.drop(columns=["clean_code_tup", "_code_"])

    if inplace:
        df = df.drop(columns=column)

    # evaluate the frame and the statistics together in a single dask.compute call
    with ProgressBar(minimum=1, disable=not progress):
        df, stats = dask.compute(df, stats)

    if report:
        create_report_new("Address", stats, errors)
    return df


def validate_address(
    x: Union[str, pd.Series], must_contain: Tuple[str, ...] = ("house_number", "street_name")
) -> Union[bool, pd.Series]:
    """
    Validate US street addresses.

    Read more in the :ref:`User Guide <address_userguide>`.

    Parameters
    ----------
    x
        pandas Series of addresses or a string containing an address.
    must_contain
        A tuple containing parts of the address that must be included for the
        address to be considered valid.

            - 'house_number': '1234'
            - 'street_prefix': 'N', 'North'
            - 'street_name': 'Main'
            - 'street_suffix': 'St', 'Avenue'
            - 'apartment': 'Apt 1'
            - 'building': 'Staples Center'
            - 'city': 'Los Angeles'
            - 'state': 'CA', 'California'
            - 'zipcode': '57903'

        (default: ('house_number', 'street_name'))

    Returns
    -------
    A boolean Series (one entry per address) when x is a pandas Series,
    otherwise a single boolean.

    Examples
    --------

    >>> df = pd.DataFrame({'address': ['123 pine avenue', 'NULL']})
    >>> validate_address(df['address'])
    0    True
    1    False
    Name: address, dtype: bool
    """

    # clean=False makes _check_address return a plain True/False instead of a
    # (parsed address, status) tuple
    if isinstance(x, pd.Series):
        return x.apply(_check_address, args=(must_contain, False))

    return _check_address(x, must_contain, False)


def _format_address(
    address: Any, output_format: str, must_contain: Tuple[str, ...], split: bool, errors: str
) -> Any:
    """
    Convert a single address value into a tuple of formatted output columns.

    The returned tuple holds one entry per output column, followed by a status
    code describing what happened to the input value:
        0 := the value is null
        1 := the value could not be parsed
        2 := the value is cleaned and the cleaned value is DIFFERENT than the input value
        3 := the value is cleaned and is THE SAME as the input value (no transformation)

    A ValueError is raised for an unparsable value when errors is "raise".
    """
    parsed, state = _check_address(address, must_contain, True)
    parts = _address_dict_to_string(parsed, output_format, split)

    if state == "null":
        n_columns = len(_get_column_names(output_format, split))
        return (np.nan,) * n_columns + (0,)

    if state == "unknown":
        if errors == "raise":
            raise ValueError(f"unable to parse value {address}")
        # only "ignore" keeps the parts that were recovered; anything else becomes NaN
        keep_parsed = errors == "ignore"
        kept = [part if (part and keep_parsed) else np.nan for part in parts]
        return tuple(kept) + (1,)

    # a single output column identical to the input means nothing was changed
    unchanged = len(parts) == 1 and address == parts[0]
    cleaned = [part if part else np.nan for part in parts]
    return tuple(cleaned) + (3 if unchanged else 2,)


def _check_address(address: Any, must_contain: Tuple[str, ...], clean: bool) -> Any:
    """
    Tag the parts of an address and check that the required parts are present.

    Parameters
    ----------
    address
        address value to be checked
    must_contain
        A tuple containing parts of the address that must be included for the
        address to be accepted
    clean
        If True, a tuple (tagged address, status) is returned, where status is
        "null", "unknown" or "success" and the tagged address is None when the
        value is null or could not be tagged.
        If False, the function returns True/False to be used by the validate address function.
    """
    if address in NULL_VALUES:
        if clean:
            return (None, "null")
        return False

    # parentheses and periods are removed before the address is tagged
    stripped = re.sub(r"[().]", "", str(address))

    try:
        tagged, _ = usaddress.tag(stripped, TAG_MAPPING)
    except usaddress.RepeatedLabelError:
        if clean:
            return (None, "unknown")
        return False

    is_valid = _check_status(tagged, must_contain)
    if clean:
        return (tagged, "success" if is_valid else "unknown")
    return bool(is_valid)


def _check_status(address_dict: Dict[str, str], must_contain: Tuple[str, ...]) -> bool:
    """
    Returns True if all address attributes in must_contain are present in
    address_dict, otherwise returns False.
    """
    return all(address_part in address_dict for address_part in must_contain)


def _address_dict_to_string(address: Dict[str, str], output_format: str, split: bool) -> List[str]:
    """
    Returns a list of address parts, in a format specified by output_format.
    Each item in the list will be added to the final dataframe in its own column.

    Parameters
    ----------
    address
        Tagged address parts; passed to _clean_address_parts before formatting.
    output_format
        Format string made of attribute keywords, optionally separated by tab
        characters to mark output columns.
    split
        If True, every whitespace-separated token of output_format becomes its own
        column and only the bare attribute value is emitted. If False, the text
        surrounding the keyword in each token (e.g. parentheses, commas) is kept.
    """

    address_items = _clean_address_parts(address)

    if split:
        # add tabs between each attribute if split is True
        output_format = "\t".join(output_format.split())
    elif "street_suffix_abbr" not in address_items:
        # add a comma after the street name if there is no street suffix in
        # address_items; the lookahead avoids a doubled comma when the format
        # already has one directly after street_name
        output_format = re.sub(r"street_name(?!,)", "street_name,", output_format)

    # first split output_format into each column of the final output
    # for each column split it into attributes and add the corresponding
    # cleaned part of the address to the output for each attribute
    output = []
    for column in output_format.split("\t"):
        pieces = []
        for output_attr in column.split():
            for address_attr, address_val in address_items.items():
                idx = output_attr.find(address_attr)
                if idx == -1 or address_val is None:
                    continue
                if split:
                    pieces.append(address_val)
                else:
                    # include parts at the beginning and end ie. include parens
                    # if (building) is in output_str. Only if split is False
                    end = idx + len(address_attr)
                    pieces.append(f"{output_attr[:idx]}{address_val}{output_attr[end:]}")
        # drop dangling separators at either end and stand-alone '#' tokens
        output.append(" ".join(pieces).strip(" ,").replace(" # ", " "))

    return output


def _clean_address_parts(address_dict: Dict[str, str]) -> Dict[str, str]:
    """
    Apply basic cleaning functions to parts of the address.
    """
    if not address_dict:
        return {}

    result_dict: Dict[str, str] = {}

    cleaning_funcs = {
        "house_number": _clean_house_number,
        "street_prefix": _clean_prefix,
        "street_name": _clean_street,
        "street_suffix": _clean_suffix,
        "state": _clean_state,
        "city": _clean_city,
        "building": _clean_building,
        "apartment": _clean_apartment,
        "zipcode": _clean_zip,
    }
    for address_attr, value in address_dict.items():
        if address_attr in cleaning_funcs:
            cleaning_funcs[address_attr](result_dict, value)

    return result_dict


def _get_column_names(output_format: str, split: bool) -> List[str]:
    """
    returns the column names that will be present in the final dataframe,
    based on the output_format
    """
    if not split:
        return [name.strip() for name in output_format.split("\t")]

    output_tokens = output_format.split()
    headers = []
    for output_part in output_tokens:
        for attr in KEYWORDS:
            if attr in output_part:
                headers.append(attr)
                break
    return headers


def _clean_prefix(result_dict: Dict[str, str], prefix: str) -> None:
    """
    Store the abbreviated and full forms of a street prefix in result_dict.

    The prefix is looked up case-insensitively in PREFIXES; when it is not
    found, result_dict is left untouched.
    """
    abbreviation = PREFIXES.get(prefix.lower())
    if not abbreviation:
        return
    result_dict.update(
        street_prefix_abbr=abbreviation,
        street_prefix_full=FULL_PREFIX[abbreviation],
    )


def _clean_suffix(result_dict: Dict[str, str], suffix: str) -> None:
    """
    Store the abbreviated and full forms of a street suffix in result_dict.

    The upper-cased suffix is looked up in SUFFIXES. The abbreviated form is
    stored capitalized with a trailing period, the full form capitalized. When
    the suffix is not found, result_dict is left untouched.
    """
    entry = SUFFIXES.get(suffix.upper())
    if not entry:
        return
    abbr_form, full_form = entry[0], entry[1]
    result_dict["street_suffix_abbr"] = f"{abbr_form.capitalize()}."
    result_dict["street_suffix_full"] = full_form.capitalize()


def _clean_state(result_dict: Dict[str, str], state: str) -> None:
    """
    Store the abbreviated and full forms of a state in result_dict.

    The state is matched first as a full name (title-cased, against FULL_STATES)
    and then as an abbreviation (upper-cased, against ABBR_STATES); a match on
    the abbreviation overwrites a match on the full name. When neither matches,
    result_dict is left untouched.
    """
    as_full_name = state.title()
    as_abbreviation = state.upper()

    if as_full_name in FULL_STATES:
        result_dict.update(state_abbr=FULL_STATES[as_full_name], state_full=as_full_name)
    if as_abbreviation in ABBR_STATES:
        result_dict.update(state_abbr=as_abbreviation, state_full=ABBR_STATES[as_abbreviation])


def _clean_city(result_dict: Dict[str, str], city: str) -> None:
    """
    capitalize each word of city and add it to result_dict
    """
    result_dict["city"] = city.title()


def _clean_house_number(result_dict: Dict[str, str], house_number: str) -> None:
    """
    adds house_number to result_dict
    """
    result_dict["house_number"] = house_number


def _clean_building(result_dict: Dict[str, str], building: str) -> None:
    """
    capitalize each word of building and add it to result_dict
    """
    result_dict["building"] = building.title()


def _clean_zip(result_dict: Dict[str, str], zipcode: str) -> None:
    """
    adds zipcode to result_dict
    """
    result_dict["zipcode"] = zipcode


def _clean_street(result_dict: Dict[str, str], street: str) -> None:
    """
    capitalize each word of the street name and add it to result_dict,
    except keep the number suffixes 'st', 'nd', 'rd', 'th' lower case
    """
    if re.match(r"\d+[st|nd|rd|th]", street, flags=re.IGNORECASE):
        result_dict["street_name"] = street.lower()
    else:
        result_dict["street_name"] = street.title()


def _clean_apartment(result_dict: Dict[str, str], apartment: str) -> None:
    """
    capitalize each word of the apartment and add it to result_dict
    """
    result_dict["apartment"] = apartment.title()