Source code for dataprep.clean.clean_isbn

"""
Clean and validate a DataFrame column containing ISBN numbers.
"""
# pylint: disable=too-many-lines, too-many-arguments, too-many-branches
from operator import itemgetter
from typing import Any, Union

import dask.dataframe as dd
import numpy as np
import pandas as pd
from stdnum import isbn
from stdnum.exceptions import ValidationError

from ..progress_bar import ProgressBar
from .utils import NULL_VALUES, to_dask


def clean_isbn(
    df: Union[pd.DataFrame, dd.DataFrame],
    column: str,
    output_format: str = "standard",
    split: bool = False,
    inplace: bool = False,
    errors: str = "coerce",
    progress: bool = True,
) -> pd.DataFrame:
    """
    Clean ISBN type data in a DataFrame column.

    Parameters
    ----------
        df
            A pandas or Dask DataFrame containing the data to be cleaned.
        column
            The name of the column containing data of ISBN type.
        output_format
            The output format of the standardized number string.
            If output_format = 'compact', return string without any separators.
            If output_format = 'standard', return string with proper separators.
            If output_format = 'isbn13', return ISBN string with 13 digits.
            If output_format = 'isbn10', return ISBN string with 10 digits.

            (default: "standard")
        split
            If True, each component derived from the number string will be put
            into its own column (prefix_code, group_code, publisher_code,
            item_code, check_digit).

            (default: False)
        inplace
            If True, delete the column containing the data that was cleaned.
            Otherwise, keep the original column.

            (default: False)
        errors
            How to handle parsing errors.
            - 'coerce': invalid parsing will be set to NaN.
            - 'ignore': invalid parsing will return the input.
            - 'raise': invalid parsing will raise an exception.

            (default: 'coerce')
        progress
            If True, display a progress bar.

            (default: True)

    Raises
    ------
        ValueError
            If ``output_format`` or ``errors`` is not one of the supported
            values.

    Examples
    --------
    Clean a column of ISBN data.

    >>> df = pd.DataFrame({
            "isbn": [
                "978-9024538270",
                "978-9024538271"]
            })
    >>> clean_isbn(df, 'isbn', inplace=True)
              isbn_clean
    0  978-90-245-3827-0
    1                NaN
    """
    if output_format not in {"compact", "standard", "isbn13", "isbn10"}:
        raise ValueError(
            f"output_format {output_format} is invalid. "
            'It needs to be "compact", "standard", "isbn13" or "isbn10".'
        )

    # Reject unknown error policies up front; otherwise a typo would silently
    # behave like "coerce" inside _format.
    if errors not in {"coerce", "ignore", "raise"}:
        raise ValueError(
            f"errors {errors} is invalid. "
            'It needs to be "coerce", "ignore" or "raise".'
        )

    # convert to dask
    df = to_dask(df)

    # To clean, create a temporary column "clean_code_tup" holding, for every
    # row, the list returned by _format: the cleaned value first, followed by
    # the five ISBN components when split is requested. The list is then
    # unpacked into regular columns below.
    df["clean_code_tup"] = df[column].map_partitions(
        lambda srs: [_format(x, output_format, split, errors) for x in srs],
        meta=object,
    )

    if split:
        # For some reason the meta data for the last 3 components needs to be
        # set. I think this is a dask bug
        df = df.assign(
            _temp_=df["clean_code_tup"].map(itemgetter(0), meta=("_temp", object)),
            prefix_code=df["clean_code_tup"].map(itemgetter(1), meta=("prefix_code", object)),
            group_code=df["clean_code_tup"].map(itemgetter(2), meta=("group_code", object)),
            publisher_code=df["clean_code_tup"].map(
                itemgetter(3), meta=("publisher_code", object)
            ),
            item_code=df["clean_code_tup"].map(itemgetter(4), meta=("item_code", object)),
            check_digit=df["clean_code_tup"].map(itemgetter(5), meta=("check_digit", object)),
        )
    else:
        df = df.assign(
            _temp_=df["clean_code_tup"].map(itemgetter(0)),
        )

    df = df.rename(columns={"_temp_": f"{column}_clean"})
    df = df.drop(columns=["clean_code_tup"])

    if inplace:
        # Overwrite the original column with the cleaned values, drop the
        # now-duplicate "<column>_clean", then rename the original column so
        # the cleaned data ends up under "<column>_clean" in its place.
        df[column] = df[f"{column}_clean"]
        df = df.drop(columns=f"{column}_clean")
        df = df.rename(columns={column: f"{column}_clean"})

    with ProgressBar(minimum=1, disable=not progress):
        df = df.compute()

    return df


def validate_isbn(
    df: Union[str, pd.Series, dd.Series, pd.DataFrame, dd.DataFrame],
    column: str = "",
) -> Union[bool, pd.Series, pd.DataFrame]:
    """
    Validate if a data cell is ISBN in a DataFrame column.
    For each cell, return True or False.

    Parameters
    ----------
    df
        A pandas or Dask DataFrame or Series containing the data to be
        validated. Any other value is validated directly as a single cell.
    column
        The name of the column to be validated. Only used when ``df`` is a
        DataFrame; when left empty, every cell of the DataFrame is validated.
    """
    check = isbn.is_valid

    # Series: validate element by element.
    if isinstance(df, (pd.Series, dd.Series)):
        return df.apply(check)

    # Anything that is not a DataFrame is treated as a single value.
    if not isinstance(df, (pd.DataFrame, dd.DataFrame)):
        return check(df)

    # DataFrame: either the whole frame or just the requested column.
    if column == "":
        return df.applymap(check)
    return df[column].apply(check)


def _failure_row(val: str, split: bool, errors: str) -> Any:
    """Build the row for a value that could not be cleaned (non-raising policies)."""
    first = val if errors == "ignore" else np.nan
    return ([first] + [np.nan] * 5) if split else [first]


def _format(
    val: Any, output_format: str = "standard", split: bool = False, errors: str = "coerce"
) -> Any:
    """
    Reformat a number string with proper separators (formats).

    Parameters
    ----------
    val
        The value of number string.
    output_format
        If output_format = 'compact', return string without any separators.
        If output_format = 'standard', return string with proper separators.
        If output_format = 'isbn13', return ISBN string with 13 digits.
        If output_format = 'isbn10', return ISBN string with 10 digits.
    split
        If True, the returned list has six elements: the formatted value
        followed by the five components produced by ``isbn.split`` on the
        ISBN-13 form. Otherwise the list holds only the formatted value.
    errors
        'raise' raises ValueError for a value that cannot be cleaned,
        'ignore' returns the input value unchanged, and any other value
        (normally 'coerce') returns NaN.

    Returns
    -------
    A list whose first element is the cleaned value (or NaN / the input on
    failure), padded with NaN components when ``split`` is True.
    """
    val = str(val)

    if val in NULL_VALUES:
        return [np.nan] * 6 if split else [np.nan]

    if not validate_isbn(val):
        if errors == "raise":
            raise ValueError(f"Unable to parse value {val}")
        return _failure_row(val, split, errors)

    components: Any = []
    if split:
        components = list(isbn.split(isbn.to_isbn13(val)))
        if not components:
            return [np.nan] * 6

    if output_format == "compact":
        formatted = isbn.compact(val)
    elif output_format == "standard":
        formatted = isbn.format(val)
    elif output_format == "isbn13":
        formatted = isbn.format(isbn.to_isbn13(val))
    elif output_format == "isbn10":
        try:
            formatted = isbn.format(isbn.to_isbn10(val))
        except ValidationError as err:
            # A valid ISBN-13 outside the 978 prefix has no ISBN-10 form and
            # stdnum raises here; apply the errors policy instead of letting
            # the exception abort the whole cleaning run.
            if errors == "raise":
                raise ValueError(f"Unable to convert value {val} to ISBN-10") from err
            return _failure_row(val, split, errors)
    else:
        # Unknown formats are rejected by clean_isbn; keep the historical
        # behaviour of returning only the components for direct callers.
        return components

    return [formatted] + components