# Source code for hyswap.percentiles

"""Percentile calculation functions."""

import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from hyswap.utils import filter_data_by_month_day
from hyswap.utils import filter_data_by_time
from hyswap.utils import define_year_doy_columns
from hyswap.utils import set_window_width
from hyswap.utils import rolling_average
from hyswap.exceedance import calculate_exceedance_probability_from_values



# [docs]
def calculate_fixed_percentile_thresholds(
        data,
        data_column_name=None,
        percentiles=np.array((5, 10, 25, 50, 75, 90, 95)),
        method='weibull',
        date_column_name=None,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate fixed percentile thresholds using historical data.

    Fixed percentiles are calculated using all data in the period of
    record. See the `Calculations Quick-Reference <https://doi-usgs.github.io/hyswap/meta/calculations.html#streamflow-percentiles>`
    for more information.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        DataFrame, Series, or 1-D array containing data used to calculate
        percentile thresholds. If DataFrame, "data_column_name" must be
        specified and expects a datetime index unless "date_column_name" is
        provided. If Series, must include a datetime index. If 1-D array, then
        "include_metadata" must be set to False since a datetime index is not
        included with data.

    data_column_name : str, optional
        Name of column containing data to analyze if input is a DataFrame.
        Default is None.

    percentiles : array_like, optional
        Percentiles to calculate. Default is (5, 10, 25, 50, 75, 90, 95).
        Note: Values of 0 and 100 are ignored as unbiased plotting position
        formulas do not assign values to 0 or 100th percentile. The filter
        is applied to numpy arrays, lists and tuples.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull' (Type 6).
        Additional available methods are 'interpolated_inverted_cdf' (Type 4),
        'hazen' (Type 5), 'linear' (Type 7), 'median_unbiased' (Type 8),
        and 'normal_unbiased' (Type 9).

    date_column_name : str, optional
        For data provided as DataFrame, name of column containing date
        information. If None, the index of `data` is used.

    ignore_na : bool, optional
        Ignore NA values in the percentile, min/max and mean calculations.
        Default is True.

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition to
        streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True. Input data
        must include a datetime column as either index or specified by
        date_column_name.

    mask_out_of_range :  bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments to pass to `numpy.nanpercentile`
        (when `ignore_na` is True) or `numpy.percentile` (otherwise).

    Returns
    -------
    percentiles : pandas.DataFrame
        Single-row DataFrame (row label "values") whose columns are the
        percentile levels ("pNN"), optionally preceded by "min", followed by
        "max", and followed by the metadata columns.

    Raises
    ------
    ValueError
        If `data` is a DataFrame that does not contain `data_column_name`,
        or if `include_metadata` is True and `data` does not carry a
        datetime index.

    Examples
    --------
    Calculate percentile thresholds from some synthetic data using 'linear'
    method.

    .. doctest::

        >>> data = pd.DataFrame({'values': np.arange(101),
        ...                      'date': pd.date_range('2020-01-01', '2020-04-10')}).set_index('date')  # noqa: E501
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, 'values', percentiles=[25, 75, 95], method='linear')
        >>> results
                min   p25   p75   p95  max  mean  count start_yr end_yr
        values    0  25.0  75.0  95.0  100  50.0    101     2020   2020

    Calculate percentile thresholds without additional metadata columns

    .. doctest::

        >>> data = np.arange(101)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=[5, 25, 75, 95], method='linear',
        ...     include_metadata=False)
        >>> results
                min  p05   p25   p75   p95  max
        values    0  5.0  25.0  75.0  95.0  100

    Calculate percentile thresholds using default 'weibull' method

    .. doctest::

        >>> data = np.arange(101)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=[5, 25, 50, 75, 95],
        ...     include_metadata=False)
        >>> results
                min  p05   p25   p50   p75   p95  max
        values    0  4.1  24.5  50.0  75.5  95.9  100

    Calculate percentile thresholds from a small number of observations and
    mask out out of range percentile levels

    .. doctest::

        >>> data = np.arange(11)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=np.array((1, 10, 50, 90, 99)),
        ...     include_metadata=False)
        >>> results
                min  p01  p10  p50  p90  p99  max
        values    0  NaN  0.2  5.0  9.8  NaN   10
    """
    if isinstance(data, pd.DataFrame):
        # set the df index
        if date_column_name is not None:
            data = data.set_index(date_column_name)
        # If data column name is not in dataframe
        if data_column_name not in data:
            raise ValueError('DataFrame missing data_column_name')
        data = data[data_column_name]

    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, (list, tuple)):
        percentiles = [x for x in percentiles if x not in (0, 100)]

    if ignore_na:
        pct = np.nanpercentile(data, percentiles, method=method, **kwargs)
    else:
        pct = np.percentile(data, percentiles, method=method, **kwargs)

    # round values smaller than three decimal places to zero to avoid extremely
    # small threshold values being returned.
    pct[(pct > 0) & (pct < 0.001)] = 0

    df_out = pd.DataFrame(data={"values": pct}, index=percentiles)

    if mask_out_of_range:
        # percentile rank (non-exceedance, in percent) of the smallest and
        # largest observation; levels outside that range are not supported
        # by the data and are blanked out
        min_pct_rank = (1 - calculate_exceedance_probability_from_values(np.nanmin(data), data, method=method))*100  # noqa: E501
        max_pct_rank = (1 - calculate_exceedance_probability_from_values(np.nanmax(data), data, method=method))*100  # noqa: E501
        df_out.loc[(df_out.index > max_pct_rank) | (df_out.index < min_pct_rank)] = np.nan  # noqa: E501

    # transpose so percentile levels are columns
    df_out = df_out.T
    df_out.columns = "p" + df_out.columns.astype(str).str.zfill(2)

    if include_min_max:
        # add min as first column of dataframe and max as last column;
        # the NaN-skipping reducers are the ones that honour ignore_na=True
        if ignore_na:
            df_out.insert(0, 'min', np.nanmin(data))
            df_out['max'] = np.nanmax(data)
        else:
            df_out.insert(0, 'min', np.min(data))
            df_out['max'] = np.max(data)
    if include_metadata:
        if isinstance(data, pd.Series):
            if data.index.inferred_type != "datetime64":
                raise ValueError("Datetime index must be provided with include_metadata=True.")  # noqa: E501
        else:
            raise ValueError("Data input format must include a datetime index with include_metadata=True.")  # noqa: E501
        if ignore_na:
            df_out['mean'] = np.round(np.nanmean(data), 2)
        else:
            df_out['mean'] = np.round(np.mean(data), 2)
        df_out['count'] = len(data)
        df_out['start_yr'] = data.index.min().strftime('%Y')
        df_out['end_yr'] = data.index.max().strftime('%Y')

    return df_out




# [docs]
def calculate_variable_percentile_thresholds_by_day_of_year(
        df,
        data_column_name,
        percentiles=[5, 10, 25, 50, 75, 90, 95],
        method='weibull',
        date_column_name=None,
        window_width='daily',
        year_type='calendar',
        leading_values=0,
        trailing_values=0,
        clip_leap_day=False,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate variable percentile thresholds of data by day of year.

    Variable percentiles are calculated using flow observations for
    each day from all years on record. See the `Calculations Quick-Reference <https://doi-usgs.github.io/hyswap/meta/calculations.html#streamflow-percentiles>`
    for more information.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing data to calculate daily percentile thresholds for.

    data_column_name : str
        Name of column containing data to analyze.

    percentiles : array_like, optional
        Percentile thresholds to calculate, default is
        [5, 10, 25, 50, 75, 90, 95]. Note: Values of 0 and 100 are ignored as
        unbiased plotting position formulas do not assign values to 0 or 100th
        percentile.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull' (Type 6).
        Additional available methods are 'interpolated_inverted_cdf' (Type 4),
        'hazen' (Type 5), 'linear' (Type 7), 'median_unbiased' (Type 8),
        and 'normal_unbiased' (Type 9).

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` is used.

    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or
        '28-day' is specified, the data will be averaged over the
        specified period. NaN values will be used for any days that
        do not have data. If present, NaN values will result in NaN
        values for the entire period.

    year_type : str, optional
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'. Default is 'calendar' which starts the year on January 1
        and ends on December 31. 'water' starts the year on October 1 and
        ends on September 30 of the following year which is the "water year".
        For example, October 1, 2010 to September 30, 2011 is "water year
        2011". 'climate' years begin on April 1 and end on March 31 of the
        following year, they are numbered by the ending year. For example,
        April 1, 2010 to March 31, 2011 is "climate year 2011".

    leading_values : int, optional
        For the temporal filtering, this is an argument setting the
        number of leading values to include in the output, inclusive.
        Default is 0, and parameter only applies to 'day' time_interval.

    trailing_values : int, optional
        For the temporal filtering, this is an argument setting the
        number of trailing values to include in the output, inclusive.
        Default is 0, and parameter only applies to 'day' time_interval.

    clip_leap_day : bool, optional
        If True, February 29 is removed from the DataFrame. Default is False.

    ignore_na : bool, optional
        Ignore NA values in percentile calculations

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition to
        streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True

    mask_out_of_range :  bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments forwarded to
        `calculate_fixed_percentile_thresholds`, which passes them on to the
        numpy percentile routine.

    Returns
    -------
    percentiles : pandas.DataFrame
        DataFrame containing threshold percentiles of data by day of year.
        The DataFrame has a multi-index of 'doy' and 'year_type'.
        Returns a DataFrame of NaNs for each percentile/day if
        provided an empty DataFrame or DataFrame with insufficient data

    Warns
    -----
    UserWarning
        If `df` is empty or does not contain `data_column_name`; an all-NaN
        result is returned in either case.

    Examples
    --------
    Calculate default thresholds by day of year from some real data in
    preparation for plotting.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> results = percentiles.calculate_variable_percentile_thresholds_by_day_of_year(  # noqa: E501
        ...     df, "00060_Mean")
        >>> len(results.index)  # 366 days in a leap year
        366
    """
    # Reference year that supplies the set of days to report on; it also
    # serves as the index of the placeholder frame used when no usable data
    # is supplied.
    if clip_leap_day:
        # use a non-leap year as reference for empty df
        date_rng = pd.date_range(start='1901-01-01', end='1901-12-31')
    else:
        # use a leap year as reference for empty df
        date_rng = pd.date_range(start='1904-01-01', end='1904-12-31')
    if df.empty:
        warnings.warn('No valid data provided, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        # The placeholder keeps its dates in the index and has no date
        # column, so the caller-supplied column name must not be forwarded.
        date_column_name = None

    # If data column name is not in dataframe
    if data_column_name not in df:
        warnings.warn('DataFrame missing data_column_name, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        # same reasoning as above: the placeholder is indexed by date
        date_column_name = None
    # define year and day of year columns and convert date column to datetime
    # if necessary
    df = define_year_doy_columns(df, date_column_name=date_column_name,
                                 year_type=year_type,
                                 clip_leap_day=clip_leap_day)
    # do rolling average for time as needed
    window = set_window_width(window_width)
    df = rolling_average(df, data_column_name, window)

    # create an empty dataframe to hold percentiles based on day-of-year
    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, list):
        percentiles = [x for x in percentiles if x not in (0, 100)]
    cols = [f"p{perc:02d}" for perc in percentiles]
    if include_min_max:
        cols = ['min'] + cols + ['max']
    if include_metadata:
        cols = cols + ['mean', 'count', 'start_yr', 'end_yr']
    doy_index = date_rng.day_of_year.values
    percentiles_by_day = pd.DataFrame(index=doy_index,
                                      columns=cols)

    # loop through days of year available
    for doy in doy_index:
        # get historical data for the day of year
        data = filter_data_by_time(df, doy, data_column_name,
                                   leading_values=leading_values,
                                   trailing_values=trailing_values,
                                   drop_na=ignore_na)
        if data.empty or np.isnan(data).all():
            # nothing usable for this day (no rows, or only NA values):
            # set percentiles to NaN
            percentiles_by_day.loc[doy_index == doy, :] = np.nan
            continue
        # calculate percentiles for the day of year and add to DataFrame
        _pct = calculate_fixed_percentile_thresholds(
            data.to_frame(), data_column_name,
            percentiles=percentiles,
            method=method, ignore_na=ignore_na,
            include_min_max=include_min_max,
            include_metadata=include_metadata,
            mask_out_of_range=mask_out_of_range, **kwargs)
        percentiles_by_day.loc[doy_index == doy, :] = _pct.values.tolist()[0]  # noqa: E501

    # Offsets used to rotate the day-of-year labels for water/climate years;
    # they are one smaller when the reference year has no leap day.
    if clip_leap_day:
        wy_sub = 273
        cy_sub = 90
        wy_cy_sub = 365
    else:
        wy_sub = 274
        cy_sub = 91
        wy_cy_sub = 366
    # sort index by year type
    percentiles_by_day = percentiles_by_day.sort_index()
    if year_type == 'climate':
        doy_index = doy_index - cy_sub
        doy_index[doy_index < 1] += wy_cy_sub
    if year_type == 'water':
        doy_index = doy_index - wy_sub
        doy_index[doy_index < 1] += wy_cy_sub
    # reorder by water year or climate index and rename
    percentiles_by_day = percentiles_by_day.loc[doy_index]
    percentiles_by_day.reset_index(drop=True, inplace=True)
    percentiles_by_day.index = pd.MultiIndex.from_arrays(
        [percentiles_by_day.index + 1, [year_type] * len(doy_index)],
        names=['doy', 'year_type'])

    # return percentiles by day of year
    return percentiles_by_day




# [docs]
def calculate_variable_percentile_thresholds_by_day(
        df,
        data_column_name,
        percentiles=[5, 10, 25, 50, 75, 90, 95],
        method='weibull',
        date_column_name=None,
        window_width='daily',
        leading_values=0,
        trailing_values=0,
        clip_leap_day=False,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate variable percentile thresholds of data by day

    Thresholds are computed separately for every month-day ("MM-DD") using
    the observations that fall on that month-day across all years.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing data to calculate daily percentile thresholds for.

    data_column_name : str
        Name of column containing data to analyze.

    percentiles : array_like, optional
        Percentile thresholds to calculate, default is
        [5, 10, 25, 50, 75, 90, 95]. Note: Values of 0 and 100 are ignored as
        unbiased plotting position formulas do not assign values to 0 or 100th
        percentile.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull' (Type 6).
        Additional available methods are 'interpolated_inverted_cdf' (Type 4),
        'hazen' (Type 5), 'linear' (Type 7), 'median_unbiased' (Type 8),
        and 'normal_unbiased' (Type 9).

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` is used.

    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or
        '28-day' is specified, the data will be averaged over the
        specified period. NaN values will be used for any days that
        do not have data. If present, NaN values will result in NaN
        values for the entire period.

    leading_values : int, optional
        For the temporal filtering, this is an argument setting the
        number of leading values to include in the output, inclusive.
        Default is 0, and parameter only applies to 'day' time_interval.

    trailing_values : int, optional
        For the temporal filtering, this is an argument setting the
        number of trailing values to include in the output, inclusive.
        Default is 0, and parameter only applies to 'day' time_interval.

    clip_leap_day : bool, optional
        If True, February 29 is removed from the DataFrame. Default is False.

    ignore_na : bool, optional
        Ignore NA values in percentile calculations

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition to
        streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True

    mask_out_of_range :  bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments forwarded to
        `calculate_fixed_percentile_thresholds`, which passes them on to the
        numpy percentile routine.

    Returns
    -------
    percentiles : pandas.DataFrame
        DataFrame containing threshold percentiles of data by month-day,
        indexed by 'month_day' strings in "MM-DD" form.
        Will return a DataFrame of NaNs for each percentile/day if
        provided an empty DataFrame or DataFrame with insufficient data

    Warns
    -----
    UserWarning
        If `df` is empty or does not contain `data_column_name`; an all-NaN
        result is returned in either case.

    Examples
    --------
    Calculate default thresholds by day from some real data in
    preparation for plotting.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> results = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     df, "00060_Mean")
        >>> len(results.index)  # 366 days in a leap year
        366
    """
    # Reference year that supplies the set of month-days to report on; it
    # also serves as the index of the placeholder frame used when no usable
    # data is supplied.
    if clip_leap_day:
        # use a non-leap year as reference for empty df
        date_rng = pd.date_range(start='1901-01-01', end='1901-12-31')
    else:
        # use a leap year as reference for empty df
        date_rng = pd.date_range(start='1904-01-01', end='1904-12-31')
    if df.empty:
        warnings.warn('No valid data provided, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        # The placeholder keeps its dates in the index and has no date
        # column; clearing the name avoids a KeyError in set_index below.
        date_column_name = None
    # If data column name is not in dataframe
    if data_column_name not in df:
        warnings.warn('DataFrame missing data_column_name, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        # same reasoning as above: the placeholder is indexed by date
        date_column_name = None

    # set the df index
    if date_column_name is not None:
        df = df.set_index(date_column_name)

    # do rolling average for time as needed
    window = set_window_width(window_width)
    df = rolling_average(df, data_column_name, window)

    # create an empty dataframe to hold percentiles based on month-day
    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, list):
        percentiles = [x for x in percentiles if x not in (0, 100)]
    cols = [f"p{perc:02d}" for perc in percentiles]
    if include_min_max:
        cols = ['min'] + cols + ['max']
    if include_metadata:
        cols = cols + ['mean', 'count', 'start_yr', 'end_yr']
    month_day_index = date_rng.strftime("%m-%d")
    percentiles_by_day = pd.DataFrame(index=month_day_index,
                                      columns=cols)
    percentiles_by_day.index.names = ['month_day']
    # loop through days of year available
    for month_day in month_day_index:
        # get historical data for the month-day
        data = filter_data_by_month_day(df, month_day, data_column_name,
                                        leading_values=leading_values,
                                        trailing_values=trailing_values,
                                        drop_na=ignore_na)
        if data.empty or np.isnan(data).all():
            # nothing usable for this day (no rows, or only NA values):
            # set percentiles to NaN
            percentiles_by_day.loc[month_day_index == month_day, :] = np.nan
            continue
        # calculate percentiles for the month-day and add to DataFrame
        _pct = calculate_fixed_percentile_thresholds(
            data.to_frame(), data_column_name,
            percentiles=percentiles,
            method=method, ignore_na=ignore_na,
            include_min_max=include_min_max,
            include_metadata=include_metadata,
            mask_out_of_range=mask_out_of_range, **kwargs)
        percentiles_by_day.loc[month_day_index == month_day, :] = _pct.values.tolist()[0]  # noqa: E501

    return percentiles_by_day




# [docs]
def calculate_fixed_percentile_from_value(value, percentile_df):
    """Calculate percentile from a value and fixed percentile thresholds.

    This function enables faster calculation of the percentile associated with
    a given value if percentile values and corresponding fixed percentile
    thresholds are known from other data from the same station or site.
    This calculation is done using linear interpolation. A value greater than
    the largest streamflow value in the percentile threshold dataframe results
    in a percentile of 100. A value less than the smallest streamflow value in
    the percentile threshold dataframe results in a percentile of 0.

    Parameters
    ----------
    value : float, np.ndarray
        New value(s) to calculate percentile for. Can be a single value or an
        array of values.

    percentile_df : pd.DataFrame
        DataFrame where columns are the percentile thresholds values and the
        values are stored in a row called "values". Typically generated by the
        `calculate_fixed_percentile_thresholds` functions but could be
        provided manually or from data pulled from the NWIS stats service.
        Percentile columns are named "p<level>" (for example "p05"). When
        both a "min" and a "max" column are present they are used as the
        0th and 100th percentile anchors for the interpolation.

    Returns
    -------
    percentile : float, np.ndarray
        Percentile associated with the input value(s), rounded to two
        decimals. NaN if every threshold in `percentile_df` is NaN.

    Examples
    --------
    Calculate the percentile associated with a value from some synthetic data.

    .. doctest::

        >>> data = pd.DataFrame({'values': np.arange(1001),
        ...                      'date': pd.date_range('2020-01-01', '2022-09-27')}).set_index('date')  # noqa: E501
        >>> pcts_df = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, 'values', percentiles=[5, 10, 25, 50, 75, 90, 95])
        >>> new_percentile = percentiles.calculate_fixed_percentile_from_value(
        ...     500, pcts_df).item()
        >>> new_percentile
        50.0

    Calculate the percentiles associated with multiple values for some data
    downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="1900-01-01", end="2021-12-31")
        >>> pcts_df = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95,],
        ...     method='linear')
        >>> new_data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="2022-01-01", end="2022-01-07")
        >>> new_data['est_pct'] = percentiles.calculate_fixed_percentile_from_value(  # noqa: E501
        ...     new_data['00060_Mean'], pcts_df)
        >>> new_data['est_pct'].to_list()
        [62.9, 75.0, 55.65, 47.54, 53.55, 55.32, 50.97]

    """
    # extract percentile levels and values
    pct_columns = percentile_df.filter(like='p')
    thresholds = [int(col[1:]) for col in pct_columns]
    percentile_values = pct_columns.iloc[0].to_list()
    # Anchor the curve at 0 and 100 only when BOTH bounds are available.
    # Each membership test must be spelled out: `'min' and 'max' in cols`
    # would only test for 'max'.
    has_bounds = ('min' in percentile_df.columns
                  and 'max' in percentile_df.columns)
    if has_bounds:
        thresholds = [0] + thresholds + [100]
        percentile_values = [percentile_df.at['values', 'min']] + \
            percentile_values + [percentile_df.at['values', 'max']]
    # ensure all values are set to float type for interpolation
    thresholds = np.array(thresholds, dtype=np.float32)
    percentile_values = np.array(percentile_values, dtype=np.float32)
    # check if there are NA percentile levels and remove them so they are
    # ignored during interpolation
    valid = ~np.isnan(percentile_values)
    percentile_values = percentile_values[valid]
    thresholds = thresholds[valid]
    # nothing left to interpolate against
    if len(percentile_values) == 0:
        return np.nan
    # do and return linear interpolation; values outside the known range
    # are clamped to the 0th / 100th percentile
    return np.interp(value, percentile_values, thresholds,
                     left=0, right=100).round(2)




# [docs]
def calculate_variable_percentile_from_value(value, percentile_df, month_day):
    """Estimate the percentile of a value for one month-day.

    The thresholds stored in `percentile_df` for `month_day` are looked up
    and handed to `calculate_fixed_percentile_from_value`, which performs a
    linear interpolation between them. A value greater than the largest
    streamflow value in the percentile threshold dataframe for the month-day
    of interest results in a percentile of 100. A value less than the
    smallest streamflow value in the percentile threshold dataframe results
    in a percentile of 0.

    Parameters
    ----------
    value : float, np.ndarray
        New value(s) to calculate percentile for. Can be a single value or an
        array of values.

    percentile_df : pd.DataFrame
        DataFrame containing threshold percentiles of data by month-day. Its
        index must have a level named 'month_day'. Typically generated by
        `calculate_variable_percentile_thresholds_by_day` but could be
        provided manually or from data pulled from the NWIS stats service.

    month_day : str
        Month-day label ("MM-DD", e.g. '06-30') whose thresholds are used.

    Returns
    -------
    percentile : float, np.ndarray
        Percentile associated with the input value(s). NaN when
        `percentile_df` has no row for `month_day` or when every threshold
        for that day is NaN.

    Examples
    --------
    Calculate the percentile associated with a value using flow records
    downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> pcts_df = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95],
        ...     method='linear')
        >>> new_percentile = percentiles.calculate_variable_percentile_from_value(  # noqa: E501
        ...     500, pcts_df, '06-30')
        >>> new_percentile
        96.58
    """
    # select the threshold row(s) belonging to the requested month-day
    day_labels = percentile_df.index.get_level_values('month_day')
    day_thresholds = percentile_df.loc[day_labels == month_day]

    # no thresholds at all, or nothing but NaN: nothing to interpolate
    if day_thresholds.empty:
        return np.nan
    if day_thresholds.isnull().all().all():
        return np.nan

    # the fixed-percentile routine expects the thresholds in a row labelled
    # "values", so relabel the first selected row accordingly
    day_thresholds = day_thresholds.reset_index(drop=True)
    day_thresholds = day_thresholds.rename(index={0: "values"})
    return calculate_fixed_percentile_from_value(value, day_thresholds)




# [docs]
def calculate_multiple_variable_percentiles_from_values(df, data_column_name,
                                                        percentile_df,
                                                        date_column_name=None):
    """Calculate variable percentiles for multiple values

    This function enables calculation of estimated percentiles for multiple
    values across multiple days of the year using existing variable percentile
    thresholds. This calculation is done using linear interpolation.
    A value greater than the largest streamflow value in the
    percentile threshold dataframe for the month-day of interest results
    in a percentile of 100. A value less than the smallest streamflow value in
    the percentile threshold dataframe results in a percentile of 0.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe containing new values to calculate percentiles for.

    data_column_name : str
        Name of column containing data to analyze.

    percentile_df : pd.DataFrame
        DataFrame containing threshold percentiles of data by month-day.
        Typically generated by
        `calculate_variable_percentile_thresholds_by_day` but could be
        provided manually or from data pulled from the NWIS stats service.

    date_column_name : str, optional
        Name of column containing date information. If None, a column or
        index level named 'datetime' is used when present; otherwise the
        (single-level) index of `df` is used whatever its name.

    Returns
    -------
    df : pd.DataFrame
        Pandas dataframe of values with estimated percentile column
        ('est_pct') added, indexed by the date information.

    Raises
    ------
    KeyError
        If `date_column_name` is None and `df` has a multi-level index
        without a 'datetime' level or column, so that the dates cannot be
        located.

    Examples
    --------
    Calculate the percentiles associated with multiple values using flow
    records downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="1900-01-01", end="2021-12-31")
        >>> pcts_df = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95],
        ...     method='linear')
        >>> new_data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="2022-01-01", end="2022-01-07")
        >>> new_percentiles = percentiles.calculate_multiple_variable_percentiles_from_values(  # noqa: E501
        ...     new_data, '00060_Mean', pcts_df)
        >>> new_percentiles['est_pct'].to_list()
        [64.81, 77.7, 56.67, 45.0, 55.59, 59.38, 49.12]
    """
    # move the index into regular columns so dates can be read per row
    out = df.reset_index()

    restore_index_name = False
    if date_column_name is None:
        if 'datetime' in out.columns:
            # historical default: NWIS frames carry a 'datetime' index/level
            date_column_name = 'datetime'
        elif df.index.nlevels == 1:
            # reset_index() puts a single-level index in the first column,
            # named after the index (or a generated name if it was unnamed)
            date_column_name = out.columns[0]
            restore_index_name = True
        else:
            raise KeyError(
                "date_column_name not given and no 'datetime' index level "
                "or column found")

    # Estimate one percentile per row from the thresholds of its month-day.
    # A plain loop (rather than DataFrame.apply) also copes with empty input.
    estimates = [
        calculate_variable_percentile_from_value(
            row_value, percentile_df, datetime.strftime(row_date, '%m-%d'))
        for row_value, row_date in zip(out[data_column_name],
                                       out[date_column_name])]
    out['est_pct'] = pd.Series(estimates, index=out.index,
                               dtype=float).round(2)

    out = out.set_index(date_column_name)
    if restore_index_name:
        # keep the caller's original index name (possibly None)
        out.index.name = df.index.name

    return out