Source code for hyswap.percentiles

"""Percentile calculation functions."""

import numpy as np
import pandas as pd
import warnings
from datetime import datetime
from hyswap.utils import filter_data_by_month_day
from hyswap.utils import filter_data_by_time
from hyswap.utils import define_year_doy_columns
from hyswap.utils import set_window_width
from hyswap.utils import rolling_average
from hyswap.exceedance import calculate_exceedance_probability_from_values


def calculate_fixed_percentile_thresholds(
        data,
        data_column_name=None,
        percentiles=np.array((5, 10, 25, 50, 75, 90, 95)),
        method='weibull',
        date_column_name=None,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate fixed percentile thresholds using historical data.

    Fixed percentiles are calculated using all data in the period of record.
    See the `Calculations Quick-Reference
    <https://doi-usgs.github.io/hyswap/meta/calculations.html#streamflow-percentiles>`
    for more information.

    Parameters
    ----------
    data : pandas.DataFrame or array-like
        DataFrame, Series, or 1-D array containing data used to calculate
        percentile thresholds. If DataFrame, "data_column_name" must be
        specified and expects a datetime index unless "date_column_name" is
        provided. If Series, must include a datetime index. If 1-D array,
        then "include_metadata" must be set to False since a datetime index
        is not included with data.

    data_column_name : str, optional
        Name of column containing data to analyze if input is a DataFrame.
        Default is None.

    percentiles : array_like, optional
        Percentiles to calculate. Default is (5, 10, 25, 50, 75, 90, 95).
        Note: Values of 0 and 100 are ignored (when `percentiles` is a list
        or numpy array) as unbiased plotting position formulas do not assign
        values to 0 or 100th percentile.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull'
        (Type 6). Additional available methods are
        'interpolated_inverted_cdf' (Type 4), 'hazen' (Type 5),
        'linear' (Type 7), 'median_unbiased' (Type 8), and
        'normal_unbiased' (Type 9).

    date_column_name : str, optional
        For data provided as DataFrame, name of column containing date
        information. If None, the index of `data` is used.

    ignore_na : bool, optional
        Ignore NA values in percentile calculations. When True, the
        percentiles, min/max and mean are computed with the NaN-skipping
        numpy functions. Default is True.

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition
        to streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True. Input data
        must include a datetime column as either index or specified by
        date_column_name.

    mask_out_of_range : bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments to pass to `numpy.percentile`.

    Returns
    -------
    percentiles : pandas.DataFrame
        Percentiles of the data in a DataFrame so the thresholds and
        percentile values are tied together. The single row is labelled
        "values".

    Raises
    ------
    ValueError
        If `data` is a DataFrame that does not contain `data_column_name`,
        or if `include_metadata` is True and the data does not carry a
        datetime index.

    Examples
    --------
    Calculate percentile thresholds from some synthetic data using 'linear'
    method.

    .. doctest::

        >>> data = pd.DataFrame({'values': np.arange(101),
        ...                      'date': pd.date_range('2020-01-01', '2020-04-10')}).set_index('date')  # noqa: E501
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, 'values', percentiles=[25, 75, 95], method='linear')
        >>> results
                min   p25   p75   p95  max  mean  count start_yr end_yr
        values    0  25.0  75.0  95.0  100  50.0    101     2020   2020

    Calculate percentile thresholds without additional metadata columns

    .. doctest::

        >>> data = np.arange(101)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=[5, 25, 75, 95], method='linear',
        ...     include_metadata=False)
        >>> results
                min  p05   p25   p75   p95  max
        values    0  5.0  25.0  75.0  95.0  100

    Calculate percentile thresholds using default 'weibull' method

        >>> data = np.arange(101)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=[5, 25, 50, 75, 95],
        ...     include_metadata=False)
        >>> results
                min  p05   p25   p50   p75   p95  max
        values    0  4.1  24.5  50.0  75.5  95.9  100

    Calculate percentile thresholds from a small number of observations and
    mask out out of range percentile levels

        >>> data = np.arange(11)
        >>> results = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, percentiles=np.array((1, 10, 50, 90, 99)),
        ...     include_metadata=False)
        >>> results
                min  p01  p10  p50  p90  p99  max
        values    0  NaN  0.2  5.0  9.8  NaN   10
    """
    if isinstance(data, pd.DataFrame):
        # set the df index
        if date_column_name is not None:
            data = data.set_index(date_column_name)
        # If data column name is not in dataframe
        if data_column_name not in data:
            raise ValueError('DataFrame missing data_column_name')
        # reduce the frame to the single Series that is analysed
        data = data[data_column_name]

    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, list):
        percentiles = [x for x in percentiles if x not in (0, 100)]

    if ignore_na:
        pct = np.nanpercentile(data, percentiles, method=method, **kwargs)
    else:
        pct = np.percentile(data, percentiles, method=method, **kwargs)

    # round values smaller than three decimal places to zero to avoid
    # extremely small threshold values being returned.
    pct[(pct > 0) & (pct < 0.001)] = 0

    df_out = pd.DataFrame(data={"values": pct}, index=percentiles)

    if mask_out_of_range:
        # percentile rank of the smallest / largest observation; any
        # requested level outside of that band cannot be supported by the
        # data and is blanked out
        min_pct_rank = (1 - calculate_exceedance_probability_from_values(
            np.nanmin(data), data, method=method)) * 100
        max_pct_rank = (1 - calculate_exceedance_probability_from_values(
            np.nanmax(data), data, method=method)) * 100
        out_of_range = ((df_out.index > max_pct_rank) |
                        (df_out.index < min_pct_rank))
        df_out.loc[out_of_range] = np.nan

    # transpose so percentile levels are columns
    df_out = df_out.T
    df_out.columns = "p" + df_out.columns.astype(str).str.zfill(2)

    if include_min_max:
        # add min as first column of dataframe and max as last column.
        # The NaN-skipping reducers belong to the ignore_na=True case;
        # otherwise the plain reducers are used (for a plain array NaN then
        # propagates, matching np.percentile above).
        if ignore_na:
            df_out.insert(0, 'min', np.nanmin(data))
            df_out['max'] = np.nanmax(data)
        else:
            df_out.insert(0, 'min', np.min(data))
            df_out['max'] = np.max(data)

    if include_metadata:
        # start/end year are read from the index, so a datetime-indexed
        # Series is required at this point
        if isinstance(data, pd.Series):
            if not data.index.inferred_type == "datetime64":
                raise ValueError("Datetime index must be provided with include_metadata=True.")  # noqa: E501
        else:
            raise ValueError("Data input format must include a datetime index with include_metadata=True.")  # noqa: E501
        if ignore_na:
            df_out['mean'] = np.round(np.nanmean(data), 2)
        else:
            df_out['mean'] = np.round(np.mean(data), 2)
        df_out['count'] = len(data)
        df_out['start_yr'] = data.index.min().strftime('%Y')
        df_out['end_yr'] = data.index.max().strftime('%Y')

    return df_out
def calculate_variable_percentile_thresholds_by_day_of_year(
        df,
        data_column_name,
        percentiles=[5, 10, 25, 50, 75, 90, 95],
        method='weibull',
        date_column_name=None,
        window_width='daily',
        year_type='calendar',
        leading_values=0,
        trailing_values=0,
        clip_leap_day=False,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate variable percentile thresholds of data by day of year.

    Variable percentiles are calculated using flow observations for each day
    from all years on record. See the `Calculations Quick-Reference
    <https://doi-usgs.github.io/hyswap/meta/calculations.html#streamflow-percentiles>`
    for more information.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing data to calculate daily percentile thresholds
        for.

    data_column_name : str
        Name of column containing data to analyze.

    percentiles : array_like, optional
        Percentile thresholds to calculate, default is
        [5, 10, 25, 50, 75, 90, 95]. Note: Values of 0 and 100 are ignored
        as unbiased plotting position formulas do not assign values to 0 or
        100th percentile.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull'
        (Type 6). Additional available methods are
        'interpolated_inverted_cdf' (Type 4), 'hazen' (Type 5),
        'linear' (Type 7), 'median_unbiased' (Type 8), and
        'normal_unbiased' (Type 9).

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` is used.

    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or '28-day'
        is specified, the data will be averaged over the specified period.
        NaN values will be used for any days that do not have data. If
        present, NaN values will result in NaN values for the entire period.

    year_type : str, optional
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'. Default is 'calendar' which starts the year on January 1
        and ends on December 31. 'water' starts the year on October 1 and
        ends on September 30 of the following year which is the
        "water year". For example, October 1, 2010 to September 30, 2011 is
        "water year 2011". 'climate' years begin on April 1 and end on
        March 31 of the following year, they are numbered by the ending
        year. For example, April 1, 2010 to March 31, 2011 is
        "climate year 2011".

    leading_values : int, optional
        For the temporal filtering, this is an argument setting the number
        of leading values to include in the output, inclusive. Default is 0,
        and parameter only applies to 'day' time_interval.

    trailing_values : int, optional
        For the temporal filtering, this is an argument setting the number
        of trailing values to include in the output, inclusive. Default is
        0, and parameter only applies to 'day' time_interval.

    clip_leap_day : bool, optional
        If True, February 29 is removed from the DataFrame. Default is
        False.

    ignore_na : bool, optional
        Ignore NA values in percentile calculations

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition
        to streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True

    mask_out_of_range : bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments to pass to `numpy.percentile`.

    Returns
    -------
    percentiles : pandas.DataFrame
        DataFrame containing threshold percentiles of data by day of year.
        The DataFrame has a multi-index of 'doy' and 'year_type'. Returns a
        DataFrame of NaNs for each percentile/day if provided an empty
        DataFrame or DataFrame with insufficient data

    Examples
    --------
    Calculate default thresholds by day of year from some real data in
    preparation for plotting.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> results = percentiles.calculate_variable_percentile_thresholds_by_day_of_year(  # noqa: E501
        ...     df, "00060_Mean")
        >>> len(results.index)  # 366 days in a leap year
        366
    """
    # Reference year that supplies the day-of-year index of the output and
    # the dates of the placeholder frame used when no usable data is given.
    if clip_leap_day:
        # use a non-leap year as reference for empty df
        date_rng = pd.date_range(start='1901-01-01', end='1901-12-31')
    else:
        # use a leap year as reference for empty df
        date_rng = pd.date_range(start='1904-01-01', end='1904-12-31')

    # Tracks whether the caller's frame was swapped for the placeholder.
    used_placeholder = False
    if df.empty:
        warnings.warn('No valid data provided, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        used_placeholder = True
    # If data column name is not in dataframe
    if data_column_name not in df:
        warnings.warn('DataFrame missing data_column_name, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        used_placeholder = True

    # define year and day of year columns and convert date column to
    # datetime if necessary. The placeholder frame carries its dates in the
    # index and has no date column, so the caller's date_column_name must
    # not be forwarded for it.
    df = define_year_doy_columns(
        df,
        date_column_name=None if used_placeholder else date_column_name,
        year_type=year_type,
        clip_leap_day=clip_leap_day)

    # do rolling average for time as needed
    window = set_window_width(window_width)
    df = rolling_average(df, data_column_name, window)

    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, list):
        percentiles = [x for x in percentiles if x not in (0, 100)]

    # create an empty dataframe to hold percentiles based on day-of-year
    cols = [f"p{perc:02d}" for perc in percentiles]
    if include_min_max:
        cols = ['min'] + cols + ['max']
    if include_metadata:
        cols = cols + ['mean', 'count', 'start_yr', 'end_yr']
    doy_index = date_rng.day_of_year.values
    percentiles_by_day = pd.DataFrame(index=doy_index, columns=cols)

    # loop through days of year available
    for doy in doy_index:
        # get historical data for the day of year
        data = filter_data_by_time(df, doy, data_column_name,
                                   leading_values=leading_values,
                                   trailing_values=trailing_values,
                                   drop_na=ignore_na)
        row_mask = doy_index == doy
        if data.empty or np.isnan(data).all():
            # no observations (or only NA) for this day: leave it as NaN
            percentiles_by_day.loc[row_mask, :] = np.nan
            continue
        # calculate percentiles for the day of year and add to DataFrame
        _pct = calculate_fixed_percentile_thresholds(
            data.to_frame(),
            data_column_name,
            percentiles=percentiles,
            method=method,
            ignore_na=ignore_na,
            include_min_max=include_min_max,
            include_metadata=include_metadata,
            mask_out_of_range=mask_out_of_range,
            **kwargs)
        percentiles_by_day.loc[row_mask, :] = _pct.values.tolist()[0]

    # offsets used to rotate the calendar day-of-year index so that it
    # starts at the first day of the water / climate year
    if clip_leap_day:
        wy_sub = 273
        cy_sub = 90
        wy_cy_sub = 365
    else:
        wy_sub = 274
        cy_sub = 91
        wy_cy_sub = 366

    # sort index by year type
    percentiles_by_day = percentiles_by_day.sort_index()
    if year_type == 'climate':
        doy_index = doy_index - cy_sub
        doy_index[doy_index < 1] += wy_cy_sub
    if year_type == 'water':
        doy_index = doy_index - wy_sub
        doy_index[doy_index < 1] += wy_cy_sub

    # reorder by water year or climate index and rename
    percentiles_by_day = percentiles_by_day.loc[doy_index]
    percentiles_by_day.reset_index(drop=True, inplace=True)
    percentiles_by_day.index = pd.MultiIndex.from_arrays(
        [percentiles_by_day.index + 1, [year_type] * len(doy_index)],
        names=['doy', 'year_type'])

    # return percentiles by day of year
    return percentiles_by_day
def calculate_variable_percentile_thresholds_by_day(
        df,
        data_column_name,
        percentiles=[5, 10, 25, 50, 75, 90, 95],
        method='weibull',
        date_column_name=None,
        window_width='daily',
        leading_values=0,
        trailing_values=0,
        clip_leap_day=False,
        ignore_na=True,
        include_min_max=True,
        include_metadata=True,
        mask_out_of_range=True,
        **kwargs):
    """Calculate variable percentile thresholds of data by day

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing data to calculate daily percentile thresholds
        for.

    data_column_name : str
        Name of column containing data to analyze.

    percentiles : array_like, optional
        Percentile thresholds to calculate, default is
        [5, 10, 25, 50, 75, 90, 95]. Note: Values of 0 and 100 are ignored
        as unbiased plotting position formulas do not assign values to 0 or
        100th percentile.

    method : str, optional
        Method to use to calculate percentiles. Default is 'weibull'
        (Type 6). Additional available methods are
        'interpolated_inverted_cdf' (Type 4), 'hazen' (Type 5),
        'linear' (Type 7), 'median_unbiased' (Type 8), and
        'normal_unbiased' (Type 9).

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` is used.

    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or '28-day'
        is specified, the data will be averaged over the specified period.
        NaN values will be used for any days that do not have data. If
        present, NaN values will result in NaN values for the entire period.

    leading_values : int, optional
        For the temporal filtering, this is an argument setting the number
        of leading values to include in the output, inclusive. Default is 0,
        and parameter only applies to 'day' time_interval.

    trailing_values : int, optional
        For the temporal filtering, this is an argument setting the number
        of trailing values to include in the output, inclusive. Default is
        0, and parameter only applies to 'day' time_interval.

    clip_leap_day : bool, optional
        If True, February 29 is removed from the DataFrame. Default is
        False.

    ignore_na : bool, optional
        Ignore NA values in percentile calculations

    include_min_max : bool, optional
        When set to True, include min and max streamflow value in addition
        to streamflow values for percentile levels. Default is True.

    include_metadata : bool, optional
        When set to True, return additional columns describing the data
        including count, mean, start_yr, end_yr. Default is True

    mask_out_of_range : bool, optional
        When set to True, percentiles that are beyond the min/max percentile
        rank of the observed data are set to NA. When enabled, high or low
        percentiles may not be calculated when few data points are
        available. Default is True.

    **kwargs : dict, optional
        Additional keyword arguments to pass to `numpy.percentile`.

    Returns
    -------
    percentiles : pandas.DataFrame
        DataFrame containing threshold percentiles of data by month-day,
        indexed by 'month_day' strings formatted as "%m-%d". Will return a
        DataFrame of NaNs for each percentile/day if provided an empty
        DataFrame or DataFrame with insufficient data

    Examples
    --------
    Calculate default thresholds by day from some real data in preparation
    for plotting.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> results = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     df, "00060_Mean")
        >>> len(results.index)  # 366 days in a leap year
        366
    """
    # Reference year that supplies the month-day index of the output and
    # the dates of the placeholder frame used when no usable data is given.
    if clip_leap_day:
        # use a non-leap year as reference for empty df
        date_rng = pd.date_range(start='1901-01-01', end='1901-12-31')
    else:
        # use a leap year as reference for empty df
        date_rng = pd.date_range(start='1904-01-01', end='1904-12-31')

    # Tracks whether the caller's frame was swapped for the placeholder.
    used_placeholder = False
    if df.empty:
        warnings.warn('No valid data provided, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        used_placeholder = True
    # If data column name is not in dataframe
    if data_column_name not in df:
        warnings.warn('DataFrame missing data_column_name, returning NA values for percentile thresholds')  # noqa: E501
        df = pd.DataFrame(index=date_rng)
        df[data_column_name] = np.nan
        used_placeholder = True

    # set the df index. The placeholder frame is already indexed by date and
    # has no date column, so set_index would raise KeyError on it.
    if date_column_name is not None and not used_placeholder:
        df = df.set_index(date_column_name)

    # do rolling average for time as needed
    window = set_window_width(window_width)
    df = rolling_average(df, data_column_name, window)

    # ignore 0 and 100 percentiles if passed in
    if isinstance(percentiles, np.ndarray):
        percentiles = percentiles[~np.isin(percentiles, [0, 100])]
    elif isinstance(percentiles, list):
        percentiles = [x for x in percentiles if x not in (0, 100)]

    # create an empty dataframe to hold percentiles based on month-day
    cols = [f"p{perc:02d}" for perc in percentiles]
    if include_min_max:
        cols = ['min'] + cols + ['max']
    if include_metadata:
        cols = cols + ['mean', 'count', 'start_yr', 'end_yr']
    month_day_index = date_rng.strftime("%m-%d")
    percentiles_by_day = pd.DataFrame(index=month_day_index, columns=cols)
    percentiles_by_day.index.names = ['month_day']

    # loop through days of year available
    for month_day in month_day_index:
        # get historical data for the month-day
        data = filter_data_by_month_day(df, month_day, data_column_name,
                                        leading_values=leading_values,
                                        trailing_values=trailing_values,
                                        drop_na=ignore_na)
        row_mask = month_day_index == month_day
        if data.empty or np.isnan(data).all():
            # no observations (or only NA) for this day: leave it as NaN
            percentiles_by_day.loc[row_mask, :] = np.nan
            continue
        # calculate percentiles for the month-day and add to DataFrame
        _pct = calculate_fixed_percentile_thresholds(
            data.to_frame(),
            data_column_name,
            percentiles=percentiles,
            method=method,
            ignore_na=ignore_na,
            include_min_max=include_min_max,
            include_metadata=include_metadata,
            mask_out_of_range=mask_out_of_range,
            **kwargs)
        percentiles_by_day.loc[row_mask, :] = _pct.values.tolist()[0]

    return percentiles_by_day
def calculate_fixed_percentile_from_value(value, percentile_df):
    """Calculate percentile from a value and fixed percentile thresholds.

    This function enables faster calculation of the percentile associated
    with a given value if percentile values and corresponding fixed
    percentile thresholds are known from other data from the same station or
    site. This calculation is done using linear interpolation. A value
    greater than the largest streamflow value in the percentile threshold
    dataframe results in a percentile of 100. A value less than the smallest
    streamflow value in the percentile threshold dataframe results in a
    percentile of 0.

    Parameters
    ----------
    value : float, np.ndarray
        New value(s) to calculate percentile for. Can be a single value or
        an array of values.

    percentile_df : pd.DataFrame
        DataFrame whose columns are percentile levels named "p<level>"
        (for example "p05", "p50") and, optionally, "min" and "max". The
        threshold values are stored in a row called "values". Typically
        generated by the `calculate_fixed_percentile_thresholds` functions
        but could be provided manually or from data pulled from the NWIS
        stats service. The "min"/"max" columns are only used as the 0 and
        100 percentile anchors when both are present.

    Returns
    -------
    percentile : float, np.ndarray
        Percentile associated with the input value(s), rounded to two
        decimals. NaN is returned when no non-NA thresholds are available.

    Examples
    --------
    Calculate the percentile associated with a value from some synthetic
    data.

    .. doctest::

        >>> data = pd.DataFrame({'values': np.arange(1001),
        ...                      'date': pd.date_range('2020-01-01', '2022-09-27')}).set_index('date')  # noqa: E501
        >>> pcts_df = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, 'values', percentiles=[5, 10, 25, 50, 75, 90, 95])
        >>> new_percentile = percentiles.calculate_fixed_percentile_from_value(
        ...     500, pcts_df).item()
        >>> new_percentile
        50.0

    Calculate the percentiles associated with multiple values for some data
    downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="1900-01-01", end="2021-12-31")
        >>> pcts_df = percentiles.calculate_fixed_percentile_thresholds(
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95,],
        ...     method='linear')
        >>> new_data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="2022-01-01", end="2022-01-07")
        >>> new_data['est_pct'] = percentiles.calculate_fixed_percentile_from_value(  # noqa: E501
        ...     new_data['00060_Mean'], pcts_df)
        >>> new_data['est_pct'].to_list()
        [62.9, 75.0, 55.65, 47.54, 53.55, 55.32, 50.97]
    """
    # extract percentile levels and values. Only columns that start with
    # "p" followed by a digit are percentile levels; parsing the level as a
    # float also accepts non-integer levels such as "p2.5".
    pct_columns = percentile_df.filter(regex=r'^p\d')
    thresholds = [float(col[1:]) for col in pct_columns]
    percentile_values = pct_columns.iloc[0].to_list()

    # min/max anchor the 0th and 100th percentile, but only when BOTH are
    # available ("'min' and 'max' in cols" would test 'max' alone).
    columns = percentile_df.columns
    if 'min' in columns and 'max' in columns:
        thresholds = [0] + thresholds + [100]
        percentile_values = (
            [percentile_df.at['values', 'min']]
            + percentile_values
            + [percentile_df.at['values', 'max']]
        )

    # ensure all values are set to float type for interpolation
    thresholds = np.array(thresholds, dtype=np.float32)
    percentile_values = np.array(percentile_values, dtype=np.float32)

    # check if there are NA percentile levels and remove them so they are
    # ignored during interpolation
    valid = ~np.isnan(percentile_values)
    percentile_values = percentile_values[valid]
    thresholds = thresholds[valid]

    # nothing left to interpolate against
    if len(percentile_values) == 0:
        return np.nan

    # do and return linear interpolation; values outside of the known
    # thresholds are pinned to 0 / 100
    return np.interp(value, percentile_values, thresholds,
                     left=0, right=100).round(2)
def calculate_variable_percentile_from_value(value, percentile_df, month_day):
    """Calculate percentile from a value and variable percentile thresholds.

    This function enables faster calculation of the percentile associated
    with a given value for a single day of the year if percentile values and
    corresponding variable percentile thresholds are known from other data
    from the same station or site. This calculation is done using linear
    interpolation. A value greater than the largest streamflow value in the
    percentile threshold dataframe for the month-day of interest results in
    a percentile of 100. A value less than the smallest streamflow value in
    the percentile threshold dataframe results in a percentile of 0.

    Parameters
    ----------
    value : float, np.ndarray
        New value(s) to calculate percentile for. Can be a single value or
        an array of values.

    percentile_df : pd.DataFrame
        DataFrame containing threshold percentiles of data by day, with an
        index level named 'month_day'. Typically generated by the
        `calculate_variable_percentile_thresholds_by_day` function but could
        be provided manually or from data pulled from the NWIS stats
        service.

    month_day : str
        string of month-day of year to lookup percentile thresholds for
        value

    Returns
    -------
    percentile : float, np.ndarray
        Percentile associated with the input value(s). NaN when there are
        no thresholds for `month_day` or all of them are NA.

    Examples
    --------
    Calculate the percentile associated with a value using flow records
    downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="1776-01-01", end="2022-12-31")
        >>> pcts_df = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95],
        ...     method='linear')
        >>> new_percentile = percentiles.calculate_variable_percentile_from_value(  # noqa: E501
        ...     500, pcts_df, '06-30')
        >>> new_percentile
        96.58
    """
    # pick the threshold row(s) belonging to the requested month-day
    day_labels = percentile_df.index.get_level_values('month_day')
    day_thresholds = percentile_df.loc[day_labels == month_day]

    # return NaN if no threshold values are provided
    if day_thresholds.empty:
        return np.nan
    if day_thresholds.isnull().all().all():
        return np.nan

    # the fixed-threshold routine looks the thresholds up in a row labelled
    # "values", so relabel the first matching row accordingly
    day_thresholds = day_thresholds.reset_index(drop=True)
    day_thresholds = day_thresholds.rename(index={0: "values"})
    return calculate_fixed_percentile_from_value(value, day_thresholds)
def calculate_multiple_variable_percentiles_from_values(df, data_column_name,
                                                        percentile_df,
                                                        date_column_name=None):
    """Calculate variable percentiles for multiple values

    This function enables calculation of estimated percentiles for multiple
    values across multiple days of the year using existing variable
    percentile thresholds. This calculation is done using linear
    interpolation. A value greater than the largest streamflow value in the
    percentile threshold dataframe for the month-day of interest results in
    a percentile of 100. A value less than the smallest streamflow value in
    the percentile threshold dataframe results in a percentile of 0.

    Parameters
    ----------
    df : pd.DataFrame
        Pandas dataframe containing new values to calculate percentiles
        for. The input frame is not modified.

    data_column_name : str
        Name of column containing data to analyze.

    percentile_df : pd.DataFrame
        DataFrame containing threshold percentiles of data by day of year.
        Typically generated by the `calculate_variable_percentile_thresholds`
        functions but could be provided manually or from data pulled from
        the NWIS stats service.

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` is used (an unnamed index is labelled 'datetime').

    Returns
    -------
    df : pd.DataFrame
        Pandas dataframe of values with estimated percentile column
        ('est_pct') added, indexed by the date column.

    Examples
    --------
    Calculate the percentiles associated with multiple values using flow
    records downloaded from NWIS.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="1900-01-01", end="2021-12-31")
        >>> pcts_df = percentiles.calculate_variable_percentile_thresholds_by_day(  # noqa: E501
        ...     data, '00060_Mean',
        ...     percentiles=[5, 10, 25, 50, 75, 90, 95],
        ...     method='linear')
        >>> new_data, _ = dataretrieval.nwis.get_dv(
        ...     "04288000", parameterCd="00060",
        ...     start="2022-01-01", end="2022-01-07")
        >>> new_percentiles = percentiles.calculate_multiple_variable_percentiles_from_values(  # noqa: E501
        ...     new_data, '00060_Mean', pcts_df)
        >>> new_percentiles['est_pct'].to_list()
        [64.81, 77.7, 56.67, 45.0, 55.59, 59.38, 49.12]
    """
    if date_column_name is None:
        # Dates live in the index: move them into a column under the index's
        # own name instead of assuming it is called 'datetime'.
        index_name = df.index.name
        date_column_name = 'datetime' if index_name is None else index_name
        df = df.rename_axis(date_column_name).reset_index()
    else:
        # work on a copy so the caller's frame does not gain an 'est_pct'
        # column as a side effect
        df = df.copy()

    # one percentile lookup per row, keyed by the row's month-day; a plain
    # comprehension (rather than DataFrame.apply) also handles empty input
    estimates = [
        calculate_variable_percentile_from_value(
            row_value, percentile_df, datetime.strftime(row_date, '%m-%d'))
        for row_value, row_date in zip(df[data_column_name],
                                       df[date_column_name])
    ]
    df['est_pct'] = pd.Series(estimates, index=df.index, dtype=float).round(2)

    df = df.set_index(date_column_name)
    return df