Source code for hyswap.rasterhydrograph

"""Raster hydrograph functionality."""

import pandas as pd
from hyswap.utils import rolling_average
from hyswap.utils import define_year_doy_columns
from hyswap.utils import set_window_width


def format_data(df, data_column_name, date_column_name=None,
                window_width='daily', year_type='calendar',
                begin_year=None, end_year=None, clip_leap_day=False,
                **kwargs):
    """
    Format data for raster hydrograph.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. Must have a date column or the index must be
        the date values.

    data_column_name : str
        Name of column containing data to analyze.

    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` will be used. Defaults to None.

    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or '28-day'
        is specified, the data will be averaged over the specified period.
        NaN values will be used for any days that do not have data. If
        present, NaN values will result in NaN values for the entire
        period. Defaults to 'daily'.

    year_type : str, optional
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'. Default is 'calendar' which starts the year on January 1
        and ends on December 31. 'water' starts the year on October 1 and
        ends on September 30 of the following year which is the
        "water year". For example, October 1, 2010 to September 30, 2011 is
        "water year 2011". 'climate' years begin on April 1 and end on
        March 31 of the following year, they are numbered by the ending
        year. For example, April 1, 2010 to March 31, 2011 is
        "climate year 2011".

    begin_year : int, optional
        The first year to include in the data. Default is None which uses
        the first year in the data.

    end_year : int, optional
        The last year to include in the data. Default is None which uses
        the last year in the data.

    clip_leap_day : bool, optional
        If True, leap day '02-29' is left out of the formatted data used
        to create the plot. Defaults to False.

    **kwargs
        Keyword arguments to pass to the pandas.DataFrame.rolling method.

    Returns
    -------
    pandas.DataFrame
        The formatted data with one row per year (in descending order) and
        one column per 'MM-DD' day, ordered from the first to the last day
        of the chosen year type. Rows and columns that contain only NaN
        values are removed.

    Raises
    ------
    TypeError, ValueError
        If the arguments fail the checks done by `_check_inputs`.

    Examples
    --------
    Formatting synthetic daily data for a raster hydrograph.

    .. doctest::

        >>> df = pd.DataFrame({'date': pd.date_range('1/1/2010', '12/31/2010'),
        ...                    'data': np.random.rand(365)})
        >>> df_formatted = rasterhydrograph.format_data(df, 'data', 'date')
        >>> df_formatted.index[0].item()
        2010
        >>> len(df_formatted.columns)
        365

    Formatting real daily data for a raster hydrograph.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="2000-01-01", end="2002-12-31")
        >>> df_formatted = rasterhydrograph.format_data(df, '00060_Mean')
        >>> df_formatted.index[0]
        2000
        >>> len(df_formatted.columns)
        365
    """
    # check inputs, set date to index, define year/doy columns
    df_out = _check_inputs(df, data_column_name, date_column_name,
                           window_width, year_type, begin_year, end_year,
                           clip_leap_day=clip_leap_day)
    # calculate the full date range the output must cover
    date_range = _calculate_date_range(df_out, year_type,
                                       begin_year, end_year)
    # format date_range as YYYY-MM-DD
    date_range = date_range.strftime('%Y-%m-%d')
    # translate the window label into the value rolling_average expects
    window = set_window_width(window_width)
    # calculation of rolling mean is done on the data column
    df_out = rolling_average(df_out, data_column_name, window, **kwargs)
    # convert date index to YYYY-MM-DD strings so that it is matched
    # against date_range day by day
    df_out.index = df_out.index.strftime('%Y-%m-%d')
    # expand data frame to include all dates in date_range; days without
    # data become NaN rows
    df_out = df_out.reindex(date_range)
    # convert date index back to datetime format
    df_out.index = pd.to_datetime(df_out.index)
    # re-define year and doy columns for the expanded frame (the pivot
    # below relies on 'index_year' and 'index_month_day' being present)
    df_out = define_year_doy_columns(df_out, year_type=year_type,
                                     clip_leap_day=clip_leap_day)
    # sort by date
    df_out = df_out.sort_index()

    # Incorporate leap year decision into the column labels: a year that
    # starts in 1903 runs through February 1904 (a leap year) for water
    # and climate years, one that starts in 1902 does not.
    # NOTE(review): 1903 itself is not a leap year, so for
    # year_type='calendar' the '02-29' label is never part of the column
    # order, even when clip_leap_day is False - confirm this is intended.
    year = 1902 if clip_leap_day else 1903
    # Create the ordered 'MM-DD' labels for one full year of this type
    if year_type == 'water':
        month_day_order = pd.date_range(
            start=f'{year}-10-01', end=f'{year + 1}-09-30'
        ).strftime('%m-%d')
    elif year_type == 'climate':
        month_day_order = pd.date_range(
            start=f'{year}-04-01', end=f'{year + 1}-03-31'
        ).strftime('%m-%d')
    else:
        month_day_order = pd.date_range(
            start=f'{year}-01-01', end=f'{year}-12-31'
        ).strftime('%m-%d')

    # one row per year, one column per month-day
    df_out = df_out.pivot(index='index_year', columns='index_month_day',
                          values=data_column_name)
    # re-arrange columns by year_type. reindex is used instead of
    # label-based selection so that a label which is absent from the data
    # (for example '02-29' when the requested range holds no leap day)
    # becomes an all-NaN column, which is dropped below, instead of
    # raising a KeyError. A plain list is passed so that the name of the
    # existing columns index is kept.
    df_out = df_out.reindex(columns=month_day_order.tolist())
    # reverse order of the index so year order matches legacy Water Watch
    df_out = df_out.iloc[::-1]
    # remove all NaN columns and rows
    df_out = df_out.dropna(axis=1, how='all')
    df_out = df_out.dropna(axis=0, how='all')

    return df_out
def _check_inputs(df, data_column_name, date_column_name, window_width,
                  year_type, begin_year, end_year, clip_leap_day):
    """Private function to check inputs for the format_data function.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. Must have a date column or the index must be
        the date values.

    data_column_name : str
        Name of column containing data to analyze.

    date_column_name : str, None
        Name of column containing date information. If None, the index of
        `df` will be used.

    window_width : str
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'.

    year_type : str
        The type of year to use, one of 'calendar', 'water', or 'climate'.
        It is not validated here; it is passed on to
        `define_year_doy_columns`.

    begin_year : int, None
        The first year to include in the data. If given, it must not be
        earlier than the first year in the data.

    end_year : int, None
        The last year to include in the data. If given, it must not be
        later than the last year in the data.

    clip_leap_day : bool
        Passed on to `define_year_doy_columns`; if True, leap day '02-29'
        is expected to be removed from the data.

    Returns
    -------
    df : pandas.DataFrame
        The result of `define_year_doy_columns`, which is expected to set
        the dates as the index and add the year and day columns that match
        `year_type`.

    Raises
    ------
    TypeError
        If `df` is not a pandas.DataFrame, if `window_width`,
        `data_column_name` or `date_column_name` is not a string, or if
        `begin_year` or `end_year` is not an integer.
    ValueError
        If `window_width` is not a supported value or if `begin_year` or
        `end_year` lies outside the years covered by the data.
    """
    def _data_years():
        # Years covered by the data, read from the named date column or,
        # when no column is named, from the index.
        if date_column_name is not None:
            return df[date_column_name].dt.year
        return df.index.year

    # check the data frame
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df must be a pandas.DataFrame')
    # check data type
    if not isinstance(window_width, str):
        raise TypeError('window_width must be a string')
    if window_width not in ['daily', '7-day', '14-day', '28-day']:
        raise ValueError('window_width must be one of "daily", "7-day", '
                         '"14-day", and "28-day"')
    # check data column name
    if not isinstance(data_column_name, str):
        raise TypeError('data_column_name must be a string')
    # check date column name
    if date_column_name is not None:
        if not isinstance(date_column_name, str):
            raise TypeError('date_column_name must be a string')
    # check begin year
    if begin_year is not None:
        if not isinstance(begin_year, int):
            raise TypeError('begin_year must be an integer')
        if begin_year < _data_years().min():
            raise ValueError('begin_year must be greater than or equal to '
                             'the minimum year in the data')
    # check end year
    if end_year is not None:
        if not isinstance(end_year, int):
            raise TypeError('end_year must be an integer')
        if end_year > _data_years().max():
            raise ValueError('end_year must be less than or equal to the '
                             'maximum year in the data')
    # define year and doy columns and set index as date col if needed
    df = define_year_doy_columns(df, date_column_name, year_type,
                                 clip_leap_day=clip_leap_day)
    return df
def _calculate_date_range(df, year_type, begin_year, end_year):
    """Private function to calculate the date range and set the index.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. The index must hold the date values; it is
        only read when `begin_year` or `end_year` is None.

    year_type : str
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'.

    begin_year : int, None
        The first year to include in the data. If None, the first year in
        the data will be used.

    end_year : int, None
        The last year to include in the data. If None, the last year in
        the data will be used.

    Returns
    -------
    date_range : pandas.DatetimeIndex
        Daily dates from the first day of `begin_year` to the last day of
        `end_year`, where the days a year starts and ends on follow
        `year_type`.

    Raises
    ------
    ValueError
        If `year_type` is not one of the supported year types.
    """
    # For each year type: how many calendar years before the labelled year
    # the period starts, the month-day it starts on and the month-day it
    # ends on. Water and climate years are numbered by their ending year,
    # so they start in the previous calendar year.
    year_bounds = {
        'calendar': (0, '01-01', '12-31'),  # Jan 1 to Dec 31
        'water': (1, '10-01', '09-30'),     # Oct 1 to Sep 30
        'climate': (1, '04-01', '03-31'),   # Apr 1 to Mar 31
    }
    try:
        start_offset, start_month_day, end_month_day = year_bounds[year_type]
    except (KeyError, TypeError):
        # an unknown year type would otherwise leave the dates undefined
        raise ValueError("year_type must be one of 'calendar', 'water', "
                         "or 'climate'") from None

    # set begin/end year if not provided
    if begin_year is None:
        begin_year = df.index.year.min()
    if end_year is None:
        end_year = df.index.year.max()

    begin_date = pd.to_datetime(
        f'{begin_year - start_offset}-{start_month_day}')
    end_date = pd.to_datetime(f'{end_year}-{end_month_day}')

    # set date range
    return pd.date_range(begin_date, end_date)