# Source code for hyswap.rasterhydrograph
"""Raster hydrograph functionality."""
import pandas as pd
from hyswap.utils import rolling_average
from hyswap.utils import define_year_doy_columns
from hyswap.utils import set_window_width
# [docs]
def format_data(df, data_column_name, date_column_name=None,
                window_width='daily', year_type='calendar',
                begin_year=None, end_year=None,
                clip_leap_day=False, **kwargs):
    """
    Format data for raster hydrograph.

    The input is validated, averaged over the requested window, expanded to
    a continuous daily date range and pivoted so that each row is a year and
    each column is a month-day ('MM-DD') label ordered by `year_type`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. Must have a date column or the index must be the
        date values.
    data_column_name : str
        Name of column containing data to analyze.
    date_column_name : str, optional
        Name of column containing date information. If None, the index of
        `df` will be used. Defaults to None.
    window_width : str, optional
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or
        '28-day' is specified, the data will be averaged over the
        specified period. NaN values will be used for any days that
        do not have data. If present, NaN values will result in NaN
        values for the entire period. Defaults to 'daily'.
    year_type : str, optional
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'. Default is 'calendar' which starts the year on January 1
        and ends on December 31. 'water' starts the year on October 1 and
        ends on September 30 of the following year which is the "water year".
        For example, October 1, 2010 to September 30, 2011 is "water year
        2011". 'climate' years begin on April 1 and end on March 31 of the
        following year, they are numbered by the ending year. For example,
        April 1, 2010 to March 31, 2011 is "climate year 2011".
    begin_year : int, optional
        The first year to include in the data. Default is None which uses the
        first year in the data.
    end_year : int, optional
        The last year to include in the data. Default is None which uses the
        last year in the data.
    clip_leap_day : bool, optional
        If True, removes leap day '02-29' from the percentiles dataset
        used to create the plot. Defaults to False.
    **kwargs
        Keyword arguments to pass to the pandas.DataFrame.rolling method.

    Returns
    -------
    pandas.DataFrame
        The formatted data starting on the first day of the first year and
        ending on the last day of the last year with the specified data type
        and year type. Rows are years (most recent first), columns are
        month-day labels; rows and columns that are entirely NaN are removed.

    Raises
    ------
    TypeError
        If an argument has the wrong type (raised by the input checks).
    ValueError
        If `window_width`, `begin_year` or `end_year` is not acceptable
        (raised by the input checks).

    Examples
    --------
    Formatting synthetic daily data for a raster hydrograph.

    .. doctest::

        >>> df = pd.DataFrame({'date': pd.date_range('1/1/2010', '12/31/2010'),
        ...                    'data': np.random.rand(365)})
        >>> df_formatted = rasterhydrograph.format_data(df, 'data', 'date')
        >>> df_formatted.index[0].item()
        2010
        >>> len(df_formatted.columns)
        365

    Formatting real daily data for a raster hydrograph.

    .. doctest::
        :skipif: True  # dataretrieval functions break CI pipeline

        >>> df, _ = dataretrieval.nwis.get_dv(
        ...     "03586500", parameterCd="00060",
        ...     start="2000-01-01", end="2002-12-31")
        >>> df_formatted = rasterhydrograph.format_data(df, '00060_Mean')
        >>> df_formatted.index[0]
        2000
        >>> len(df_formatted.columns)
        365
    """
    # check inputs, set date to index, define year/doy columns
    df_out = _check_inputs(df, data_column_name, date_column_name,
                           window_width, year_type, begin_year, end_year,
                           clip_leap_day=clip_leap_day)
    # calculate the date range
    date_range = _calculate_date_range(df_out, year_type, begin_year, end_year)
    # format date_range as YYYY-MM-DD
    date_range = date_range.strftime('%Y-%m-%d')
    # set window
    window = set_window_width(window_width)
    # make output data frame
    # calculation of rolling mean is done on the data column
    df_out = rolling_average(df_out, data_column_name, window, **kwargs)
    # convert date index to YYYY-MM-DD format
    df_out.index = df_out.index.strftime('%Y-%m-%d')
    # expand data frame to include all dates in date_range
    df_out = df_out.reindex(date_range)
    # convert date index to datetime format
    df_out.index = pd.to_datetime(df_out.index)
    # re-define year and doy columns
    df_out = define_year_doy_columns(df_out, year_type=year_type,
                                     clip_leap_day=clip_leap_day)
    # sort by date
    df_out = df_out.sort_index()

    # Incorporate leap year decision into x-axis labels: the water and
    # climate reference ranges run through February of `ref_year + 1`, which
    # is 1903 (no leap day) when clipping and 1904 (leap day) otherwise.
    ref_year = 1902 if clip_leap_day else 1903
    # Create x-axis scale and labels
    if year_type == 'water':
        order_start = f'{ref_year}-10-01'
        order_end = f'{ref_year + 1}-09-30'
    elif year_type == 'climate':
        order_start = f'{ref_year}-04-01'
        order_end = f'{ref_year + 1}-03-31'
    else:
        # NOTE(review): neither 1902 nor 1903 is a leap year, so the calendar
        # ordering never contains '02-29' and a leap-day column is not
        # selected for calendar years even when clip_leap_day is False.
        # Left as-is to keep the established 365-column output; confirm
        # whether this is intended.
        order_start = f'{ref_year}-01-01'
        order_end = f'{ref_year}-12-31'
    month_day_order = pd.date_range(
        start=order_start, end=order_end).strftime('%m-%d')

    # set index to year and day of year columns
    df_out = df_out.pivot(index='index_year', columns='index_month_day',
                          values=data_column_name)
    # re-arrange columns by year_type. reindex (rather than label selection)
    # tolerates labels that are absent from the pivot, e.g. '02-29' when the
    # date range holds no leap day; such columns come back all-NaN and are
    # removed below. A list is passed so the column-axis name is preserved.
    df_out = df_out.reindex(columns=list(month_day_order))
    # reverse order of the index so year order matches legacy Water Watch
    df_out = df_out.iloc[::-1]
    # remove all NaN columns and rows
    df_out = df_out.dropna(axis=1, how='all')
    df_out = df_out.dropna(axis=0, how='all')
    return df_out
# [docs]
def _check_inputs(df, data_column_name, date_column_name,
                  window_width, year_type, begin_year, end_year,
                  clip_leap_day):
    """Private function to check inputs for the format_data function.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. Must have a date column or the index must be the
        date values.
    data_column_name : str
        Name of column containing data to analyze.
    date_column_name : str, None
        Name of column containing date information. If None, the index of
        `df` will be used.
    window_width : str
        The window width of the data in days. Must be one of 'daily',
        '7-day', '14-day', and '28-day'. If '7-day', '14-day', or
        '28-day' is specified, the data will be averaged over the
        specified period. NaN values will be used for any days that
        do not have data. If present, NaN values will result in NaN
        values for the entire period.
    year_type : str
        The type of year to use, one of 'calendar', 'water', or 'climate'.
        It is not validated here; it is passed through to
        `define_year_doy_columns`.
    begin_year : int, None
        The first year to include in the data. If None, the first year in
        the data will be used.
    end_year : int, None
        The last year to include in the data. If None, the last year in the
        data will be used.
    clip_leap_day : bool, optional
        If True, removes leap day '02-29' from the percentiles dataset
        used to create the plot.

    Returns
    -------
    df : pandas.DataFrame
        The result of `define_year_doy_columns` applied to `df`: the date
        column is formatted as a datetime and set as the index, and year and
        doy (day of year) columns are added based on the year_type.

    Raises
    ------
    TypeError
        If `df` is not a DataFrame, if `window_width`, `data_column_name` or
        `date_column_name` is not a string, or if `begin_year` / `end_year`
        is given but is not an integer.
    ValueError
        If `window_width` is not a supported value, if `begin_year` is
        earlier than the first year in the data, or if `end_year` is later
        than the last year in the data.
    """
    # check the data frame
    if not isinstance(df, pd.DataFrame):
        raise TypeError('df must be a pandas.DataFrame')
    # check data type
    if not isinstance(window_width, str):
        raise TypeError('window_width must be a string')
    if window_width not in ['daily', '7-day', '14-day', '28-day']:
        raise ValueError('window_width must be one of "daily", "7-day", '
                         '"14-day", and "28-day"')
    # check data column name
    if not isinstance(data_column_name, str):
        raise TypeError('data_column_name must be a string')
    # check date column name
    if date_column_name is not None:
        if not isinstance(date_column_name, str):
            raise TypeError('date_column_name must be a string')

    def _years_in_data():
        # Years come from the caller-supplied date column when one is named
        # (not from a column literally called 'date'), else from the index.
        if date_column_name is not None:
            return df[date_column_name].dt.year
        return df.index.year

    # check begin year
    if begin_year is not None:
        if not isinstance(begin_year, int):
            raise TypeError('begin_year must be an integer')
        if begin_year < _years_in_data().min():
            raise ValueError('begin_year must be greater than or equal to '
                             'the minimum year in the data')
    # check end year
    if end_year is not None:
        if not isinstance(end_year, int):
            raise TypeError('end_year must be an integer')
        if end_year > _years_in_data().max():
            raise ValueError('end_year must be less than or equal to the '
                             'maximum year in the data')
    # define year and doy columns and set index as date col if needed
    df = define_year_doy_columns(df, date_column_name, year_type,
                                 clip_leap_day=clip_leap_day)
    return df
# [docs]
def _calculate_date_range(df, year_type, begin_year, end_year):
    """Private function to calculate the date range and set the index.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to format. Its index must expose ``.year`` (a datetime
        index); it is only consulted when `begin_year` or `end_year` is None.
    year_type : str
        The type of year to use. Must be one of 'calendar', 'water', or
        'climate'.
    begin_year : int, None
        The first year to include in the data. If None, the first year in
        the data will be used.
    end_year : int, None
        The last year to include in the data. If None, the last year in the
        data will be used.

    Returns
    -------
    date_range : pandas.DatetimeIndex
        Daily dates from the first day of `begin_year` to the last day of
        `end_year`, where the year boundaries depend on `year_type`.

    Raises
    ------
    ValueError
        If `year_type` is not one of 'calendar', 'water', or 'climate'.
    """
    # set begin/end year if not provided
    if begin_year is None:
        begin_year = df.index.year.min()
    if end_year is None:
        end_year = df.index.year.max()

    # calendar year from Jan 1 to Dec 31
    if year_type == 'calendar':
        begin_date = pd.to_datetime(str(begin_year) + '-01-01')
        end_date = pd.to_datetime(str(end_year) + '-12-31')
    # water year from Oct 1 to Sep 30; it is numbered by its ending year, so
    # it starts in the previous calendar year
    elif year_type == 'water':
        begin_date = pd.to_datetime(str(begin_year - 1) + '-10-01')
        end_date = pd.to_datetime(str(end_year) + '-09-30')
    # climate year from Apr 1 to Mar 31, also numbered by its ending year
    elif year_type == 'climate':
        begin_date = pd.to_datetime(str(begin_year - 1) + '-04-01')
        end_date = pd.to_datetime(str(end_year) + '-03-31')
    else:
        # fail with a clear message instead of an UnboundLocalError on
        # begin_date further down
        raise ValueError('year_type must be one of "calendar", "water", '
                         'or "climate"')

    # set date range
    date_range = pd.date_range(begin_date, end_date)
    return date_range