"""Similarity measures for hyswap."""
import numpy as np
import pandas as pd
from scipy import stats
from hyswap.utils import filter_to_common_time
[docs]
def calculate_correlations(df_list, data_column_name, df_names=None):
    """Calculate Pearson correlations between dataframes in df_list.

    This function is designed to calculate the Pearson correlation
    coefficients between dataframes in df_list. The dataframes in df_list are
    expected to have the same columns. The correlation coefficients are
    calculated using the `numpy.corrcoef` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar
    data_column_name : str
        Name of the column containing data to use for the correlation
        calculation.
    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    correlations : pandas.DataFrame
        Dataframe of correlation coefficients. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.
    n_obs : int
        Number of observations used to calculate the correlations, as
        returned by `hyswap.utils.filter_to_common_time`.

    Examples
    --------
    Calculate correlations between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_correlations([df1, df2], 'a')
        >>> results
             0    1
        0  1.0 -1.0
        1 -1.0  1.0
    """
    # handle the names of the dataframes
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of every dataframe once instead of once per
    # pair inside the nested loop
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate correlations between all pairs of dataframes in the list
    correlations = np.empty((n_dfs, n_dfs))
    for i, column_i in enumerate(columns):
        for j in range(i, n_dfs):
            # one 2x2 result holds both mirrored entries: [0, 1] is the
            # (i, j) coefficient and [1, 0] is the (j, i) coefficient, so
            # each unordered pair only needs a single corrcoef call
            pair = np.corrcoef(column_i, columns[j])
            correlations[j, i] = pair[1, 0]
            correlations[i, j] = pair[0, 1]
    # turn the correlations into a dataframe
    correlations = pd.DataFrame(
        correlations, index=df_names, columns=df_names)
    return correlations, n_obs
[docs]
def calculate_wasserstein_distance(df_list, data_column_name, df_names=None):
    """Calculate Wasserstein distance between dataframes in df_list.

    This function is designed to calculate the Wasserstein distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The Wasserstein distance is calculated using the
    `scipy.stats.wasserstein_distance` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar
    data_column_name : str
        Name of the column to use for the Wasserstein distance calculation.
    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    wasserstein_distances : pandas.DataFrame
        Dataframe of Wasserstein distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.
    n_obs : int
        Number of observations used to calculate the Wasserstein distance,
        as returned by `hyswap.utils.filter_to_common_time`.

    Examples
    --------
    Calculate Wasserstein distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_wasserstein_distance(
        ...     [df1, df2], 'a')
        >>> results
             0    1
        0  0.0  9.0
        1  9.0  0.0
    """
    # handle the names of the dataframes (done once; the result is never
    # None, so a second call after the loop would just return it unchanged)
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of every dataframe once instead of once per
    # pair inside the nested loop
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate distances between all pairs of dataframes in the list
    wasserstein_distances = np.empty((n_dfs, n_dfs))
    for i, column_i in enumerate(columns):
        for j in range(i, n_dfs):
            # the distance is symmetric in its two arguments, so compute
            # each unordered pair once and mirror it across the diagonal
            distance = stats.wasserstein_distance(column_i, columns[j])
            wasserstein_distances[i, j] = distance
            wasserstein_distances[j, i] = distance
    # turn the distances into a dataframe
    wasserstein_distances = pd.DataFrame(
        wasserstein_distances, index=df_names, columns=df_names)
    return wasserstein_distances, n_obs
[docs]
def calculate_energy_distance(df_list, data_column_name, df_names=None):
    """Calculate energy distance between dataframes in df_list.

    This function is designed to calculate the energy distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The energy distance is calculated using the
    `scipy.stats.energy_distance` function.

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar
    data_column_name : str
        Name of the column to use for the energy distance calculation.
    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    energy_distances : pandas.DataFrame
        Dataframe of energy distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.
    n_obs : int
        Number of observations used to calculate the energy distance, as
        returned by `hyswap.utils.filter_to_common_time`.

    Examples
    --------
    Calculate energy distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_energy_distance(
        ...     [df1, df2], 'a')
        >>> results
                  0         1
        0  0.000000  3.376389
        1  3.376389  0.000000
    """
    # handle the names of the dataframes (done once; the result is never
    # None, so a second call after the loop would just return it unchanged)
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of every dataframe once instead of once per
    # pair inside the nested loop
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate distances between all pairs of dataframes in the list
    energy_distances = np.empty((n_dfs, n_dfs))
    for i, column_i in enumerate(columns):
        for j in range(i, n_dfs):
            # the distance is symmetric in its two arguments, so compute
            # each unordered pair once and mirror it across the diagonal
            distance = stats.energy_distance(column_i, columns[j])
            energy_distances[i, j] = distance
            energy_distances[j, i] = distance
    # turn the distances into a dataframe
    energy_distances = pd.DataFrame(
        energy_distances, index=df_names, columns=df_names)
    return energy_distances, n_obs
[docs]
def _name_handling(df_list, df_names):
    """Private function to handle the names of the dataframes.

    Parameters
    ----------
    df_list : list
        List of dataframes to derive names for.
    df_names : list or None
        Names supplied by the caller. If not None, they are returned
        unchanged and df_list is not inspected.

    Returns
    -------
    df_names : list
        The supplied names, or one generated name per dataframe: the first
        value of its "site_no" column when that column exists and has at
        least one row, otherwise the position of the dataframe in df_list
        as a string.
    """
    # caller-supplied names always win
    if df_names is not None:
        return df_names
    names = []
    for position, df in enumerate(df_list):
        # a "site_no" column with zero rows has no first value to read, so
        # treat it like a missing column instead of raising IndexError
        if 'site_no' in df.columns and not df.empty:
            names.append(df['site_no'].iloc[0])
        else:
            names.append(str(position))
    return names