Source code for hyswap.similarity

"""Similarity measures for hyswap."""

import numpy as np
import pandas as pd
from scipy import stats
from hyswap.utils import filter_to_common_time



[docs]
def calculate_correlations(df_list, data_column_name, df_names=None):
    """Calculate Pearson correlations between dataframes in df_list.

    This function is designed to calculate the Pearson correlation
    coefficients between dataframes in df_list. The dataframes in df_list are
    expected to have the same columns. The correlation coefficients are
    calculated using the `numpy.corrcoef` function, once for every ordered
    pair of dataframes (including each dataframe with itself).

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column containing data to use for the correlation
        calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    correlations : pandas.DataFrame
        Dataframe of correlation coefficients. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the correlations, as
        returned by `filter_to_common_time`.

    Examples
    --------
    Calculate correlations between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_correlations([df1, df2], 'a')
        >>> results
             0    1
        0  1.0 -1.0
        1 -1.0  1.0
    """
    # resolve the row/column labels from the dataframes as they were passed in
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of each dataframe once rather than once per
    # pair inside the nested loop below
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate correlations between all pairs of dataframes in the list
    correlations = np.empty((n_dfs, n_dfs))
    for i, col_i in enumerate(columns):
        for j, col_j in enumerate(columns):
            # np.corrcoef returns a 2x2 matrix for two inputs; the
            # off-diagonal element is the coefficient between them
            correlations[i, j] = np.corrcoef(col_i, col_j)[0, 1]
    # turn the correlations into a dataframe
    correlations = pd.DataFrame(
        correlations, index=df_names, columns=df_names)
    return correlations, n_obs




[docs]
def calculate_wasserstein_distance(df_list, data_column_name, df_names=None):
    """Calculate Wasserstein distance between dataframes in df_list.

    This function is designed to calculate the Wasserstein distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The Wasserstein distance is calculated using the
    `scipy.stats.wasserstein_distance` function, once for every ordered pair
    of dataframes (including each dataframe with itself).

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column to use for the Wasserstein distance calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    wasserstein_distances : pandas.DataFrame
        Dataframe of Wasserstein distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the Wasserstein distance,
        as returned by `filter_to_common_time`.

    Examples
    --------
    Calculate Wasserstein distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_wasserstein_distance(
        ...     [df1, df2], 'a')
        >>> results
             0    1
        0  0.0  9.0
        1  9.0  0.0
    """
    # resolve the row/column labels once; after this call df_names is never
    # None, so there is no need to resolve them again further down
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of each dataframe once rather than once per
    # pair inside the nested loop below
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate distances between all pairs of dataframes in the list
    wasserstein_distances = np.empty((n_dfs, n_dfs))
    for i, col_i in enumerate(columns):
        for j, col_j in enumerate(columns):
            wasserstein_distances[i, j] = stats.wasserstein_distance(
                col_i, col_j)
    # turn the distances into a dataframe
    wasserstein_distances = pd.DataFrame(
        wasserstein_distances, index=df_names, columns=df_names)
    return wasserstein_distances, n_obs




[docs]
def calculate_energy_distance(df_list, data_column_name, df_names=None):
    """Calculate energy distance between dataframes in df_list.

    This function is designed to calculate the energy distance between
    dataframes in df_list. The dataframes in df_list are expected to have the
    same columns. The energy distance is calculated using the
    `scipy.stats.energy_distance` function, once for every ordered pair of
    dataframes (including each dataframe with itself).

    Parameters
    ----------
    df_list : list
        List of dataframes. The dataframes are expected to have the same
        columns. Likely inputs are the output of a function like
        dataretrieval.nwis.get_dv() or similar

    data_column_name : str
        Name of the column to use for the energy distance calculation.

    df_names : list, optional
        List of names for the dataframes in df_list. If provided, the names
        will be used to label the rows and columns of the output array. If
        not provided, the column "site_no" will be used if available, if it is
        not available, the index of the dataframe in the list will be used.

    Returns
    -------
    energy_distances : pandas.DataFrame
        Dataframe of energy distances. The rows and columns are
        labeled with the names of the dataframes in df_list as provided
        by df_names argument.

    n_obs : int
        Number of observations used to calculate the energy distance, as
        returned by `filter_to_common_time`.

    Examples
    --------
    Calculate energy distances between two synthetic dataframes.

    .. doctest::

        >>> df1 = pd.DataFrame({'a': np.arange(10), 'b': np.arange(10)})
        >>> df2 = pd.DataFrame({'a': -1*np.arange(10), 'b': np.arange(10)})
        >>> results, n_obs = similarity.calculate_energy_distance(
        ...     [df1, df2], 'a')
        >>> results
                  0         1
        0  0.000000  3.376389
        1  3.376389  0.000000
    """
    # resolve the row/column labels once; after this call df_names is never
    # None, so there is no need to resolve them again further down
    df_names = _name_handling(df_list, df_names)
    # preprocess dataframe list so they have the same index/times
    df_list, n_obs = filter_to_common_time(df_list)
    # pull the data column out of each dataframe once rather than once per
    # pair inside the nested loop below
    columns = [df[data_column_name] for df in df_list]
    n_dfs = len(columns)
    # calculate distances between all pairs of dataframes in the list
    energy_distances = np.empty((n_dfs, n_dfs))
    for i, col_i in enumerate(columns):
        for j, col_j in enumerate(columns):
            energy_distances[i, j] = stats.energy_distance(col_i, col_j)
    # turn the distances into a dataframe
    energy_distances = pd.DataFrame(
        energy_distances, index=df_names, columns=df_names)
    return energy_distances, n_obs




[docs]
def _name_handling(df_list, df_names):
    """Private function to handle the names of the dataframes.

    Parameters
    ----------
    df_list : list
        List of dataframes that need a label each.

    df_names : list or None
        Names supplied by the caller. If not None, they are returned
        unchanged and df_list is not inspected.

    Returns
    -------
    df_names : list
        The supplied names, or a newly built list with one entry per
        dataframe: the first value of its "site_no" column when that column
        exists and the dataframe has at least one row, otherwise the
        position of the dataframe in df_list as a string.
    """
    if df_names is not None:
        return df_names
    df_names = []
    for i, df in enumerate(df_list):
        # an empty dataframe has no first row to read a site number from, so
        # it gets the positional label instead of raising an IndexError
        if 'site_no' in df.columns and len(df) > 0:
            df_names.append(df['site_no'].iloc[0])
        else:
            df_names.append(str(i))
    return df_names