Source code for lore_sa.util

import numpy as np
import pandas as pd
import scipy.stats as st

from scipy.spatial.distance import jaccard
import warnings

from lore_sa.encoder_decoder import OneHotEnc


def vector2dict(x, feature_names):
    """
    Return a dictionary mapping feature name to value.

    :param x: list of values
    :param feature_names: feature names, ordered as in the instance x
    :return: dict of feature name : value pairs
    """
    return {k: v for k, v in zip(feature_names, x)}
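# A minimal usage sketch for vector2dict (the feature names and values below
# are hypothetical, not taken from the library):
#
#   >>> vector2dict([25, 1.0, 0.0], ['age', 'sex=male', 'sex=female'])
#   {'age': 25, 'sex=male': 1.0, 'sex=female': 0.0}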
def neuclidean(x, y):
    return 0.5 * np.var(x - y) / (np.var(x) + np.var(y))


def record2str(x, feature_names, numeric_columns, encdec=None):
    xd = vector2dict(x, feature_names)
    s = '{ '
    for att, val in xd.items():
        if att not in numeric_columns and val == 0.0:
            continue
        elif att in numeric_columns:
            s += '%s = %s, ' % (att, val)
        elif encdec is None:
            s += '%s = %s, ' % (att, val)
        else:
            if isinstance(encdec, OneHotEnc):
                att_split = att.split('=')
                s += '%s = %s, ' % (att_split[0], att_split[1])
    s = s[:-2] + ' }'
    return s


def multilabel2str(y, class_name):
    mstr = ', '.join([class_name[i] for i in range(len(y)) if y[i] == 1.0])
    return mstr


def multi_dt_predict(X, dt_list):
    nbr_labels = len(dt_list)
    Y = np.zeros((X.shape[0], nbr_labels))
    for l in range(nbr_labels):
        Y[:, l] = dt_list[l].predict(X)
    return Y


def mixed_distance_idx(x, y, idx, ddist=jaccard, cdist=neuclidean):
    # weighted mix of a continuous distance on x[:idx] and a discrete
    # distance on x[idx:], with weights proportional to the split sizes
    dim = len(x)
    xc, xd = x[:idx], x[idx:]
    yc, yd = y[:idx], y[idx:]
    wc = 1.0 * len(xc) / dim
    cd = cdist(xc, yc)
    wd = 1.0 * len(xd) / dim
    dd = ddist(xd, yd)
    return wd * dd + wc * cd


def calculate_feature_values(X, numeric_columns_index, categorical_use_prob=False,
                             continuous_fun_estimation=False, size=1000):
    feature_values = list()
    for i in range(X.shape[1]):
        values = X[:, i]
        unique_values = np.unique(values)
        if len(unique_values) == 1:
            new_values = np.array([unique_values[0]] * size)
        else:
            if i in numeric_columns_index:
                values = values.astype(float)  # np.float is removed in NumPy >= 1.24
                if continuous_fun_estimation:
                    new_values = get_distr_values(values, size)
                else:  # assume the values are gaussian
                    mu = float(np.mean(values))
                    sigma = float(np.std(values))
                    new_values = np.random.normal(mu, sigma, size)
                new_values = np.concatenate((values, new_values), axis=0)
            else:
                if categorical_use_prob:
                    diff_values, counts = np.unique(values, return_counts=True)
                    prob = 1.0 * counts / np.sum(counts)
                    new_values = np.random.choice(diff_values, size=size, p=prob)
                else:  # uniform distribution over the observed values
                    diff_values = unique_values
                    new_values = diff_values
        feature_values.append(new_values)
    return feature_values


def get_distr_values(x, size=1000):
    nbr_bins = int(np.round(estimate_nbr_bins(x)))
    name, params = best_fit_distribution(x, nbr_bins)
    dist = getattr(st, name)
    arg = params[:-2]
    loc = params[-2]
    scale = params[-1]
    start = dist.ppf(0.01, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.01, loc=loc, scale=scale)
    end = dist.ppf(0.99, *arg, loc=loc, scale=scale) if arg else dist.ppf(0.99, loc=loc, scale=scale)
    distr_values = np.linspace(start, end, size)
    return distr_values


# Distributions to check
DISTRIBUTIONS = [st.uniform, st.exponweib, st.expon, st.gamma, st.beta, st.alpha,
                 st.chi, st.chi2, st.laplace, st.lognorm, st.norm, st.powerlaw]


def freedman_diaconis(x):
    # Freedman-Diaconis rule: bin width h = 2 * IQR / n^(1/3)
    iqr = np.subtract(*np.percentile(x, [75, 25]))
    n = len(x)
    h = max(2.0 * iqr / n**(1.0 / 3.0), 1)
    k = np.ceil((np.max(x) - np.min(x)) / h)
    return k


def struges(x):
    # Sturges' rule: k = ceil(log2(n)) + 1
    n = len(x)
    k = np.ceil(np.log2(n)) + 1
    return k


def estimate_nbr_bins(x):
    if len(x) == 1:
        return 1
    k_fd = freedman_diaconis(x) if len(x) > 2 else 1
    k_struges = struges(x)
    if k_fd == float('inf') or np.isnan(k_fd):
        k_fd = np.sqrt(len(x))
    k = max(k_fd, k_struges)
    return k


# Create models from data
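# Illustrative sketch of the two bin estimators above (synthetic data, not
# part of the original module). Here Freedman-Diaconis gives
# ceil((5 - 1) / max(2 * 2 / 5**(1/3), 1)) = 2 bins, Sturges gives
# ceil(log2(5)) + 1 = 4 bins, and estimate_nbr_bins returns their maximum:
#
#   >>> x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
#   >>> freedman_diaconis(x)
#   2.0
#   >>> struges(x)
#   4.0
#   >>> estimate_nbr_bins(x)
#   4.0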
def best_fit_distribution(data, bins=200, ax=None):
    """Model data by finding the best-fitting distribution."""
    # Get histogram of original data
    y, x = np.histogram(data, bins=bins, density=True)
    x = (x + np.roll(x, -1))[:-1] / 2.0  # bin centers

    # Best holders
    best_distribution = st.norm
    best_params = (0.0, 1.0)
    best_sse = np.inf

    # Estimate distribution parameters from data
    for distribution in DISTRIBUTIONS:
        # Try to fit the distribution
        try:
            # Ignore warnings from data that can't be fit
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore')

                # fit dist to data
                params = distribution.fit(data)

                # Separate parts of parameters
                arg = params[:-2]
                loc = params[-2]
                scale = params[-1]

                # Calculate fitted PDF and error with fit in distribution
                pdf = distribution.pdf(x, *arg, loc=loc, scale=scale)
                sse = np.sum(np.power(y - pdf, 2.0))

                # if an axis is passed in, add the fitted PDF to the plot
                try:
                    if ax:
                        pd.Series(pdf, x).plot(ax=ax)
                except Exception:
                    pass

                # keep this distribution if it fits better
                if best_sse > sse > 0:
                    best_distribution = distribution
                    best_params = params
                    best_sse = sse
        except Exception:
            pass

    return best_distribution.name, best_params
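# Usage sketch for best_fit_distribution (synthetic sample; which distribution
# wins depends on the data, so the result below is indicative only):
#
#   >>> data = np.random.normal(loc=0.0, scale=1.0, size=500)
#   >>> name, params = best_fit_distribution(data, bins=50)
#   >>> dist = getattr(st, name)  # e.g. 'norm', with params ending in (loc, scale)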
# def amp2math(c):
#     if '&le;' in c:
#         idx = c.find('&le;')
#         cnew = '%s%s%s' % (c[:idx], '<=', c[idx + 4:])
#         return cnew
#     elif '&lt;' in c:
#         idx = c.find('&lt;')
#         cnew = '%s%s%s' % (c[:idx], '<', c[idx + 4:])
#         return cnew
#     elif '&gl;' in c:
#         idx = c.find('&gl;')
#         cnew = '%s%s%s' % (c[:idx], '>=', c[idx + 4:])
#         return cnew
#     elif '&gt;' in c:
#         idx = c.find('&gt;')
#         cnew = '%s%s%s' % (c[:idx], '>', c[idx + 4:])
#         return cnew
#     return c
#
#
# def math2amp(c):
#     if '<=' in c:
#         idx = c.find('<=')
#         cnew = '%s%s%s' % (c[:idx], '&le;', c[idx + 2:])
#         return cnew
#     elif '<' in c:
#         idx = c.find('<')
#         cnew = '%s%s%s' % (c[:idx], '&lt;', c[idx + 1:])
#         return cnew
#     elif '>=' in c:
#         idx = c.find('>=')
#         cnew = '%s%s%s' % (c[:idx], '&gl;', c[idx + 2:])
#         return cnew
#     elif '>' in c:
#         idx = c.find('>')
#         cnew = '%s%s%s' % (c[:idx], '&gt;', c[idx + 1:])
#         return cnew
#     return c
def sigmoid(x, x0=0.5, k=10.0, L=1.0):
    """
    A logistic function or logistic curve: a common "S"-shaped (sigmoid) curve.

    :param x: value to transform
    :param x0: the x-value of the sigmoid's midpoint
    :param k: the steepness of the curve
    :param L: the curve's maximum value
    :return: sigmoid of x
    """
    return L / (1.0 + np.exp(-k * (x - x0)))
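# Worked example (the values follow directly from the formula above):
#
#   >>> sigmoid(0.5)             # at the midpoint x0, the curve is at L / 2
#   0.5
#   >>> round(sigmoid(1.0), 4)   # well past the midpoint, it approaches L
#   0.9933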
def nmeandev(x, y):
    # normalized mean deviation
    return np.mean(np.abs(x - y) / np.max([np.abs(x), np.abs(y)], axis=0))
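# Illustrative sketch for nmeandev (assumed inputs, not from the module): the
# per-element deviations are |x - y| / max(|x|, |y|) = [0.5, 0.5], so the
# normalized mean deviation is 0.5:
#
#   >>> nmeandev(np.array([1.0, 2.0]), np.array([2.0, 4.0]))
#   0.5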