Source code for datascience.util

"""Utility functions"""

__all__ = ['percentile', 'plot_cdf_area', 'plot_normal_cdf', 'table_apply',
           'minimize']


import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('agg', warn=False)
import matplotlib.pyplot as plt
from scipy import stats
from scipy import optimize
import functools

[docs]def percentile(p, arr=None): """Returns the pth percentile of the input array (the value that is at least as great as p% of the values in the array) If arr is not provided, percentile returns itself curried with p >>> percentile(67, [1, 3, 5, 9]) 9 >>> percentile(66, [1, 3, 5, 9]) 5 >>> f = percentile(66) >>> f([1, 3, 5, 9]) 5 """ if arr is None: return lambda arr: percentile(p, arr) return np.percentile(arr, p, interpolation='higher')
[docs]def plot_normal_cdf(rbound=None, lbound=None, mean=0, sd=1): """Plots a normal curve with specified parameters and area below curve shaded between ``lbound`` and ``rbound``. Args: ``rbound`` (numeric): right boundary of shaded region ``lbound`` (numeric): left boundary of shaded region; by default is negative infinity ``mean`` (numeric): mean/expectation of normal distribution ``sd`` (numeric): standard deviation of normal distribution """ shade = rbound is not None or lbound is not None shade_left = rbound is not None and lbound is not None inf = 3.5 * sd step = 0.1 rlabel = rbound llabel = lbound if rbound is None: rbound = inf + mean rlabel = "$\infty$" if lbound is None: lbound = -inf + mean llabel = "-$\infty$" pdf_range = np.arange(-inf + mean, inf + mean, step) plt.plot(pdf_range, stats.norm.pdf(pdf_range, loc=mean, scale=sd), color='k', lw=1) cdf_range = np.arange(lbound, rbound + step, step) if shade: plt.fill_between(cdf_range, stats.norm.pdf(cdf_range, loc=mean, scale=sd), color='gold') if shade_left: cdf_range = np.arange(-inf+mean, lbound + step, step) plt.fill_between(cdf_range, stats.norm.pdf(cdf_range, loc=mean, scale=sd), color='darkblue') plt.ylim(0, stats.norm.pdf(0, loc=0, scale=sd) * 1.25) plt.xlabel('z') plt.ylabel('$\phi$(z)', rotation=90) plt.title("Normal Curve ~ ($\mu$ = {0}, $\sigma$ = {1}) " "{2} < z < {3}".format(mean, sd, llabel, rlabel), fontsize=16) plt.show() # Old name
plot_cdf_area = plot_normal_cdf
[docs]def table_apply(table, func, subset=None): """Applies a function to each column and returns a Table. Uses pandas `apply` under the hood, then converts back to a Table Parameters ---------- table : instance of Table The table to apply your function to func : function Any function that will work with DataFrame.apply subset : list | None A list of columns to apply the function to. If None, function will be applied to all columns in table Returns ------- tab : instance of Table A table with the given function applied. It will either be the shape == shape(table), or shape (1, table.shape[1]) """ from . import Table df = table.to_df() if subset is not None: # Iterate through columns subset = np.atleast_1d(subset) if any([i not in df.columns for i in subset]): err = np.where([i not in df.columns for i in subset])[0] err = "Column mismatch: {0}".format( [subset[i] for i in err]) raise ValueError(err) for col in subset: df[col] = df[col].apply(func) else: df = df.apply(func) if isinstance(df, pd.Series): # Reshape it so that we can easily convert back df = pd.DataFrame(df).T tab = Table.from_df(df) return tab
[docs]def minimize(f, start=None, **vargs): """Minimize a function f of one or more arguments. Returns either: (a) the minimizing argument of a one-argument function (b) an array of minimizing arguments of a multi-argument function """ if start is None: arg_count = f.__code__.co_argcount assert arg_count > 0, "Please pass starting values explicitly" start = [0] * arg_count if not hasattr(start, '__len__'): start = [start] @functools.wraps(f) def wrapper(args): return f(*args) result = optimize.minimize(wrapper, start, **vargs) rounded = np.round(result.x, 7) if len(start) == 1: return rounded.item(0) else: return rounded