# Source code for pmareport.predictors

# -*- coding: utf-8 -*-
'''
The model used to predict appointment duration is a decision tree.
The model is evaluated by the percentage of predicted times that are within
a threshold (5 minutes by default) of the actual duration.

The class `DurationPredictor` splits the data into testing and training, builds
the model (using scikit-learn's implementation of decision tree)
and evaluates the model both on a cross validation split of the training set
and on the test set.

`DurationPredictor` also includes functionality to
turn non-integer categorical features into ints, which scikit-learn's decision
tree implementation requires.
'''

import pandas as pd
from sklearn import tree
from sklearn import cross_validation
import numpy as np


def read_data(fp='../data/pmadata.csv'):
    '''
    Read clinic data from a csv into a pandas dataframe.

    The argument is handed straight to `pandas.read_csv`.

    :param str fp: the file path of the csv file

    :returns: the contents of the csv file
    :rtype: dataframe
    '''
    return pd.read_csv(fp)


def percent_within(y_true, y_pred, thresh=5):
    '''
    Calculate the percentage of predictions that are within
    `thresh` of the true value.

    Both inputs are converted to numpy arrays before they are compared,
    so plain lists and tuples are accepted as well as arrays and pandas
    Series. Values are compared position by position.

    :param array-like y_true: the true values
    :param array-like y_pred: the predicted values
    :param float thresh:
        the threshold for a close prediction; the absolute error must be
        strictly smaller than `thresh` to count

    :returns:
        the percent of predictions within the threshold from the true value
    :rtype: float
    '''
    # Without the conversion, `list - list` raises TypeError.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    is_close = np.abs(y_true - y_pred) < thresh
    return np.sum(is_close) / float(len(y_true)) * 100


class DurationPredictor(object):
    '''
    A model to predict the duration of an appointment.

    For example, let's make a dataframe with random data in columns `feat1`
    and `response`.

    >>> df = pd.DataFrame(np.random.randn(30,2), columns=['feat1', 'response'])

    We add a column `feat2` with categorical values ('a' or 'b').

    >>> df['feat2'] = np.random.choice(['a', 'b'], 30)

    Let's make a `DurationPredictor` object from our example dataframe.

    >>> dec_pred = DurationPredictor(
    ...     df=df,
    ...     feat_cols=['feat1', 'feat2'],
    ...     response_col='response'
    ...     )

    To turn `feat2` into a column of ints
    (which scikit-learn's decision tree implementation requires),
    we use `make_int`.

    >>> dec_pred.make_int(col='feat2')

    We split our data set into train and test with 10% left out to test.

    >>> dec_pred.train_test(test_size=0.1)

    Now let's make the model, a decision tree of maximum depth 3,
    and get its average score on a 10-fold cross validation split.
    The score is the percentage of predictions within 5 minutes
    of the actual value.

    >>> dec_pred.make_model(max_depth=3)
    >>> cv_score = dec_pred.cv_evalution(thresh=5)
    >>> cv_score >= 0 and cv_score <= 100
    True

    Fit the model on the full training set and evaluate it on the test set.

    >>> test_score = dec_pred.fit()
    >>> test_score >= 0 and test_score <= 100
    True

    :param dataframe df: the data
    :param list feat_cols: a list of the names of the feature columns
    :param str response_col: the name of the response column

    '''
    def __init__(self, df, feat_cols, response_col):
        self.df = df
        # Keep a private copy: `make_int` edits this list, and the caller's
        # own list should not change behind their back.
        self.feat_cols = list(feat_cols)
        self.response_col = response_col
        # Maps an original column name to the callable that encodes its
        # values as ints (filled in by `make_int`).
        self.int_funcs = {}

    def make_int(self, col):
        '''
        Encode categorical variables of type other than int
        as ints for input into the decision tree.

        Categories are numbered 0, 1, 2, ... in the order in which they
        first appear in the column, so the encoding is the same on every
        run. The encoded values are stored in a new dataframe column named
        `col + 'i'`, which replaces `col` in the list of feature columns.
        The encoding function is kept in `self.int_funcs[col]`.

        :param str col: the name of the column with categorical values

        :raises ValueError: if `col` is not one of the feature columns
        '''
        # Number categories by first appearance, with a dict for
        # constant-time lookup per row.
        codes = {}
        for value in self.df[col]:
            codes.setdefault(value, len(codes))

        def int_func(value):
            try:
                return codes[value]
            except KeyError:
                # ValueError is what the earlier list.index-based lookup
                # raised for a value that was not in the column.
                raise ValueError('%r is not a known category' % (value,))

        encoded = self.df[col].apply(int_func)
        # Everything that can fail happens before the dataframe is modified.
        self.feat_cols.remove(col)
        self.df[col + 'i'] = encoded
        self.feat_cols.append(col + 'i')
        self.int_funcs[col] = int_func

    def train_test(self, test_size=0.1):
        '''
        Split the data into train and test sets.

        The split is stored in `self.train` / `self.test`, and the feature
        and response parts in `self.Xtrain`, `self.ytrain`, `self.Xtest`
        and `self.ytest`.

        :param float test_size: the percentage of rows to leave out as test
        '''

        self.train, self.test = cross_validation.train_test_split(
            self.df,
            test_size=test_size
            )
        self.Xtrain = self.train[self.feat_cols]
        self.ytrain = self.train[self.response_col]
        self.Xtest = self.test[self.feat_cols]
        self.ytest = self.test[self.response_col]

    def make_model(self, max_depth=3):
        '''
        Make the model, a decision tree with maximum depth `max_depth`.

        The unfitted model is stored in `self.model`.

        :param max_depth: the maximum depth of the decision tree
        '''
        self.model = tree.DecisionTreeRegressor(max_depth=max_depth)

    def cv_evalution(self, n_folds=10, thresh=5):
        '''
        Evaluate the model on a cross validation split
        of the training data with `n_folds` number of folds.
        The metric is the percent of predictions within `thresh`
        of the true value.

        `train_test` and `make_model` must have been called first.
        `self.model` is refitted on every fold.

        :param int n_folds: the number of folds for the cross validation
        :param float thresh:
            the threshold for considering a prediction close to the true value

        :returns: the average of metric values over the folds
        :rtype: float
        '''
        cv = cross_validation.KFold(len(self.train), n_folds=n_folds)
        score_list = []
        # Each fold is a pair of positional index arrays into the training set.
        for train, test in cv:
            cvXtrain = self.Xtrain.iloc[train]
            cvXtest = self.Xtrain.iloc[test]
            cvytrain = self.ytrain.iloc[train]
            cvytest = self.ytrain.iloc[test]
            self.model.fit(cvXtrain, cvytrain)
            pred = self.model.predict(cvXtest)
            # Use the caller's threshold rather than a fixed 5.
            score = percent_within(
                y_true=cvytest,
                y_pred=pred,
                thresh=thresh
                )
            score_list.append(score)
        return np.mean(score_list)

    # Correctly spelled alias; the original name is kept for existing callers.
    cv_evaluation = cv_evalution

    def fit(self, thresh=5):
        '''
        Fit the model on the training set and evaluate it
        on the test set. The metric is the percent of
        predictions within `thresh` of the true value.

        `train_test` and `make_model` must have been called first.

        :param float thresh:
            the threshold for considering a prediction close to the true value

        :returns: the score of the model on the test set
        :rtype: float
        '''
        self.model.fit(self.Xtrain, self.ytrain)
        predictions = self.model.predict(self.Xtest)
        score = percent_within(
            y_true=self.ytest,
            y_pred=predictions,
            thresh=thresh
            )
        return score