Analyzing Iris Data Set with Scikit-learn

The following code demonstrate the use of python Scikit-learn to analyze/categorize the iris data set used commonly in machine learning. This post also highlight several of the methods and modules available for various machine learning studies.

While the code is not very lengthy, it did cover quite a comprehensive area as below:

Data preprocessing: data encoding, scaling.
Feature decomposition/dimension reduction with PCA. PCA is not needed or applicable to the Iris data set as the number of features is only 4. Nevertheless, it is shown here as a tool.
Splitting test and training set.
Classifier: Logistic Regression. Only logistic regression is shown here. Random forest and SVM can also be used for this dataset.
GridSearch: for parameters sweeping.
Pipeline: Pipeline which combined all the steps + gridsearch with Pipeline
Scoring metrics, Cross Validation, confusion matrix.

import sys, re, time, datetime, os
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import plt

from sklearn.datasets import load_iris
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV

from sklearn.metrics import accuracy_score, confusion_matrix

def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """
        pretty print for confusion matrixes
        Code from: https://gist.github.com/zachguo/10296432

    """
    columnwidth = max([len(x) for x in labels]+[5]) # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print "    " + empty_cell,
    for label in labels:
        print "%{0}s".format(columnwidth) % label,
    print
    # Print rows
    for i, label1 in enumerate(labels):
        print "    %{0}s".format(columnwidth) % label1,
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] &gt; hide_threshold else empty_cell
            print cell,
        print

def pca_2component_scatter(data_df, predictors, legend):
    """
        outlook of data set by decomposing data to only 2 pca components.
        do: scaling --&gt; either maxmin or stdscaler

    """

    print 'PCA plotting'

    data_df[predictors] =  StandardScaler().fit_transform(data_df[predictors])

    pca_components = ['PCA1','PCA2'] #make this exist then insert the fit transform
    pca = PCA(n_components = 2)
    for n in pca_components: data_df[n] = ''
    data_df[pca_components] = pca.fit_transform(data_df[predictors])

    sns.lmplot('PCA1', 'PCA2',
       data=data_df,
       fit_reg=False,
       hue=legend,
       scatter_kws={"marker": "D",
                    "s": 100})
    plt.show()

if __name__ == "__main__":

    iris =  load_iris()
    target_df = pd.DataFrame(data= iris.data, columns=iris.feature_names )

    #combining the categorial output
    target_df['species'] = pd.Categorical.from_codes(codes= iris.target,categories = iris.target_names)
    target_df['species_coded'] = iris.target #encoding --&gt; as provided in iris dataset

    print '\nList of features and output'
    print target_df.columns.tolist()

    print '\nOutlook of data'
    print target_df.head()

    print "\nPrint out any missing data for each rows. "
    print np.where(target_df.isnull())

    predictors =[ n for n in target_df.columns.tolist() if n not in  ['species','species_coded']]
    target = 'species_coded' #use the encoded version y-train, y-test

    print '\nPCA plotting'
    pca_2component_scatter(target_df, predictors, 'species')

    print "\nSplit train test set."
    X_train, X_test, y_train, y_test = train_test_split(target_df[predictors], target_df[target], test_size=0.25, random_state=42)
    #test_size -- should be between 0.0 and 1.0 and represent the proportion of the dataset to include in the test split
    #random state -- Pseudo-random number generator state used for random sampling.(any particular number use?
    print "Shape of training set: {}, Shape of test set: {}".format(X_train.shape, X_test.shape)

    print "\nCreating pipeline with the estimators"
    estimators = [
                    ('standardscaler',StandardScaler()),
                    ('reduce_dim', PCA()),
                    ('clf', LogisticRegression())#the logistic regression use from ML teset not part of actual test. --&gt; may have to change the way it is is done
                ]

    #Parameters of the estimators in the pipeline can be accessed using the &lt;estimator&gt;__&lt;parameter&gt; syntax:
    pipe = Pipeline(estimators)

    #input the grid search
    params = dict(reduce_dim__n_components=[2, 3, 4], clf__C=[0.1, 10, 100,1000])
    grid_search = GridSearchCV(pipe, param_grid=params, cv =5)

    grid_search.fit(X_train, y_train)

    print '\nGrid Search Results:'
    gridsearch_result = pd.DataFrame(grid_search.cv_results_)
    gridsearch_display_cols = ['param_' + n for n in params.keys()] + ['mean_test_score']
    print gridsearch_result[gridsearch_display_cols]
    print '\nBest Parameters: ', grid_search.best_params_
    print '\nBest Score: ', grid_search.best_score_

    print "\nCross validation Performance on the training set with optimal parms"
    pipe.set_params(clf__C=100)
    pipe.set_params(reduce_dim__n_components=4)#how much PCA should reduce??
    scores = cross_val_score(pipe, X_train, y_train, cv=5)
    print scores

    print "\nPerformance on the test set with optimal parms:"
    pipe.fit(X_train, y_train)
    predicted = pipe.predict(X_test)

    print 'Acuracy Score on test set: {}'.format(accuracy_score(y_test, predicted))

    print "\nCross tab(confusion matrix) on results:"

    print_cm(confusion_matrix(y_test, predicted),iris.target_names)

Output:

Output