Principal Component Analysis
Given a collection of points in a multidimensional space, a "best-fitting" line can be defined as the one that minimizes the average squared distance from the points to the line. The next best-fitting line can be chosen similarly from the directions perpendicular to the first. Repeating this process yields an orthogonal basis in which the individual dimensions of the data are uncorrelated. These basis vectors are called principal components, and several related procedures are collectively known as principal component analysis (PCA).
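To make this concrete, the components can be computed directly with NumPy by centering the data and taking the eigenvectors of its covariance matrix, sorted by decreasing eigenvalue. This is a minimal sketch under those definitions (the toy data and variable names are invented for the example), not the scikit-learn routine used below:

import numpy as np

rng = np.random.default_rng(0)

# Toy data: 200 correlated 2-D points (illustrative only).
X = rng.normal(size=(200, 2)) @ np.array([[2.0, 0.0], [1.2, 0.5]])

# Center each feature, since PCA operates on deviations from the mean.
X_centered = X - X.mean(axis=0)

# Eigendecomposition of the covariance matrix; eigh suits symmetric matrices.
cov = np.cov(X_centered, rowvar=False)
eigenvalues, eigenvectors = np.linalg.eigh(cov)

# Sort components by decreasing variance (eigenvalue).
order = np.argsort(eigenvalues)[::-1]
components = eigenvectors[:, order]

# Project onto the principal components; in the new basis the
# coordinates are uncorrelated, so their covariance is (nearly) diagonal.
scores = X_centered @ components
print(np.round(np.cov(scores, rowvar=False), 6))

Printing the covariance of the projected coordinates shows an (almost) diagonal matrix, confirming that the dimensions are uncorrelated in the principal-component basis.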
Python Example
""" dimensionality_reduction_using_scikit_learn.py """ # Import needed libraries. import numpy as np import matplotlib.pyplot as plotlib from sklearn import datasets from sklearn.model_selection import train_test_split from sklearn.decomposition import PCA from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.neighbors import KNeighborsClassifier from sklearn.pipeline import make_pipeline from sklearn.preprocessing import StandardScaler # Set parameters. random_state = 0 test_data_proportion = 0.25 number_of_neighbors = 5 number_of_components = 2 # Load Digits dataset X, y = datasets.load_digits(return_X_y=True) # Split data into X/y train/test X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_data_proportion, stratify=y, random_state=random_state) dim = len(X[0]) n_classes = len(np.unique(y)) # Reduce dimension to 2 with PCA pca = make_pipeline(StandardScaler(), PCA(n_components=number_of_components, random_state=random_state)) # Reduce dimension to 2 with LinearDiscriminantAnalysis lda = make_pipeline(StandardScaler(), LinearDiscriminantAnalysis(n_components=number_of_components)) # Instantiate a k nearest neighbors classifier for evaluation. knn = KNeighborsClassifier(n_neighbors=number_of_neighbors) # Make a list of the methods to be compared dim_reduction_methods = [('PCA', pca), ('LDA', lda)] # plt.figure() for i, (name, model) in enumerate(dim_reduction_methods): plotlib.figure() # plt.subplot(1, 3, i + 1, aspect=1) # Fit the method's model model.fit(X_train, y_train) # Fit a nearest neighbor classifier on the embedded training set knn.fit(model.transform(X_train), y_train) # Compute the nearest neighbor accuracy on the embedded test set acc_knn = knn.score(model.transform(X_test), y_test) # Embed the data set in 2 dimensions using the fitted model X_embedded = model.transform(X) # Plot the projected points and show the evaluation score plotlib.scatter(X_embedded[:, 0], X_embedded[:, 1], c=y, s=30, cmap='Set1') plotlib.title("{}, KNN (k={})\nTest accuracy = {:.2f}".format(name, number_of_neighbors, acc_knn)) plotlib.show()
Running the script displays two scatter plots, one for the PCA embedding and one for the LDA embedding of the digits, each titled with the KNN test accuracy obtained in that two-dimensional space.