from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# The digits dataset
digits = datasets.load_digits()

# Flatten the images to apply the classifier
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a support vector classifier
classifier = svm.SVC(gamma=0.001)

# Split into train and test subsets (50% each)
X_train, X_test, y_train, y_test = train_test_split(
    data, digits.target, test_size=0.5, shuffle=False)

# Learn the digits on the first half of the data
classifier.fit(X_train, y_train)

# Test on a random image from the second half of the data
n = np.random.randint(int(n_samples/2), n_samples)
plt.imshow(digits.images[n], cmap=plt.cm.gray_r, interpolation='nearest')
print('Predicted: ' + str(classifier.predict(digits.data[n:n+1])[0]))
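# Optional check (a sketch, not part of the original script): instead of a
# single random image, evaluate the digits classifier on the whole held-out
# half using the metrics module imported at the top. The helper variable
# 'predicted' is introduced only for this check.
predicted = classifier.predict(X_test)
print(metrics.classification_report(y_test, predicted))
print(metrics.confusion_matrix(y_test, predicted))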
# Select Option by Number
# 0 = Linear, 1 = Quadratic, 2 = Inner Target
# 3 = Moons, 4 = Concentric Circles, 5 = Distinct Clusters
select_option = 5

# Generate data
data_options = ['linear','quadratic','target','moons','circles','blobs']
option = data_options[select_option]

# Number of data points
n = 2000
X = np.random.random((n,2))
mixing = 0.0   # add random mixing element to data
xplot = np.linspace(0,1,100)

if option=='linear':
    y = np.array([False if (X[i,0]+X[i,1])>=(1.0+mixing/2-np.random.rand()*mixing)
                  else True for i in range(n)])
    yplot = 1-xplot
elif option=='quadratic':
    y = np.array([False if X[i,0]**2>=X[i,1]+(np.random.rand()-0.5)*mixing
                  else True for i in range(n)])
    yplot = xplot**2
elif option=='target':
    y = np.array([False if (X[i,0]-0.5)**2+(X[i,1]-0.5)**2<=0.1
                  +(np.random.rand()-0.5)*0.2*mixing
                  else True for i in range(n)])
    j = False
    yplot = np.empty(100)
    for i,x in enumerate(xplot):
        r = 0.1-(x-0.5)**2
        if r<=0:
            yplot[i] = np.nan
        else:
            j = not j  # plot both sides of circle
            yplot[i] = (2*j-1)*np.sqrt(r)+0.5
elif option=='moons':
    X, y = datasets.make_moons(n_samples=n, noise=0.05)
    yplot = xplot*0.0
elif option=='circles':
    X, y = datasets.make_circles(n_samples=n, noise=0.05, factor=0.5)
    yplot = xplot*0.0
elif option=='blobs':
    X, y = datasets.make_blobs(n_samples=n, centers=[[-5,3],[5,-3]], cluster_std=2.0)
    yplot = xplot*0.0

# Plot the generated data
plt.scatter(X[y>0.5,0], X[y>0.5,1], color='blue', marker='^', label='True')
plt.scatter(X[y<0.5,0], X[y<0.5,1], color='red', marker='x', label='False')
if option not in ['moons','circles','blobs']:
    plt.plot(xplot, yplot, 'k.', label='Division')
plt.legend()
plt.savefig(str(select_option)+'.png')

# Split into train and test subsets (50% each)
XA, XB, yA, yB = train_test_split(X, y, test_size=0.5, shuffle=False)

# Plot classification results on the test set
def assess(P):
    plt.figure()
    plt.scatter(XB[P==1,0], XB[P==1,1], marker='^', color='blue', label='True')
    plt.scatter(XB[P==0,0], XB[P==0,1], marker='x', color='red', label='False')
    plt.scatter(XB[P!=yB,0], XB[P!=yB,1], marker='s', color='orange',
                alpha=0.5, label='Incorrect')
    if option not in ['moons','circles','blobs']:
        plt.plot(xplot, yplot, 'k.', label='Division')
    plt.legend()

# Supervised Classification

# Logistic Regression
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression(solver='lbfgs')
lr.fit(XA, yA)
yP = lr.predict(XB)
assess(yP)

# Naïve Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(XA, yA)
yP = nb.predict(XB)
assess(yP)

# Stochastic Gradient Descent
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(loss='modified_huber', shuffle=True, random_state=101)
sgd.fit(XA, yA)
yP = sgd.predict(XB)
assess(yP)

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(XA, yA)
yP = knn.predict(XB)
assess(yP)

# Decision Tree
from sklearn.tree import DecisionTreeClassifier
dtree = DecisionTreeClassifier(max_depth=10, random_state=101,
                               max_features=None, min_samples_leaf=5)
dtree.fit(XA, yA)
yP = dtree.predict(XB)
assess(yP)

# Random Forest
from sklearn.ensemble import RandomForestClassifier
rfm = RandomForestClassifier(n_estimators=70, oob_score=True, n_jobs=1,
                             random_state=101, max_features=None,
                             min_samples_leaf=3)
rfm.fit(XA, yA)
yP = rfm.predict(XB)
assess(yP)

# Support Vector Classifier
# (note: this reassigns the name svm, shadowing the svm module imported above)
from sklearn.svm import SVC
svm = SVC(gamma='scale', C=1.0, random_state=101)
svm.fit(XA, yA)
yP = svm.predict(XB)
assess(yP)

# Neural Network
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, max_iter=200,
                    activation='relu', hidden_layer_sizes=(10,30,10),
                    random_state=1, shuffle=True)
clf.fit(XA, yA)
yP = clf.predict(XB)
assess(yP)

# Unsupervised Classification

# K-Means Clustering
from sklearn.cluster import KMeans
km = KMeans(n_clusters=2)
km.fit(XA)
yP = km.predict(XB)
# Arbitrary labels with unsupervised clustering may need to be reversed
if len(XB[yP!=yB]) > n/4:
    yP = 1 - yP
assess(yP)

# Gaussian Mixture Model
from sklearn.mixture import GaussianMixture
gmm = GaussianMixture(n_components=2)
gmm.fit(XA)
yP = gmm.predict_proba(XB)  # produces class probabilities
# Arbitrary labels with unsupervised clustering may need to be reversed
if len(XB[np.round(yP[:,0])!=yB]) > n/4:
    yP = 1 - yP
assess(np.round(yP[:,0]))

# Spectral Clustering
from sklearn.cluster import SpectralClustering
sc = SpectralClustering(n_clusters=2, eigen_solver='arpack',
                        affinity='nearest_neighbors')
# No separate fit and predict calls; fit and predict on the same dataset
yP = sc.fit_predict(XB)
# Arbitrary labels with unsupervised clustering may need to be reversed
if len(XB[yP!=yB]) > n/4:
    yP = 1 - yP
assess(yP)

plt.show()
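# Optional summary (a sketch, assuming the fitted models above are still in
# scope): print test-set accuracy for each supervised classifier so the
# assess() plots can also be compared numerically with the metrics module
# imported at the top.
for name, model in [('Logistic Regression', lr), ('Naive Bayes', nb),
                    ('Stochastic Gradient Descent', sgd),
                    ('K-Nearest Neighbors', knn), ('Decision Tree', dtree),
                    ('Random Forest', rfm), ('Support Vector Classifier', svm),
                    ('Neural Network', clf)]:
    print('%-28s %.3f' % (name, metrics.accuracy_score(yB, model.predict(XB))))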