# -*- coding: utf-8 -*-
"""
'preprocessing_demo.py'
Data Standardization, or mean removal and variance scaling
David Pan, UAH

Loads a comma-separated data file, standardizes the first two columns
(features) with ``StandardScaler``, fits a 5-nearest-neighbour classifier
on the standardized features using the third column as the target, and
then repeats the same workflow with a scikit-learn pipeline.

Intermediate results are printed so that the demo is informative when it
is executed as a script, not only when stepped through interactively.
"""
import numpy as np
from sklearn import preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Need to change the path name for the infile below
infile = "mvnrnd.csv"
dataset = np.loadtxt(infile, delimiter=',')
X = dataset[:, 0:2]  # first two columns: features
y = dataset[:, 2]    # third column: target passed to the classifier

# Fit the scaler once; the fitted statistics are reused for every later
# transform so that new samples are scaled exactly like the training data.
scaler = preprocessing.StandardScaler().fit(X)

# Mean: the scaler's fitted value next to the directly computed one
print("scaler.mean_        :", scaler.mean_)
print("X.mean(axis=0)      :", X.mean(axis=0))

# Standard deviation: the scaler's fitted value next to the computed one
print("scaler.scale_       :", scaler.scale_)
print("X.std(axis=0)       :", X.std(axis=0))

# Per-column mean and variance of the standardized data
X_scaled = scaler.transform(X)
print("X_scaled mean       :", X_scaled.mean(axis=0))
print("X_scaled variance   :", X_scaled.var(axis=0))

neigh = KNeighborsClassifier(n_neighbors=5)
neigh.fit(X_scaled, y)

# Apply the same scaling on the test data Xin
Xin = X.mean(axis=0)
# transform() expects a 2-D input, hence the single sample is wrapped in a list
Xin_scaled = scaler.transform([Xin])
print("k-NN predict        :", neigh.predict(Xin_scaled))
print("k-NN predict_proba  :", neigh.predict_proba(Xin_scaled))

# Alternatively, use pipeline: scaling and classification are chained, so
# the raw (unscaled) features are passed to fit/predict directly.
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=5))
pipe.fit(X, y)
print("pipeline predict    :", pipe.predict([Xin]))
print("pipeline pred_proba :", pipe.predict_proba([Xin]))