1. 数据预处理
    1. Standardization
      1. from sklearn.preprocessing import StandardScaler scaler = StandardScaler().fit(X_train) standardized_X = scaler.transform(X_train) standardized_X_test = scaler.transform(X_test)
    2. Normalization
      1. from sklearn.preprocessing import Normalizer scaler = Normalizer().fit(X_train) normalized_X = scaler.transform(X_train) normalized_X_test = scaler.transform(X_test)
    3. Binarization
      1. from sklearn.preprocessing import Binarizer binarizer = Binarizer(threshold=0.0).fit(X) binary_X = binarizer.transform(X)
    4. Encoding Categorical Features
      1. from sklearn.preprocessing import LabelEncoder enc = LabelEncoder() y = enc.fit_transform(y)
    5. Imputing Missing Values
      1. from sklearn.impute import SimpleImputer imp = SimpleImputer(missing_values=0, strategy='mean') imp.fit_transform(X_train)
    6. Generating Polynomial Features
      1. from sklearn.preprocessing import PolynomialFeatures poly = PolynomialFeatures(5) poly.fit_transform(X)
  2. 训练和测试数据
    1. from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
  3. 构建模型
    1. 监督学习估计器
      1. Linear Regression
        1. from sklearn.linear_model import LinearRegression lr = LinearRegression(normalize=True)
      2. Support Vector Machines (SVM)
        1. from sklearn.svm import SVC svc = SVC(kernel='linear')
      3. Naive Bayes
        1. from sklearn.naive_bayes import GaussianNB gnb = GaussianNB()
      4. KNN
        1. from sklearn import neighbors knn = neighbors.KNeighborsClassifier(n_neighbors=5)
    2. 无监督学习估计器
      1. PCA
        1. from sklearn.decomposition import PCA pca = PCA(n_components=0.95)
      2. K Means
        1. from sklearn.cluster import KMeans k_means = KMeans(n_clusters=3, random_state=0)
  4. 训练模型
    1. 监督学习
      1. lr.fit(X, y) knn.fit(X_train, y_train) svc.fit(X_train, y_train)
    2. 无监督学习
      1. k_means.fit(X_train) pca_model = pca.fit_transform(X_train)
  5. 预测
    1. 监督学习
      1. y_pred = svc.predict(np.random.random((2,5))) y_pred = lr.predict(X_test) y_pred = knn.predict_proba(X_test)
    2. 无监督学习
      1. y_pred = k_means.predict(X_test)
  6. 模型评估
    1. 分类指标
      1. 准确率
        1. knn.score(X_test, y_test) from sklearn.metrics import accuracy_score accuracy_score(y_test, y_pred)
      2. 分类报告
        1. from sklearn.metrics import classification_report print(classification_report(y_test, y_pred))
      3. 混淆矩阵
        1. from sklearn.metrics import confusion_matrix print(confusion_matrix(y_test, y_pred))
    2. 回归指标
      1. 均方误差(MSE)
        1. from sklearn.metrics import mean_squared_error mean_squared_error(y_test, y_pred)
      2. 平均绝对误差(MAE)
        1. from sklearn.metrics import mean_absolute_error y_true = [3, -0.5, 2] mean_absolute_error(y_true, y_pred)
      3. R2分值
        1. from sklearn.metrics import r2_score r2_score(y_true, y_pred)
    3. 聚类指标
      1. 调整的兰德指数(Rand Index)
        1. from sklearn.metrics import adjusted_rand_score adjusted_rand_score(y_true, y_pred)
      2. 同质性(Homogeneity)
        1. from sklearn.metrics import homogeneity_score homogeneity_score(y_true, y_pred)
      3. V-measure
        1. from sklearn.metrics import v_measure_score v_measure_score(y_true, y_pred)
    4. 交叉验证
      1. from sklearn.model_selection import cross_val_score print(cross_val_score(knn, X_train, y_train, cv=4)) print(cross_val_score(lr, X, y, cv=2))