-
数据预处理
-
Standardization
- from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
standardized_X = scaler.transform(X_train)
standardized_X_test = scaler.transform(X_test)
-
Normalization
- from sklearn.preprocessing import Normalizer
scaler = Normalizer().fit(X_train)
normalized_X = scaler.transform(X_train)
normalized_X_test = scaler.transform(X_test)
-
Binarization
- from sklearn.preprocessing import Binarizer
binarizer = Binarizer(threshold=0.0).fit(X)
binary_X = binarizer.transform(X)
-
Encoding Categorical Features
- from sklearn.preprocessing import LabelEncoder
enc = LabelEncoder()
y = enc.fit_transform(y)
-
Imputing Missing Values
- from sklearn.impute import SimpleImputer
imp = SimpleImputer(missing_values=0, strategy='mean')
imp.fit_transform(X_train)
-
Generating Polynomial Features
- from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(5)
poly.fit_transform(X)
-
训练和测试数据
- from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)
-
构建模型
-
监督学习估计器
-
Linear Regression
- from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)
-
Support Vector Machines (SVM)
- from sklearn.svm import SVC
svc = SVC(kernel='linear')
-
Naive Bayes
- from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
-
KNN
- from sklearn import neighbors
knn = neighbors.KNeighborsClassifier(n_neighbors=5)
-
无监督学习估计器
-
PCA
- from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
-
K Means
- from sklearn.cluster import KMeans
k_means = KMeans(n_clusters=3, random_state=0)
-
训练模型
-
监督学习
- lr.fit(X, y)
knn.fit(X_train, y_train)
svc.fit(X_train, y_train)
-
无监督学习
- k_means.fit(X_train)
pca_model = pca.fit_transform(X_train)
-
预测
-
监督学习
- y_pred = svc.predict(np.random.random((2,5)))
y_pred = lr.predict(X_test)
y_pred = knn.predict_proba(X_test)
-
无监督学习
- y_pred = k_means.predict(X_test)
-
模型评估
-
分类指标
-
准确率
- knn.score(X_test, y_test)
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)
-
分类报告
- from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))
-
混淆矩阵
- from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test, y_pred))
-
回归指标
-
均方误差 (MSE)
- from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)
-
平均绝对误差 (MAE)
- from sklearn.metrics import mean_absolute_error
y_true = [3, -0.5, 2]
mean_absolute_error(y_true, y_pred)
-
R2分值
- from sklearn.metrics import r2_score
r2_score(y_true, y_pred)
-
聚类指标
-
调整的兰德指数(Rand Index)
- from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(y_true, y_pred)
-
同质性(Homogeneity)
- from sklearn.metrics import homogeneity_score
homogeneity_score(y_true, y_pred)
-
V-measure
- from sklearn.metrics import v_measure_score
v_measure_score(y_true, y_pred)
-
交叉验证
- from sklearn.model_selection import cross_val_score
print(cross_val_score(knn, X_train, y_train, cv=4))
print(cross_val_score(lr, X, y, cv=2))