机器学习所用的教程是 GitHub 上的一个开源项目:《机器学习100天》中文版。
这里还是坚持记录一下学习过程吧,machine learning start!
import numpy as np
import pandas as pd
Country | Age | Salary | Purchased |
---|---|---|---|
France | 44 | 72000 | No |
Germany | 40 | | No |
Spain | 27 | 48000 | Yes |
France | 48 | 79000 | Yes |
# Read the raw table; Data.csv is resolved relative to the current working directory.
dataset = pd.read_csv('Data.csv')
# Features: every column except the last one. Labels: the column at index 3.
X, Y = dataset.iloc[:, :-1].values, dataset.iloc[:, 3].values
第三步 处理丢失数据
from sklearn.impute import SimpleImputer
# Replace NaN entries in columns 1 and 2 with the mean of their column.
imp_mean = SimpleImputer(strategy='mean', missing_values=np.nan)
# fit() returns the estimator itself, so both names refer to the same object.
imputer = imp_mean
# Learn the column means and write the filled-in values back in one step.
X[:, 1:3] = imputer.fit_transform(X[:, 1:3])
第四步 解析分类数据
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
# Map the category values in column 0 to integer codes.
labelencoder_X = LabelEncoder()
X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# Create dummy variables: one-hot encode column 0 and pass the remaining
# columns through unchanged. OneHotEncoder's `categorical_features` argument
# was removed in scikit-learn 0.22, so the column selection is done with
# ColumnTransformer instead. The encoded columns come first, followed by the
# passthrough columns; sparse_threshold=0 forces a dense result.
onehotencoder = ColumnTransformer(
    [('onehot', OneHotEncoder(), [0])],
    remainder='passthrough',
    sparse_threshold=0,
)
# The passthrough columns come from an object array, so cast the stacked
# result to float to obtain a plain numeric feature matrix.
X = np.asarray(onehotencoder.fit_transform(X), dtype=float)
# Encode the label column as integer codes.
labelencoder_Y = LabelEncoder()
Y = labelencoder_Y.fit_transform(Y)
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20, the same
# release that introduced the SimpleImputer used earlier in this script.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
将连续变量标准化(特征缩放)
from sklearn.preprocessing import StandardScaler
# Learn the per-column mean and standard deviation from the training rows only,
# so no statistics from the test rows leak into the scaler.
sc_X = StandardScaler().fit(X_train)
# Apply the same training-set statistics to both partitions.
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)