from sklearn import preprocessing
# Rescale every feature column into the closed interval [0, 1].
minmax_scale = preprocessing.MinMaxScaler(feature_range=(0, 1))
# Learn each column's min/max from Features, then apply the rescaling to it.
scaledFeatures = minmax_scale.fit(Features).transform(Features)
# Show the first two scaled rows.
scaledFeatures[:2]
output:
array([[0. , 0. , 0.36116884, 0. , 0. ,
0.41250333, 0. , 0. , 1. ],
[0. , 1. , 0.00939458, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
# Randomly split the rows into a training set (~80%) and a test set (~20%).
# numpy.random.rand yields uniform floats in [0, 1); comparing them with the
# threshold produces the boolean mask that row selection and `~` require.
# Without the comparison the mask is a float array: `~msk` raises TypeError
# and `all_df[msk]` does not select rows.
# The 0.8 threshold matches the recorded split (1045 of 1309 rows in train).
msk = numpy.random.rand(len(all_df)) < 0.8
train_df = all_df[msk]
test_df = all_df[~msk]
print('total:', len(all_df), 'train:', len(train_df), 'test:', len(test_df))
output:
total: 1309 train: 1045 test: 264
def PreprocessData(raw_df):
    """Turn a raw passenger DataFrame into scaled features and a label array.

    Steps, in order:
      1. Drop the ``name`` column (``raw_df`` itself is not modified).
      2. Replace missing ``age`` and ``fare`` values with that column's mean.
      3. Encode ``sex`` as an integer: ``female`` -> 0, ``male`` -> 1.
      4. One-hot encode ``embarked``.
      5. Take column 0 of the resulting table as the label and all remaining
         columns as features.
      6. Min-max scale the features into [0, 1].

    Args:
        raw_df: DataFrame containing at least the ``name``, ``age``, ``fare``,
            ``sex`` and ``embarked`` columns. The label is expected to be the
            first column once ``name`` has been dropped (presumably the
            survival flag -- confirm against the caller's column order).

    Returns:
        A ``(scaledFeatures, Label)`` tuple of NumPy arrays.

    Note:
        The imputation means and the scaler's min/max are computed from
        ``raw_df`` on every call, so separate calls (e.g. one for a training
        split and one for a test split) are each scaled with their own
        statistics.
    """
    cleaned = raw_df.drop(columns=['name'])

    # Mean-impute the numeric columns that may contain gaps.
    for column in ('age', 'fare'):
        cleaned[column] = cleaned[column].fillna(cleaned[column].mean())

    sex_codes = {'female': 0, 'male': 1}
    cleaned['sex'] = cleaned['sex'].map(sex_codes).astype(int)

    encoded = pd.get_dummies(data=cleaned, columns=["embarked"])
    matrix = encoded.values

    # Column 0 holds the label; everything after it is a feature.
    labels = matrix[:, 0]
    features = matrix[:, 1:]

    scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
    return scaler.fit_transform(features), labels
# Preprocess each split with its own call to PreprocessData.
train_Features, train_Label = PreprocessData(train_df)
test_Features, test_Label = PreprocessData(test_df)
# Show the first two rows of the scaled training features.
train_Features[:2]
output:
array([[0. , 1. , 0.00939458, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ],
[0. , 1. , 0.37369494, 0.125 , 0.22222222,
0.2958059 , 0. , 0. , 1. ]])
# Show the first two training labels.
train_Label[:2]
output:
array([1., 0.])