import pandas as pd

# Load the training data from train.csv in the current working directory.
df = pd.read_csv('train.csv')

# Preview the first rows (only displayed when run as the last expression of a
# notebook cell; as a plain script statement the result is discarded).
df.head()
2. 統計並填補缺失值
# Count the missing values in each column.
df.isnull().sum()

# Impute the missing Item_Weight values with the column mean.
# NOTE(review): the original note also mentions Outlet_Size, but only
# Item_Weight is imputed in the code visible here.
mean = df['Item_Weight'].mean()

# Assign the filled column back instead of calling fillna(..., inplace=True)
# on the column selection: the in-place form operates on an intermediate
# object and does not reliably update df (it warns in pandas 2.x and has no
# effect on df under Copy-on-Write).
df['Item_Weight'] = df['Item_Weight'].fillna(mean)
# Import the required packages.
from math import sqrt

import matplotlib.pyplot as plt
from sklearn import neighbors
from sklearn.metrics import mean_squared_error

# Notebook-only magic, kept as a comment so the file parses as plain Python.
# In Jupyter/IPython, run it as a magic to render figures inline:
# %matplotlib inline
# RMSE on the test set for each candidate number of neighbours K.
rmse_val = []

# NOTE(review): x_train, y_train, x_test and y_test are not defined in the
# code visible here; presumably they come from a train/test split performed
# elsewhere -- confirm before running.
for K in range(1, 21):  # try K = 1 .. 20
    model = neighbors.KNeighborsRegressor(n_neighbors=K)

    model.fit(x_train, y_train)  # fit the model on the training data
    pred = model.predict(x_test)  # predict on the test set
    error = sqrt(mean_squared_error(y_test, pred))  # root mean squared error
    rmse_val.append(error)  # keep the RMSE for this K
    print('RMSE value for k= ', K, 'is:', error)
輸出:
RMSE value for k = 1 is: 1579.8352322344945
RMSE value for k = 2 is: 1362.7748806138618
RMSE value for k = 3 is: 1278.868577489459
RMSE value for k = 4 is: 1249.338516122638
RMSE value for k = 5 is: 1235.4514224035129
RMSE value for k = 6 is: 1233.2711649472913
RMSE value for k = 7 is: 1219.0633086651026
RMSE value for k = 8 is: 1222.244674933665
RMSE value for k = 9 is: 1219.5895059285074
RMSE value for k = 10 is: 1225.106137547365
RMSE value for k = 11 is: 1229.540283771085
RMSE value for k = 12 is: 1239.1504407152086
RMSE value for k = 13 is: 1242.3726040709887
RMSE value for k = 14 is: 1251.505810196545
RMSE value for k = 15 is: 1253.190119191363
RMSE value for k = 16 is: 1258.802262564038
RMSE value for k = 17 is: 1260.884931441893
RMSE value for k = 18 is: 1265.5133661294733
RMSE value for k = 19 is: 1269.619416217394
RMSE value for k = 20 is: 1272.10881411344