英招

牢骚太盛防肠断,风物长宜放眼量

0%

XGBoost回归分析(by R)

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
#导入所需的工具包
library(xgboost)
library(caret)
library(pROC)
library(hydroGOF)

#导入数据
origindata <- read.table('E:\\data\\data.txt',encoding="UTF-8",header = T,sep = "\t")
#删除第1列
origindata <-origindata[,-1]
#删除NA行
origindata <-na.omit(origindata)
#最后一行是标签,名字改为target
colnames(origindata)[length(origindata)]<-"target"

#设置随机种子,使结果能够重现,同时随机划分训练集测试集(8:2)
set.seed(1234)
choose<-createDataPartition(y=origindata$target,p=0.8,list=F)
traindata<-origindata[choose,]
testdata<-origindata[-choose,]

#构建模型并预测,最后一列作为y,其他列作为x
data_trainx<-as.matrix(traindata[,1:length(origindata)-1])
data_trainy<-as.numeric(as.character(traindata$target))
data_testx<-as.matrix(testdata[,1:length(origindata)-1])
data_testy<-as.numeric(as.character(testdata$target))

#构建矩阵作为XGBoost的输入
dtrain <- xgb.DMatrix(data = data_trainx, label = data_trainy)
dtest <- xgb.DMatrix(data = data_testx, label = data_testy)

watchlist <- list(train = dtrain, test = dtest)


DO_xg <- xgb.train(
data = dtrain,
eta = 0.3,
gamma = 0.001,
max_depth = 2,
subsample = 0.7,
colsample_bytree = 0.4,
objective = "reg:squarederror",
nrounds = 1000,
watchlist = watchlist,
verbose = 1,
print_every_n = 100,
early_stopping_rounds = 200
)

#训练集与测试集预测
trainpred<-predict(DO_xg,newdata = dtrain)
testpred<-predict(DO_xg,newdata = dtest)

#打印误差
#训练集精度指标
trainR2<-1-sum((traindata$target-trainpred)^2)/sum((traindata$target-mean(traindata$target))^2)
trainRMSE<-sqrt(mean((traindata$target-trainpred)^2))
trainMAE<-mean(abs(traindata$target-trainpred))
trainRSE<-sqrt(sum((traindata$target-trainpred)^2)) / sqrt(sum((traindata$target-mean(traindata$target))^2))
trainMSE<-mean((traindata$target-trainpred)^2)
trainMAPE<-mean(abs((traindata$target-trainpred) / traindata$target))
trainMSPE<-mean(((traindata$target-trainpred) / traindata$target)^2)
cat("训练集R2:",trainR2,",RMSE:",trainRMSE,",MAE:",trainMAE,",RSE:",trainRSE,",MSE:",trainMSE,",MAPE:",trainMAPE,",MSPE:",trainMSPE)
#测试集集精度指标
testR2<-1-sum((testdata$target-testpred)^2)/sum((testdata$target-mean(testdata$target))^2)
testRMSE<-sqrt(mean((testdata$target-testpred)^2))
testMAE<-mean(abs(testdata$target-testpred))
testRSE<-sqrt(sum((testdata$target-testpred)^2)) / sqrt(sum((testdata$target-mean(testdata$target))^2))
testMSE<-mean((testdata$target-testpred)^2)
testMAPE<-mean(abs((testdata$target-testpred) / testdata$target))
testMSPE<-mean(((testdata$target-testpred) / testdata$target)^2)
cat("测试集R2:",testR2,",RMSE:",testRMSE,",MAE:",testMAE,",RSE:",testRSE,",MSE:",testMSE,",MAPE:",testMAPE,",MSPE:",testMSPE)
cat("target平均:",mean(testdata$target),"target平均误差范围:±",mean(abs(testdata$target-testpred)),"target最大误差:±",max((testdata$target-testpred)))

#画图
test_group <- rep('Test Samples',times=nrow(dtest))
testPlot <- data.frame(testdata$target,testpred,test_group)
colnames(testPlot)<-c('DO',
'Prediction_DO',
'Group')
train_group <- rep('Train Samples',times=nrow(dtrain))
trainPlot <- data.frame(traindata$target,trainpred,train_group)
colnames(trainPlot)<-c('DO',
'Prediction_DO',
'Group')
Plotset <- rbind(testPlot,trainPlot)
library(ggplot2)
library(ggThemeAssist)
Plot2 <- ggplot(data = Plotset)+
geom_point(data = Plotset[1:nrow(testPlot),],
aes(x=DO,y=Prediction_DO,color='Test Samples'),
size=1.5,
shape=8)+
theme(axis.line = element_line(colour = "black"))+
geom_point(data = Plotset[nrow(testPlot)+1:nrow(Plotset),],
aes(x=DO,y=Prediction_DO,color='Train Samples'),
shape=8) + theme(panel.grid.major = element_line(colour = NA),
panel.grid.minor = element_line(colour = NA),
legend.title = element_blank(),
panel.background = element_rect(fill = "white"),
legend.background = element_rect(colour = "black",linetype = "solid"),
legend.position = c(0.25,0.72))+
geom_smooth(data=Plotset,
aes(x=DO,y=Prediction_DO),
method = 'lm',formula = y~x,se = FALSE,
color='black',size=0.5)+
theme(plot.title = element_text(hjust = 0.5))+coord_equal()+
scale_x_continuous(limits = c(min(c(testPlot$DO,testPlot$Prediction_DO)), max(c(testPlot$DO,testPlot$Prediction_DO))))+
scale_y_continuous(limits = c(min(c(testPlot$DO,testPlot$Prediction_DO)), max(c(testPlot$DO,testPlot$Prediction_DO))))+
theme(axis.text.x = element_text(size = 14,family="serif"))+
theme(axis.text.y = element_text(size = 14,family="serif"))+
theme(legend.text=element_text(size=14,family="serif"))+
theme(axis.title.x = element_text(size = 14,family="serif"))+
theme(axis.title.y = element_text(size = 14,family="serif"))
Plot2

#设置默认路径,即后续图片保存的位置
setwd("E:\\output")
jpeg(filename =paste0("target预测.jpg"),width=1000,height=1000,res=250)#res 是指 PPI(Pixel Per Inch) ,即图像的采样率(在图像中,每英寸所包含的像素数目)
Plot2
dev.off()