本文主要是介绍工业产量分析与预测,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
工业产量分析与预测
- 导入数据与数据概况
- 数据描述统计
- 时序数据进行分析
- 尝试建立时间序列预测模型
- 先对汽车进行分析
- 试试灰色预测
- 灰色预测汽车
- 灰色预测天然气
- 灰色预测钢材
- 组合模型
- 探索相关关系
- 各省份进行聚类分析及可视化
- 标准化后再聚类
导入数据与数据概况
# 安装库专用# 通过如下命令设定镜像
options(repos = 'http://mirrors.ustc.edu.cn/CRAN/')
# 查看镜像是否修改
getOption('repos')
# 尝试下载R包
#若有需要,进行安装
#install.packages('plot3D')
#install.packages("reshape2")
#install.packages("pheatmap")
‘http://mirrors.ustc.edu.cn/CRAN/’
#设置工作路径
setwd("D:/LengPY")#导入数据
library(readxl)
yhyear<-read_excel("yhdata.xlsx",sheet=1)
yhshengfen<-read_excel("yhdata.xlsx",sheet=2)
### 年份数据
head(yhyear)
年份 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
2005 | 37771.14 | 570.49 | 493.20 |
2006 | 46893.36 | 727.89 | 585.53 |
2007 | 56560.87 | 888.89 | 692.40 |
2008 | 60460.29 | 930.59 | 802.99 |
2009 | 69405.40 | 1379.53 | 852.69 |
2010 | 80276.58 | 1826.53 | 957.91 |
### 分省份数据
head(yhshengfen)
地区 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
北京 | 170.7077 | 164.0186 | 29.73 |
天津 | 5454.9542 | 104.1491 | 34.90 |
河北 | 28409.6316 | 105.0836 | 5.84 |
山西 | 5594.2450 | 6.5780 | 64.62 |
内蒙古 | 2565.0281 | 2.8977 | 22.07 |
辽宁 | 7254.4315 | 79.1603 | 6.18 |
数据描述统计
summary(yhshengfen)
地区 钢材 汽车 天然气 Length:31 Min. : 0 Min. : 0.000 Min. : 0.000 Class :character 1st Qu.: 1037 1st Qu.: 6.135 1st Qu.: 0.185 Mode :character Median : 2565 Median : 61.858 Median : 5.840 Mean : 3886 Mean : 82.828 Mean : 56.829 3rd Qu.: 3804 3rd Qu.:104.616 3rd Qu.: 40.280 Max. :28410 Max. :311.972 Max. :473.420
summary(yhyear)
年份 钢材 汽车 天然气 Length:15 Min. : 37771 Min. : 570.5 Min. : 493.2 Class :character 1st Qu.: 64933 1st Qu.:1155.1 1st Qu.: 827.8 Mode :character Median : 95578 Median :1927.6 Median :1106.1 Mean : 86863 Mean :1879.5 Mean :1107.5 3rd Qu.:106507 3rd Qu.:2509.0 3rd Qu.:1357.4 Max. :120457 Max. :2901.8 Max. :1761.7
library(reshape2)
plotdata<-melt(yhyear,id.vars=c("年份"),variable.name="type",value.name="number")
plotdata$年份<-as.numeric(plotdata$年份)
head(plotdata)
Warning message:
"package 'reshape2' was built under R version 4.0.5"
年份 | type | number | |
---|---|---|---|
<dbl> | <fct> | <dbl> | |
1 | 2005 | 钢材 | 37771.14 |
2 | 2006 | 钢材 | 46893.36 |
3 | 2007 | 钢材 | 56560.87 |
4 | 2008 | 钢材 | 60460.29 |
5 | 2009 | 钢材 | 69405.40 |
6 | 2010 | 钢材 | 80276.58 |
时序数据进行分析
#数据变形
library(reshape2)
mydata<-melt(yhyear,id.vars=c("年份"),variable.name="type",value.name="number")
head(mydata)
年份 | type | number | |
---|---|---|---|
<chr> | <fct> | <dbl> | |
1 | 2005 | 钢材 | 37771.14 |
2 | 2006 | 钢材 | 46893.36 |
3 | 2007 | 钢材 | 56560.87 |
4 | 2008 | 钢材 | 60460.29 |
5 | 2009 | 钢材 | 69405.40 |
6 | 2010 | 钢材 | 80276.58 |
#绘制不同点形状的折线图
cardata<-mydata[which(mydata$type=='钢材'),]
library(ggplot2)
p <- ggplot(cardata,aes(x=年份,y=number,,shape=type,colour=type,group=type,fill=type)) +geom_line(size =0.8)+geom_point(mapping = NULL,data = NULL,size = 5,na.rm = FALSE,show.legend = NA,inherit.aes = TRUE)+geom_point(shape="\u2620")p
Warning message:
"package 'ggplot2' was built under R version 4.0.4"
#绘制不同点形状的折线图
cardata<-mydata[which(mydata$type!='钢材'),]
library(ggplot2)
p <- ggplot(cardata,aes(x=年份,y=number,,shape=type,colour=type,group=type,fill=type)) +geom_line(size =0.8)+geom_point(mapping = NULL,data = NULL,size = 5,na.rm = FALSE,show.legend = NA,inherit.aes = TRUE)+geom_point(shape="\u2620")p
尝试建立时间序列预测模型
钢材
library(ggplot2)
theme_set(theme_bw(base_family = "STKaiti"))
library(dplyr)
library(tidyr)
library(zoo)
library(tseries)
library(ggfortify)
library(gridExtra)
library(forecast)
# ## 转化为时间序列数据
gangcai_ts<-ts(yhyear$钢材)
## Ljung-Box 检验
Box.test(gangcai_ts,type ="Ljung-Box")
Box-Ljung testdata: gangcai_ts
X-squared = 10.972, df = 1, p-value = 0.000925
## 分析序列的自相关系数和偏自相关系数确定参数p和q
p1 <- autoplot(acf(gangcai_ts,lag.max = 30,plot = F))+ggtitle("序列自相关图")
p2 <- autoplot(pacf(gangcai_ts,lag.max = 30,plot = F))+ggtitle("序列偏自相关图")
gridExtra::grid.arrange(p1,p2,nrow=2)
auto.arima(gangcai_ts)
Series: gangcai_ts ARIMA(0,1,0) with drift Coefficients: drift 5906.129s.e. 1441.853sigma^2 estimated as 31343862: log likelihood=-140.17AIC=284.34 AICc=285.43 BIC=285.62
## 对数据建立ARIMA(1,1,0)模型,并预测后面的数据ARIMAmod1 <- arima(gangcai_ts,order = c(1,1,0))summary(ARIMAmod1)
Call:arima(x = gangcai_ts, order = c(1, 1, 0))Coefficients: ar1 0.6761s.e. 0.1941sigma^2 estimated as 34933933: log likelihood = -141.75, aic = 287.51Training set error measures: ME RMSE MAE MPE MAPE MASE ACF1Training set 1951.221 5710.087 4604.412 2.718953 5.479596 0.6374903 -0.2117251
## 对拟合残差进行白噪声检验Box.test(ARIMAmod1$residuals,type ="Ljung-Box")
Box-Ljung testdata: ARIMAmod1$residualsX-squared = 0.8165, df = 1, p-value = 0.3662
## 可视化模型未来的预测值par(family = "STKaiti")plot(forecast(ARIMAmod1,h=5))
Warning message in title(main = main, xlab = xlab, ylab = ylab, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(1, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(2, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(2, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(2, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(2, ...):"Windows字体数据库里没有这样的字体系列"Warning message in axis(2, ...):"Windows字体数据库里没有这样的字体系列"
在这里插入图片描述
# z这数据预测效果不太好,应该尝试多元方法
forecast(ARIMAmod1)
Point Forecast Lo 80 Hi 80 Lo 95 Hi 9516 125304.1 117729.53 132878.7 113719.78 136888.517 128581.2 113797.67 143364.7 105971.74 151190.718 130796.7 108896.36 152697.1 97303.01 164290.519 132294.6 103627.21 160962.0 88451.61 176137.620 133307.3 98306.37 168308.2 79778.01 186836.521 133991.9 93096.68 174887.2 71448.06 196535.822 134454.8 88077.02 180832.6 63526.11 205383.523 134767.7 83279.78 186255.7 56023.72 213511.724 134979.3 78711.94 191246.6 48925.81 221032.825 135122.3 74366.95 195877.7 42205.00 228039.6
先对汽车进行分析
# ## 转化为时间序列数据trq_ts<-ts(yhyear$汽车)## Ljung-Box 检验Box.test(trq_ts,type ="Ljung-Box")
Box-Ljung testdata: trq_tsX-squared = 12.75, df = 1, p-value = 0.0003561
p-value =0.001106<0.05 说明该序列为非随机数据
library(ggfortify)library(gridExtra)library(forecast)
autoplot(trq_ts)+ggtitle("序列变化趋势")
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
## 白噪声检验Box.test(trq_ts,type ="Ljung-Box")
Box-Ljung testdata: trq_tsX-squared = 12.75, df = 1, p-value = 0.0003561
p-value = 0.001106 ,说明不是白噪声
## 平稳性检验,单位根检验adf.test(trq_ts)
Warning message in adf.test(trq_ts):"p-value greater than printed p-value"Augmented Dickey-Fuller Testdata: trq_tsDickey-Fuller = 0.33839, Lag order = 2, p-value = 0.99alternative hypothesis: stationary
p-value = 0.4931,说明数据是不平稳的
## 分析序列的自相关系数和偏自相关系数确定参数p和qp1 <- autoplot(acf(trq_ts,lag.max = 30,plot = F))+ ggtitle("序列自相关图")p2 <- autoplot(pacf(trq_ts,lag.max = 30,plot = F))+ ggtitle("序列偏自相关图")gridExtra::grid.arrange(p1,p2,nrow=2)
偏自相关图3阶后截尾,可以认为p的取值为1左右,
自相关图5阶后截尾,可以认为q的取值为0左右,???不确定
auto.arima(trq_ts)
Series: trq_ts ARIMA(0,1,0) with drift Coefficients: drift 142.6560s.e. 49.9215sigma^2 estimated as 37574: log likelihood=-93.08AIC=190.17 AICc=191.26 BIC=191.45
可知ARIMA(0,1,0)较好
## 对数据建立ARIMA(0,1,0)模型,并预测后面的数据ARIMAmod <- arima(trq_ts,order = c(0,1,0))summary(ARIMAmod)
Call:arima(x = trq_ts, order = c(0, 1, 0))
sigma^2 estimated as 55241: log likelihood = -96.3, aic = 194.6Training set error measures: ME RMSE MAE MPE MAPE MASE ACF1Training set 133.1836 227.0638 177.7351 8.847576 10.53488 0.9335331 0.2076724
由上表可知模型效果情况
## 对拟合残差进行白噪声检验Box.test(ARIMAmod$residuals,type ="Ljung-Box")
Box-Ljung testdata: ARIMAmod$residualsX-squared = 0.019715, df = 1, p-value = 0.8883
p-value = 0.3503 ,说明是白噪声
## 可视化模型未来的预测值par(family = "STKaiti")plot(forecast(ARIMAmod,h=5))
forecast(ARIMAmod)
Point Forecast Lo 80 Hi 80 Lo 95 Hi 9516 2448.930 2200.4523 2697.408 2068.91609 2828.94417 2383.369 1924.5864 2842.152 1681.72152 3085.01618 2347.171 1696.4904 2997.852 1352.04095 3342.30119 2327.185 1504.6088 3149.762 1069.16309 3585.20720 2316.151 1339.5995 3292.702 822.64448 3809.65721 2310.058 1194.5963 3425.520 604.10650 4016.01022 2306.694 1064.7215 3548.667 407.26076 4206.12823 2304.837 946.5300 3663.144 227.48573 4382.18824 2303.812 837.5700 3770.053 61.38863 4546.23525 2303.245 736.0695 3870.421 -93.54334 4700.034
试试灰色预测
都是用15年数据预测后五年,评估是看15年误差情况,后五个就是预测值
灰色预测汽车
yhyear
年份 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
2005 | 37771.14 | 570.490 | 493.200 |
2006 | 46893.36 | 727.890 | 585.530 |
2007 | 56560.87 | 888.890 | 692.400 |
2008 | 60460.29 | 930.590 | 802.990 |
2009 | 69405.40 | 1379.530 | 852.690 |
2010 | 80276.58 | 1826.530 | 957.910 |
2011 | 88619.57 | 1841.640 | 1053.370 |
2012 | 95577.83 | 1927.620 | 1106.080 |
2013 | 108200.54 | 2212.090 | 1208.580 |
2014 | 112513.12 | 2372.520 | 1301.570 |
2015 | 103468.41 | 2450.350 | 1346.097 |
2016 | 104813.45 | 2811.910 | 1368.650 |
2017 | 104642.05 | 2901.810 | 1480.350 |
2018 | 113287.33 | 2782.740 | 1601.590 |
2019 | 120456.94 | 2567.674 | 1761.740 |
#灰色预测x<-yhyear$汽车gm11<-function(x,k){#x为行向量数据#做一次累加n<-length(x)x1<-numeric(n);for(i in 1:n){ x1[i]<-sum(x[1:i]);}#x1的均值数列z1<-numeric(n)m<-n-1for(j in 1:m){ z1[j+1]<-(0.5*x1[j+1]+0.5*x1[j])}Yn=t(t(x[2:n]))B<-matrix(1,nrow=n-1,ncol=2)B[,1]<-t(t(-z1[2:n]))#solve(M)求M的逆#最小二乘法求解参数列u<-solve(t(B)%*%B)%*%t(B)%*%Yn;a<-u[1];b<-u[2];#预测x2<-numeric(k);x2[1]<-x[1];for(i in 1:k-1){ x2[1+i]=(x[1]-b/a)*exp(-a*i)+b/a;}x2=c(0,x2);#还原数据y=diff(x2);y}#调用函数gm11(x,20)
- 570.49
- 1151.7064043779
- 1245.64577762778
- 1347.2473517763
- 1457.13609716952
- 1575.98796009879
- 1704.53402067315
- 1843.56498983028
- 1993.93607314784
- 2156.57223137344
- 2332.47387003174
- 2522.72299310652
- 2728.48985865025
- 2951.04017726093
- 3191.74289770544
- 3452.07862758024
- 3733.64874080659
- 4038.18522798192
- 4367.56135017904
- 4723.80316172634
##qichecar<-data.frame(year=2005:2019,pre=gm11(x,15),yhyear$汽车)
head(car)
year | pre | yhyear.汽车 | |
---|---|---|---|
<int> | <dbl> | <dbl> | |
1 | 2005 | 570.490 | 570.49 |
2 | 2006 | 1151.706 | 727.89 |
3 | 2007 | 1245.646 | 888.89 |
4 | 2008 | 1347.247 | 930.59 |
5 | 2009 | 1457.136 | 1379.53 |
6 | 2010 | 1575.988 | 1826.53 |
library(Metrics)library(ROCR)#全部数据sprintf("MAPE误差为: %f",mape(car$pre,car$yhyear.汽车))
Warning message:"package 'Metrics' was built under R version 4.0.4"Attaching package: 'Metrics'
The following object is masked from 'package:forecast': accuracy
Warning message:"package 'ROCR' was built under R version 4.0.4"
‘MAPE误差为: 0.132848’
plotdata <- data.frame(X =car$year,Y1 = car$yhyear.汽车,rfpre =car$pre)plotdata <- gather(plotdata,key="model",value="value",c(-X))## 可视化模型的预测误差ggplot(plotdata,aes(x = X,y = value))+ geom_line(aes(linetype = model,colour = model),size = 0.8)+ theme(legend.position = c(0.1,0.8), plot.title = element_text(hjust = 0.5))+ ggtitle("预测效果可视化")
灰色预测天然气
#灰色预测x<-yhyear$天然气gm11<-function(x,k){#x为行向量数据#做一次累加n<-length(x)x1<-numeric(n);for(i in 1:n){ x1[i]<-sum(x[1:i]);}#x1的均值数列z1<-numeric(n)m<-n-1for(j in 1:m){ z1[j+1]<-(0.5*x1[j+1]+0.5*x1[j])}Yn=t(t(x[2:n]))B<-matrix(1,nrow=n-1,ncol=2)B[,1]<-t(t(-z1[2:n]))#solve(M)求M的逆#最小二乘法求解参数列u<-solve(t(B)%*%B)%*%t(B)%*%Yn;a<-u[1];b<-u[2];#预测x2<-numeric(k);x2[1]<-x[1];for(i in 1:k-1){ x2[1+i]=(x[1]-b/a)*exp(-a*i)+b/a;}x2=c(0,x2);#还原数据y=diff(x2);y}#调用函数gm11(x,20)
- 493.200000000001
- 698.574451446175
- 750.082742319664
- 805.388916186466
- 864.773003988903
- 928.535684311362
- 996.999805801801
- 1070.51202184655
- 1149.44454577538
- 1234.19703548378
- 1325.19861701499
- 1422.91005734758
- 1527.82609739019
- 1640.47795699596
- 1761.43602468017
- 1891.31274566023
- 2030.76572284046
- 2180.50104644332
- 2341.27686914573
- 2513.90724482249
##天然气tairanqi<-data.frame(year=2005:2019,pre=gm11(x,15),yhyear$天然气)
library(Metrics)library(ROCR)#全部数据sprintf("MAPE误差为: %f",mape(tairanqi$pre,tairanqi$yhyear.天然气))
‘MAPE误差为: 0.039464’
plotdata <- data.frame(X =tairanqi$year,Y1 = tairanqi$yhyear.天然气,rfpre =tairanqi$pre)plotdata <- gather(plotdata,key="model",value="value",c(-X))## 可视化模型的预测误差ggplot(plotdata,aes(x = X,y = value))+ geom_line(aes(linetype = model,colour = model),size = 0.8)+ theme(legend.position = c(0.1,0.8), plot.title = element_text(hjust = 0.5))+ ggtitle("预测效果可视化")
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
灰色预测钢材
#灰色预测x<-yhyear$钢材gm11<-function(x,k){#x为行向量数据#做一次累加n<-length(x)x1<-numeric(n);for(i in 1:n){ x1[i]<-sum(x[1:i]);}#x1的均值数列z1<-numeric(n)m<-n-1for(j in 1:m){ z1[j+1]<-(0.5*x1[j+1]+0.5*x1[j])}Yn=t(t(x[2:n]))B<-matrix(1,nrow=n-1,ncol=2)B[,1]<-t(t(-z1[2:n]))#solve(M)求M的逆#最小二乘法求解参数列u<-solve(t(B)%*%B)%*%t(B)%*%Yn;a<-u[1];b<-u[2];#预测x2<-numeric(k);x2[1]<-x[1];for(i in 1:k-1){ x2[1+i]=(x[1]-b/a)*exp(-a*i)+b/a;}x2=c(0,x2);#还原数据y=diff(x2);y}#调用函数gm11(x,20)
- 37771.1399999999
- 62010.887920134
- 65519.7723170475
- 69227.2068415897
- 73144.4264473023
- 77283.3018172646
- 81656.3753368349
- 86276.8991019032
- 91158.8750778423
- 96317.0975308546
- 101767.197860289
- 107525.6919678
- 113610.030306885
- 120038.650764475
- 126831.034534831
- 134007.765155054
- 141590.590881134
- 149602.490593528
- 158067.743432017
- 167012.002370838
##天然气gangcai<-data.frame(year=2005:2019,pre=gm11(x,15),yhyear$钢材)
library(Metrics)library(ROCR)#全部数据sprintf("MAPE误差为: %f",mape(gangcai$pre,gangcai$yhyear.钢材))
‘MAPE误差为: 0.091504’
plotdata <- data.frame(X =gangcai$year,Y1 = gangcai$yhyear.钢材,rfpre =gangcai$pre)plotdata <- gather(plotdata,key="model",value="value",c(-X))## 可视化模型的预测误差ggplot(plotdata,aes(x = X,y = value))+ geom_line(aes(linetype = model,colour = model),size = 0.8)+ theme(legend.position = c(0.1,0.8), plot.title = element_text(hjust = 0.5))+ ggtitle("预测效果可视化")
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
组合模型
结合时间序列和灰色预测,取个平均就行
探索相关关系
head(yhshengfen)
地区 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> |
北京 | 170.7077 | 164.0186 | 29.73 |
天津 | 5454.9542 | 104.1491 | 34.90 |
河北 | 28409.6316 | 105.0836 | 5.84 |
山西 | 5594.2450 | 6.5780 | 64.62 |
内蒙古 | 2565.0281 | 2.8977 | 22.07 |
辽宁 | 7254.4315 | 79.1603 | 6.18 |
library(ggplot2)library(ggExtra)g <- ggplot(yhyear, aes(钢材, 汽车)) +geom_count() +geom_smooth(method="lm", se=F,color='red')ggMarginal(g, type = "histogram", fill="blue")ggMarginal(g, type = "boxplot", fill="transparent") ggMarginal(g, type = "density", fill="transparent")
表明两个产品有明显的线性关系
#(a) 二维散点与统计直方图library(ggplot2)library(RColorBrewer)library(ggpubr)ggscatterhist( yhyear, x ='天然气', y = '汽车', shape=21,fill="#00AFBB",color = "black",size = 3, alpha = 1, #palette = c("#00AFBB", "#E7B800", "#FC4E07"), margin.params = list( fill="#00AFBB",color = "black", size = 0.2,alpha=1), margin.plot = "histogram", legend = c(0.8,0.8), ggtheme = theme_minimal())
同上,有线性关系
mcor<-cor(yhyear[,c(2:4)])library(corrplot) # 载入corrplot包corrplot(mcor)
mcor
钢材 | 汽车 | 天然气 | |
---|---|---|---|
钢材 | 1.0000000 | 0.9543912 | 0.9575996 |
汽车 | 0.9543912 | 1.0000000 | 0.9490545 |
天然气 | 0.9575996 | 0.9490545 | 1.0000000 |
可发现,这三种产业都具有极高的相关性,可结合这几个行业的关系进行补充分析
各省份进行聚类分析及可视化
标准化后再聚类
cludata<-yhshengfen[,2:4]library(RSNNS)## 数据max-min归一化到0-1之间cludata[,1:3] <- normalizeData(cludata[,1:3],"0_1")
## 计算组内平方和 组间平方和tot_withinss <- vector()betweenss <- vector()for(ii in 1:8){ k1 <- kmeans(cludata[,c(1:3)],ii) tot_withinss[ii] <- k1$tot.withinss betweenss[ii] <- k1$betweenss}kmeanvalue <- data.frame(kk = 1:8, tot_withinss = tot_withinss, betweenss = betweenss)
library(ggplot2)library(gridExtra)library(ggdendro)library(cluster)library(ggfortify)p1 <- ggplot(kmeanvalue,aes(x = kk,y = tot_withinss))+ theme_bw()+ geom_point() + geom_line() +labs(y = "value") + ggtitle("Total within-cluster sum of squares")+ theme(plot.title = element_text(hjust = 0.5))+ scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)p2 <- ggplot(kmeanvalue,aes(x = kk,y = betweenss))+ theme_bw()+ geom_point() +geom_line() +labs(y = "value") + ggtitle("The between-cluster sum of squares") + theme(plot.title = element_text(hjust = 0.5))+ scale_x_continuous("kmean 聚类个数",kmeanvalue$kk)grid.arrange(p1,p2,nrow=2)
set.seed(245)k3 <- kmeans(cludata[,c(1:3)],5)summary(k3)
Length Class Mode cluster 31 -none- numericcenters 15 -none- numerictotss 1 -none- numericwithinss 5 -none- numerictot.withinss 1 -none- numericbetweenss 1 -none- numericsize 5 -none- numericiter 1 -none- numericifault 1 -none- numeric
## 对聚类结果可视化clusplot(cludata[,c(1:3)],k3$cluster,main = "kmean cluster number=3")
## 可视化轮廓图,表示聚类效果sis1 <- silhouette(k3$cluster,dist(cludata[,c(1:3)],method = "euclidean"))plot(sis1,main = " kmean silhouette", col = c("red", "green", "blue"))
聚类效果良好,有效地将各省份区分开,可结合标签和汇总表分析各类比的区别
#将标签写入cludata$bzcluster<-k3$clusterhead(cludata)
钢材 | 汽车 | 天然气 | bzcluster |
---|---|---|---|
<dbl> | <dbl> | <dbl> | <int> |
0.006008797 | 0.525748498 | 0.06279836 | 2 |
0.192010733 | 0.333841606 | 0.07371890 | 3 |
1.000000000 | 0.336837071 | 0.01233577 | 1 |
0.196913677 | 0.021085253 | 0.13649613 | 5 |
0.090287270 | 0.009288345 | 0.04661822 | 5 |
0.255351128 | 0.253742007 | 0.01305395 | 3 |
cludata$地区<-yhshengfen$地区cludata
钢材 | 汽车 | 天然气 | bzcluster | 地区 |
---|---|---|---|---|
<dbl> | <dbl> | <dbl> | <int> | <chr> |
0.006008797 | 0.5257484976 | 6.279836e-02 | 2 | 北京 |
0.192010733 | 0.3338416061 | 7.371890e-02 | 3 | 天津 |
1.000000000 | 0.3368370711 | 1.233577e-02 | 1 | 河北 |
0.196913677 | 0.0210852526 | 1.364961e-01 | 5 | 山西 |
0.090287270 | 0.0092883455 | 4.661822e-02 | 5 | 内蒙古 |
0.255351128 | 0.2537420073 | 1.305395e-02 | 3 | 辽宁 |
0.054356240 | 0.9267628848 | 2.181995e-02 | 2 | 吉林 |
0.027525494 | 0.0605644873 | 9.644713e-02 | 5 | 黑龙江 |
0.064051777 | 0.8811818127 | 2.634025e-02 | 2 | 上海 |
0.500232051 | 0.2686741357 | 2.574881e-02 | 3 | 江苏 |
0.122080076 | 0.3179597117 | 2.112289e-05 | 3 | 浙江 |
0.111172453 | 0.2488043783 | 4.478053e-03 | 3 | 安徽 |
0.131563005 | 0.0517934966 | 0.000000e+00 | 5 | 福建 |
aggregate(cludata[,c(1:3)],list(cludata$bzcluster),mean)
Group.1 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<int> | <dbl> | <dbl> | <dbl> |
1 | 1.00000000 | 0.33683707 | 0.01233577 |
2 | 0.08894338 | 0.77304633 | 0.05976018 |
3 | 0.18669423 | 0.26682679 | 0.02716193 |
4 | 0.07877225 | 0.12977389 | 0.88490840 |
5 | 0.05482585 | 0.01797888 | 0.03854544 |
yhshengfen$bzcluster<-k3$cluster
head(yhshengfen)
地区 | 钢材 | 汽车 | 天然气 | bzcluster |
---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <int> |
北京 | 170.7077 | 164.0186 | 29.73 | 2 |
天津 | 5454.9542 | 104.1491 | 34.90 | 3 |
河北 | 28409.6316 | 105.0836 | 5.84 | 1 |
山西 | 5594.2450 | 6.5780 | 64.62 | 5 |
内蒙古 | 2565.0281 | 2.8977 | 22.07 | 5 |
辽宁 | 7254.4315 | 79.1603 | 6.18 | 3 |
不同类别间,各指标平均值,可得到各类别侧重产业,结合现实分析
#各类比分类统计aggregate(yhshengfen[,c(2:4)],list(yhshengfen$bzcluster),mean)
Group.1 | 钢材 | 汽车 | 天然气 |
---|---|---|---|
<int> | <dbl> | <dbl> | <dbl> |
1 | 28409.632 | 105.08360 | 5.84000 |
2 | 2526.849 | 241.16850 | 28.29167 |
3 | 5303.914 | 83.24238 | 12.85900 |
4 | 2237.891 | 40.48577 | 418.93333 |
5 | 1557.582 | 5.60890 | 18.24818 |
head(yhshengfen)
地区 | 钢材 | 汽车 | 天然气 | bzcluster |
---|---|---|---|---|
<chr> | <dbl> | <dbl> | <dbl> | <int> |
北京 | 170.7077 | 164.0186 | 29.73 | 2 |
天津 | 5454.9542 | 104.1491 | 34.90 | 3 |
河北 | 28409.6316 | 105.0836 | 5.84 | 1 |
山西 | 5594.2450 | 6.5780 | 64.62 | 5 |
内蒙古 | 2565.0281 | 2.8977 | 22.07 | 5 |
辽宁 | 7254.4315 | 79.1603 | 6.18 | 3 |
不同类别省份间,各产品产量分布情况。
library(treemap)treemap(yhshengfen,index = c("bzcluster","地区"),vSize = "钢材", vColor = "汽车",type="value",palette="RdYlGn", title = "不同分类下各地区钢材与汽车产量关系",fontfamily.title = "STKaiti", title.legend = "汽车产量(万辆)",fontfamily.legend="STKaiti")
Warning message:"package 'treemap' was built under R version 4.0.4"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
library(treemap)treemap(yhshengfen,index = c("bzcluster","地区"),vSize = "天然气", vColor = "汽车",type="value",palette="RdYlGn", title = "不同分类下各地区天然气与汽车产量",fontfamily.title = "STKaiti", title.legend = "汽车产量(万辆)",fontfamily.legend="STKaiti")
Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
## 可视化各地区产量分布 ggplot(yhshengfen,aes(x=reorder(地区,汽车),y=汽车))+ theme_bw(base_family = "STKaiti")+ geom_bar(aes(fill=汽车),stat = "identity",show.legend = F)+ coord_flip()+ scale_fill_gradient(low = "#56B1F7", high = "#00C3C2")+ labs(x="地区",y="汽车产量",title="不同地区汽车产量")+ theme(axis.text.x = element_text(vjust = 0.5), plot.title = element_text(hjust = 0.5))
Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"Warning message in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, :"Windows字体数据库里没有这样的字体系列"
这篇关于工业产量分析与预测的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!