# 1972 Fuel Consumption Data -- DIAGNOSTICS # # The dataset below contains data for the 50 states, including # 1971 population in thousands (Pop), # 1972 gasoline tax in cents per gallon (Tax), # thousands of licensed drivers 1971 (Lic), # per capita income in dollars (Inc), # thousands of miles of federally funded highways in 1971 (Hwy) # total 1972 fuel consumption in millions of gallons (Gas) # # Source: Data collected by Cristopher Bingham from # the American Almanac, 1974 # republished in Weisberg, Applied Linear Regression, 2nd Edition. "Fuel" <- structure(list(Pop = c(1029, 771, 462, 5787, 968, 3082, 18366, 7367, 11926, 10783, 5291, 11251, 9082, 4520, 3896, 2883, 4753, 632, 579, 1525, 2258, 565, 4056, 4764, 1781, 5214, 2665, 4720, 7259, 3299, 4031, 3510, 2263, 1978, 3720, 2634, 11649, 719, 756, 345, 2357, 1065, 1945, 1126, 527, 3443, 2182, 20468, 303, 801 ), Tax = c(9, 9, 9, 7.5, 8, 10, 8, 8, 8, 7, 8, 7.5, 7, 7, 7, 7, 7, 7, 7, 8.5, 7, 8, 9, 9, 8.5, 9, 8, 7.5, 8, 9, 7, 7, 8, 7.5, 8, 6.58, 5, 7, 8.5, 7, 7, 7, 7, 7, 6, 9, 7, 7, 8, 5), Lic = c(540, 441, 268, 3060, 527, 1760, 8278, 4074, 6312, 5948, 2804, 5903, 5213, 2465, 2368, 1689, 2719, 341, 419, 1033, 1496, 340, 2073, 2463, 982, 2835, 1460, 2731, 4084, 1626, 2088, 1801, 1309, 1081, 1813, 1657, 6595, 421, 501, 232, 1475, 600, 1173, 572, 354, 1966, 1360, 12130, 141, 519), Inc = c(3571, 4092, 3865, 4870, 4399, 5342, 5319, 5126, 4447, 4512, 4391, 5126, 4817, 4207, 4332, 4318, 4206, 3718, 4716, 4341, 4593, 4983, 4897, 4258, 4574, 3721, 3448, 3846, 4188, 3601, 3640, 3333, 3063, 3357, 3528, 3802, 4045, 3897, 3635, 4345, 4449, 3656, 4300, 3745, 5215, 4476, 4296, 5002, 5162, 4995), Hwy = c(1976, 1250, 1586, 2351, 431, 1333, 11868, 2138, 8577, 8507, 5939, 14186, 6930, 6580, 8159, 10340, 8508, 4725, 5915, 6010, 7834, 602, 2449, 4686, 2619, 4746, 5399, 9061, 5975, 4650, 6905, 6594, 6524, 4121, 3495, 7834, 17782, 6385, 3274, 3905, 4639, 3985, 3635, 2611, 2302, 3942, 4083, 9794, 3246, 602 ), Gas = c(557, 404, 259, 2396, 397, 1408, 6312, 3439, 5528, 5375, 3068, 5301, 4768, 2294, 2204, 1830, 2865, 451, 501, 976, 1466, 305, 1883, 2604, 819, 2953, 1537, 2979, 4169, 1761, 2301, 1946, 1306, 1242, 1812, 1695, 7451, 506, 490, 334, 1384, 744, 1230, 666, 412, 1757, 1331, 10730, 172, 276)), .Names = c("Pop", "Tax", "Lic", "Inc", "Hwy", "Gas"), class = "data.frame", row.names = c("ME", "NH", "VT", "MA", "RI", "CT", "NY", "NJ", "PA", "OH", "IN", "IL", "MI", "WI", "MN", "IA", "MO", "ND", "SD", "NE", "KS", "DE", "MD", "VA", "WV", "NC", "SC", "GA", "FL", "KY", "TN", "AL", "MS", "AR", "LA", "OK", "TX", "MT", "ID", "WY", "CO", "NM", "AZ", "UT", "NV", "WA", "OR", "CA", "AK", "HI")) # PREVIOUS ANALYSIS (ANOVA, F tests): Fuel$pLic = 100*Fuel$Lic/Fuel$Pop #(now percent of population with licenses) Fuel$pGas = 1000*Fuel$Gas/Fuel$Pop #(now in gallons per person, consumed in one year) pGas.pLic.lm = lm(pGas~pLic, data=Fuel) summary(pGas.pLic.lm) pdf(file='pGaspLic.pdf') plot(Fuel$pLic,Fuel$pGas) abline(pGas.pLic.lm) dev.off() pGas.Tax.lm = lm(pGas~Tax, data=Fuel) summary(pGas.Tax.lm) pdf(file='pGasTax.pdf') plot(Fuel$Tax,Fuel$pGas) abline(pGas.Tax.lm) dev.off() Tax.pLic.lm = lm(Tax~pLic, data=Fuel) summary(Tax.pLic.lm) pdf(file='TaxpLic.pdf') plot(Fuel$pLic,Fuel$Tax) abline(Tax.pLic.lm) dev.off() res.pGas.Tax.lm = lm(residuals(pGas.pLic.lm)~residuals(Tax.pLic.lm)) summary(res.pGas.Tax) pdf(file='RespGasTax.pdf') plot(residuals(Tax.pLic.lm),residuals(pGas.pLic.lm)) abline(res.pGas.Tax.lm) dev.off() pGas.pLic.Tax.lm = lm(pGas~pLic+Tax, data=Fuel) summary(pGas.pLic.Tax.lm) pdf(file='pGaspLicTax.pdf') plot(fitted(pGas.pLic.Tax.lm),Fuel$pGas) abline(0,1) dev.off() anova(pGas.pLic.Tax.lm) anova(pGas.pLic.Tax.lm,pGas.pLic.lm) ############################################################################# ########### DIAGNOSTICS pGas.full.lm = lm(pGas~pLic+Tax+Inc+Hwy, data=Fuel) summary(pGas.full.lm) #Residuals: # Min 1Q Median 3Q Max #-278.06 -42.95 -14.15 23.37 276.70 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) 221.015307 229.365813 0.964 0.34040 #pLic 12.072138 2.368490 5.097 6.66e-06 *** #Tax -7.668943 14.631106 -0.524 0.60274 #Inc -0.068874 0.021217 -3.246 0.00221 ** #Hwy 0.002954 0.003965 0.745 0.46003 # #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 86.4 on 45 degrees of freedom #Multiple R-squared: 0.4761, Adjusted R-squared: 0.4295 #F-statistic: 10.22 on 4 and 45 DF, p-value: 5.647e-06 library(car) hist(residuals(pGas.full.lm),nclass=20) hist(rstandard(pGas.full.lm),nclass=20) hist(hatvalues(pGas.full.lm),nclass=20) hist(cooks.distance(pGas.full.lm),nclass=20) #influence.measures(pGas.full.lm) plot(pGas.full.lm) cr.plots(pGas.full.lm) ########### MODEL WITHOUT AK AND HI Fuel.cont = Fuel[1:48,] pGas.full.cont.lm = lm(pGas~pLic+Tax+Inc+Hwy, data=Fuel.cont) summary(pGas.full.cont.lm) # #Residuals: # Min 1Q Median 3Q Max #-122.47 -44.93 -10.56 31.57 234.41 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) 376.780329 185.690258 2.029 0.048670 * #pLic 13.365947 1.924349 6.946 1.54e-08 *** #Tax -34.777939 12.976102 -2.680 0.010389 * #Inc -0.066513 0.017227 -3.861 0.000375 *** #Hwy -0.002417 0.003391 -0.713 0.479984 #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 66.33 on 43 degrees of freedom #Multiple R-squared: 0.6784, Adjusted R-squared: 0.6485 #F-statistic: 22.68 on 4 and 43 DF, p-value: 3.971e-10 plot(pGas.full.cont.lm) cr.plots(pGas.full.cont.lm) ############ FURTHER W/O WY Fuel.cont.Wy = Fuel.cont[Fuel.cont[1,]!="WY",] pGas.full.cont.Wy.lm = lm(pGas~pLic+Tax+Inc+Hwy, data=Fuel.cont.Wy) summary(pGas.full.cont.Wy.lm) # #Residuals: # Min 1Q Median 3Q Max #-122.47 -44.93 -10.56 31.57 234.41 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) 376.780329 185.690258 2.029 0.048670 * #pLic 13.365947 1.924349 6.946 1.54e-08 *** #Tax -34.777939 12.976102 -2.680 0.010389 * #Inc -0.066513 0.017227 -3.861 0.000375 *** #Hwy -0.002417 0.003391 -0.713 0.479984 #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 # #Residual standard error: 66.33 on 43 degrees of freedom #Multiple R-squared: 0.6784, Adjusted R-squared: 0.6485 #F-statistic: 22.68 on 4 and 43 DF, p-value: 3.971e-10 plot(pGas.full.cont.Wy.lm) cr.plots(pGas.full.cont.Wy.lm) ########################################### ## JOINT TESTING OF LM MODEL ASSUMPTIONS install.packages("gvlma") library(gvlma) fuel.lmgv = gvlma(pGas~pLic+Tax+Inc+Hwy, data=Fuel.cont) summary(fuel.lmgv) plot(fuel.lmgv) deletion.gvlma(fuel.lmgv) ########################################### ## KEEPING INFLUENTIAL POINTS IN THE MODEL Fuel$HIind = rep(0,50) Fuel$HIind[50] = 1 Fuel$AKind = rep(0,50) Fuel$AKind[49] = 1 pGas.full.Out.lm = lm(pGas~pLic+Tax+Inc+Hwy+AKind+HIind, data=Fuel) summary(pGas.full.Out.lm) anova(pGas.full.Out.lm, pGas.full.lm ) #Analysis of Variance Table # #Model 1: pGas ~ pLic + Tax + Inc + Hwy + AKind + HIind #Model 2: pGas ~ pLic + Tax + Inc + Hwy # Res.Df RSS Df Sum of Sq F Pr(>F) #1 43 189205 #2 45 335938 -2 -146733 16.674 4.36e-06 *** #--- #Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 plot(pGas.full.Out.lm) cr.plots(pGas.full.Out2.lm) Fuel$WYind = rep(0,50) Fuel$WYind[40] = 1 pGas.full.Out2.lm = lm(pGas~pLic+Tax+Inc+Hwy+AKind+HIind+WYind, data=Fuel) anova(pGas.full.Out2.lm,pGas.full.Out.lm) summary(pGas.full.Out2.lm) #Residuals: # Min 1Q Median 3Q Max #-116.982 -31.711 -3.382 24.494 124.942 # #Coefficients: # Estimate Std. Error t value Pr(>|t|) #(Intercept) 4.313e+02 1.551e+02 2.781 0.00808 ** #pLic 1.173e+01 1.644e+00 7.137 9.27e-09 *** #Tax -3.138e+01 1.083e+01 -2.898 0.00595 ** #Inc -6.617e-02 1.434e-02 -4.613 3.70e-05 *** #Hwy -1.351e-03 2.834e-03 -0.477 0.63605 #AKind 1.876e+02 6.095e+01 3.078 0.00366 ** #HIind -3.584e+02 6.787e+01 -5.282 4.25e-06 *** #WYind 2.606e+02 5.823e+01 4.474 5.75e-05 *** # #Residual standard error: 55.23 on 42 degrees of freedom #Multiple R-squared: 0.8002, Adjusted R-squared: 0.7669 #F-statistic: 24.03 on 7 and 42 DF, p-value: 8.997e-13 plot(pGas.full.Out2.lm) cr.plots(pGas.full.Out2.lm) fuel.out.lmgv = gvlma(pGas~pLic+Tax+Inc+Hwy+AKind+HIind+WYind, data=Fuel) summary(fuel.out.lmgv) #gvlma(x = pGas ~ pLic + Tax + Inc + Hwy + AKind + HIind + WYind, data = Fuel) # # Value p-value Decision #Global Stat 4.7664 0.3121 Assumptions acceptable. #Skewness 1.8074 0.1788 Assumptions acceptable. #Kurtosis 0.0271 0.8692 Assumptions acceptable. #Link Function 2.2723 0.1317 Assumptions acceptable. #Heteroscedasticity 0.6596 0.4167 Assumptions acceptable.