Statistic Basics and Linear Regression
This post explores some of the basic concepts of statistics. I mostly explore these concepts using linear regression. This is a reproducible example if you have R Studio just make sure you have installed the correct packages.
#http://r-statistics.co/Linear-Regression.html
#https://www.statmethods.net/stats/regression.html
#http://r-statistics.co/Statistical-Tests-in-R.html
#http://www.sthda.com/english/articles/40-regression-analysis/166-predict-in-r-model-predictions-and-confidence-intervals/
#Dr. Sager Utexas datasets
data <- read.table('AustinApartmentRents1.txt', header = TRUE)
summary(data)
## Rent Area
## Min. : 399.0 Min. : 474
## 1st Qu.: 470.0 1st Qu.: 666
## Median : 535.0 Median : 755
## Mean : 572.3 Mean : 816
## 3rd Qu.: 638.8 3rd Qu.: 925
## Max. :1050.0 Max. :1864
cor(data$Rent, data$Area)
## [1] 0.8740597
model <- lm(Rent ~ Area, data = data)
summary(model)
##
## Call:
## lm(formula = Rent ~ Area, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -154.659 -50.882 8.189 54.874 148.207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 160.18706 31.36081 5.108 3.8e-06 ***
## Area 0.50497 0.03685 13.702 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 68.86 on 58 degrees of freedom
## Multiple R-squared: 0.764, Adjusted R-squared: 0.7599
## F-statistic: 187.7 on 1 and 58 DF, p-value: < 2.2e-16
test.Areas <- data.frame(Area = c (500,1000))
predict(model, newdata = test.Areas)
## 1 2
## 412.6713 665.1556
data1 <- read.table('AustinApartmentRents2.txt', header = TRUE)
#A convient tool to see a lot of the initial data exploration
#https://towardsdatascience.com/simple-fast-exploratory-data-analysis-in-r-with-dataexplorer-package-e055348d9619
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 4.0.2
plot_str(data1)
plot_missing(data1)
plot_histogram(data1)
plot_density(data1)
plot_correlation(data1)
plot_bar(data1)
#create_report(data1) #This creates an HTML report of all the above information and more
#Confidence intervals around indivudual values
pred.int <- predict(model, interval = 'prediction')
## Warning in predict.lm(model, interval = "prediction"): predictions on current data refer to _future_ responses
#Confidence intervals around means
pred.conf <- predict(model, interval = 'confidence')
cbind(data,pred.int,pred.conf)
## Rent Area fit lwr upr fit lwr upr
## 1 519 725 526.2893 387.1539 665.4247 526.2893 507.2700 545.3085
## 2 765 995 662.6308 523.0320 802.2296 662.6308 640.4747 684.7868
## 3 475 481 403.0769 261.9228 544.2310 403.0769 372.6213 433.5326
## 4 575 925 627.2830 488.0776 766.4884 627.2830 607.7583 646.8077
## 5 415 600 463.1682 323.2840 603.0524 463.1682 439.2801 487.0563
## 6 530 668 497.5061 358.1044 636.9078 497.5061 476.6278 518.3843
## 7 580 725 526.2893 387.1539 665.4247 526.2893 507.2700 545.3085
## 8 995 1421 877.7474 731.7844 1023.7104 877.7474 829.7031 925.7917
## 9 565 672 499.5259 360.1470 638.9048 499.5259 478.8005 520.2514
## 10 620 1025 677.7799 537.9544 817.6053 677.7799 654.2379 701.3218
## 11 450 781 554.5675 415.5703 693.5648 554.5675 536.5869 572.5481
## 12 520 800 564.1619 425.1837 703.1402 564.1619 546.3289 581.9950
## 13 495 870 599.5097 460.4795 738.5399 599.5097 581.2764 617.7431
## 14 420 700 513.6651 374.4284 652.9017 513.6651 493.9190 533.4112
## 15 575 800 564.1619 425.1837 703.1402 564.1619 546.3289 581.9950
## 16 425 620 473.2676 333.5438 612.9913 473.2676 450.3375 496.1977
## 17 770 1040 685.3544 545.4026 825.3061 685.3544 661.0735 709.6352
## 18 445 520 422.7707 282.0919 563.4495 422.7707 394.5998 450.9416
## 19 510 880 604.5594 465.5062 743.6127 604.5594 586.1509 622.9679
## 20 635 832 580.3209 441.3427 719.2991 580.3209 562.4884 598.1535
## 21 470 545 435.3949 294.9906 575.7993 435.3949 408.6285 462.1614
## 22 700 921 625.2631 486.0744 764.4518 625.2631 605.8580 644.6682
## 23 450 577 451.5539 311.4663 591.6416 451.5539 426.5018 476.6060
## 24 785 1080 705.5531 565.2224 845.8838 705.5531 679.1757 731.9306
## 25 485 710 518.7147 379.5215 657.9080 518.7147 499.2771 538.1524
## 26 415 605 465.6930 325.8504 605.5357 465.6930 442.0494 489.3367
## 27 399 680 503.5657 364.2305 642.9008 503.5657 483.1366 523.9948
## 28 585 730 528.8141 389.6960 667.9322 528.8141 509.9220 547.7063
## 29 525 687 507.1005 367.8016 646.3994 507.1005 486.9201 527.2809
## 30 495 703 515.1800 375.9568 654.4032 515.1800 495.5288 534.8311
## 31 505 672 499.5259 360.1470 638.9048 499.5259 478.8005 520.2514
## 32 445 660 493.4663 354.0171 632.9155 493.4663 472.2734 514.6593
## 33 565 755 541.4383 402.3922 680.4845 541.4383 523.0835 559.7931
## 34 650 810 569.2116 430.2377 708.1855 569.2116 551.4123 587.0109
## 35 515 611 468.7229 328.9288 608.5169 468.7229 445.3683 492.0774
## 36 470 705 516.1899 376.9755 655.4044 516.1899 496.6009 535.7789
## 37 470 564 444.9893 304.7778 585.2009 444.9893 419.2531 470.7255
## 38 700 1250 791.3978 648.7851 934.0105 791.3978 754.7720 828.0235
## 39 455 512 418.7310 277.9593 559.5026 418.7310 390.1001 447.3618
## 40 550 630 478.3173 338.6680 617.9666 478.3173 455.8452 500.7893
## 41 625 850 589.4103 450.4146 728.4061 589.4103 571.4413 607.3794
## 42 745 1156 743.9307 602.7129 885.1486 743.9307 713.1810 774.6805
## 43 540 932 630.8178 491.5816 770.0540 630.8178 611.0749 650.5607
## 44 650 755 541.4383 402.3922 680.4845 541.4383 523.0835 559.7931
## 45 595 1093 712.1177 571.6507 852.5847 712.1177 685.0246 739.2108
## 46 470 751 539.4185 400.3624 678.4745 539.4185 520.9890 557.8479
## 47 480 608 467.2080 327.3898 607.0261 467.2080 443.7095 490.7064
## 48 460 900 614.6588 475.5477 753.7699 614.6588 595.8181 633.4994
## 49 600 860 594.4600 455.4490 733.4711 594.4600 576.3734 612.5467
## 50 575 925 627.2830 488.0776 766.4884 627.2830 607.7583 646.8077
## 51 659 944 636.8774 497.5840 776.1708 636.8774 616.7351 657.0197
## 52 650 940 634.8575 495.5838 774.1312 634.8575 614.8518 654.8632
## 53 750 1048 689.3941 549.3715 829.4168 689.3941 664.7079 714.0803
## 54 455 474 399.5422 258.2967 540.7876 399.5422 368.6660 430.4184
## 55 430 700 513.6651 374.4284 652.9017 513.6651 493.9190 533.4112
## 56 605 921 625.2631 486.0744 764.4518 625.2631 605.8580 644.6682
## 57 929 1229 780.7934 638.5205 923.0664 780.7934 745.5138 816.0731
## 58 695 896 612.6389 473.5406 751.7372 612.6389 593.8932 631.3846
## 59 455 630 478.3173 338.6680 617.9666 478.3173 455.8452 500.7893
## 60 1050 1864 1101.4485 942.4198 1260.4772 1101.4485 1022.1188 1180.7782
# 2. Regression line + confidence intervals
library("ggplot2")
## Warning: package 'ggplot2' was built under R version 4.0.2
mydata <- cbind(data, pred.int)
p <- ggplot(mydata, aes(Area, Rent)) +
geom_point() +
stat_smooth(method = lm)
# 3. Add prediction intervals
p + geom_line(aes(y = lwr), color = "red", linetype = "dashed")+
geom_line(aes(y = upr), color = "red", linetype = "dashed")
#T Test for samples
library(dplyr)
sample1 <- sample_n(data,40)
model1 <- lm(Rent ~ Area, data = data)
p1 <- predict(model1, interval = 'confidence', level = 0.95)
summary(p1)
## fit lwr upr
## Min. : 399.5 Min. : 368.7 Min. : 430.4
## 1st Qu.: 496.5 1st Qu.: 475.5 1st Qu.: 517.5
## Median : 541.4 Median : 523.1 Median : 559.8
## Mean : 572.3 Mean : 548.8 Mean : 595.7
## 3rd Qu.: 627.3 3rd Qu.: 607.8 3rd Qu.: 646.8
## Max. :1101.4 Max. :1022.1 Max. :1180.8
t.test(p1, mu = 550)
##
## One Sample t-test
##
## data: p1
## t = 2.4118, df = 179, p-value = 0.01688
## alternative hypothesis: true mean is not equal to 550
## 95 percent confidence interval:
## 554.0485 590.4849
## sample estimates:
## mean of x
## 572.2667
#MultiVariable Linear Regression
summary(data1)
## Rent Area Bedrooms Bathrooms
## Min. : 399.0 Min. : 474 Min. :1.000 Min. :1.00
## 1st Qu.: 470.0 1st Qu.: 666 1st Qu.:1.000 1st Qu.:1.00
## Median : 535.0 Median : 755 Median :1.000 Median :1.00
## Mean : 572.3 Mean : 816 Mean :1.517 Mean :1.25
## 3rd Qu.: 638.8 3rd Qu.: 925 3rd Qu.:2.000 3rd Qu.:1.25
## Max. :1050.0 Max. :1864 Max. :5.000 Max. :2.00
## Security Parking Distance Shuttle
## Min. :0.0000 Min. :0.0000 Min. : 1.100 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 5.000 1st Qu.:1.0000
## Median :0.0000 Median :0.0000 Median : 6.000 Median :1.0000
## Mean :0.1667 Mean :0.1333 Mean : 5.935 Mean :0.8667
## 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.: 7.000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :10.500 Max. :1.0000
## Age
## Min. : 1.00
## 1st Qu.:10.00
## Median :16.50
## Mean :16.33
## 3rd Qu.:22.25
## Max. :32.00
model2 <- lm(Rent ~ Area + Bathrooms, data = data1)
summary(model2)
##
## Call:
## lm(formula = Rent ~ Area + Bathrooms, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -152.02 -45.45 10.38 39.91 129.28
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 143.66927 29.51345 4.868 9.31e-06 ***
## Area 0.38746 0.04982 7.777 1.61e-10 ***
## Bathrooms 89.92902 27.75071 3.241 0.00199 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 63.83 on 57 degrees of freedom
## Multiple R-squared: 0.8007, Adjusted R-squared: 0.7937
## F-statistic: 114.5 on 2 and 57 DF, p-value: < 2.2e-16
test.bathrooms <- data.frame(Area = c(500,1000), Bathrooms = c(1,2))
p2a <- predict(model2, newdata = test.bathrooms, interval = 'confidence')
p2 <- as.data.frame(predict(model2, interval = 'confidence', level = 0.95))
cbind(p2, data1)
## fit lwr upr Rent Area Bedrooms Bathrooms Security Parking
## 1 514.5062 495.4259 533.5866 519 725 1 1 0 0
## 2 709.0493 673.7669 744.3316 765 995 2 2 0 0
## 3 419.9662 389.8581 450.0743 475 481 1 1 0 0
## 4 681.9271 643.6132 720.2411 575 925 2 2 0 1
## 5 466.0738 443.8498 488.2979 415 600 1 1 0 0
## 6 492.4211 472.8074 512.0348 530 668 1 1 0 0
## 7 514.5062 495.4259 533.5866 580 725 1 1 0 0
## 8 874.1069 829.4987 918.7151 995 1421 2 2 0 1
## 9 493.9709 474.4481 513.4937 565 672 1 1 0 0
## 10 630.7440 594.3939 667.0941 620 1025 2 1 1 0
## 11 536.2040 516.0356 556.3724 450 781 1 1 1 0
## 12 543.5657 522.6986 564.4328 520 800 2 1 0 0
## 13 570.6878 546.1304 595.2452 495 870 2 1 0 0
## 14 504.8198 485.7109 523.9287 420 700 1 1 0 0
## 15 543.5657 522.6986 564.4328 575 800 1 1 0 0
## 16 473.8230 452.5572 495.0888 425 620 1 1 0 0
## 17 726.4849 692.5303 760.4395 770 1040 2 2 0 1
## 18 435.0771 407.8700 462.2843 445 520 1 1 0 0
## 19 574.5624 549.3637 599.7611 510 880 2 1 0 1
## 20 555.9644 533.6045 578.3243 635 832 1 1 0 0
## 21 444.7636 419.2769 470.2502 470 545 1 1 0 0
## 22 680.3773 641.8590 718.8955 700 921 2 2 0 0
## 23 457.1623 433.6744 480.6501 450 577 1 1 0 0
## 24 741.9833 708.7412 775.2254 785 1080 2 2 0 0
## 25 508.6944 489.6360 527.7527 485 710 1 1 0 0
## 26 468.0111 446.0397 489.9826 415 605 1 1 0 0
## 27 497.0706 477.7062 516.4349 399 680 1 1 0 1
## 28 516.4435 497.3298 535.5573 585 730 2 1 0 0
## 29 499.7828 480.5310 519.0346 525 687 1 1 0 0
## 30 505.9821 486.8939 525.0704 495 703 1 1 0 0
## 31 493.9709 474.4481 513.4937 505 672 1 1 1 0
## 32 489.3214 469.5030 509.1398 445 660 1 1 0 0
## 33 526.1300 506.6576 545.6024 565 755 2 1 0 0
## 34 547.4403 526.1468 568.7337 650 810 2 1 0 0
## 35 470.3359 448.6563 492.0154 515 611 1 1 0 0
## 36 506.7571 487.6799 525.8342 470 705 1 1 0 0
## 37 452.1253 427.8562 476.3944 470 564 1 1 0 0
## 38 807.8514 772.3992 843.3035 700 1250 3 2 0 1
## 39 431.9774 404.1949 459.7599 455 512 1 1 1 0
## 40 477.6976 456.8558 498.5395 550 630 1 1 0 0
## 41 562.9387 539.5887 586.2886 625 850 2 1 1 0
## 42 771.4302 738.2367 804.6237 745 1156 3 2 0 0
## 43 594.7103 565.8488 623.5718 540 932 2 1 0 0
## 44 526.1300 506.6576 545.6024 650 755 1 1 1 1
## 45 747.0203 713.9093 780.1312 595 1093 2 2 1 0
## 46 524.5802 505.1862 543.9741 470 751 1 1 1 0
## 47 469.1735 447.3496 490.9974 480 608 1 1 0 0
## 48 582.3116 555.7642 608.8590 460 900 1 1 0 1
## 49 566.8132 542.8728 590.7537 600 860 2 1 0 0
## 50 591.9981 563.6574 620.3388 575 925 2 1 0 0
## 51 689.2888 651.9023 726.6754 659 944 2 2 0 0
## 52 687.7390 650.1632 725.3148 650 940 2 2 0 0
## 53 729.5846 695.8090 763.3602 750 1048 2 2 0 0
## 54 417.2540 386.6020 447.9060 455 474 1 1 0 0
## 55 504.8198 485.7109 523.9287 430 700 1 1 0 0
## 56 590.4483 562.4017 618.4949 605 921 1 1 0 0
## 57 799.7147 764.9734 834.4561 929 1229 2 2 1 0
## 58 670.6908 630.8290 710.5526 695 896 2 2 0 0
## 59 477.6976 456.8558 498.5395 455 630 1 1 1 0
## 60 1045.7513 964.5360 1126.9667 1050 1864 5 2 0 0
## Distance Shuttle Age
## 1 10.5 1 9
## 2 6.5 1 17
## 3 6.5 1 17
## 4 4.0 1 9
## 5 5.0 1 30
## 6 6.5 1 19
## 7 7.0 1 17
## 8 6.5 1 16
## 9 7.0 1 17
## 10 5.0 1 3
## 11 5.5 1 3
## 12 6.0 1 20
## 13 5.0 1 27
## 14 6.0 1 22
## 15 7.0 1 10
## 16 8.0 0 27
## 17 6.5 1 16
## 18 3.0 1 12
## 19 7.0 0 25
## 20 6.0 1 13
## 21 6.5 1 9
## 22 3.0 1 26
## 23 8.0 1 18
## 24 5.0 1 10
## 25 6.0 1 25
## 26 6.0 1 22
## 27 7.0 0 25
## 28 6.5 1 19
## 29 7.0 1 15
## 30 6.5 1 14
## 31 6.5 1 9
## 32 6.0 1 25
## 33 3.0 1 12
## 34 2.0 1 32
## 35 6.5 1 17
## 36 7.5 0 13
## 37 5.0 1 10
## 38 4.0 1 9
## 39 10.0 0 10
## 40 2.0 1 32
## 41 7.0 1 1
## 42 7.5 0 13
## 43 6.0 1 22
## 44 1.1 1 26
## 45 5.5 1 3
## 46 5.0 1 3
## 47 6.0 1 15
## 48 4.0 1 9
## 49 6.0 1 25
## 50 6.0 1 23
## 51 7.0 1 25
## 52 8.0 0 27
## 53 7.0 1 3
## 54 5.0 1 10
## 55 6.0 1 20
## 56 7.5 0 13
## 57 5.0 1 11
## 58 6.5 1 19
## 59 5.5 1 9
## 60 6.0 1 22
p2a
## fit lwr upr
## 1 427.3279 398.6624 455.9935
## 2 710.9866 675.8776 746.0955
model3 <- lm(Rent ~ Area + Bathrooms + Security + Parking + Distance, data = data1)
summary(model3)
##
## Call:
## lm(formula = Rent ~ Area + Bathrooms + Security + Parking + Distance,
## data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.199 -37.278 4.548 38.345 149.276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 167.45045 43.12346 3.883 0.000283 ***
## Area 0.39647 0.05084 7.798 2.09e-10 ***
## Bathrooms 92.60040 28.01804 3.305 0.001691 **
## Security -0.67875 22.42818 -0.030 0.975969
## Parking -38.26531 25.89638 -1.478 0.145316
## Distance -4.92937 5.05014 -0.976 0.333374
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.02 on 54 degrees of freedom
## Multiple R-squared: 0.81, Adjusted R-squared: 0.7925
## F-statistic: 46.06 on 5 and 54 DF, p-value: < 2.2e-16
model4 <- lm(Rent ~ Distance + Parking + Security + Bathrooms + Area, data = data1)
summary(model4)
##
## Call:
## lm(formula = Rent ~ Distance + Parking + Security + Bathrooms +
## Area, data = data1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -163.199 -37.278 4.548 38.345 149.276
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 167.45045 43.12346 3.883 0.000283 ***
## Distance -4.92937 5.05014 -0.976 0.333374
## Parking -38.26531 25.89638 -1.478 0.145316
## Security -0.67875 22.42818 -0.030 0.975969
## Bathrooms 92.60040 28.01804 3.305 0.001691 **
## Area 0.39647 0.05084 7.798 2.09e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 64.02 on 54 degrees of freedom
## Multiple R-squared: 0.81, Adjusted R-squared: 0.7925
## F-statistic: 46.06 on 5 and 54 DF, p-value: < 2.2e-16
#Multicollinearity
#http://www.sthda.com/english/articles/39-regression-model-diagnostics/160-multicollinearity-essentials-and-vif-in-r/
library(caret)
## Warning: package 'caret' was built under R version 4.0.2
set.seed(1234)
y <- runif(50,min=0, max =100)
x1 <- runif(50, min = 0, max = 100)
x2 <- runif(50, min = 0, max = 100)
z1 <- x1+x2
z2 <- x1 + x2 + 0.005*runif(50,min=0, max =100)
list <- cbind(y,x1,x2,z1,z2)
list
## y x1 x2 z1 z2
## [1,] 11.3703411 7.377988 3.545673 10.92366 11.14318
## [2,] 62.2299405 30.968660 56.507611 87.47627 87.59057
## [3,] 60.9274733 71.727174 28.025778 99.75295 99.79403
## [4,] 62.3379442 50.454591 20.419632 70.87422 71.29936
## [5,] 86.0915384 15.299896 13.373890 28.67379 28.79112
## [6,] 64.0310605 50.393349 32.568192 82.96154 83.45562
## [7,] 0.9495756 49.396092 15.506197 64.90229 65.20324
## [8,] 23.2550506 75.120020 12.996214 88.11623 88.61560
## [9,] 66.6083758 17.464982 43.553106 61.01809 61.20589
## [10,] 51.4251141 84.839241 3.864265 88.70351 88.98107
## [11,] 69.3591292 86.483383 71.330156 157.81354 158.02826
## [12,] 54.4974836 4.185728 10.076904 14.26263 14.55057
## [13,] 28.2733584 31.718216 95.030494 126.74871 126.96496
## [14,] 92.3433484 1.374994 12.181776 13.55677 13.66919
## [15,] 29.2315840 23.902573 21.965662 45.86823 45.91073
## [16,] 83.7295628 70.649462 91.308777 161.95824 162.27689
## [17,] 28.6223285 30.809476 94.585312 125.39479 125.61030
## [18,] 26.6820780 50.854757 27.915622 78.77038 78.80674
## [19,] 18.6722790 5.164662 12.347109 17.51177 17.91297
## [20,] 23.2225911 56.456984 79.716046 136.17303 136.33567
## [21,] 31.6612455 12.148019 74.427722 86.57574 86.95438
## [22,] 30.2693371 89.283638 91.597422 180.88106 181.17320
## [23,] 15.9046003 1.462726 99.459825 100.92255 101.27697
## [24,] 3.9995918 78.312110 94.236072 172.54818 172.76167
## [25,] 21.8799541 8.996133 48.613541 57.60967 57.78146
## [26,] 81.0598552 51.918998 28.345954 80.26495 80.64451
## [27,] 52.5697547 38.426669 25.154570 63.58124 63.79325
## [28,] 91.4658166 7.005250 50.325517 57.33077 57.61121
## [29,] 83.1345047 32.064442 49.696617 81.76106 81.81913
## [30,] 4.5770263 66.849540 31.844581 98.69412 98.84563
## [31,] 45.6091482 92.640048 96.222283 188.86233 189.10173
## [32,] 26.5186672 47.190972 63.409937 110.60091 110.77332
## [33,] 30.4672203 14.261534 12.743340 27.00487 27.30523
## [34,] 50.7306870 54.426976 42.304699 96.73167 96.76972
## [35,] 18.1096208 19.617465 91.431691 111.04916 111.52715
## [36,] 75.9670635 89.858049 46.779233 136.63728 136.64839
## [37,] 20.1248038 38.949978 90.816915 129.76689 130.18775
## [38,] 25.8809819 31.087078 59.774328 90.86141 91.17763
## [39,] 99.2150418 16.002866 63.174282 79.17715 79.33219
## [40,] 80.7352340 89.618585 86.915832 176.53442 176.90570
## [41,] 55.3333591 16.639378 50.274982 66.91436 67.23382
## [42,] 64.6406094 90.042460 98.363511 188.40597 188.90223
## [43,] 31.1824307 13.407820 32.438603 45.84642 45.91056
## [44,] 62.1819198 13.161413 48.137495 61.29891 61.74053
## [45,] 32.9770176 10.528750 35.698708 46.22746 46.63250
## [46,] 50.1997473 51.158358 62.747768 113.90613 114.31705
## [47,] 67.7094527 30.019905 74.160019 104.17992 104.59728
## [48,] 48.4991239 2.671690 56.596682 59.26837 59.63474
## [49,] 24.3928827 30.964743 98.078651 129.04339 129.53492
## [50,] 76.5459788 74.211966 57.681274 131.89324 132.21284
cor(list)
## y x1 x2 z1 z2
## y 1.0000000000 0.07758827 -0.07522322 -0.0005374415 -0.0008533985
## x1 0.0775882672 1.00000000 0.25313221 0.7815275201 0.7811302343
## x2 -0.0752232228 0.25313221 1.00000000 0.8013821453 0.8017562132
## z1 -0.0005374415 0.78152752 0.80138215 1.0000000000 0.9999955888
## z2 -0.0008533985 0.78113023 0.80175621 0.9999955888 1.0000000000
#model catches exact multicollinearity easily
mcmodel1 <- lm(y ~ x1+x2+z1+z2)
summary(mcmodel1)
##
## Call:
## lm(formula = y ~ x1 + x2 + z1 + z2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.767 -21.484 -2.126 19.999 53.375
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.22 10.53 4.865 1.38e-05 ***
## x1 15.81 28.05 0.563 0.576
## x2 15.65 28.09 0.557 0.580
## z1 NA NA NA NA
## z2 -15.72 28.06 -0.560 0.578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.93 on 46 degrees of freedom
## Multiple R-squared: 0.0223, Adjusted R-squared: -0.04146
## F-statistic: 0.3498 on 3 and 46 DF, p-value: 0.7895
#has a harder time catching near multicollinearity useful to use VIF or tolerance
mcmodel2 <- lm(y ~ x1+x2+z2)
summary(mcmodel2)
##
## Call:
## lm(formula = y ~ x1 + x2 + z2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -48.767 -21.484 -2.126 19.999 53.375
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 51.22 10.53 4.865 1.38e-05 ***
## x1 15.81 28.05 0.563 0.576
## x2 15.65 28.09 0.557 0.580
## z2 -15.72 28.06 -0.560 0.578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.93 on 46 degrees of freedom
## Multiple R-squared: 0.0223, Adjusted R-squared: -0.04146
## F-statistic: 0.3498 on 3 and 46 DF, p-value: 0.7895
car::vif(mcmodel2)
## x1 x2 z2
## 45305.13 49446.24 118711.40
#Can drop the collinear term
mcmodel3 <- lm(y ~ x1+x2)
summary(mcmodel3)
##
## Call:
## lm(formula = y ~ x1 + x2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.044 -19.925 -0.354 19.215 55.525
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.74591 8.44508 5.654 8.96e-07 ***
## x1 0.09332 0.13522 0.690 0.493
## x2 -0.08784 0.12964 -0.678 0.501
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.73 on 47 degrees of freedom
## Multiple R-squared: 0.01564, Adjusted R-squared: -0.02625
## F-statistic: 0.3733 on 2 and 47 DF, p-value: 0.6905
car::vif(mcmodel3)
## x1 x2
## 1.068463 1.068463
#Or drop the other one
mcmodel4 <- lm(y ~ x1+z2)
summary(mcmodel4)
##
## Call:
## lm(formula = y ~ x1 + z2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.050 -19.928 -0.363 19.217 55.519
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 47.77939 8.46296 5.646 9.21e-07 ***
## x1 0.18145 0.20950 0.866 0.391
## z2 -0.08807 0.12948 -0.680 0.500
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 26.73 on 47 degrees of freedom
## Multiple R-squared: 0.01571, Adjusted R-squared: -0.02618
## F-statistic: 0.3751 on 2 and 47 DF, p-value: 0.6893
car::vif(mcmodel4)
## x1 z2
## 2.565184 2.565184
#normality check
#https://www.statmethods.net/stats/regression.html
nmodel <- summary(model2)
nmodel$residuals
## 1 2 3 4 5 6
## 4.493754 55.950733 55.033809 -106.927120 -51.073841 37.578930
## 7 8 9 10 11 12
## 65.493754 120.893095 71.029094 -10.744020 -86.203964 -23.565690
## 13 14 15 16 17 18
## -75.687837 -84.819765 31.434310 -48.823026 43.515067 9.922899
## 19 20 21 22 23 24
## -64.562429 79.035614 25.236418 19.622717 -7.162278 43.016698
## 25 26 27 28 29 30
## -23.694358 -53.011137 -98.070580 68.556457 25.217205 -10.982143
## 31 32 33 34 35 36
## 11.029094 -44.321396 38.869976 102.559718 44.664107 -36.757062
## 37 38 39 40 41 42
## 17.874692 -107.851374 23.022573 72.302382 62.061348 -26.430205
## 43 44 45 46 47 48
## -54.710310 123.869976 -152.020273 -54.580187 10.826485 -122.311614
## 49 50 51 52 53 54
## 33.186756 -16.998095 -30.288845 -37.739008 20.415393 37.746024
## 55 56 57 58 59 60
## -74.819765 14.551742 129.285270 24.309199 -22.697618 4.248650
layout(matrix(c(1,2,3,4),2,2)) # optional 4 graphs/page
plot(model2)
hist(nmodel$residuals)
#Runs different tests for normality run from the predicted values for rent
p3 <- predict(model2)
summary(p3)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 417.3 491.6 526.1 572.3 640.7 1045.8
sd(p3)
## [1] 125.7441
#H0: from normal distribution p < 0.05 reject
shapiro.test(p3)
##
## Shapiro-Wilk normality test
##
## data: p3
## W = 0.86842, p-value = 1.114e-05
#compares if two samples are from same distribution so comparing to a normal distribution H0: from different distributions p < 0.05 reject
ks.test(p3, rnorm(60,572.3,125.7441))
## Warning in ks.test(p3, rnorm(60, 572.3, 125.7441)): cannot compute exact p-value
## with ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: p3 and rnorm(60, 572.3, 125.7441)
## D = 0.18333, p-value = 0.2656
## alternative hypothesis: two-sided
#Linearity Test
#https://bookdown.org/ccolonescu/RPoE4/further-inference-in-multiple-regression.html
#http://r-statistics.co/Statistical-Tests-in-R.html
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.0.2
#Ramsey RESET test test whether higher order polynomials are necessary H0: no higher order polynomials are necssary
resettest(model3, power = 2:3, type = 'fitted')
##
## RESET test
##
## data: model3
## RESET = 1.3333, df1 = 2, df2 = 52, p-value = 0.2725
resettest(model3, power = 2:3, type = 'regressor')
##
## RESET test
##
## data: model3
## RESET = 2.0443, df1 = 10, df2 = 44, p-value = 0.05111
#Heteroscedasticity
#Put simply, heteroscedasticity (also spelled heteroskedasticity) refers to the circumstance in which the variability of a variable is unequal across the range of values of a second variable that predicts it.
#http://www.statsmakemecry.com/smmctheblog/confusing-stats-terms-explained-heteroscedasticity-heteroske.html
#Fisher Test can be used to tell if two samples have the same variance H0: ratio of variances is 1 aka they are the same p < 0.05 reject H0
var.test(sample(35,p3, replace = TRUE),sample(35,p3, replace = TRUE))
##
## F test to compare two variances
##
## data: sample(35, p3, replace = TRUE) and sample(35, p3, replace = TRUE)
## F = 0.91901, num df = 513, denom df = 513, p-value = 0.3391
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7728094 1.0928623
## sample estimates:
## ratio of variances
## 0.9190072
#Independance
#Chi square tests if two caterogical variables are dependant on each other H0: variables are independant p < 0.05 reject H0
chisq.test(table(data1$Bedrooms,data1$Bathrooms))
## Warning in chisq.test(table(data1$Bedrooms, data1$Bathrooms)): Chi-squared
## approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: table(data1$Bedrooms, data1$Bathrooms)
## X-squared = 29.391, df = 3, p-value = 1.853e-06
summary(table(data1$Bedrooms,data1$Bathrooms))
## Number of cases in table: 60
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 29.391, df = 3, p-value = 1.853e-06
## Chi-squared approximation may be incorrect
#Correlation test between two continuous variables H0: correlation is 0 aka they are independant p < 0.05 reject H0
#All show some correlation
cor.test(data1$Rent, data1$Area)
##
## Pearson's product-moment correlation
##
## data: data1$Rent and data1$Area
## t = 13.702, df = 58, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7970327 0.9231055
## sample estimates:
## cor
## 0.8740597
cor.test(data1$Bedrooms, data1$Area)
##
## Pearson's product-moment correlation
##
## data: data1$Bedrooms and data1$Area
## t = 12.811, df = 58, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7747757 0.9140118
## sample estimates:
## cor
## 0.8595894
cor.test(data1$Bedrooms, data1$Bathrooms)
##
## Pearson's product-moment correlation
##
## data: data1$Bedrooms and data1$Bathrooms
## t = 6.6217, df = 58, p-value = 1.262e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4826241 0.7800925
## sample estimates:
## cor
## 0.6561389
cor.test(data1$Bedrooms, data1$Bathrooms)
##
## Pearson's product-moment correlation
##
## data: data1$Bedrooms and data1$Bathrooms
## t = 6.6217, df = 58, p-value = 1.262e-08
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4826241 0.7800925
## sample estimates:
## cor
## 0.6561389
#Are uncorrelated
cor.test(data1$Area, data1$Distance)
##
## Pearson's product-moment correlation
##
## data: data1$Area and data1$Distance
## t = -0.4492, df = 58, p-value = 0.655
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3081967 0.1980052
## sample estimates:
## cor
## -0.05887988
#Both of these tests use log parameters as well as lag and leads to determine if the variance changes or if the predictors are truly independant of each other
#Other helpful Stat and R learning
#http://faculty.marshall.usc.edu/gareth-james/ISL/index.html
#https://web.stanford.edu/~hastie/ElemStatLearn/
#Essentials of Machine Learning https://www.analyticsvidhya.com/blog/2017/09/common-machine-learning-algorithms/
#CLT and Stat Basics https://www.analyticsvidhya.com/blog/2019/05/statistics-101-introduction-central-limit-theorem/
Add a new chunk by clicking the Insert Chunk button on the toolbar or by pressing Ctrl+Alt+I.
When you save the notebook, an HTML file containing the code and output will be saved alongside it (click the Preview button or press Ctrl+Shift+K to preview the HTML file).
The preview shows you a rendered HTML copy of the contents of the editor. Consequently, unlike Knit, Preview does not run any R code chunks. Instead, the output of the chunk when it was last run in the editor is displayed.