# Reading csv file into R environment
mydata <- read.csv("example1.csv", header = TRUE)

# Attaching dataset to the search path
# attach() is kept so that code referring to the columns by bare name keeps
# working. Each attach() call pushes another copy of the data frame onto the
# search path, so re-running this script used to stack copies and print
# "The following objects are masked from mydata (pos = ...)" for Case,
# Height_in_Inches and Weight_in_Pounds. Detach any earlier copies first so
# that exactly one, current copy is attached.
while ("mydata" %in% search()) {
  detach("mydata", character.only = TRUE)
}
attach(mydata)
# Displaying header names
names(mydata)
## [1] "Case"             "Height_in_Inches" "Weight_in_Pounds"
# Summarizing the data
summary(mydata)
##       Case       Height_in_Inches Weight_in_Pounds
##  Min.   :    1   Min.   :60.3     Min.   : 78     
##  1st Qu.: 6251   1st Qu.:66.7     1st Qu.:119     
##  Median :12500   Median :68.0     Median :127     
##  Mean   :12500   Mean   :68.0     Mean   :127     
##  3rd Qu.:18750   3rd Qu.:69.3     3rd Qu.:135     
##  Max.   :25000   Max.   :75.2     Max.   :171
# Plotting data
# with() looks the columns up in mydata directly, so the plot does not depend
# on the data frame being attached; the axis labels stay the bare column names.
with(
  mydata,
  plot(Height_in_Inches, Weight_in_Pounds, main = "Weight VS Height")
)
# Figure 1

# The scatterplot shows a moderate, roughly linear, positive relationship
# between the two variables.
# A Pearson correlation coefficient can be calculated using the cor() function.
# with() evaluates the call inside mydata, so no attach() is required.
with(mydata, cor(Height_in_Inches, Weight_in_Pounds))
## [1] 0.5029
# Pearson's r = 0.5028585 means a moderate positive correlation exists between
# the variables

# We can also perform a correlation test using:
with(mydata, cor.test(Height_in_Inches, Weight_in_Pounds))
## 
##  Pearson's product-moment correlation
## 
## data:  Height_in_Inches and Weight_in_Pounds
## t = 91.98, df = 24998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4935 0.5121
## sample estimates:
##    cor 
## 0.5029
# The function does a t-test of the null hypothesis that the true correlation
# is 0 and reports a 95% confidence interval for the population correlation
# You can set "conf.level = " to change the confidence level, e.g.
with(mydata, cor.test(Height_in_Inches, Weight_in_Pounds, conf.level = 0.99))
## 
##  Pearson's product-moment correlation
## 
## data:  Height_in_Inches and Weight_in_Pounds
## t = 91.98, df = 24998, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 99 percent confidence interval:
##  0.4906 0.5149
## sample estimates:
##    cor 
## 0.5029
# Since the p-value is less than 0.05, the correlation is significantly
# different from zero. The p-value measures the evidence against a zero
# correlation, not its strength; the strength is given by r itself
# (about 0.50, i.e. a moderate positive correlation).

# Performing simple linear regression using lm() function
# In R, lm() for simple regression takes a formula of the form
# Weight ~ Height, which means something like "Weight as a function of Height"
# or "Weight as predicted by Height".
# data = mydata tells lm() where to find the variables, so the fit does not
# depend on the data frame being attached to the search path.

fit <- lm(Weight_in_Pounds ~ Height_in_Inches, data = mydata)

# Viewing the results
fit
## 
## Call:
## lm(formula = Weight_in_Pounds ~ Height_in_Inches, data = mydata)
## 
## Coefficients:
##      (Intercept)  Height_in_Inches  
##           -82.58              3.08
# More detailed results can be viewed using:
summary(fit)
## 
## Call:
## lm(formula = Weight_in_Pounds ~ Height_in_Inches, data = mydata)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -40.30  -6.71  -0.05   6.81  39.09 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -82.5757     2.2802   -36.2   <2e-16 ***
## Height_in_Inches   3.0835     0.0335    92.0   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.1 on 24998 degrees of freedom
## Multiple R-squared:  0.253,  Adjusted R-squared:  0.253 
## F-statistic: 8.46e+03 on 1 and 24998 DF,  p-value: <2e-16
# Clearly we can see that the coefficients are highly significant (three stars)
# Also, the p-value is < 0.05 (in fact, less than 2.2e-16), so we would reject
# the hypothesis that the slope is zero.
# Multiple R-squared = 0.253 means height accounts for about 25% of the
# variation in weight; the remainder is left unexplained by this model.

# Framing the equation:
# Weight_in_Pounds = -82.57574 + 3.08348 * Height_in_Inches

# Plotting regression line
# data = mydata resolves the formula variables without relying on attach()
plot(
  Weight_in_Pounds ~ Height_in_Inches,
  data = mydata,
  main = "Weight Vs. Height"
)
abline(fit, col = "red")
# Figure 2

# Plotting residuals (model diagnostic plots) on a 2 x 2 grid
# par() returns the previous settings; restore them afterwards so later plots
# are not squeezed into the 2 x 2 layout.
old_par <- par(mfrow = c(2, 2))
plot(fit)
par(old_par)
# Figure 3

# Predicting from the model fits (regression)
# Without new data, predict() returns the fitted values for the rows used in
# the fit; head() shows the first six.
head(predict(fit))
##     1     2     3     4     5     6 
## 120.3 137.9 131.4 127.8 126.4 129.3
# interval = "confidence" adds lower/upper bounds for the mean response
head(predict(fit, interval = "confidence"))
##     fit   lwr   upr
## 1 120.3 120.1 120.5
## 2 137.9 137.7 138.2
## 3 131.4 131.3 131.6
## 4 127.8 127.6 127.9
## 5 126.4 126.3 126.6
## 6 129.3 129.1 129.4
# Using prediction equation to predict weight for a given height (e.g. 80 Inches)
# The column name must match the predictor name used in the model formula.
newheight <- data.frame(Height_in_Inches = 80)
# "prediction" is spelled out in full instead of relying on partial matching
# of the abbreviation "predict".
predict(fit, newdata = newheight, interval = "prediction")
##     fit   lwr   upr
## 1 164.1 144.3 183.9
# The 95% prediction interval of the weight for the given height of 80 inches
# is between 144.3298 and 183.875 Pounds
# NOTE: 80 inches lies above the largest observed height (75.2 inches in
# summary(mydata)), so this prediction extrapolates beyond the data and should
# be treated with caution.