#read the data from csv file
d <- read.csv(
"https://stats.idre.ucla.edu/wp-content/uploads/2019/02/elemapi2v2.csv")

#We can also read the data from the local computer
#we can set the working directory by setwd()
#d <- read.csv("C:/.../regR/elemapi2v2.csv")
class(d)  #class of object d, returns data.frame
names(d)  #retuns names of variables (columns)
dim(d)    #returns number of rows and columns of data frame


str(d)      #returns structure of variables in the data frame d
summary(d)  #Summary statistics of variables in d
help(lm)    #shows R Documentation for function lm()

#multiple regression model of DV api00 and DVs enroll, meals, and full
m1 <- lm(api00 ~ enroll, data = d) 
print(m1)   #Prints coefficients of the model
summary(m1) #Prints summary of the model
r <- cor(d$api00, d$enroll) #correlation coefficient of api00 and enroll
r ^ 2       #this is equal to r-squared in simple regression 
ls(m1)      #list of components in object class lm
m1$coefficients # returns coefficients of the model
m1$fitted.values[1:10] #a vector of fitted values
residuals <- m1$resid  #a vector of residuals
coefficients(m1)     # returns coefficients of the model
confint(m1)          # returns a matrix of Confidence Interval for coefficients

plot(api00 ~ enroll, data = d) #scatter plot of api00 vs. enroll
abline(m1, col = "blue")    #Add regression line to the scatter plot
#adding labels(school number) to the scatter plot
plot(api00 ~ enroll, data = d)
text(d$enroll, d$api00+20, labels = d$snum, cex = .7)
abline(m1, col = "blue")


anova(m1) #Anova table
#multiple regression model of DV api00 and DVs enroll, meals, and full
m2 <- lm(api00 ~  enroll + meals + full, data = d)
summary(m2) #summary of model m2
anova(m2)   #anova table of model m2

sum(anova(m2)$Sum) #sum of RSS and SSreg
(400 - 1) * var(d$api00) #Total sum of squre 

#Standardized regression model
m2.sd <- lm(scale(api00) ~  scale(enroll) + scale(meals) + scale(full), data = d)
summary(m2.sd)#coefficients are standardized



#---------------------------------------------------------------

# install packages for part 2, Regression Diagnostics

#install.packages("car")
#install.packages("alr3")
#install.packages("faraway")

#loading packages into working environment
library(car)
library(faraway)
library(alr3)
#scatter plot matrix from package car
scatterplotMatrix(~ api00 + enroll + meals + full, data =d)
#studentized residuals
er.std <- rstandard(m2)
#plot of studentized residuals
plot(er.std, ylab="Standardized Residual", ylim=c(-3.5,3.5))
#adding horizental lines for cut of value of outliers
abline(h =c(-3,0,3), lty = 2)
#determine which row is outlier(outside of cut of values of -3 and 3)
index <- which(er.std > 3 | er.std < -3)
#label School name to points that are out of bounds 
text(index-20, er.std[index] , labels = d$snum[index])
#print row number of values that are out of bounds
index
#print school names that are out of bounds
d$snum[index]

#Bonferroni p-values for testing outliner
outlierTest(m2)

#a vector containing the diagonal of the 'hat' matrix
h <- influence(m2)$hat

#half normal plot of leverage from package faraway
halfnorm(h, ylab = "leverage")


#the cut of value for cook's distance
cutoff <- 4/((nrow(d)-length(m2$coefficients)-2))
#plot cook's distance
plot(m2, which = 4, cook.levels = cutoff)

#cook's distance, studentized residuals, and leverage in the same plot
influencePlot(m2, main="Influence Plot", 
              sub="Circle size is proportial to Cook's Distance" )

#4 diagnostic plots to intentify influential points
infIndexPlot(m2)

#residual vs. fitted value plot for Homoscedasticity
plot(m2$resid ~ m2$fitted.values)
#add horizental line from 0
abline(h = 0, lty = 2)

#residual vs. fitted value and all predictors plus test for curvature
residualPlots(m2)

#residual plot vs. school id 
plot(m2$resid ~ d$snum)
abline(h = 0, lty = 2)

#plot(m2$resid[-1] ~ m2$resid[-400])
#Normal Quantile to Quantile plot
qqnorm(m2$resid)
qqline(m2$resid)


car::vif(m2) #variance inflation factor

#model specification added variable plot
avPlots(m2)

#codes for part 3
#------------------------------------------------------------
class(d$api00)   #class of api00
class(d$yr_rnd)  #class of yr_rnd
class(d$mealcat) #class of mealcat

#summary of api00, yr_rnd, and mealcat
summary(d$api00)   
summary(d$yr_rnd)  
summary(d$mealcat) 

table(d$yr_rnd)    #frequency table for yr_rnd
table(d$mealcat)   #frequency table for mealcat

#creat two new variables as factor and add to the data frame
d$yr_rnd_F <- factor(d$yr_rnd)
d$mealcat_F <- factor(d$mealcat)

class(d$yr_rnd_F)
class(d$mealcat_F)

#levels of factor
levels(d$yr_rnd_F)
levels(d$mealcat_F)

#summary of factor
summary(d$yr_rnd_F)
summary(d$mealcat_F)

#regression of api00 with yr_rnd_F
m3 <- lm(api00 ~ yr_rnd_F, data = d)
summary(m3)

#scatter plot api00 against yr_rnd
plot(api00 ~ yr_rnd, data = d)
abline(m3)  # add regression line to the plot

# mean of api00 when yr_rnd_F is at level "0". school is not year round
mean(d$api00[d$yr_rnd_F == "0"])
# mean of api00 when yr_rnd_F is at level "1". school is year round
mean(d$api00[d$yr_rnd_F == "1"])


#using aggregate to find the mean for each group of year school round
aggregate(api00 ~ yr_rnd_F, FUN=mean, data = d)

#t test for equality of mean of api00 for two group of year round and not year round 
#with equal variance assumption
t.test(api00 ~ yr_rnd_F, data = d, var.equal = TRUE)


#anova table
anova(m3)


#square of t value from the t-test is the same as F value from anova
10.7815 ^ 2

#regression model of api00 against categorical variable mealcat_F with 3 levels
m4 <- lm(api00 ~ mealcat_F, data = d)
summary(m4)

#aggregate the mean of api00 for each category in mealcat_F
aggregate(api00 ~ mealcat_F, FUN=mean, data = d)


#relevel factor mealcat_F and make group "3" as the reference group
d$mealcat_F <- relevel(d$mealcat_F, ref = "3")
m5 <- lm(api00 ~ mealcat_F, data = d)
summary(m5)