## ###### CODE STARTS HERE ##### ## #run these if you do not have the packages installed ## install.packages("tidyverse", dependencies = TRUE) ## install.packages("Hmisc", dependencies = TRUE) ## install.packages("lme4", dependencies = TRUE) #load libraries into R session library(tidyverse) library(Hmisc) library(lme4) library(nlme) theme_set(theme_grey(base_size = 12)) #first few rows of Milk head(Milk) #declare data and x and y aesthetics, #but no shapes yet ggplot(data = Milk, aes(x=Time, y=protein)) #geom_point inherits x=Time, y=protein ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() #color=Diet only applies to geom_boxplot ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() + geom_boxplot(aes(color=Diet)) #color varies as Diet varies ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point(aes(color=Diet)) #map color to constant "green" outside of aes ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point(color="green") #Color set to default red-orange color rather than green ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point(aes(color="green")) #Color set to default red-orange color rather than green ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() #Color set to default red-orange color rather than green ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() + geom_line(aes(group=Cow)) #Color set to default red-orange color rather than green ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() + geom_line(aes(group=Cow)) + geom_smooth() #store specification in object p and then produce graph p <- ggplot(data = Milk, aes(x=Time, y=protein)) + geom_point() p #add a loess plot layer p + geom_smooth() #or panel by Diet p + facet_wrap(~Diet) #define data and x for univariate plots pro <- ggplot(Milk, aes(x=protein)) #histogram pro + geom_histogram() #and density plot pro + geom_density() #bar plot of counts of Diet ggplot(Milk, aes(x=Diet)) + geom_bar() #define data, x and y p2 <- ggplot(Milk, aes(x=Time, y=protein)) #scatter plot p2 + geom_point() #loess plot p2 + geom_smooth() #loess plot p2 + geom_point() + geom_smooth() #lines of protein over Time by group=Cow p2 + geom_line(aes(group=Cow)) #lines of protein over Time by group=Cow p2 + geom_line() #lines of protein over Time colored by Cow p2 + geom_line(aes(color=Cow)) #lines of protein over Time by Cow colored by Diet p2 + geom_line(aes(group=Cow, color=Diet)) #lines of protein over Time by Cow patterned by Diet p2 + geom_line(aes(group=Cow, linetype=Diet)) #stat_bin produces a bar graph by default ggplot(Milk, aes(x=protein)) + stat_bin() #points instead of bars ggplot(Milk, aes(x=protein)) + stat_bin(geom="line") #plot mean and se of protein at each Time ggplot(Milk, aes(x=Time, y=protein)) + stat_summary() #median returns 1 value, geom_pointrange expects 2 (high, low) #change geom to point ggplot(Milk, aes(x=Time, y=protein)) + stat_summary(fun.y="median", geom="point") #mean and boostrapped confidence limits ggplot(Milk, aes(x=Time, y=protein)) + stat_summary(fun.data="mean_cl_boot") #we define our own function mean of log #and change geom to line meanlog <- function(y) {mean(log(y))} ggplot(Milk, aes(x=Time, y=protein)) + stat_summary(fun.y="meanlog", geom="line") #we define our own function mean of log #and change geom to line ggplot(Milk, aes(x=Time, y=protein, shape=Diet)) + geom_point() #we define our own function mean of log #and change geom to line ggplot(Milk, aes(x=Time, y=protein, shape=Diet)) + geom_point() + scale_shape_manual(values=c(5, 3, 8)) #we define our own function mean of log #and change geom to line ggplot(Milk, aes(x=Time, y=protein, shape=Diet)) + geom_point() + scale_shape_manual(values=c("B","M","L")) #default is scale_fill_hue, would be the same if omitted dDiet <- ggplot(Milk, aes(x=protein, fill=Diet)) + geom_density(alpha=1/3) # makes all colors transparent dDiet + scale_fill_hue() #a different starting hue [0,360] changes all 3 colors #because of the even hue-spacing dDiet + scale_fill_hue(h.start=150) #a qualitative scale for Diet in scale_fill_brewer dDiet + scale_fill_brewer(type="qual") #a divergent scale in scale_fill_brewer dDiet + scale_fill_brewer(type="div") # this old thing ggplot(Milk, aes(x=Time, y=protein)) + geom_point() # notice restriced ranges of x-axis and y-axis ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + lims(x=c(5,10), y=c(3,4)) + labs(x="Weeks", y="Protein Content") # notice no legend on the right anymore ggplot(Milk, aes(x=Time, y=protein, shape=Diet)) + geom_point() + guides(shape="none") #densities of protein, colored by Diet, faceted by Time ggplot(Milk, aes(x=protein, color=Diet)) + geom_density() + facet_wrap(~Time) #split rows by Diet, no splitting along columns ggplot(Milk, aes(x=Time, y=protein)) + geom_point() + facet_grid(Diet~.) #light blue background pt <- ggplot(Milk, aes(x=Time, y=protein)) + geom_point() pt + theme(panel.background=element_rect(fill="lightblue")) #make x-axis title big and red pt + theme(axis.title.x=element_text(size=20, color="red")) #remove plot background and x-axis title pt + theme(panel.background=element_blank(), axis.title.x=element_blank()) ## #save last displayed plot as pdf ## ggsave("plot.pdf") ## #save plot stored in "p" as png and resize ## ggsave("myplot.png", plot=p, width=7, height=5, units="in") #look at ToothGrowth data str(ToothGrowth) unique(ToothGrowth$dose) #bar plots of supp and dose ggplot(ToothGrowth, aes(x=supp)) + geom_bar() ggplot(ToothGrowth, aes(x=dose)) + geom_bar() #bar plot of counts of dose by supp #data are balanced, so not so interesting ggplot(ToothGrowth, aes(x=dose, fill=supp)) + geom_bar() #density of len ggplot(ToothGrowth, aes(x=len)) + geom_density() #density of len by supp ggplot(ToothGrowth, aes(x=len, color=supp)) + geom_density() #not the best scatterplot tp <- ggplot(ToothGrowth, aes(x=dose, y=len)) tp + geom_point() #mean and cl of len at each dose tp.1 <- tp + stat_summary() tp.1 #add a line plot of means to see dose-len relationship #maybe not linear tp.2 <- tp.1 + stat_summary(fun.y="mean", geom="line") tp.2 #all plots in tp.2 will now be colored by supp tp.2 + aes(color=supp) model_lin <- lm(len ~ dose*supp, data=ToothGrowth) summary(model_lin) # add model fitted values and residuals to dataset ToothGrowth$fit <- predict(model_lin) ToothGrowth$res <- residuals(model_lin) ggplot(ToothGrowth, aes(x=fit, y=res, color=supp)) + geom_point() + geom_smooth() #create dose-squared variable ToothGrowth$dosesq <- ToothGrowth$dose^2 # model with linear and quadratic effects interacted with supp model_quad <- lm(len ~ (dose + dosesq)*supp, data=ToothGrowth) # main effects model AIC(model_lin, model_quad) # interaction is significant, so we output model coefficients summary(model_quad) # get fitted values and residuals, quad model ToothGrowth$fit_q <- predict(model_quad) ToothGrowth$res_q <- residuals(model_quad) # residuals vs fitted again ggplot(ToothGrowth, aes(x=fit_q, y=res_q, color=supp)) + geom_point() + geom_smooth() # means and standard errors of residuals at each fitted value ggplot(ToothGrowth, aes(x=fit_q, y=res_q, color=supp)) + stat_summary() # add standardizes residuals to data set ToothGrowth$res_stand <- rstandard(model_quad) #q-q plot of residuals and diagonal reference line #geom_abline default aesthetics are yintercept=0 and slope=1 ggplot(ToothGrowth, aes(sample=res_stand)) + stat_qq() + geom_abline() #create new data set of values from 0.5 to 2 for each supp newdata <- data.frame(dose=rep(seq(0.5, 2, .01),each=2), supp=factor(c("OJ", "VC"))) #dose squared variable newdata$dosesq <- newdata$dose^2 #snapshot of the data we will use to get predicted values head(newdata) #add predicted values and confidence limits to newdata newdata <- data.frame(newdata, predict(model_quad, newdata, interval="confidence")) #take a look head(newdata) #fit curves by supp p1.0 <- ggplot(newdata, aes(x=dose, y=fit, color=supp)) + geom_line() p1.0 #add confidence bands p1.1 <- p1.0 + geom_ribbon(aes(ymax=upr, ymin=lwr, fill=supp)) p1.1 #make confidence bands transparent #alpha=1/5 means that 5 objects overlaid will be opaque p1.2 <- p1.0 + geom_ribbon(aes(ymax=upr, ymin=lwr, fill=supp), alpha=1/5) p1.2 #overlay observed data, changing dataset and y aesthetic to observed outcome p1.3 <- p1.2 + geom_point(data=ToothGrowth, aes(y=len)) p1.3 #overlay observed data p1.4 <- p1.3 + theme_classic() p1.4 #specify hex codes for orange, #fc8d62, and green #66c2a5 p1.5 <- p1.4 + scale_color_manual(values=c("#fc8d62", "#66c2a5")) + scale_fill_manual(values=c("#fc8d62", "#66c2a5")) p1.5 #note the default text justifications for the x-axis vs guide p1.6 <- p1.5 + labs(x="dosage vitamin C\n(mg/day)", y="tooth length", fill="supplement\ntype", color ="supplement\ntype") + theme(legend.position="bottom") p1.6 ## #FULL CODE FOR THE ABOVE GRAPH ## ggplot(newdata, aes(x=dose, y=fit, color=supp)) + ## geom_line() + ## geom_ribbon(aes(ymax=upr, ymin=lwr, fill=supp), alpha=1/5) + ## geom_point(data=ToothGrowth, aes(x=dose, y=len)) + ## theme_minimal() + ## scale_color_manual(values=c("#fc8d62", "#66c2a5")) + ## scale_fill_manual(values=c("#fc8d62", "#66c2a5")) + ## labs(x="dosage vitamin C\n(mg/day)", y="tooth length", ## fill="supplement\ntype", ## color ="supplement\ntype") + ## theme(legend.position="bottom") rm(list=apropos(what="esoph")) #structure of esoph str(esoph) #head of esoph head(esoph) #"stat=identity" plots the value of the variable mapped to "y" (ncases) as is ggplot(esoph, aes(x=alcgp, y=ncases)) + geom_bar(stat="identity") #gather ncontrols and ncases into a single column, "n" #labels will be in column "status" esoph_long <- gather(esoph, key="status", value="n", c(ncases, ncontrols)) #sort data before viewing (arrange from package dplyr) esoph_long <- arrange(esoph_long, agegp, alcgp, tobgp) #we can see the stacking of ncases and ncontrols head(esoph_long) #put n for cases and controls on the same bar graph ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity") #by default, geom_bar uses position="stack" #looks same as above ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="stack") #position = dodge move bars to the side ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="dodge") #position = fill stacks and standardizes height to 1 ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="fill") #position = identity overlays, obscruing "ncases" ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="identity") #jittering moves points a little away from x=0.5, x=1 and x=2 ggplot(ToothGrowth, aes(x=dose, y=len, fill=supp)) + geom_point(position="jitter") #alcohol group #plot saved to object an an <- ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="fill") an #age group instead of alcohol consumption an + aes(x=agegp) #tobacco consumption instead of agegp an + aes(x=tobgp) #tobgp to split rows, no splitting along columns an + facet_grid(tobgp~.) #3-way interaction visualization an + facet_grid(tobgp~agegp, labeller="label_both") #stacked counts ggplot(esoph_long, aes(x=alcgp, y=n, fill=status)) + geom_bar(stat="identity", position="stack") + facet_grid(tobgp~agegp, labeller="label_both") #change to numeric to model just linear effects of each logit2 <- glm(cbind(ncases, ncontrols) ~ as.numeric(agegp) + as.numeric(alcgp) + as.numeric(tobgp), data=esoph, family=binomial) summary(logit2) #get predicted probabilities and Cook's D esoph$fit <- predict(logit2, type="response") esoph$cooksd <- cooks.distance(logit2) #fitted vs Cooks D, colored by age, sized by alcohol, shaped by tobacco ggplot(esoph, aes(x=fit, y=cooksd, color=agegp, size=alcgp, shape=tobgp)) + geom_point() #starting graph p2.0 <- ggplot(esoph, aes(x=tobgp, y=fit, alpha=alcgp, group=alcgp)) + geom_point(size=3) + facet_grid(~agegp) p2.0 #in theme # remove things with element_blank() # gray0 is black and gray100 is white p2.1 <- p2.0 + theme(panel.background=element_blank(), strip.background=element_blank(), axis.ticks=element_blank(), panel.grid.major.y=element_line(color="gray90")) p2.1 #Set titles with labs #set fonts and sizes with theme p2.2 <- p2.1 + labs(title="age group", x="tobacco use(g/day)", y = "predicted probability of esophogeal cancer", alpha="alcohol\nuse(g/day)") + theme(plot.title=element_text(family="serif", size=14), axis.title=element_text(family="serif", size=14), legend.title=element_text(family="serif", size=14), strip.text=element_text(family="serif", size=10), legend.text=element_text(family="serif", size=10), axis.text.x=element_text(family="serif", size=8, angle=90)) p2.2 #Set titles with labs #set fonts and sizes with theme p2.3 <- p2.2 + scale_x_discrete(labels=c("0-9", "10-19", "20-29", "30+")) + scale_alpha_discrete(labels=c("0-39", "40-79", "80-119", "120+")) + theme(legend.position="bottom") p2.3 ## #FULL CODE FOR ABOVE PLOT ## ggplot(esoph, aes(x=tobgp, y=.pred, alpha=alcgp, group=alcgp)) + ## geom_point(size=3) + ## facet_grid(~agegp) + ## labs(title="age group", ## x="tobacco use(g/day)", ## y = "predicted probability of esophogeal cancer", ## alpha="alcohol\nuse(g/day)") + ## theme(panel.background = element_blank(), ## strip.background=element_blank(), ## panel.grid.major.y = element_line(color="gray90"), ## plot.title=element_text(family="serif", size=14), ## axis.title=element_text(family="serif", size=14), ## legend.title=element_text(family="serif", size=14), ## strip.text=element_text(family="serif", size=10), ## legend.text=element_text(family="serif", size=10), ## axis.text.x=element_text(family="serif", size=8, angle=90), ## legend.position="bottom", ## axis.ticks=element_blank()) + ## scale_x_discrete(labels=c("0-9", "10-19", "20-29", "30+")) + ## scale_alpha_discrete(labels=c("0-39", "40-79", "80-119", "120+")) #quick look at data structure str(sleepstudy) #mean relationship ggplot(sleepstudy, aes(x=Days, y=Reaction)) #mean relationship ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + geom_smooth() #linear model lm1 <- lm(Reaction ~ Days, sleepstudy) summary(lm1) #linear model sleepstudy$res <- residuals(lm1) sleepstudy$fit <- predict(lm1) ggplot(sleepstudy, aes(x=fit, y=res)) + geom_point() + geom_smooth() #scatter of residuals by subject ggplot(sleepstudy, aes(x=Subject, y=res)) + geom_point() #stat_summary() transforms the data to means and standard errors ggplot(sleepstudy, aes(x=Subject, y=res)) + geom_point() + stat_summary() #SET color and size to make dots stand out more ggplot(sleepstudy, aes(x=Subject, y=res)) + geom_point() + stat_summary(color="red", size=1) ## #not run ## ggplot(sleepstudy, aes(x=Days, y=Reaction)) + ## geom_point() + ## geom_smooth() ggplot(sleepstudy, aes(x=Days, y=Reaction, color=Subject)) + geom_point() + geom_smooth(se=F) ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + geom_smooth(span=1.5) + facet_wrap(~Subject) #random intercepts and coefficients for Days mixed <- lmer(Reaction ~ Days + (1+Days|Subject), data=sleepstudy) summary(mixed) #residuals mixed model sleepstudy$res_mix <- residuals(mixed) #residuals by subject, mixed model ggplot(sleepstudy, aes(x=Subject, y=res_mix)) + geom_point() + stat_summary(color="red", size=1) #residuals by subject, both models ggplot(sleepstudy, aes(x=Subject, y=res_mix)) + geom_point() + stat_summary(color="red", size=1) + stat_summary(aes(y=res), color="blue", size=1) #model results scatter plot ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~Subject, nrow=3) #predicted values from mixed model sleepstudy$fit_mix <- predict(mixed) #model results scatter plot ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~Subject, nrow=3) + geom_line(aes(y=fit_mix), size=.75) #mean intercept and slope fixef(mixed) #give them easy names mean_int <- fixef(mixed)[1] mean_slope <- fixef(mixed)[2] #adding mean intercept and slope to each graph ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~Subject, nrow=3) + geom_line(aes(y=fit_mix), size=.75) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) #adding mean intercept and slope to each graph ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~Subject, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) + geom_line(aes(y=fit_mix), size=.75) #mean intercept and slope re <- ranef(mixed)$Subject head(re) #create Subject column out of rownames, make it a factor re$Subject <- factor(rownames(re)) #rename column for use with ggplot colnames(re)[2] <- "rand_slope" #merge our random effects into our diagnostic dataset by Subject sleepstudy <- merge(sleepstudy, re, by="Subject") #check that merge worked str(sleepstudy) #total slope is rand_slope + slope, the fixed effect of Days from above mean_slope <- fixef(mixed)[2] sleepstudy$myslope <- sleepstudy$rand_slope + mean_slope #trim to do digits. sleepstudy$myslope <- format(sleepstudy$myslope, digits=2) #sort by slope and then Days #sleepstudy <- arrange(sleepstudy, slope, Days) #output total slope estimates by subject unique(sleepstudy$myslope) #sleepstudy$slope <- factor(sleepstudy$slope) #sleepstudy$id <- as.numeric(sleepstudy$slope) #facet by myslope ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) + geom_line(aes(y=fit_mix), size=.75) #reset x-axis ticks with scale_x_continuous ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) + geom_line(aes(y=fit_mix), size=.75) + scale_x_continuous(breaks=c(0,3,6,9)) #reset x-axis ticks with scale_x_continuous ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) + geom_line(aes(y=fit_mix), size=.75) + scale_x_continuous(breaks=c(0,3,6,9)) + labs(title="increase in reaction time (ms) per day of deprivation", x="days of deprivation", y="reaction time (ms)") #theme sets size and font family of title, axis title, #facet titles, and tick labels ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="red", size=.75) + geom_line(aes(y=fit_mix), size=.75) + scale_x_continuous(breaks=c(0,3,6,9)) + labs(title="increase in reaction time (ms) per day of deprivation", x="days of deprivation", y="reaction time (ms)") + theme(plot.title=element_text(family="serif", size=10), axis.title=element_text(family="serif", size=10), strip.text.x=element_text(family="serif", size=8), axis.text=element_text(family="serif", size=8)) #use theme_dark for a nighttime theme #place theme font specifications after theme_dark to ensure # that they are obeyed ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point(color="white") + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, color="black", size=.75) + geom_line(aes(y=fit_mix), size=.75, color="white") + scale_x_continuous(breaks=c(0,3,6,9)) + labs(title="increase in reaction time (ms) per day of deprivation", x="days of deprivation", y="reaction time (ms)") + theme_dark(base_family="serif") + theme(plot.title=element_text(size=10), axis.title=element_text(size=10), strip.text.x=element_text(size=8), axis.text=element_text(size=8)) #specify background and grid elements using theme #grays become lighter as number increases ggplot(sleepstudy, aes(x=Days, y=Reaction)) + geom_point() + facet_wrap(~myslope, nrow=3) + geom_abline(intercept=mean_int, slope=mean_slope, size=.75, color="gray60") + geom_line(aes(y=fit_mix), size=.75) + scale_x_continuous(breaks=c(0,3,6,9)) + labs(title="increase in reaction time (ms) per day of deprivation", x="days of deprivation", y="reaction time (ms)") + theme(plot.title=element_text(family="serif", size=12), axis.title=element_text(family="serif", size=12), strip.text.x=element_text(family="serif", size=10), axis.text=element_text(family="serif", size=10), panel.background=element_rect(fill=NA, color="gray60"), strip.background=element_rect(fill="gray90", color="gray60"), panel.grid.major=element_line(color=NA), panel.grid.minor=element_line(color=NA))