#install.packages("haven") #install.packages("survey") #install.packages("jtools") #install.packages("remotes") #remotes::install_github("carlganz/svrepmisc") library("haven") library("survey") library("jtools") library("remotes") library("svrepmisc") nhanes2012 <- read_dta("D:/data/Seminars/Applied Survey Data Analysis R/nhanes2012_reduced.dta") # the "id" is actually the PSU!!!!! nhc <- svydesign(id=~sdmvpsu, weights=~wtint2yr,strata=~sdmvstra, nest=TRUE, survey.lonely.psu = "adjust", data=nhanes2012) nhc summary(nhc) # descriptive statistics with continuous variables # calculating means # the documentation for svymean is found under surveysummary svymean(~ridageyr, nhc) # calculating standard deviations svysd(~ridageyr,design = nhc, na = TRUE) svymean(~pad630, nhc, na = TRUE) svymean(~hsq496, nhc, na = TRUE) # listwise deletion of missing cases across all variables # notice that the means shown below do not match those from above svymean(~ridageyr+pad630+hsq496, nhc, na = TRUE) # getting the mean, standard deviation, variance and coeff of var svymean(~pad680, nhc, na = TRUE) svysd(~pad680,design = nhc, na = TRUE) svyvar(~pad680, design = nhc, na = TRUE) cv(svymean(~pad680,design = nhc, na = TRUE)) svymean(~pad680, nhc, na = TRUE, deff = TRUE) # quantiles svyquantile(~hsq496, design = nhc, na = TRUE, c(.25,.5,.75),ci=TRUE) # totals svytotal(~dmdborn4,design = nhc, na = TRUE) # can get the cv and deff for totals cv(svytotal(~dmdborn4,design = nhc, na = TRUE)) svytotal(~dmdborn4,design = nhc, na = TRUE, deff = TRUE) # means and proportions for binary variables svymean(~female, nhc) confint(svymean(~female, nhc)) # per comment on page 70, use svyciprop rather than confint() # there are many options for method # likelihood uses the (Rao-Scott) scaled chi-squared # distribution for the loglikeli-hood from a binomial distribution svyciprop(~I(female==1), nhc, method="likelihood") # li is short for likelihood svyciprop(~I(female==0), nhc, method="li") # fits a logistic regression model and computes # a Wald-type interval on thelog-odds scale, which # is then transformed to the probability scale svyciprop(~I(female==1), nhc, method="logit") # uses a logit transformation of the mean and # then back-transforms to the probablity scale. # This appears to be the method used by SUDAAN and # SPSS COMPLEX SAMPLES. svyciprop(~I(female==1), nhc, method="xlogit") ## per the documentation: reproduces Stata svy: mean svyciprop(~I(female==1), nhc, method="me", df=degf(nhc)) ## per the documentation: reproduces Stata svy: prop svyciprop(~I(female==1), nhc, method="lo", df=degf(nhc)) # extra examples svymean(~female, nhc) svyciprop(~I(female==0), nhc, method="mean") svyciprop(~I(female==1), nhc, method="mean") # descriptive stats for categorical variables # frequencies svytable(~female, design = nhc) # crosstabulations # 2-way svytable(~female+dmdborn4, nhc) # 2-way svytable(~interaction(female, dmdborn4), design = nhc) # 3-way svytable(~interaction(female, dmdborn4, hsq571), design = nhc) # 4-way svytable(~interaction(female, dmdborn4, hsq571, paq665), design = nhc) # chi-square test svychisq(~female+dmdborn4, nhc, statistic="adjWald") # graphing of continuous variables # histogram # default is to show density on y-axis svyhist(~pad630, nhc) # histogram with count on y-axis svyhist(~ridageyr, nhc, probability = FALSE) # boxplot with just one variable and all outliers svyboxplot(~hsq496~1, nhc, all.outliers=TRUE) # boxplot with two variables # the grouping variable must be a factor svyboxplot(~hsq496~factor(female), nhc, all.outliers=TRUE) # barchart barplt<-svyby(~pad675+pad630, ~female, nhc, na = TRUE, svymean) barplot(barplt,beside=TRUE,legend=TRUE) # dotplot dotchart(barplt) # scatterplot with weights as bubble size svyplot(~pad675+pad630, nhc, style="bubble") # examples of other types of graphs smth<-svysmooth(~pad630, design=nhc) plot(smth) dens<-svysmooth(~pad630, design=nhc,bandwidth=30) plot(dens) dens1<-svysmooth(~pad630, design=nhc) plot(dens1) # subpops # the mean of age svymean(~ridageyr, nhc) # the mean of age broken out by the variable female svyby(~ridageyr, ~female, nhc, svymean) # the mean of age broken bout the female and marital status svyby(~ridageyr, ~dmdmartl+female, nhc, svymean) # using three variables to define the subpopulation # need the "na = TRUE" svyby(~pad630, ~dmdmartl+dmdeduc2+female, nhc, na = TRUE, svymean) # using subset.survey.design to see the results for only one group # subsetting to include only males smale <- subset(nhc,female == 0) summary(smale) svymean(~ridageyr,design=smale) # t-test svyttest(pad675~0, nhc, na = TRUE) # paired samples t-test # example from page 124 svyttest(I(pad660-pad675)~0, nhc, na = TRUE) # two-sample t-tests svyttest(pad630~female, nhc) # calculating the difference by hand # the matrix created in the svyby call is needed for the svycontrast function svyttest(ridageyr~female, nhc) summary(svyglm(ridageyr~female, design=nhc)) a <- svyby(~ridageyr, ~female, nhc, na.rm.by = TRUE, svymean, covmat = TRUE) vcov(a) a # calculating the difference between two means svycontrast(a, c( -1, 1)) # 4.589723 - 6.153479 # another example, this time using svypredmeans summary(svyglm(pad630~female, design=nhc)) # can't have the variable female in the model when you want # to get the predicted means for that variable ttest1 <- (svyglm(ridageyr~1, design=nhc)) summary(ttest1) svypredmeans(ttest1, ~female) # getting the confidence interval for the difference tt<-svyttest(pad630~female, nhc) tt confint(tt, level=0.9) # t-test with a subpopulation svyttest(pad630~dmdeduc2, subset(nhc, female == 1 ) ) # multiple linear regression # need to use summary to get standard errors, t and p-values # main effects only summary(svyglm(pad630~female+hsq571, design=nhc, na.action = na.omit)) # including an interaction term summary(svyglm(pad630~female*hsq571, design=nhc, na.action = na.omit)) glm1 <- (svyglm(pad630~female*hsq571, design=nhc, na.action = na.omit)) glm1 confint(glm1) summary(svyglm(pad630~factor(female)*factor(dmdmartl), design=nhc, na.action = na.omit)) ols1 <- (svyglm(pad630~1, design=nhc, na.action = na.omit)) predmarg<-svypredmeans(ols1, ~interaction(female,dmdmartl)) predmarg # non-parametric tests # Wilcoxon signed rank test wil <- svyranktest(hsq496~female, design = nhc, na = TRUE, test = c("wilcoxon")) wil # median test mtest <- svyranktest(hsq496~female, design = nhc, na = TRUE, test=("median")) mtest # Kruskal Wallis test kwtest <- svyranktest(hsq496~female, design = nhc, na = TRUE, test=("KruskalWallis")) kwtest # logistic regression logit1 <- (svyglm(paq665~factor(hsd010)+ridageyr, family=quasibinomial, design=nhc, na.action = na.omit)) summary(logit1) # using a subpopulation subset1 <- subset(nhc, ridageyr > 20) logit2 <- (svyglm(paq665~factor(hsd010)+ridageyr, family=quasibinomial, design=subset1, na.action = na.omit)) summary(logit2) regTermTest(logit2, ~ridageyr) # getting pseudo-R-square values psrsq(logit2, method = c("Cox-Snell")) psrsq(logit2, method = c("Nagelkerke")) # ordered logistic ologit1 <- svyolr(factor(dmdeduc2)~factor(female)+factor(dmdborn4)+pad680, design = nhc, method = c("logistic")) summary(ologit1) # Poisson model summary(svyglm(pad675~female, design=nhc, family=poisson())) ## other types of analyses available in the survey package # weighted PCA pc <- svyprcomp(~pad630+pad675+hsd010, design=nhc,scale=TRUE,scores=TRUE) pc biplot(pc, weight="scaled") # Cronbach's alpha svycralpha(~hsq571+dmdborn4, design=nhc, na.rm = TRUE)