options nocenter nodate nonumber formdlim="-"; /* Chapter 1: Simple and Multiple Regression */ /* Introduction to the elemapi2 data file */ proc contents data="c:\sasreg\elemapi" ; run; * show means of variables ; proc means data="c:\sasreg\elemapi2" ; var api00 ell meals yr_rnd mobility acs_k3 acs_46 full emer enroll ; run; * show freq tables for selected variables; proc freq data="c:\sasreg\elemapi2" ; tables yr_rnd acs_k3 acs_46 ; run; * show histogram for enroll ; proc univariate data="c:\sasreg\elemapi2"; var enroll ; histogram / cfill=gray normal midpoints=100 to 1500 by 100 kernel; run; /* a first multiple regression */ proc reg data="c:\sasreg\elemapi2" ; model api00 = ell meals yr_rnd mobility acs_k3 acs_46 full emer enroll ; run; * also show standardized estimates ; model api00 = ell meals yr_rnd mobility acs_k3 acs_46 full emer enroll / stb ; run; * show test of "ell"; test2: test ell; run; * show test of class size; test_class_size: test acs_k3, acs_46; run; quit; /* Chapter 2: Regression Diagnostics */ /* 2.1 Unusual and Influential data */ * Look at crime data file ; proc contents data="c:\sasreg\crime"; run; * Look at means of variables ; proc means data="c:\sasreg\crime"; var crime murder pctmetro pctwhite pcths poverty single; run; * show scatterplot matrix ; proc insight data="c:\sasreg\crime"; scatter crime pctmetro poverty single* crime pctmetro poverty single; run; quit; * show crime by pctmetro ; goptions reset=all; axis1 label=(r=0 a=90); symbol1 pointlabel = ("#state") font=simplex value=none; proc gplot data="c:\sasreg\crime"; plot crime*pctmetro=1 / vaxis=axis1; run; quit; * show crime by poverty ; proc gplot data="c:\sasreg\crime"; plot crime*poverty=1 / vaxis=axis1; run; quit; * show crime by single ; proc gplot data="c:\sasreg\crime"; plot crime*single=1 / vaxis=axis1; run; quit; * show regression and save rstudent, lev, cookd, dffits ; proc reg data="c:\sasreg\crime"; model crime=pctmetro poverty single; output out=crime1res rstudent=r h=lev cookd=cd dffits=dffit; run; quit; * Examine residuals for normality ; proc univariate data=crime1res plots plotsize=30; var r; run; * Show 10 smallest residuals; proc sort data=crime1res; by r; run; proc print data=crime1res(obs=10); var sid state r; run; * Show 10 largest residuals; proc print data=crime1res(firstobs=42 obs=51); var sid state r; run; * Show 10 largest leverage points; proc sort data=crime1res; by lev; run; proc print data=crime1res (firstobs=42 obs=51); var lev state; run; * Show residual squared by leverage ; proc sql; create table crimeres5 as select *, r**2/sum(r) as rsquared from crime1res; quit; goptions reset=all; axis1 label=(r=0 a=90); symbol1 pointlabel = ("#state") font=simplex value=none; proc gplot data=crimeres5; plot lev*rsquared / vaxis=axis1; run; quit; * Show cooks d that exceeds criteria of (k/n) ; proc print data=crime1res; where cd > (4/51); var crime pctmetro poverty single state cd; run; * Show DfBetas; proc reg data="c:\sasreg\crime"; model crime=pctmetro poverty single / influence; ods output OutputStatistics=crimedfbetas; id state; run; quit; proc print data=crimedfbetas (obs=5); var state DFB_pctmetro DFB_poverty DFB_single; run; * DC seems to be a problem. Let's show regression with and without DC; * With DC; proc reg data="c:\sasreg\crime"; model crime=pctmetro poverty single; run; quit; * Without DC; proc reg data="c:\sasreg\crime"; model crime=pctmetro poverty single; where state ne "dc"; run; quit; /* 2.2 Tests for normality of residuals */ /* Returning back to "elemapi2" data */ * Run regression and save residual and predicted ; proc reg data="c:\sasreg\elemapi2"; model api00=meals ell emer; output out=elem1res residual=r predicted=fv; run; quit; * Show kernal density plot of residuals ; proc kde data=elem1res out=den; var r; run; proc sort data=den; by r; run; goptions reset=all; symbol1 c=blue i=join v=none height=1; proc gplot data=den; plot density*r=1; run; quit; /* 2.3 Tests for Heteroscedasticity */ * Show residual by predicted ; goptions reset=all; proc reg data='c:\sasreg\elemapi2'; model api00 = meals ell emer; plot r.*p.; run; * Run white test for test of homoscedasticity ; model api00 = meals ell emer / spec; run; /* 2.4 Tests for collinearity */ * example 1, vifs look fine ; proc reg data='c:\sasreg\elemapi2'; model api00 = acs_k3 grad_sch col_grad some_col / vif tol; run; * example 2, more of a problem ; proc reg data='c:\sasreg\elemapi2'; model api00 = acs_k3 avg_ed grad_sch col_grad some_col / vif tol; run; * example 3, omit "avg_ed" and vifs improve ; proc reg data='c:\sasreg\elemapi2'; model api00 =acs_k3 grad_sch col_grad some_col / vif tol; run; quit; * 2.5 Nonlinearity ; * 2.6 Model specification ; * 2.7 Independence ; /* Chapter 3: Categorical Variables */ * 3.1 Regression with a 0/1 Variable ; goptions reset=all; proc reg data="c:\sasreg\elemapi2"; model api00 = yr_rnd; run; * Showing the plot we can see the meaning of the ; * intercept and slope for "yr_rnd" ; plot api00*yr_rnd; run; * 3.3 Regression with a 1/2/3 Variable ; * make dummy codes for "mealcat" ; data temp_elemapi; set "c:\sasreg\elemapi2"; mealcat1=0; mealcat2=0; mealcat3=0; if mealcat = 1 then mealcat1=1; if mealcat = 2 then mealcat2=1; if mealcat = 3 then mealcat3=1; run; * verify dummy codes ; proc freq data=temp_elemapi; tables mealcat*mealcat1*mealcat2*mealcat3 /list; run; * run analysis with dummy variables ; proc reg data=temp_elemapi; model api00 = mealcat2 mealcat3; run; * test overall effect of mealcat; test mealcat2=mealcat3=0; run; * relate means to parameter estimates ; proc means data=temp_elemapi mean std; class mealcat; var api00; run; * using proc glm with dummy variables ; proc glm data="c:\sasreg\elemapi2" order=freq ; class mealcat; model api00=mealcat / solution ss3; run; quit; * 3.5 Categorical predictors with interactions ; * 3.5.1 Using PROC REG ; * yr_rnd is already a 0/1 variable ; * make mealdum1-3 and mealxynd1-3 ; data mealxynd_elemapi; set "c:\sasreg\elemapi2"; array mealdum(3) mealcat1-mealcat3; array mealxynd(3) mealxynd1-mealxynd3; do i = 1 to 3; mealdum(i)=(mealcat=i); mealxynd(i)=mealdum(i)*yr_rnd; end; drop i; run; * verify coding ; proc freq data=mealxynd_elemapi; tables yr_rnd*mealcat*mealxynd1*mealxynd2*mealxynd3 / nopercent nocum list; run; * run regression ; proc reg data=mealxynd_elemapi; model api00=yr_rnd mealcat1 mealcat2 mealxynd1 mealxynd2; run; * test interaction ; test mealxynd1=mealxynd2=0; run; /* Predicted values in terms of coefficients mealcat=1 mealcat=2 mealcat=3 ------------------------------------------------- yr_rnd=0 _cons _cons _cons +Bmealcat1 +Bmealcat2 ------------------------------------------------- yr_rnd=1 _cons _cons _cons +Byr_rnd +Byr_rnd +Byr_rnd +Bmealcat1 +Bmealcat2 +Bmealxynd1 +Bmealxynd2 */ * test effect of "yr_rnd" when "mealcat" is 1 ; test yr_rnd + mealxynd1=0; run; * 3.5.2 Using PROC GLM ; proc glm data="c:\sasreg\elemapi2"; class yr_rnd mealcat; model api00=yr_rnd mealcat yr_rnd*mealcat /ss3; run; * get simple main effect with the "slice" option ; lsmeans yr_rnd*mealcat / slice=mealcat; run; /* 4. Beyond Ordinary Least Squares */ * 4.1 Robust methods ; * Show regression ; proc reg data = "c:\sasreg\elemapi2"; model api00 = acs_k3 acs_46 full enroll ; run; * show residuals vs. predicted ; plot r.*p.; run; * show cooks d ; plot cookd.*obs.; run; * Regression with robust standard errors ; * run regression with "acov" option ; proc reg data = "c:\sasreg\elemapi2"; model api00 = acs_k3 acs_46 full enroll /acov; ods output ACovEst = estcov; ods output ParameterEstimates=pest; run; quit; * show parameter estimates ; proc print data=pest; run; * show robust covariance estimates ; proc print data=estcov; run; * get robust standard errors from covariance ; data temp_dm; set estcov; drop model dependent; array a(5) intercept acs_k3 acs_46 full enroll; array b(5) std1-std5; b(_n_) = sqrt((395/390)*a(_n_)); std = max(of std1-std5); keep variable std; run; * Show robust standard errors ; proc print data=temp_dm; run; * combine estimates and standard errors to make table ; proc sql; select pest.variable, estimate, stderr, tvalue, probt, std as robust_stderr, estimate/robust_stderr as tvalue_rb, (1 - probt(abs(estimate/robust_stderr), 394))*2 as probt_rb from pest, temp_dm where pest.variable=temp_dm.variable; quit;