/* ALL CODE FOR INTRODUCTION TO SAS 9.3 SEMINAR */

*********************************************************
*                    Entering Data                      *
*********************************************************;

*2.1 Import wizard and proc import;
* Read worksheet hs0 of the Excel workbook into the temporary dataset hs0;
proc import datafile="c:\sas_data\hs0.xlsx" dbms = excel replace out=hs0;
  range = "hs0$";
  getnames = yes;
run;

*2.2 Data Steps;
* Infile a comma-separated-values (.csv) file;
data temp;
  infile 'c:\sas_data\hs0.csv' delimiter=',' dsd;
  * declare prgtype as character of length 10 before it is read;
  length prgtype $10;
  input gender id race ses schtyp prgtype $ read write math science socst ;
run;
proc print data = temp (obs=10);
run;

* Enter data directly into SAS using input;
* In-stream records must start on the line after DATALINES, one record per line,
  and the block is closed by a semicolon on a line of its own;
data hsb10;
  input id female race ses schtype $ prog read write math science socst;
datalines;
147 1 1 3 pub 1 47 62 53 53 61
108 0 1 2 pub 2 34 33 41 36 36
18 0 3 2 pub 3 50 33 49 44 36
153 0 1 2 pub 3 39 31 40 39 51
50 0 2 2 pub 2 50 59 42 53 61
51 1 2 1 pub 2 42 36 42 31 39
102 0 1 1 pub 1 52 41 51 53 56
57 1 1 2 pub 1 71 65 72 66 56
160 1 1 2 pub 1 55 65 55 50 61
136 0 1 2 pub 1 65 59 70 63 51
;
run;
proc print data=hsb10;
run;

*2.3 Saving SAS data files;
* Save temporary dataset "temp" as a permanent file;
data 'c:\sas_data\hs0';
  set temp;
run;
proc print data='c:\sas_data\hs0';
run;

* Same save, this time through a library reference;
libname IN 'c:\sas_data';
data in.hs0;
  set temp;
run;

*********************************************************
*                    Exploring Data                     *
*********************************************************;

* Set output to be left justified rather than centered;
options nocenter;

* Examine data using proc contents and proc print;
proc contents position data=in.hs0;
run;
proc print data=in.hs0 (obs=20);
run;

* If we only want to print some variables, we can use the var statement;
proc print data=in.hs0 (obs=20);
  var gender id race ses schtyp prgtype read;
run;

* Create a temporary dataset called hs0 ;
data hs0;
  set in.hs0;
run;

* Descriptive statistics with proc means and proc univariate;
proc means data=hs0;
run;
proc univariate data=hs0;
  var read write;
run;

* means for a subset of variables using var;
proc means data=hs0 n mean median std var;
  var read math science write;
run;

* means for a subset of observations using where;
proc means data=hs0 n mean median std var;
  where read>=60;
  var read math science write;
run;

* means broken down by group (ses) using class;
proc means data=hs0 n mean median std var;
  class ses;
  var read math science write;
run;

* histogram with normal curve overlay from proc univariate;
proc univariate data=hs0 noprint;
  var write;
  histogram / normal;
run;

* Frequency distribution table for one variable;
proc freq data=hs0;
  table ses;
run;

* Frequency distribution tables for several variables;
proc freq data=hs0;
  table gender schtyp prgtype;
run;

* a crosstab using proc freq plus a frequency plot;
proc freq data=hs0;
  table prgtype*ses / plots=freqplot;
run;

* correlations using proc corr with pairwise deletion of missing observations (default) ;
proc corr data=hs0;
  var write read science;
run;

* correlations using proc corr with listwise deletion of missing observations (nomiss option) ;
proc corr data=hs0 nomiss;
  var write read science;
run;

* Scatter plot matrix;
proc corr data=hs0 nomiss plots=matrix;
  var write read science;
run;

* a scatter plot ;
proc sgplot data = hs0;
  scatter x = read y = write;
run;

* scatter plot with id number as marker;
proc sgplot data=hs0;
  scatter x=write y=read / markerchar=id;
run;

* scatter plot with gender of observation indicated;
proc sgplot data=hs0;
  scatter x=write y=read / group=gender;
run;

* vertical bar chart representing mean of variable write by ses with error bars;
proc sgplot data=hs0;
  vbar ses /response = write stat=mean limits=both;
run;

* histogram of variable read with normal curve and kernel density plot overlaid;
proc sgplot data=hs0;
  histogram read;
  density read / type=normal;
  density read /type = kernel;
run;

*********************************************************
*                    Modifying Data                     *
*********************************************************;

*2.1 Proc Format, labels and renaming variables ;
* Examine the dataset;
proc contents data = in.hs0;
run;

* print observations in dataset hs0;
proc print data = in.hs0;
run;

* Create value labels for the variable schtyp;
proc format;
  value scl 1 = "public"
            2 = "private";
run;

* Frequency table using the labels with a format statement;
proc freq data = hs0;
  tables schtyp;
  format schtyp scl.;
run;
proc contents data=hs0;
run;

* permanently apply a value label to a variable in a data step;
data hs0b;
  set in.hs0;
  format schtyp scl.;
run;
proc contents data=hs0b;
run;

* label the dataset and schtyp;
data hs0b(label="High School and Beyond");
  set hs0b;
  label schtyp = "type of school";
run;

* Rename schtyp to public and gender to female in a temporary dataset hs0b;
* The rename happens as hs0b is read, so inside this step the variable is
  called public and the label statement has to use that name;
data hs0b;
  set hs0b (rename=(schtyp=public gender=female));
  label public = "type of school";
run;
proc contents data=hs0b;
run;

*2.2 Putting things together in a long data step;
* Proc format followed by a dataset that performs a variety of tasks;
proc format;
  * create value labels for schtyp ;
  value scl 1 = "public"
            2 = "private";
  * create value labels for grade ;
  value abcdf 0 = "F"
              1 = "D"
              2 = "C"
              3 = "B"
              4 = "A";
  * create value labels for female ;
  value fm 1 = "female"
           0 = "male";
run;

* create data file hs1, label it and rename the variable gender to female ;
data hs1(label="High School and Beyond") ;
  set in.hs0 (rename=(gender=female));

  * label the variable schtyp ;
  label schtyp = "type of school";

  * apply value labels to schtyp;
  format schtyp scl.;

  * apply value labels to female;
  format female fm.;

  * the if-then statements create a new variable, called prog, which is numeric variable ;
  if prgtype = "academic" then prog = 1;
  if prgtype = "general" then prog = 2;
  if prgtype = "vocational" then prog = 3;

  * the if statement recodes values of 5 in the variable race to be missing (.) ;
  if race = 5 then race = .;

  * create a variable called total that is the sum of read, write, math, and science ;
  total = read + write + math + science;

  * the if-then statements recode the variable total into the variable grade ;
  * the last test resets grade to missing when total is missing, because a
    missing total also satisfies total < 80 ;
  if (total < 80) then grade = 0;
  if (80 <= total < 110) then grade = 1;
  if (110 <= total < 140) then grade = 2;
  if (140 <= total < 170) then grade = 3;
  if (total >= 170) then grade = 4;
  if (total = .) then grade = .;

  * apply value labels to variable grade;
  format grade abcdf.;
run;

proc contents data = hs1;
run;
proc print data = hs1 (obs = 20);
run;
proc freq data = hs1;
  tables schtyp*female;
run;

* Save temporary dataset as a permanent dataset;
data in.hs1;
  set hs1;
run;

*2.3 Recoding a continuous variable using formats;
* Recoding a continuous variable using formats;
proc format;
  * create format for test score;
  value score 25 - 60 = "low score"
              61 -80 = "high score";
run;

data hs1_read;
  set hs1;
  * apply value labels to variable read;
  format read score.;
run;

* variable read can be used in its original format;
proc means data=hs1_read;
  var read;
run;

* variable read can also be used in class statement as categorical;
proc means data=hs1_read;
  class read;
  var total;
run;

*2.4 Creating new variables using procedures;
* standardize read and write using proc standard;
proc standard data = hs1 mean=0 std=1 out=hs1b;
  var read write ;
run;

* look at the data;
proc print data=hs1b (obs=10);
run;

*2.5 Using SAS functions;
* create variables using SAS function;
* note that this step replaces the hs1b dataset written by proc standard above;
data hs1b;
  set hs1;
  total2 = sum(of read write math science);
  * similarly, mean, max, min and more;
  x3 = ordinal(3, read, write, math, science);
  mean= mean(of read write math science);
run;

title "";
proc print data=hs1b (obs=20);
  var read write math science total total2 x3 mean;
run;

*********************************************************
*                    Managing Data                      *
*********************************************************;

*2.1 Creating a library;
libname in "c:\sas_data\";

* print through the library reference;
proc print data=in.hs1 (obs=10);
  var write read science;
run;

* print the same file through its quoted physical path;
proc print data="c:\sas_data\hs1" (obs=10);
  var write read science;
run;

*2.2 Selecting cases using if or where statements;
data highread lowread;
  set in.hs1;
  if read >=60 then output highread;
  if read < 60 then output lowread;
run;

title "high reading scores";
proc means data=highread n mean;
  var read;
run;

title "low reading scores";
proc means data=lowread n mean;
  var read;
run;
title; /* this statement clears the title we set earlier */

* using where statement;
data highread;
  set in.hs1;
  where read >=60;
run;
proc print data = highread;
run;

*2.3 Keeping variables;
* keeping a subset of variables;
data hskept;
  set highread;
  keep id female read write;
run;

*2.4 Dropping variables;
* dropping a subset of variables;
data hsdropped;
  set highread;
  drop ses prog;
run;

*2.5 Appending datasets;
* Look at frequency of variable "female" in each file ;
proc freq data=in.hsmale;
  tables female;
run;
proc freq data=in.hsfemale;
  tables female;
run;

* Use DATA step to combine the two files and save them as hsmaster ;
data in.hsmaster;
  set in.hsmale in.hsfemale;
run;

* Now you should have a file with both males and females;
proc ttest data=in.hsmaster;
  by female;
  var write;
run;

*2.6 Merging datasets;
* examine the two datasets;
proc print data=in.hsdem ;
run;
proc print data=in.hstest ;
run;

* sort both files by the variable that identifies the cases in each file (id);
proc sort data=in.hsdem out=dem;
  by id;
run;
proc sort data=in.hstest out=test;
  by id;
run;

* merge the datasets;
data all;
  merge dem test;
  by id;
run;
proc print data=all;
run;

*********************************************************
*                    Analyzing Data                     *
*********************************************************;

*2.1 Chi-squared test;
proc freq data=in.hs1;
  table prgtype*ses / chisq expected;
run;

*2.2 T-tests;
* one sample t-test;
proc ttest data=in.hs1 H0=50;
  var write;
run;

* paired t-test;
proc ttest data=in.hs1;
  paired write*read;
run;

* two sample independent t-test;
* CLASS names the grouping variable and VAR names the outcome being compared;
PROC TTEST DATA=in.hs1;
    CLASS female;
    VAR write;
RUN;

*2.3 ANOVA;
* One-way ANOVA of write on prog, requesting only type III sums of squares;
PROC GLM DATA=in.hs1;
    CLASS prog;
    MODEL write = prog / SS3;
RUN;
QUIT;

* ANCOVA - same model with read added as a covariate;
PROC GLM DATA=in.hs1;
    CLASS prog;
    MODEL write = read prog / SS3;
RUN;
QUIT;

* 2.4 Regression ;
* Regress write on female and read;
PROC REG DATA=in.hs1;
    MODEL write = female read;
RUN;
QUIT;

* Regress math on write and socst with diagnostic plots requested.
  The OUTPUT statement writes the temporary dataset temp, adding the
  predicted values (predict) and the residuals (resid) ;
PROC REG DATA=in.hs1 PLOTS=diagnostics;
    MODEL math = write socst;
    OUTPUT OUT=temp P=predict R=resid;
RUN;
QUIT;

* Inspect the first 20 rows of temp;
PROC PRINT DATA=temp (OBS=20);
    VAR math predict resid;
RUN;

*2.5 Logistic regression;
* Build the 0/1 indicator honcomp, which is 1 when write is at least 60;
DATA hs2;
    SET in.hs1;
    honcomp = (write >= 60);
RUN;

* Logistic regression - DESCENDING makes the model predict 1s rather than 0s;
PROC LOGISTIC DATA=hs2 DESCENDING;
    MODEL honcomp = female read;
RUN;

*2.6 Nonparametric tests;
* Sign test (nonparametric analog of the single sample t-test), location tested against 50;
PROC UNIVARIATE DATA=in.hs1 MU0=50;
    VAR write;
RUN;

* Signed rank test (nonparametric analog of the paired t-test);
* Step 1 - compute the paired difference diff;
DATA hs1c;
    SET in.hs1;
    diff = read - write;
RUN;

* Step 2 - test that diff=0;
PROC UNIVARIATE DATA=hs1c;
    VAR diff;
RUN;

* Rank sum test (nonparametric analog of the independent two-sample t-test);
PROC NPAR1WAY DATA=in.hs1;
    CLASS female;
    VAR write;
RUN;

* Kruskal-Wallis test (nonparametric analog of the one-way ANOVA);
PROC NPAR1WAY DATA=in.hs1;
    CLASS ses;
    VAR write;
RUN;