# Data validation ----
# Walks through several ways of finding missing (NA) or invalid values in the
# nonsales dataset: is.na( ), complete.cases( ), summary( ), a per-column
# loop, table( ), and finally Hmisc::contents( ).

nonsales <- read.csv(
  "S:/Courses/stat-renaes/Stat404/nonsales.csv",
  header = TRUE
)
# can use attach( ) -- I will be using with( ) instead for these examples

# Finding NAs with is.na( ) and which( ) ----
# is.na( ) on the whole data frame gives a large logical matrix with TRUE or
# FALSE for every variable value
is.na(nonsales)

# which( ) with is.na( ) doesn't give clear info on the whole dataset, so
# check one variable at a time; integer(0) means there are no NAs
with(nonsales, which(is.na(Employee_ID)))
with(nonsales, which(is.na(Job_Title)))
with(nonsales, which(is.na(Gender)))
with(nonsales, which(is.na(Salary)))
with(nonsales, which(is.na(Hire_Date)))
with(nonsales, which(is.na(Birth_Date)))

# Use of complete.cases( ) ----
# TRUE for every row that has no NA in any variable
complete.cases(nonsales)
# or use with which( ) to get the row numbers of the complete cases
which(complete.cases(nonsales))
# shows all complete case data
nonsales[complete.cases(nonsales), ]

# or flag the rows that are NOT complete
!complete.cases(nonsales)
# shows which cases are incomplete
which(!complete.cases(nonsales))
# shows all incomplete case data
nonsales[!complete.cases(nonsales), ]

# Use of summary( ) ----
# use summary( ) to see if values are missing or outside feasible ranges
# summary( ) does not need with( ) if I am summarizing the entire dataset
summary(nonsales)

# Per-variable count of missing values ----
# this is the for-loop for missing data (but not invalid data points)
# it can be used with any dataset (just change the name nonsales)
for (Var in names(nonsales)) {
  # [[ ]] extracts the column as a vector; the count is named n_missing so it
  # does not shadow base::missing( )
  n_missing <- sum(is.na(nonsales[[Var]]))
  if (n_missing > 0) {
    # prints the variable name together with its NA count
    print(c(Var, n_missing))
  }
}

# Use of table( ) ----
# use table( ) to find invalid or missing values of categorical variables
with(nonsales, table(Gender, useNA = "ifany"))
with(nonsales, table(Job_Title, useNA = "ifany"))
# do more at one time
with(nonsales, table(Gender, Job_Title, useNA = "ifany"))

# Hmisc::contents( ) ----
# the following is ALL you need for validation
# all the above commands are great but there is an equivalent to PROC CONTENTS
# that does ALL validation for categorical variables

# install package Hmisc only when it is not already installed, so the script
# does not re-download it on every run
if (!requireNamespace("Hmisc", quietly = TRUE)) {
  install.packages("Hmisc")
}
library(Hmisc)

# use the contents( ) command to get the descriptor portion of the dataset
contents(nonsales)

# then the only other command you need is to look at invalid ranges of
# numerical variables
with(nonsales, summary(Salary))
with(nonsales, summary(Employee_ID))