################################################################################## ########## APPM 4570/5570 ; STAT 4000/5000 ########## ########## Statistical Methods ########## ########## Copyright Professor Vanja Dukic ########## ########## University of Colorado at Boulder ########## ########## ########## ########## ########## ########## WEEK 1 : Intro to R and EDA ########## ########## ########## ########## ########## ########## Note: the hash sign denotes comments ########## ########## ########## ########## Note: I have also commented out all the output ########## ########## so that you can verify your answers ########## ################################################################################## # read in the file ingredient = read.csv(file="ingredient.csv", header=TRUE, sep=",") # see the file out on the screen ingredient # brand generic #1 5.6 5.3 #2 5.1 4.1 #3 6.2 7.2 #4 6.0 6.5 #5 5.8 4.8 #6 6.5 4.9 #7 5.8 5.8 #8 5.5 5.0 # Objective: get a sense of the values in two groups visually # Histograms provide a quick way of visualizing the data values hist(ingredient$brand, main="brand",xlab="ingredient") hist(ingredient$generic, main="generic",xlab="ingredient") # this is how you save the above graphs into a single side-by-side file jpeg(file='hist.jpg', width=350, height=200) par(mfrow=c(1,2)) hist(ingredient$brand, main="brand",xlab="ingredient") hist(ingredient$generic, main="generic",xlab="ingredient") dev.off() # A lot easier to compare these histograms if the data are plotted on the same scale! hist(ingredient$brand, main="brand",xlab="ingredient", xlim=c(4,8)) hist(ingredient$generic, main="generic",xlab="ingredient", xlim=c(4,8)) # Even better if histograms have the same bin width! hist(ingredient$brand, main="brand",xlab="ingredient", xlim=c(4,8), breaks = seq(4,8,.25)) hist(ingredient$generic, main="generic",xlab="ingredient", xlim=c(4,8), breaks = seq(4,8,.25)) # And even better if y axes have the same scale too! hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4)) hist(ingredient$generic, main="generic",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4)) # Different shapes of the same data hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(5,8,.5)) hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(5,8,1)) # Histograms with relative frequency hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4)) hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4)) # Histograms with a kernel smooth hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5)) lines(density(ingredient$brand)) hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5)) lines(density(ingredient$generic)) # Best to check all data if you can -- add a rug plot hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5)) lines(density(ingredient$brand)) rug(ingredient$brand,ticksize=-0.1,col='red',lwd=3) hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5)) lines(density(ingredient$generic)) rug(ingredient$generic,ticksize=-0.1,col='red',lwd=3) # Measures of centrality mean(ingredient$brand) #[1] 5.8125 median(ingredient$brand) #[1] 5.8 mean(ingredient$generic) #[1] 5.45 median(ingredient$generic) #[1] 5.15 # Percentiles quantile(ingredient$brand, c(.1,.25,.5,.75,.9)) # 10% 25% 50% 75% 90% # 5.38 5.58 5.80 6.05 6.29 > quantile(ingredient$generic, c(.1,.25,.5,.75,.9)) # 10% 25% 50% 75% 90% # 4.59 4.88 5.15 5.98 6.71 # Visualization of percentiles boxplot(ingredient) # Mode histbrand = hist(ingredient$brand) histbrand # $breaks # [1] 5.0 5.5 6.0 6.5 # $counts # [1] 2 4 2 # $density # [1] 0.5 1.0 0.5 # $mids # [1] 5.25 5.75 6.25 histbrand$mids[histbrand$density==max(histbrand$density)] #[1] 5.75 histgen = hist(ingredient$generic) histgen$mids[histgen$density==max(histgen$density)] #[1] 4.5 # R commands for variability measures range(ingredient$generic) #[1] 4.1 7.2 range(ingredient$brand) #[1] 5.1 6.5 sd(ingredient$brand) #[1] 0.4323937 sd(ingredient$generic) #[1] 1.004277 var(ingredient$brand) #[1] 0.1869643 var(ingredient$generic) #[1] 1.008571 # note the square of the standard deviation equals the variance sd(ingredient$brand)^2 #[1] 0.1869643