##################################################################################
########## APPM 4570/5570 ; STAT 4000/5000				##########
########## Statistical Methods						##########
########## Copyright Professor Vanja Dukic				##########
########## University of Colorado at Boulder		    		##########	##########								##########
##########								##########
########## WEEK 1 : Intro to R and EDA 					##########
##########								##########
##########								##########
########## Note: the hash sign denotes comments				##########
##########								##########
########## Note: I have also commented out all the output 		##########
########## so that you can verify your answers				##########
##################################################################################


# read in the file
ingredient = read.csv(file="ingredient.csv", header=TRUE, sep=",")

# see the file out on the screen
ingredient

# brand generic
#1   5.6     5.3
#2   5.1     4.1
#3   6.2     7.2
#4   6.0     6.5
#5   5.8     4.8
#6   6.5     4.9
#7   5.8     5.8
#8   5.5     5.0


# Objective: get a sense of the values in two groups visually

# Histograms provide a quick way of visualizing the data values
 hist(ingredient$brand, main="brand",xlab="ingredient")
 hist(ingredient$generic, main="generic",xlab="ingredient")
 

# this is how you save the above graphs into a single side-by-side file

 jpeg(file='hist.jpg', width=350, height=200)
 par(mfrow=c(1,2))
 hist(ingredient$brand, main="brand",xlab="ingredient")
 hist(ingredient$generic, main="generic",xlab="ingredient")
 dev.off()


# A lot easier to compare these histograms if the data are plotted on the same scale!

 hist(ingredient$brand, main="brand",xlab="ingredient", xlim=c(4,8))
 hist(ingredient$generic, main="generic",xlab="ingredient", xlim=c(4,8))


# Even better if histograms have the same bin width! 

 hist(ingredient$brand, main="brand",xlab="ingredient", xlim=c(4,8), breaks = seq(4,8,.25))
 hist(ingredient$generic, main="generic",xlab="ingredient", xlim=c(4,8), breaks = seq(4,8,.25))


# And even better if y axes have the same scale too! 

 hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4))
 hist(ingredient$generic, main="generic",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4))


# Different shapes of the same data

hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(5,8,.5))
hist(ingredient$brand, main="brand",xlab="ingredient",breaks=seq(5,8,1))


# Histograms with relative frequency

hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4))
hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5),ylim=c(0,4))


# Histograms with a kernel smooth

 hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5))
 lines(density(ingredient$brand))
 hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5))
 lines(density(ingredient$generic))


# Best to check all data if you can -- add a rug plot

 hist(ingredient$brand, freq=FALSE, main="brand",xlab="ingredient",breaks=seq(4,8,.5))
 lines(density(ingredient$brand))
 rug(ingredient$brand,ticksize=-0.1,col='red',lwd=3)
 hist(ingredient$generic, freq=FALSE, main="generic",xlab="ingredient",breaks=seq(4,8,.5))
 lines(density(ingredient$generic))
 rug(ingredient$generic,ticksize=-0.1,col='red',lwd=3)


# Measures of centrality

mean(ingredient$brand)
#[1] 5.8125
median(ingredient$brand)
#[1] 5.8

 mean(ingredient$generic)
#[1] 5.45
 median(ingredient$generic)
#[1] 5.15


# Percentiles

quantile(ingredient$brand, c(.1,.25,.5,.75,.9))
#  10%   25%   50%   75%   90% 
# 5.38   5.58  5.80  6.05  6.29 

> quantile(ingredient$generic, c(.1,.25,.5,.75,.9))
#  10%   25%   50%   75%   90% 
# 4.59   4.88  5.15  5.98  6.71 


# Visualization of percentiles
boxplot(ingredient)


# Mode 

 histbrand = hist(ingredient$brand)

 histbrand
# $breaks
# [1] 5.0 5.5 6.0 6.5
# $counts
# [1] 2 4 2
# $density
# [1] 0.5 1.0 0.5
# $mids
# [1] 5.25 5.75 6.25

 histbrand$mids[histbrand$density==max(histbrand$density)]
#[1] 5.75

 histgen = hist(ingredient$generic)
 histgen$mids[histgen$density==max(histgen$density)]
#[1] 4.5


# R commands for variability measures

 range(ingredient$generic)
#[1] 4.1 7.2
 range(ingredient$brand)
#[1] 5.1 6.5

 sd(ingredient$brand)
#[1] 0.4323937
 sd(ingredient$generic)
#[1] 1.004277

 var(ingredient$brand)
#[1] 0.1869643
 var(ingredient$generic)
#[1] 1.008571 

# note the square of the standard deviation equals the variance
 sd(ingredient$brand)^2
#[1] 0.1869643