# 09 Graphics for Statistical Analysis: R code
# Jerzy Wieczorek
# 9/29/15
# 36-721 Statistical Graphics and Visualization

# Set working directory
setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 9/")

# Load R packages
library(RColorBrewer)
library(ggplot2)
library(MASS) # for eqscplot
library(aplpack) # for bagplot
library(Hmisc) # for errbar
library(lattice) # for ethanol dataset


#### PLOTTING MATHEMATICAL FUNCTIONS ####

# curve()
curve(x^2, from = -3, to = 3)

curve(sin(x), -pi, pi)

# Use curve() to plot continuous PDFs and CDFs
curve(dnorm(x), -3, 3, main = "Standard Normal PDF", ylab = "Density")

curve(pnorm(x), -3, 3, main = "Standard Normal CDF", ylab = "Cumulative probability")

curve(dgamma(x, shape = 3), 0, 10, main = "Gamma(shape = 3, scale = 1) PDF", ylab = "Density")

curve(pgamma(x, shape = 3), 0, 10, main = "Gamma(shape = 3, scale = 1) CDF", ylab = "Cumulative probability")
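
# curve() works for continuous densities; for a discrete PMF,
# one option (a sketch, not in the original script) is to plot
# the probabilities at the integers with type = 'h':
xvals = 0:10
plot(xvals, dbinom(xvals, size = 10, prob = 0.3), type = 'h',
     main = "Binomial(10, 0.3) PMF", xlab = "x", ylab = "Probability")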

# Contours of bivariate function:
# let's plot bivariate standard Normal PDF,
# which is just the product of two independent Normal densities
x = seq(-2, 2, 0.1)
y = x
dnorm2 = function(x, y) {dnorm(x) * dnorm(y)}
z = outer(x, y, FUN = dnorm2)
contour(x, y, z)
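
# filled.contour() (or image()) shows the same surface
# with a color scale instead of labeled level curves; a quick sketch:
filled.contour(x, y, z, color.palette = terrain.colors,
               main = "Bivariate standard Normal PDF")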

#### DISTRIBUTIONS ####

# Load the tips dataset from
# http://www.ggobi.org/book/
tips = read.csv("tips.csv")
# Reorder levels of day (alphabetical default is unhelpful here)
tips$day = factor(tips$day, levels = c("Thu", "Fri", "Sat", "Sun"))
# Attach dataset
attach(tips)

# Histograms with different bins
hist(tip, main = "Tip histogram: default bins", xlab = "Tip ($)")

hist(tip, seq(0.5, 10.5, by = 1), xlim = c(0, 11), 
     main = "Tip histogram: binned to nearest $1", xlab = "Tip ($)")

hist(tip, seq(0.95, 10.05, by = .1), xlim = c(0, 11), 
     main = "Tip histogram: binned to nearest $0.10", xlab = "Tip ($)")

# KDEs with different bandwidths and kernels
plot(density(tip), main = "Tip KDE: default bandwidth,\nGaussian kernel", xlab = "Tip ($)")

plot(density(tip, bw = 1), main = "Tip KDE: $1 bandwidth,\nGaussian kernel", xlab = "Tip ($)")

plot(density(tip, bw = 0.1), main = "Tip KDE: $0.10 bandwidth,\nGaussian kernel", xlab = "Tip ($)")

plot(density(tip, bw = 0.1, kernel = 'rectangular'),
     main = "Tip KDE: $0.10 bandwidth,\nrectangular kernel", xlab = "Tip ($)")

plot(density(tip, bw = 0.1, kernel = 'epanechnikov'),
     main = "Tip KDE: $0.10 bandwidth,\nEpanechnikov kernel", xlab = "Tip ($)")

# ECDF of all tips
plot(ecdf(tip), xlab = "Tip ($)", ylab = "Cumulative probability", main = "Tip empirical CDF")

# ECDF by day of week
ecdfsByDay = by(tip, day, ecdf)
plot(ecdfsByDay[[1]], verticals = TRUE, pch = 1, col = brewer.pal(4, 'Dark2')[1],
     xlab = "Tip ($)", ylab = "Cumulative probability", xlim = c(0, 11),
     main = "Tip empirical CDFs\nby day of week")
for(i in 2:4) {
  plot(ecdfsByDay[[i]], verticals = TRUE, pch = 1,
       add = TRUE, col = brewer.pal(4, 'Dark2')[i])
}
legend('bottomright', legend = levels(day), pch = 1, lty = 1,
       col = brewer.pal(4, 'Dark2'))

# Sunday's ECDF tends to be lowest at most tip sizes
# (proportionally fewer small tips), except at the highest tips
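
# The same grouped comparison is quick in ggplot
# (a sketch using stat_ecdf, which draws one ECDF per group):
ggplot(tips, aes(tip, colour = day)) + stat_ecdf() +
  labs(x = "Tip ($)", y = "Cumulative probability")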

# Q-Q plot: compare two datasets
qqplot(tip[day == "Sat"], tip[day == "Sun"],
       xlab = "Saturday tips ($)", ylab = "Sunday tips ($)")
abline(a = 0, b = 1, lty = 2)

# Sunday has more low tips, Saturday has more high tips

# Q-Q plot: compare to Normal quantiles
qqnorm(tip, ylab = "Tip ($)", las = 1)
qqline(tip)

# Tips do not look Normal, but right-skewed:
# both the lowest and highest values of tip are too high,
# compared to the bulk of the sample.

# Boxplot of all tips
boxplot(tip, xlab = "Tip ($)", horizontal = TRUE)

# Boxplot by day of week
boxplot(tip ~ day, xlab = "Day", ylab = "Tip ($)", las = 1)
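
# Adding notches (a sketch, not in the original script) gives rough
# comparison intervals for the medians:
# non-overlapping notches suggest the medians differ
boxplot(tip ~ day, notch = TRUE, xlab = "Day", ylab = "Tip ($)", las = 1)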

# Bagplot (a bivariate boxplot) of tip vs. total bill
bagplot(totbill, tip, xlab = "Total bill ($)", ylab = "Tip ($)", las = 1)

# "Time map":
# https://districtdatalabs.silvrback.com/time-maps-visualizing-discrete-events-across-many-timescales
# Not a great dataset to use for this,
# and the overplotting is bad here.
# But it does confirm that the highest tips are spaced far apart
# (the tip distribution has low density at high values)
# while the low tips are clustered together
# (high density at low values)
diffs = diff(sort(tip))
nDiffs = length(tip) - 1
eqscplot(diffs[1:(nDiffs-1)], diffs[2:nDiffs],
         xlim = c(0, 1.5), ylim = c(0, 1.5), pch = 16, axes = FALSE,
         col = brewer.pal(10, "RdYlBu")[round(sort(tip))[2:nDiffs]],
         xlab = "Gap before tip", ylab = "Gap after tip",
         main = "Time map of tip data,\ncolored by tip value rounded to nearest dollar")
axis(1, at = c(0, 0.5, 1, 1.5))
axis(2)
legend('bottomright', legend = c("$1", "$4", "$7", "$10"), pch = 16,
       col = brewer.pal(10, "RdYlBu")[c(1, 4, 7, 10)])

#### PRECISION OR UNCERTAINTY ####

# Error bars: mean tip by day, with 95% CIs
tipMeans = tapply(tip, day, mean)
MOE = function(x) {1.96 * sd(x) / sqrt(length(x))}
tipMOEs = tapply(tip, day, MOE)
errbar(1:4, tipMeans, yplus = tipMeans + tipMOEs, yminus = tipMeans - tipMOEs,
       xlab = "Day", ylab = "Mean tip with 95% CI", axes = FALSE)
box()
axis(1, at = 1:4, labels = levels(day))
axis(2, las = 1)

# Error bars on a bar plot
# Save the output of barplot() to get a vector of the bar midpoints along the x-axis
mp = barplot(tipMeans, ylim = c(0, max(tipMeans + tipMOEs)), las = 1,
             ylab = "Mean tip with 95% CI")
errbar(mp, tipMeans, tipMeans + tipMOEs, tipMeans - tipMOEs, add = TRUE)

# Regression confidence bands:
# Easiest to use ggplot with stat_smooth
p = ggplot(tips, aes(totbill, tip)) + geom_point()
p + geom_smooth() # default loess smoother

p + geom_smooth(method = 'lm') # linear regression line smoother

# In base R, can use predict(lm(...), interval = 'confidence')
# and draw lines joining the CI bands
tipRegr = lm(tip ~ totbill)
# Define a range of x-values on which to compute confidence band limits
predXValues = floor(min(totbill)):ceiling(max(totbill))
# Get CI limits;
# give predict() a data frame with same variable names as your lm() predictors
tipConfBand = predict(tipRegr,
                      newdata = data.frame(totbill = predXValues),
                      interval = 'confidence')
# Plot the data and the best linear fit
plot(totbill, tip)
lines(predXValues, tipConfBand[, 'fit'], lty = 1)
# Plot the confidence band limits
lines(predXValues, tipConfBand[, 'lwr'], lty = 2)
lines(predXValues, tipConfBand[, 'upr'], lty = 2)
legend('topleft', legend = c('Regression line', '95% confidence band'), lty = 1:2)
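
# predict() can also give a prediction band for individual observations
# via interval = 'prediction'; it is wider than the confidence band
# (a sketch reusing the fit above):
tipPredBand = predict(tipRegr,
                      newdata = data.frame(totbill = predXValues),
                      interval = 'prediction')
plot(totbill, tip)
lines(predXValues, tipPredBand[, 'fit'], lty = 1)
lines(predXValues, tipPredBand[, 'lwr'], lty = 3)
lines(predXValues, tipPredBand[, 'upr'], lty = 3)
legend('topleft', legend = c('Regression line', '95% prediction band'),
       lty = c(1, 3))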

# Detach tips dataset
detach(tips)


#### REGRESSION MODELING AND DIAGNOSTICS ####

## Choosing model terms (polynomial terms, interactions) ##

# Load the ethanol dataset
# from lattice package
data(ethanol)

# We want to understand how
# the level of nitric oxide (NOx) in car engines
# relates to the Equivalence Ratio (E)
# and the Compression Ratio (C)
# at which the engine is running.
# (This example is from Ch. 4.1 of Cleveland's 'Visualizing Data' book.)

# Quick exploration with a scatterplot matrix:
pairs(ethanol)

# NOx ~ E looks curved, possibly quadratic;
# NOx ~ C is unclear

# Try plotting NOx ~ E
# after conditioning on the 5 levels of C
ggplot(ethanol, aes(E, NOx)) + facet_wrap(~ C) +
  geom_point() + geom_smooth(method = "loess")

# Looks curved, possibly quadratic, at each level of C,
# so our model needs a squared E term

# Try plotting NOx ~ C
# after conditioning on slices of E
# (so first, use quantile() and cut() to create slices)
Ebreaks = c(0, quantile(ethanol$E, seq(1/6, 1, by = 1/6)))
ethanol$Ecut = cut(ethanol$E, Ebreaks)
ggplot(ethanol, aes(C, NOx)) + facet_wrap(~ Ecut) +
  geom_point() + geom_smooth(method = "loess")
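
# Base R's coplot() makes a similar conditioning plot directly,
# slicing E into overlapping intervals (a sketch):
coplot(NOx ~ C | E, data = ethanol, panel = panel.smooth)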

# Might be linear with positive slope at low levels of E,
# then flat at higher levels of E.
# So our model needs some kind of interaction term,
# probably with a negative estimate:
# the slope of NOx ~ C gets smaller as E increases.

# So we could start by trying models like NOx ~ C + E + E^2 + C*E
# (I() protects the squared term; C*E expands to C + E + C:E in R formulas)
summary(lm(NOx ~ C + E + I(E^2) + C*E, data = ethanol))
## 
## Call:
## lm(formula = NOx ~ C + E + I(E^2) + C * E, data = ethanol)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.85983 -0.37975 -0.03065  0.40586  0.79046 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -24.56379    1.44398 -17.011  < 2e-16 ***
## C             0.28290    0.05848   4.838 5.97e-06 ***
## E            56.69655    2.74284  20.671  < 2e-16 ***
## I(E^2)      -29.78140    1.38189 -21.551  < 2e-16 ***
## C:E          -0.23464    0.06105  -3.844 0.000236 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4486 on 83 degrees of freedom
## Multiple R-squared:  0.8503, Adjusted R-squared:  0.8431 
## F-statistic: 117.9 on 4 and 83 DF,  p-value: < 2.2e-16
# and then look at residuals for further diagnostics.
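
# For instance (a sketch, not from the original analysis),
# save the fit and plot its residuals against a predictor
# (E shown here) with a loess smoother:
ethFit = lm(NOx ~ C + E + I(E^2) + C*E, data = ethanol)
scatter.smooth(ethanol$E, resid(ethFit), span = 1,
               xlab = "E", ylab = "Residual")
abline(h = 0, lty = 2)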

# Further examples with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/trivar.r


## Residual diagnostics, variable model transformations, pooling terms ##

# In a simple linear regression example, we will show
# how to use regression residuals and loess smoothers
# to check regression assumptions:
# linear relationship between response and predictors;
# errors have constant variance;
# errors are Normal.

# We can check these assumptions with three kinds of plots:
# Is there no trend in the residuals?
# Is there no trend in the spread of the residuals?
# Are the residuals approximately Normal?

# Load the ganglion dataset from
# http://content.csbs.utah.edu/~rogers/datanal/data/ganglion.txt
ganglion = read.table("ganglion.txt", header = TRUE)
attach(ganglion)

# We want to understand how the CP ratio
# (concentration of certain cells near center of retina)
# relates to the area of the retina,
# in fetal cats.
# (This example is from Ch. 3.3 of Cleveland's 'Visualizing Data' book.)

# Plot the raw data
plot(area, cp.ratio, xlab = "Area (mm^2)", ylab = "CP Ratio", las = 1)

# Doesn't look quite linear, but hard to be sure.
# Try a linear fit:
cpLinear = lm(cp.ratio ~ area)
plot(area, cp.ratio)
lines(area, fitted(cpLinear))

# Plot residuals with a loess smoother:
scatter.smooth(area, resid(cpLinear), span = 1)
abline(h = 0, lty = 2) # reference line at 0

# Clearly a poor fit.
# Curved trend in the residuals: maybe a parabola?

# Try a quadratic fit instead:
cpQuad = lm(cp.ratio ~ area + I(area^2))
plot(area, cp.ratio)
lines(sort(area), fitted(cpQuad)[order(area)])

# Now plot residuals again:
scatter.smooth(area, resid(cpQuad), span = 1)
abline(h = 0, lty = 2) # reference line at 0

# Looks much better.
# The response does seem linearly related
# to the model with a quadratic term:
# cp.ratio ~ area + area^2

# Check if the error variance is constant with respect to the predictors.
# Each residual is supposed to have mean 0,
# so absolute residuals are estimates of spread.
# These are often skewed, because they cannot be negative,
# but taking their square root reduces the asymmetry.

# Plot the square root absolute residuals vs fitted values,
# with loess smoother:
scatter.smooth(fitted(cpQuad), sqrt(abs(resid(cpQuad))), span = 2,
               ylab = "Square Root Absolute Residual")

# This is a problem.
# The (transformed) residuals should show
# no pattern with respect to fitted value,
# but they increase with fitted value.
# We do not have constant variance,
# so our linear regression assumptions are not met.

# Try a different transformation:
# instead of a quadratic model, take the log of CP ratio
cpLog = lm(log2(cp.ratio) ~ area)
plot(area, log2(cp.ratio))
lines(area, fitted(cpLog))

# Now plot residuals again:
scatter.smooth(area, resid(cpLog), span = 1)
abline(h = 0, lty = 2) # reference line at 0

# The fit still looks good: no trend in residuals.
# Now plot square root absolute residuals again:
scatter.smooth(fitted(cpLog), sqrt(abs(resid(cpLog))), span = 2,
               ylab = "Square Root Absolute Residual")

# There is no longer a trend in the spread of the residuals.
# This model is more appropriate than the linear or quadratic models.

# Check for Normal residuals with a Q-Q plot:
qqnorm(resid(cpLog))
qqline(resid(cpLog))

# With so few points it's hard to tell for sure,
# but they don't look severely non-Normal,
# so we are probably OK.
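
# One way to calibrate your eye (a sketch, not in the original analysis):
# compare against Q-Q plots of simulated Normal samples of the same size
par(mfrow = c(2, 2))
qqnorm(resid(cpLog), main = "Observed residuals")
qqline(resid(cpLog))
for(i in 1:3) {
  simSample = rnorm(length(resid(cpLog)))
  qqnorm(simSample, main = paste("Simulated Normal sample", i))
  qqline(simSample)
}
par(mfrow = c(1, 1))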

# Since there is no trend in the residuals,
# nor in their spreads,
# and they are approximately Normal,
# we meet the assumptions for linear regression.
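
# To present the final fit on the original scale, one could
# back-transform the fitted values (a sketch; 2^x inverts log2):
plot(area, cp.ratio, xlab = "Area (mm^2)", ylab = "CP Ratio", las = 1)
lines(sort(area), 2^fitted(cpLog)[order(area)])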

# Further examples with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/scatfit.R

detach(ganglion)


#### TIME SERIES ####

# Load co2 dataset of monthly carbon dioxide measurements
data(co2)

# Plot the data
plot(co2)

# Create a vector of times,
# fit a linear regression,
# plot it
t = 1959 + seq(0, length(co2)-1)/12
co2Linear = lm(co2 ~ t)
abline(co2Linear)

# Doesn't quite look linear

# Plot the residuals
plot(resid(co2Linear), type = 'l')
abline(h = 0, lty = 2)

# Linear fit is not good

# Seems like there is a trend in the residuals;
# look at the autocorrelation function:
acf(resid(co2Linear))

# Each point is highly correlated
# with the points 12 and 24 months after it,
# so there is a seasonal trend in the data.

# Fit a global trend with loess smoother,
# and a seasonal trend,
# and plot them along with the leftover residuals.
# Use stl (Seasonal decomposition of Time series by Loess)
# with a 12-unit window: seasonality is 12 months
plot(stl(co2, s.window = 12))
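
# To check that the decomposition captured the seasonality
# (a sketch, not in the original script): the stl() remainder component
# should show much weaker autocorrelation at lags 12 and 24
# than the linear-fit residuals did
co2Stl = stl(co2, s.window = 12)
acf(as.numeric(co2Stl$time.series[, "remainder"]))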

# More examples with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/tser.r