# 09 Graphics for Statistical Analysis: R code
# Jerzy Wieczorek
# 9/29/15
# 36-721 Statistical Graphics and Visualization
# Set working directory
setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 9/")
# Load R packages
library(RColorBrewer)
library(ggplot2)
library(MASS) # for eqscplot
library(aplpack) # for bagplot
library(Hmisc) # for errbar
library(lattice) # for ethanol dataset
#### PLOTTING MATHEMATICAL FUNCTIONS ####
# curve()
curve(x^2, from = -3, to = 3)
data:image/s3,"s3://crabby-images/9ee70/9ee703e4c59cfc33d05d09bd0da746740ef95edc" alt=""
curve(sin(x), -pi, pi)
data:image/s3,"s3://crabby-images/83cfb/83cfb4dc63fc6a81984b7a298140b747876a2502" alt=""
# Use curve() to plot continuous PDFs and CDFs
curve(dnorm(x), -3, 3, main = "Standard Normal PDF", ylab = "Density")
data:image/s3,"s3://crabby-images/f4316/f43169341dc79e289c0328f24e97de75fa8b1006" alt=""
curve(pnorm(x), -3, 3, main = "Standard Normal CDF", ylab = "Cumulative density")
data:image/s3,"s3://crabby-images/4ab04/4ab04b8a31ac2b982cee95d49f06a6011d501954" alt=""
curve(dgamma(x, shape = 3), 0, 10, main = "Gamma(shape = 3, scale = 1) PDF", ylab = "Density")
data:image/s3,"s3://crabby-images/d67b2/d67b2bb981c7e920790dd74a093d1aed82264fe8" alt=""
curve(pgamma(x, shape = 3), 0, 10, main = "Gamma(shape = 3, scale = 1) CDF", ylab = "Cumulative density")
data:image/s3,"s3://crabby-images/36b9a/36b9a6e384285839003682b2b75d276ec5ac17b2" alt=""
# Contours of bivariate function:
# let's plot bivariate standard Normal PDF,
# which is just product of two independent Normal densities
x = seq(-2, 2, 0.1)
y = x
dnorm2 = function(x, y) {dnorm(x) * dnorm(y)}
z = outer(x, y, FUN = dnorm2)
contour(x, y, z)
data:image/s3,"s3://crabby-images/b0cda/b0cda09dd48d79c8b04e8aa009539485ea4a7b4c" alt=""
#### DISTRIBUTIONS ####
# Load the tips dataset from
# http://www.ggobi.org/book/
tips = read.csv("tips.csv")
# Reorder levels of day (alphabetical default is unhelpful here)
tips$day = factor(tips$day, levels = c("Thu", "Fri", "Sat", "Sun"))
# Attach dataset
attach(tips)
# Histograms with different bins
hist(tip, main = "Tip histogram: default bins", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/1fcae/1fcae9ae5cc72b49723372b50113b8f54c8d2aff" alt=""
hist(tip, seq(0.5, 10.5, by = 1), xlim = c(0, 11),
main = "Tip histogram: binned to nearest $1", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/d00dc/d00dcbdd37da5c7f603e76c890b3346e2593c2cd" alt=""
hist(tip, seq(0.95, 10.05, by = .1), xlim = c(0, 11),
main = "Tip histogram: binned to nearest $0.10", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/d40ee/d40eed5f8c8d5a827fae288883aebe8df93b9e8c" alt=""
# KDEs with different bandwidths and kernels
plot(density(tip), main = "Tip KDE: default bandwidth,\nGaussian kernel", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/17241/17241ee8a1608255b99c25919bbcd59bc8885194" alt=""
plot(density(tip, bw = 1), main = "Tip KDE: $1 bandwidth,\nGaussian kernel", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/819e9/819e94e99968de60a60b2d547b557364dab92b78" alt=""
plot(density(tip, bw = 0.1), main = "Tip KDE: $0.10 bandwidth,\nGaussian kernel", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/012d1/012d13cf868cf8d9f51a2584428dcb1f6b927cad" alt=""
plot(density(tip, bw = 0.1, kernel = 'rectangular'),
main = "Tip KDE: $0.10 bandwidth,\nrectangular kernel", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/fddd9/fddd949e866d6f303feaea2543fd0a56440bf205" alt=""
plot(density(tip, bw = 0.1, kernel = 'epanechnikov'),
main = "Tip KDE: $0.10 bandwidth,\nEpanechnikov kernel", xlab = "Tip ($)")
data:image/s3,"s3://crabby-images/7b610/7b6102268c7ed38243ef851962521c0cdd4e2c5f" alt=""
# ECDF of all tips
plot(ecdf(tip), xlab = "Tip ($)", ylab = "Cumulative probability", main = "Tip empirical CDF")
data:image/s3,"s3://crabby-images/f8520/f85204dd6b721434d67ab5d6d8eea488c88884a1" alt=""
# ECDF by day of week
ecdfsByDay = by(tip, day, ecdf)
plot(ecdfsByDay[[1]], verticals = TRUE, pch = 1, col = brewer.pal(4, 'Dark2')[1],
xlab = "Tip ($)", ylab = "Cumulative probability", xlim = c(0, 11),
main = "Tip empirical CDFs\nby day of week")
for(i in 2:4) {
plot(ecdfsByDay[[i]], verticals = TRUE, pch = 1,
add = TRUE, col = brewer.pal(4, 'Dark2')[i])
}
legend('bottomright', legend = levels(day), pch = 1, lty = 1,
col = brewer.pal(4, 'Dark2'))
data:image/s3,"s3://crabby-images/526dc/526dce00b8a539327a69bfeb58940da5158a146a" alt=""
# Sunday tends to have lowest probability at most tip sizes,
# except for the highest tips
# Q-Q plot: compare two datasets
qqplot(tip[day == "Sat"], tip[day == "Sun"],
xlab = "Saturday tips ($)", ylab = "Sunday tips ($)")
abline(a = 0, b = 1, lty = 2)
data:image/s3,"s3://crabby-images/63063/63063c4c54229203f96c20690aad7470f1e39d3d" alt=""
# Sunday has more low tips, Saturday has more high tips
# Q-Q plot: compare to Normal quantiles
qqnorm(tip, ylab = "Tip ($)", las = 1)
qqline(tip)
data:image/s3,"s3://crabby-images/23838/23838b9d27941e45104ec27cccbe6e0d045f60d4" alt=""
# Tips do not look Normal, but skewed:
# both the lowest and highest values of tip are too high,
# compared to the bulk of the sample.
# Boxplot of all tips
boxplot(tip, xlab = "Tip ($)", horizontal = TRUE)
data:image/s3,"s3://crabby-images/9d224/9d224d889e878689f3eda584a113d7f0be657807" alt=""
# Boxplot by day of week
boxplot(tip ~ day, xlab = "Day", ylab = "Tip ($)", las = 1)
data:image/s3,"s3://crabby-images/1589d/1589da848cf24a8b1c954a168c2b149124ac975d" alt=""
# Bagplot (a bivariate boxplot) by total bill
bagplot(totbill, tip, xlab = "Total bill ($)", ylab = "Tip ($)", las = 1)
data:image/s3,"s3://crabby-images/d2fcb/d2fcbd83ad1329d6e1d92ba6161af9fffa0873fa" alt=""
# "Time map":
# https://districtdatalabs.silvrback.com/time-maps-visualizing-discrete-events-across-many-timescales
# Not a great dataset to use for this,
# and the overplotting is bad here.
# But it does confirm that the highest tips are spaced far apart
# (low density in high-tip distribution)
# while low tips are clustered together
# (high density in low-tip distribution)
diffs = diff(sort(tip))
nDiffs = length(tip) - 1
eqscplot(diffs[1:(nDiffs-1)], diffs[2:nDiffs],
xlim = c(0, 1.5), ylim = c(0, 1.5), pch = 16, axes = FALSE,
col = brewer.pal(10, "RdYlBu")[round(sort(tip))[2:nDiffs]],
xlab = "Gap before tip", ylab = "Gap after tip",
main = "Time map of tip data,\ncolored by tip value rounded to nearest dollar")
axis(1, at = c(0, 0.5, 1, 1.5))
axis(2)
legend('bottomright', legend = c("$1", "$4", "$7", "$10"), pch = 16,
col = brewer.pal(10, "RdYlBu")[c(1, 4, 7, 10)])
data:image/s3,"s3://crabby-images/3bf6e/3bf6ea389a20dd9ae41fac0f2af3d7c1d0ce4c25" alt=""
#### PRECISION OR UNCERTAINTY ####
# Error bars: mean tip by day, with 95% CIs
tipMeans = tapply(tip, day, mean)
MOE = function(x) {1.96 * sd(x) / sqrt(length(x))}
tipMOEs = tapply(tip, day, MOE)
errbar(1:4, tipMeans, yplus = tipMeans + tipMOEs, yminus = tipMeans - tipMOEs,
xlab = "Day", ylab = "Mean tip with 95% CI", axes = FALSE)
box()
axis(1, at = 1:4, labels = levels(day))
axis(2, las = 1)
data:image/s3,"s3://crabby-images/2341a/2341a74dac1b83ccd441740145fb9459afc47466" alt=""
# Error bars on a bar plot
# Save the output of barplot() to get a vector of the bar midpoints along x-axis
mp = barplot(tipMeans, ylim = c(0, max(tipMeans + tipMOEs)), las = 1,
ylab = "Mean tip with 95% CI")
errbar(mp, tipMeans, tipMeans + tipMOEs, tipMeans - tipMOEs, add = TRUE)
data:image/s3,"s3://crabby-images/d3e64/d3e641b317cc54fcd81605e7a3a48ddc58f514c1" alt=""
# Regression confidence bands:
# Easiest to use ggplot with stat_smooth
p = ggplot(tips, aes(totbill, tip)) + geom_point()
p + geom_smooth() # default loess smoother
data:image/s3,"s3://crabby-images/10b42/10b42e2b2b6e2fe10f2c30f1c7fb971d43e8ea26" alt=""
p + geom_smooth(method = 'lm') # linear regression line smoother
data:image/s3,"s3://crabby-images/2cf84/2cf8494dc7f0f481b2db30dc7a97042aabe500e2" alt=""
# In base R, can use predict(lm(...), interval = 'confidence')
# and draw lines joining the CI bands
tipRegr = lm(tip ~ totbill)
# Define a range of x-values on which to compute confidence band limits
predXValues = floor(min(totbill)):ceiling(max(totbill))
# Get CI limits;
# give predict() a data frame with same variable names as your lm() predictors
tipConfBand = predict(tipRegr,
newdata = data.frame(totbill = predXValues),
interval = 'confidence')
# Plot the data and the best linear fit
plot(totbill, tip)
lines(predXValues, tipConfBand[, 'fit'], lty = 1)
# Plot the confidence band limits
lines(predXValues, tipConfBand[, 'lwr'], lty = 2)
lines(predXValues, tipConfBand[, 'upr'], lty = 2)
legend('topleft', legend = c('Regression line', '95% confidence band'), lty = 1:2)
data:image/s3,"s3://crabby-images/b74f8/b74f8301cea02e6418fd807be8e9fedb8fe62ca4" alt=""
# Detach tips dataset
detach(tips)
#### REGRESSION MODELING AND DIAGNOSTICS ####
## Choosing model terms (polynomial terms, interactions) ##
# Load the ethanol dataset
# from lattice package
data(ethanol)
# We want to understand how
# the level of nitric oxide (NOx) in car engines
# relates to the Equivalence Ratio (E)
# and the Compression Ratio (C)
# at which the engine is running.
# (This example is from Ch. 4.1 of Cleveland's 'Visualizing Data' book.)
# Quick exploration with a scatterplot matrix:
pairs(ethanol)
data:image/s3,"s3://crabby-images/20527/20527a3b63f167dc0b40c38a8c52f4d65464774c" alt=""
# NOx ~ E looks curved, possibly quadratic;
# NOx ~ C is unclear
# Try plotting NOx ~ E
# after conditioning on the 5 levels of C
ggplot(ethanol, aes(E, NOx)) + facet_wrap(~ C) +
geom_point() + geom_smooth(method = "loess")
data:image/s3,"s3://crabby-images/e3686/e36863c7f02030e17a5cf24099fee06a625e906b" alt=""
# Looks curved, possibly quadratic, at each level of C,
# so our model needs a squared E term
# Try plotting NOx ~ C
# after conditioning on slices of E
# (so first, use quantile() and cut() to create slices)
Ebreaks = c(0, quantile(ethanol$E, seq(1/6, 1, by = 1/6)))
ethanol$Ecut = cut(ethanol$E, Ebreaks)
ggplot(ethanol, aes(C, NOx)) + facet_wrap(~ Ecut) +
geom_point() + geom_smooth(method = "loess")
data:image/s3,"s3://crabby-images/0055f/0055f92cad0524c6dfa6097d6326d56616df2899" alt=""
# Might be linear with positive slope at low levels of E,
# then flat at higher levels of E.
# So our model needs some kind of interaction term,
# probably with a negative estimate:
# the slope of NOx ~ C gets smaller as E increases.
# So we could start by trying models like NOx ~ C + E + E^2 + C*E
summary(lm(NOx ~ C + E + I(E^2) + C*E, data = ethanol))
##
## Call:
## lm(formula = NOx ~ C + E + I(E^2) + C * E, data = ethanol)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.85983 -0.37975 -0.03065 0.40586 0.79046
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -24.56379 1.44398 -17.011 < 2e-16 ***
## C 0.28290 0.05848 4.838 5.97e-06 ***
## E 56.69655 2.74284 20.671 < 2e-16 ***
## I(E^2) -29.78140 1.38189 -21.551 < 2e-16 ***
## C:E -0.23464 0.06105 -3.844 0.000236 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4486 on 83 degrees of freedom
## Multiple R-squared: 0.8503, Adjusted R-squared: 0.8431
## F-statistic: 117.9 on 4 and 83 DF, p-value: < 2.2e-16
# and then look at residuals for further diagnostics.
# Further examples with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/trivar.r
## Residual diagnostics, variable model transformations, pooling terms ##
# In a simple linear regression example, we will show
# how to use regression residuals and loess smoothers
# to check regression assumptions:
# linear relationship between response and predictors;
# errors have constant variance;
# errors are Normal.
# We can confirm these assumptions with three kinds of plots:
# Is there no trend in the residuals?
# Is there no trend in the spread of the residuals?
# Are the residuals approximately Normal?
# Load the ganglion dataset from
# http://content.csbs.utah.edu/~rogers/datanal/data/ganglion.txt
ganglion = read.table("ganglion.txt")
attach(ganglion)
# We want to understand how the CP ratio
# (concentration of certain cells near center of retina)
# relates to the area of the retina,
# in fetal cats.
# (This example is from Ch. 3.3 of Cleveland's 'Visualizing Data' book.)
# Plot the raw data
plot(area, cp.ratio, xlab = "Area (mm^2)", ylab = "CP Ratio", las = 1)
data:image/s3,"s3://crabby-images/af25a/af25acc6541bc84ac713e28e5f631adf822333dc" alt=""
# Doesn't look quite linear, but hard to be sure.
# Try a linear fit:
cpLinear = lm(cp.ratio ~ area)
plot(area, cp.ratio)
lines(area, fitted(cpLinear))
data:image/s3,"s3://crabby-images/7ebf4/7ebf4000f03225aeb4d71811c590b9f038caabcc" alt=""
# Plot residuals with a loess smoother:
scatter.smooth(area, resid(cpLinear), span = 1)
abline(h = 0, lty = 2) # reference line at 0
data:image/s3,"s3://crabby-images/65641/656414a61fcab355afe1f5aa33ec0a670d9c8167" alt=""
# Clearly a poor fit.
# Curved trend in the residuals: maybe a parabola?
# Try a quadratic fit instead:
cpQuad = lm(cp.ratio ~ area + I(area^2))
plot(area, cp.ratio)
lines(sort(area), fitted(cpQuad)[order(area)])
data:image/s3,"s3://crabby-images/3fc75/3fc75e5cf773373d389c8afb937de1f25c808f48" alt=""
# Now plot residuals again:
scatter.smooth(area, resid(cpQuad), span = 1)
abline(h = 0, lty = 2) # reference line at 0
data:image/s3,"s3://crabby-images/8a3aa/8a3aaec548b2247ae860557238a271fbd1d0506d" alt=""
# Looks much better.
# The response does seem linearly related
# to the model with a quadratic term:
# cp.ratio ~ area + area^2
# Check if the error variance is constant with respect to the predictors.
# Each residual is supposed to have mean 0,
# so absolute residuals are estimates of spread.
# These are often skewed, because they cannot be negative,
# but taking their square root reduces the asymmetry.
# Plot the square root absolute residuals vs fitted values,
# with loess smoother:
scatter.smooth(fitted(cpQuad), sqrt(abs(resid(cpQuad))), span = 2,
ylab = "Square Root Absolute Residual")
data:image/s3,"s3://crabby-images/81406/81406e6ba8a47a7675ec7d27a5a88dc25b600e08" alt=""
# This is a problem.
# The (transformed) residuals should show
# no pattern with respect to fitted value,
# but they increase with fitted value.
# We do not have constant variance,
# so our linear regression assumptions are not met.
# Try a different transformation:
# instead of a quadratic model, take the log of CP ratio
cpLog = lm(log2(cp.ratio) ~ area)
plot(area, log2(cp.ratio))
lines(area, fitted(cpLog))
data:image/s3,"s3://crabby-images/87213/87213391f4ff70deaf277335c3ff86dd1140ea5c" alt=""
# Now plot residuals again:
scatter.smooth(area, resid(cpLog), span = 1)
abline(h = 0, lty = 2) # reference line at 0
data:image/s3,"s3://crabby-images/cd7fd/cd7fd95db3c84ed2e2b69f1a7345274d129fc22c" alt=""
# The fit still looks good: no trend in residuals.
# Now plot square root absolute residuals again:
scatter.smooth(fitted(cpLog), sqrt(abs(resid(cpLog))), span = 2,
ylab = "Square Root Absolute Residual")
data:image/s3,"s3://crabby-images/deace/deace0800c66549c6989ec08537414983d54ac3d" alt=""
# There is no longer a trend in the spread of the residuals.
# This model is more appropriate than the linear or quadratic models.
# Check for Normal residuals with a Q-Q plot:
qqnorm(resid(cpLog))
qqline(resid(cpLog))
data:image/s3,"s3://crabby-images/340b1/340b1955bdea884cf522842474f1cc30334063bc" alt=""
# With so few points it's hard to tell for sure,
# but they don't look severely non-Normal,
# so we are probably OK.
# Since there is no trend in the residuals,
# nor in their spreads,
# and they are approximately Normal,
# we meet the assumptions for linear regression.
# Further examples with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/scatfit.R
detach(ganglion)
#### TIME SERIES ####
# Load co2 dataset of monthly carbon dioxide measurements
data(co2)
# Plot the data
plot(co2)
# Create a vector of times,
# fit a linear regression,
# plot it
t = 1959 + seq(0, length(co2)-1)/12
co2Linear = lm(co2 ~ t)
abline(co2Linear)
data:image/s3,"s3://crabby-images/af52a/af52aca53375ef5997faca98482edf84a8e2d343" alt=""
# Doesn't quite look linear
# Plot the residuals
plot(resid(co2Linear), type = 'l')
abline(h = 0, lty = 2)
data:image/s3,"s3://crabby-images/adbc3/adbc31a737aea9a81af107006806d9081b38130a" alt=""
# Linear fit is not good
# Seems like there is a trend in the residuals;
# look at the autocorrelation function:
acf(resid(co2Linear))
data:image/s3,"s3://crabby-images/f1f1b/f1f1bf0e70951c66345c8005139a78a4eb0ae31a" alt=""
# Each point is highly correlated
# with the points 12 and 24 months after it,
# so there is a seasonal trend in the data.
# Fit a global trend with loess smoother,
# and a seasonal trend,
# and plot them along with the leftover residuals.
# Use stl (Seasonal decomposition of Time series by Loess)
# with a 12-unit window: seasonality is 12 months
plot(stl(co2, s.window = 12))
data:image/s3,"s3://crabby-images/cb920/cb9207b995dcd5eb5b92b9399552bae2be88a7c7" alt=""
# More examples and with lattice graphs:
# http://content.csbs.utah.edu/~rogers/datanal/lect/tser.r