# 13 Wrapup: R code
# Jerzy Wieczorek
# 10/15/15
# 36-721 Statistical Graphics and Visualization
# Set working directory (optional: none of the code visible below
# opens files by relative path).
# setwd() raises an error when the directory does not exist, so only
# switch when the author's lecture folder is actually present;
# on any other machine the script keeps the current directory.
#setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 13")
lecture_dir <- "D:/Dropbox/CMU/36-721 Dataviz F15/Lecture 13"
if (dir.exists(lecture_dir)) {
  setwd(lecture_dir)
}
# Load packages
library(ggplot2) # for diamonds dataset
library(hexbin) # for hexbin()
library(MASS) # for kde2d()
library(tabplot) # for tableplot()
#### DENSITY LEGEND ####
# When the colorbar breaks are not quantiles,
# the viewer benefits from seeing how the data
# are distributed within each color category.
# One way to show that:
# The oaPlots package comes from a non-standard repository;
# install it with
# install.packages("oaPlots",
# repos = "http://repos.openanalytics.eu",
# type = "source")
# Attach the package, then run the example shipped with densityLegend
library("oaPlots")
example("densityLegend", package = "oaPlots")
##
## dnstyL> library(ggplot2)
##
## dnstyL> # subset the data object
## dnstyL> dsub <- subset(diamonds, x > 5 & x < 6 & y > 5 & y < 6)
##
## dnstyL> dsub <- dsub[-which(dsub$z > 4), ]
##
## dnstyL> dsub <- dsub[-which(dsub$z < 3), ]
##
## dnstyL> # define color pallette, color vector and color region breaks
## dnstyL> colorPalette <- brewer.pal(9, "Blues")[4:9]
##
## dnstyL> colorObj <- splitColorVar(colorVar = dsub$z, colorPalette)
##
## dnstyL> colorVec <- colorObj$colorVec
##
## dnstyL> breaks <- colorObj$breaks
##
## dnstyL> # plot the data
## dnstyL> prepLegend(side = "right", proportion = 0.3)
##
## dnstyL> oaTemplate(xlim = range(dsub$x), ylim = range(dsub$y),
## dnstyL+ main = "Diamond Length by Width \n Colored by Depth",
## dnstyL+ xlab = "Length (mm)", ylab = "Width (mm)")
##
## dnstyL> points(x = dsub$x, y = dsub$y, col = colorVec, pch = 19, cex = 0.6)
##
## dnstyL> # add the legend
## dnstyL> densityLegend(x = dsub$z, colorPalette = colorPalette, side = "right",
## dnstyL+ main = "Diamond Depth", colorBreaks = breaks)
data:image/s3,"s3://crabby-images/c1fea/c1feae6e90f3a950b594f8da454b0e2d979c2367" alt=""
# Go back to a single-panel layout,
# since the densityLegend example changed the plotting parameters
par(mfrow = c(1, 1))
#### LARGE SAMPLES (MANY RECORDS) ####
# With a large number of records (rows, not variables),
# a scatterplot becomes overplotted and hard to read.
# Illustrate with the diamonds dataset from ggplot2
# (the plots themselves use base R graphics, not ggplot)
data("diamonds")
dim(diamonds)
## [1] 53940 10
# Almost 54000 points to plot!
# How does a diamond's carat (weight) relate to its price?
with(diamonds, plot(carat, price,
                    xlab = "Weight (carats)", ylab = "Price ($)"))
data:image/s3,"s3://crabby-images/05dda/05dda90cada16afbc35bec45922bb26786bf8cc7" alt=""
# The default symbols pile up on top of one another.
# The plot suggests that diamonds above 1.5 carats and above $7500
# are common, but later plots show they are actually very rare.
# Try the smallest plotting symbol to reduce the overlap:
with(diamonds, plot(carat, price, pch = ".",
                    xlab = "Weight (carats)", ylab = "Price ($)"))
data:image/s3,"s3://crabby-images/3a6a1/3a6a180d0f236562b568e657a78605fe505e609f" alt=""
# Smaller symbols are still not enough.
# Shade the plot by data density (2D KDE) instead,
# drawing only the outliers as individual points
with(diamonds, smoothScatter(carat, price,
                             xlab = "Weight (carats)", ylab = "Price ($)"))
data:image/s3,"s3://crabby-images/ce4e7/ce4e7c979536421ebcb3ab82dd192e485244c254" alt=""
# An alternative to a smooth density estimate:
# aggregate the points into discrete hexagonal bins
# (a 2D analogue of the histogram).
# Hexagons are used rather than a grid of squares
# so that grid lines do not suggest spurious patterns
with(diamonds, plot(hexbin(carat, price),
                    xlab = "Weight (carats)", ylab = "Price ($)"))
data:image/s3,"s3://crabby-images/fe461/fe4612e1b394b32faa3e832ba2edd7563fc69013" alt=""
# For more control over the 2D density estimate,
# compute it with kde2d and draw it with the image function
with(diamonds, image(kde2d(carat, price),
                     xlab = "Weight (carats)", ylab = "Price ($)"))
data:image/s3,"s3://crabby-images/8df9a/8df9a493e3621a4a9632166476ee3d5c16a60e28" alt=""
# Or can plot kde2d as contours over top of scatterplot
plot(diamonds$carat, diamonds$price, pch = ".",
     xlab = "Weight (carats)", ylab = "Price ($)")
# add = TRUE overlays the contours on the existing scatterplot;
# spell out TRUE because T is an ordinary variable that can be reassigned
contour(kde2d(diamonds$carat, diamonds$price),
        add = TRUE, nlevels = 20, col = heat.colors(20))
data:image/s3,"s3://crabby-images/3fcd4/3fcd4398539d02bbf1e49f95d536c6f011abb8b4" alt=""
#### TABLEPLOTS ####
# Tableplots:
# Sort the rows by one variable and bin them by percentages,
# then show the distributions of the other variables
# at each level of the sorted variable.
# Sort of a multivariate (stacked) bar chart / histogram.
# Possible alternative to parallel coordinates plots.
# Draw a tableplot of the diamonds data.
# No sortCol is passed here, so the default applies:
# sort by the first variable (carat)
tableplot(diamonds)
data:image/s3,"s3://crabby-images/f7ecb/f7ecbde49410be4daefbcd4d8e40378800c1bfc4" alt=""
# Sort on cut instead
# (sortCol = 2 refers to cut by its column position in diamonds)
tableplot(diamonds, sortCol = 2)
data:image/s3,"s3://crabby-images/5006e/5006efbe5058cfc50cb85b72b5edc2d9e98bed63" alt=""
# Subset by cut first, then sort on price within each value of cut
# (sortCol = 7 refers to price by its column position in diamonds;
# a separate tableplot is drawn for each value of cut)
tableplot(diamonds, subset_string = "cut", sortCol = 7)
data:image/s3,"s3://crabby-images/c3f19/c3f19632464bce0e90fb0a13d87b65798f09d86e" alt=""
data:image/s3,"s3://crabby-images/fef82/fef82efc05cdf1b0ca4a775bc67054a8cd3628e3" alt=""
data:image/s3,"s3://crabby-images/546a4/546a425adba9ef607739d3f0452df1bc2235c4b3" alt=""
data:image/s3,"s3://crabby-images/759a1/759a1c62da7d57c417f64183e179ddb6b8eecdb4" alt=""
data:image/s3,"s3://crabby-images/86921/86921b4ec918f9aea535cf808402714fbca34181" alt=""
# Zoom in to look at top 1% of most expensive diamonds
# (from and to are percentages of the rows sorted on price).
# Also change number of bins used,
# since there aren't many observations here:
# 1% of the 53940 rows is only about 540 diamonds,
# so pass nBins explicitly to use fewer row bins than the default.
tableplot(diamonds, sortCol = 7, from = 0, to = 1, nBins = 50)
data:image/s3,"s3://crabby-images/136fd/136fdbe6d078280c84ef7cb1219a389d8bd92e23" alt=""
# At the top 1%, the other variables' distributions
# no longer vary smoothly with price
#### MOSAIC PLOTS ####
# Mosaic plots:
# Use areas of rectangles to plot several categorical variables.
# Often I find these hard to read
# (hard to compare areas of differently-shaped rectangles),
# but sometimes they can be useful.
# Plot Titanic dataset:
# each passenger's Sex, Age (Adult, Child),
# Class (1st, 2nd, 3rd, Crew), and whether they Survived.
data(Titanic)
# Two-variable mosaic: the one-sided formula names the variables
# of Titanic to cross-classify, here Sex against Survived.
# There were fewer Females overall,
# but their survival rate was higher than for Males:
mosaicplot(~ Sex + Survived, data = Titanic, color = TRUE)
data:image/s3,"s3://crabby-images/baa6a/baa6a9d5e45eb8040b46edc5cbd8019bb4af5bca" alt=""
# Two-variable mosaic of Age against Survived.
# There were fewer Children overall,
# but their survival rate was higher than for Adults:
mosaicplot(~ Age + Survived, data = Titanic, color = TRUE)
data:image/s3,"s3://crabby-images/b0c81/b0c81ae228f49f2b2e7dc753ebd576516a80d607" alt=""
# Three-variable mosaic: Sex, Age and Survived together,
# so survival can be compared by Age within each Sex.
# Age didn't affect survival rates much among Females,
# but it did among Males:
mosaicplot(~ Sex + Age + Survived, data = Titanic, color = TRUE)
data:image/s3,"s3://crabby-images/c159f/c159fb4f6b8907511750029da64697636125ba05" alt=""