#+ warning=FALSE, message=FALSE
# 13 Wrapup: R code
# Jerzy Wieczorek
# 10/15/15
# 36-721 Statistical Graphics and Visualization
# Set working directory.
# The path is machine-specific, so only switch to it when it exists;
# otherwise stay in the current directory instead of aborting the script
# (nothing visible in this script reads or writes files).
#setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 13")
lecture_dir <- "D:/Dropbox/CMU/36-721 Dataviz F15/Lecture 13"
if (dir.exists(lecture_dir)) {
  setwd(lecture_dir)
} else {
  message("Lecture directory not found; staying in ", getwd())
}
# Load packages
library(ggplot2) # for diamonds dataset
library(hexbin) # for hexbin()
library(MASS) # for kde2d()
library(tabplot) # for tableplot()
#### DENSITY LEGEND ####
# A colorbar whose breaks are not quantiles hides how many observations
# land in each color category, so it helps to show the viewer
# how the data are distributed across the categories.
# One option is densityLegend() from the oaPlots package.
#
# oaPlots lives on a non-standard repository; to install it, run:
# install.packages("oaPlots",
#                  repos = "http://repos.openanalytics.eu",
#                  type = "source")

# Attach oaPlots, then run the package's own densityLegend example
library("oaPlots")
example("densityLegend")

# Go back to a single-panel layout, since the densityLegend example
# changed the default plotting parameters
par(mfrow = rep(1, 2))
#### LARGE SAMPLES (MANY RECORDS) ####
# With a large number of records (rows, not variables),
# a plain scatterplot becomes overplotted and hard to read.
# Illustrate with the diamonds dataset from ggplot2
# (the plots themselves use base R graphics, not ggplot).
data("diamonds")
dim(diamonds)
# Almost 54000 points to plot!

# Relationship between a diamond's carat (weight) and its price
with(diamonds, plot(carat, price,
                    xlab = "Weight (carats)", ylab = "Price ($)"))
# Heavy overplotting: it looks as if many diamonds are
# > 1.5 carats and > $7500, but the later plots show those are very rare.

# Do smaller plot symbols help?
with(diamonds, plot(carat, price, pch = ".",
                    xlab = "Weight (carats)", ylab = "Price ($)"))
# Not good enough.
# Show data density with color (a 2D kernel density estimate),
# drawing only the outliers as individual points
smoothScatter(x = diamonds[["carat"]], y = diamonds[["price"]],
              xlab = "Weight (carats)", ylab = "Price ($)")

# Alternative to a smooth density estimate:
# aggregate the points into discrete hexagonal bins
# (like a 2D histogram).
# Hexagons, rather than a grid of squares,
# avoid spurious patterns caused by the grid lines.
carat_price_bins <- hexbin(diamonds$carat, diamonds$price)
plot(carat_price_bins,
     xlab = "Weight (carats)", ylab = "Price ($)")
# However, if you want more control over the 2D density estimate,
# compute it explicitly with kde2d() and draw it with image().
# The estimate is computed once and reused for both displays below.
carat_price_kde <- kde2d(diamonds$carat, diamonds$price)
image(carat_price_kde,
      xlab = "Weight (carats)", ylab = "Price ($)")

# Or plot the same kde2d estimate as contours over top of the scatterplot
plot(diamonds$carat, diamonds$price, pch = ".",
     xlab = "Weight (carats)", ylab = "Price ($)")
# Spell out TRUE: the shorthand T is an ordinary variable
# that can be reassigned elsewhere
contour(carat_price_kde,
        add = TRUE, nlevels = 20, col = heat.colors(20))
#### TABLEPLOTS ####
# Tableplots:
# Sort by one variable and bin by percentages,
# then show distributions of other variables
# at each level of the sorted variable.
# Sort of a multivariate (stacked) bar chart / histogram.
# Possible alternative to parallel coordinates plots.
# Draw a tableplot of the diamonds data.
# By default, sort by first variable (carat)
tableplot(diamonds)
# Sort on cut instead
# (sortCol is given as a column index; 2 is assumed to be cut --
# TODO confirm against names(diamonds))
tableplot(diamonds, sortCol = 2)
# Subset by cut first, then sort on price within each value of cut
# (column index 7 is assumed to be price -- TODO confirm)
tableplot(diamonds, subset_string = "cut", sortCol = 7)
# Zoom in to look at top 1% of most expensive diamonds
# (from and to are passed as 0 and 1).
# The intent was also to change the number of bins used,
# since there aren't many observations here.
# NOTE(review): the call below does not set the number of bins;
# presumably an nBins argument was meant to be passed --
# confirm against the tabplot documentation.
tableplot(diamonds, sortCol = 7, from=0, to=1)
# At the top 1%, the other variables' distributions
# no longer vary smoothly with price
#### MOSAIC PLOTS ####
# Mosaic plots:
# Rectangle areas encode counts for several categorical variables.
# I often find these hard to read
# (areas of differently-shaped rectangles are hard to compare),
# but sometimes they can be useful.
# Example data: the Titanic dataset, with
# each passenger's Sex, Age (Adult, Child),
# Class (1st, 2nd, 3rd, Crew), and whether they Survived.
data("Titanic")

# Females were fewer overall,
# but had a higher survival rate than Males:
sex_by_survival <- ~ Sex + Survived
mosaicplot(sex_by_survival, data = Titanic, color = TRUE)

# Children were fewer overall,
# but had a higher survival rate than Adults:
age_by_survival <- ~ Age + Survived
mosaicplot(age_by_survival, data = Titanic, color = TRUE)

# Among Females, Age made little difference to survival rates,
# but among Males it did:
sex_age_by_survival <- ~ Sex + Age + Survived
mosaicplot(sex_age_by_survival, data = Titanic, color = TRUE)