# 13 Wrapup: R code
# Jerzy Wieczorek
# 10/15/15
# 36-721 Statistical Graphics and Visualization

# Set working directory, but only if one of the author's lecture folders
# exists on this machine. The code in this script only draws plots from
# packaged datasets, so elsewhere it can run from the current directory
# instead of stopping with a setwd() error.
lecture_dirs <- c(
  "D:/Dropbox/CMU/36-721 Dataviz F15/Lecture 13",
  "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 13"
)
lecture_dir <- lecture_dirs[dir.exists(lecture_dirs)]
if (length(lecture_dir) > 0) {
  # Candidates are listed in order of preference; use the first one found
  setwd(lecture_dir[1])
} else {
  message("Lecture folder not found; keeping working directory: ", getwd())
}

# Load packages
library(ggplot2) # for diamonds dataset
library(hexbin)  # for hexbin()
library(MASS)    # for kde2d()
library(tabplot) # for tableplot()


#### DENSITY LEGEND ####

# If the colorbar breaks aren't quantiles,
# it's helpful to tell the viewer
# how the data are distributed in each color category.
# Here is one way:

# oaPlots comes from a non-standard repository; install it once with:
# install.packages("oaPlots",
#   repos = "http://repos.openanalytics.eu",
#   type = "source")
# Attach the package, then run the example code
# from the densityLegend help page
library(oaPlots)
example(topic = "densityLegend")
## 
## dnstyL> library(ggplot2)
## 
## dnstyL> # subset the data object
## dnstyL> dsub <- subset(diamonds, x > 5 & x < 6 & y > 5 & y < 6)
## 
## dnstyL> dsub <- dsub[-which(dsub$z > 4), ]
## 
## dnstyL> dsub <- dsub[-which(dsub$z < 3), ]
## 
## dnstyL> # define color pallette, color vector and color region breaks
## dnstyL> colorPalette <- brewer.pal(9, "Blues")[4:9]
## 
## dnstyL> colorObj <- splitColorVar(colorVar = dsub$z, colorPalette)
## 
## dnstyL> colorVec <- colorObj$colorVec
## 
## dnstyL> breaks <- colorObj$breaks
## 
## dnstyL> # plot the data
## dnstyL> prepLegend(side = "right", proportion = 0.3)
## 
## dnstyL> oaTemplate(xlim = range(dsub$x), ylim = range(dsub$y),
## dnstyL+      main = "Diamond Length by Width \n Colored by Depth",
## dnstyL+      xlab = "Length (mm)", ylab = "Width (mm)")
## 
## dnstyL> points(x = dsub$x, y = dsub$y, col = colorVec, pch = 19, cex = 0.6)
## 
## dnstyL> # add the legend
## dnstyL> densityLegend(x = dsub$z, colorPalette = colorPalette, side = "right",
## dnstyL+      main = "Diamond Depth", colorBreaks = breaks)

# The densityLegend example changed the plotting parameters;
# go back to the default of a single plot per page
graphics::par(mfrow = c(1, 1))

#### LARGE SAMPLES (MANY RECORDS) ####

# When plotting a large number of records (not variables),
# scatterplots will be overplotted and hard to read.

# Example data: the diamonds dataset that ships with ggplot2
# (the plots themselves will be in base R, not ggplot)
data("diamonds", package = "ggplot2")
dim(diamonds)
## [1] 53940    10
# That is almost 54000 points to plot!

# Relationship between a diamond's carat (weight) and its price
with(diamonds, plot(carat, price,
                    xlab = "Weight (carats)", ylab = "Price ($)"))

# Heavy overplotting: it looks as if many diamonds
# are > 1.5 carats and > $7500,
# but we'll see that those are in fact very rare.

# Do smaller plot symbols help?
with(diamonds, plot(carat, price, pch = ".",
                    xlab = "Weight (carats)", ylab = "Price ($)"))

# Not good enough.

# Show the data density (2D KDE) with color;
# only outliers are drawn as individual points
smoothScatter(
  x = diamonds$carat,
  y = diamonds$price,
  xlab = "Weight (carats)",
  ylab = "Price ($)"
)

# Alternative to smooth density estimates:
# aggregate the points into discrete hexagonal bins
# (like a 2D histogram).
# Hexagons rather than a grid of squares,
# to avoid seeing spurious patterns due to lines on grid
with(diamonds, plot(hexbin(carat, price),
                    xlab = "Weight (carats)", ylab = "Price ($)"))

# However, if you want more control over the 2D density estimate,
# compute it yourself with kde2d and draw it with the image function.
# The estimate is computed once and reused for both plots below.
carat_price_kde <- kde2d(diamonds$carat, diamonds$price)
image(carat_price_kde,
      xlab = "Weight (carats)", ylab = "Price ($)")

# Or can plot the same kde2d estimate as contours over top of scatterplot
plot(diamonds$carat, diamonds$price, pch = ".",
     xlab = "Weight (carats)", ylab = "Price ($)")
# add = TRUE draws onto the existing scatterplot; spell out TRUE because
# T is an ordinary variable that can be reassigned
contour(carat_price_kde,
        add = TRUE, nlevels = 20, col = heat.colors(20))

#### TABLEPLOTS ####

# Tableplots:
# Sort by one variable and bin by percentages,
# then show distributions of other variables
# at each level of the sorted variable.
# Sort of a multivariate (stacked) bar chart / histogram.
# Possible alternative to parallel coordinates plots.

# Draw a tableplot of the diamonds data.
# By default, sort by first variable (carat)
tableplot(diamonds)

# Sort on cut (column 2) instead
tableplot(diamonds, sortCol = 2)

# Subset by cut first, then sort on price (column 7) within each value of cut
tableplot(diamonds, subset_string = "cut", sortCol = 7)

# Zoom in to look at top 1% of most expensive diamonds
# (from and to are percentages of the sorted rows).
# Also reduce the number of bins with nBins,
# since only about 540 of the 53940 diamonds fall in this range.
tableplot(diamonds, sortCol = 7, nBins = 50, from = 0, to = 1)

# At the top 1%, the other variables' distributions
# no longer vary smoothly with price


#### MOSAIC PLOTS ####

# Mosaic plots:
# Use areas of rectangles to plot several categorical variables.
# Often I find these hard to read
# (hard to compare areas of differently-shaped rectangles),
# but sometimes they can be useful.

# Titanic dataset:
# each passenger's Sex, Age (Adult, Child),
# Class (1st, 2nd, 3rd, Crew), and whether they Survived.
data("Titanic")

# One formula per mosaic plot, drawn in this order:
survival_views <- list(
  # There were fewer Females overall,
  # but their survival rate was higher than for Males
  ~ Sex + Survived,
  # There were fewer Children overall,
  # but their survival rate was higher than for Adults
  ~ Age + Survived,
  # Age didn't affect survival rates much among Females,
  # but it did among Males
  ~ Sex + Age + Survived
)
for (view in survival_views) {
  mosaicplot(view, data = Titanic, color = TRUE)
}