# 04 Grammar of Graphics: R code
# Jerzy Wieczorek
# 9/10/15
# 36-721 Statistical Graphics and Visualization

# Set working directory
setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 4/")


#### LOAD GGPLOT2 ####

# Load ggplot2 package:
# Install it once if you haven't
## install.packages("ggplot2")
# Then you only need to load it once per R session
library(ggplot2)
# Set a black-and-white theme instead of default gray theme
# (just so the figures show up better on projector in class)
theme_set(theme_bw())


#### SIMPLE EXAMPLES FROM SLIDES ####

# Subset the diamonds dataset that comes with ggplot2
dsmall = diamonds[sample(nrow(diamonds),100),]

# Bar chart
ggplot(data = dsmall, aes(x = cut, fill = cut)) + 
  geom_bar(stat = "bin") + coord_cartesian()

# Pie chart
# (this is a bad idea in practice; just showing GoG's flexibility)
ggplot(data = dsmall, aes(x = factor(1), fill = cut)) + 
  geom_bar(stat = "bin") + coord_polar(theta = "y")

# Race track plot
# (this is a terrible idea! again, just demo'ing GoG's flexibility)
ggplot(data = dsmall, aes(x = cut, fill = cut)) + 
  geom_bar(stat = "bin") + coord_polar(theta = "y")

#### READ NHANES DATASET ####

# Read in the data
nhanes = read.csv("nhanes.csv")


#### COMPARE MANUAL LAYOUT TO ggplot2 FACETS ####

# Base R:
# Deal with finicky par(),
# subset by Gender repeatedly,
# repeatedly enter labels,
# set ylim manually to ensure matching scales...
oldPar = par(no.readonly = TRUE)
par(mfrow = c(1, 2))
with(subset(nhanes, GENDER == 'Female'),
     plot(jitter(MONTHS), WEIGHT_KG, ylim = range(nhanes$WEIGHT_KG),
          xlab = 'Age (months)', ylab = 'Weight (kg)', main = 'Female')
)
with(subset(nhanes, GENDER == 'Male'),
     plot(jitter(MONTHS), WEIGHT_KG, ylim = range(nhanes$WEIGHT_KG),
          xlab = 'Age (months)', ylab = 'Weight (kg)', main = 'Male')
)

par(oldPar)

# ggplot2:
# All in "one step," following a simple specification
ggplot(data = nhanes, aes(x = MONTHS, y = WEIGHT_KG)) +
  geom_point(position = position_jitter(width = .2)) +
  facet_grid(~ GENDER) +
  xlab("Age (months)") + ylab("Weight (kg)")

#### ggplot2: aes, geom, stat ####

# Define the dataset and the shared aes
# using the ggplot command,
# and also use same ColorBrewer palette as last time
p = ggplot(nhanes,
           aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) +
  scale_color_brewer(palette = "Set1")

# Try plotting it?
## p
# No layers yet, so there's nothing to plot...

# Add a geom_point layer, making a scatterplot
p + geom_point()

# Add transparency, setting alpha-channel to 70%,
# to help with overplotting
p + geom_point(alpha = 0.5)

# Could plot a line instead
# (by default it connects the points,
#  although that is not appropriate here:
#  each point is a different person,
#  not the same one repeated over time)
p + geom_line()

# Instead of raw data lines using default (stat = "identity"),
# plot quantile lines using a stat summary
p + geom_line(stat = "quantile")
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x

# Note that the lines understand you're already grouping color by GENDER

# Map the derived variable ..quantile.. to aes(linetype)
# and change from default quantiles to 15%, 50%, 85%
p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..)))
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x

# Overlay raw data points over the quantile lines
p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..))) +
  geom_point()
## Smoothing formula not specified. Using: y ~ x
## Smoothing formula not specified. Using: y ~ x

#### ggplot2: facet_grid, facet_wrap ####

# Now let's try to mimic the WHO plots
# of length-by-age quantiles, by gender:
# plot LENGTH vs MONTHS, facet by GENDER,
# geom = line, stat = quantile

# Switch aes mapping
# from x = LENGTH_CM, y = WEIGHT_KG
# to x = MONTHS, y = LENGTH_CM
# by adding a revised call to aes()
p = p + aes(x = MONTHS, y = LENGTH_CM)

# Scatterplot
p + geom_point()

# Add quantiles
p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..))) +
  geom_point()
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x

# Facet by gender
p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..))) +
  geom_point() + facet_wrap(~ GENDER)
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x

# We can also use facet_grid(rows ~ columns); 
# especially useful if want to facet on multiple variables
p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..))) +
  geom_point() + facet_grid(~ GENDER)
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x

p + geom_line(stat = "quantile",
              quantiles = c(.15, .50, .85),
              mapping = aes(linetype = factor(..quantile..))) +
  geom_point() + facet_grid(GENDER ~ .)
## Smoothing formula not specified. Using: y ~ x
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique
## Smoothing formula not specified. Using: y ~ x

#### ggplot2: ALL AT ONCE ####

# Can we juggle all 6 variables at once?
# (not including ID, of course)
names(nhanes)
## [1] "ID"        "GENDER"    "MONTHS"    "RACETH"    "WEIGHT_KG" "LENGTH_CM"
## [7] "HEAD_CM"
# We've seen these commands so far:
# aes(x, y, color, size, linetype, alpha)
# stat = 'quantile', stat = 'identity'
# geom_point, geom_line
# facet_grid, facet_wrap
# scale_color_brewer
# coord_cartesian, coord_polar

# One possible attempt
ggplot(nhanes) +
  aes(x = WEIGHT_KG, y = LENGTH_CM, color = GENDER, size = HEAD_CM) +
  facet_grid(RACETH ~ MONTHS) +
  scale_color_brewer(palette = 'Set1') + 
  geom_point(alpha = .7) + 
  geom_line(stat = 'quantile', quantiles = .5, formula = y ~ x)
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

# We get some errors, likely due to too few points in some subplots

# Above we map aes(size = HEAD_CM) and we set alpha=.7
# To clarify difference between setting and mapping,
# contrast this with doing the reverse:
# map aes(alpha = HEAD_CM) and set size=5
ggplot(nhanes) +
  aes(x = WEIGHT_KG, y = LENGTH_CM, color = GENDER, alpha = HEAD_CM) +
  facet_grid(RACETH ~ MONTHS) +
  scale_color_brewer(palette = 'Set1') + 
  geom_point(size = 5) + 
  geom_line(stat = 'quantile', quantiles = .5, formula = y ~ x)
## Warning in rq.fit.br(wx, wy, tau = tau, ...): Solution may be nonunique

#### ggplot2: IMPORTANCE OF USING aes() ####

# Show the difference between
# color="green"
# vs
# aes(color="green")

# Map levels of the data column GENDER to colors
p + geom_point(aes(color = GENDER))

# Map a constant-valued variable with a level named "green" to one color
p + geom_point(aes(color = 'green'))

# Make all points green
p + geom_point(color = 'green')

# Make all points GENDER?
# Can't do it -- no such variable outside your dataset
## p + geom_point(color = GENDER)


#### ggplot2: A FEW MISC NOTES ####

# Note that both 
## ggplot(nhanes, aes(x = WEIGHT_KG, y = LENGTH_CM)) + geom_point()
# and 
## ggplot(nhanes) + aes(x = WEIGHT_KG, y = LENGTH_CM) + geom_point()
# are equivalent, just using the syntax differently.


# Note that setting xlim, ylim will actually clip the data,
# so if you summarize (smooth, boxplot, etc)
# it'll only use the data in the clipped region,
# not compute on full dataset and then zoom in!
# Wilkinson saw such zooming as "a form of lying with graphics"...
# Example: add regression lines (smooth with method "lm"),
# then zoom in and see the lines are based on cropped data:
p = ggplot(nhanes,
           aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) +
  scale_color_brewer(palette = "Set1")
p + geom_point() + geom_smooth(method = "lm")

p + geom_point() + geom_smooth(method = "lm") + xlim(c(70, 90))
## Warning: Removed 99 rows containing missing values (stat_smooth).
## Warning in qt((1 - level)/2, df): NaNs produced
## Warning: Removed 100 rows containing missing values (stat_smooth).
## Warning: Removed 199 rows containing missing values (geom_point).

# If you really do just want to zoom, not crop the data,
# use coord_cartesian(xlim) instead:
p + geom_point() + geom_smooth(method = "lm") + coord_cartesian(xlim = c(70, 90))

# It's possible to add "margins" to facets,
# i.e. plot the data combined over the facet-variables
# as the last row or column:
p + geom_point() + aes(shape = RACETH) +
  facet_grid(GENDER ~ RACETH, margins = TRUE)

# Can we make something like Rafe Donahue's "you-are-here" plots?
# Yes: make facets with your data plotted as geom_point,
# but first include another geom_point layer
# that DOES NOT have the facet variable.
# Example: we'll facet by RACETH,
# but underneath each facet we'll show all points (in grey),
# while increasing size of the points that belong to that facet
p + geom_point(data = subset(nhanes, select = -RACETH), color = "black") + 
  facet_wrap( ~ RACETH) + geom_point(size = 3)

# This works because any layer with the facet variable missing
# will be plotted on every facet.
# (I believe this was intended to make background maps easy to do).


# Can ggplot2 do scale breaks on x-axis?
# Yes: facet on a cut of the x-variable, then use free x-scale.
# Say we want to plot state areas and pops with a scale break,
# so that huge states don't force the rest into a small corner of plot:
state.x77 = as.data.frame(state.x77)
state.x77$areaCut = cut(state.area, c(0, 1e5, 1e6), labels = c("<=10k", ">10k"))
p = ggplot(data = state.x77, aes(x = Area, y = Population)) + geom_point()
# Basic scatterplot of area vs population
p

# Now facet by the cut of Area, with a free x-scale
# (i.e. don't keep the same x-scale on both facets)
p + facet_grid(. ~ areaCut, scales = "free_x")