# 04 Grammar of Graphics: R code # Jerzy Wieczorek # 9/10/15 # 36-721 Statistical Graphics and Visualization # Set working directory setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 4/") #### LOAD GGPLOT2 #### # Load ggplot2 package: # Install it once if you haven't ## install.packages("ggplot2") # Then you only need to load it once per R session library(ggplot2) # Set a black-and-white theme instead of default gray theme # (just so the figures show up better on projector in class) theme_set(theme_bw()) #### SIMPLE EXAMPLES FROM SLIDES #### # Subset the diamonds dataset that comes with ggplot2 dsmall = diamonds[sample(nrow(diamonds),100),] # Bar chart ggplot(data = dsmall, aes(x = cut, fill = cut)) + geom_bar(stat = "bin") + coord_cartesian() # Pie chart # (this is a bad idea in practice; just showing GoG's flexibility) ggplot(data = dsmall, aes(x = factor(1), fill = cut)) + geom_bar(stat = "bin") + coord_polar(theta = "y") # Race track plot # (this is a terrible idea! again, just demo'ing GoG's flexibility) ggplot(data = dsmall, aes(x = cut, fill = cut)) + geom_bar(stat = "bin") + coord_polar(theta = "y") #### READ NHANES DATASET #### # Read in the data nhanes = read.csv("nhanes.csv") #### COMPARE MANUAL LAYOUT TO ggplot2 FACETS #### # Base R: # Deal with finicky par(), # subset by Gender repeatedly, # repeatedly enter labels, # set ylim manually to ensure matching scales... oldPar = par(no.readonly = TRUE) par(mfrow = c(1, 2)) with(subset(nhanes, GENDER == 'Female'), plot(jitter(MONTHS), WEIGHT_KG, ylim = range(nhanes$WEIGHT_KG), xlab = 'Age (months)', ylab = 'Weight (kg)', main = 'Female') ) with(subset(nhanes, GENDER == 'Male'), plot(jitter(MONTHS), WEIGHT_KG, ylim = range(nhanes$WEIGHT_KG), xlab = 'Age (months)', ylab = 'Weight (kg)', main = 'Male') ) par(oldPar) # ggplot2: # All in "one step," following a simple specification ggplot(data = nhanes, aes(x = MONTHS, y = WEIGHT_KG)) + geom_point(position = position_jitter(width = .2)) + facet_grid(~ GENDER) + xlab("Age (months)") + ylab("Weight (kg)") #### ggplot2: aes, geom, stat #### # Define the dataset and the shared aes # using the ggplot command, # and also use same ColorBrewer palette as last time p = ggplot(nhanes, aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) + scale_color_brewer(palette = "Set1") # Try plotting it? ## p # No layers yet, so there's nothing to plot... # Add a geom_point layer, making a scatterplot p + geom_point() # Add transparency, setting alpha-channel to 70%, # to help with overplotting p + geom_point(alpha = 0.5) # Could plot a line instead # (by default it connects the points, # although that is not appropriate here: # each point is a different person, # not the same one repeated over time) p + geom_line() # Instead of raw data lines using default (stat = "identity"), # plot quantile lines using a stat summary p + geom_line(stat = "quantile") # Note that the lines understand you're already grouping color by GENDER # Map the derived variable ..quantile.. to aes(linetype) # and change from default quantiles to 15%, 50%, 85% p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) # Overlay raw data points over the quantile lines p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) + geom_point() #### ggplot2: facet_grid, facet_wrap #### # Now let's try to mimic the WHO plots # of length-by-age quantiles, by gender: # plot LENGTH vs MONTHS, facet by GENDER, # geom = line, stat = quantile # Switch aes mapping # from x = LENGTH_CM, y = WEIGHT_KG # to x = MONTHS, y = LENGTH_CM # by adding a revised call to aes() p = p + aes(x = MONTHS, y = LENGTH_CM) # Scatterplot p + geom_point() # Add quantiles p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) + geom_point() # Facet by gender p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) + geom_point() + facet_wrap(~ GENDER) # We can also use facet_grid(rows ~ columns); # especially useful if want to facet on multiple variables p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) + geom_point() + facet_grid(~ GENDER) p + geom_line(stat = "quantile", quantiles = c(.15, .50, .85), mapping = aes(linetype = factor(..quantile..))) + geom_point() + facet_grid(GENDER ~ .) #### ggplot2: ALL AT ONCE #### # Can we juggle all 6 variables at once? # (not including ID, of course) names(nhanes) # We've seen these commands so far: # aes(x, y, color, size, linetype, alpha) # stat = 'quantile', stat = 'identity' # geom_point, geom_line # facet_grid, facet_wrap # scale_color_brewer # coord_cartesian, coord_polar # One possible attempt ggplot(nhanes) + aes(x = WEIGHT_KG, y = LENGTH_CM, color = GENDER, size = HEAD_CM) + facet_grid(RACETH ~ MONTHS) + scale_color_brewer(palette = 'Set1') + geom_point(alpha = .7) + geom_line(stat = 'quantile', quantiles = .5, formula = y ~ x) # We get some errors, likely due to too few points in some subplots # Above we map aes(size = HEAD_CM) and we set alpha=.7 # To clarify difference between setting and mapping, # contrast this with doing the reverse: # map aes(alpha = HEAD_CM) and set size=5 ggplot(nhanes) + aes(x = WEIGHT_KG, y = LENGTH_CM, color = GENDER, alpha = HEAD_CM) + facet_grid(RACETH ~ MONTHS) + scale_color_brewer(palette = 'Set1') + geom_point(size = 5) + geom_line(stat = 'quantile', quantiles = .5, formula = y ~ x) #### ggplot2: IMPORTANCE OF USING aes() #### # Show the difference between # color="green" # vs # aes(color="green") # Map levels of the data column GENDER to colors p + geom_point(aes(color = GENDER)) # Map a constant-valued variable with a level named "green" to one color p + geom_point(aes(color = 'green')) # Make all points green p + geom_point(color = 'green') # Make all points GENDER? # Can't do it -- no such variable outside your dataset ## p + geom_point(color = GENDER) #### ggplot2: A FEW MISC NOTES #### # Note that both ## ggplot(nhanes, aes(x = WEIGHT_KG, y = LENGTH_CM)) + geom_point() # and ## ggplot(nhanes) + aes(x = WEIGHT_KG, y = LENGTH_CM) + geom_point() # are equivalent, just using the syntax differently. # Note that setting xlim, ylim will actually clip the data, # so if you summarize (smooth, boxplot, etc) # it'll only use the data in the clipped region, # not compute on full dataset and then zoom in! # Wilkinson saw such zooming as "a form of lying with graphics"... # Example: add regression lines (smooth with method "lm"), # then zoom in and see the lines are based on cropped data: p = ggplot(nhanes, aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) + scale_color_brewer(palette = "Set1") p + geom_point() + geom_smooth(method = "lm") p + geom_point() + geom_smooth(method = "lm") + xlim(c(70, 90)) # If you really do just want to zoom, not crop the data, # use coord_cartesian(xlim) instead: p + geom_point() + geom_smooth(method = "lm") + coord_cartesian(xlim = c(70, 90)) # It's possible to add "margins" to facets, # i.e. plot the data combined over the facet-variables # as the last row or column: p + geom_point() + aes(shape = RACETH) + facet_grid(GENDER ~ RACETH, margins = TRUE) # Can we make something like Rafe Donahue's "you-are-here" plots? # Yes: make facets with your data plotted as geom_point, # but first include another geom_point layer # that DOES NOT have the facet variable. # Example: we'll facet by RACETH, # but underneath each facet we'll show all points (in grey), # while increasing size of the points that belong to that facet p + geom_point(data = subset(nhanes, select = -RACETH), color = "black") + facet_wrap( ~ RACETH) + geom_point(size = 3) # This works because any layer with the facet variable missing # will be plotted on every facet. # (I believe this was intended to make background maps easy to do). # Can ggplot2 do scale breaks on x-axis? # Yes: facet on a cut of the x-variable, then use free x-scale. # Say we want to plot state areas and pops with a scale break, # so that huge states don't force the rest into a small corner of plot: state.x77 = as.data.frame(state.x77) state.x77$areaCut = cut(state.area, c(0, 1e5, 1e6), labels = c("<=10k", ">10k")) p = ggplot(data = state.x77, aes(x = Area, y = Population)) + geom_point() # Basic scatterplot of area vs population p # Now facet by the cut of Area, with a free x-scale # (i.e. don't keep the same x-scale on both facets) p + facet_grid(. ~ areaCut, scales = "free_x")