# Jerzy Wieczorek
# July 8, 2015
# Stat Bytes
# http://civilstat.com/datavis/StatBytesJune2015/

# We have a subset of the NHANES 2011-2012 data on demographics
# http://wwwn.cdc.gov/Nchs/Nhanes/Search/DataPage.aspx?Component=Demographics&CycleBeginYear=2011
# and on body measures
# http://wwwn.cdc.gov/Nchs/Nhanes/Search/DataPage.aspx?Component=Examination&CycleBeginYear=2011
# We've only kept selected variables,
# and only the complete-data records
# for infants aged 0-6 months and in 3 largest race/ethnic groups.

# Note that this is survey data, not a simple random sample.
# We cannot really make proper inferences
# without accounting for sample design, weights, etc...
# It's just a visualization exercise.


#### LOAD DATA ####

# Folder holding the .Rdata file; edit this one line for your machine.
# We build the full path with file.path() instead of calling setwd(),
# so running this script does not change the session's working directory.
data_dir <- "C:/repos/StatBytesJuly2015"
load(file.path(data_dir, "StatBytesJuly2015.Rdata"))
summary(nhanes)


#### COLOR CHOICE: Base R ####

# Scatterplot with two groups and a legend,
# using base R plotting
baseQualColors <- c("blue", "red")
# Indexing the palette by GENDER picks one color per observation.
# NOTE(review): the hard-coded legend labels assume GENDER indexes
# Male as 1 and Female as 2 -- confirm against summary(nhanes) above.
with(
  nhanes,
  plot(
    LENGTH_CM, WEIGHT_KG,
    col = baseQualColors[GENDER],
    pch = 20, cex = 2,
    main = "Weight vs length, by gender"
  )
)
legend(
  "bottomright",
  col = baseQualColors, pch = 20,
  legend = c("Male", "Female")
)


#### COLOR CHOICE: Color Brewer, Qualitative ####

# http://colorbrewer2.org/
library(RColorBrewer)
display.brewer.all()

# Choose a qualitative color palette with blue and red
display.brewer.pal(2, "Set1")
# Warning tells us we need to request 3+ color levels;
# Just save the first two levels: first blue, then red
cbQualColors <- brewer.pal(3, "Set1")[c(2, 1)]
cbQualColors # saved as character strings of hex values

# Redo the plot with these new colors
with(
  nhanes,
  plot(
    LENGTH_CM, WEIGHT_KG,
    col = cbQualColors[GENDER],
    pch = 20, cex = 2,
    main = "Weight vs length, by gender"
  )
)
legend(
  "bottomright",
  col = cbQualColors, pch = 20,
  legend = c("Male", "Female")
)


#### COLOR CHOICE: Color Brewer, Sequential ####

# Now color by age, not gender.
# Use a sequential color scheme for the 7 MONTHS values;
# first few are too light, so request more colors and only use later ones
display.brewer.pal(9, "YlGn")
cbSeqColors <- brewer.pal(9, "YlGn")[3:9]
# The legend below labels the colors 0:6, so MONTHS + 1 turns
# month 0 into palette position 1, month 6 into position 7.
with(
  nhanes,
  plot(
    LENGTH_CM, WEIGHT_KG,
    col = cbSeqColors[MONTHS + 1],
    pch = 20, cex = 2,
    main = "Weight vs length, by age (months)"
  )
)
legend(
  "bottomright",
  col = cbSeqColors, pch = 20,
  legend = 0:6
)


#### DIRECT LABELS ####

library(directlabels)
# (Use lattice or ggplot2; directlabels doesn't work with base R plots)
library(ggplot2)

# Same scatterplot as before, now built with ggplot2.
# (Written with ggplot() + geom_point() rather than qplot(),
# which is deprecated in current ggplot2 releases.)
p <- ggplot(nhanes, aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) +
  geom_point() +
  scale_colour_manual(values = cbQualColors) +
  ggtitle("Weight vs length, by gender")
p
direct.label(p)


#### ggplot2: aes, geom, stat ####

p <- ggplot(nhanes, aes(x = LENGTH_CM, y = WEIGHT_KG, color = GENDER)) +
  scale_color_manual(values = cbQualColors)
p # No layers yet, so nothing to plot...

# Add a geom_point layer,
# making the same scatterplot as above
p + geom_point()

# Could plot a line instead
# (by default it connects the points,
# although that is not appropriate here:
# each point is a different person,
# not the same one repeated over time)
p + geom_line()

# Of course we can direct.label() these plots
# (but we'll skip this step for the rest of the talk)
direct.label(p + geom_point())
direct.label(p + geom_line())

# Instead of raw data lines using default (stat = "identity"),
# plot quantile lines using a stat summary
p + geom_line(stat = "quantile")
# Note that the lines understand you're grouping color by GENDER

# Map the derived variable ..quantile..
# to aes(linetype)
# and change from default quantiles to 15%, 50%, 85%
p +
  geom_line(
    aes(linetype = factor(..quantile..)),
    stat = "quantile",
    quantiles = c(0.15, 0.50, 0.85)
  )

# Same quantile lines, with the raw data points drawn on top
p +
  geom_line(
    aes(linetype = factor(..quantile..)),
    stat = "quantile",
    quantiles = c(0.15, 0.50, 0.85)
  ) +
  geom_point()


#### ggplot2: facet_grid, facet_wrap ####

# Mayo Clinic book plots (more or less):
#   x/y:   LENGTH vs WEIGHT
#   color: GENDER
#   facet: MONTHS
#   geom = point, stat = identity
# Three facet layouts of the same scatterplot:
p + geom_point() + facet_grid(~ MONTHS)    # one row of panels
p + geom_point() + facet_grid(MONTHS ~ .)  # one column of panels
p + geom_point() + facet_wrap(~ MONTHS)    # panels wrapped into a grid

# WHO plots:
#   x/y:   LENGTH vs MONTHS
#   facet: GENDER
#   geom = line, stat = quantile
# Adding a new aes(...) to p overrides its earlier x and y mappings
q <- p + aes(x = MONTHS, y = LENGTH_CM)

# Scatterplot
q + geom_point()

# Add quantiles
q +
  geom_line(
    aes(linetype = factor(..quantile..)),
    stat = "quantile",
    quantiles = c(0.15, 0.50, 0.85)
  ) +
  geom_point()

# Facet by gender
q +
  geom_line(
    aes(linetype = factor(..quantile..)),
    stat = "quantile",
    quantiles = c(0.15, 0.50, 0.85)
  ) +
  geom_point() +
  facet_wrap(~ GENDER)


#### ggplot2: all at once! ####

# Can we juggle all 6 variables at once?
# We've seen these commands so far:
#   aes(x, y, color, size, linetype, alpha)
#   facet_grid, facet_wrap
#   geom_point, geom_line
#   stat = 'quantile', stat = 'identity'

# Audience suggestion
ggplot(nhanes) +
  aes(
    x = WEIGHT_KG, y = LENGTH_CM,
    color = GENDER, size = HEAD_CM, shape = RACETH
  ) +
  facet_grid(~ MONTHS) +
  scale_color_manual(values = cbQualColors) +
  geom_point(alpha = 0.7)

# Jerzy's attempt
ggplot(nhanes) +
  aes(
    x = WEIGHT_KG, y = LENGTH_CM,
    color = GENDER, size = HEAD_CM
  ) +
  facet_grid(RACETH ~ MONTHS) +
  scale_color_manual(values = cbQualColors) +
  geom_point(alpha = 0.7) +
  geom_line(stat = "quantile", quantiles = 0.5)

# Above we map aes(size = HEAD_CM) and we fix alpha=.7
# Contrast this with doing the reverse:
# map aes(alpha = HEAD_CM) and fix size=5
ggplot(nhanes) +
  aes(
    x = WEIGHT_KG, y = LENGTH_CM,
    color = GENDER, alpha = HEAD_CM
  ) +
  facet_grid(RACETH ~ MONTHS) +
  scale_color_manual(values = cbQualColors) +
  geom_point(size = 5) +
  geom_line(stat = "quantile", quantiles = 0.5)


#### ggplot2: importance of using aes() ####

# Show the difference between
#   color="green"
# vs
#   aes(color="green")

# Map levels of the data column GENDER to colors
p + geom_point(aes(color = GENDER))

# Map a constant-valued variable with a level named "green" to one color
p + geom_point(aes(color = "green"))

# Make all points green
p + geom_point(color = "green")

# Make all points GENDER?
# Can't do it -- no such variable outside your dataset.
# This call is expected to fail; try() still shows the error message
# but lets the rest of the script run when it is sourced in one go.
try(p + geom_point(color = GENDER))

# Also, note that both
ggplot(nhanes, aes(x = WEIGHT_KG, y = LENGTH_CM)) + geom_point()
# and
ggplot(nhanes) + aes(x = WEIGHT_KG, y = LENGTH_CM) + geom_point()
# are equivalent, just using the syntax differently.