# 02 Legible Graphics: R code
# Jerzy Wieczorek
# 9/3/15
# 36-721 Statistical Graphics and Visualization
# Set working directory
# (machine-specific path: change it to the folder that contains nhanes.csv)
setwd("/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 2/")
#### READ DATA, MAKE A PLOT, GET HELP ####
# Read in the data
# (stringsAsFactors = TRUE keeps text columns such as GENDER and RACETH
# as factors; since R 4.0.0 read.csv() reads them as character by default,
# which breaks the factor-style summary() output and the factor-based
# coloring and legend code further down)
nhanes = read.csv("nhanes.csv", stringsAsFactors = TRUE)
# Quick overview of the dataset:
# one summary per column
summary(object = nhanes)
## ID GENDER MONTHS RACETH
## Min. :62207 Female:101 Min. :0.000 Hispanic :83
## 1st Qu.:64577 Male :108 1st Qu.:1.000 Non-Hispanic Black:53
## Median :66783 Median :3.000 Non-Hispanic White:73
## Mean :67057 Mean :2.909
## 3rd Qu.:69557 3rd Qu.:5.000
## Max. :71910 Max. :6.000
## WEIGHT_KG LENGTH_CM HEAD_CM
## Min. : 3.600 Min. :48.3 Min. :34.60
## 1st Qu.: 5.400 1st Qu.:58.5 1st Qu.:39.40
## Median : 6.800 Median :62.3 Median :41.50
## Mean : 6.689 Mean :62.3 Mean :41.27
## 3rd Qu.: 7.900 3rd Qu.:66.6 3rd Qu.:43.10
## Max. :10.800 Max. :86.6 Max. :48.40
# Pull out one column with $ and average it
mean(x = nhanes$LENGTH_CM)
## [1] 62.29522
# Scatterplot with every setting left at its default
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
# Open the help page for plot:
help("plot")
# Run the usage examples from that help page:
example(topic = plot)
# See also the Quick-R website's sections on Basic Graphs and Advanced Graphs.
#### SAVE PLOTS ####
# Bitmap output: open a png device, draw the plot, close the device
png(filename = "SimplePNG.png")
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()
# Set the png image size (pixels are the default unit) and the font size
png(filename = "NicerPNG.png", width = 500, height = 500, pointsize = 16)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()
# Give the png size in inches and set its resolution as well
png(
  filename = "NicestPNG.png",
  width = 5, height = 5, units = "in",
  pointsize = 16, res = 300
)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()
# Vector output: same open / draw / close pattern with a pdf device
pdf(file = "SimplePDF.pdf")
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()
# Set the pdf image size and font size
pdf(file = "NicerPDF.pdf", width = 5.4, height = 5.4, pointsize = 12)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()
# Vector images have no resolution to adjust
#### LABEL AND ANNOTATE PLOTS ####
# Replace the default x-axis and y-axis labels
plot(
  x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM,
  xlab = "Weight (kg)", ylab = "Length (cm)"
)
# Same plot, now with a main title
plot(
  x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM,
  xlab = "Weight (kg)", ylab = "Length (cm)",
  main = "Length vs weight for babies 0-6 months old"
)
# Same plot, with tick labels oriented horizontally
# (las = LAbel Style)
plot(
  x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM,
  xlab = "Weight (kg)", ylab = "Length (cm)",
  main = "Length vs weight for babies 0-6 months old",
  las = 1
)
# Color points by GENDER
# (col = COLor)
# and break main title into two lines
# (\n = newline)
# GENDER is converted to a factor explicitly: read.csv() in R >= 4.0.0 reads
# text columns as character by default, and character values such as
# "Female" are neither valid color names nor do they have levels().
# The factor's integer codes (1, 2, ...) pick colors from the palette.
gender_factor = as.factor(nhanes$GENDER)
plot(nhanes$WEIGHT_KG, nhanes$LENGTH_CM,
     xlab = "Weight (kg)", ylab = "Length (cm)",
     main = "Length vs weight for babies 0-6 months old\ncolored by gender",
     las = 1, col = as.integer(gender_factor))
# Add a legend to existing plot:
# must specify plotting symbol to show points,
# or line type / width to show lines
# (pch = Plotting CHaracter,
# lty = Line TYpe, lwd = Line WiDth)
# Legend colors are the same integer codes used for the points,
# one per factor level, so labels and colors always line up.
legend('topleft', legend = levels(gender_factor),
       col = seq_along(levels(gender_factor)), pch = 1)
# Full manual control of tick positions and tick labels:
# first draw the plot with both axes suppressed
# (xaxt = X-AXis Type, yaxt = Y-AXis Type)
plot(
  x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM,
  xlab = "Weight (kg)", ylab = "Length (cm)",
  main = "Length vs weight for babies 0-6 months old",
  las = 1, xaxt = "n", yaxt = "n"
)
# Then draw the x-axis with ticks at chosen positions
# (side = 1 for x-axis)
axis(side = 1, at = seq(from = 4, to = 10, by = 2))
# And draw the y-axis with custom text at chosen positions
# (side = 2 for y-axis)
axis(side = 2, at = c(50, 80), labels = c("Short", "Tall"))
#### CORE PLOT ELEMENTS FOR RAW OR PRE-SUMMARIZED DATA ####
# Points, lines, bars
# The basic scatterplot once more, now length against age
plot(
  x = nhanes$MONTHS, y = nhanes$LENGTH_CM,
  xlab = "Age (mo)", ylab = "Length (cm)"
)
# Average the lengths within each month of age,
# so there is something worth drawing as a line
MeanLengths = tapply(X = nhanes$LENGTH_CM, INDEX = nhanes$MONTHS, FUN = mean)
# Print the monthly means
MeanLengths
## 0 1 2 3 4 5 6
## 54.93636 57.39667 60.94444 63.29643 65.18684 66.59333 69.22609
# Line plot of the means
# (type = 'l' for lines, 'p' for points, 'b' for both, etc)
plot(
  x = 0:6, y = MeanLengths, type = "l",
  xlab = "Age (mo)", ylab = "Mean length (cm)"
)
# Scatterplot first, then draw the line of means on top of it
plot(x = nhanes$MONTHS, y = nhanes$LENGTH_CM)
lines(x = 0:6, y = MeanLengths, type = "l")
# Line first, then draw the raw points on top of it;
# the y-axis limits are widened to cover the raw data
# (ylim = Y-axis LIMits, xlim = X-axis LIMits)
plot(x = 0:6, y = MeanLengths, type = "l", ylim = range(nhanes$LENGTH_CM))
points(x = nhanes$MONTHS, y = nhanes$LENGTH_CM)
# Bar graph of values that are already summarized
barplot(height = MeanLengths)
# Count the observations per category, as input for a bar graph
table(nhanes$GENDER)
##
## Female Male
## 101 108
# Bar graph of those counts, computed inside the call
barplot(height = table(nhanes$GENDER))
#### CORE PLOT TYPES FOR STATISTICAL SUMMARIES ####
# Histogram, density, boxplot, regression line, loess line
# Histogram with the default binning
hist(x = nhanes$WEIGHT_KG)
# Ask for fewer or more breaks
hist(x = nhanes$WEIGHT_KG, breaks = 5)
hist(x = nhanes$WEIGHT_KG, breaks = 15)
# Density curve from a kernel density estimate (KDE)
plot(x = density(x = nhanes$LENGTH_CM))
# Single boxplot of all lengths
boxplot(x = nhanes$LENGTH_CM)
# One boxplot per month of age (formula interface)
boxplot(nhanes$LENGTH_CM ~ nhanes$MONTHS)
# Draw the raw data on top of the existing boxplots:
# (boxplot treats MONTHS as categorical,
# and the boxes sit at x positions counted from 1 not 0,
# so MONTHS is shifted by 1 to line the points up with the boxes)
points(x = nhanes$MONTHS + 1, y = nhanes$LENGTH_CM)
# Scatterplot with a fitted linear regression line
plot(x = nhanes$LENGTH_CM, y = nhanes$HEAD_CM)
abline(reg = lm(formula = nhanes$HEAD_CM ~ nhanes$LENGTH_CM))
# Scatterplot with a loess smoother line
scatter.smooth(x = nhanes$LENGTH_CM, y = nhanes$HEAD_CM)
#### OTHER SIMPLE TRICKS ####
# Add random jitter to the x values (a remedy for overplotting)
plot(x = jitter(nhanes$MONTHS), y = nhanes$LENGTH_CM)
# The same plot without jitter, for comparison
plot(x = nhanes$MONTHS, y = nhanes$LENGTH_CM)
# Log scales:
# values that differ by orders of magnitude
# are hard to compare on a linear axis...
a = 1:4
b = 10^a
plot(x = a, y = b, las = 1)
# Option 1: transform the data and plot the logs
# (the y-axis tick labels are then the log values)
plot(x = a, y = log(b), las = 1)
# Option 2: keep the data and ask for a log-scaled axis
# (the y-axis tick labels stay on the original scale,
# but their positions are stretched by the log scaling)
plot(x = a, y = b, log = "y", las = 1)