# 02 Legible Graphics: R code
# Jerzy Wieczorek
# 9/3/15
# 36-721 Statistical Graphics and Visualization

# Set working directory to the lecture folder, if it exists on this machine.
# The path below is specific to one computer; guarding the call lets the
# script carry on from the current directory elsewhere (e.g. when nhanes.csv
# sits next to this script) instead of stopping with an error right here.
local({
  lecture_dir <- "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 2/"
  if (dir.exists(lecture_dir)) {
    setwd(lecture_dir)
  } else {
    message("Lecture directory not found; staying in ", getwd())
  }
})


#### READ DATA, MAKE A PLOT, GET HELP ####

# Read in the data.
# stringsAsFactors = TRUE is requested explicitly: since R 4.0 read.csv()
# keeps text columns as character vectors by default, but the rest of this
# script (the summary below, coloring by GENDER, levels(GENDER) in the
# legend) relies on GENDER and RACETH being factors.
nhanes <- read.csv("nhanes.csv", stringsAsFactors = TRUE)
# See a quick overview of the dataset:
# all columns and their content
summary(nhanes)
##        ID           GENDER        MONTHS                     RACETH  
##  Min.   :62207   Female:101   Min.   :0.000   Hispanic          :83  
##  1st Qu.:64577   Male  :108   1st Qu.:1.000   Non-Hispanic Black:53  
##  Median :66783                Median :3.000   Non-Hispanic White:73  
##  Mean   :67057                Mean   :2.909                          
##  3rd Qu.:69557                3rd Qu.:5.000                          
##  Max.   :71910                Max.   :6.000                          
##    WEIGHT_KG        LENGTH_CM       HEAD_CM     
##  Min.   : 3.600   Min.   :48.3   Min.   :34.60  
##  1st Qu.: 5.400   1st Qu.:58.5   1st Qu.:39.40  
##  Median : 6.800   Median :62.3   Median :41.50  
##  Mean   : 6.689   Mean   :62.3   Mean   :41.27  
##  3rd Qu.: 7.900   3rd Qu.:66.6   3rd Qu.:43.10  
##  Max.   :10.800   Max.   :86.6   Max.   :48.40
# Access a single column (with $) and take its mean
mean(nhanes$LENGTH_CM)
## [1] 62.29522
# Default scatterplot: first argument on the x-axis, second on the y-axis
plot(nhanes$WEIGHT_KG, nhanes$LENGTH_CM)

# Read documentation for plot:
?plot

# See some usage examples:
example(plot)

# See also the Quick-R website's sections on Basic Graphs and Advanced Graphs.

#### SAVE PLOTS ####

# Bitmap output: open a png device, draw, then close the device
# (the file is only complete once dev.off() has been called)
png(filename = "SimplePNG.png")
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()

# Same plot with a custom image size and a larger base font size
png(filename = "NicerPNG.png", pointsize = 16, width = 500, height = 500)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()

# Same again, now also choosing the size units and the resolution
png(filename = "NicestPNG.png", units = "in", width = 5, height = 5,
    res = 300, pointsize = 16)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()

# Vector output: open a pdf device, draw, then close the device
pdf(file = "SimplePDF.pdf")
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()

# pdf with a custom image size and base font size
pdf(file = "NicerPDF.pdf", pointsize = 12, width = 5.4, height = 5.4)
plot(x = nhanes$WEIGHT_KG, y = nhanes$LENGTH_CM)
dev.off()

# Vector images have no resolution to adjust
#### LABEL AND ANNOTATE PLOTS ####

# Replace the default axis labels with readable ones
# (xlab = X-axis LABel, ylab = Y-axis LABel)
with(nhanes, plot(WEIGHT_KG, LENGTH_CM,
                  ylab = "Length (cm)", xlab = "Weight (kg)"))

# Same plot, now with a main title on top
with(nhanes, plot(WEIGHT_KG, LENGTH_CM,
                  main = "Length vs weight for babies 0-6 months old",
                  ylab = "Length (cm)", xlab = "Weight (kg)"))

# Same plot, with tick labels oriented horizontally
# (las = LAbel Style)
with(nhanes, plot(WEIGHT_KG, LENGTH_CM,
                  las = 1,
                  main = "Length vs weight for babies 0-6 months old",
                  ylab = "Length (cm)", xlab = "Weight (kg)"))

# Color points by GENDER
# (col = COLor)
# and break main title into two lines
# (\n = newline)
#
# GENDER is converted to a factor explicitly before being used as a color:
# a factor is drawn using its integer codes (1, 2, ...), whereas a character
# column (what read.csv() returns by default since R 4.0) would be looked up
# as color names and fail. Calling factor() on a factor is harmless.
#
# Then add a legend to the existing plot:
# must specify plotting symbol to show points,
# or line type / width to show lines
# (pch = Plotting CHaracter,
#  lty = Line TYpe, lwd = Line WiDth)
# The legend colors are the factor codes, in the same order as the levels,
# so they match the point colors however many levels there are.
local({
  gender <- factor(nhanes$GENDER)
  plot(nhanes$WEIGHT_KG, nhanes$LENGTH_CM,
       xlab = "Weight (kg)", ylab = "Length (cm)",
       main = "Length vs weight for babies 0-6 months old\ncolored by gender",
       las = 1, col = gender)
  legend('topleft', legend = levels(gender),
         col = seq_along(levels(gender)), pch = 1)
})

# Taking manual control of tick locations and labels.
# Step 1: draw the plot with both axes suppressed
# (xaxt = X-AXis Type, yaxt = Y-AXis Type; "n" = none)
with(nhanes, plot(WEIGHT_KG, LENGTH_CM,
                  yaxt = "n", xaxt = "n", las = 1,
                  main = "Length vs weight for babies 0-6 months old",
                  ylab = "Length (cm)", xlab = "Weight (kg)"))
# Step 2: add x-axis ticks to the existing graph at chosen positions
# (side 1 is the x-axis)
axis(1, at = seq(4, 10, by = 2))
# Step 3: add y-axis ticks at chosen positions, with custom text labels
# (side 2 is the y-axis)
axis(2, labels = c("Short", "Tall"), at = c(50, 80))

#### CORE PLOT ELEMENTS FOR RAW OR PRE-SUMMARIZED DATA ####
# Points, lines, bars

# Start again from a plain scatterplot: length against age
with(nhanes, plot(MONTHS, LENGTH_CM,
                  ylab = "Length (cm)", xlab = "Age (mo)"))

# Compute means over time, to have something worth plotting as a line
MeanLengths <- tapply(nhanes$LENGTH_CM, nhanes$MONTHS, mean)
# Look at the mean values
MeanLengths
##        0        1        2        3        4        5        6 
## 54.93636 57.39667 60.94444 63.29643 65.18684 66.59333 69.22609
# The ages the means belong to are the names of MeanLengths.
# Taking the x positions from there (rather than typing 0:6) keeps x and y
# the same length and correctly aligned even if some age is absent from the data.
MeanMonths <- as.numeric(names(MeanLengths))
# Line plot
# (type = 'l' for lines, 'p' for points, 'b' for both, etc)
plot(MeanMonths, MeanLengths, type = 'l',
     xlab = 'Age (mo)', ylab = 'Mean length (cm)')

# Add lines to existing plot
plot(nhanes$MONTHS, nhanes$LENGTH_CM)
lines(MeanMonths, MeanLengths, type = 'l')

# Add points to existing plot; set y-axis limits so the raw data fit
# (ylim = Y-axis LIMits, xlim = X-axis LIMits)
plot(MeanMonths, MeanLengths, type = 'l', ylim = range(nhanes$LENGTH_CM))
points(nhanes$MONTHS, nhanes$LENGTH_CM)

# Bar graph of values that are already summarized (one bar per mean)
barplot(height = MeanLengths)

# Count the observations in each GENDER category
table(nhanes$GENDER)
## 
## Female   Male 
##    101    108
# Bar graph of those counts, tabulated inside the call
barplot(height = table(nhanes$GENDER))

#### CORE PLOT TYPES FOR STATISTICAL SUMMARIES ####
# Histogram, density, boxplot, regression line, loess line

# Histogram with the default choice of breaks
hist(x = nhanes$WEIGHT_KG)

# Ask for fewer, then more, breaks
hist(x = nhanes$WEIGHT_KG, breaks = 5)

hist(x = nhanes$WEIGHT_KG, breaks = 15)

# Density curve, using a kernel density estimate (KDE)
plot(density(nhanes$LENGTH_CM))

# Boxplot of a single variable
boxplot(nhanes$LENGTH_CM)

# Boxplot, conditioned: one box of LENGTH_CM per value of MONTHS
boxplot(nhanes$LENGTH_CM ~ nhanes$MONTHS)

# Overlay raw data on the existing boxplot:
# boxplot treats MONTHS as categorical, so the boxes sit at x = 1, 2, 3, ...
# (one position per distinct value, in sorted order), not at 0, 1, 2, ...
# Converting MONTHS to a factor and taking its integer codes gives exactly
# those positions. For months 0 to 6 this equals MONTHS + 1, but it also
# stays aligned with the boxes if some month is missing from the data.
points(as.integer(factor(nhanes$MONTHS)), nhanes$LENGTH_CM)

# Scatterplot with a fitted least-squares line drawn on top
# (lm = Linear Model; abline draws the line from its intercept and slope)
plot(x = nhanes$LENGTH_CM, y = nhanes$HEAD_CM)
abline(reg = lm(HEAD_CM ~ LENGTH_CM, data = nhanes))

# Scatterplot with a loess smoother line drawn on top
scatter.smooth(nhanes$LENGTH_CM, nhanes$HEAD_CM)

#### OTHER SIMPLE TRICKS ####

# Overplotting: many babies share the same whole-number age, so their points
# pile up in columns. jitter() adds a little random noise to spread them out.
plot(x = jitter(nhanes$MONTHS), y = nhanes$LENGTH_CM)

# The same plot without jitter, for comparison
plot(x = nhanes$MONTHS, y = nhanes$LENGTH_CM)

# Log scales:
# values spanning several orders of magnitude
# are hard to compare on a linear axis...
a <- 1:4
b <- 10^a
plot(x = a, y = b, las = 1)

# Option 1: transform the data and plot the logs
# (the y-axis tick labels are then the log values)
plot(x = a, y = log(b), las = 1)

# Option 2: keep the data and use a log-scaled axis
# (the y-axis tick labels stay on the original scale,
#  but their positions are spaced according to the log)
plot(x = a, y = b, las = 1, log = "y")