# 10 Mapping: R code
# Jerzy Wieczorek
# 10/1/15
# 36-721 Statistical Graphics and Visualization
# Set working directory
indir = "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10"
#indir = "D:/Dropbox/CMU/36-721 Dataviz F'15/Lecture 10"
setwd(indir)
# R mapping packages you may need to install, if you don't have them:
# install.packages(c("rgdal", "sp", "maptools", "rgeos", gpclib", ggmap"))
# Also, classInt for computing Jenks class intervals if you want them:
# install.packages("classInt")
# Load R packages
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.0-7, (SVN revision 559)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 1.7.3, released 2010/11/10
## Path to GDAL shared files: /usr/share/gdal/1.7
## GDAL does not use iconv for recoding strings.
## Loaded PROJ.4 runtime: Rel. 4.7.1, 23 September 2009, [PJ_VERSION: 470]
## Path to PROJ.4 shared files: (autodetected)
## Linking to sp version: 1.2-0
library(ggplot2)
library(ggmap)
library(RColorBrewer)
library(classInt)
#### LOAD AND PLOT SHAPEFILES ####
# Load the shapefiles for all 2010 Census tracts in Montana (MT).
# Tell it the directory where the shapefiles are,
# so it can find all the various pieces (.shp, .dbf, etc)
tractSPDF = readOGR(indir, "tl_2010_30_tract10")
## OGR data source with driver: ESRI Shapefile
## Source: "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10", layer: "tl_2010_30_tract10"
## with 271 features
## It has 12 fields
# Plot the tract boundaries
plot(tractSPDF)
# Get a summary of the object's properties
summary(tractSPDF)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
## min max
## x -116.05000 -104.03914
## y 44.35821 49.00139
## Is projected: FALSE
## proj4string :
## [+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs +towgs84=0,0,0]
## Data attributes:
## STATEFP10 COUNTYFP10 TRACTCE10 GEOID10 NAME10
## 30:271 111 : 32 000100 : 32 30001000100: 1 1 : 32
## 013 : 22 000200 : 20 30001000200: 1 2 : 20
## 031 : 22 000300 : 18 30001000300: 1 3 : 18
## 063 : 20 000400 : 9 30003000100: 1 4 : 9
## 029 : 19 000500 : 8 30003940400: 1 5 : 8
## 049 : 14 000800 : 8 30003940500: 1 8 : 8
## (Other):142 (Other):176 (Other) :265 (Other):176
## NAMELSAD10 MTFCC10 FUNCSTAT10 ALAND10
## Census Tract 1: 32 G5020:271 S:271 Min. :3.602e+05
## Census Tract 2: 20 1st Qu.:8.582e+06
## Census Tract 3: 18 Median :2.228e+08
## Census Tract 4: 9 Mean :1.391e+09
## Census Tract 5: 8 3rd Qu.:2.174e+09
## Census Tract 8: 8 Max. :1.331e+10
## (Other) :176
## AWATER10 INTPTLAT10 INTPTLON10
## Min. : 0 +44.8573177: 1 -104.1114017: 1
## 1st Qu.: 25880 +45.0226823: 1 -104.2288430: 1
## Median : 1547557 +45.1014235: 1 -104.2744656: 1
## Mean : 14277494 +45.1045153: 1 -104.3281098: 1
## 3rd Qu.: 10696537 +45.1523621: 1 -104.3292116: 1
## Max. :445782449 +45.1526102: 1 -104.4057182: 1
## (Other) :265 (Other) :265
# This is a new-to-us kind of object:
# a SpatialPolygonsDataFrame.
# To access just the data frame part, with one row per tract,
# access the object's data slot using tractSPDF@data
# (documentation calls this the "attribute table")
names(tractSPDF@data)
## [1] "STATEFP10" "COUNTYFP10" "TRACTCE10" "GEOID10" "NAME10"
## [6] "NAMELSAD10" "MTFCC10" "FUNCSTAT10" "ALAND10" "AWATER10"
## [11] "INTPTLAT10" "INTPTLON10"
# For example, plot the longitude and latitude of each tract centroid.
# The lon and lat were stored as factors,
# so first read them as characters and then turn them into numbers,
# and resave these numeric versions of each variable:
tractSPDF@data$lonCenter = as.numeric(as.character(tractSPDF@data$INTPTLON10))
tractSPDF@data$latCenter = as.numeric(as.character(tractSPDF@data$INTPTLAT10))
# Then plot like a usual scatterplot:
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
xlab = "Longitude", ylab = "Latitude", pch = '.')
# If we had another variable of interest,
# we could use cex to vary the symbol size or col to vary symbol color
# and just plot symbols at the centroid of each tract.
# This data set includes ALAND10, the land area of each tract,
# so let's plot that, rescaled by its max value:
maxArea = max(tractSPDF@data$ALAND10)
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
xlab = "Longitude", ylab = "Latitude",
cex = tractSPDF@data$ALAND10 / maxArea * 5)
# Or we can plot the tracts as a choropleth / thematic map,
# colored by land area.
# Use the spplot function.
# Let's also keep it clean by not drawing borders:
# use col = NA (here col is color of border, not of fill)
spplot(tractSPDF, "ALAND10", col = NA)
#### MERGE IN OTHER DATA ####
# But we are interested in plotting the Census mail back rate,
# so let's merge it into our attribute table.
# Load it first:
load("censusMT.Rdata")
# Let's rename it to censusDF (for Census Data Frame)
# so that this code can be reused with different states:
censusDF = censusMT
rm(censusMT)
# Use county and tract numbers to match up rows of the two data frames.
# Again, in tractSPDF, apparently they loaded as a factor,
# so we'll have to convert to integers via character:
summary(as.integer(as.character(tractSPDF@data$TRACTCE10)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 300 800 163400 10350 980600
# Compare to the tract numbers in censusDF:
summary(censusDF$Tract)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100 300 800 163400 10350 980600
# Looks compatible.
# However, the same tract number can repeat in different counties,
# so we need to merge on county number as well:
summary(as.integer(as.character(tractSPDF@data$COUNTYFP10)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 29.00 49.00 55.11 84.00 111.00
summary(censusDF$County)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 29.00 49.00 55.11 84.00 111.00
# In each dataset, we need compatible versions
# of tract and county number, for merging the datasets.
# Transform the tract and county IDs in tractSPDF
# to character and then to integer,
# and save them as new variables Tract and County:
tractSPDF@data$County = as.integer(as.character(tractSPDF@data$COUNTYFP10))
tractSPDF@data$Tract = as.integer(as.character(tractSPDF@data$TRACTCE10))
# Next, create a single ID variable for each tract,
# by combining the county and tract numbers:
tractSPDF@data$tractID = paste(tractSPDF@data$County, tractSPDF@data$Tract, sep = ".")
censusDF$tractID = paste(censusDF$County, censusDF$Tract, sep = ".")
# Check that the region variables seem compatible:
all.equal(sort(tractSPDF@data$tractID), sort(censusDF$tractID))
## [1] TRUE
# Looks good.
# To keep the merge clean, we can also delete Tract and County from one of the datasets,
# so we don't end up with two copies.
censusDF$County = NULL
censusDF$Tract = NULL
# Finally, merge the censusDF variables into tractSPDF.
# When merging a SpatialPolygonsDataFrame with a regular data frame,
# put the spatial one first:
tractSPDFmerged = merge(tractSPDF, censusDF, by = "tractID")
names(tractSPDFmerged)
## [1] "tractID" "STATEFP10"
## [3] "COUNTYFP10" "TRACTCE10"
## [5] "GEOID10" "NAME10"
## [7] "NAMELSAD10" "MTFCC10"
## [9] "FUNCSTAT10" "ALAND10"
## [11] "AWATER10" "INTPTLAT10"
## [13] "INTPTLON10" "lonCenter"
## [15] "latCenter" "County"
## [17] "Tract" "State"
## [19] "State_name" "County_name"
## [21] "LAND_AREA" "Tot_Population_ACS_09_13"
## [23] "Med_HHD_Inc_ACS_09_13" "Med_House_value_ACS_09_13"
## [25] "Mail_Return_Rate_CEN_2010" "pct_Males_ACS_09_13"
## [27] "pct_Pop_5_17_ACS_09_13" "pct_Pop_18_24_ACS_09_13"
## [29] "pct_Pop_25_44_ACS_09_13" "pct_Pop_45_64_ACS_09_13"
## [31] "pct_Pop_65plus_ACS_09_13" "pct_Hispanic_ACS_09_13"
## [33] "pct_NH_White_alone_ACS_09_13" "pct_NH_Blk_alone_ACS_09_13"
## [35] "pct_Not_HS_Grad_ACS_09_13" "pct_College_ACS_09_13"
## [37] "pct_Prs_Blw_Pov_Lev_ACS_09_13" "pct_Diff_HU_1yr_Ago_ACS_09_13"
## [39] "pct_MrdCple_HHD_ACS_09_13" "pct_Female_No_HB_ACS_09_13"
## [41] "pct_Sngl_Prns_HHD_ACS_09_13" "avg_Tot_Prns_in_HHD_ACS_09_13"
## [43] "pct_Rel_Under_6_ACS_09_13" "pct_Vacant_Units_ACS_09_13"
## [45] "pct_Renter_Occp_HU_ACS_09_13" "pct_Single_Unit_ACS_09_13"
# Success! Now we can plot the Census records for each tract.
#### PLOT COMBINED DATA ####
# Summarize the Census mail back rates in Montana tracts
summary(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 16.70 77.50 80.40 80.18 82.90 100.00 18
# There are 18 NAs, so some of our tracts had no data
# and will show up as blank on a map.
# These tracts seem to correspond to American Indian reservations.
# Plot the Census mail back rate
# as sizes of circles located at tract centroids:
plot(tractSPDFmerged@data$lonCenter, tractSPDFmerged@data$latCenter,
xlab = "Longitude", ylab = "Latitude",
cex = tractSPDFmerged@data$Mail_Return_Rate_CEN_2010 / 100 * 2)
# Not very informative: the rates don't vary much,
# so most of the circles look about the same size.
# Plot the tracts as a choropleth / thematic map,
# colored by Census mail back rate.
spplot(tractSPDFmerged, "Mail_Return_Rate_CEN_2010", col = NA)
#### MAP PROJECTIONS ####
# So far we've been plotting x = longitude, y = latitude,
# but often it helps to use a map projection,
# especially for larger areas (like entire continents).
# Use an Albers Equal Area conic projection,
# as used for many US government maps.
# Here are suggested parameters for a Pennsylvania map:
# http://www.pasda.psu.edu/tutorials/arcgis/projection.asp
# where our standard parallels are lat_1 and lat_2,
# latitude of origin is lat_0,
# longitude of central meridian is lon_0,
# and so on as defined here:
# https://trac.osgeo.org/proj/wiki/GenParms
# For Albers, we want the standard parallels set to
# just below and just above Montana's southern and northern edges;
# longitude of central meridian to be in the middle of Montana;
# and other parameters are not critical:
# http://forums.esri.com/Thread.asp?c=93&f=984&t=258585
aea.proj = "+proj=aea +lat_1=44 +lat_2=49 +lat_0=0 +lon_0=-110 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m"
tractSPDFprojected = spTransform(tractSPDFmerged, CRS(aea.proj))
# Using plot() to show centroid lon/lat values, we'll see no difference.
# But using spplot(), we will see a slight difference after projecting:
spplot(tractSPDFprojected, "Mail_Return_Rate_CEN_2010", col = NA)
# Notice that the flat horizontal lines at top & bottom of MT
# are now slightly curved. The projection worked :)
#### SETTING COLORS ####
# Use cut() for equal intervals,
# quantile() for equal quantiles,
# and classInt(..., style = "jenks") for Jenks
# Looks like almost all the response rates are between 70% and 90%
hist(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)
# Compute equal interval and quantile breaks,
# though for equal intervals let's start at 0-60 and then go up by 10s
tractSPDFmerged@data$mailRrEqualIntervals = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
breaks = c(0, 10*(6:10)))
tractSPDFmerged@data$mailRrEqualQuantiles = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
breaks = c(0, quantile(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
probs = seq(.2, 1, .2),
na.rm = TRUE)))
# Jenks intervals: takes a VERY long time if you have many records,
# so can compute the breaks on a smaller random subsample of the tracts
## jenksBreaks = classIntervals(sample(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010, 200),
## n = 5, style = "jenks")
# But in Montana we only have 300ish tracts, not too many:
jenksBreaks = classIntervals(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
n = 5, style = "jenks")
## Warning in classIntervals(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010, :
## var has missing values, omitted in finding classes
# Manually set the highest and lowest breaks to 100 and 0,
# to cover full range of the data
jenksBreaks$brks[c(1, 6)] = c(0, 100)
jenksBreaks$brks
## [1] 0.0 16.7 74.3 79.3 83.9 100.0
tractSPDFmerged@data$mailRrJenks = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
breaks = jenksBreaks$brks)
# Plot and compare the 3 styles of intervals
spplot(tractSPDFmerged, "mailRrEqualIntervals", col = NA,
main = "2010 Census mail back rates by Equal Intervals")
spplot(tractSPDFmerged, "mailRrEqualQuantiles", col = NA,
main = "2010 Census mail back rates by Equal Quantiles")
spplot(tractSPDFmerged, "mailRrJenks", col = NA,
main = "2010 Census mail back rates by Jenks cuts")
# To use a different color scale, assign it with col.regions
spplot(tractSPDFmerged, "mailRrJenks", col = NA,
col.regions = brewer.pal(5, "Blues"))
#### IN ggplot2 AND ggmap ####
# For ggplot to use shapefile data, we have to fortify it:
# turn it into the kind of data frame that ggplot understands.
# We don't need to use the projected version of the data,
# since we will have to project within ggplot separately anyway.
# Now use the fortify function,
# telling it that tractID is the variable name of
# the unique identifier of each region (tract)
tractSPDFgg = fortify(tractSPDFmerged, region = "tractID")
# Now it contains the map borders, but lost the other variables
head(tractSPDFgg)
## long lat order hole piece group id
## 1 -113.4366 45.83811 1 FALSE 1 1.100.1 1.100
## 2 -113.4363 45.83804 2 FALSE 1 1.100.1 1.100
## 3 -113.4363 45.83800 3 FALSE 1 1.100.1 1.100
## 4 -113.4362 45.83794 4 FALSE 1 1.100.1 1.100
## 5 -113.4362 45.83780 5 FALSE 1 1.100.1 1.100
## 6 -113.4361 45.83749 6 FALSE 1 1.100.1 1.100
# Also note that it renamed tractID to "id".
# So let's merge the other variables back in:
tractSPDFgg = merge(tractSPDFgg, tractSPDFmerged@data,
by.x = "id", by.y = "tractID")
names(tractSPDFgg)
## [1] "id" "long"
## [3] "lat" "order"
## [5] "hole" "piece"
## [7] "group" "STATEFP10"
## [9] "COUNTYFP10" "TRACTCE10"
## [11] "GEOID10" "NAME10"
## [13] "NAMELSAD10" "MTFCC10"
## [15] "FUNCSTAT10" "ALAND10"
## [17] "AWATER10" "INTPTLAT10"
## [19] "INTPTLON10" "lonCenter"
## [21] "latCenter" "County"
## [23] "Tract" "State"
## [25] "State_name" "County_name"
## [27] "LAND_AREA" "Tot_Population_ACS_09_13"
## [29] "Med_HHD_Inc_ACS_09_13" "Med_House_value_ACS_09_13"
## [31] "Mail_Return_Rate_CEN_2010" "pct_Males_ACS_09_13"
## [33] "pct_Pop_5_17_ACS_09_13" "pct_Pop_18_24_ACS_09_13"
## [35] "pct_Pop_25_44_ACS_09_13" "pct_Pop_45_64_ACS_09_13"
## [37] "pct_Pop_65plus_ACS_09_13" "pct_Hispanic_ACS_09_13"
## [39] "pct_NH_White_alone_ACS_09_13" "pct_NH_Blk_alone_ACS_09_13"
## [41] "pct_Not_HS_Grad_ACS_09_13" "pct_College_ACS_09_13"
## [43] "pct_Prs_Blw_Pov_Lev_ACS_09_13" "pct_Diff_HU_1yr_Ago_ACS_09_13"
## [45] "pct_MrdCple_HHD_ACS_09_13" "pct_Female_No_HB_ACS_09_13"
## [47] "pct_Sngl_Prns_HHD_ACS_09_13" "avg_Tot_Prns_in_HHD_ACS_09_13"
## [49] "pct_Rel_Under_6_ACS_09_13" "pct_Vacant_Units_ACS_09_13"
## [51] "pct_Renter_Occp_HU_ACS_09_13" "pct_Single_Unit_ACS_09_13"
## [53] "mailRrEqualIntervals" "mailRrEqualQuantiles"
## [55] "mailRrJenks"
# Looks good.
# Finally, create a map.
# First, use coord_equal to ensure equal scales on x and y axes:
map1 = ggplot(tractSPDFgg, aes(long, lat, group = group,
fill = Mail_Return_Rate_CEN_2010)) +
geom_polygon() + coord_equal()
map1
# Note that the NAs are plotted as grey tracts.
# Now, use coord_map to specify a map projection instead:
# in ggplot, projections are specified as in
# the mapproject function in the mapproj package.
# Again, use Albers Equal Area conic projection,
# with parameters lat0 and lat1 for the two standard parallels
# between which we want minimal distortion:
map1 + coord_map("albers", lat0 = 44, lat1 = 49)
# Looks a bit nicer than the coord_equal version above.
# Plot a spatial scatterplot of the tract centroids in ggplot2
map2 = ggplot(tractSPDFmerged@data, aes(lonCenter, latCenter,
col = Mail_Return_Rate_CEN_2010,
size = ALAND10)) +
geom_point() + coord_equal()
map2
# Redo with Albers projection
map2 + coord_map("albers", lat0 = 40, lat1 = 42)
# Overlay the tract centroids on top of a Google Maps basemap using ggmap.
# Note that we can't choose a projection with ggmap;
# we must use the same Web Mercator projection as the basemaps.
# First get a Google Map basemap centered on Montana.
# I tried a few zooms, using
## qmap("montana", zoom = 6)
# and so on, and 6 fits well.
# (low zoom = far away, high zoom = close up)
MTmap = get_map('montana', zoom = 6)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=montana&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=montana&sensor=false
# Plot the basemap with ggmap(),
# then add points as with usual ggplot code
map3 = ggmap(MTmap) +
geom_point(aes(lonCenter, latCenter,
col = Mail_Return_Rate_CEN_2010, size = ALAND10),
data = tractSPDFmerged@data)
map3
# The basemap overwhelms the data.
# Retry with a lighter basemap,
# using the darken argument to add a transparent white rectangle over top of it
map4 = ggmap(MTmap, darken = c(.5, "white")) +
geom_point(aes(lonCenter, latCenter,
col = Mail_Return_Rate_CEN_2010, size = ALAND10),
data = tractSPDFmerged@data)
map4
# Change color scale so that higher rates are darker red.
# (I chose the low and high colors using colorbrewer2.org)
map4 + scale_color_continuous(low = "#fee8c8", high = "#e34a33")
# Finally, let's get fancy:
# Instead of plotting continuous response rates,
# use our precomputed Jenks cuts.
# Let's add a discrete ColorBrewer scale,
# remind it to plot NAs as grey,
# use a bigger max size of the points,
# and make all points a little transparent:
map5 = ggmap(MTmap, darken = c(.5, "white")) +
geom_point(aes(lonCenter, latCenter,
col = mailRrJenks, size = ALAND10),
alpha = 0.7,
data = tractSPDFmerged@data) +
scale_color_brewer(palette = "Reds", na.value = "grey50") +
scale_size(range = c(1, 10))
map5