# 10 Mapping: R code
# Jerzy Wieczorek
# 10/1/15
# 36-721 Statistical Graphics and Visualization

# Set working directory
indir = "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10"
#indir = "D:/Dropbox/CMU/36-721 Dataviz F'15/Lecture 10"
setwd(indir)

# R mapping packages you may need to install, if you don't have them:
# install.packages(c("rgdal", "sp", "maptools", "rgeos", gpclib", ggmap"))
# Also, classInt for computing Jenks class intervals if you want them:
# install.packages("classInt")

# Load R packages
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.0-7, (SVN revision 559)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.7.3, released 2010/11/10
##  Path to GDAL shared files: /usr/share/gdal/1.7
##  GDAL does not use iconv for recoding strings.
##  Loaded PROJ.4 runtime: Rel. 4.7.1, 23 September 2009, [PJ_VERSION: 470]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-0
library(ggplot2)
library(ggmap)
library(RColorBrewer)
library(classInt)


#### LOAD AND PLOT SHAPEFILES ####

# Load the shapefiles for all 2010 Census tracts in Montana (MT).
# Tell it the directory where the shapefiles are,
# so it can find all the various pieces (.shp, .dbf, etc)
tractSPDF = readOGR(indir, "tl_2010_30_tract10")
## OGR data source with driver: ESRI Shapefile 
## Source: "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10", layer: "tl_2010_30_tract10"
## with 271 features
## It has 12 fields
# Plot the tract boundaries
plot(tractSPDF)

# Get a summary of the object's properties
summary(tractSPDF)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
##          min        max
## x -116.05000 -104.03914
## y   44.35821   49.00139
## Is projected: FALSE 
## proj4string :
## [+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs +towgs84=0,0,0]
## Data attributes:
##  STATEFP10   COUNTYFP10    TRACTCE10          GEOID10        NAME10   
##  30:271    111    : 32   000100 : 32   30001000100:  1   1      : 32  
##            013    : 22   000200 : 20   30001000200:  1   2      : 20  
##            031    : 22   000300 : 18   30001000300:  1   3      : 18  
##            063    : 20   000400 :  9   30003000100:  1   4      :  9  
##            029    : 19   000500 :  8   30003940400:  1   5      :  8  
##            049    : 14   000800 :  8   30003940500:  1   8      :  8  
##            (Other):142   (Other):176   (Other)    :265   (Other):176  
##           NAMELSAD10   MTFCC10    FUNCSTAT10    ALAND10         
##  Census Tract 1: 32   G5020:271   S:271      Min.   :3.602e+05  
##  Census Tract 2: 20                          1st Qu.:8.582e+06  
##  Census Tract 3: 18                          Median :2.228e+08  
##  Census Tract 4:  9                          Mean   :1.391e+09  
##  Census Tract 5:  8                          3rd Qu.:2.174e+09  
##  Census Tract 8:  8                          Max.   :1.331e+10  
##  (Other)       :176                                             
##     AWATER10               INTPTLAT10         INTPTLON10 
##  Min.   :        0   +44.8573177:  1   -104.1114017:  1  
##  1st Qu.:    25880   +45.0226823:  1   -104.2288430:  1  
##  Median :  1547557   +45.1014235:  1   -104.2744656:  1  
##  Mean   : 14277494   +45.1045153:  1   -104.3281098:  1  
##  3rd Qu.: 10696537   +45.1523621:  1   -104.3292116:  1  
##  Max.   :445782449   +45.1526102:  1   -104.4057182:  1  
##                      (Other)    :265   (Other)     :265
# This is a new-to-us kind of object:
# a SpatialPolygonsDataFrame.
# To access just the data frame part, with one row per tract,
# access the object's data slot using tractSPDF@data
# (documentation calls this the "attribute table")
names(tractSPDF@data)
##  [1] "STATEFP10"  "COUNTYFP10" "TRACTCE10"  "GEOID10"    "NAME10"    
##  [6] "NAMELSAD10" "MTFCC10"    "FUNCSTAT10" "ALAND10"    "AWATER10"  
## [11] "INTPTLAT10" "INTPTLON10"
# For example, plot the longitude and latitude of each tract centroid.
# The lon and lat were stored as factors,
# so first read them as characters and then turn them into numbers,
# and resave these numeric versions of each variable:
tractSPDF@data$lonCenter = as.numeric(as.character(tractSPDF@data$INTPTLON10))
tractSPDF@data$latCenter = as.numeric(as.character(tractSPDF@data$INTPTLAT10))
# Then plot like a usual scatterplot:
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
     xlab = "Longitude", ylab = "Latitude", pch = '.')

# If we had another variable of interest,
# we could use cex to vary the symbol size or col to vary symbol color
# and just plot symbols at the centroid of each tract.
# This data set includes ALAND10, the land area of each tract,
# so let's plot that, rescaled by its max value:
maxArea = max(tractSPDF@data$ALAND10)
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
     xlab = "Longitude", ylab = "Latitude",
     cex = tractSPDF@data$ALAND10 / maxArea * 5)

# Or we can plot the tracts as a choropleth / thematic map,
# colored by land area.
# Use the spplot function.
# Let's also keep it clean by not drawing borders:
# use col = NA (here col is color of border, not of fill)
spplot(tractSPDF, "ALAND10", col = NA)

#### MERGE IN OTHER DATA ####

# But we are interested in plotting the Census mail back rate,
# so let's merge it into our attribute table.
# Load it first:
load("censusMT.Rdata")
# Let's rename it to censusDF (for Census Data Frame)
# so that this code can be reused with different states:
censusDF = censusMT
rm(censusMT)

# Use county and tract numbers to match up rows of the two data frames.
# Again, in tractSPDF, apparently they loaded as a factor,
# so we'll have to convert to integers via character:
summary(as.integer(as.character(tractSPDF@data$TRACTCE10)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     300     800  163400   10350  980600
# Compare to the tract numbers in censusDF:
summary(censusDF$Tract)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     300     800  163400   10350  980600
# Looks compatible.
# However, the same tract number can repeat in different counties,
# so we need to merge on county number as well:
summary(as.integer(as.character(tractSPDF@data$COUNTYFP10)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   29.00   49.00   55.11   84.00  111.00
summary(censusDF$County)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   29.00   49.00   55.11   84.00  111.00
# In each dataset, we need compatible versions
# of tract and county number, for merging the datasets.
# Transform the tract and county IDs in tractSPDF
# to character and then to integer,
# and save them as new variables Tract and County:
tractSPDF@data$County = as.integer(as.character(tractSPDF@data$COUNTYFP10))
tractSPDF@data$Tract = as.integer(as.character(tractSPDF@data$TRACTCE10))
# Next, create a single ID variable for each tract,
# by combining the county and tract numbers:
tractSPDF@data$tractID = paste(tractSPDF@data$County, tractSPDF@data$Tract, sep = ".")
censusDF$tractID = paste(censusDF$County, censusDF$Tract, sep = ".")
# Check that the region variables seem compatible:
all.equal(sort(tractSPDF@data$tractID), sort(censusDF$tractID))
## [1] TRUE
# Looks good.

# To keep the merge clean, we can also delete Tract and County from one of the datasets,
# so we don't end up with two copies.
censusDF$County = NULL
censusDF$Tract = NULL

# Finally, merge the censusDF variables into tractSPDF.
# When merging a SpatialPolygonsDataFrame with a regular data frame,
# put the spatial one first:
tractSPDFmerged = merge(tractSPDF, censusDF, by = "tractID")
names(tractSPDFmerged)
##  [1] "tractID"                       "STATEFP10"                    
##  [3] "COUNTYFP10"                    "TRACTCE10"                    
##  [5] "GEOID10"                       "NAME10"                       
##  [7] "NAMELSAD10"                    "MTFCC10"                      
##  [9] "FUNCSTAT10"                    "ALAND10"                      
## [11] "AWATER10"                      "INTPTLAT10"                   
## [13] "INTPTLON10"                    "lonCenter"                    
## [15] "latCenter"                     "County"                       
## [17] "Tract"                         "State"                        
## [19] "State_name"                    "County_name"                  
## [21] "LAND_AREA"                     "Tot_Population_ACS_09_13"     
## [23] "Med_HHD_Inc_ACS_09_13"         "Med_House_value_ACS_09_13"    
## [25] "Mail_Return_Rate_CEN_2010"     "pct_Males_ACS_09_13"          
## [27] "pct_Pop_5_17_ACS_09_13"        "pct_Pop_18_24_ACS_09_13"      
## [29] "pct_Pop_25_44_ACS_09_13"       "pct_Pop_45_64_ACS_09_13"      
## [31] "pct_Pop_65plus_ACS_09_13"      "pct_Hispanic_ACS_09_13"       
## [33] "pct_NH_White_alone_ACS_09_13"  "pct_NH_Blk_alone_ACS_09_13"   
## [35] "pct_Not_HS_Grad_ACS_09_13"     "pct_College_ACS_09_13"        
## [37] "pct_Prs_Blw_Pov_Lev_ACS_09_13" "pct_Diff_HU_1yr_Ago_ACS_09_13"
## [39] "pct_MrdCple_HHD_ACS_09_13"     "pct_Female_No_HB_ACS_09_13"   
## [41] "pct_Sngl_Prns_HHD_ACS_09_13"   "avg_Tot_Prns_in_HHD_ACS_09_13"
## [43] "pct_Rel_Under_6_ACS_09_13"     "pct_Vacant_Units_ACS_09_13"   
## [45] "pct_Renter_Occp_HU_ACS_09_13"  "pct_Single_Unit_ACS_09_13"
# Success! Now we can plot the Census records for each tract.


#### PLOT COMBINED DATA ####

# Summarize the Census mail back rates in Montana tracts
summary(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.70   77.50   80.40   80.18   82.90  100.00      18
# There are 18 NAs, so some of our tracts had no data
# and will show up as blank on a map.
# These tracts seem to correspond to American Indian reservations.

# Plot the Census mail back rate
# as sizes of circles located at tract centroids:
plot(tractSPDFmerged@data$lonCenter, tractSPDFmerged@data$latCenter,
     xlab = "Longitude", ylab = "Latitude",
     cex = tractSPDFmerged@data$Mail_Return_Rate_CEN_2010 / 100 * 2)

# Not very informative: the rates don't vary much,
# so most of the circles look about the same size.

# Plot the tracts as a choropleth / thematic map,
# colored by Census mail back rate.
spplot(tractSPDFmerged, "Mail_Return_Rate_CEN_2010", col = NA)

#### MAP PROJECTIONS ####

# So far we've been plotting x = longitude, y = latitude,
# but often it helps to use a map projection,
# especially for larger areas (like entire continents).

# Use an Albers Equal Area conic projection,
# as used for many US government maps.
# Here are suggested parameters for a Pennsylvania map:
# http://www.pasda.psu.edu/tutorials/arcgis/projection.asp
# where our standard parallels are lat_1 and lat_2,
# latitude of origin is lat_0,
# longitude of central meridian is lon_0,
# and so on as defined here:
# https://trac.osgeo.org/proj/wiki/GenParms
# For Albers, we want the standard parallels set to
# just below and just above Montana's southern and northern edges;
# longitude of central meridian to be in the middle of Montana;
# and other parameters are not critical:
# http://forums.esri.com/Thread.asp?c=93&f=984&t=258585
aea.proj = "+proj=aea +lat_1=44 +lat_2=49 +lat_0=0 +lon_0=-110 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m"
tractSPDFprojected = spTransform(tractSPDFmerged, CRS(aea.proj))

# Using plot() to show centroid lon/lat values, we'll see no difference.
# But using spplot(), we will see a slight difference after projecting:
spplot(tractSPDFprojected, "Mail_Return_Rate_CEN_2010", col = NA)

# Notice that the flat horizontal lines at top & bottom of MT
# are now slightly curved. The projection worked :)


#### SETTING COLORS ####

# Use cut() for equal intervals,
# quantile() for equal quantiles,
# and classInt(..., style = "jenks") for Jenks

# Looks like almost all the response rates are between 70% and 90%
hist(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)

# Compute equal interval and quantile breaks,
# though for equal intervals let's start at 0-60 and then go up by 10s
tractSPDFmerged@data$mailRrEqualIntervals = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
                                                breaks = c(0, 10*(6:10)))
tractSPDFmerged@data$mailRrEqualQuantiles = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
                                                breaks = c(0, quantile(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
                                                                       probs = seq(.2, 1, .2),
                                                                       na.rm = TRUE)))
# Jenks intervals: takes a VERY long time if you have many records,
# so can compute the breaks on a smaller random subsample of the tracts
## jenksBreaks = classIntervals(sample(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010, 200),
##                              n = 5, style = "jenks")
# But in Montana we only have 300ish tracts, not too many:
jenksBreaks = classIntervals(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
                              n = 5, style = "jenks")
## Warning in classIntervals(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010, :
## var has missing values, omitted in finding classes
# Manually set the highest and lowest breaks to 100 and 0,
# to cover full range of the data
jenksBreaks$brks[c(1, 6)] = c(0, 100)
jenksBreaks$brks
## [1]   0.0  16.7  74.3  79.3  83.9 100.0
tractSPDFmerged@data$mailRrJenks = cut(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010,
                                      breaks = jenksBreaks$brks)

# Plot and compare the 3 styles of intervals
spplot(tractSPDFmerged, "mailRrEqualIntervals", col = NA,
       main = "2010 Census mail back rates by Equal Intervals")

spplot(tractSPDFmerged, "mailRrEqualQuantiles", col = NA,
       main = "2010 Census mail back rates by Equal Quantiles")

spplot(tractSPDFmerged, "mailRrJenks", col = NA,
       main = "2010 Census mail back rates by Jenks cuts")

# To use a different color scale, assign it with col.regions
spplot(tractSPDFmerged, "mailRrJenks", col = NA,
       col.regions = brewer.pal(5, "Blues"))

#### IN ggplot2 AND ggmap ####

# For ggplot to use shapefile data, we have to fortify it:
# turn it into the kind of data frame that ggplot understands.

# We don't need to use the projected version of the data,
# since we will have to project within ggplot separately anyway.

# Now use the fortify function,
# telling it that tractID is the variable name of
# the unique identifier of each region (tract)
tractSPDFgg = fortify(tractSPDFmerged, region = "tractID")
# Now it contains the map borders, but lost the other variables
head(tractSPDFgg)
##        long      lat order  hole piece   group    id
## 1 -113.4366 45.83811     1 FALSE     1 1.100.1 1.100
## 2 -113.4363 45.83804     2 FALSE     1 1.100.1 1.100
## 3 -113.4363 45.83800     3 FALSE     1 1.100.1 1.100
## 4 -113.4362 45.83794     4 FALSE     1 1.100.1 1.100
## 5 -113.4362 45.83780     5 FALSE     1 1.100.1 1.100
## 6 -113.4361 45.83749     6 FALSE     1 1.100.1 1.100
# Also note that it renamed tractID to "id".
# So let's merge the other variables back in:
tractSPDFgg = merge(tractSPDFgg, tractSPDFmerged@data,
                   by.x = "id", by.y = "tractID")
names(tractSPDFgg)
##  [1] "id"                            "long"                         
##  [3] "lat"                           "order"                        
##  [5] "hole"                          "piece"                        
##  [7] "group"                         "STATEFP10"                    
##  [9] "COUNTYFP10"                    "TRACTCE10"                    
## [11] "GEOID10"                       "NAME10"                       
## [13] "NAMELSAD10"                    "MTFCC10"                      
## [15] "FUNCSTAT10"                    "ALAND10"                      
## [17] "AWATER10"                      "INTPTLAT10"                   
## [19] "INTPTLON10"                    "lonCenter"                    
## [21] "latCenter"                     "County"                       
## [23] "Tract"                         "State"                        
## [25] "State_name"                    "County_name"                  
## [27] "LAND_AREA"                     "Tot_Population_ACS_09_13"     
## [29] "Med_HHD_Inc_ACS_09_13"         "Med_House_value_ACS_09_13"    
## [31] "Mail_Return_Rate_CEN_2010"     "pct_Males_ACS_09_13"          
## [33] "pct_Pop_5_17_ACS_09_13"        "pct_Pop_18_24_ACS_09_13"      
## [35] "pct_Pop_25_44_ACS_09_13"       "pct_Pop_45_64_ACS_09_13"      
## [37] "pct_Pop_65plus_ACS_09_13"      "pct_Hispanic_ACS_09_13"       
## [39] "pct_NH_White_alone_ACS_09_13"  "pct_NH_Blk_alone_ACS_09_13"   
## [41] "pct_Not_HS_Grad_ACS_09_13"     "pct_College_ACS_09_13"        
## [43] "pct_Prs_Blw_Pov_Lev_ACS_09_13" "pct_Diff_HU_1yr_Ago_ACS_09_13"
## [45] "pct_MrdCple_HHD_ACS_09_13"     "pct_Female_No_HB_ACS_09_13"   
## [47] "pct_Sngl_Prns_HHD_ACS_09_13"   "avg_Tot_Prns_in_HHD_ACS_09_13"
## [49] "pct_Rel_Under_6_ACS_09_13"     "pct_Vacant_Units_ACS_09_13"   
## [51] "pct_Renter_Occp_HU_ACS_09_13"  "pct_Single_Unit_ACS_09_13"    
## [53] "mailRrEqualIntervals"          "mailRrEqualQuantiles"         
## [55] "mailRrJenks"
# Looks good.

# Finally, create a map.
# First, use coord_equal to ensure equal scales on x and y axes:
map1 = ggplot(tractSPDFgg, aes(long, lat, group = group,
                              fill = Mail_Return_Rate_CEN_2010)) +
  geom_polygon() + coord_equal()
map1

# Note that the NAs are plotted as grey tracts.

# Now, use coord_map to specify a map projection instead:
# in ggplot, projections are specified as in
# the mapproject function in the mapproj package.
# Again, use Albers Equal Area conic projection,
# with parameters lat0 and lat1 for the two standard parallels
# between which we want minimal distortion:
map1 + coord_map("albers", lat0 = 44, lat1 = 49)

# Looks a bit nicer than the coord_equal version above.

# Plot a spatial scatterplot of the tract centroids in ggplot2
map2 = ggplot(tractSPDFmerged@data, aes(lonCenter, latCenter,
                                       col = Mail_Return_Rate_CEN_2010,
                                       size = ALAND10)) +
  geom_point() + coord_equal()
map2

# Redo with Albers projection
map2 + coord_map("albers", lat0 = 40, lat1 = 42)

# Overlay the tract centroids on top of a Google Maps basemap using ggmap.
# Note that we can't choose a projection with ggmap;
# we must use the same Web Mercator projection as the basemaps.
# First get a Google Map basemap centered on Montana.
# I tried a few zooms, using
## qmap("montana", zoom = 6)
# and so on, and 6 fits well.
# (low zoom = far away, high zoom = close up)
MTmap = get_map('montana', zoom = 6)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=montana&zoom=6&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=montana&sensor=false
# Plot the basemap with ggmap(),
# then add points as with usual ggplot code
map3 = ggmap(MTmap) + 
  geom_point(aes(lonCenter, latCenter,
                 col = Mail_Return_Rate_CEN_2010, size = ALAND10),
             data = tractSPDFmerged@data)
map3

# The basemap overwhelms the data.
# Retry with a lighter basemap,
# using the darken argument to add a transparent white rectangle over top of it
map4 = ggmap(MTmap, darken = c(.5, "white")) + 
  geom_point(aes(lonCenter, latCenter,
                 col = Mail_Return_Rate_CEN_2010, size = ALAND10),
             data = tractSPDFmerged@data)
map4

# Change color scale so that higher rates are darker red.
# (I chose the low and high colors using colorbrewer2.org)
map4 + scale_color_continuous(low = "#fee8c8", high = "#e34a33")

# Finally, let's get fancy:
# Instead of plotting continuous response rates,
# use our precomputed Jenks cuts.
# Let's add a discrete ColorBrewer scale,
# remind it to plot NAs as grey,
# use a bigger max size of the points,
# and make all points a little transparent:
map5 = ggmap(MTmap, darken = c(.5, "white")) + 
  geom_point(aes(lonCenter, latCenter,
                 col = mailRrJenks, size = ALAND10),
             alpha = 0.7,
             data = tractSPDFmerged@data) +
  scale_color_brewer(palette = "Reds", na.value = "grey50") +
  scale_size(range = c(1, 10))
map5