# 10 Mapping: R code
# Jerzy Wieczorek
# 10/1/15
# 36-721 Statistical Graphics and Visualization

# Set working directory
indir = "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10"
#indir = "D:/Dropbox/CMU/36-721 Dataviz F'15/Lecture 10"
setwd(indir)

# R mapping packages you may need to install, if you don't have them:
# install.packages(c("rgdal", "sp", "maptools", "rgeos", gpclib", ggmap"))
# Also, classInt for computing Jenks class intervals if you want them:
# install.packages("classInt")

# Load R packages
library(rgdal)
## Loading required package: sp
## rgdal: version: 1.0-7, (SVN revision 559)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 1.7.3, released 2010/11/10
##  Path to GDAL shared files: /usr/share/gdal/1.7
##  GDAL does not use iconv for recoding strings.
##  Loaded PROJ.4 runtime: Rel. 4.7.1, 23 September 2009, [PJ_VERSION: 470]
##  Path to PROJ.4 shared files: (autodetected)
##  Linking to sp version: 1.2-0
library(ggplot2)
library(ggmap)
library(RColorBrewer)
library(classInt)


#### LOAD AND PLOT SHAPEFILES ####

# Load the shapefiles for all 2010 Census tracts in Montana (MT).
# Tell it the directory where the shapefiles are,
# so it can find all the various pieces (.shp, .dbf, etc)
tractSPDF = readOGR(indir, "tl_2010_30_tract10")
## OGR data source with driver: ESRI Shapefile 
## Source: "/home/jerzy/Downloads/36-721 Dataviz F15/Lecture 10", layer: "tl_2010_30_tract10"
## with 271 features
## It has 12 fields
# Plot the tract boundaries
plot(tractSPDF)

# Get a summary of the object's properties
summary(tractSPDF)
## Object of class SpatialPolygonsDataFrame
## Coordinates:
##          min        max
## x -116.05000 -104.03914
## y   44.35821   49.00139
## Is projected: FALSE 
## proj4string :
## [+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs +towgs84=0,0,0]
## Data attributes:
##  STATEFP10   COUNTYFP10    TRACTCE10          GEOID10        NAME10   
##  30:271    111    : 32   000100 : 32   30001000100:  1   1      : 32  
##            013    : 22   000200 : 20   30001000200:  1   2      : 20  
##            031    : 22   000300 : 18   30001000300:  1   3      : 18  
##            063    : 20   000400 :  9   30003000100:  1   4      :  9  
##            029    : 19   000500 :  8   30003940400:  1   5      :  8  
##            049    : 14   000800 :  8   30003940500:  1   8      :  8  
##            (Other):142   (Other):176   (Other)    :265   (Other):176  
##           NAMELSAD10   MTFCC10    FUNCSTAT10    ALAND10         
##  Census Tract 1: 32   G5020:271   S:271      Min.   :3.602e+05  
##  Census Tract 2: 20                          1st Qu.:8.582e+06  
##  Census Tract 3: 18                          Median :2.228e+08  
##  Census Tract 4:  9                          Mean   :1.391e+09  
##  Census Tract 5:  8                          3rd Qu.:2.174e+09  
##  Census Tract 8:  8                          Max.   :1.331e+10  
##  (Other)       :176                                             
##     AWATER10               INTPTLAT10         INTPTLON10 
##  Min.   :        0   +44.8573177:  1   -104.1114017:  1  
##  1st Qu.:    25880   +45.0226823:  1   -104.2288430:  1  
##  Median :  1547557   +45.1014235:  1   -104.2744656:  1  
##  Mean   : 14277494   +45.1045153:  1   -104.3281098:  1  
##  3rd Qu.: 10696537   +45.1523621:  1   -104.3292116:  1  
##  Max.   :445782449   +45.1526102:  1   -104.4057182:  1  
##                      (Other)    :265   (Other)     :265
# This is a new-to-us kind of object:
# a SpatialPolygonsDataFrame.
# To access just the data frame part, with one row per tract,
# access the object's data slot using tractSPDF@data
# (documentation calls this the "attribute table")
names(tractSPDF@data)
##  [1] "STATEFP10"  "COUNTYFP10" "TRACTCE10"  "GEOID10"    "NAME10"    
##  [6] "NAMELSAD10" "MTFCC10"    "FUNCSTAT10" "ALAND10"    "AWATER10"  
## [11] "INTPTLAT10" "INTPTLON10"
# For example, plot the longitude and latitude of each tract centroid.
# The lon and lat were stored as factors,
# so first read them as characters and then turn them into numbers,
# and resave these numeric versions of each variable:
tractSPDF@data$lonCenter = as.numeric(as.character(tractSPDF@data$INTPTLON10))
tractSPDF@data$latCenter = as.numeric(as.character(tractSPDF@data$INTPTLAT10))
# Then plot like a usual scatterplot:
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
     xlab = "Longitude", ylab = "Latitude", pch = '.')

# If we had another variable of interest,
# we could use cex to vary the symbol size or col to vary symbol color
# and just plot symbols at the centroid of each tract.
# This data set includes ALAND10, the land area of each tract,
# so let's plot that, rescaled by its max value:
maxArea = max(tractSPDF@data$ALAND10)
plot(tractSPDF@data$lonCenter, tractSPDF@data$latCenter,
     xlab = "Longitude", ylab = "Latitude",
     cex = tractSPDF@data$ALAND10 / maxArea * 5)

# Or we can plot the tracts as a choropleth / thematic map,
# colored by land area.
# Use the spplot function.
# Let's also keep it clean by not drawing borders:
# use col = NA (here col is color of border, not of fill)
spplot(tractSPDF, "ALAND10", col = NA)

#### MERGE IN OTHER DATA ####

# But we are interested in plotting the Census mail back rate,
# so let's merge it into our attribute table.
# Load it first:
load("censusMT.Rdata")
# Let's rename it to censusDF (for Census Data Frame)
# so that this code can be reused with different states:
censusDF = censusMT
rm(censusMT)

# Use county and tract numbers to match up rows of the two data frames.
# Again, in tractSPDF, apparently they loaded as a factor,
# so we'll have to convert to integers via character:
summary(as.integer(as.character(tractSPDF@data$TRACTCE10)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     300     800  163400   10350  980600
# Compare to the tract numbers in censusDF:
summary(censusDF$Tract)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     100     300     800  163400   10350  980600
# Looks compatible.
# However, the same tract number can repeat in different counties,
# so we need to merge on county number as well:
summary(as.integer(as.character(tractSPDF@data$COUNTYFP10)))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   29.00   49.00   55.11   84.00  111.00
summary(censusDF$County)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   29.00   49.00   55.11   84.00  111.00
# In each dataset, we need compatible versions
# of tract and county number, for merging the datasets.
# Transform the tract and county IDs in tractSPDF
# to character and then to integer,
# and save them as new variables Tract and County:
tractSPDF@data$County = as.integer(as.character(tractSPDF@data$COUNTYFP10))
tractSPDF@data$Tract = as.integer(as.character(tractSPDF@data$TRACTCE10))
# Next, create a single ID variable for each tract,
# by combining the county and tract numbers:
tractSPDF@data$tractID = paste(tractSPDF@data$County, tractSPDF@data$Tract, sep = ".")
censusDF$tractID = paste(censusDF$County, censusDF$Tract, sep = ".")
# Check that the region variables seem compatible:
all.equal(sort(tractSPDF@data$tractID), sort(censusDF$tractID))
## [1] TRUE
# Looks good.

# To keep the merge clean, we can also delete Tract and County from one of the datasets,
# so we don't end up with two copies.
censusDF$County = NULL
censusDF$Tract = NULL

# Finally, merge the censusDF variables into tractSPDF.
# When merging a SpatialPolygonsDataFrame with a regular data frame,
# put the spatial one first:
tractSPDFmerged = merge(tractSPDF, censusDF, by = "tractID")
names(tractSPDFmerged)
##  [1] "tractID"                       "STATEFP10"                    
##  [3] "COUNTYFP10"                    "TRACTCE10"                    
##  [5] "GEOID10"                       "NAME10"                       
##  [7] "NAMELSAD10"                    "MTFCC10"                      
##  [9] "FUNCSTAT10"                    "ALAND10"                      
## [11] "AWATER10"                      "INTPTLAT10"                   
## [13] "INTPTLON10"                    "lonCenter"                    
## [15] "latCenter"                     "County"                       
## [17] "Tract"                         "State"                        
## [19] "State_name"                    "County_name"                  
## [21] "LAND_AREA"                     "Tot_Population_ACS_09_13"     
## [23] "Med_HHD_Inc_ACS_09_13"         "Med_House_value_ACS_09_13"    
## [25] "Mail_Return_Rate_CEN_2010"     "pct_Males_ACS_09_13"          
## [27] "pct_Pop_5_17_ACS_09_13"        "pct_Pop_18_24_ACS_09_13"      
## [29] "pct_Pop_25_44_ACS_09_13"       "pct_Pop_45_64_ACS_09_13"      
## [31] "pct_Pop_65plus_ACS_09_13"      "pct_Hispanic_ACS_09_13"       
## [33] "pct_NH_White_alone_ACS_09_13"  "pct_NH_Blk_alone_ACS_09_13"   
## [35] "pct_Not_HS_Grad_ACS_09_13"     "pct_College_ACS_09_13"        
## [37] "pct_Prs_Blw_Pov_Lev_ACS_09_13" "pct_Diff_HU_1yr_Ago_ACS_09_13"
## [39] "pct_MrdCple_HHD_ACS_09_13"     "pct_Female_No_HB_ACS_09_13"   
## [41] "pct_Sngl_Prns_HHD_ACS_09_13"   "avg_Tot_Prns_in_HHD_ACS_09_13"
## [43] "pct_Rel_Under_6_ACS_09_13"     "pct_Vacant_Units_ACS_09_13"   
## [45] "pct_Renter_Occp_HU_ACS_09_13"  "pct_Single_Unit_ACS_09_13"
# Success! Now we can plot the Census records for each tract.


#### PLOT COMBINED DATA ####

# Summarize the Census mail back rates in Montana tracts
summary(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   16.70   77.50   80.40   80.18   82.90  100.00      18
# There are 18 NAs, so some of our tracts had no data
# and will show up as blank on a map.
# These tracts seem to correspond to American Indian reservations.

# Plot the Census mail back rate
# as sizes of circles located at tract centroids:
plot(tractSPDFmerged@data$lonCenter, tractSPDFmerged@data$latCenter,
     xlab = "Longitude", ylab = "Latitude",
     cex = tractSPDFmerged@data$Mail_Return_Rate_CEN_2010 / 100 * 2)

# Not very informative: the rates don't vary much,
# so most of the circles look about the same size.

# Plot the tracts as a choropleth / thematic map,
# colored by Census mail back rate.
spplot(tractSPDFmerged, "Mail_Return_Rate_CEN_2010", col = NA)

#### MAP PROJECTIONS ####

# So far we've been plotting x = longitude, y = latitude,
# but often it helps to use a map projection,
# especially for larger areas (like entire continents).

# Use an Albers Equal Area conic projection,
# as used for many US government maps.
# Here are suggested parameters for a Pennsylvania map:
# http://www.pasda.psu.edu/tutorials/arcgis/projection.asp
# where our standard parallels are lat_1 and lat_2,
# latitude of origin is lat_0,
# longitude of central meridian is lon_0,
# and so on as defined here:
# https://trac.osgeo.org/proj/wiki/GenParms
# For Albers, we want the standard parallels set to
# just below and just above Montana's southern and northern edges;
# longitude of central meridian to be in the middle of Montana;
# and other parameters are not critical:
# http://forums.esri.com/Thread.asp?c=93&f=984&t=258585
aea.proj = "+proj=aea +lat_1=44 +lat_2=49 +lat_0=0 +lon_0=-110 +x_0=0 +y_0=0 +ellps=GRS80 +datum=NAD83 +units=m"
tractSPDFprojected = spTransform(tractSPDFmerged, CRS(aea.proj))

# Using plot() to show centroid lon/lat values, we'll see no difference.
# But using spplot(), we will see a slight difference after projecting:
spplot(tractSPDFprojected, "Mail_Return_Rate_CEN_2010", col = NA)

# Notice that the flat horizontal lines at top & bottom of MT
# are now slightly curved. The projection worked :)


#### SETTING COLORS ####

# Use cut() for equal intervals,
# quantile() for equal quantiles,
# and classInt(..., style = "jenks") for Jenks

# Looks like almost all the response rates are between 70% and 90%
hist(tractSPDFmerged@data$Mail_Return_Rate_CEN_2010)