# look for extra spaces, doubled directions (N N), and missing directions
# then clear up to only unique values and split each record in two, one for each
# end of the block. Output will be a datafile of keys and addresses.
# Load the master crime dataset and build one address string per record from
# the block range, direction suffix and street name.
master <- readRDS("/home/ajackson/mirrors/ajackson/crime/data/masterdataset.rds")
# Assign the column by name rather than by a hard-coded position, so the
# result does not depend on how many columns the dataset has.
master$Address <- paste(
  master$Block_Range, master$Suffix, master$Street, ", Houston, TX"
)
# Collapse doubled directions ("N N", "S  S", ...) to a single direction.
master$Address <- str_replace_all(master$Address, " N +N ", " N ")
master$Address <- str_replace_all(master$Address, " S +S ", " S ")
master$Address <- str_replace_all(master$Address, " E +E ", " E ")
master$Address <- str_replace_all(master$Address, " W +W ", " W ")
# Collapse runs of spaces to one space. The earlier pattern replaced a single
# space with a single space, which changed nothing; later pattern matching on
# the address expects exactly one space between the fields.
master$Address <- str_replace_all(master$Address, " {2,}", " ")
# Remove records missing an E or W designation on a numbered street (a block
# range followed directly by a digit). Without the designation the address is
# incomplete and not usable. Store the remaining addresses in a new data frame.
# A logical mask from grepl() is used instead of negative grep() indexing:
# x[-integer(0)] selects nothing, so with zero matches every address would
# have been discarded.
geotable <- data.frame(
  Address = master$Address[!grepl("\\d+-\\d+ \\d", master$Address)],
  stringsAsFactors = FALSE
)
geotable <- unique(geotable) # filter down to unique addresses
# Split out beginning and ending block addresses and street name:
#   Add1   - the digits at the very start of the address
#   Add2   - the first run of digits followed by a space (space is kept)
#   Street - everything from the first space to the end of the string
geotable$Add1 <- str_extract(geotable$Address, "^\\d+")
geotable$Add2 <- str_extract(geotable$Address, "\\d+ ")
geotable$Street <- str_extract(geotable$Address, " .+$")
# Delete records that are incomplete (any of the extractions returned NA).
geotable <- geotable[complete.cases(geotable), ]
# Run geocode for each record up to the daily limit, and then average the
# lat/long values from each end of the block to get a true block center value.
# This section geocodes the starting address of every block.
latlong1 <- geocode(paste(geotable$Add1, geotable$Street))
geotable <- bind_cols(geotable, latlong1)
# Rename by column name instead of by position so the step does not depend on
# how many columns geotable already has.
names(geotable)[names(geotable) == "lon"] <- "lon1"
names(geotable)[names(geotable) == "lat"] <- "lat1"
# Hmm... random 20% failures. Retry only the failures, up to three more
# times, merging each batch of results back into geotable.
for (attempt in seq_len(3L)) {
  # Compute the mask once per attempt so lon and lat are written to exactly
  # the rows that were queried.
  failed <- is.na(geotable$lon1) | is.na(geotable$lat1)
  if (!any(failed)) {
    break # nothing left to retry; do not spend queries on an empty request
  }
  latlong11 <- geocode(paste(geotable$Add1[failed], geotable$Street[failed]))
  geotable$lon1[failed] <- latlong11$lon
  geotable$lat1[failed] <- latlong11$lat
}
##### now the ending address
latlong2 <- geocode(paste(geotable$Add2, geotable$Street))
geotable <- bind_cols(geotable, latlong2)
# Rename by column name instead of by position.
names(geotable)[names(geotable) == "lon"] <- "lon2"
names(geotable)[names(geotable) == "lat"] <- "lat2"
# exceeded query max - wait a day
# Retry only rows whose ENDING coordinates are missing, up to three times.
# Masking on complete.cases(geotable) would also pick up rows whose starting
# coordinates (lon1/lat1) are still NA; those rows would be re-queried and a
# random failure could overwrite an already good lon2/lat2 with NA.
for (attempt in seq_len(3L)) {
  failed <- is.na(geotable$lon2) | is.na(geotable$lat2)
  if (!any(failed)) {
    break # nothing left to retry; do not spend queries on an empty request
  }
  latlong21 <- geocode(paste(geotable$Add2[failed], geotable$Street[failed]))
  geotable$lon2[failed] <- latlong21$lon
  geotable$lat2[failed] <- latlong21$lat
}
# Block center = midpoint of the two block-end coordinates. A copy of the
# table as it stood before the averaged columns are added is kept in `temp`.
temp <- geotable
geotable$avglon <- with(geotable, (lon1 + lon2) / 2)
geotable$avglat <- with(geotable, (lat1 + lat2) / 2)