# Loading the tidyverse
library(tidyverse)
# Import the taxi journeys from the CSV file into a tibble
taxi <- read_csv(file = "datasets/taxi.csv")
# Preview the top rows to check that the import looks sensible
head(taxi)
library(testthat)
library(IRkernel.testthat)
# Checks for task 1: tidyverse is attached and taxi was read from the CSV file
run_tests({
  test_that("Test that tidyverse is loaded", {
    tidyverse_attached <- "package:tidyverse" %in% search()
    expect_true(
      tidyverse_attached,
      info = "The tidyverse package should be loaded using library()."
    )
  })

  test_that("Read in data correctly.", {
    # taxi must carry the tibble class "tbl_df"
    expect_is(
      taxi, "tbl_df",
      info = 'You should use read_csv (with an underscore) to read "datasets/taxi.csv" into taxi.'
    )
  })

  test_that("Read in data correctly.", {
    # Re-read the file and compare it with what the notebook produced
    reference_taxi <- read_csv("datasets/taxi.csv")
    expect_equivalent(
      taxi, reference_taxi,
      info = 'taxi should contain the data in "datasets/taxi.csv".'
    )
  })
})
# Tidy up the journeys in three steps:
#   1. give the pickup coordinates the short names lat and long,
#   2. keep only journeys where the fare or the tip is positive,
#   3. add total, the log of fare plus tip
taxi <- taxi %>%
  rename(lat = pickup_latitude, long = pickup_longitude) %>%
  filter(fare_amount > 0 | tip_amount > 0) %>%
  mutate(total = log(fare_amount + tip_amount))
# Inspect the result
head(taxi)
# Checks for task 2: renamed columns, the new total column and the filtering
run_tests({
  test_that("rename lat", {
    has_lat <- !is.null(taxi$lat)
    expect_true(
      has_lat,
      info = "The taxi data frame does not contain a variable called lat. You need to rename pickup_latitude."
    )
  })

  test_that("rename long", {
    has_long <- !is.null(taxi$long)
    expect_true(
      has_long,
      info = "The taxi data frame does not contain a variable called long. You need to rename pickup_longitude."
    )
  })

  test_that("total exists", {
    has_total <- !is.null(taxi$total)
    expect_true(
      has_total,
      info = "The taxi data frame does not contain a variable called total. You need to create this as the logarithm (use the log() function) of the sum of fare_amount and tip_amount."
    )
  })

  test_that("Modified data correctly.", {
    # Rebuild the expected data frame from the raw file, one step at a time
    raw_taxi <- read_csv("datasets/taxi.csv")
    renamed_taxi <- rename(raw_taxi, long = pickup_longitude, lat = pickup_latitude)
    kept_taxi <- filter(renamed_taxi, fare_amount > 0 | tip_amount > 0)
    expected_taxi <- mutate(kept_taxi, total = log(fare_amount + tip_amount))
    expect_equivalent(
      taxi, expected_taxi,
      info = 'The taxi dataframe has not been modified correctly. See if you can find something is wrong with your code.'
    )
  })
})
# Keep only trips whose pickup point lies inside Manhattan's bounding box:
#   latitude   40.70   to  40.83
#   longitude -74.025  to -73.93
# Conditions passed to filter() as separate arguments are combined with AND.
taxi <- filter(
  taxi,
  between(lat, 40.70, 40.83),
  between(long, -74.025, -73.93)
)
# Check for task 3: only the Manhattan trips should remain
run_tests({
  test_that("The correct number of rows have been filtered away", {
    # Observed value first, expected value second, so that a failure report
    # describes the difference in the right direction
    expect_equal(
      nrow(taxi), 45766,
      info = "It seems you haven't filtered away the taxi trips outside of Manhattan correctly."
    )
  })
})
# Loading in ggmap and viridis for nice colors
library(ggmap)
library(viridis)
# Retrieving a stored map object which originally was created by
# manhattan <- get_map("manhattan", zoom = 12, color = "bw")
manhattan <- readRDS("datasets/manhattan.rds")
# Drawing a density map with the number of journey start locations.
# Only long and lat are mapped from the data; bins and alpha are constant
# layer settings and therefore go outside aes(). Inside aes(), bins would be
# ignored as an unknown aesthetic and alpha would get a legend of its own.
ggmap(manhattan, darken = 0.5) +
  scale_fill_viridis(option = "plasma") +
  geom_bin2d(
    data = taxi,
    aes(x = long, y = lat),
    bins = 60,
    alpha = 0.6
  ) +
  labs(x = "Longitude", y = "Latitude", fill = "Journeys")
# Checks for task 4: packages attached and the map drawn with geom_bin2d
run_tests({
  test_that("Test that ggmap is loaded", {
    expect_true(
      "package:ggmap" %in% search(),
      info = "The ggmap package should be loaded using library()."
    )
  })

  test_that("Test that viridis is loaded", {
    expect_true(
      "package:viridis" %in% search(),
      info = "The viridis package should be loaded using library()."
    )
  })

  test_that("Check that geom_bin2d was used", {
    p <- last_plot()
    # Collect every class name of every layer's stat into one flat character
    # vector. lapply() + unlist() always flattens, whereas sapply() returns a
    # list when the class vectors differ in length, and as.character() on
    # that list would yield deparsed strings that never match.
    stat_classes <- unlist(lapply(p$layers, function(layer) {
      class(layer$stat)
    }))
    expect_true(
      "StatBin2d" %in% stat_classes,
      info = "You need to use geom_bin2d correctly to draw the map."
    )
  })
})
# Attach the tree package, which provides tree()
library(tree)
# Model total using only the pickup coordinates as predictors
fitted_tree <- tree(total ~ lat + long, data = taxi)
# Plot the fitted tree and annotate the plot via text()
plot(fitted_tree)
text(fitted_tree)
# Checks for task 5: tree attached and the location-only tree fitted
run_tests({
  test_that("Test that tree is loaded", {
    tree_attached <- "package:tree" %in% search()
    expect_true(
      tree_attached,
      info = "The tree package should be loaded using library()."
    )
  })

  test_that("The tree has been fitted correctly", {
    # Refit the reference model and compare it with the notebook's fit
    reference_tree <- tree(total ~ lat + long, data = taxi)
    expect_equivalent(
      fitted_tree, reference_tree,
      info = "It seem you didn't fit the tree correctly. Check the hint, it might help!"
    )
  })
})
# Attach lubridate for the date-time helpers hour(), wday() and month()
library(lubridate)
# Derive three time features from the pickup timestamp:
# the hour, the labelled weekday and the labelled month
taxi <- mutate(
  taxi,
  hour = hour(pickup_datetime),
  wday = wday(pickup_datetime, label = TRUE),
  month = month(pickup_datetime, label = TRUE)
)
# Checks for task 6: lubridate attached and the first row's time features
run_tests({
  test_that("Test that lubridate is loaded", {
    lubridate_attached <- "package:lubridate" %in% search()
    expect_true(
      lubridate_attached,
      info = "The lubridate package should be loaded using library()."
    )
  })

  test_that("hour is correct", {
    first_hour <- taxi$hour[1]
    expect_equivalent(
      first_hour, 10L,
      info = "The `hour` column doesn't seem to be correct. Check the hint for more help."
    )
  })

  test_that("wday is correct", {
    first_wday <- taxi$wday[1]
    expect_true(
      first_wday == "Sun",
      info = "The `wday` column doesn't seem to be correct. Check the hint for more help."
    )
  })

  test_that("month is correct", {
    first_month <- taxi$month[1]
    expect_true(
      first_month == "Jan",
      info = "The `month` column doesn't seem to be correct. Check the hint for more help."
    )
  })
})
# Refit the tree, now with the time features next to the coordinates:
# outcome total; predictors lat, long, hour, wday and month
fitted_tree <- tree(
  total ~ lat + long + hour + wday + month,
  data = taxi
)
# Plot the fitted tree and annotate the plot via text()
plot(fitted_tree)
text(fitted_tree)
# Print the summary of the fitted tree
summary(fitted_tree)
# Check for task 7: the tree with location and time predictors
run_tests({
  test_that("The tree has been fitted correctly", {
    # Refit the reference model and compare it with the notebook's fit
    reference_tree <- tree(
      total ~ lat + long + hour + wday + month,
      data = taxi
    )
    expect_equivalent(
      fitted_tree, reference_tree,
      info = "It seem you didn't fit the tree correctly. Check the hint, it might help!"
    )
  })
})
# Attach the randomForest package
library(randomForest)
# Fit a forest of 80 trees on the same predictors as the second tree;
# sampsize is set to 10000
fitted_forest <- randomForest(
  total ~ lat + long + hour + wday + month,
  data = taxi,
  ntree = 80,
  sampsize = 10000
)
# Show the fitted model
fitted_forest
# Checks for task 8: randomForest attached and called with the right arguments
run_tests({
  test_that("Test that randomForest is loaded", {
    expect_true(
      "package:randomForest" %in% search(),
      info = "The randomForest package should be loaded using library()."
    )
  })

  test_that("ntree is correct.", {
    expect_true(
      fitted_forest$ntree == 80,
      info = "The ntree argument to randomForest should be ntree = 80 ."
    )
  })

  test_that("Check randomForest call was ok", {
    # Flatten the stored call into one string and look for each keyword in it
    call_string <- paste(deparse(fitted_forest$call), collapse = " ")
    # "10000" pins the sample size itself; the shorter "100" would also match
    # sampsize = 100 or sampsize = 1000
    keywords <- c(
      "total", "lat", "long", "hour", "wday", "month",
      "ntree", "sampsize", "10000"
    )
    expect_true(
      all(str_detect(call_string, keywords)),
      info = "You have not called randomForest correctly. Did you include all the predictors and the right output variable?."
    )
  })
})
# Extracting the prediction from fitted_forest
taxi$pred_total <- fitted_forest$predicted
# Plotting the mean predicted trip price per map cell according to the
# random forest (adapted from the density map of task 4).
# long, lat and pred_total are mapped from the data; bins and alpha are
# constant layer settings and therefore go outside aes(). Inside aes(), bins
# would be ignored as an unknown aesthetic and alpha would get its own legend.
ggmap(manhattan, darken = 0.5) +
  scale_fill_viridis(option = "plasma") +
  stat_summary_2d(
    data = taxi,
    aes(x = long, y = lat, z = pred_total),
    fun = mean,
    bins = 60,
    alpha = 0.6
  ) +
  labs(x = "Longitude", y = "Latitude", fill = "Fitted")
# Checks for task 9: predictions stored in taxi and plotted with
# stat_summary_2d
run_tests({
  test_that("taxi$pred_total == fitted_forest$predicted", {
    expect_true(
      all(taxi$pred_total == fitted_forest$predicted),
      info = "You should assign fitted_forest$predicted to taxi$pred_total ."
    )
  })

  test_that("Check that stat_summary_2d was used", {
    p <- last_plot()
    # Collect every class name of every layer's stat into one flat character
    # vector. lapply() + unlist() always flattens, whereas sapply() returns a
    # list when the class vectors differ in length, and as.character() on
    # that list would yield deparsed strings that never match.
    stat_classes <- unlist(lapply(p$layers, function(layer) {
      class(layer$stat)
    }))
    # The message names the function this test actually checks for
    expect_true(
      "StatSummary2d" %in% stat_classes,
      info = "You need to use stat_summary_2d correctly to draw the map."
    )
  })

  test_that("Check that pred_total was used", {
    p <- last_plot()
    # Text form of every aesthetic mapping used by any layer of the plot
    p_variables <- unlist(lapply(p$layers, function(layer) {
      as.character(layer$mapping)
    }))
    expect_true(
      any(str_detect(p_variables, "pred_total")),
      info = "You need to connect pred_total to z in the aes() call correctly."
    )
  })
})
# Mean of x, but only when there are enough data points to trust it.
#
# Arguments:
#   x      A numeric vector.
#   min_n  Minimum number of elements required; defaults to 15.
#
# Returns mean(x) when length(x) >= min_n, otherwise NA_real_, so the result
# is a double in both cases.
mean_if_enough_data <- function(x, min_n = 15L) {
  # The condition is a single TRUE/FALSE, so a plain if/else is the right
  # tool; ifelse() is meant for element-wise choices over vectors.
  if (length(x) >= min_n) {
    mean(x)
  } else {
    NA_real_
  }
}
# Plotting the mean trip prices from the data (adapted from the task 9 map).
# Cells are summarised with mean_if_enough_data instead of mean.
# long, lat and total are mapped from the data; bins and alpha are constant
# layer settings and therefore go outside aes(). Inside aes(), bins would be
# ignored as an unknown aesthetic and alpha would get its own legend.
ggmap(manhattan, darken = 0.5) +
  scale_fill_viridis(option = "plasma") +
  stat_summary_2d(
    data = taxi,
    aes(x = long, y = lat, z = total),
    fun = mean_if_enough_data,
    bins = 60,
    alpha = 0.6
  ) +
  labs(x = "Longitude", y = "Latitude", fill = "Fitted")
# Check for task 10: the last plot must map total, not pred_total
run_tests({
  test_that("Check that total was used but not pred_total", {
    latest_plot <- last_plot()
    # Text form of every aesthetic mapping used by any layer of the plot
    mapped_vars <- unlist(sapply(latest_plot$layers, function(lyr) {
      as.character(lyr$mapping)
    }))
    # "total" is also a substring of "pred_total", hence the second condition
    mentions_total <- any(str_detect(mapped_vars, "total"))
    mentions_pred_total <- any(str_detect(mapped_vars, "pred_total"))
    expect_true(
      mentions_total & !mentions_pred_total,
      info = "You need to connect total to z in the aes() call correctly. Make sure you are not still using pred_total."
    )
  })
})
# Answer to the closing question: where do people spend the most on their
# taxi trips? Valid answers are "uptown" and "downtown".
spends_most_on_trips <- "downtown"
# Check for the final task: the answer must contain "downtown" (any case)
run_tests({
  test_that("...", {
    expect_true(
      str_detect(tolower(spends_most_on_trips), "downtown"),
      info = "Well, looking at the plot it looks like people pay more downtown."
    )
  })
})

# The lines below are the title, tags and description that accompanied this
# script. They are kept as comments so that the file parses as R.
#
# New York Taxi Analysis with R
# R
# Tidyverse
# We are going to do the DataCamp R project: New York Taxi. In this project, we work with data from a large number of taxi journeys in New York from 2013.