Basic queries
Let's begin the data exploration. Each of the functions below return some useful information about the data.
head(nyc_taxi) # show me the first few rows
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
1 2 2015-01-15 19:05:40 2015-01-15 19:28:18 5 8.33
2 2 2015-01-25 00:13:06 2015-01-25 00:24:51 1 3.37
3 2 2015-01-25 00:13:08 2015-01-25 00:34:57 1 3.72
4 2 2015-01-25 00:13:09 2015-01-25 01:02:40 1 10.20
5 2 2015-01-04 13:44:52 2015-01-04 13:46:38 1 0.36
6 2 2015-01-04 13:44:52 2015-01-04 14:04:23 1 8.98
pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
1 -73.9 40.8 1 N -74.0
2 -73.9 40.8 1 N -74.0
3 -74.0 40.8 1 N -74.0
4 -74.0 40.8 1 N -73.9
5 -74.0 40.8 1 N -74.0
6 -73.9 40.8 1 N -74.0
dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
1 40.8 1 26.0 1.0 0.5 8.08 5.33
2 40.8 1 12.5 0.5 0.5 0.00 0.00
3 40.7 1 16.5 0.5 0.5 3.56 0.00
4 40.7 2 39.0 0.5 0.5 0.00 0.00
5 40.8 2 3.5 0.0 0.5 0.00 0.00
6 40.8 1 27.0 0.0 0.5 0.00 5.33
improvement_surcharge total_amount u
1 0.3 41.2 0.0191
2 0.3 13.8 0.0229
3 0.3 21.4 0.0399
4 0.3 40.3 0.0055
5 0.3 4.3 0.0497
6 0.3 33.1 0.0383
head(nyc_taxi, n = 3) # show me the first 10 rows
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
1 2 2015-01-15 19:05:40 2015-01-15 19:28:18 5 8.33
2 2 2015-01-25 00:13:06 2015-01-25 00:24:51 1 3.37
3 2 2015-01-25 00:13:08 2015-01-25 00:34:57 1 3.72
pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
1 -73.9 40.8 1 N -74
2 -73.9 40.8 1 N -74
3 -74.0 40.8 1 N -74
dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
1 40.8 1 26.0 1.0 0.5 8.08 5.33
2 40.8 1 12.5 0.5 0.5 0.00 0.00
3 40.7 1 16.5 0.5 0.5 3.56 0.00
improvement_surcharge total_amount u
1 0.3 41.2 0.0191
2 0.3 13.8 0.0229
3 0.3 21.4 0.0399
tail(nyc_taxi) # show me the last few rows
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
3852357 1 2015-06-30 21:53:55 2015-06-30 22:13:39 1 3.80
3852358 1 2015-06-30 21:54:22 2015-06-30 22:05:10 1 1.00
3852359 1 2015-06-30 21:54:05 2015-06-30 22:15:41 1 8.60
3852360 2 2015-06-30 21:54:15 2015-06-30 22:11:22 1 4.96
3852361 1 2015-06-30 21:54:18 2015-06-30 22:11:42 2 3.50
3852362 2 2015-06-30 21:54:18 2015-06-30 22:07:42 1 1.63
pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
3852357 -74.0 40.8 1 N -73.9
3852358 -74.0 40.8 1 N -74.0
3852359 -73.9 40.8 1 N -74.0
3852360 -73.9 40.7 1 N -74.0
3852361 -74.0 40.8 1 N -74.0
3852362 -74.0 40.8 1 N -74.0
dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
3852357 40.8 1 16.0 0.5 0.5 3.45 0
3852358 40.8 1 8.5 0.5 0.5 2.90 0
3852359 40.7 2 27.0 0.5 0.5 0.00 0
3852360 40.7 2 17.5 0.5 0.5 0.00 0
3852361 40.7 1 14.5 0.5 0.5 3.15 0
3852362 40.8 1 9.5 0.5 0.5 2.70 0
improvement_surcharge total_amount u
3852357 0.3 20.8 0.03868
3852358 0.3 12.7 0.03120
3852359 0.3 28.3 0.00657
3852360 0.3 18.8 0.04165
3852361 0.3 18.9 0.02368
3852362 0.3 13.5 0.01578
basic_info <- list(
class = class(nyc_taxi), # shows the type of the data: `data.frame`
type = typeof(nyc_taxi), # shows that a `data.frame` is fundamentally a `list` object
nrow = nrow(nyc_taxi), # number of rows
ncol = ncol(nyc_taxi), # number of columns
colnames = names(nyc_taxi))
basic_info
$class
[1] "data.frame"
$type
[1] "list"
$nrow
[1] 3852362
$ncol
[1] 20
$colnames
[1] "VendorID" "tpep_pickup_datetime" "tpep_dropoff_datetime"
[4] "passenger_count" "trip_distance" "pickup_longitude"
[7] "pickup_latitude" "RateCodeID" "store_and_fwd_flag"
[10] "dropoff_longitude" "dropoff_latitude" "payment_type"
[13] "fare_amount" "extra" "mta_tax"
[16] "tip_amount" "tolls_amount" "improvement_surcharge"
[19] "total_amount" "u"
names(nyc_taxi)[8] <- 'rate_code_id' # rename column `RateCodeID` to `rate_code_id`
We use str
to look at column types in the data: the most common column types are integer
, numeric
(for floats), character
(for strings), factor
(for categorical data). Less common column types exist, such as date, time, and datetime formats.
str(nyc_taxi)
'data.frame': 3852362 obs. of 20 variables:
$ VendorID : int 2 2 2 2 2 2 2 1 2 2 ...
$ tpep_pickup_datetime : chr "2015-01-15 19:05:40" "2015-01-25 00:13:06" "2015-01-25 00:13:08" "2015-01-25 00:13:09" ...
$ tpep_dropoff_datetime: chr "2015-01-15 19:28:18" "2015-01-25 00:24:51" "2015-01-25 00:34:57" "2015-01-25 01:02:40" ...
$ passenger_count : int 5 1 1 1 1 1 1 1 1 1 ...
$ trip_distance : num 8.33 3.37 3.72 10.2 0.36 8.98 1.56 1.5 1.39 15.2 ...
$ pickup_longitude : num -73.9 -73.9 -74 -74 -74 ...
$ pickup_latitude : num 40.8 40.8 40.8 40.8 40.8 ...
$ rate_code_id : int 1 1 1 1 1 1 1 1 1 2 ...
$ store_and_fwd_flag : chr "N" "N" "N" "N" ...
$ dropoff_longitude : num -74 -74 -74 -73.9 -74 ...
$ dropoff_latitude : num 40.8 40.8 40.7 40.7 40.8 ...
$ payment_type : int 1 1 1 2 2 1 1 1 2 1 ...
$ fare_amount : num 26 12.5 16.5 39 3.5 27 7 8 7.5 52 ...
$ extra : num 1 0.5 0.5 0.5 0 0 0 0 0 0 ...
$ mta_tax : num 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
$ tip_amount : num 8.08 0 3.56 0 0 ...
$ tolls_amount : num 5.33 0 0 0 0 5.33 0 0 0 5.33 ...
$ improvement_surcharge: num 0.3 0.3 0.3 0.3 0.3 0.3 0.3 0 0.3 0.3 ...
$ total_amount : num 41.2 13.8 21.4 40.3 4.3 ...
$ u : num 0.0191 0.0229 0.0399 0.0055 0.0497 ...
Now let's see how we can subset or slice the data: in other words. Since a data.frame
is a 2-dimensional object, we can slice by asking for specific rows or columns of the data. The notation we use here (which we refer to as the bracket notation) is as follows:
data[rows_to_slice, columns_to_slice]
As we will see, we can be very flexible in what we choose for rows_to_slice
and columns_to_slice
. For example,
- we can provide numeric indexes corresponding to row numbers or column positions
- we can (and should) specify the column names instead of column positions
- we can provide functions that return integers corresponding to the row indexes we want to return
- we can provide functions that return the column names we want to return
- we can have conditional statements or functions that return
TRUE
andFALSE
for each row or column, so that only cases that areTRUE
are returned
We will encounter examples for each case.
nyc_taxi[1:5, 1:4] # rows 1 through 5, columns 1 through 4
VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count
1 2 2015-01-15 19:05:40 2015-01-15 19:28:18 5
2 2 2015-01-25 00:13:06 2015-01-25 00:24:51 1
3 2 2015-01-25 00:13:08 2015-01-25 00:34:57 1
4 2 2015-01-25 00:13:09 2015-01-25 01:02:40 1
5 2 2015-01-04 13:44:52 2015-01-04 13:46:38 1
nyc_taxi[1:5, -(1:4)] # rows 1 through 5, except columns 1 through 4
trip_distance pickup_longitude pickup_latitude rate_code_id store_and_fwd_flag
1 8.33 -73.9 40.8 1 N
2 3.37 -73.9 40.8 1 N
3 3.72 -74.0 40.8 1 N
4 10.20 -74.0 40.8 1 N
5 0.36 -74.0 40.8 1 N
dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax
1 -74.0 40.8 1 26.0 1.0 0.5
2 -74.0 40.8 1 12.5 0.5 0.5
3 -74.0 40.7 1 16.5 0.5 0.5
4 -73.9 40.7 2 39.0 0.5 0.5
5 -74.0 40.8 2 3.5 0.0 0.5
tip_amount tolls_amount improvement_surcharge total_amount u
1 8.08 5.33 0.3 41.2 0.0191
2 0.00 0.00 0.3 13.8 0.0229
3 3.56 0.00 0.3 21.4 0.0399
4 0.00 0.00 0.3 40.3 0.0055
5 0.00 0.00 0.3 4.3 0.0497
nyc.first.ten <- nyc_taxi[1:10, ] # store the first 10 rows and all columns in a new `data.frame` called `nyc.first.ten`
So far our data slices have been limited to adjacent rows and adjacent columns. Here's an example of how to slice the data for non-adjacent rows. It is also far more common to select columns by their names instead of their position (also called numeric index), since this makes the code more readable and won't break the code if column positions change.
nyc_taxi[c(2, 3, 8, 66), c("fare_amount", "mta_tax", "tip_amount", "tolls_amount")]
fare_amount mta_tax tip_amount tolls_amount
2 12.5 0.5 0.00 0
3 16.5 0.5 3.56 0
8 8.0 0.5 1.75 0
66 8.0 0.5 1.50 0