Basic queries

Let's begin the data exploration. Each of the functions below return some useful information about the data.

head(nyc_taxi) # show me the first few rows

  VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
1        2  2015-01-15 19:05:40   2015-01-15 19:28:18               5          8.33
2        2  2015-01-25 00:13:06   2015-01-25 00:24:51               1          3.37
3        2  2015-01-25 00:13:08   2015-01-25 00:34:57               1          3.72
4        2  2015-01-25 00:13:09   2015-01-25 01:02:40               1         10.20
5        2  2015-01-04 13:44:52   2015-01-04 13:46:38               1          0.36
6        2  2015-01-04 13:44:52   2015-01-04 14:04:23               1          8.98
  pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
1            -73.9            40.8          1                  N             -74.0
2            -73.9            40.8          1                  N             -74.0
3            -74.0            40.8          1                  N             -74.0
4            -74.0            40.8          1                  N             -73.9
5            -74.0            40.8          1                  N             -74.0
6            -73.9            40.8          1                  N             -74.0
  dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
1             40.8            1        26.0   1.0     0.5       8.08         5.33
2             40.8            1        12.5   0.5     0.5       0.00         0.00
3             40.7            1        16.5   0.5     0.5       3.56         0.00
4             40.7            2        39.0   0.5     0.5       0.00         0.00
5             40.8            2         3.5   0.0     0.5       0.00         0.00
6             40.8            1        27.0   0.0     0.5       0.00         5.33
  improvement_surcharge total_amount      u
1                   0.3         41.2 0.0191
2                   0.3         13.8 0.0229
3                   0.3         21.4 0.0399
4                   0.3         40.3 0.0055
5                   0.3          4.3 0.0497
6                   0.3         33.1 0.0383

head(nyc_taxi, n = 3) # show me the first 10 rows

  VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
1        2  2015-01-15 19:05:40   2015-01-15 19:28:18               5          8.33
2        2  2015-01-25 00:13:06   2015-01-25 00:24:51               1          3.37
3        2  2015-01-25 00:13:08   2015-01-25 00:34:57               1          3.72
  pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
1            -73.9            40.8          1                  N               -74
2            -73.9            40.8          1                  N               -74
3            -74.0            40.8          1                  N               -74
  dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
1             40.8            1        26.0   1.0     0.5       8.08         5.33
2             40.8            1        12.5   0.5     0.5       0.00         0.00
3             40.7            1        16.5   0.5     0.5       3.56         0.00
  improvement_surcharge total_amount      u
1                   0.3         41.2 0.0191
2                   0.3         13.8 0.0229
3                   0.3         21.4 0.0399

tail(nyc_taxi) # show me the last few rows

        VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count trip_distance
3852357        1  2015-06-30 21:53:55   2015-06-30 22:13:39               1          3.80
3852358        1  2015-06-30 21:54:22   2015-06-30 22:05:10               1          1.00
3852359        1  2015-06-30 21:54:05   2015-06-30 22:15:41               1          8.60
3852360        2  2015-06-30 21:54:15   2015-06-30 22:11:22               1          4.96
3852361        1  2015-06-30 21:54:18   2015-06-30 22:11:42               2          3.50
3852362        2  2015-06-30 21:54:18   2015-06-30 22:07:42               1          1.63
        pickup_longitude pickup_latitude RateCodeID store_and_fwd_flag dropoff_longitude
3852357            -74.0            40.8          1                  N             -73.9
3852358            -74.0            40.8          1                  N             -74.0
3852359            -73.9            40.8          1                  N             -74.0
3852360            -73.9            40.7          1                  N             -74.0
3852361            -74.0            40.8          1                  N             -74.0
3852362            -74.0            40.8          1                  N             -74.0
        dropoff_latitude payment_type fare_amount extra mta_tax tip_amount tolls_amount
3852357             40.8            1        16.0   0.5     0.5       3.45            0
3852358             40.8            1         8.5   0.5     0.5       2.90            0
3852359             40.7            2        27.0   0.5     0.5       0.00            0
3852360             40.7            2        17.5   0.5     0.5       0.00            0
3852361             40.7            1        14.5   0.5     0.5       3.15            0
3852362             40.8            1         9.5   0.5     0.5       2.70            0
        improvement_surcharge total_amount       u
3852357                   0.3         20.8 0.03868
3852358                   0.3         12.7 0.03120
3852359                   0.3         28.3 0.00657
3852360                   0.3         18.8 0.04165
3852361                   0.3         18.9 0.02368
3852362                   0.3         13.5 0.01578

basic_info <- list(
    class = class(nyc_taxi), # shows the type of the data: `data.frame`
    type = typeof(nyc_taxi), # shows that a `data.frame` is fundamentally a `list` object
    nrow = nrow(nyc_taxi), # number of rows
    ncol = ncol(nyc_taxi), # number of columns
    colnames = names(nyc_taxi))

basic_info

$class
[1] "data.frame"

$type
[1] "list"

$nrow
[1] 3852362

$ncol
[1] 20

$colnames
 [1] "VendorID"              "tpep_pickup_datetime"  "tpep_dropoff_datetime"
 [4] "passenger_count"       "trip_distance"         "pickup_longitude"     
 [7] "pickup_latitude"       "RateCodeID"            "store_and_fwd_flag"   
[10] "dropoff_longitude"     "dropoff_latitude"      "payment_type"         
[13] "fare_amount"           "extra"                 "mta_tax"              
[16] "tip_amount"            "tolls_amount"          "improvement_surcharge"
[19] "total_amount"          "u"

names(nyc_taxi)[8] <- 'rate_code_id' # rename column `RateCodeID` to `rate_code_id`

We use str to look at column types in the data: the most common column types are integer, numeric (for floats), character (for strings), factor (for categorical data). Less common column types exist, such as date, time, and datetime formats.

str(nyc_taxi)

'data.frame':    3852362 obs. of  20 variables:
 $ VendorID             : int  2 2 2 2 2 2 2 1 2 2 ...
 $ tpep_pickup_datetime : chr  "2015-01-15 19:05:40" "2015-01-25 00:13:06" "2015-01-25 00:13:08" "2015-01-25 00:13:09" ...
 $ tpep_dropoff_datetime: chr  "2015-01-15 19:28:18" "2015-01-25 00:24:51" "2015-01-25 00:34:57" "2015-01-25 01:02:40" ...
 $ passenger_count      : int  5 1 1 1 1 1 1 1 1 1 ...
 $ trip_distance        : num  8.33 3.37 3.72 10.2 0.36 8.98 1.56 1.5 1.39 15.2 ...
 $ pickup_longitude     : num  -73.9 -73.9 -74 -74 -74 ...
 $ pickup_latitude      : num  40.8 40.8 40.8 40.8 40.8 ...
 $ rate_code_id         : int  1 1 1 1 1 1 1 1 1 2 ...
 $ store_and_fwd_flag   : chr  "N" "N" "N" "N" ...
 $ dropoff_longitude    : num  -74 -74 -74 -73.9 -74 ...
 $ dropoff_latitude     : num  40.8 40.8 40.7 40.7 40.8 ...
 $ payment_type         : int  1 1 1 2 2 1 1 1 2 1 ...
 $ fare_amount          : num  26 12.5 16.5 39 3.5 27 7 8 7.5 52 ...
 $ extra                : num  1 0.5 0.5 0.5 0 0 0 0 0 0 ...
 $ mta_tax              : num  0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 0.5 ...
 $ tip_amount           : num  8.08 0 3.56 0 0 ...
 $ tolls_amount         : num  5.33 0 0 0 0 5.33 0 0 0 5.33 ...
 $ improvement_surcharge: num  0.3 0.3 0.3 0.3 0.3 0.3 0.3 0 0.3 0.3 ...
 $ total_amount         : num  41.2 13.8 21.4 40.3 4.3 ...
 $ u                    : num  0.0191 0.0229 0.0399 0.0055 0.0497 ...

Now let's see how we can subset or slice the data: in other words. Since a data.frame is a 2-dimensional object, we can slice by asking for specific rows or columns of the data. The notation we use here (which we refer to as the bracket notation) is as follows:

data[rows_to_slice, columns_to_slice]

As we will see, we can be very flexible in what we choose for rows_to_slice and columns_to_slice. For example,

we can provide numeric indexes corresponding to row numbers or column positions
we can (and should) specify the column names instead of column positions
we can provide functions that return integers corresponding to the row indexes we want to return
we can provide functions that return the column names we want to return
we can have conditional statements or functions that return TRUE and FALSE for each row or column, so that only cases that are TRUE are returned

We will encounter examples for each case.

nyc_taxi[1:5, 1:4] # rows 1 through 5, columns 1 through 4

  VendorID tpep_pickup_datetime tpep_dropoff_datetime passenger_count
1        2  2015-01-15 19:05:40   2015-01-15 19:28:18               5
2        2  2015-01-25 00:13:06   2015-01-25 00:24:51               1
3        2  2015-01-25 00:13:08   2015-01-25 00:34:57               1
4        2  2015-01-25 00:13:09   2015-01-25 01:02:40               1
5        2  2015-01-04 13:44:52   2015-01-04 13:46:38               1

nyc_taxi[1:5, -(1:4)] # rows 1 through 5, except columns 1 through 4

  trip_distance pickup_longitude pickup_latitude rate_code_id store_and_fwd_flag
1          8.33            -73.9            40.8            1                  N
2          3.37            -73.9            40.8            1                  N
3          3.72            -74.0            40.8            1                  N
4         10.20            -74.0            40.8            1                  N
5          0.36            -74.0            40.8            1                  N
  dropoff_longitude dropoff_latitude payment_type fare_amount extra mta_tax
1             -74.0             40.8            1        26.0   1.0     0.5
2             -74.0             40.8            1        12.5   0.5     0.5
3             -74.0             40.7            1        16.5   0.5     0.5
4             -73.9             40.7            2        39.0   0.5     0.5
5             -74.0             40.8            2         3.5   0.0     0.5
  tip_amount tolls_amount improvement_surcharge total_amount      u
1       8.08         5.33                   0.3         41.2 0.0191
2       0.00         0.00                   0.3         13.8 0.0229
3       3.56         0.00                   0.3         21.4 0.0399
4       0.00         0.00                   0.3         40.3 0.0055
5       0.00         0.00                   0.3          4.3 0.0497

nyc.first.ten <- nyc_taxi[1:10, ] # store the first 10 rows and all columns in a new `data.frame` called `nyc.first.ten`

So far our data slices have been limited to adjacent rows and adjacent columns. Here's an example of how to slice the data for non-adjacent rows. It is also far more common to select columns by their names instead of their position (also called numeric index), since this makes the code more readable and won't break the code if column positions change.

nyc_taxi[c(2, 3, 8, 66), c("fare_amount", "mta_tax", "tip_amount", "tolls_amount")]

   fare_amount mta_tax tip_amount tolls_amount
2         12.5     0.5       0.00            0
3         16.5     0.5       3.56            0
8          8.0     0.5       1.75            0
66         8.0     0.5       1.50            0

3.1 Basic queries

Basic queries

results matching ""

No results matching ""