February 6, 2018 · R
A Day With R
Datatypes
- A vector is always homogeneous.
- A list can be heterogeneous.
- A matrix is a homogeneous vector with dimensions.
- A dataframe is a list of equal-length vectors (its columns).
# Scalars in R are just length-1 vectors; the L suffix makes an integer.
stockprice <- 1900L
class(stockprice)  # "integer"

# Parallel vectors: stock names (character) and prices (integer).
stocks <- c("HEG", "Graphite", "Bajaj", "L&T")
prices <- c(1900L, 650L, 500L, 1380L)

# Label each stock with its sector; names may repeat.
names(stocks) <- c("Carbon", "Carbon", "Auto", "Infra")

# Logicals: always spell out TRUE/FALSE (never T/F, which are reassignable).
isAuto <- TRUE
isAuto <- FALSE

# Use <- for assignment (not =); factor() encodes the repeated sector labels.
data <- factor(names(stocks))

# 1:10 already yields a vector; wrapping it in c() is redundant.
mat1 <- matrix(1:10, nrow = 2)
mat1 <- matrix(1:40, nrow = 4)

# Three ways to hold the same numbers:
p <- c(6, 8, 10)         # numeric vector of 3 elements
p1 <- list(6, 8, 10)     # list of 3 one-element vectors
p2 <- list(c(6, 8, 10))  # list with 1 element: a 3-element vector
Vector of numeric values
p <- c(6,8,10)
> p
[1] 6 8 10
>
> for (i in 1:length(p)) {
+ print(p[[i]]*2)
+ }
[1] 12
[1] 16
[1] 20
>
A list of 3 one-element vectors
p1 <- list(6,8,10)
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> for (each in p1) {
+ print(each*2)
+ }
[1] 12
[1] 16
[1] 20
>
A list with a single element, which is itself a 3-element vector
p2 <- list(c(6,8,10))
> p2
[[1]]
[1] 6 8 10
> for (i in 1:length(p2[[1]])) {
+ print(p2[[1]][i]*2)
+ }
[1] 12
[1] 16
[1] 20
>
lapply
- output is always a list
> mul <- function(x){x*2}
> p
[1] 6 8 10
> lapply(p, mul)
[[1]]
[1] 12
[[2]]
[1] 16
[[3]]
[1] 20
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> lapply(p1, mul)
[[1]]
[1] 12
[[2]]
[1] 16
[[3]]
[1] 20
> unlist(lapply(p1, mul))
[1] 12 16 20
sapply
- Good for reporting.
- Needs high level of confidence in data structures used.
- Output can be vector or matrix or list
> mul <- function(x){x*2}
> p
[1] 6 8 10
> sapply(p, mul)
[1] 12 16 20
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> sapply(p1, mul)
[1] 12 16 20
> p2
[[1]]
[1] 6 8 10
> sapply(p2, mul)
[,1]
[1,] 12
[2,] 16
[3,] 20
vapply
- Verify apply
- Verifies if the return type of the function matches with the last arg
> mul <- function(x){x*2}
> p
[1] 6 8 10
> vapply(p, mul, numeric(1))
[1] 12 16 20
> p
[1] 6 8 10
> vapply(p, mul, numeric(2))
Error in vapply(p, mul, numeric(2)) : values must be length 2,
but FUN(X[[1]]) result is length 1
> p
[1] 6 8 10
> vapply(p, mul, character(1))
Error in vapply(p, mul, character(1)) : values must be type 'character',
but FUN(X[[1]]) result is type 'double'
>
tapply
- split, apply, combine
- group_by and for each group do a task
- outputs an array
- tapply can have only one y variable, multiple x variables, e.g. mean sepal length for multiple species.
- aggregate can also be used for the same purpose
- aggregate gives data frame as output
- aggregate can have multiple x and multiple y variables, e.g. sales and profit numbers for multiple companies.
Exercise #1
- Q - From a given list of names print the min and max char in every name in a report format.
- The below also demonstrates the difference between lapply and sapply.
> names <- c("Ankur", "Ganesan", "Bhargavi", "pankaj", "Vivek", "Srikrishnan")
> firstlast <- function(x) {
+ s <- strsplit(x, "")[[1]]
+ return(c(first=min(s), last=max(s)))
+
+ }
> lapply(names, firstlast)
[[1]]
first last
"A" "u"
[[2]]
first last
"G" "s"
[[3]]
first last
"B" "v"
[[4]]
first last
"a" "p"
[[5]]
first last
"V" "v"
[[6]]
first last
"S" "s"
> class(lapply(names, firstlast))
[1] "list"
>
> sapply(names, firstlast)
Ankur Ganesan Bhargavi pankaj Vivek Srikrishnan
first "A" "G" "B" "a" "V" "S"
last "u" "s" "v" "p" "v" "s"
>
> class(sapply(names, firstlast))
[1] "matrix"
Exercise #2
- Q - from a given list of names print only the unique characters per name
> names <- c("ankur", "ganesan", "bhargavi", "pankaj", "vivek", "srikrishnan")
> uniquenames <- function(x) {
+ s <- strsplit(x, "")[[1]]
+ return(unique(s))
+ }
> sapply(names, uniquenames)
$ankur
[1] "a" "n" "k" "u" "r"
$ganesan
[1] "g" "a" "n" "e" "s"
$bhargavi
[1] "b" "h" "a" "r" "g" "v" "i"
$pankaj
[1] "p" "a" "n" "k" "j"
$vivek
[1] "v" "i" "e" "k"
$srikrishnan
[1] "s" "r" "i" "k" "h" "n" "a"
>
Tidy the data
- We use the reshape2 and tidyr libraries here. reshape2 provides the melt
  method, which helps convert data from a report format to a more
  program-friendly format. tidyr provides the gather API, which does
  exactly the same thing as melt.
- In the example below we have religions as rows and salary ranges as columns.
- To convert it to tidy data we need to stack religion:salary_range as rows.
- Tidy data allows for easy operation on the dataset - like groupby, filter etc.
> setwd("/Users/vvb/work/projects/datascience")
> getwd()
[1] "/Users/vvb/work/projects/datascience"
> rawdata <- read.csv("pew.csv", check.names = F, encoding = "UTF-8")
> head(rawdata)
religion <$10k $10-20k $20-30k $30-40k $40-50k $50-75k $75-100k $100-150k >150k Don't know/refused
1 Agnostic 27 34 60 81 76 137 122 109 84 96
2 Atheist 12 27 37 52 35 70 73 59 74 76
3 Buddhist 27 21 30 34 33 58 62 39 53 54
4 Catholic 418 617 732 670 638 1116 949 792 633 1489
5 Don’t know/refused 15 14 15 11 10 35 21 17 18 116
6 Evangelical Prot 575 869 1064 982 881 1486 949 723 414 1529
> library(reshape2)
> library(tidyr)
> tidydata <- rawdata %>%
+ melt(id= "religion",
+ variable.name = "salary",
+ value.name = "count")
> head(tidydata)
religion salary count
1 Agnostic <$10k 27
2 Atheist <$10k 12
3 Buddhist <$10k 27
4 Catholic <$10k 418
5 Don’t know/refused <$10k 15
6 Evangelical Prot <$10k 575
> tidydata2 <- rawdata %>%
+ gather(2:11,
+ key = "salary",
+ value = "count")
> head(tidydata2)
religion salary count
1 Agnostic <$10k 27
2 Atheist <$10k 12
3 Buddhist <$10k 27
4 Catholic <$10k 418
5 Don’t know/refused <$10k 15
6 Evangelical Prot <$10k 575
>
- Below is the min and max temperature data for various (year, month, day) pairs.
- We need to transform this to a year:month:day-as-rows format.
- Note that id can be a list of columns — c("year", "month", "element"),
  where element is tmax/tmin.
> rawdata1 <- read.delim("weather.txt", check.names = F, na.strings = '.')
> head(rawdata1)
year month element 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1 2010 1 tmax NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 278 NA
2 2010 1 tmin NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 145 NA
3 2010 2 tmax NA 273 241 NA NA NA NA NA NA NA 297 NA NA NA NA NA NA NA NA NA NA NA 299 NA NA NA NA NA NA NA NA
4 2010 2 tmin NA 144 144 NA NA NA NA NA NA NA 134 NA NA NA NA NA NA NA NA NA NA NA 107 NA NA NA NA NA NA NA NA
5 2010 3 tmax NA NA NA NA 321 NA NA NA NA 345 NA NA NA NA NA 311 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 2010 3 tmin NA NA NA NA 142 NA NA NA NA 168 NA NA NA NA NA 176 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
> str(rawdata1)
'data.frame': 22 obs. of 34 variables:
$ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
$ month : int 1 1 2 2 3 3 4 4 5 5 ...
$ element: Factor w/ 2 levels "tmax","tmin": 1 2 1 2 1 2 1 2 1 2 ...
$ 1 : int NA NA NA NA NA NA NA NA NA NA ...
$ 2 : int NA NA 273 144 NA NA NA NA NA NA ...
$ 3 : int NA NA 241 144 NA NA NA NA NA NA ...
$ 4 : int NA NA NA NA NA NA NA NA NA NA ...
$ 5 : int NA NA NA NA 321 142 NA NA NA NA ...
$ 6 : int NA NA NA NA NA NA NA NA NA NA ...
$ 7 : int NA NA NA NA NA NA NA NA NA NA ...
$ 8 : int NA NA NA NA NA NA NA NA NA NA ...
$ 9 : logi NA NA NA NA NA NA ...
$ 10 : int NA NA NA NA 345 168 NA NA NA NA ...
$ 11 : int NA NA 297 134 NA NA NA NA NA NA ...
$ 12 : logi NA NA NA NA NA NA ...
$ 13 : int NA NA NA NA NA NA NA NA NA NA ...
$ 14 : int NA NA NA NA NA NA NA NA NA NA ...
$ 15 : int NA NA NA NA NA NA NA NA NA NA ...
$ 16 : int NA NA NA NA 311 176 NA NA NA NA ...
$ 17 : int NA NA NA NA NA NA NA NA NA NA ...
$ 18 : logi NA NA NA NA NA NA ...
$ 19 : logi NA NA NA NA NA NA ...
$ 20 : logi NA NA NA NA NA NA ...
$ 21 : logi NA NA NA NA NA NA ...
$ 22 : logi NA NA NA NA NA NA ...
$ 23 : int NA NA 299 107 NA NA NA NA NA NA ...
$ 24 : logi NA NA NA NA NA NA ...
$ 25 : int NA NA NA NA NA NA NA NA NA NA ...
$ 26 : int NA NA NA NA NA NA NA NA NA NA ...
$ 27 : int NA NA NA NA NA NA 363 167 332 182 ...
$ 28 : int NA NA NA NA NA NA NA NA NA NA ...
$ 29 : int NA NA NA NA NA NA NA NA NA NA ...
$ 30 : int 278 145 NA NA NA NA NA NA NA NA ...
$ 31 : int NA NA NA NA NA NA NA NA NA NA ...
> tidydata3 <- rawdata1 %>% melt(id = c("year", "month", "element"),
+ variable.name = "day",
+ value.name = "temp")
> head(tidydata3)
year month element day temp
1 2010 1 tmax 1 NA
2 2010 1 tmin 1 NA
3 2010 2 tmax 1 NA
4 2010 2 tmin 1 NA
5 2010 3 tmax 1 NA
6 2010 3 tmin 1 NA
> tidydata4 <- rawdata1 %>% melt(id = c("year", "month", "element"),
+ variable.name = "day",
+ value.name = "temp",
+ na.rm = TRUE)
> head(tidydata4)
year month element day temp
21 2010 12 tmax 1 299
22 2010 12 tmin 1 138
25 2010 2 tmax 2 273
26 2010 2 tmin 2 144
41 2010 11 tmax 2 313
42 2010 11 tmin 2 163
> tidydata5 <- rawdata1 %>% gather(4:34,
+ key = "day",
+ value = "temp")
> head(tidydata5)
year month element day temp
1 2010 1 tmax 1 NA
2 2010 1 tmin 1 NA
3 2010 2 tmax 1 NA
4 2010 2 tmin 1 NA
5 2010 3 tmax 1 NA
6 2010 3 tmin 1 NA
> tidydata6 <- rawdata1 %>% gather(4:34,
+ key = "day",
+ value = "temp",
+ na.rm = TRUE)
> head(tidydata6)
year month element day temp
21 2010 12 tmax 1 299
22 2010 12 tmin 1 138
25 2010 2 tmax 2 273
26 2010 2 tmin 2 144
41 2010 11 tmax 2 313
42 2010 11 tmin 2 163
>
dcast and spread
> # transform the data to a format that dcast can use
> tidydata7 <- tidydata6[,c("year", "month", "day", "element", "temp")]
> head(tidydata7)
year month day element temp
21 2010 12 1 tmax 299
22 2010 12 1 tmin 138
25 2010 2 2 tmax 273
26 2010 2 2 tmin 144
41 2010 11 2 tmax 313
42 2010 11 2 tmin 163
> # use element values as column names, pick the values from "temp"
> tidydata8 <- dcast(year + month + day ~ element, data = tidydata7, value.var = "temp")
> head(tidydata8)
year month day tmax tmin
1 2010 1 30 278 145
2 2010 2 11 297 134
3 2010 2 2 273 144
4 2010 2 23 299 107
5 2010 2 3 241 144
6 2010 3 10 345 168
> # do the same using spread - spread element as columns and use values from temp
> tidydata9 <- spread(tidydata6, element, temp)
> head(tidydata9)
year month day tmax tmin
1 2010 1 30 278 145
2 2010 2 11 297 134
3 2010 2 2 273 144
4 2010 2 23 299 107
5 2010 2 3 241 144
6 2010 3 10 345 168
> # adding a new column which shows difference in temperature
> tidydata8$tdiff <- tidydata8$tmax - tidydata8$tmin
> head(tidydata8)
year month day tmax tmin tdiff
1 2010 1 30 278 145 133
2 2010 2 11 297 134 163
3 2010 2 2 273 144 129
4 2010 2 23 299 107 192
5 2010 2 3 241 144 97
6 2010 3 10 345 168 177
>
>
Analyze the titanic shipwreck data to answer the following,
- For each group of Class+Age+Gender what is the survival rate?
- To answer this question, we need to transform the given data
> rawdata2 <- read.csv("shipwreck.csv", check.names = F, encoding = "UTF-8")
> rawdata2
class age fate male female
1 1st adult perished 118 4
2 1st adult survived 57 140
3 1st child perished 0 0
4 1st child survived 5 1
5 2nd adult perished 154 13
6 2nd adult survived 14 80
7 2nd child perished 0 0
8 2nd child survived 11 13
9 3rd adult perished 387 89
10 3rd adult survived 75 76
11 3rd child perished 35 17
12 3rd child survived 13 14
13 Crew adult perished 670 3
14 Crew adult survived 192 20
15 Crew child perished 0 0
16 Crew child survived 0 0
> # Q - For each group of Class+Age+Gender what is the survival rate?
> tidydata10 <- rawdata2 %>% gather(4:5,
+ key = "gender",
+ value = "count")
> tidydata10
class age fate gender count
1 1st adult perished male 118
2 1st adult survived male 57
3 1st child perished male 0
4 1st child survived male 5
5 2nd adult perished male 154
6 2nd adult survived male 14
7 2nd child perished male 0
8 2nd child survived male 11
9 3rd adult perished male 387
10 3rd adult survived male 75
11 3rd child perished male 35
12 3rd child survived male 13
13 Crew adult perished male 670
14 Crew adult survived male 192
15 Crew child perished male 0
16 Crew child survived male 0
17 1st adult perished female 4
18 1st adult survived female 140
19 1st child perished female 0
20 1st child survived female 1
21 2nd adult perished female 13
22 2nd adult survived female 80
23 2nd child perished female 0
24 2nd child survived female 13
25 3rd adult perished female 89
26 3rd adult survived female 76
27 3rd child perished female 17
28 3rd child survived female 14
29 Crew adult perished female 3
30 Crew adult survived female 20
31 Crew child perished female 0
32 Crew child survived female 0
> tidydata10 <- tidydata10 %>% spread(fate, count)
> tidydata10
class age gender perished survived
1 1st adult female 4 140
2 1st adult male 118 57
3 1st child female 0 1
4 1st child male 0 5
5 2nd adult female 13 80
6 2nd adult male 154 14
7 2nd child female 0 13
8 2nd child male 0 11
9 3rd adult female 89 76
10 3rd adult male 387 75
11 3rd child female 17 14
12 3rd child male 35 13
13 Crew adult female 3 20
14 Crew adult male 670 192
15 Crew child female 0 0
16 Crew child male 0 0
>
dplyr
- filter
- arrange
- mutate
- summarize
- select
- group_by
> # filter data for Jan
> head(flights)
# A tibble: 6 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00 0
6 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
# ... with 1 more variable: time_hour <dttm>
> flights %>% filter(month==1)
# A tibble: 27,004 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00 0
6 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
7 2013 1 1 555 600 -5.00 913 854 19.0 B6 507 N516JB EWR FLL 158 1065 6.00 0
8 2013 1 1 557 600 -3.00 709 723 -14.0 EV 5708 N829AS LGA IAD 53.0 229 6.00 0
9 2013 1 1 557 600 -3.00 838 846 - 8.00 B6 79 N593JB JFK MCO 140 944 6.00 0
10 2013 1 1 558 600 -2.00 753 745 8.00 AA 301 N3ALAA LGA ORD 138 733 6.00 0
# ... with 26,994 more rows, and 1 more variable: time_hour <dttm>
> # filter data for Nov and Dec
> flights %>% filter(month==11 | month == 12)
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 11 1 5 2359 6.00 352 345 7.00 B6 745 N568JB JFK PSE 205 1617 23.0 59.0
2 2013 11 1 35 2250 105 123 2356 87.0 B6 1816 N353JB JFK SYR 36.0 209 22.0 50.0
3 2013 11 1 455 500 - 5.00 641 651 -10.0 US 1895 N192UW EWR CLT 88.0 529 5.00 0
4 2013 11 1 539 545 - 6.00 856 827 29.0 UA 1714 N38727 LGA IAH 229 1416 5.00 45.0
5 2013 11 1 542 545 - 3.00 831 855 -24.0 AA 2243 N5CLAA JFK MIA 147 1089 5.00 45.0
6 2013 11 1 549 600 - 11.0 912 923 -11.0 UA 303 N595UA JFK SFO 359 2586 6.00 0
7 2013 11 1 550 600 - 10.0 705 659 6.00 US 2167 N748UW LGA DCA 57.0 214 6.00 0
8 2013 11 1 554 600 - 6.00 659 701 - 2.00 US 2134 N742PS LGA BOS 40.0 184 6.00 0
9 2013 11 1 554 600 - 6.00 826 827 - 1.00 DL 563 N912DE LGA ATL 126 762 6.00 0
10 2013 11 1 554 600 - 6.00 749 751 - 2.00 DL 731 N315NB LGA DTW 93.0 502 6.00 0
# ... with 55,393 more rows, and 1 more variable: time_hour <dttm>
> flights %>% filter(month %in% c(11,12))
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 11 1 5 2359 6.00 352 345 7.00 B6 745 N568JB JFK PSE 205 1617 23.0 59.0
2 2013 11 1 35 2250 105 123 2356 87.0 B6 1816 N353JB JFK SYR 36.0 209 22.0 50.0
3 2013 11 1 455 500 - 5.00 641 651 -10.0 US 1895 N192UW EWR CLT 88.0 529 5.00 0
4 2013 11 1 539 545 - 6.00 856 827 29.0 UA 1714 N38727 LGA IAH 229 1416 5.00 45.0
5 2013 11 1 542 545 - 3.00 831 855 -24.0 AA 2243 N5CLAA JFK MIA 147 1089 5.00 45.0
6 2013 11 1 549 600 - 11.0 912 923 -11.0 UA 303 N595UA JFK SFO 359 2586 6.00 0
7 2013 11 1 550 600 - 10.0 705 659 6.00 US 2167 N748UW LGA DCA 57.0 214 6.00 0
8 2013 11 1 554 600 - 6.00 659 701 - 2.00 US 2134 N742PS LGA BOS 40.0 184 6.00 0
9 2013 11 1 554 600 - 6.00 826 827 - 1.00 DL 563 N912DE LGA ATL 126 762 6.00 0
10 2013 11 1 554 600 - 6.00 749 751 - 2.00 DL 731 N315NB LGA DTW 93.0 502 6.00 0
# ... with 55,393 more rows, and 1 more variable: time_hour <dttm>
> # filter data with departure and arrival delay < 2 hours
> str(flights)
Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
$ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ day : int 1 1 1 1 1 1 1 1 1 1 ...
$ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
$ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
$ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
$ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
$ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
$ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
$ carrier : chr "UA" "UA" "AA" "B6" ...
$ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
$ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
$ origin : chr "EWR" "LGA" "JFK" "JFK" ...
$ dest : chr "IAH" "IAH" "MIA" "BQN" ...
$ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
$ distance : num 1400 1416 1089 1576 762 ...
$ hour : num 5 5 5 5 6 5 6 6 6 6 ...
$ minute : num 15 29 40 45 0 58 0 0 0 0 ...
$ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
> flights %>% filter(dep_delay < 120 & arr_delay < 120) %>% select(arr_delay, dep_delay, everything())
# A tibble: 315,868 x 19
arr_delay dep_delay year month day dep_time sched_dep_time arr_time sched_arr_time carrier flight tailnum origin dest air_time distance hour minute
<dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 11.0 2.00 2013 1 1 517 515 830 819 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 20.0 4.00 2013 1 1 533 529 850 830 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 33.0 2.00 2013 1 1 542 540 923 850 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 -18.0 -1.00 2013 1 1 544 545 1004 1022 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 -25.0 -6.00 2013 1 1 554 600 812 837 DL 461 N668DN LGA ATL 116 762 6.00 0
6 12.0 -4.00 2013 1 1 554 558 740 728 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
7 19.0 -5.00 2013 1 1 555 600 913 854 B6 507 N516JB EWR FLL 158 1065 6.00 0
8 -14.0 -3.00 2013 1 1 557 600 709 723 EV 5708 N829AS LGA IAD 53.0 229 6.00 0
9 - 8.00 -3.00 2013 1 1 557 600 838 846 B6 79 N593JB JFK MCO 140 944 6.00 0
10 8.00 -2.00 2013 1 1 558 600 753 745 AA 301 N3ALAA LGA ORD 138 733 6.00 0
# ... with 315,858 more rows, and 1 more variable: time_hour <dttm>
> # show the makeup time
> flights %>% mutate(makeuptime = dep_delay - arr_delay) %>% select(makeuptime, everything())
# A tibble: 336,776 x 20
makeuptime year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
<dbl> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 - 9.00 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00
2 -16.0 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00
3 -31.0 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00
4 17.0 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00
5 19.0 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00
6 -16.0 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00
7 -24.0 2013 1 1 555 600 -5.00 913 854 19.0 B6 507 N516JB EWR FLL 158 1065 6.00
8 11.0 2013 1 1 557 600 -3.00 709 723 -14.0 EV 5708 N829AS LGA IAD 53.0 229 6.00
9 5.00 2013 1 1 557 600 -3.00 838 846 - 8.00 B6 79 N593JB JFK MCO 140 944 6.00
10 -10.0 2013 1 1 558 600 -2.00 753 745 8.00 AA 301 N3ALAA LGA ORD 138 733 6.00
# ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
>
> head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
> # for every species list the average sepal length
> iris %>% group_by(Species) %>% summarise(meanSepl = mean(Sepal.Length))
# A tibble: 3 x 2
Species meanSepl
<fct> <dbl>
1 setosa 5.01
2 versicolor 5.94
3 virginica 6.59
> head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> # list mean of mpg for every combination of gear and cylinder
> mtcars %>% group_by(gear, cyl) %>% summarise(mean1 = mean(mpg))
# A tibble: 8 x 3
# Groups: gear [?]
gear cyl mean1
<dbl> <dbl> <dbl>
1 3.00 4.00 21.5
2 3.00 6.00 19.8
3 3.00 8.00 15.0
4 4.00 4.00 26.9
5 4.00 6.00 19.8
6 5.00 4.00 28.2
7 5.00 6.00 19.7
8 5.00 8.00 15.4
> # add a new column that lists mpg + displacement
> mtcars %>% group_by(gear, cyl) %>% mutate(newcol = mpg + disp)
# A tibble: 32 x 12
# Groups: gear, cyl [8]
mpg cyl disp hp drat wt qsec vs am gear carb newcol
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21.0 6.00 160 110 3.90 2.62 16.5 0 1.00 4.00 4.00 181
2 21.0 6.00 160 110 3.90 2.88 17.0 0 1.00 4.00 4.00 181
3 22.8 4.00 108 93.0 3.85 2.32 18.6 1.00 1.00 4.00 1.00 131
4 21.4 6.00 258 110 3.08 3.22 19.4 1.00 0 3.00 1.00 279
5 18.7 8.00 360 175 3.15 3.44 17.0 0 0 3.00 2.00 379
6 18.1 6.00 225 105 2.76 3.46 20.2 1.00 0 3.00 1.00 243
7 14.3 8.00 360 245 3.21 3.57 15.8 0 0 3.00 4.00 374
8 24.4 4.00 147 62.0 3.69 3.19 20.0 1.00 0 4.00 2.00 171
9 22.8 4.00 141 95.0 3.92 3.15 22.9 1.00 0 4.00 2.00 164
10 19.2 6.00 168 123 3.92 3.44 18.3 1.00 0 4.00 4.00 187
# ... with 22 more rows
>