Datatypes
- A
vector
is always homogenous
- A
list
can be heterogenous
- A
matrix
is a list of vectors
- A
dataframe
is a list of lists.
stockprice <- 1900L
class(stockprice)
stocks <- c("HEG", "Graphite", "Bajaj", "L&T")
prices <- c(1900L, 650L, 500L, 1380L)
names(stocks) <- c("Carbon", "Carbon", "Auto", "Infra")
isAuto <- TRUE
isAuto <- FALSE
data = factor(names(stocks))
mat1 <- matrix(c(1:10), nrow=2)
mat1 <- matrix(c(1:40), nrow=4)
p <- c(6,8,10)
p1 <- list(6,8,10)
p2 <- list(c(6,8,10))
Vector of numeric values
p <- c(6,8,10)
> p
[1] 6 8 10
>
> for (i in 1:length(p)) {
+ print(p[[i]]*2)
+ }
[1] 12
[1] 16
[1] 20
>
List of vector of 3 elems
p1 <- list(6,8,10)
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> for (each in p1) {
+ print(each*2)
+ }
[1] 12
[1] 16
[1] 20
>
List of vector of one elem which is also a list
p2 <- list(c(6,8,10))
> p2
[[1]]
[1] 6 8 10
> for (i in 1:length(p2[[1]])) {
+ print(p2[[1]][i]*2)
+ }
[1] 12
[1] 16
[1] 20
>
lapply
> mul <- function(x){x*2}
> p
[1] 6 8 10
> lapply(p, mul)
[[1]]
[1] 12
[[2]]
[1] 16
[[3]]
[1] 20
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> lapply(p1, mul)
[[1]]
[1] 12
[[2]]
[1] 16
[[3]]
[1] 20
> unlist(lapply(p1, mul))
[1] 12 16 20
sapply
- Good for reporting.
- Needs high level of confidence in data structures used.
- Output can be vector or matrix or list
> mul <- function(x){x*2}
> p
[1] 6 8 10
> sapply(p, mul)
[1] 12 16 20
> p1
[[1]]
[1] 6
[[2]]
[1] 8
[[3]]
[1] 10
> sapply(p1, mul)
[1] 12 16 20
> p2
[[1]]
[1] 6 8 10
> sapply(p2, mul)
[,1]
[1,] 12
[2,] 16
[3,] 20
vapply
- Verify apply
- Verifies if the return type of the function matches with the last arg
> mul <- function(x){x*2}
> p
[1] 6 8 10
> vapply(p, mul, numeric(1))
[1] 12 16 20
> p
[1] 6 8 10
> vapply(p, mul, numeric(2))
Error in vapply(p, mul, numeric(2)) : values must be length 2,
but FUN(X[[1]]) result is length 1
> p
[1] 6 8 10
> vapply(p, mul, character(1))
Error in vapply(p, mul, character(1)) : values must be type 'character',
but FUN(X[[1]]) result is type 'double'
>
tapply
- split, apply, combine
- group_by and for each group do a task
- outputs an array
- tapply can have only one y variable, multiple x variables. i.e. mean sepal length for multiple species.
- aggregate can also be used for the same purpose
- aggregate gives data frame as output
- aggregate can have multiple x and multiple y variables. i.e. sales and profit number for multiple companies.
Exercise #1
- Q - From a given list of names print the min and max char in every name in a report format.
- The below also demonstrates the difference between lapply and sapply.
> names <- c("Ankur", "Ganesan", "Bhargavi", "pankaj", "Vivek", "Srikrishnan")
> firstlast <- function(x) {
+ s <- strsplit(x, "")[[1]]
+ return(c(first=min(s), last=max(s)))
+
+ }
> lapply(names, firstlast)
[[1]]
first last
"A" "u"
[[2]]
first last
"G" "s"
[[3]]
first last
"B" "v"
[[4]]
first last
"a" "p"
[[5]]
first last
"V" "v"
[[6]]
first last
"S" "s"
> class(lapply(names, firstlast))
[1] "list"
>
> sapply(names, firstlast)
Ankur Ganesan Bhargavi pankaj Vivek Srikrishnan
first "A" "G" "B" "a" "V" "S"
last "u" "s" "v" "p" "v" "s"
>
> class(sapply(names, firstlast))
[1] "matrix"
Exercise #2
- Q - from a given list of names print only the unique characters per name
> names <- c("ankur", "ganesan", "bhargavi", "pankaj", "vivek", "srikrishnan")
> uniquenames <- function(x) {
+ s <- strsplit(x, "")[[1]]
+ return(unique(s))
+ }
> sapply(names, uniquenames)
$ankur
[1] "a" "n" "k" "u" "r"
$ganesan
[1] "g" "a" "n" "e" "s"
$bhargavi
[1] "b" "h" "a" "r" "g" "v" "i"
$pankaj
[1] "p" "a" "n" "k" "j"
$vivek
[1] "v" "i" "e" "k"
$srikrishnan
[1] "s" "r" "i" "k" "h" "n" "a"
>
Tidy the data
- We use
reshape2
and tidyr
libraries here.
reshape2
provides us the melt
method that will help us convert data from report format to a more program friendly format.
tidyr
provides us with the gather
API, this does exactly the same thing as melt
- In the below example we hav religions as rows and salary ranges as columns.
- To convert it to tidy data we need to stack religion:salary_range as rows.
- Tidy data allows for easy operation on the dataset - like groupby, filter etc.
> setwd("/Users/vvb/work/projects/datascience")
> getwd()
[1] "/Users/vvb/work/projects/datascience"
> rawdata <- read.csv("pew.csv", check.names = F, encoding = "UTF-8")
> head(rawdata)
religion <$10k $10-20k $20-30k $30-40k $40-50k $50-75k $75-100k $100-150k >150k Don't know/refused
1 Agnostic 27 34 60 81 76 137 122 109 84 96
2 Atheist 12 27 37 52 35 70 73 59 74 76
3 Buddhist 27 21 30 34 33 58 62 39 53 54
4 Catholic 418 617 732 670 638 1116 949 792 633 1489
5 Don<U+2019>t know/refused 15 14 15 11 10 35 21 17 18 116
6 Evangelical Prot 575 869 1064 982 881 1486 949 723 414 1529
> library(reshape2)
> library(tidyr)
> tidydata <- rawdata %>%
+ melt(id= "religion",
+ variable.name = "salary",
+ value.name = "count")
> head(tidydata)
religion salary count
1 Agnostic <$10k 27
2 Atheist <$10k 12
3 Buddhist <$10k 27
4 Catholic <$10k 418
5 Don<U+2019>t know/refused <$10k 15
6 Evangelical Prot <$10k 575
> tidydata2 <- rawdata %>%
+ gather(2:11,
+ key = "salary",
+ value = "count")
> head(tidydata2)
religion salary count
1 Agnostic <$10k 27
2 Atheist <$10k 12
3 Buddhist <$10k 27
4 Catholic <$10k 418
5 Don<U+2019>t know/refused <$10k 15
6 Evangelical Prot <$10k 575
>
- Below is the min and max temperature data for various (year, month, day) pairs.
- We need to transform this to a
year:month:day
as row format.
- Note that id can be a list of columns -
c("year", "month", "element")
where element is temp_max/temp_min
> rawdata1 <- read.delim("weather.txt", check.names = F, na.strings = '.')
> head(rawdata1)
year month element 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
1 2010 1 tmax NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 278 NA
2 2010 1 tmin NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA 145 NA
3 2010 2 tmax NA 273 241 NA NA NA NA NA NA NA 297 NA NA NA NA NA NA NA NA NA NA NA 299 NA NA NA NA NA NA NA NA
4 2010 2 tmin NA 144 144 NA NA NA NA NA NA NA 134 NA NA NA NA NA NA NA NA NA NA NA 107 NA NA NA NA NA NA NA NA
5 2010 3 tmax NA NA NA NA 321 NA NA NA NA 345 NA NA NA NA NA 311 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
6 2010 3 tmin NA NA NA NA 142 NA NA NA NA 168 NA NA NA NA NA 176 NA NA NA NA NA NA NA NA NA NA NA NA NA NA NA
> str(rawdata1)
'data.frame': 22 obs. of 34 variables:
$ year : int 2010 2010 2010 2010 2010 2010 2010 2010 2010 2010 ...
$ month : int 1 1 2 2 3 3 4 4 5 5 ...
$ element: Factor w/ 2 levels "tmax","tmin": 1 2 1 2 1 2 1 2 1 2 ...
$ 1 : int NA NA NA NA NA NA NA NA NA NA ...
$ 2 : int NA NA 273 144 NA NA NA NA NA NA ...
$ 3 : int NA NA 241 144 NA NA NA NA NA NA ...
$ 4 : int NA NA NA NA NA NA NA NA NA NA ...
$ 5 : int NA NA NA NA 321 142 NA NA NA NA ...
$ 6 : int NA NA NA NA NA NA NA NA NA NA ...
$ 7 : int NA NA NA NA NA NA NA NA NA NA ...
$ 8 : int NA NA NA NA NA NA NA NA NA NA ...
$ 9 : logi NA NA NA NA NA NA ...
$ 10 : int NA NA NA NA 345 168 NA NA NA NA ...
$ 11 : int NA NA 297 134 NA NA NA NA NA NA ...
$ 12 : logi NA NA NA NA NA NA ...
$ 13 : int NA NA NA NA NA NA NA NA NA NA ...
$ 14 : int NA NA NA NA NA NA NA NA NA NA ...
$ 15 : int NA NA NA NA NA NA NA NA NA NA ...
$ 16 : int NA NA NA NA 311 176 NA NA NA NA ...
$ 17 : int NA NA NA NA NA NA NA NA NA NA ...
$ 18 : logi NA NA NA NA NA NA ...
$ 19 : logi NA NA NA NA NA NA ...
$ 20 : logi NA NA NA NA NA NA ...
$ 21 : logi NA NA NA NA NA NA ...
$ 22 : logi NA NA NA NA NA NA ...
$ 23 : int NA NA 299 107 NA NA NA NA NA NA ...
$ 24 : logi NA NA NA NA NA NA ...
$ 25 : int NA NA NA NA NA NA NA NA NA NA ...
$ 26 : int NA NA NA NA NA NA NA NA NA NA ...
$ 27 : int NA NA NA NA NA NA 363 167 332 182 ...
$ 28 : int NA NA NA NA NA NA NA NA NA NA ...
$ 29 : int NA NA NA NA NA NA NA NA NA NA ...
$ 30 : int 278 145 NA NA NA NA NA NA NA NA ...
$ 31 : int NA NA NA NA NA NA NA NA NA NA ...
> tidydata3 <- rawdata1 %>% melt(id = c("year", "month", "element"),
+ variable.name = "day",
+ value.name = "temp")
> head(tidydata3)
year month element day temp
1 2010 1 tmax 1 NA
2 2010 1 tmin 1 NA
3 2010 2 tmax 1 NA
4 2010 2 tmin 1 NA
5 2010 3 tmax 1 NA
6 2010 3 tmin 1 NA
> tidydata4 <- rawdata1 %>% melt(id = c("year", "month", "element"),
+ variable.name = "day",
+ value.name = "temp",
+ na.rm = TRUE)
> head(tidydata4)
year month element day temp
21 2010 12 tmax 1 299
22 2010 12 tmin 1 138
25 2010 2 tmax 2 273
26 2010 2 tmin 2 144
41 2010 11 tmax 2 313
42 2010 11 tmin 2 163
> tidydata5 <- rawdata1 %>% gather(4:34,
+ key = "day",
+ value = "temp")
> head(tidydata5)
year month element day temp
1 2010 1 tmax 1 NA
2 2010 1 tmin 1 NA
3 2010 2 tmax 1 NA
4 2010 2 tmin 1 NA
5 2010 3 tmax 1 NA
6 2010 3 tmin 1 NA
> tidydata6 <- rawdata1 %>% gather(4:34,
+ key = "day",
+ value = "temp",
+ na.rm = TRUE)
> head(tidydata6)
year month element day temp
21 2010 12 tmax 1 299
22 2010 12 tmin 1 138
25 2010 2 tmax 2 273
26 2010 2 tmin 2 144
41 2010 11 tmax 2 313
42 2010 11 tmin 2 163
>
dcast and spread
> # transform the data to a format that dcast can use
> tidydata7 <- tidydata6[,c("year", "month", "day", "element", "temp")]
> head(tidydata7)
year month day element temp
21 2010 12 1 tmax 299
22 2010 12 1 tmin 138
25 2010 2 2 tmax 273
26 2010 2 2 tmin 144
41 2010 11 2 tmax 313
42 2010 11 2 tmin 163
> # use element values as column names, pick the values from "temp"
> tidydata8 <- dcast(year + month + day ~ element, data = tidydata7, value.var = "temp")
> head(tidydata8)
year month day tmax tmin
1 2010 1 30 278 145
2 2010 2 11 297 134
3 2010 2 2 273 144
4 2010 2 23 299 107
5 2010 2 3 241 144
6 2010 3 10 345 168
> # do the same using spread - spread element as columns and use values from temp
> tidydata9 <- spread(tidydata6, element, temp)
> head(tidydata9)
year month day tmax tmin
1 2010 1 30 278 145
2 2010 2 11 297 134
3 2010 2 2 273 144
4 2010 2 23 299 107
5 2010 2 3 241 144
6 2010 3 10 345 168
> # adding a new column which shows difference in temperature
> tidydata8$tdiff <- tidydata8$tmax - tidydata8$tmin
> head(tidydata8)
year month day tmax tmin tdiff
1 2010 1 30 278 145 133
2 2010 2 11 297 134 163
3 2010 2 2 273 144 129
4 2010 2 23 299 107 192
5 2010 2 3 241 144 97
6 2010 3 10 345 168 177
>
>
Analyze the titanic shipwreck data to answer the following,
- For each group of Class+Age+Gender what is the survival rate?
- To answer this question, we need to transform the given data
> rawdata2 <- read.csv("shipwreck.csv", check.names = F, encoding = "UTF-8")
> rawdata2
class age fate male female
1 1st adult perished 118 4
2 1st adult survived 57 140
3 1st child perished 0 0
4 1st child survived 5 1
5 2nd adult perished 154 13
6 2nd adult survived 14 80
7 2nd child perished 0 0
8 2nd child survived 11 13
9 3rd adult perished 387 89
10 3rd adult survived 75 76
11 3rd child perished 35 17
12 3rd child survived 13 14
13 Crew adult perished 670 3
14 Crew adult survived 192 20
15 Crew child perished 0 0
16 Crew child survived 0 0
> # Q - For each group of Class+Age+Gender what is the survival rate?
> tidydata10 <- rawdata2 %>% gather(4:5,
+ key = "gender",
+ value = "count")
> tidydata10
class age fate gender count
1 1st adult perished male 118
2 1st adult survived male 57
3 1st child perished male 0
4 1st child survived male 5
5 2nd adult perished male 154
6 2nd adult survived male 14
7 2nd child perished male 0
8 2nd child survived male 11
9 3rd adult perished male 387
10 3rd adult survived male 75
11 3rd child perished male 35
12 3rd child survived male 13
13 Crew adult perished male 670
14 Crew adult survived male 192
15 Crew child perished male 0
16 Crew child survived male 0
17 1st adult perished female 4
18 1st adult survived female 140
19 1st child perished female 0
20 1st child survived female 1
21 2nd adult perished female 13
22 2nd adult survived female 80
23 2nd child perished female 0
24 2nd child survived female 13
25 3rd adult perished female 89
26 3rd adult survived female 76
27 3rd child perished female 17
28 3rd child survived female 14
29 Crew adult perished female 3
30 Crew adult survived female 20
31 Crew child perished female 0
32 Crew child survived female 0
> tidydata10 <- tidydata10 %>% spread(fate, count)
> tidydata10
class age gender perished survived
1 1st adult female 4 140
2 1st adult male 118 57
3 1st child female 0 1
4 1st child male 0 5
5 2nd adult female 13 80
6 2nd adult male 154 14
7 2nd child female 0 13
8 2nd child male 0 11
9 3rd adult female 89 76
10 3rd adult male 387 75
11 3rd child female 17 14
12 3rd child male 35 13
13 Crew adult female 3 20
14 Crew adult male 670 192
15 Crew child female 0 0
16 Crew child male 0 0
>
deplyr
- filter
- arrange
- mutate
- summarize
- select
- group_by
> # filter data for Jan
> head(flights)
# A tibble: 6 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00 0
6 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
# ... with 1 more variable: time_hour <dttm>
> flights %>% filter(month==1)
# A tibble: 27,004 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00 0
6 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
7 2013 1 1 555 600 -5.00 913 854 19.0 B6 507 N516JB EWR FLL 158 1065 6.00 0
8 2013 1 1 557 600 -3.00 709 723 -14.0 EV 5708 N829AS LGA IAD 53.0 229 6.00 0
9 2013 1 1 557 600 -3.00 838 846 - 8.00 B6 79 N593JB JFK MCO 140 944 6.00 0
10 2013 1 1 558 600 -2.00 753 745 8.00 AA 301 N3ALAA LGA ORD 138 733 6.00 0
# ... with 26,994 more rows, and 1 more variable: time_hour <dttm>
> # filter data for Nov and Dec
> flights %>% filter(month==11 | month == 12)
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 11 1 5 2359 6.00 352 345 7.00 B6 745 N568JB JFK PSE 205 1617 23.0 59.0
2 2013 11 1 35 2250 105 123 2356 87.0 B6 1816 N353JB JFK SYR 36.0 209 22.0 50.0
3 2013 11 1 455 500 - 5.00 641 651 -10.0 US 1895 N192UW EWR CLT 88.0 529 5.00 0
4 2013 11 1 539 545 - 6.00 856 827 29.0 UA 1714 N38727 LGA IAH 229 1416 5.00 45.0
5 2013 11 1 542 545 - 3.00 831 855 -24.0 AA 2243 N5CLAA JFK MIA 147 1089 5.00 45.0
6 2013 11 1 549 600 - 11.0 912 923 -11.0 UA 303 N595UA JFK SFO 359 2586 6.00 0
7 2013 11 1 550 600 - 10.0 705 659 6.00 US 2167 N748UW LGA DCA 57.0 214 6.00 0
8 2013 11 1 554 600 - 6.00 659 701 - 2.00 US 2134 N742PS LGA BOS 40.0 184 6.00 0
9 2013 11 1 554 600 - 6.00 826 827 - 1.00 DL 563 N912DE LGA ATL 126 762 6.00 0
10 2013 11 1 554 600 - 6.00 749 751 - 2.00 DL 731 N315NB LGA DTW 93.0 502 6.00 0
# ... with 55,393 more rows, and 1 more variable: time_hour <dttm>
> flights %>% filter(month %in% c(11,12))
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 11 1 5 2359 6.00 352 345 7.00 B6 745 N568JB JFK PSE 205 1617 23.0 59.0
2 2013 11 1 35 2250 105 123 2356 87.0 B6 1816 N353JB JFK SYR 36.0 209 22.0 50.0
3 2013 11 1 455 500 - 5.00 641 651 -10.0 US 1895 N192UW EWR CLT 88.0 529 5.00 0
4 2013 11 1 539 545 - 6.00 856 827 29.0 UA 1714 N38727 LGA IAH 229 1416 5.00 45.0
5 2013 11 1 542 545 - 3.00 831 855 -24.0 AA 2243 N5CLAA JFK MIA 147 1089 5.00 45.0
6 2013 11 1 549 600 - 11.0 912 923 -11.0 UA 303 N595UA JFK SFO 359 2586 6.00 0
7 2013 11 1 550 600 - 10.0 705 659 6.00 US 2167 N748UW LGA DCA 57.0 214 6.00 0
8 2013 11 1 554 600 - 6.00 659 701 - 2.00 US 2134 N742PS LGA BOS 40.0 184 6.00 0
9 2013 11 1 554 600 - 6.00 826 827 - 1.00 DL 563 N912DE LGA ATL 126 762 6.00 0
10 2013 11 1 554 600 - 6.00 749 751 - 2.00 DL 731 N315NB LGA DTW 93.0 502 6.00 0
# ... with 55,393 more rows, and 1 more variable: time_hour <dttm>
> # filter data with departure and arrival delay < 2 hours
> str(flights)
Classes 'tbl_df', 'tbl' and 'data.frame': 336776 obs. of 19 variables:
$ year : int 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
$ month : int 1 1 1 1 1 1 1 1 1 1 ...
$ day : int 1 1 1 1 1 1 1 1 1 1 ...
$ dep_time : int 517 533 542 544 554 554 555 557 557 558 ...
$ sched_dep_time: int 515 529 540 545 600 558 600 600 600 600 ...
$ dep_delay : num 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
$ arr_time : int 830 850 923 1004 812 740 913 709 838 753 ...
$ sched_arr_time: int 819 830 850 1022 837 728 854 723 846 745 ...
$ arr_delay : num 11 20 33 -18 -25 12 19 -14 -8 8 ...
$ carrier : chr "UA" "UA" "AA" "B6" ...
$ flight : int 1545 1714 1141 725 461 1696 507 5708 79 301 ...
$ tailnum : chr "N14228" "N24211" "N619AA" "N804JB" ...
$ origin : chr "EWR" "LGA" "JFK" "JFK" ...
$ dest : chr "IAH" "IAH" "MIA" "BQN" ...
$ air_time : num 227 227 160 183 116 150 158 53 140 138 ...
$ distance : num 1400 1416 1089 1576 762 ...
$ hour : num 5 5 5 5 6 5 6 6 6 6 ...
$ minute : num 15 29 40 45 0 58 0 0 0 0 ...
$ time_hour : POSIXct, format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
> flights %>% filter(dep_delay < 120 & arr_delay < 120) %>% select(arr_delay, dep_delay, everything())
# A tibble: 315,868 x 19
arr_delay dep_delay year month day dep_time sched_dep_time arr_time sched_arr_time carrier flight tailnum origin dest air_time distance hour minute
<dbl> <dbl> <int> <int> <int> <int> <int> <int> <int> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 11.0 2.00 2013 1 1 517 515 830 819 UA 1545 N14228 EWR IAH 227 1400 5.00 15.0
2 20.0 4.00 2013 1 1 533 529 850 830 UA 1714 N24211 LGA IAH 227 1416 5.00 29.0
3 33.0 2.00 2013 1 1 542 540 923 850 AA 1141 N619AA JFK MIA 160 1089 5.00 40.0
4 -18.0 -1.00 2013 1 1 544 545 1004 1022 B6 725 N804JB JFK BQN 183 1576 5.00 45.0
5 -25.0 -6.00 2013 1 1 554 600 812 837 DL 461 N668DN LGA ATL 116 762 6.00 0
6 12.0 -4.00 2013 1 1 554 558 740 728 UA 1696 N39463 EWR ORD 150 719 5.00 58.0
7 19.0 -5.00 2013 1 1 555 600 913 854 B6 507 N516JB EWR FLL 158 1065 6.00 0
8 -14.0 -3.00 2013 1 1 557 600 709 723 EV 5708 N829AS LGA IAD 53.0 229 6.00 0
9 - 8.00 -3.00 2013 1 1 557 600 838 846 B6 79 N593JB JFK MCO 140 944 6.00 0
10 8.00 -2.00 2013 1 1 558 600 753 745 AA 301 N3ALAA LGA ORD 138 733 6.00 0
# ... with 315,858 more rows, and 1 more variable: time_hour <dttm>
> # show the makeup time
> flights %>% mutate(makeuptime = dep_delay - arr_delay) %>% select(makeuptime, everything())
# A tibble: 336,776 x 20
makeuptime year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour
<dbl> <int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl> <dbl> <dbl>
1 - 9.00 2013 1 1 517 515 2.00 830 819 11.0 UA 1545 N14228 EWR IAH 227 1400 5.00
2 -16.0 2013 1 1 533 529 4.00 850 830 20.0 UA 1714 N24211 LGA IAH 227 1416 5.00
3 -31.0 2013 1 1 542 540 2.00 923 850 33.0 AA 1141 N619AA JFK MIA 160 1089 5.00
4 17.0 2013 1 1 544 545 -1.00 1004 1022 -18.0 B6 725 N804JB JFK BQN 183 1576 5.00
5 19.0 2013 1 1 554 600 -6.00 812 837 -25.0 DL 461 N668DN LGA ATL 116 762 6.00
6 -16.0 2013 1 1 554 558 -4.00 740 728 12.0 UA 1696 N39463 EWR ORD 150 719 5.00
7 -24.0 2013 1 1 555 600 -5.00 913 854 19.0 B6 507 N516JB EWR FLL 158 1065 6.00
8 11.0 2013 1 1 557 600 -3.00 709 723 -14.0 EV 5708 N829AS LGA IAD 53.0 229 6.00
9 5.00 2013 1 1 557 600 -3.00 838 846 - 8.00 B6 79 N593JB JFK MCO 140 944 6.00
10 -10.0 2013 1 1 558 600 -2.00 753 745 8.00 AA 301 N3ALAA LGA ORD 138 733 6.00
# ... with 336,766 more rows, and 2 more variables: minute <dbl>, time_hour <dttm>
>
> head(iris)
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
6 5.4 3.9 1.7 0.4 setosa
> # for every species list the average sepal length
> iris %>% group_by(Species) %>% summarise(meanSepl = mean(Sepal.Length))
# A tibble: 3 x 2
Species meanSepl
<fct> <dbl>
1 setosa 5.01
2 versicolor 5.94
3 virginica 6.59
> head(mtcars)
mpg cyl disp hp drat wt qsec vs am gear carb
Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
> # list mean of mpg for every combination of gear and cylinder
> mtcars %>% group_by(gear, cyl) %>% summarise(mean1 = mean(mpg))
# A tibble: 8 x 3
# Groups: gear [?]
gear cyl mean1
<dbl> <dbl> <dbl>
1 3.00 4.00 21.5
2 3.00 6.00 19.8
3 3.00 8.00 15.0
4 4.00 4.00 26.9
5 4.00 6.00 19.8
6 5.00 4.00 28.2
7 5.00 6.00 19.7
8 5.00 8.00 15.4
> # add a new column that lists mpg + displacement
> mtcars %>% group_by(gear, cyl) %>% mutate(newcol = mpg + disp)
# A tibble: 32 x 12
# Groups: gear, cyl [8]
mpg cyl disp hp drat wt qsec vs am gear carb newcol
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 21.0 6.00 160 110 3.90 2.62 16.5 0 1.00 4.00 4.00 181
2 21.0 6.00 160 110 3.90 2.88 17.0 0 1.00 4.00 4.00 181
3 22.8 4.00 108 93.0 3.85 2.32 18.6 1.00 1.00 4.00 1.00 131
4 21.4 6.00 258 110 3.08 3.22 19.4 1.00 0 3.00 1.00 279
5 18.7 8.00 360 175 3.15 3.44 17.0 0 0 3.00 2.00 379
6 18.1 6.00 225 105 2.76 3.46 20.2 1.00 0 3.00 1.00 243
7 14.3 8.00 360 245 3.21 3.57 15.8 0 0 3.00 4.00 374
8 24.4 4.00 147 62.0 3.69 3.19 20.0 1.00 0 4.00 2.00 171
9 22.8 4.00 141 95.0 3.92 3.15 22.9 1.00 0 4.00 2.00 164
10 19.2 6.00 168 123 3.92 3.44 18.3 1.00 0 4.00 4.00 187
# ... with 22 more rows
>