# Two scalars used for the basic arithmetic examples
x <- 8
y <- 2

# Sum
x + y
## [1] 10

# Difference
x - y
## [1] 6

# Product
x * y
## [1] 16

# Remainder of x divided by y (modulo)
x %% y
## [1] 0
# Integers from 1 to 999
x <- seq_len(999)

# Logical mask: TRUE where the element of x is divisible by 2
even_numbers <- (x %% 2) == 0
even_numbers

# Logical mask: TRUE where the element of x is not divisible by 2
odd_numbers <- !even_numbers
odd_numbers
# Both objects are logical vectors; x[even_numbers] and x[odd_numbers]
# return the numbers themselves.
# Pull the Species column out of the iris data frame
iris_species <- iris[["Species"]]

# Inspect its structure
str(iris_species)
## Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...

# alternative: ask directly whether it is a factor
is.factor(iris_species)
## [1] TRUE
# alternative: abbreviated type label ("fct" for a factor)
# NOTE(review): type_sum() is not base R — presumably provided by
# tibble/pillar; confirm the package is attached before this line runs
type_sum(iris_species)
## [1] "fct"
# Number of elements in the factor
length(iris_species)
## [1] 150
# The distinct level labels of the factor
levels(iris_species)
## [1] "setosa" "versicolor" "virginica"
Using the mtcars dataset, extract the number of cylinders, the horsepower, and the car names, and put them in a list named cars.
# Preview the first six rows of the data
head(mtcars, n = 6)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
# Columns of interest: cyl holds the cylinder count and hp the horsepower;
# the car names are stored as the row names of the data frame
cylinders <- mtcars[["cyl"]]
horsepower <- mtcars[["hp"]]
carnames <- row.names(mtcars)
# Bundle the three vectors into a named list
cars <- list(
  cylinders = cylinders,
  HP = horsepower,
  Car_Names = carnames
)
# Preview the first six rows of the rock data
head(rock, n = 6)
## area peri shape perm
## 1 4990 2791.90 0.0903296 6.3
## 2 7002 3892.60 0.1486220 6.3
## 3 7558 3930.66 0.1833120 6.3
## 4 7352 3869.32 0.1170630 6.3
## 5 7943 3948.54 0.1224170 17.1
## 6 7979 4010.15 0.1670450 17.1
# Keep only the area and shape measurements
rock_area <- rock[["area"]]
rock_shape <- rock[["shape"]]
rock_list <- list(Area = rock_area, Shape = rock_shape)

# Nest the cars list and the rock list inside a single list
rock_cars <- list(Cars = cars, Rocks = rock_list)

# Show the nested structure
str(rock_cars)
## List of 2
## $ Cars :List of 3
## ..$ cylinders: num [1:32] 6 6 4 6 8 6 8 4 4 6 ...
## ..$ HP : num [1:32] 110 110 93 110 175 105 245 62 95 123 ...
## ..$ Car_Names: chr [1:32] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710" "Hornet 4 Drive" ...
## $ Rocks:List of 2
## ..$ Area : int [1:48] 4990 7002 7558 7352 7943 7979 9333 8209 8393 6425 ...
## ..$ Shape: num [1:48] 0.0903 0.1486 0.1833 0.1171 0.1224 ...
# Pull the car names out of the nested list by name, using `$` at both levels
rock_cars$Cars$Car_Names
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
# alternative: double-bracket extraction by element name
rock_cars[["Cars"]][["Car_Names"]]
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
# alternative: double-bracket extraction by position
# (first element of rock_cars, then third element of that list)
rock_cars[[1L]][[3L]]
## [1] "Mazda RX4" "Mazda RX4 Wag" "Datsun 710"
## [4] "Hornet 4 Drive" "Hornet Sportabout" "Valiant"
## [7] "Duster 360" "Merc 240D" "Merc 230"
## [10] "Merc 280" "Merc 280C" "Merc 450SE"
## [13] "Merc 450SL" "Merc 450SLC" "Cadillac Fleetwood"
## [16] "Lincoln Continental" "Chrysler Imperial" "Fiat 128"
## [19] "Honda Civic" "Toyota Corolla" "Toyota Corona"
## [22] "Dodge Challenger" "AMC Javelin" "Camaro Z28"
## [25] "Pontiac Firebird" "Fiat X1-9" "Porsche 914-2"
## [28] "Lotus Europa" "Ford Pantera L" "Ferrari Dino"
## [31] "Maserati Bora" "Volvo 142E"
For this exercise, we’re going to be using the fruit data from the stringr package
# Flag every fruit whose name begins with "b", then subset with the mask
b_fruits <- str_detect(string = fruit, pattern = "^b")
fruit[b_fruits]
## [1] "banana" "bell pepper" "bilberry" "blackberry"
## [5] "blackcurrant" "blood orange" "blueberry" "boysenberry"
## [9] "breadfruit"
# Flag every fruit whose name begins with a lowercase vowel, then subset
vowel_fruit <- str_detect(string = fruit, pattern = "^[aeiou]")
fruit[vowel_fruit]
## [1] "apple" "apricot" "avocado" "eggplant" "elderberry"
## [6] "olive" "orange" "ugli fruit"
For this exercise, we’re using the sentences data.
# Number of sentences in the data
length(sentences)
## [1] 720

# Number of characters in the first sentence
str_length(sentences[1])
## [1] 42

# Length of every sentence, computed once and reused below
sentence_lengths <- str_length(sentences)

# Find maximum value for all sentence lengths
max_length <- max(sentence_lengths)
max_length
## [1] 57

# Flag the sentence(s) with that length; comparing against max_length rather
# than a hard-coded 57 keeps the mask correct if the data ever changes
long_sentence <- sentence_lengths == max_length

# Getting the sentence
sentences[long_sentence]
## [1] "The bills were mailed promptly on the tenth of the month."
# Compute the index order that sorts the sentences alphabetically; the locale
# is set explicitly to English, although not necessary in this case
alphabetical <- str_order(sentences, locale = "en")
# Apply that ordering to the sentences
sentences[alphabetical]
For these exercises, we’re going to use the forcats::gss_cat dataset. It’s a sample dataset from the General Social Survey, i.e. a US-based survey run by NORC, an independent organisation at the University of Chicago
Have a look at the dataset
# Using race variable: list the levels defined on the factor
levels(gss_cat[["race"]])
## [1] "Other" "Black" "White" "Not applicable"
# Alternative: tabulate the values that occur in the data
gss_cat %>%
  count(race)
## # A tibble: 3 x 2
## race n
## <fct> <int>
## 1 Other 1959
## 2 Black 3129
## 3 White 16395
# Tabulate the relig variable
gss_cat %>%
  count(relig)
## # A tibble: 15 x 2
## relig n
## <fct> <int>
## 1 No answer 93
## 2 Don't know 15
## 3 Inter-nondenominational 109
## 4 Native american 23
## 5 Christian 689
## 6 Orthodox-christian 95
## 7 Moslem/islam 104
## 8 Other eastern 32
## 9 Hinduism 71
## 10 Buddhism 147
## 11 Other 224
## 12 None 3523
## 13 Jewish 388
## 14 Catholic 5124
## 15 Protestant 10846
# alternative: summary() on the factor reports a count for every level,
# including levels with zero observations
summary(gss_cat[["relig"]])
## No answer Don't know Inter-nondenominational
## 93 15 109
## Native american Christian Orthodox-christian
## 23 689 95
## Moslem/islam Other eastern Hinduism
## 104 32 71
## Buddhism Other None
## 147 224 3523
## Jewish Catholic Protestant
## 388 5124 10846
## Not applicable
## 0
Using the nycflights13::flights dataset, do the following:
# Keep flights whose arr_delay is at least 2
# NOTE(review): arr_delay appears to be recorded in minutes; if the exercise
# asks for a delay of two or more hours the cutoff should be 120 — confirm
nycflights13::flights %>%
  filter(arr_delay >= 2)
## # A tibble: 127,929 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 554 558 -4 740
## 5 2013 1 1 555 600 -5 913
## 6 2013 1 1 558 600 -2 753
## 7 2013 1 1 558 600 -2 924
## 8 2013 1 1 559 600 -1 941
## 9 2013 1 1 600 600 0 837
## 10 2013 1 1 602 605 -3 821
## # … with 127,919 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Keep flights from months 7, 8 and 9 (July to September)
nycflights13::flights %>%
  filter(month %in% 7:9)
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # … with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Keep flights from months 7 through 9, expressed as a range with between()
nycflights13::flights %>%
  filter(between(month, 7, 9))
## # A tibble: 86,326 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 7 1 1 2029 212 236
## 2 2013 7 1 2 2359 3 344
## 3 2013 7 1 29 2245 104 151
## 4 2013 7 1 43 2130 193 322
## 5 2013 7 1 44 2150 174 300
## 6 2013 7 1 46 2051 235 304
## 7 2013 7 1 48 2001 287 308
## 8 2013 7 1 58 2155 183 335
## 9 2013 7 1 100 2146 194 327
## 10 2013 7 1 100 2245 135 337
## # … with 86,316 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sort flights by departure delay, smallest value first
nycflights13::flights %>%
  arrange(dep_delay)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 12 7 2040 2123 -43 40
## 2 2013 2 3 2022 2055 -33 2240
## 3 2013 11 10 1408 1440 -32 1549
## 4 2013 1 11 1900 1930 -30 2233
## 5 2013 1 29 1703 1730 -27 1947
## 6 2013 8 9 729 755 -26 1002
## 7 2013 10 23 1907 1932 -25 2143
## 8 2013 3 30 2030 2055 -25 2213
## 9 2013 3 2 1431 1455 -24 1601
## 10 2013 5 5 934 958 -24 1225
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sort flights by air time, largest value first
nycflights13::flights %>%
  arrange(desc(air_time))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 3 17 1337 1335 2 1937
## 2 2013 2 6 853 900 -7 1542
## 3 2013 3 15 1001 1000 1 1551
## 4 2013 3 17 1006 1000 6 1607
## 5 2013 3 16 1001 1000 1 1544
## 6 2013 2 5 900 900 0 1555
## 7 2013 11 12 936 930 6 1630
## 8 2013 3 14 958 1000 -2 1542
## 9 2013 11 20 1006 1000 6 1639
## 10 2013 3 15 1342 1335 7 1924
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Sort flights by air time, smallest value first
nycflights13::flights %>%
  arrange(air_time)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 16 1355 1315 40 1442
## 2 2013 4 13 537 527 10 622
## 3 2013 12 6 922 851 31 1021
## 4 2013 2 3 2153 2129 24 2247
## 5 2013 2 5 1303 1315 -12 1342
## 6 2013 2 12 2123 2130 -7 2211
## 7 2013 3 2 1450 1500 -10 1547
## 8 2013 3 8 2026 1935 51 2131
## 9 2013 3 18 1456 1329 87 1533
## 10 2013 3 19 2226 2145 41 2305
## # … with 336,766 more rows, and 12 more variables: sched_arr_time <int>,
## # arr_delay <dbl>, carrier <chr>, flight <int>, tailnum <chr>,
## # origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
## # minute <dbl>, time_hour <dttm>
# Keep only the departure/arrival time and delay columns, listed by name
nycflights13::flights %>%
  select(
    dep_time,
    dep_delay,
    arr_time,
    arr_delay
  )
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
# Alternative: pick the columns by name prefix instead of listing them
nycflights13::flights %>%
  select(starts_with("dep"), starts_with("arr"))
## # A tibble: 336,776 x 4
## dep_time dep_delay arr_time arr_delay
## <int> <dbl> <int> <dbl>
## 1 517 2 830 11
## 2 533 4 850 20
## 3 542 2 923 33
## 4 544 -1 1004 -18
## 5 554 -6 812 -25
## 6 554 -4 740 12
## 7 555 -5 913 19
## 8 557 -3 709 -14
## 9 557 -3 838 -8
## 10 558 -2 753 8
## # … with 336,766 more rows
# Select every column whose name contains "TIME".
# The pattern is written with plain ASCII quotes: typographic quotes are not
# valid R string delimiters and make the line fail to parse.
# The dataset is referenced as nycflights13::flights, like the other examples,
# so the call does not depend on the package being attached.
# contains() ignores case by default, so the upper-case pattern still matches
# lower-case column names such as dep_time.
nycflights13::flights %>%
  select(contains("TIME"))