Data manipulations using the dplyr package

1

First we need to load the dplyr package (done with the library() calls below).
There are 150 observations and 5 variables in the iris data set.

We can load the data with data(iris) and confirm this with dim(iris) or str(iris).
library(dplyr)
library(tidyverse)

2

# Keep only the virginica and versicolor rows whose Sepal.Length is above 6
# and whose Sepal.Width is above 2.5 (filter() combines its comma-separated
# conditions with AND).
# Using the code below I found that iris1 has 56 observations with 5 variables.
iris1 <- iris %>%
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  )

# dim() returns c(rows, columns), which is what the question asks for.
# (rep() only replicates its argument, so it printed every column as a list
# instead of reporting the size.)
dim(iris1)
## [1] 56  5

3

# Keep only the Species, Sepal.Length and Sepal.Width columns of iris1.
# Using the code below I found that iris2 has 56 observations with 3 variables.
iris2 <- iris1 %>%
  select(Species, Sepal.Length, Sepal.Width)

# dim() returns c(rows, columns); rep() would only dump the columns as a list.
dim(iris2)
## [1] 56  3

4

# Create an iris3 data frame from iris2 that orders the observations from largest to smallest sepal length. Show the first 6 rows of this data set.

# Sort iris2 by Sepal.Length, largest values first.
iris3 <- iris2 %>%
  arrange(desc(Sepal.Length))

# Show the first 6 rows (6 is head()'s default; spelled out for clarity).
head(iris3, n = 6)
##     Species Sepal.Length Sepal.Width
## 1 virginica          7.9         3.8
## 2 virginica          7.7         3.8
## 3 virginica          7.7         2.6
## 4 virginica          7.7         2.8
## 5 virginica          7.7         3.0
## 6 virginica          7.6         3.0

5

# Create an iris4 data frame from iris3 that creates a column with a sepal area (length * width) value for each observation. How many observations and variables are in the data set?

#There are 56 observations and 4 variables in the data set.

# Add a Sepal.area column (Sepal.Length * Sepal.Width) to every row of iris3.
iris4 <- iris3 %>%
  mutate(Sepal.area = Sepal.Length * Sepal.Width)

# dim() returns c(rows, columns): 56 observations and 4 variables.
# (rep() only replicates its argument and printed the columns as a list.)
dim(iris4)
## [1] 56  4

6

# Create iris5 that calculates the average sepal length, the average sepal width, and the sample size of the entire iris4 data frame and print iris5.

# Collapse all of iris4 into a single row holding the mean sepal length,
# the mean sepal width, and the number of rows.
iris5 <- iris4 %>%
  summarise(
    meanLength = mean(Sepal.Length, na.rm = TRUE),
    meanWidth = mean(Sepal.Width, na.rm = TRUE),
    SampleSize = n()
  )

print(iris5)
##   meanLength meanWidth SampleSize
## 1   6.698214  3.041071         56

7

# Finally, create iris6 that calculates the average sepal length, the average sepal width, and the sample size for each species in the iris4 data frame and print iris6.

# Same summary as iris5, but computed separately for each Species:
# one output row per species present in iris4.
iris6 <- iris4 %>%
  group_by(Species) %>%
  summarise(
    meanLength = mean(Sepal.Length, na.rm = TRUE),
    meanWidth = mean(Sepal.Width, na.rm = TRUE),
    SampleSize = n()
  )

print(iris6)
## # A tibble: 2 × 4
##   Species    meanLength meanWidth SampleSize
##   <fct>           <dbl>     <dbl>      <int>
## 1 versicolor       6.48      2.99         17
## 2 virginica        6.79      3.06         39

8

# Redo everything above with pipe statements: start from iris and chain all of the steps together.

# Steps 2-7 as one pipeline: filter the rows, keep the three columns, sort by
# sepal length, add the sepal area, then summarise per species.
# arrange() and mutate() are kept to mirror the earlier steps, although the
# per-species summary below uses neither the row order nor Sepal.area.
irisFinal <- iris %>%
  filter(
    Species %in% c("virginica", "versicolor"),
    Sepal.Length > 6,
    Sepal.Width > 2.5
  ) %>%
  select(Species, Sepal.Length, Sepal.Width) %>%
  arrange(desc(Sepal.Length)) %>%
  mutate(Sepal.area = Sepal.Length * Sepal.Width) %>%
  group_by(Species) %>%
  summarise(
    meanLength = mean(Sepal.Length, na.rm = TRUE),
    meanWidth = mean(Sepal.Width, na.rm = TRUE),
    SampleSize = n()
  )

# print() shows the summary as a table; rep() only replicates its argument
# and displayed the columns as a list.
print(irisFinal)
## # A tibble: 2 × 4
##   Species    meanLength meanWidth SampleSize
##   <fct>           <dbl>     <dbl>      <int>
## 1 versicolor       6.48      2.99         17
## 2 virginica        6.79      3.06         39

9

# Create a ‘longer’ data frame using the original iris data set with three columns named “Species”, “Measure”, “Value”. The column “Species” will retain the species names of the data set. The column “Measure” will include whether the value corresponds to Sepal.Length, Sepal.Width, Petal.Length, or Petal.Width and the column “Value” will include the numerical values of those measurements.

# Reshape iris to long format: every column except Species is stacked, with
# the former column name stored in "Measure" and its number in "Value".
# No group_by() is needed for a reshape; omitting it returns an ungrouped
# three-column result (Species, Measure, Value).
longer <- iris %>%
  pivot_longer(
    cols = -Species,
    names_to = "Measure",
    values_to = "Value"
  )