R - Data Wrangling

Setup

R studio
https://cloud.r-project.org/

Documentation
https://www.rdocumentation.org/

Install and Open R

Overview

R at first glance

Intro

"Dataframe Oriented"

Weirdness about R

Goodness about R

Bad

Slowest (besides Octave)

Big Principle

Keep things simple

The data.frame is a type (class) that makes it easy to do this.

Basics

In MacOS or Linux terminal type

R

bash
In windows cmd.exe type

R.exe

bash

Assignment Operators

# Leftward assignment with `<-`; dots are allowed inside variable names.
A <- 3
A.B <- 3
remove(A)   # delete the variable A
print(A.B)
4 -> A      # rightward assignment also works
print(A)

# The same steps using `=` for assignment.
A = 3
A.B = 3
rm(A)       # rm() is used here in place of remove()
print(A.B)
print(A)    # A was removed above, so this line raises an "object not found" error

sample(1:4, 10, replace=TRUE)

Conventions

<- for variables
= for arguments

= used to not work for variable assignment

REPL

down history = ctrl-n or down
up history = ctrl-p or up
tab-completion
clear view = ctrl-l
quit = ctrl-d
newline = ctrl-c

#help
help("data.frame") # help on data.frame
help("ls")
# up and down or j and k to scroll
# q to quit
ls # gives code for function

builtins()
page(builtins)

grep("ls",builtins(),value=TRUE)

# save variables
# Write every object in the workspace to the file "myvars", clear the
# workspace, then restore the objects from that file.
ls()
save(list = ls(), file = "myvars")  # save() needs the object names and a target file
rm(list=ls())
ls()                                # workspace is now empty
load("myvars")
ls()                                # the saved objects are back

#save history
history()
q() # quit
history()
loadhistory()

savehistory(file="myhistory")
q()
loadhistory(file="myhistory")
history()

Syntax highlighting (Don't do during tutorial)

install.packages("devtools")
devtools::install_github("jalvesaq/colorout")
library("colorout")

Mathematical Operators

# Anything after a hash sign is a comment - it will not be evaluated
8+5                          # + is an operator
8*5                          # Multiplication
(5+3)/2                      # Order of operations apply!
a <- 2^2
a^2

Logical Operators

# Comparison operators; each line evaluates to TRUE or FALSE.
b <- 3
a <- -4
b > abs(a)                   # Is b greater than abs(a)?
b < abs(a)                   # Is b less than abs(a)?
b == 3                       # Is b equal to 3? Note two equal signs instead of one.
b != 3                       # Is b not equal to 3?
b <= 4                       # Is b less than or equal to 4?
b >= 4                       # Is b greater than or equal to 4?
TRUE==1                      # TRUE and 1 compare as equal
FALSE==0                     # FALSE and zero compare as equal
!TRUE==FALSE                 # ! takes the complement of a test (negates it)

c <- b==4                    # Assigning result of logical test
print(c)

Combined logic

!(b <= 4)                    # Negation of "b less than or equal to 4", i.e. is b greater than 4?
b==5 | abs(a)==3             # OR
b==5 & abs(a)==3             # AND
(b==3 & abs(a)==3 ) | c      # Grouping

Conditionals

x='123' # string

if (x == "123") {
  y=1:3 # vector
} else if (x=="345") {
  y=3:5
} else {
  y=6:24
}
print(y)

Combined logic

x='a'

if (x == "a" || x=="A") {
  y=1
} else if (x=="B" || x=="b") {
  y=2
} else {
  y=3
}
print(y)

Double vs Single

Use '&' and '|' for bare logical statements
Use '&&' and '||' in conditional statements
(Latter uses short-circuit evaluation)

Loops

The Problem

# Counting down by hand: one print/sleep pair per number.
print(10)
Sys.sleep(1)
print(9)
Sys.sleep(1)
print(8)
Sys.sleep(1)
print(7)
# ...
print(1)       # R has no disp(); print() is the equivalent
Sys.sleep(1)

The Solution

for (val in 1:10) {
    print(val)
    Sys.sleep(1)
}

x=1:10
for (val in x) {
    print(val)
    Sys.sleep(1)
}

while loop

x=10
while (x > 0) {
    print(x)
    Sys.sleep(1)
    x <- x-1
}

Types

Primitive

- Logical
- Integer
- Numeric
- Complex
- Character
- Raw

Atomic vectors

Note: bracketed numbers on right
Different from 'generic vectors'
Vector of Primitives

a <- 1
# a is an atomic vector of length 1
class(1[1])
class(1[1][1])

Default

In R virtually everything is an atomic vector (with numeric/character elements)

is.integer(2)
is.integer(a[1])   # indexing
b <- as.integer(2) # cast vectors to integers
is.integer(b)
class(b)
class(2) # numeric (atomic vector)
is.integer(2L)

Strings (Atomic Vectors)

character atomic vector = string atomic vector
Not typical strings

# A string is a character vector of length 1.
C <- "foo bar"
length(C)         # number of elements in the vector (1), not the number of characters
nchar(C)          # number of characters (7)
substr(C, 1, 3)   # characters 1 through 3 of C ("foo")

escape characters

print('C:\Documents')            # fails to parse: '\D' is not a recognized escape sequence
print('C:\\Documents')           # a literal backslash must be written as \\
cat('C:\\Documents')             # cat() writes the string itself; no newline is added
cat('C:\\Documents\n')           # explicit trailing newline
cat('C:\\Documents',sep="\n")

cat(3)
cat(3); cat('\n')                # two statements on one line, separated by ;

cat('Programming in R is \"fun.\"\n')    # \" is an escaped double quote
cat('\tProgramming in R is \"fun.\"\n')  # \t is a tab

Numeric Vectors

vectors, matrices, arrays
very similar to matlab,julia,python
just different syntax

# Creating, sorting, indexing and searching numeric vectors.
my.vec  <- 10:20
print(my.vec)
# period over underscore

my.seq <- seq(from = 0, to = 100, by = 20)
print(my.seq)

my.vec2 <- c(50, 20, 30, 50)
print(my.vec2)
sort(my.vec2)

my.vec[2]
my.vec[c(2,4)]
my.vec[-1] # negative indices are the complement: all but the first element
my.vec[-c(2,4)] # all but elements 2 and 4
my.vec[length(my.vec)] # last element
tail(my.vec,n=1) # NOTE argument equals sign

my.vec[-length(my.vec)] # all but the last element
# Negative indices must be positions, not values, so index via seq_along().
my.vec[-tail(seq_along(my.vec),n=4)] # all but the last four elements
head(my.vec,n=-4) # same

my.rep <- rep(c(1,2,3), each = 3)

2 %in% my.vec
2 %in% my.rep

match(2, my.rep)        # position of the first 2
match(c(2,3), my.rep)   # first position of each value

Matrices

matrix( v, nrow = n, ncol = m, byrow = b)

Creation

# Build a 2 x 3 matrix from a vector (filled column by column) and inspect it.
my.vec=1:6
print(my.vec)
my.matrix <- matrix(my.vec, nrow = 2, ncol = 3)
print(my.matrix)
t(my.matrix)                    # transpose

matrix(0,nrow=3,ncol=3)         # 3 x 3 matrix of zeros (matrix(), not my.matrix())

t(matrix(my.vec, nrow = 3, ncol = 2))

dim(my.matrix)                  # rows, columns
length(my.matrix)               # total number of elements
nrow(my.matrix)
ncol(my.matrix)

Indexing

my.matrix[2,]  # row
my.matrix[,2]  # column
my.matrix[1,2] # element at row 1 column 2
my.matrix[,c(2,3)]
my.matrix[1,c(2,3)]
my.matrix[-1,c(2,3)]
my.matrix[4]

tail(my.matrix,n=1) # doesn't work the same
my.matrix[-length(my.matrix)]
e<-length(my.matrix)
my.matrix[-c(e,e-1)]

Reassign

my.matrix[4] <- 44
print(my.matrix)
my.matrix[3:4] <- c(33,444)
print(my.matrix)
my.matrix[,2] <- c(3,4)
print(my.matrix)

Append

my.new.matrix <- cbind(my.matrix, c(7, 8)) # append column
print(my.new.matrix)
my.new.matrix <- rbind(my.new.matrix, c(9, 10, 11, 12)) # accounts for new column

cbind(c(7,8),my.matrix)
cbind(c(7,8),my.matrix[,c(2,3)],my.matrix[,c(1,2)])

Match

10 %in% my.new.matrix
match(10,my.new.matrix)
my.new.matrix[10]

which(my.new.matrix == 10, arr.ind=TRUE)
which(my.new.matrix == 10)

Looping

# Fill a 3 x 4 matrix row by row, each cell being 3 more than the previous one.
last <- 0
my.matrix <- matrix(1, 3, 4)
# seq_len() is safe even when a dimension is 0 (1:0 would iterate over c(1, 0)).
for (row in seq_len(nrow(my.matrix))) {
    for (col in seq_len(ncol(my.matrix))) {
        my.matrix[row, col] <- 3 + last
        last <- my.matrix[row, col]
    }
}

print(my.matrix)
cat(my.matrix,sep="\n")   # prints the elements in column-major order, one per line

Math

my.matrix+3                  # add 3 to every element
my.matrix*3                  # multiply every element by 3
my.matrix+my.matrix          # element-wise sum
my.matrix*my.matrix          # element-wise product (not matrix multiplication)

diag(4)                      # 4 x 4 identity matrix
diag(4)*3

my.matrix %*% t(my.matrix) # matrix multiplication

# eigen() returns a list; its components are accessed with $.
my.matrix=matrix(runif(9),nrow=3)
out <- eigen(my.matrix)
out$values
out$vectors

Arrays

array( v, dim=c(d1,d2,d3…))

# 3 x 3 x 3 array filled from 27 runif() draws, transformed as (x - 1) * 20.
my.array <- array((runif(27)-1)*20, dim=c(3,3,3))
print(my.array)
dim(my.array)
my.array[3,,] # row 3 (index 3 along the first dimension)
my.array[,3,] # column 3 (index 3 along the second dimension)
my.array[,,3] # slice 3 along the third dimension

Lists

Like cells in matlab. Avoid if possible.

# Creating, indexing, shrinking and iterating over lists.
my.list <- list("my string",3,2)
print(my.list)
print(length(my.list))
"my string" %in% my.list

append(my.list, "a 2nd string") # returns a new list; without assignment my.list is unchanged
print(my.list)
cat(my.list) # works here because every element is an atomic vector of length 1
append(my.list, "a 3rd string", after = 2)

my.list[[1]] # returns element of list
my.list[1] # returns sub-list
my.list[1][1] # still the same one-element sub-list
my.list[1:3]
unlist(my.list[c(2,3)])

my.list <- list(1:5) # vector becomes first element
my.list <- as.list(1:5)
my.list <- my.list[-1] # remove first element & shift
my.list[1] <- NULL  # same
print(my.list)
my.list[[1]] <- NULL  # same if element has length of 1
print(my.list)

# Print each element in turn (x is the current element, not the whole list).
for (x in my.list) {
  print(x)
}

my.rep.list <- c(my.list,my.list)
print(my.rep.list)

Named lists

hard to use

my.list=list(name=c("Dave","Mel","Brian"),age=c(31,32,40))

my.list['name']
my.list[1]
names(my.list)

Factors

best way to store categorical data

similar to atomic vector
levels - possible values

# Creating factors, reordering/adding/renaming levels, and summarizing them.
data <- c('R','python','Julia','matlab','python','matlab','matlab')

my.fac <- factor(data)
print(my.fac)

levels(my.fac)
# reorder levels
my.fac <- factor(my.fac,levels = c('python','Julia','R','matlab'))
print(my.fac)
levels(my.fac)

my.fac <- gl(4,3,labels=levels(my.fac))
print(my.fac)
# n = number of levels
# k = number of repeats
# labels = names

length(my.fac)
my.fac[1]
my.fac[4] <- 'matlab'
print(my.fac)


# add new level
levels(my.fac) <- c(levels(my.fac), 'Perl') # ADD NEW LEVEL

# change value (must match an existing level exactly, including case,
# otherwise the element becomes NA with a warning)
my.fac[4] <- 'Perl'

# rename level
levels(my.fac)[levels(my.fac)=='Perl']  <- "perl"
print(my.fac)

levels(my.fac)
length(my.fac)
nlevels(my.fac)

table(my.fac)
summary(my.fac)

sample(my.fac,3)

Datasets

data()
?mtcars
nrow(mtcars)
ncol(mtcars)

Types: data.frame

Named lists + matrices

Calling

numbers will
data.frame(c1,c2,…cn, args)
ARGS

Create

favoriteangle <- c(pi, pi/2, pi/4)
my.data <- data.frame(number = 10:12,
                      isblind = c(FALSE, FALSE, TRUE),
                      haircolor = c("bald", "brown", "red"),
                      favoriteangle,
                      row.names = c("Dave","MEL","BRIAN"),
                      stringsAsFactors = FALSE
                      )

print(my.data)
summary(my.data)
str(my.data) # structure

names(my.data)
rownames(my.data)
colnames(my.data)

Before R 4.0, text columns were cast to factors by default (hence stringsAsFactors = FALSE above); since R 4.0 they stay character.

Add info

# RENAME ROWS
rownames(my.data) <- c("Dave","Mel","Brian")
print(my.data)

# RENAME COLUMNS
colnames(my.data) <- c("score","isBlind","hairColor","favoriteAngle")
print(my.data)
# Attach a free-text note to the object and read it back.
comment(my.data) <- "Need more subjects!"
comment(my.data)

Summarize

attributes(my.data)
dim(my.data)
ncol(my.data)
nrow(my.data)
length(my.data)
head(my.data,n=2)
tail(my.data,n=3)

File to data.frame

read.csv('/path/to/my/file', header=TRUE,sep=",")
read.xls()
read.spss()

Convert to data.frame

# named list
my.list.f <- as.data.frame(my.list)
print(my.list.f)

rownames(my.list.f) <- my.list[[1]]
my.list.f[,1] <- NULL

# matrix
my.mat.f <- as.data.frame(my.matrix)
print(my.mat.f)
colnames(my.mat.f) <- c("col1","col2","col3")

Index

Just like matrices
Indexing creates smaller dataframes

# Index by position, by name, or with $ (my.data has the columns
# score, isBlind, hairColor, favoriteAngle and rows Dave, Mel, Brian).
my.data[1:3, ]          # get first 3 rows
my.data[[3]]            # third column as a plain vector
my.data[c(2,3), c(3,4)] # rows 2-3 of columns 3-4 (note the comma between rows and columns)

my.data$score           # *simple, and what makes dataframes shine
my.data["Dave",]
my.data[,"hairColor"]   # column as a vector
my.data["hairColor"]    # one-column data.frame
my.data["Dave","hairColor"]

# reassign
my.data["Dave","favoriteAngle"] <- pi/3
print(my.data)

Match

# MATCH
# Logical row selection: the trailing comma makes the logical vector pick rows
# rather than columns.
my.data[my.data$isBlind == TRUE, ]
subset(my.data, isBlind==TRUE)
subset(my.data, isBlind==TRUE | hairColor=="brown")

unique(my.data$isBlind)
duplicated(my.data$isBlind)


# Append a copy of the first row, then detect and drop the duplicate.
my.data <- rbind(my.data,my.data[1,])
print(my.data)
duplicated(my.data)
# remove duplicated
my.data <- my.data[!duplicated(my.data),]

Reorder

# reorder columns (by position, then by name)
my.data <- my.data[,c(1,3,2,4)]
print(my.data)
my.data <- my.data[c("favoriteAngle","isBlind", "score", "hairColor")]


# sort rows (result is displayed, not assigned back)
my.data[order(rownames(my.data)),]

# sort columns (result is displayed, not assigned back)
my.data[,order(colnames(my.data))]

# multiple sort
#dd[with(dd, order(-z, b)), ]

ADD

# ADD COLUMNS
# Two equivalent ways: cbind() with a named vector, or assignment through $.
my.data <- cbind(my.data, Strength=c(110, 110, 110))
my.data$strength <- c(110, 110, 110)

# add rows
# Use a list so each value keeps its own type (c() would coerce everything to
# character). One value per column, in column order:
# favoriteAngle, isBlind, score, hairColor, Strength, strength.
my.data <- rbind(my.data, Mike = list(.8, FALSE, 12, "brown", 112, 112))

RM

# RM column
my.data$strength <- NULL
print(my.data)


# RM ROW
my.data[rownames(my.data)!='Dave',]
my.data[-3,]
subset(my.data, !rownames(my.data) %in% "Dave")

Merge

#Outer join:
merge(x = my.data, y = df2, by = "CustomerId", all = TRUE)
#Left outer:
merge(x = my.data, y = df2, by = "CustomerId", all.x = TRUE)
#Right outer:
merge(x = my.data, y = df2, by = "CustomerId", all.y = TRUE)
#Cross join:
merge(x = my.data, y = df2, by = NULL)

Join

join

Apply: Looping without loops

Arrays

apply(ARRAY,MARGIN,FUN)

# By Row (MARGIN = 1)
apply(my.matrix,1,sum)

# By Column (MARGIN = 2)
apply(my.matrix,2,sum)

# By cell (MARGIN = 1:2 applies the function to every element)
apply(my.matrix,1:2, function(x) x+3)

Lists & Vectors

lapply(LIST,FUN)

sapply(X,FUN)

vapply(X,FUN,TYPE)

# list: lapply() always returns a list
lapply(list(1,2,3),sum)

# vectors
lapply(as.list(c(1,2,3)),sum)

# simplified: sapply() collapses the result to a vector when it can
sapply(list(1,2,3),sum)

# simplified, with the result type declared up front (one numeric per element)
vapply(list(1,2,3),sum,numeric(1))

Factored Data

tapply(X,INDEX,FUN)

# XXX
tapply(my.data,,FUN) #INDEX = grouping factor(s)

mapply

XXX

mapply

Graphing

https://www.r-graph-gallery.com/

plot

# Base-graphics plot(): each call adds one more styling argument.
plot(1,1)
plot(c(1, 10), c(20, 30))
x=1:10
y=21:30   # x and y must have the same length (10 values each)
plot(x,y)
plot(x,y, type='l')
plot(x,y, type='l', col='red')
plot(x,y, type='l', col='red', cex=1.5, pch=1)
plot(x,y, type='l', col='red', cex=1.5, pch=2)
plot(x,y, type='l', col='red', cex=1.5, pch=2, lwd=2)
plot(x,y, type='l', col='red', cex=1.5, pch=2, lty=3)
plot(x,y,
     type='l', col='red', cex=1.5, pch=2, lty=3,
     main="My Plot", xlab="x values (au)", ylab="y values (au)")

#multiple
# lines() and points() draw onto the existing plot, so the coordinates must
# fall inside its axis ranges to be visible.
lines(c(1, 10), c(20, 30))
points(c(2.5, 3.5), c(23.5, 23.5))

#legend
my.labels=c("data1","data2")
my.colors=c("black","red")
legend("topright", my.labels, fill=my.colors)

pie

# Pie charts: one label and one colour per slice.
my.counts=c(15,50,35,40)
pie(my.counts)
pie(my.counts, init.angle=45)

my.labels=c("test1","test2","test3","test4")
my.colors=c("blue","red","black","violet")
pie(my.counts,
    main="Test scores", labels=my.labels, col=my.colors)

# legend
legend("topright", my.labels, fill=my.colors)

bar

x <- c("A", "B", "C", "D")

y <- c(2, 4, 6, 8)
barplot(y, names.arg = x)

col
density
width
horiz=TRUE

other

hist
dev.off()
heatmap.2
gplots
ggplot2
plotPCA
plotMA
RColorBrewer
colorRampPalette

BiocManager::install("EnhancedVolcano")
EnhancedVolcano

Packages

install.packages("")
BiocManager
DESeq2
pwr

detach("package:vegan", unload=TRUE)

https://stackoverflow.com/questions/5595512/what-is-the-difference-between-require-and-library

library("BiocManager") # load inside repl
require("BiocManager") # load inside function

Functions

Creating

# Skeleton: a function with two arguments and an empty body (returns NULL).
my.function <- function(my.arg1, my.arg2) {
}
# Same function with a default for my.arg1; returns the sum of both arguments.
# return() is a function call, so its value must be wrapped in parentheses.
my.function <- function(my.arg1 = 1, my.arg2) {
    return(my.arg1 + my.arg2)
}