R

The R and related Bioconductor packages can be invaluable to those of researchers in the life sciences. It is often reasonably well documented, capable of high-end statistical processes and can generate extremely complex and high end graphics. And the best part is that it is FREE.... The down side is there is a bit of a learning curve and there seems to be general assumption that you know a bit of standard command line stuff. The good news is there are a couple of good GUI interfaces that make things a bit easier, like RSTUDIO.

Standard R Prompt:

>

General Commands:

R - Starts an R session in a command terminal

q() - Quits an R session in a command terminal

getwd() - returns current working directory

setwd() - sets the currrent directory as the working directory

list.files() - shows all the files in the current directory (akin to ls in unix)

ls() OR objects() - returns a list of all objects in current session

WHAT ARE OBJECTS?

Good question. These are the things R creates and/or manipulates. These may be variables, arrays of numbers (vectors), character strings, functions, or more general structures built from such components.

rm() - allows you to delete an object(s) in the current session [ rm(foo) OR rm(foo, bar) ]

Using R as a Calculator:

#Find all the available "basic" mathmatical functions
> getGroupMembers(Arith)
[1] "+"   "-"   "*"   "^"   "%%"  "%/%" "/"
#Open the related help page
> help(Arithmetic)
#Addition
> 2+2
[1] 4
#Subtraction
> 5-3
[1] 2
#Multiplication
> 4*3
[1] 12
#Division
> 20/4
[1] 5
>

Adding Values to Objects:

Option 1 - Manual Data Entry
## Assign the number 10 to an Object called A
# Classic method uses "less than" and dash
> A <- c(10)       
# But these alternatives also work
> c(10) -> A
> assign("A", c(10))
## Assign the numbers 3, 4, 5, 6, 7 to an Object called B
> B <- c(3, 4, 5, 6, 7)

Seeing what Objects Exist:

> ls()
[1] "A" "B"
> objects()
[1] "A" "B"

Seeing the Contents of Objects:

> cat(A)
10
> cat(B)
3 4 5 6 7
> head(A)
[1] 10
> head(B)
[1] 3 4 5 6 7

Tutorial

Gene    Size
KRAS    1000
NRAS    1200
TRAF3    1200

Read a tab separated table into R

> Table<-read.table("Example_Table.txt")

or

> Table<-read.table("Example_Table.txt",header=T)

Write a table out (this forces tab separation and ensures the row numbers used in R are not in the export

> write.table(sig, file="SignificantGenes2.txt", sep="\t", row.names=F)

Specific column for a table with a header

head(Table)

Genes<-Table$Genes

Math Operations

For any operation on one column of data, use Table$Columnname and insert the header for the column after the dollar sign.

In this example, "Size" is the name of the column we want to manipulate.

Mean

> mean(Table$Size)

Standard Deviation

> sd(Table$Size)

Median

> median(Table$Size)

Geomean

library(psych)
geometric.mean(Table$Size,na.rm=TRUE)

Mode

> freq <- tapply(Table$Size, Table$Size, length)
> as.numeric(names(freq)[which.max(freq)])

Quartile

> quartile(Table$Size)

Quantile (every 10%)

> quantile(Table$Size,probs=c(0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1))

Subset

- Used to extract rows of a table. Similar to grep or awk.

Usage: subset(Data_Table, Column/Operator/Value)

Operators:

XY Plots

#Download the test data from the data repository section of the site (HtSeq_StrandedVersusUnstranded.txt)
#Open R Studio
#Set working directory to location of downloaded files (>Session>Set Working Directory>Choose Directory)
#Load ggplot
>library(ggplot2)
>library(scales)
#Load data table into R
>xyTest <- read.table("HtSeq_StrandedVersusUnstranded.txt", header=T)
#View table structure
>head(xyTest)
#Basic XY Plot
>ggplot(xyTest, aes(Count_Unstranded, Count_Stranded)) + geom_point()
#Advanced XY Plot - (add 1 to all values to allow log scale plot, scale graph to log10 and include exponential annotation, add custom labels and title, set text size)
>ggplot(xyTest, aes(Count_Unstranded+1, Count_Stranded+1)) + geom_point() + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + ylab(label = "Stranded Count") + xlab(label = "Unstranded Count") + ggtitle("HtSeq - Stranded versus Unstranded") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), title = element_text(size=18))
#Custom with continuous color gradient of 7 rainbow colors, custom annotations both text and rectangles
>ggplot(sex7, aes(Mean, SD)) + geom_point(aes(color = Count)) + annotate("rect", xmin = 0.60, xmax = 0.73, ymin = 0.23, ymax = 0.29, alpha = .2) + annotate("rect", xmin=0.95, xmax=1.0, ymin=0.0, ymax=0.16, alpha=0.2) + annotate("text", x=0.67, y=0.19, label="Female\nX=0.60-0.73\nY=0.23-0.29") + annotate("text", x=0.9, y=0.085, label="Male\nX=0.95-1.00\nY=0.00-0.16") + scale_x_continuous(limits=c(0.5,1.001)) + scale_y_continuous(limits=c(0.0,0.501)) + theme(axis.text = element_text(size=12), axis.title = element_text(size=16)) + annotate("text", x=0.76, y=0.346, label="MMRF_1159", size=4) + annotate("text", x=0.905, y=0.174, label="MMRF_1687", size=4) + annotate("text", x=0.855, y=0.232, label="MMRF_1400", size=4) + annotate("text", x=0.895, y=0.196, label="MMRF_1089", size=4) + scale_color_gradientn("Informative\nVariants", colours=rainbow(7), limits=c(0.0,2000), na.value = "purple", guide = "colourbar")
#Get the correlation values; pearson [absolute value correlation] and spearman(rho) [rank-order correlation]
>cor(xyTest[c(2, 3)], method="spearman")
>cor(xyTest[c(2, 3)], method="pearson")

Working with a Large Matrix

#Load necessary libraries
library(ggplot2)
library(reshape2)
library(grid)
library(gridExtra) #For creating multiple plot pages
library(psych) #For geometric mean
#Read the table, since the matrix is numbers outside of the first row and column
#we indicate that there is a header and that the row name should be derived from the first column
TPM <- read.table("TPM_Matrix.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
## REPLACE GeneX with your gene name
#Extract a gene of interest from the matrix for graphing
GeneX <- subset(TPM, subset=rownames(TPM)=="ENSGxxxxxxxxxxxx")
head(GeneX)

#Transpose the single row gene table into a two column table

mGeneX <- melt(GeneX)
head(mGeneX)

#Generate the various plots and then produce a single image with all 4 plots

a <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_boxplot()

b <- ggplot(mGeneX, aes(x=value)) + geom_histogram(binwidth=5)

c <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_point(position = "jitter")

d <- ggplot(mGeneX, aes(x="GeneX", y=value)) + geom_dotplot(binaxis = "y", stackdir = "center")

grid.arrange(a, b, c, d, ncol=2, nrow=2)
## DYNAMIC code version, just update the first 2 lines to your gene name and ENSG_ID
HUGO <- "TRAF3"
ENSG <- "ENSG00000131323"
GeneX <- subset(TPM, subset=rownames(TPM)==ENSG)
mGeneX <- melt(GeneX)

a <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_boxplot() + xlab(label = "")

b <- ggplot(mGeneX, aes(x=value)) + geom_histogram(binwidth=5)

c <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_point(position = "jitter") + xlab(label = "")

d <- ggplot(mGeneX, aes(x=HUGO, y=value)) + geom_dotplot(binaxis = "y", stackdir = "center") + xlab(label = "")

grid.arrange(a, b, c, d, ncol=2, nrow=2)

#Get a list of the samples with TPM values above 100
subset(mGeneX, value > 100)
#To make it faster to load the matrix a second time
save(TPM, "TPM.rData")
#In subsequent runs load the data object instead of reading the table to make things faster
load("TPM.rData")
#More, load different matrix, calculate geometric means and merge and graph
> NFKB_Keats <- read.table("IA7_Keats_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
> gmNFKB_Keats <- geometric.mean(NFKB_Keats+1,na.rm=TRUE)
> MgmNFKB_Keats <- melt(gmNFKB_Keats)
> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=0.25)
> ggplot(MgmNFKB_Keats, aes(x=value)) + geom_histogram(binwidth=5)
> ggplot(MgmNFKB_Keats, aes(x="NFKB Index (Keats)", y=value)) + geom_point(position = "jitter")
> NFKB_Ann <- read.table("IA7_Annunziate_NFKB.txt", header=T, sep="\t", row.names=1, stringsAsFactors=F)
> gmNFKB_Ann <- geometric.mean(NFKB_Ann+1,na.rm=TRUE)
> MgmNFKB_Ann <- melt(gmNFKB_Ann)
> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=5)
> ggplot(MgmNFKB_Ann, aes(x=value)) + geom_histogram(binwidth=1)
> ggplot(MgmNFKB_Ann, aes(x="NFKB Index (Annunziate)", y=value)) + geom_point(position = "jitter")
> merge_NFKB <- merge(MgmNFKB_Keats, MgmNFKB_Ann, by="row.names")
> cor(merge_NFKB[c(2, 3)], method="pearson")
> cor(merge_NFKB[c(2, 3)], method="spearman")
> ggplot(merge_NFKB, aes(value.x, value.y)) + geom_point()
#Create a vector of ENSGxxx identifiers as "list" and a vector of samples as "samples"
#This will extract the genes in "list" and the samples in "samples"
list <- as.vector(keats_nfkb_list$V1)
Test2 <- subset(nonB, subset=rownames(nonB)%in%list, select=samples)

Mutation Plots

> library(ggplot2)
> library(reshape2)
> library(gridExtra)
> sigMutIA6 <- read.table("IA6_Significant_Mutations.txt", header=T)
> mSigMutIA6 <- melt(sigMutIA6, id.vars="Gene")
> full <- read.table("IA6_Significant_Mutations.txt", header=T)
> a <- ggplot(full, aes(x=Gene, y=log10(p), fill=Significant)) + geom_bar(stat="identity") + coord_flip() + geom_hline(aes(yintercept=log10(0.05)), colour="black", linetype="dashed") + ylab(label = "p-value (log10)") + xlab("") + scale_fill_discrete(name="Significant q-value   ") + xlab("") + theme(axis.text.y = element_blank(), axis.ticks = element_blank(), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + scale_y_reverse()
> b <- ggplot(mSigMutIA6, aes(x=Gene, y=value, fill=variable)) + geom_bar(stat='identity') + theme(axis.text.y = element_text(size = 14, color="black"), axis.text.x = element_text(size = 14,colour = "black"), axis.title = element_text(size = 20), legend.title = element_text(size = 15), legend.text = element_text(size = 12), legend.position="top") + ylab(label = "Somatic Mutation Count") + scale_fill_discrete(name="Mutation Type   ", labels=c("Non-Silent", "Silent", "Non-Coding")) + xlab("") + coord_flip()
> grid.arrange(b,a,ncol=2,widths=c(3, 1))
#Create a plot for each gene showing DNA and RNA allele fraction correlations
> dnarna <- read.table("IA5_graph2.txt", header=T)

> head(dnarna)

RNA_Alt_Freq DNA_Alt_Freq Gene

1 0.115869 0.283 ACTG1

2 0.307286 0.378 ACTG1

> ggplot(dnarna, aes(DNA_Alt_Freq, RNA_Alt_Freq)) + geom_point() + facet_wrap(~ Gene) + ylab(label = "Alt Allele Ratio - RNA") + xlab(label = "Alt Allele Ratio - DNA")

Making Venn Diagrams

#Load Venn Diagram library, it can make 2-,3-,4-, and 5-way venn diagrams.

library(VennDiagram)
#Create clean workspace
grid.newpage()
#Create two-way Venn plot; Numbers represent Total Group1 count, Total Group2 count, Shared Group 1 & 2 count
venn.plot <- draw.pairwise.venn(315, 148, 118, c("Strelka_Pass", "Seurat_MMRF_Filtered"))
#Print the plot

grid.draw(venn.plot)

#For a better looking version

venn.plot <- draw.pairwise.venn(area1        = 267,
+                                 area2        = 808,
+                                 cross.area   = 193,
+                                 category     = c("Strelka Pass", "Seurat Raw"),
+                                 fill         = c("blue", "red"),
+                                 alpha        = 0.3,
+                                 lty          = "blank",
+                                 cex          = 2,
+                                 cat.cex      = 2,
+                                 cat.pos      = c(285, 105),
+                                 cat.dist     = 0.09,
+                                 cat.just     = list(c(-1, -1), c(1, 1)),
+                                 ext.pos      = 30,
+                                 ext.dist     = -0.05,
+                                 ext.length   = 0.85,
+                                 ext.line.lwd = 2,
+                                 ext.line.lty = "dashed")
#More usable version
venn.plot <- draw.pairwise.venn(1734, 1969, 1248, c("MuTect_1000ng", "MuTect_500ng"), euler.d=T , scaled=T, fill=c("blue", "red"), alpha=0.3, lty="blank", cex=2, cat.cex=2, cat.dist=0.03, cat.pos=c(-30,30) )
#Create three-way Venn plot

#Numbers are Area1, Area2, Area3, Intersect 1&2, Intersect 2&3, Intersect 1&3, Intersect 1,2,3

venn.plot <- draw.triple.venn(100, 150, 180, 50, 100, 30, 15, c("WGS", "WES", "RNA"))

venn.plot <- draw.triple.venn(area1=2437, area2=662, area3=6557, n12=245, n23=594, n13=1293, n123=177, c("Bolli", "CoMMpass IA3", "MMGI - Lohr"), euler.d=T, scaled=F,fill=c("Red", "Green", "Blue"), alpha=0.5, cex=1.5, cat.cex=1.7, cat.dist=0.05)

USING GGPLOT2

Hi-Low Distribution plot with Delta coloring

ggplot(Tab, aes(Sample, Top1, ymin = Top2, ymax = Top1, fill = Mean_Top_Delta)) + geom_crossbar(stat='identity', linetype='blank', width=0.75) + xlab(label="Samples Tested") + ylab(label="Top Two Isoform Range") + theme(axis.text.x = element_text(angle=90,hjust = 0.9,vjust=0.5), axis.title = element_text(size=20, vjust=0.4)) + scale_y_continuous(limits=c(0,1), breaks=seq(0, 1, 0.1)) + scale_fill_gradient2("Mean\nInter\nIsoform\nDelta", low="blue", mid = "yellow", high="red", limits=c(0.0,1.0), midpoint = 0.5) + geom_hline(aes(yintercept=0.9), colour="blue", linetype="dashed")

Box Plot/Dot Plot

#Load the data table
All<-read.table("Full.txt",header=T)
#Box Plot
ggplot(All, aes(factor(Class), Ratio)) + geom_boxplot()
#Dot Plot
ggplot(All, aes(factor(Class), Ratio)) + geom_jitter(aes(colour = Alleles))

Box Plot/Dot Plot with x axis grouped into deciles

#Load needed library's

> library(ggplot2) #used to create the plot

> library(gdata) #used to import excel.xls or excel.xlsx files as a data.frame object

> library(Hmisc) #used to make a factor variable which groups the x axis data of a scatter plot into deciles

#read excel file, create factor variable, and plot data

> dat <- read.xls("filename.xlsx") #reads any excel file into R as a data.frame with headers

> RIN <- cut2(dat$AliquotStock_RIN, g=10) #cut2 (part of the Hmisc library) creates a factor. the g=10 specifies 10 groups of equal observations (deciles)

> ggplot(dat, aes(factor(RIN), dat$picardTools.rnaMetrics.PCT_MRNA_BASES)) + geom_jitter()

Histogram

Freq2<-read.table("Frequency_Table2.txt",header=T)
library(ggplot2)
p <- ggplot(Freq2, aes(x=Count))
p + geom_histogram(aes(fill = Assay),binwidth = 10)
ggplot(Freq2) + geom_histogram(aes(x=Ratio,fill = Assay),binwidth = .02)
#CODE FOR GAINED REGIONS
ggplot(CRD, aes(x=X1q21)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.1375, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = 0.3219, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "1q21 - CKS1B") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")
#CODE FOR DELETED REGIONS
ggplot(CRD, aes(x=X17p13)) + geom_histogram(binwidth=0.01) + geom_vline(xintercept = -1.0, colour="green", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.5849, colour="orange", linetype = "longdash", size = 1) + geom_vline(xintercept = 1.0, colour="red", linetype = "longdash", size = 1) + geom_vline(xintercept = 0.0, colour="darkgrey", linetype = "longdash", size = 1) + geom_vline(xintercept = -0.1520, colour="purple", linetype = "dashed", size = 1) + geom_vline(xintercept = -0.4150, colour="purple", linetype = "solid", size = 1) + theme(plot.title = element_text(size=20), axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + labs(title = "17p13 - TP53") + ylab(label = "Count") + xlab(label = "Copy Number State [log2(T/N)]")
ggsave("1q21.png")

Line Diagram

#This plots two different data sources on the same graph.
#A smoothed trendline is applied and the confidence interval is suppressed (se=F)
>ggplot() + stat_smooth(data=cov3, aes(percent_gc, normalized_coverage, color=Sample), se=F, method = glm, formula= y ~ ns(x,15)) + stat_density(data=cov, aes(x=X.gc,y=..density../10))+ ylab(label = "Normalized Coverage") + xlab(label = "Percent GC of Capture Target") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) + ylim(limits=c(0,1))

More Examples

> Dist <- read.table("Intermarker_Distance.txt", header = T)
> library(ggplot2)
#These make things work
> library(MASS)
> library(scales)
> ggplot(Dist) + geom_histogram(aes(x=InterMarkerDistance), binwidth = 1000) + theme(axis.text = element_text(size = 16)) + theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.title = element_text(size=20)) + scale_x_continuous(name="Inter-Marker Distance", labels = comma, limits=c(0,500000), breaks=c(0,50000,100000,150000,200000,250000,300000,350000,400000,450000,500000))
> ggplot(Dist, aes(factor(CHR), InterMarkerDistance)) + geom_boxplot() + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),labels = trans_format("log10", math_format(10^.x))) + annotation_logticks(sides = "l") + ylab(label = "Inter-Marker Distance (bp)") + xlab(label = "Chromosome") + theme(axis.text = element_text(size=16), axis.title = element_text(size=20))
# Bar Chart
ggplot(dat, aes(factor(dat$Hugo), dat$n_nonsilent, fill=dat$TPM_2)) +
geom_bar(stat = "identity") +
theme(axis.text.y = element_text(size = 14), axis.text.x = element_text(angle=90,size = 14,colour = "black", hjust = 0.9, vjust=0.5), axis.title = element_text(size = 24), legend.title = element_text(size = 15), legend.text = element_text(size = 12)) +
scale_fill_gradient2("Log2\nMedian\n(TPM+1)", low="blue", mid="yellow", high="red", midpoint = 5) +
ylab(label = "Non-synonomous Mutation Count") +
xlab(label = "")

Create a Map

library(ggplot2)
library("PBSmapping")
library("data.table")
library(maptools)
library(maps)

mmrf_map <- read.table("map.txt", header=T)

> head(mmrf_map)

Vertical Horizontal

1 33.75286 -84.39013

2 34.11918 -84.30292

3 35.68612 -79.82919

vert <- as.vector(mmrf_map$Vertical)

hort <- as.vector(mmrf_map$Horizontal)

xlim=c(-150,25)
ylim=c(20,70)
worldmap = map_data("world")
setnames(worldmap, c("X","Y","PID","POS","region","subregion"))
worldmap = clipPolys(worldmap, xlim=xlim,ylim=ylim, keepExtra=TRUE)
statemap = map_data("state")
setnames(statemap, c("X","Y","PID","POS","region","subregion"))
statemap = clipPolys(statemap, xlim=xlim,ylim=ylim, keepExtra=TRUE)
test.y <- c(33.752856, 34.119177, 35.686122, 45.778852, 51.083333, 53.55)
test.x <- c(-84.39013, -84.30292, -79.82919, -108.5742, -114.083333, -113.5)
p <- ggplot() + coord_map(xlim=xlim,ylim=ylim) + geom_polygon(data=worldmap,aes(X,Y,group=PID), fill = "grey70",color="grey50") + geom_polygon(data=statemap,aes(X,Y,group=PID), fill = "grey70",color="grey50")
p + geom_point(aes(x=test.x, y=test.y), color="blue", size=3)
p + geom_point(aes(x=hort, y=vert), color="blue", size=3)

Install packages from the command line

sudo R
install.packages(c("PackageName"),dependencies=T)
q()

Graphing

Histogram

As tabular data

> hist(Table$Size,plot=F)

Trim value list to those that are less than 500

> Trimmed<-Table$Size[Table$Size<500]

Graph trimmed with bins from 0-500 with length of 10

> bins=seq(0,500,by=10)
> hist(Trimmed,breaks=bins)