## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)

# Obtain data
data(Islam2011)

# Do not filter gene list:
Transformed <- Linnorm(Islam2011)

# Filter low count genes
result <- Linnorm(Islam2011, Filter=TRUE)
FTransformed <- result[["Linnorm"]]

## ----echo=TRUE----------------------------------------------------------------
# You can use this file with Excel.
# This file includes all genes.
write.table(Transformed, "Transformed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

# This file filtered low count genes.
write.table(FTransformed, "Transformed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
Normalized <- Linnorm.Norm(Islam2011)
# Important: depending on downstream analysis, please
# set output to be "XPM" or "Raw".
# Set to "XPM" if downstream tools will convert the 
# dataset into CPM or TPM.
# Set to "Raw" if input is raw counts and downstream 
# tools will work with raw counts.

## ----echo=TRUE----------------------------------------------------------------
# You can use this file with Excel.
write.table(Normalized, "Normalized.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(LIHC)

## ----echo=TRUE----------------------------------------------------------------
# Now, we can calculate fold changes between
# sample set 1 and sample set 2.
# Index of sample set 1
set1 <- 1:5
# Index of sample set 2
set2 <- 6:10

# Normalization
LIHC2 <- Linnorm.Norm(LIHC,output="XPM")

# Optional: Only use genes with at least 50%
# of the values being non-zero
LIHC2 <- LIHC2[rowSums(LIHC2 != 0) >= ncol(LIHC2)/2,]

## ----echo=TRUE----------------------------------------------------------------
# Put resulting data into a matrix
FCMatrix <- matrix(0, ncol=1, nrow=nrow(LIHC2))
rownames(FCMatrix) <- rownames(LIHC2)
colnames(FCMatrix) <- c("Log 2 Fold Change")
FCMatrix[,1] <- log((rowMeans(LIHC2[,set1])+1)/(rowMeans(LIHC2[,set2])+1),2)
# Now FCMatrix contains fold change results.

# If the optional filtering step is not done,
# users might need to remove infinite and zero values:
# Remove Infinite values.
FCMatrix <- FCMatrix[!is.infinite(FCMatrix[,1]),,drop=FALSE]
# Remove Zero values
FCMatrix <- FCMatrix[FCMatrix[,1] != 0,,drop=FALSE]

# Now FCMatrix contains fold change results.

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
Density <- density(FCMatrix[,1])
plot(Density$x,Density$y,type="n",xlab="Log 2 Fold Change of TPM+1",
ylab="Probability Density",)
lines(Density$x,Density$y, lwd=1.5, col="blue")
title("Probability Density of Fold Change.\nTCGA Partial LIHC data
Paired Tumor vs Adjacent Normal")
legend("topright",legend=paste("mean = ", 
round(mean(FCMatrix[,1]),2),
"\nStdev = ", round(sd(FCMatrix[,1]),2)))

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
# Normalization with BE_strength set to 1.
# This increases normalization strength.
LIHC2 <- Linnorm.Norm(LIHC,output="XPM",BE_strength=1)

# Optional: Only use genes with at least 50%
# of the values being non-zero
LIHC2 <- LIHC2[rowSums(LIHC2 != 0) >= ncol(LIHC2)/2,]

FCMatrix <- matrix(0, ncol=1, nrow=nrow(LIHC2))
rownames(FCMatrix) <- rownames(LIHC2)
colnames(FCMatrix) <- c("Log 2 Fold Change")
FCMatrix[,1] <- log((rowMeans(LIHC2[,set1])+1)/(rowMeans(LIHC2[,set2])+1),2)
# Now FCMatrix contains fold change results.


Density <- density(FCMatrix[,1])
plot(Density$x,Density$y,type="n",xlab="Log 2 Fold Change of TPM+1",
ylab="Probability Density",)
lines(Density$x,Density$y, lwd=1.5, col="blue")
title("Probability Density of Fold Change.\nTCGA Partial LIHC data
Paired Tumor vs Adjacent Normal")
legend("topright",legend=paste("mean = ", 
round(mean(FCMatrix[,1]),2),
"\nStdev = ", round(sd(FCMatrix[,1]),2)))

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(Islam2011)
# Obtain transformed data
Transformed <- Linnorm(Islam2011)

# Data Imputation
DataImputed <- Linnorm.DataImput(Transformed)

# Or, users can directly perform data imputation during transformation.
DataImputed <- Linnorm(Islam2011,DataImputation=TRUE)

## ----echo=TRUE----------------------------------------------------------------
# You can use this file with Excel.
write.table(DataImputed, "DataImputed.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(Islam2011)
# Obtain stable genes
StableGenes <- Linnorm.SGenes(Islam2011)

## ----echo=TRUE----------------------------------------------------------------
# You can use this file with Excel.
write.table(StableGenes, "StableGenes.txt",
quote=FALSE, sep="\t", col.names=TRUE, row.names=TRUE)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(LIHC)
datamatrix <- LIHC

## ----echo=TRUE----------------------------------------------------------------
# 5 samples for condition 1 and 5 samples for condition 2.
# You might need to edit this line
design <- c(rep(1,5),rep(2,5))
# These lines can be copied directly.
design <- model.matrix(~ 0+factor(design))
colnames(design) <- c("group1", "group2")
rownames(design) <- colnames(datamatrix)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
# The Linnorm-limma pipeline only consists of one line.
DEG_Results <- Linnorm.limma(datamatrix,design)
# The DEG_Results matrix contains your DEG analysis results.

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
# Just add output="Both" into the argument list
BothResults <- Linnorm.limma(datamatrix,design,output="Both")

# To separate results into two matrices:
DEG_Results <- BothResults$DEResults
TransformedMatrix <- BothResults$Linnorm
# The DEG_Results matrix now contains DEG analysis results.
# The TransformedMatrix matrix now contains a Linnorm
# Transformed dataset.

## -----------------------------------------------------------------------------
write.table(DEG_Results, "DEG_Results.txt", quote=FALSE, sep="\t",
col.names=TRUE,row.names=TRUE)

## ----echo=TRUE----------------------------------------------------------------
Genes10 <- DEG_Results[order(DEG_Results[,"adj.P.Val"]),][1:10,]
# Users can print the gene list by the following command:
# print(Genes10)

# logFC: log 2 fold change of the gene.
# XPM: If input is raw count or CPM, XPM is CPM.
# 	If input is RPKM, FPKM or TPM, XPM is TPM.
# t: moderated t statistic.
# P.Value: p value.
# adj.P.Val: Adjusted p value. This is also called False Discovery Rate or q value.}
# B: log odds that the feature is differential.

# Note all columns are printed here

## ----kable1, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10[,c(1:5)], digits=4)

## ----echo=TRUE----------------------------------------------------------------
SignificantGenes <- DEG_Results[DEG_Results[,5] <= 0.05,1]

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
plot(x=DEG_Results[,1], y=DEG_Results[,5], col=ifelse(DEG_Results[,1] %in%
SignificantGenes, "red", "green"),log="y", ylim = 
rev(range(DEG_Results[,5])), main="Volcano Plot", 
xlab="log2 Fold Change", ylab="q values", cex = 0.5)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(Islam2011)
IslamData <- Islam2011[,1:92]

## ----echo=TRUE----------------------------------------------------------------
# 48 samples for condition 1 and 44 samples for condition 2.
# You might need to edit this line
design <- c(rep(1,48),rep(2,44))
# There lines can be copied directly.
design <- model.matrix(~ 0+factor(design))
colnames(design) <- c("group1", "group2")
rownames(design) <- colnames(IslamData)

## ----echo=TRUE----------------------------------------------------------------
# Example 1: Filter low count genes.
# and genes with high technical noise.
scRNAseqResults <- Linnorm.limma(IslamData,design,Filter=TRUE)


# Example 2: Do not filter gene list.
scRNAseqResults <- Linnorm.limma(IslamData,design)

## ----echo=TRUE----------------------------------------------------------------
data(Islam2011)

# Obtain embryonic stem cell data
datamatrix <- Islam2011[,1:48]

## ----echo=TRUE----------------------------------------------------------------
# Setting plotNetwork to TRUE will create a figure file in your current directory.
# Setting it to FALSE will stop it from creating a figure file, but users can plot it
# manually later using the igraph object in the output.
# For this vignette, we will plot it manually in step 4.
results <- Linnorm.Cor(datamatrix, plotNetwork=FALSE,
# Edge color when correlation is positive
plot.Pos.cor.col="red",
# Edge color when correlation is negative
plot.Neg.cor.col="green")

## ----echo=TRUE----------------------------------------------------------------
Genes10 <- results$Results[order(results$Results[,5]),][1:10,]
# Users can print the gene list by the following command:
# print(Genes10)

## ----kable2, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10, digits=4, row.names=FALSE)

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
library(igraph)
Thislayout <- layout_with_kk(results$igraph)
plot(results$igraph,
# Vertex size
vertex.size=2,
# Vertex color, based on clustering results
vertex.color=results$Cluster$Cluster,
# Vertex frame color
vertex.frame.color="transparent",
# Vertex label color (the gene names)
vertex.label.color="black",
# Vertex label size
vertex.label.cex=0.05,
# This is how much the edges should be curved.
edge.curved=0.1,
# Edge width
edge.width=0.05,
# Use the layout created previously
layout=Thislayout
)

## ----echo=TRUE----------------------------------------------------------------
TheCluster <- which(results$Cluster[,1] == "Mmp2")

## ----echo=TRUE----------------------------------------------------------------
# Index of the genes
ListOfGenes <- which(results$Cluster[,2] == TheCluster)

# Names of the genes
GeneNames <- results$Cluster[ListOfGenes,1]

# Users can write these genes into a file for further analysis.

## ----echo=TRUE----------------------------------------------------------------
top100 <- results$Results[order(results$Results[,4],decreasing=FALSE)[1:100],1]

## ----echo=TRUE----------------------------------------------------------------
Top100.Cor.Matrix <- results$Cor.Matrix[top100,top100]

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
library(RColorBrewer)
library(gplots)
heatmap.2(as.matrix(Top100.Cor.Matrix),
# Hierarchical clustering on both row and column
Rowv=TRUE, Colv=TRUE,
# Center white color at correlation 0
symbreaks=TRUE,
# Turn off level trace
trace="none",
# x and y axis labels
xlab = 'Genes', ylab = "Genes",
# Turn off density info
density.info='none',
# Control color
key.ylab=NA,
col=colorRampPalette(c("blue", "white", "yellow"))(n = 1000),
lmat=rbind(c(4, 3), c(2, 1)),
# Gene name font size
cexRow=0.3,
cexCol=0.3,
# Margins
margins = c(8, 8)
)

## ----echo=TRUE----------------------------------------------------------------
data(Islam2011)


## ----echo=TRUE----------------------------------------------------------------
# The first 48 columns are the embryonic stem cells.
results <- Linnorm.HVar(Islam2011[,1:48])

## ----echo=TRUE----------------------------------------------------------------
resultsdata <- results$Results
Genes10 <- resultsdata[order(resultsdata[,"q.value"]),][1:10,3:5]
# Users can print the gene list by the following command:
# print(Genes10)

## ----kable3, echo=FALSE-------------------------------------------------------
library(knitr)
kable(Genes10, digits=4)

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
print(results$plot$plot)
# By default, this plot highlights genes/features with p value less than 0.05.

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(Islam2011)

## ----echo=TRUE----------------------------------------------------------------
tSNE.results <- Linnorm.tSNE(Islam2011[,1:92])

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
# Here, we can see two clusters.
print(tSNE.results$plot$plot)

## ----echo=TRUE----------------------------------------------------------------
# The first 48 samples belong to mouse embryonic stem cells.
Groups <- rep("ES_Cells",48)
# The next 44 samples are mouse embryonic fibroblasts.
Groups <- c(Groups, rep("EF_Cells",44))

## ----echo=TRUE----------------------------------------------------------------
# Useful arguments:
# Group:
# allows user to provide each sample's information.
# num_center:
# how many clusters are supposed to be there.
# num_PC:
# how many principal components should be used in k-means
# clustering.

tSNE.results <- Linnorm.tSNE(Islam2011[,1:92],
 Group=Groups, num_center=2)

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
# Here, we can see two clusters.
print(tSNE.results$plot$plot)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(Islam2011)

## ----echo=TRUE----------------------------------------------------------------
PCA.results <- Linnorm.PCA(Islam2011[,1:92])

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
# Here, we can see multiple clusters.
print(PCA.results$plot$plot)

## ----echo=TRUE----------------------------------------------------------------
# The first 48 samples belong to mouse embryonic stem cells.
Groups <- rep("ES_Cells",48)
# The next 44 samples are mouse embryonic fibroblasts.
Groups <- c(Groups, rep("EF_Cells",44))

## ----echo=TRUE----------------------------------------------------------------
# Useful arguments:
# Group:
# allows user to provide each sample's information.
# num_center:
# how many clusters are supposed to be there.
# num_PC
# how many principal components should be used in k-means
# clustering.

PCA.results <- Linnorm.PCA(Islam2011[,1:92],
Group=Groups, num_center=2, num_PC=3)

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
# Here, we can see two clusters.
print(PCA.results$plot$plot)

## ----echo=TRUE----------------------------------------------------------------
data(Islam2011)
Islam <- Islam2011[,1:92]

## ----echo=TRUE----------------------------------------------------------------
# 48 ESC, 44 EF, and 4 NegCtrl
Group <- c(rep("ESC",48),rep("EF",44))
colnames(Islam) <- paste(colnames(Islam),Group,sep="_")

## ----echo=TRUE----------------------------------------------------------------
# Note that there are 3 known clusters.
HClust.Results <- Linnorm.HClust(Islam,Group=Group,
num_Clust=2, fontsize=1.5, Color = c("Red","Blue"), RectColor="Green")

## ----echo=TRUE, fig.height=6, fig.width=6-------------------------------------
print(HClust.Results$plot$plot)

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
data(SEQC)
SampleA <- SEQC

## ----echo=TRUE----------------------------------------------------------------
# This will generate two sets of RNA-seq data with 5 replicates each. 
# It will have 20000 genes totally with 2000 genes being differentially 
# expressed. It has the Negative Binomial distribution.
SimulatedData <- RnaXSim(SampleA)

## ----echo=TRUE----------------------------------------------------------------
# Index of differentially expressed genes.
DE_Index <- SimulatedData[[2]]

# Expression Matrix
ExpMatrix <- SimulatedData[[1]]

# Sample Set 1
Sample1 <- ExpMatrix[,1:3]

# Sample Set 2
Sample2 <- ExpMatrix[,4:6]

## ----echo=TRUE----------------------------------------------------------------
data(SEQC)
SampleA <- SEQC

## ----echo=TRUE----------------------------------------------------------------
library(Linnorm)
SimulatedData <- RnaXSim(SampleA,
distribution="Gamma", # Distribution in the simulated dataset.
# Put "NB" for Negative Binomial, "Gamma" for Gamma,
# "Poisson" for Poisson and "LogNorm" for Log Normal distribution.
NumRep=5, # Number of replicates in each sample set. 
# 5 will generate 10 samples in total.
NumDiff = 1000, # Number of differentially expressed genes.
NumFea = 5000 # Total number of genes in the dataset
)

## ----echo=TRUE----------------------------------------------------------------
# Index of differentially expressed genes.
DE_Index <- SimulatedData[[2]]

# Expression Matrix
ExpMatrix <- SimulatedData[[1]]

# Simulated Sample Set 1
Sample1 <- ExpMatrix[,1:3]

# Simulated Sample Set 2
Sample2 <- ExpMatrix[,4:6]