## ----setup, message = FALSE---------------------------------------------------
library(gemma.R)
library(dplyr)
library(pheatmap)
library(purrr)

## ----include = FALSE----------------------------------------------------------
options('gemma.memoised' = TRUE)

## ----include=FALSE,eval=FALSE-------------------------------------------------
#  # finding a good example
#  all_valid <-
#      get_result_sets(filter = "analysis.subsetFactorValue.characteristics.size > 0") %>%
#      get_all_pages()
#  
#  # remove bugged ones, should be temporary until #50 is fixed
#  contrasts_with_subsets <- all_valid[!all_valid$experimental.factors %>% sapply(is.null),]
#  
#  
#  # find datasets where the experimental factor is marked by multiple statements
#  contrasts_with_subsets <- contrasts_with_subsets[contrasts_with_subsets$experimental.factors %>% sapply(nrow) %>% {.>1},]
#  
#  # find datasets where experimental factor is marked by multiple statements belonging to the same factor
#  
#  contrasts_with_subsets$experimental.factors %>% sapply(function(x){
#      any(duplicated(x$ID))
#      }) %>% {contrasts_with_subsets[.,]} -> contrasts_with_subsets
#  
#  # interaction
#  contrasts_with_subsets = contrasts_with_subsets[grepl("_",contrasts_with_subsets$contrast.ID),]

## -----------------------------------------------------------------------------
get_dataset_annotations('GSE48962') %>%
    gemma_kable

## -----------------------------------------------------------------------------
samples <- get_dataset_samples('GSE48962')
samples$sample.factorValues[[
    which(samples$sample.name == "TSM490")
    ]] %>% 
    gemma_kable()

## ----include = FALSE----------------------------------------------------------

doubled_id <- samples$sample.factorValues[[
    which(samples$sample.name == "TSM490")
]] %>% filter(value == "HTT [human] huntingtin") %>% {.$ID} %>% unique


## -----------------------------------------------------------------------------
id <- samples$sample.factorValues[[
    which(samples$sample.name == "TSM490")
]] %>% filter(value == "HTT [human] huntingtin") %>% {.$ID} %>% unique


# count how many patients has this phenotype
samples$sample.factorValues %>% sapply(\(x){
    id %in% x$ID
}) %>% sum


## -----------------------------------------------------------------------------
id <- samples$sample.factorValues[[2]] %>% 
    filter(value == "HTT [human] huntingtin") %>% {.$factor.ID} %>% unique

samples$sample.factorValues %>% lapply(\(x){
    x %>% filter(factor.ID == id) %>% {.$summary}
}) %>% unlist %>% unique

## -----------------------------------------------------------------------------
design <- make_design(samples)
design[,-1] %>% head %>%  # first column is just a copy of the original factor values
    gemma_kable()

## -----------------------------------------------------------------------------
design %>%
    group_by(`organism part`,timepoint,genotype) %>% 
    summarize(n= n()) %>% 
    arrange(desc(n)) %>% 
    gemma_kable()


## -----------------------------------------------------------------------------
# removing columns containing factor values and URIs for brevity
remove_columns <- c('baseline.factors','experimental.factors','subsetFactor','factor.category.URI')

dea <- get_dataset_differential_expression_analyses("GSE48962")

dea[,.SD,.SDcols = !remove_columns] %>% 
    gemma_kable()

## -----------------------------------------------------------------------------
contrast <- dea %>% 
    filter(
        factor.category == "genotype" & 
            subsetFactor %>% map_chr('value') %>% {.=='cerebral cortex'} # we will talk about subsets in a moment
        )

## -----------------------------------------------------------------------------
# removing URIs for brevity
uri_columns = c('category.URI',
                'object.URI',
                'value.URI',
                'predicate.URI',
                'factor.category.URI')

contrast$baseline.factors[[1]][,.SD,.SDcols = !uri_columns] %>% 
     gemma_kable()

contrast$experimental.factors[[1]][,.SD,.SDcols = !uri_columns] %>% 
     gemma_kable()

## -----------------------------------------------------------------------------
contrast <- dea %>% 
    filter(
        factor.category == "genotype,timepoint" & 
            subsetFactor %>% map_chr('value') %>% {.=='cerebral cortex'} # we're almost there!
        )

## -----------------------------------------------------------------------------
contrast$baseline.factors[[1]][,.SD,.SDcols = !uri_columns] %>% 
     gemma_kable()

contrast$experimental.factors[[1]][,.SD,.SDcols = !uri_columns] %>% 
     gemma_kable()

## -----------------------------------------------------------------------------
contrast$subsetFactor[[1]][,.SD,.SDcols = !uri_columns] %>%
     gemma_kable()

## -----------------------------------------------------------------------------
obj <-  get_dataset_object("GSE48962",resultSets = contrast$result.ID,contrasts = contrast$contrast.ID,type = 'list')
obj[[1]]$design[,-1] %>% 
    head %>% gemma_kable()

## -----------------------------------------------------------------------------
dif_vals <- get_differential_expression_values('GSE48962')
dif_vals[[as.character(contrast$result.ID)]] %>% head %>%  
     gemma_kable()

## -----------------------------------------------------------------------------
# getting the top 10 genes
top_genes <- dif_vals[[as.character(contrast$result.ID)]] %>% 
    arrange(across(paste0('contrast_',contrast$contrast.ID,'_pvalue'))) %>% 
    filter(GeneSymbol!='' | grepl("|",GeneSymbol,fixed = TRUE)) %>% # remove blank genes or probes with multiple genes
    {.[1:10,]}
top_genes %>% select(Probe,NCBIid,GeneSymbol) %>% 
     gemma_kable()

## -----------------------------------------------------------------------------
exp_subset<- obj[[1]]$exp %>% 
    filter(Probe %in% top_genes$Probe)
genes <- top_genes$GeneSymbol

# ordering design file
design <- obj[[1]]$design %>% arrange(genotype,timepoint)

# shorten the resistance label a bit
design$genotype[grepl('HTT',design$genotype)] = "Huntington Model"

exp_subset[,.SD,.SDcols = rownames(design)] %>% t  %>% scale %>% t %>%
    pheatmap(cluster_rows = FALSE,cluster_cols = FALSE,labels_row = genes,
             annotation_col =design %>% select(genotype,timepoint))


## ----sessionInfo, echo=FALSE--------------------------------------------------
sessionInfo()