#############

STAR

#############

Read mapping with STAR

library(systemPipeR)
## Loading required package: Rsamtools
## Loading required package: GenomeInfoDb
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min
## Loading required package: S4Vectors
## Loading required package: stats4
## 
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## Loading required package: GenomicRanges
## Loading required package: Biostrings
## Loading required package: XVector
## 
## Attaching package: 'Biostrings'
## The following object is masked from 'package:base':
## 
##     strsplit
## Loading required package: ShortRead
## Loading required package: BiocParallel
## Loading required package: GenomicAlignments
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
# sal <- SPRproject(logs.dir= ".SPRproject_test") # use this line when .SPRproject_test doesn't exist yet
sal <- SPRproject(overwrite = TRUE, logs.dir= ".SPRproject_test")
## Recreating directory '/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test'
## Creating file '/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test/SYSargsList.yml'

Use GTF file instead of GFF

Download GTF from Arabidopsis from here. This is for proper read counting with STAR.

download.file("https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.59.gtf.gz", "data/tair10.gtf.gz")
R.utils::gunzip("data/tair10.gtf.gz", overwrite=TRUE)
dna <- readDNAStringSet("./data/tair10.fasta")
names(dna) <- c(as.character(c(1:5)), "Mt", "Pt") # Fixes chromomse ids, where numbers are used by ENSEMBL, and toy data set uses: Chr1, Chr2, ...
writeXStringSet(dna, "./data/tair10.fasta")
appendStep(sal) <- LineWise(code = {
                library(systemPipeR)
                }, step_name = "load_SPR")

Read preprocessing

Preprocessing with preprocessReads function

appendStep(sal) <- SYSargsList(
    step_name = "preprocessing",
    targets = "targetsPE.txt", dir = TRUE,
    wf_file = "preprocessReads/preprocessReads-pe.cwl",
    input_file = "preprocessReads/preprocessReads-pe.yml",
    dir_path = "param/cwl",
    inputvars = c(
        FileName1 = "_FASTQ_PATH1_",
        FileName2 = "_FASTQ_PATH2_",
        SampleName = "_SampleName_"
    ),
    dependency = c("load_SPR"))

Alignments with STAR

STAR Indexing

appendStep(sal) <- SYSargsList(
    step_name = "star_index", 
    dir = FALSE, 
    targets=NULL, 
    wf_file = "star/star-index.cwl", 
    input_file="star/star-index.yml",
    dir_path="param/cwl", 
    dependency = "load_SPR"
)

STAR Mapping

appendStep(sal) <- SYSargsList(
    step_name = "star_mapping", 
    dir = TRUE, 
    targets = "preprocessing",
    wf_file = "star/star-mapping-pe.cwl", 
    input_file = "star/star-mapping-pe.yml", 
    dir_path = "param/cwl",
    inputvars = c(preprocessReads_1 = "_FASTQ_PATH1_", preprocessReads_2 = "_FASTQ_PATH2_",
        SampleName = "_SampleName_"), rm_targets_col = c("FileName1", "FileName2"),
    dependency = c("preprocessing", "star_index"))
## Return command-line calls for STAR
cmdlist(sal, step="star_mapping", targets=1)

## BAM outpaths required for read counting below
outpaths <- getColumn(sal, step = "star_mapping", "outfiles", column = "Aligned_toTranscriptome_out_bam")
file.exists(outpaths) # Will not return TRUE until STAR completed sucessfully
## To run sal stepwise, make sure you have constructed your 
## sal object step-by-step starting from an empty sal
## as shown above under chunk: intialize sal for testing 
sal <- runWF(sal, steps=c(1)) # increment step number one by one just for checking
sal
outpaths <- getColumn(sal, step = "star_mapping", "outfiles", column = "Aligned_toTranscriptome_out_bam")
outpaths
file.exists(outpaths) # Will not return TRUE until STAR completed sucessfully
## The following can be used for setting up things initial testing
starPE <- loadWorkflow(targets = "targetsPE.txt", wf_file = "star-mapping-pe.cwl", 
                       input_file = "star-mapping-pe.yml", dir_path = "./param/star_test")
starPE <- renderWF(starPE, inputvars = c(FileName1 = "_FASTQ_PATH1_", FileName2 = "_FASTQ_PATH2_", 
                                         SampleName = "_SampleName_"))
cmdlist(starPE)
runCommandline(starPE, make_bam = FALSE)

Assemble read count matrix

readcounts <- getColumn(sal, step = "star_mapping", "outfiles", column = "ReadsPerGene_out_tab")
assembleCountMA <- function(x) {
    df <- read.delim(x, row.names=1)
    colnames(df) <- paste(rep(gsub("\\..*", "", basename(x)), 3), c("both", "sense", "antisense"), sep="_") 
    df[!rownames(df) %in% c("N_multimapping", "N_noFeature", "N_ambiguous"),]
}
countList <- lapply(readcounts, function(z) assembleCountMA(x=z))
names(countList) <- NULL
## Check whether if row order is identical among all matrices
# all(sapply(names(countList), function(x) row.names(countList[[1]]) %in% row.names(countList[[x]])))
countDFstar <- do.call(cbind, countList)
write.table(countDFstar, "results/countDFstar.xls", col.names = NA, quote = FALSE, sep = "\t")