GEN242

#############

STAR

#############

Read mapping with `STAR`

library(systemPipeR)

## Loading required package: Rsamtools

## Loading required package: GenomeInfoDb

## Loading required package: BiocGenerics

## 
## Attaching package: 'BiocGenerics'

## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs

## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, saveRDS, setdiff,
##     table, tapply, union, unique, unsplit, which.max, which.min

## Loading required package: S4Vectors

## Loading required package: stats4

## 
## Attaching package: 'S4Vectors'

## The following object is masked from 'package:utils':
## 
##     findMatches

## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname

## Loading required package: IRanges

## Loading required package: GenomicRanges

## Loading required package: Biostrings

## Loading required package: XVector

## 
## Attaching package: 'Biostrings'

## The following object is masked from 'package:base':
## 
##     strsplit

## Loading required package: ShortRead

## Loading required package: BiocParallel

## Loading required package: GenomicAlignments

## Loading required package: SummarizedExperiment

## Loading required package: MatrixGenerics

## Loading required package: matrixStats

## 
## Attaching package: 'MatrixGenerics'

## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars

## Loading required package: Biobase

## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.

## 
## Attaching package: 'Biobase'

## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians

## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians

# Start a fresh workflow project; an existing '.SPRproject_test' directory is recreated.
# When '.SPRproject_test' does not exist yet, use this call instead:
# sal <- SPRproject(logs.dir = ".SPRproject_test")
sal <- SPRproject(
    logs.dir = ".SPRproject_test",
    overwrite = TRUE
)

## Recreating directory '/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test'
## Creating file '/home/tgirke/tmp/GEN242/content/en/assignments/Projects/helper_code/aligners/.SPRproject_test/SYSargsList.yml'

Use GTF file instead of GFF

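Download the Arabidopsis GTF annotation file from Ensembl Plants (the URL is given in the code below). A GTF file, rather than a GFF file, is required for proper read counting with STAR.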

## Fetch the compressed GTF annotation and unpack it into the data directory
download.file(
    url = "https://ftp.ensemblgenomes.ebi.ac.uk/pub/plants/release-59/gtf/arabidopsis_thaliana/Arabidopsis_thaliana.TAIR10.59.gtf.gz",
    destfile = "data/tair10.gtf.gz"
)
R.utils::gunzip("data/tair10.gtf.gz", overwrite = TRUE)

## Fix the chromosome ids of the reference: ENSEMBL uses plain numbers (1, 2, ...),
## whereas the toy data set uses Chr1, Chr2, ...
## NOTE(review): the new ids are assigned by position - this assumes the FASTA holds
## exactly seven sequences in the order of the ids below; confirm with names(dna).
dna <- readDNAStringSet("./data/tair10.fasta")
names(dna) <- c(as.character(1:5), "Mt", "Pt")
writeXStringSet(dna, "./data/tair10.fasta") # overwrites the input FASTA in place

## Workflow step "load_SPR": R code step that loads the systemPipeR package
appendStep(sal) <- LineWise(
    code = {
        library(systemPipeR)
    },
    step_name = "load_SPR"
)

Read preprocessing

Preprocessing with `preprocessReads` function

## Workflow step "preprocessing": paired-end read preprocessing, configured by the
## preprocessReads CWL/YAML files under param/cwl; runs after "load_SPR"
appendStep(sal) <- SYSargsList(
    step_name = "preprocessing",
    dependency = "load_SPR",
    targets = "targetsPE.txt",
    dir = TRUE,
    dir_path = "param/cwl",
    wf_file = "preprocessReads/preprocessReads-pe.cwl",
    input_file = "preprocessReads/preprocessReads-pe.yml",
    inputvars = c(
        FileName1 = "_FASTQ_PATH1_",
        FileName2 = "_FASTQ_PATH2_",
        SampleName = "_SampleName_"
    )
)

Alignments with `STAR`

`STAR` Indexing

## Workflow step "star_index": STAR indexing, configured by the star-index CWL/YAML
## files under param/cwl; takes no targets and runs after "load_SPR"
appendStep(sal) <- SYSargsList(
    step_name = "star_index",
    dependency = "load_SPR",
    targets = NULL,
    dir = FALSE,
    dir_path = "param/cwl",
    wf_file = "star/star-index.cwl",
    input_file = "star/star-index.yml"
)

`STAR` Mapping

## Workflow step "star_mapping": paired-end STAR mapping that takes its targets from
## the "preprocessing" step and runs after both "preprocessing" and "star_index"
appendStep(sal) <- SYSargsList(
    step_name = "star_mapping",
    dependency = c("preprocessing", "star_index"),
    targets = "preprocessing",
    dir = TRUE,
    dir_path = "param/cwl",
    wf_file = "star/star-mapping-pe.cwl",
    input_file = "star/star-mapping-pe.yml",
    inputvars = c(
        preprocessReads_1 = "_FASTQ_PATH1_",
        preprocessReads_2 = "_FASTQ_PATH2_",
        SampleName = "_SampleName_"
    ),
    rm_targets_col = c("FileName1", "FileName2")
)

## Show the STAR command-line call of the first target
cmdlist(sal, step = "star_mapping", targets = 1)

## BAM output paths of the STAR mapping step
outpaths <- getColumn(
    sal,
    step = "star_mapping",
    "outfiles",
    column = "Aligned_toTranscriptome_out_bam"
)
file.exists(outpaths) # Will not return TRUE until STAR has completed successfully

## To run sal stepwise, make sure the sal object has been constructed
## step by step, starting from an empty sal, as shown above
## (chunk: initialize sal for testing)
sal <- runWF(sal, steps = 1) # increase the step number one at a time for checking
sal
outpaths <- getColumn(
    sal,
    step = "star_mapping",
    "outfiles",
    column = "Aligned_toTranscriptome_out_bam"
)
outpaths
file.exists(outpaths) # Will not return TRUE until STAR has completed successfully

## Stand-alone setup of the STAR mapping step; this can be used for initial testing
starPE <- loadWorkflow(
    targets = "targetsPE.txt",
    wf_file = "star-mapping-pe.cwl",
    input_file = "star-mapping-pe.yml",
    dir_path = "./param/star_test"
)
starPE <- renderWF(
    starPE,
    inputvars = c(
        FileName1 = "_FASTQ_PATH1_",
        FileName2 = "_FASTQ_PATH2_",
        SampleName = "_SampleName_"
    )
)
cmdlist(starPE)
runCommandline(starPE, make_bam = FALSE)

Assemble read count matrix

## Output paths of the STAR mapping step that hold the per-gene read counts
readcounts <- getColumn(
    sal,
    step = "star_mapping",
    "outfiles",
    column = "ReadsPerGene_out_tab"
)
## Import one STAR gene count file (ReadsPerGene.out.tab) as a count table.
##
## x: path to a single tab-delimited count file without header line; the
##    first column holds the row ids, followed by three count columns.
##
## Returns a data.frame with one row per gene (row names are taken from the
## first column) and three columns named <sample>_both, <sample>_sense and
## <sample>_antisense, where <sample> is the file name up to its first dot.
## The summary rows N_unmapped, N_multimapping, N_noFeature and N_ambiguous
## are removed.
assembleCountMA <- function(x) {
    ## The file has no header line. Reading it with header = FALSE keeps the
    ## first line as data instead of silently consuming it as column names.
    df <- read.delim(x, header = FALSE, row.names = 1)
    sample_id <- gsub("\\..*", "", basename(x))
    colnames(df) <- paste(sample_id, c("both", "sense", "antisense"), sep = "_")
    ## Summary rows are dropped by name, wherever they occur in the file
    summary_rows <- c("N_unmapped", "N_multimapping", "N_noFeature", "N_ambiguous")
    df[!rownames(df) %in% summary_rows, , drop = FALSE]
}
## Import the count table of every sample
countList <- lapply(readcounts, assembleCountMA)
names(countList) <- NULL # avoids list names being prefixed to the column names by cbind()
## cbind() combines the tables by position, so all of them must list the
## same row names in the same order; stop instead of misaligning counts
sameRows <- vapply(
    countList,
    function(df) identical(rownames(df), rownames(countList[[1]])),
    logical(1)
)
if (!all(sameRows)) {
    stop("Row names differ among the STAR count tables; they cannot be combined by position.")
}
countDFstar <- do.call(cbind, countList)
## Tab-delimited text file (despite the .xls extension)
write.table(countDFstar, "results/countDFstar.xls", col.names = NA, quote = FALSE, sep = "\t")

STAR

Read mapping with STAR

Use GTF file instead of GFF

Read preprocessing

Preprocessing with preprocessReads function

Alignments with STAR

STAR Indexing

STAR Mapping

Assemble read count matrix

Read mapping with `STAR`

Preprocessing with `preprocessReads` function

Alignments with `STAR`

`STAR` Indexing

`STAR` Mapping