meme suite里面的大多数工具都已经非常成熟我们只需要选择合适的参数,用一行命令就可以实现motif discovery。相对而言,准备输入序列反而更需要一些生物学知识,在编码上也更需要花费精力。所以在本例中我们对准备输入序列的过程,以及这一过程用到的R包进行介绍。
在我们给出的这个例子中,假设我们通过某种方式得到了一个gene list,对这些基因的调控序列(promoter, 5' UTR和3' UTR),我们现在考虑两个问题:
docker load -i bioinfo_motif.2.0.tar.gz
docker run -dt --name motif --restart unless-stopped -v {host.machine.shared.directory}:/data bioinfo/motif:2.0 /bin/bash
docker exec -it -u test motif /bin/bash
cd /home/test/motif/sequence_motif
ln -s /data/genome . #这一步相当于建立一个快捷方式,方便访问
gunzip genome/GRCh38.p10.genome.fa.gz #解压
gunzip genome/gencode.v27.annotation.gtf.gz
如前所述,我们提供了准备好的序列文件,如果只希望了解motif分析本身可以跳过这一节。 我们提供了下面两个脚本,根据提供的gene list和gtf注释文件从基因组中提取这些基因的一些可能具有调控作用的序列(promoter, 5'UTR和3' UTR)作为motif discovery的输入。
#!/usr/bin/env Rscript
message("Load required packages ...")
suppressPackageStartupMessages(library("GenomicFeatures"))
message("Load trasncripts in gencode human genome annotation ...")
# specify the location of your genome annotation in gtf format
gtf.path <- "genome/gencode.v27.annotation.gtf"
# GenomicFeatures package use "TxDb" to store annotation for each transcripts
tx.db <- makeTxDbFromGFF(gtf.path, format="gtf")
# Here we only consider longest trasncript of each gene to avoid over representation of genes with multiple isoform
tx.lengths <- transcriptLengths(tx.db, with.cds_len=FALSE,with.utr5_len=FALSE, with.utr3_len=FALSE)
# tx.lengths is a data.frame with 5 fields:
# tx_id, tx_name, gene_id, nexon (numer of exons), tx_len
gene.id.2.longest.tx.id <- lapply(split(tx.lengths,tx.lengths$gene_id),
function(isoform.length.table){isoform.length.table$tx_name[which.max(isoform.length.table$tx_len)];})
# get a look up table: gene id to longest transcript id
gene.id.2.longest.tx.id <- unlist(gene.id.2.longest.tx.id)
message("Load gene ids of interest ...")
gene.ids <- read.delim("gene.ensembl.ids.txt",sep="\n",stringsAsFactors = F,header = F)$V1
message(length(gene.ids)," genes were loaded .")
considered.tx.ids <- gene.id.2.longest.tx.id[gene.ids]
# longest transcript id to gene id look up
names(gene.ids) <- considered.tx.ids
# fetch 5' UTR sequences
message("Fetch 5' UTR genomic intervals ...")
five.prime.utrs = fiveUTRsByTranscript(tx.db, use.names=T)
five.prime.utrs <- data.frame(five.prime.utrs)
# five.prime.utrs is now a data.frame like this:
# group group_name seqnames start end width strand exon_id exon_name exon_rank
#1 1 ENST00000641515.1 chr1 65419 65433 15 + 21 ENSE00003812156.1 1
#2 1 ENST00000641515.1 chr1 65520 65573 54 + 22 ENSE00003813641.1 2
#3 1 ENST00000641515.1 chr1 69037 69090 54 + 23 ENSE00003813949.1 3
message("Saving 5' UTR genomic intervals in bed format ...")
fields <- c("seqnames","start","end","group_name","exon_id","strand")
five.prime.utrs <- five.prime.utrs[five.prime.utrs$group_name %in% considered.tx.ids,fields]
five.prime.utrs$group_name <- paste0(gene.ids[five.prime.utrs$group_name],"-",five.prime.utrs$group_name)
five.prime.utrs$start <- five.prime.utrs$start - 1
write.table(five.prime.utrs,"five.prime.utrs.bed",quote=FALSE,sep="\t",col.names=FALSE,row.names=FALSE)
# fetch 3' UTR sequences
message("Fetch 3' UTR genomic intervals ...")
three.prime.utrs = threeUTRsByTranscript(tx.db, use.names=T)
three.prime.utrs <- data.frame(three.prime.utrs)
message("Saving 3' UTR genomic intervals in bed format ...")
three.prime.utrs <- three.prime.utrs[three.prime.utrs$group_name %in% considered.tx.ids,fields]
three.prime.utrs$group_name <- paste0(gene.ids[three.prime.utrs$group_name],"-",three.prime.utrs$group_name)
three.prime.utrs$start <- three.prime.utrs$start - 1
write.table(three.prime.utrs , "three.prime.utrs.bed",quote=FALSE,sep="\t",col.names=FALSE,row.names=FALSE)
# fetch promoter sequences
message("Fetch promoter genomic intervals ...")
# how are so called "promoters" defined in GenomicFeatures package? please see its manual.
promoters.ivs <- promoters(tx.db)
promoters.ivs <- data.frame(promoters.ivs)
message("Saving promoter genomic intervals ...")
fields <- c("seqnames","start","end","tx_name","tx_id","strand")
promoters.ivs <- promoters.ivs[promoters.ivs$tx_name %in% considered.tx.ids,fields]
promoters.ivs$start <- promoters.ivs$start - 1
write.table(promoters.ivs, "promoters.bed",quote=FALSE,sep="\t",col.names=FALSE,row.names=FALSE)
message("All done .")
mkdir -p sequences
bedtools getfasta -s -name+ -fi genome/GRCh38.p10.genome.fa -bed bed/promoters.bed -fo sequences/promoters.fa
bedtools getfasta -s -name+ -fi genome/GRCh38.p10.genome.fa -bed bed/five.prime.utrs.bed -fo sequences/five.prime.utrs.fa
bedtools getfasta -s -name+ -fi genome/GRCh38.p10.genome.fa -bed bed/three.prime.utrs.bed -fo sequences/three.prime.utrs.fa
#!/usr/bin/env Rscript
message("Load required packages ...")
suppressPackageStartupMessages(library("Biostrings"))
input.sequences.path <- "sequences/three.prime.utrs.fa"
output.sequences.path <- "sequences/three.prime.utrs.spliced.fa"
message("Load sequences ...")
# load fasta file with readBStringSet function in Biostrings package
sequences <- readBStringSet(input.sequences.path, format="fasta")
#example: ENSG00000257315.2-ENST00000550078.2::chr1:203848330-203848407(+)
seq.names.long <- names(sequences)
#example: ENSG00000257315.2-ENST00000550078.2
seq.names <- unlist(lapply(seq.names.long,function(s){strsplit(s,"::")[[1]][1]}))
#example: chr1:203848330-203848407(+)
regions <- unlist(lapply(seq.names.long,function(s){strsplit(s,"::")[[1]][2]}))
#example: 203848330
start.positions <- as.numeric(unlist(lapply(regions,function(s){strsplit(strsplit(s,":")[[1]][2],"-")[[1]][1]})))
strands <- unlist(lapply(regions,function(s){l=nchar(s);substr(s,l-1,l-1);}))
region.table <- data.frame(seq.names.long=seq.names.long,start.positions=start.positions,strands=strands)
intervals.by.tx <- split(region.table,seq.names)
message("Concatenate sequences ...")
string.set <- DNAStringSet()
for(tx.id in names(intervals.by.tx)){
intervals <- intervals.by.tx[[tx.id]]
if(intervals$strands[1]=="+"){
intervals <- intervals[order(intervals$start.positions,decreasing=F),];
}else{
intervals <- intervals[order(intervals$start.positions,decreasing=T),];
}
# concatenate exons that belongs to same UTR into a single spliced sequence
spliced.sequence <- paste(apply(intervals,1,function(x){toString(sequences[x[1]]);}),collapse="")
string.set[[tx.id]] = spliced.sequence
}
# save results in fasta format
message("Saving results ...")
writeXStringSet(string.set,output.sequences.path)
message("All done.")
input.sequences.path <- "sequences/five.prime.utrs.fa"
output.sequences.path <- "sequences/five.prime.utrs.spliced.fa"
fasta-dinucleotide-shuffle -f sequences/promoters.fa > sequences/promoters.dinuc.shuffled.fa
fasta-dinucleotide-shuffle -f sequences/five.prime.utrs.spliced.fa > sequences/five.prime.utrs.spliced.dinuc.shuffled.fa
fasta-dinucleotide-shuffle -f sequences/three.prime.utrs.spliced.fa > sequences/three.prime.utrs.spliced.dinuc.shuffled.fa
meme -dna -minw 6 -maxw 10 -oc promoters.motif.discovery -nmotifs 5 sequences/promoters.fa
ame --control sequences/promoters.dinuc.shuffled.fa --oc promoter.motif.enrichment sequences/promoters.fa known-motifs/JASPAR2018_CORE_vertebrates_non-redundant.meme