我在BioJulia裡頭找不到類似R語言裡頭Biostrings
那個package所提供的letterFrequencyInSlidingView
,所以就打算來自幹一個。
在我自幹的初淺版本當中,會需要用到的packages有:
BioSequences.jl
DataFrames.jl
"""
在這個函數中,我需要用到四個變數:
- seq是BioSequence{DNAAlphabet{4}}類別的物件
- windowSize是我所想要設定的window大小的Int類別整數變數
- asProb則是決定是否將計算得到的GC數量轉化成比例的布林變數
- seqName則是染色體或其他用來代表該序列名稱的String類別變數
"""
function getGCcontentInSlideWindowView(seq::BioSequence{DNAAlphabet{4}}, windowSize::Int, asProb::Bool, seqName::String)
inputSeqLen = length(seq)
@assert inputSeqLen >= windowSize
restSeqLen = inputSeqLen % windowSize
windowsNum = restSeqLen == 0 ? inputSeqLen ÷ windowSize : inputSeqLen ÷ windowSize + 1
GCcontent = asProb ? zeros(Float32, windowsNum) : zeros(Int32, windowsNum)
StartPos = zeros(Int32, windowsNum)
StopPos = zeros(Int32, windowsNum)
Chromosome = fill(seqName, windowsNum)
for i = 0:(windowsNum-1)
idx = i + 1
start = i * windowSize + 1
stop = restSeqLen == 0 ? (i + 1) * windowSize :
( i == (windowsNum - 1) ? i * windowSize + restSeqLen : (i+1) * windowSize)
StartPos[idx] = start
StopPos[idx] = stop
if i == (windowsNum - 1)
@assert stop == inputSeqLen
end
subseq = seq[start:stop]
actgComposition = BioSequences.Composition(subseq).counts
numG = haskey(actgComposition, DNA_G) ? actgComposition[DNA_G] : 0
numC = haskey(actgComposition, DNA_C) ? actgComposition[DNA_C] : 0
numGC = numG + numC
GCcontent[idx] = asProb ? round(numGC/windowSize, digits = 4) : numGC
end
output = DataFrame(Chrom = Chromosome, Start = StartPos, Stop = StopPos, GC = GCcontent)
return output
end
using BioSequences
using DataFrame
# 建立一個空表格
gcData = DataFrame([String, Int32, Int32, Float32], [:Chrom, :Start, :Stop, :GCcontent], 0)
# 讀取人類基因體資料
reader = FASTA.Reader(open("human.fa", "r"))
# 設定window size為20kbp, 開始計算GC content
i = 1
for rec in reader
seqname = FASTA.identifier(rec)
seq = FASTA.sequence(rec)
dataFromEachChrom = getGCcontentInSlideWindowView(seq, 20000, true, seqname)
global gcData = vcat(gcData, dataFromEachChrom)
global i += 1
## 不想計算Mitochodria的部份154423×4 DataFrame
if i == 25
break
end
end
close(reader)
執行後可以得到
154423×4 DataFrame
│ Row │ Chrom │ Start │ Stop │ GC │
│ │ String │ Int32 │ Int32 │ Float32 │
├────────┼────────┼──────────┼──────────┼─────────┤
│ 1 │ chr1 │ 1 │ 20000 │ 0.2962 │
│ 2 │ chr1 │ 20001 │ 40000 │ 0.4943 │
│ 3 │ chr1 │ 40001 │ 60000 │ 0.3608 │
│ 4 │ chr1 │ 60001 │ 80000 │ 0.3566 │
│ 5 │ chr1 │ 80001 │ 100000 │ 0.4026 │
│ 6 │ chr1 │ 100001 │ 120000 │ 0.377 │
│ 7 │ chr1 │ 120001 │ 140000 │ 0.471 │
│ 8 │ chr1 │ 140001 │ 160000 │ 0.448 │
⋮
│ 154415 │ chrY │ 57060001 │ 57080000 │ 0.4089 │
│ 154416 │ chrY │ 57080001 │ 57100000 │ 0.3796 │
│ 154417 │ chrY │ 57100001 │ 57120000 │ 0.3872 │
│ 154418 │ chrY │ 57120001 │ 57140000 │ 0.4013 │
│ 154419 │ chrY │ 57140001 │ 57160000 │ 0.417 │
│ 154420 │ chrY │ 57160001 │ 57180000 │ 0.4346 │
│ 154421 │ chrY │ 57180001 │ 57200000 │ 0.5268 │
│ 154422 │ chrY │ 57200001 │ 57220000 │ 0.4822 │
│ 154423 │ chrY │ 57220001 │ 57227415 │ 0.0 │
是不是也很容易呢?
我运行 seq = FASTA.sequence(rec) 这步非常慢,有没有比较便捷的方法取出特定位置的序列,比如seq = FASTA.sequence(rec, 500000:500010)呢?谢谢!