0

所需工具

• `BioSequences.jl`
• `DataFrames.jl`

先寫個針對每個序列計算GC content的函數

``````"""

- seq是BioSequence{DNAAlphabet{4}}類別的物件
- windowSize是我所想要設定的window大小的Int類別整數變數
- asProb則是決定是否將計算得到的GC數量轉化成比例的布林變數
- seqName則是染色體或其他用來代表該序列名稱的String類別變數
"""

function getGCcontentInSlideWindowView(seq::BioSequence{DNAAlphabet{4}}, windowSize::Int, asProb::Bool, seqName::String)
inputSeqLen = length(seq)
@assert inputSeqLen >= windowSize
restSeqLen = inputSeqLen % windowSize
windowsNum = restSeqLen == 0 ? inputSeqLen ÷ windowSize : inputSeqLen ÷ windowSize + 1
GCcontent = asProb ? zeros(Float32, windowsNum) : zeros(Int32, windowsNum)
StartPos = zeros(Int32, windowsNum)
StopPos = zeros(Int32, windowsNum)
Chromosome = fill(seqName, windowsNum)
for i = 0:(windowsNum-1)
idx = i + 1
start = i * windowSize + 1
stop = restSeqLen == 0 ? (i + 1) * windowSize :
( i == (windowsNum - 1) ? i * windowSize + restSeqLen : (i+1) * windowSize)
StartPos[idx] = start
StopPos[idx] = stop
if i == (windowsNum - 1)
@assert stop == inputSeqLen
end
subseq = seq[start:stop]
actgComposition = BioSequences.Composition(subseq).counts
numG = haskey(actgComposition, DNA_G) ? actgComposition[DNA_G] : 0
numC = haskey(actgComposition, DNA_C) ? actgComposition[DNA_C] : 0
numGC = numG + numC
GCcontent[idx] = asProb ? round(numGC/windowSize, digits = 4) : numGC
end
output = DataFrame(Chrom = Chromosome, Start = StartPos, Stop = StopPos, GC = GCcontent)
return output
end
``````

拿個實際的例子來看看

``````using BioSequences
using DataFrame

# 建立一個空表格
gcData = DataFrame([String, Int32, Int32, Float32], [:Chrom, :Start, :Stop, :GCcontent], 0)

# 讀取人類基因體資料

# 設定window size為20kbp, 開始計算GC content
i = 1
seqname = FASTA.identifier(rec)
seq = FASTA.sequence(rec)
dataFromEachChrom = getGCcontentInSlideWindowView(seq, 20000, true, seqname)
global gcData = vcat(gcData, dataFromEachChrom)
global i += 1

## 不想計算Mitochodria的部份154423×4 DataFrame
if i == 25
break
end
end
``````

``````154423×4 DataFrame
│ Row    │ Chrom  │ Start    │ Stop     │ GC      │
│        │ String │ Int32    │ Int32    │ Float32 │
├────────┼────────┼──────────┼──────────┼─────────┤
│ 1      │ chr1   │ 1        │ 20000    │ 0.2962  │
│ 2      │ chr1   │ 20001    │ 40000    │ 0.4943  │
│ 3      │ chr1   │ 40001    │ 60000    │ 0.3608  │
│ 4      │ chr1   │ 60001    │ 80000    │ 0.3566  │
│ 5      │ chr1   │ 80001    │ 100000   │ 0.4026  │
│ 6      │ chr1   │ 100001   │ 120000   │ 0.377   │
│ 7      │ chr1   │ 120001   │ 140000   │ 0.471   │
│ 8      │ chr1   │ 140001   │ 160000   │ 0.448   │
⋮
│ 154415 │ chrY   │ 57060001 │ 57080000 │ 0.4089  │
│ 154416 │ chrY   │ 57080001 │ 57100000 │ 0.3796  │
│ 154417 │ chrY   │ 57100001 │ 57120000 │ 0.3872  │
│ 154418 │ chrY   │ 57120001 │ 57140000 │ 0.4013  │
│ 154419 │ chrY   │ 57140001 │ 57160000 │ 0.417   │
│ 154420 │ chrY   │ 57160001 │ 57180000 │ 0.4346  │
│ 154421 │ chrY   │ 57180001 │ 57200000 │ 0.5268  │
│ 154422 │ chrY   │ 57200001 │ 57220000 │ 0.4822  │
│ 154423 │ chrY   │ 57220001 │ 57227415 │ 0.0     │
``````

1 則留言

0
yingwenshuzi
iT邦新手 5 級 ‧ 2019-02-14 13:03:46