題目連結:https://rosalind.info/problems/cons/
輸入:
>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT
輸出:
ATGCAACT
A: 5 1 0 0 5 5 0 0
C: 0 0 1 4 2 0 6 1
G: 1 1 6 3 0 1 0 0
T: 1 5 0 0 0 1 1 6
哦原來今天題目要將多個序列,計算ATCG哪一個點位出現最多次,加總得到這張表格
這種統計表格稱作 Profile Matrix(剖面矩陣)
先用 005_GC 提到的 解析fasta格式(parse_fasta函式)讀入input
接著創建一個 profile陣列
遍歷每一條序列的每個字母,不斷對profile count做+=1
最後組出 consensus共識字串(相同最多次的)
按題目要的格式湊出文字輸出
程式碼:
data = """
>Rosalind_1
ATCCAGCT
>Rosalind_2
GGGCAACT
>Rosalind_3
ATGGATCT
>Rosalind_4
AAGCAACC
>Rosalind_5
TTGGAACT
>Rosalind_6
ATGCCATT
>Rosalind_7
ATGGCACT
"""
def parse_fasta(data: str) -> dict:
sequences = {}
label = ""
for line in data.strip().splitlines():
line = line.strip()
if not line:
continue
if line.startswith(">"):
label = line[1:].strip()
if not label:
label = ""
continue
sequences[label] = ""
elif label:
sequences[label] += line
return sequences
seqs = parse_fasta(data)
profile_width = len(next(iter(seqs.values()))) # 取出seqs.values()中第一條序列的長度
# profile matrix 使用二維陣列來記錄
profile = [
# [0, 0, 0, 0, 0, 0, 0, 0], # A
# [0, 0, 0, 0, 0, 0, 0, 0], # C
# [0, 0, 0, 0, 0, 0, 0, 0], # G
# [0, 0, 0, 0, 0, 0, 0, 0], # T
[0] * profile_width
for i in range(4) # 自動化產相對應的長度
]
for seq in seqs.values():
for i in range(len(seq)):
match seq[i]:
case "A":
profile[0][i] += 1
case "C":
profile[1][i] += 1
case "G":
profile[2][i] += 1
case "T":
profile[3][i] += 1
# print(profile)
# [[5, 1, 0, 0, 5, 5, 0, 0], [0, 0, 1, 4, 2, 0, 6, 1], [1, 1, 6, 3, 0, 1, 0, 0], [1, 5, 0, 0, 0, 1, 1, 6]]
consensus = "" # 中文翻譯為"共識"。共識字串:在每個位置上「出現最多次的字母(鹼基)/最多序列相同」 組成的字串
nucleotides = ["A", "C", "G", "T"]
for i in range(profile_width):
nt_counts = [profile[0][i], profile[1][i], profile[2][i], profile[3][i]]
max_index = nt_counts.index(max(nt_counts))
consensus += nucleotides[max_index] # 0,1,2,3 => A,C,G,T
print(consensus)
for i, counts in enumerate(profile):
print(nucleotides[i] + ":", *counts)
def parse_fasta(data: str) -> dict:
...(略)
def build_profile_and_consensus(sequences: list[str]):
length = len(sequences[0])
# profile matrix 使用dict來記錄
profile = {
"A": [0] * length,
"C": [0] * length,
"G": [0] * length,
"T": [0] * length,
}
for seq in sequences:
for i, nucleotide in enumerate(seq):
profile[nucleotide][i] += 1
consensus = ""
for i in range(length):
nt_counts = {
nt: profile[nt][i]
for nt in "ACGT"
}
consensus += max(nt_counts, key=nt_counts.get)
return consensus, profile
data = """
...(略)
"""
sequences = list(parse_fasta(data).values())
consensus, profile = build_profile_and_consensus(sequences)
print(consensus)
for nt in "ACGT":
print(f"{nt}: {' '.join(str(x) for x in profile[nt])}")
這類「轉置矩陣表格」的應用,非常適合用zip與解包(*
)來解,
用法在 006_HAMM 有提到
data = """
...(略)
"""
def parse_fasta(data: str) -> dict:
...(略)
seqs = parse_fasta(data)
# print(*seqs.values())
# 原本是一條條的序列
# ATCCAGCT GGGCAACT ATGGATCT AAGCAACC TTGGAACT ATGCCATT ATGGCACT
column = list(zip(*seqs.values())) # nucleotide positions
# 把原本多條序列橫著看了(就可以橫著走天下)
# print(column[0]) # ('A', 'G', 'A', 'A', 'T', 'A', 'A')
# print(column[-1]) # ('T', 'T', 'T', 'C', 'T', 'T', 'T')
# ⚠️*zip vs zip 的差異,在於有沒有「一個個取出」⚠️
# 1. 一個個取出每個元素
# print(list(zip(*seqs.values())))
# [('A', 'G', 'A', 'A', 'T', 'A', 'A'), ('T', 'G', 'T', 'A', 'T', 'T', 'T'), ('C', 'G', 'G', 'G', 'G', 'G', 'G'), ('C', 'C', 'G', 'C', 'G', 'C', 'G'), ('A', 'A', 'A', 'A', 'A', 'C', 'C'), ('G', 'A', 'T', 'A', 'A', 'A', 'A'), ('C', 'C', 'C', 'C', 'C', 'T', 'C'), ('T', 'T', 'T', 'C', 'T', 'T', 'T')]
# 2. 沒有一個個取出,導致變成tuple
# print(list(zip(seqs.values())))
# [('ATCCAGCT',), ('GGGCAACT',), ('ATGGATCT',), ('AAGCAACC',), ('TTGGAACT',), ('ATGCCATT',), ('ATGGCACT',)]
profile = {
"A": [],
"C": [],
"G": [],
"T": [],
}
consensus = ""
for bases in column:
nt_counts = {
"A": bases.count("A"),
"C": bases.count("C"),
"G": bases.count("G"),
"T": bases.count("T"),
}
profile["A"].append(nt_counts["A"])
profile["C"].append(nt_counts["C"])
profile["G"].append(nt_counts["G"])
profile["T"].append(nt_counts["T"])
consensus += max(nt_counts, key=nt_counts.get)
print(consensus)
for base in ["A", "C", "G", "T"]:
print(base + ":", *profile[base]) # 加入星號解包,避免印出list陣列的`[]`符號
行列互換後,這樣子寫程式碼變得輕便許多
max()
、min()
的用法(文長,實在沒經費了,明天再繼續戰鬥!