# 藉由馬可夫模型的機率特性,將語言中的最小單位——詞語組成句子,再由句子組成文章
# 馬可夫模型:一種記錄機率的統計模型,以任一狀態為起點,
# 依照機率走訪到下一階段,走訪完成時機率總和為100%
# Read the corpus file line by line, stripping surrounding whitespace.
with open("test.txt", encoding="utf-8") as f:
    sentences = [line.strip() for line in f]
# 引入文字檔並分割
import re
import string
from random import randint
# Delimiter characters that end or separate tokens: fullwidth/CJK punctuation,
# ASCII punctuation, brackets, whitespace, plus digits and string.punctuation.
# Duplicate entries are harmless — they all end up inside one regex
# character class.
delims = [
",", "。", ";", ":", "!",
"?", "?", ";", ":", "!",
",", ".", "\"", "'", "“",
"‘", "’", "(", ")", "”",
"(", ")", "%", "%", "@",
"~", "`", "~", "`", "#",
"、", "/", "\\", "<", ">",
"《", "》", "/", "{", "}",
"{", "}", "[", "]", "[",
"]", "|", "|", "\n", "\r",
" ", "\t", " ", '+', '=', '*', '^', '·'
]\
+ list("0123456789") \
+ list(string.punctuation)
# Escape every delimiter and wrap the result in [...] so `exclusions` is a
# single "match any delimiter character" regex pattern.
escaped = re.escape(''.join(delims))
exclusions = '['+escaped+']'
## Replace every delimiter character with a space, then collect the
## whitespace-separated fragments of each sentence.
splitsen = []
for sentence in sentences:
    cleaned = re.sub(exclusions, ' ', sentence)
    splitsen.extend(cleaned.split())
## Wrap every fragment in sentinel markers: 'S' marks the start of a
## sentence, 'E' marks its end (used later to split the generated chain).
splitsen = ['S' + fragment + 'E' for fragment in splitsen]
import jieba  # used below but never imported anywhere in the original file

## Load the traditional-Chinese user dictionary so jieba recognizes
## domain-specific words, then segment every marked fragment into a flat
## word list.
jieba.load_userdict('dict.txt.big')
words = []
for fragment in splitsen:
    words.extend(jieba.cut(fragment))
def build_word_dict(words):
    """Build a first-order Markov transition table from a token sequence.

    Args:
        words: sequence of tokens in reading order.

    Returns:
        dict mapping each token to a {next_token: count} dict. Empty for
        sequences of length 0 or 1 (no transitions observed).
    """
    word_dict = {}
    # zip pairs each token with its successor — same pairs as the original
    # range(1, len(words)) index walk, without manual indexing.
    for prev, curr in zip(words, words[1:]):
        followers = word_dict.setdefault(prev, {})
        followers[curr] = followers.get(curr, 0) + 1
    return word_dict
word_dict = build_word_dict(words)
print(words)
# .get avoids a KeyError when the sample word never appears in the corpus:
# prints {} instead of crashing.
print(word_dict.get("人", {}))
# Total number of observed transitions out of a state.
def wordListSum(wordList):
    """Return the sum of all frequency counts in a {word: freq} mapping."""
    return sum(wordList.values())
# Randomly pick the next word according to the frequency distribution.
def retrieveRandomWord(wordList):
    """Return a key of *wordList* chosen with probability freq/total.

    Args:
        wordList: non-empty {word: freq} mapping with positive counts.

    Raises:
        ValueError: (from randint) if wordList is empty.
    """
    # Draw a position in [1, total], then walk the cumulative counts; the
    # entry that consumes the drawn position is the winner.
    remaining = randint(1, sum(wordList.values()))
    for word, freq in wordList.items():
        remaining -= freq
        if remaining <= 0:
            return word
# Generate a Markov chain of up to `length` words, starting from a seed word.
length = 100
chain = ""
currentWord = "生活"
for _ in range(length):
    chain += currentWord
    followers = word_dict.get(currentWord)
    if not followers:
        # Dead end: this word was never observed with a successor — stop
        # early instead of raising KeyError.
        break
    currentWord = retrieveRandomWord(followers)
# `re` is already imported at the top of the file (the duplicate import was
# removed). Split the generated chain on the S/E sentinels and print each
# non-empty sentence.
reply = [segment for segment in re.split('S|E', chain) if segment != '']
for sentence in reply:
    print(sentence)