Day 25: 三種AI猜字方法

第 11 屆 iThome 鐵人賽

DAY 25

AI & Data

深入淺出搜尋引擎和自然語言處理系列第 25 篇

11th鐵人賽 nlp 自然語言處理

丹尼爾胡

2019-09-26 07:28:50

1647 瀏覽

分享至

第一種猜字方法：隨機猜字

為了設下一個基準，我們先設計一種AI方法--每次從尚未猜過的字母中隨機選取一個來猜。這裡我先將26個字母中還沒猜過的字母存到 list 中，再用 numpy.random.choice 隨機選取。

test_guesser 是用來測試平均猜錯次數的方法。

def test_guesser(guesser, test=test_set):
    """
        Return the average value reported by ``hangman`` for ``guesser``
        over every word in ``test``.

        ``hangman`` is defined elsewhere in this file; its return value is
        presumably the number of incorrect guesses for one word, and the
        arguments ``26`` and ``False`` are presumably the guess limit and a
        verbosity switch -- confirm against its definition.

        Raises ZeroDivisionError when ``test`` is empty.
    """
    wrong_guesses = sum(hangman(secret, guesser, 26, False) for secret in test)
    return wrong_guesses / float(len(test))

import string

def random_guesser(mask, guessed, **kwargs):
    """
        Baseline guesser: return a random lowercase letter not in ``guessed``.

        ``mask`` and any extra keyword arguments are accepted but not used.
        The letter is drawn with ``np.random.choice`` from the lowercase
        letters a-z that are still unguessed.
    """
    remaining = [
        chr(code)
        for code in range(ord("a"), ord("z") + 1)
        if chr(code) not in guessed
    ]
    return np.random.choice(remaining)

# To watch how the machine guesses a single word, uncomment the line below.
#hangman(np.random.choice(test_set), random_guesser, 10, True)

# Score the random baseline over the default test set and report the average.
result = test_guesser(random_guesser)
print()
print("Average number of incorrect guesses: ", result)

第二種猜法：Unigram Guesser

我們可以嘗試用 Unigram 模型來猜字：先統計訓練集中每個字母的出現頻率，再依照頻率由高到低的順序來猜。每猜過一個字母，就應該把它從候選字母中去掉。

from collections import Counter

# unigram_counts holds how many times each letter occurs across the whole
# training set.
unigram_counts = Counter()

for word in training_set:
    # Counter.update on a word adds one count per letter it contains.
    unigram_counts.update(word)

print(unigram_counts)


def unigram_guesser(mask, guessed, unigram_counts=unigram_counts):
    """
        Unigram guesser: return the most frequent letter not yet guessed.

        Args:
            mask: the current masked word; not used by this strategy.
            guessed: collection of letters that have already been tried.
            unigram_counts: Counter mapping each letter to its frequency.

        Returns:
            The highest-count letter of ``unigram_counts`` that is not in
            ``guessed``.

        Raises:
            IndexError: when every letter in ``unigram_counts`` has already
                been guessed.
    """
    # most_common() sorts the whole counter, so call it once instead of once
    # per letter, and stop at the first letter that is still available.
    for letter, _ in unigram_counts.most_common():
        if letter not in guessed:
            return letter

    raise IndexError("every letter in the unigram model has been guessed")

# To watch the unigram guesser play a single word, uncomment the line below.
#hangman(np.random.choice(test_set), unigram_guesser, 10, True)

# Score the unigram guesser over the default test set and report the average.
result = test_guesser(unigram_guesser)
print()
print("平均猜錯次數：", result)

第三種猜法：根據文字長度猜字

我們在昨天提到的同一篇文章中看到，單字長度不同，各個字母出現的頻率也不盡相同；例如，較短的單字比較不會帶有前綴或後綴。因此在這裡，我們針對不同的單字長度設計不一樣的猜字順序。

from collections import defaultdict

# unigram_counts_by_length maps a word length to the letter frequencies
# observed in training words of exactly that length.
unigram_counts_by_length = defaultdict(Counter)

# Build a separate unigram model for every word length.
for word in training_set:
    if not word:
        # An empty word contributes no letters; skipping it also avoids
        # creating an empty length-0 model in the defaultdict.
        continue
    # One update per word replaces the throwaway Counter that used to be
    # allocated and merged for every single letter.
    unigram_counts_by_length[len(word)].update(word)

        
def exclude_guessed_letters(length_model, guessed):
    """
        Return the unguessed letters of one length-specific model, most
        frequent first.

        Args:
            length_model: word length whose model in the module-level
                ``unigram_counts_by_length`` should be consulted.
            guessed: collection of letters that have already been tried.

        Returns:
            A list of letters ordered by descending frequency, without any
            letter found in ``guessed``. The list is empty when the model has
            no letters left (or, since the mapping is a defaultdict, when no
            model exists for that length).
    """
    # Sort the counter once; calling most_common() inside a loop re-sorts the
    # whole model for every letter.
    ranked = unigram_counts_by_length[length_model].most_common()
    return [letter for letter, _ in ranked if letter not in guessed]


# Word lengths that have a trained model, in ascending order.
lengths = sorted(unigram_counts_by_length)
# One more than the longest trained word length.
max_length = lengths[-1] + 1

print(unigram_counts_by_length)

def unigram_length_guesser(mask, guessed, counts=unigram_counts_by_length):
    """
        Length-aware unigram guesser: return the most frequent unguessed
        letter among training words of a similar length to ``mask``.

        Args:
            mask: the current masked word; only its length is used.
            guessed: collection of letters that have already been tried.
            counts: mapping from word length to a Counter of letter
                frequencies. This argument is now honoured; previously the
                module-level model was always used regardless of it.

        Returns:
            The best remaining letter. The model for the longest trained
            length not exceeding ``len(mask)`` is tried first (or the
            shortest trained length when the mask is shorter than all of
            them). If that model is exhausted, neighbouring lengths are
            tried in the same order as before: upwards through length 20
            when starting below 20, otherwise downwards through length 19.
            After that, every remaining trained length is tried, nearest
            first.

        Raises:
            IndexError: when there is no trained model at all, or every
                letter of every model has already been guessed. Earlier
                versions looped forever in these situations.
    """
    # Turning point of the original neighbour walk: shorter starts walk up,
    # longer starts walk down.
    pivot = 20

    trained = sorted(length for length, model in counts.items() if model)
    if not trained:
        raise IndexError("no trained length models are available")

    # Back off to the nearest shorter trained length; if the mask is shorter
    # than everything seen in training, use the shortest model instead of
    # decrementing without end.
    not_longer = [length for length in trained if length <= len(mask)]
    start = not_longer[-1] if not_longer else trained[0]

    if start < pivot:
        neighbours = [length for length in trained if start < length <= pivot]
    else:
        neighbours = [
            length for length in reversed(trained)
            if pivot - 1 <= length < start
        ]

    # Lengths the original walk could never reach; nearest to the start first.
    visited = {start, *neighbours}
    fallback = sorted(
        (length for length in trained if length not in visited),
        key=lambda length: abs(length - start),
    )

    for length in [start, *neighbours, *fallback]:
        for letter, _ in counts[length].most_common():
            if letter not in guessed:
                return letter

    raise IndexError("every letter in every length model has been guessed")


# To watch the length-aware guesser play a single word, uncomment the line below.
#hangman(np.random.choice(test_set), unigram_length_guesser, 10, True)

# Score the length-aware guesser over the default test set and report the average.
result = test_guesser(unigram_length_guesser)
print()
print("平均猜錯次數：", result)

今天的Code在這裡。