如何使用 Python 將食物單字挑出?
要先進行分組:
# Load the text file and split it into lines (line endings removed).
with open('file.txt', 'r') as file:
    content = file.read().splitlines()
# Keep only the lines that exactly match one of the known food words.
foods = list(filter(
    lambda entry: entry in ['apple', 'orange', 'candy'],
    content,
))
# Show the selected words.
print(foods)
輸出:
['apple', 'orange', 'candy']
其實我有一個可能很愚蠢的問題:
怎麼判斷那些單字是不是食物呀?
我以為要用 BERT 還是什麼語言模型去判斷⋯⋯
# Seed vocabulary: example words that are known to be food.
FOODLIST = [
    "apple",
    "orange",
    "candy",
    "noodle",
    "rice",
    # ... etc. Add many more; ideally around ten thousand entries.
]
# Choose the strategy: False = plain table lookup, True = NLP model.
AUTO_JUDGE = False
if not AUTO_JUDGE:
    # Table lookup: a word counts as food only when it is listed in FOODLIST.
    # Build the set once so each query is O(1) even for a very long list.
    _FOOD_SET = frozenset(FOODLIST)

    def word_is_food(word):
        """Return True when *word* is one of the entries of FOODLIST."""
        return word in _FOOD_SET
else:
    # Model-based judgement: compare the word's embedding with the
    # embeddings of the known food words.
    import torch
    import transformers

    def load_model(modelname="bert-base-uncased"):
        """Load a pretrained encoder and its tokenizer.

        BERT is only an example; any other checkpoint name that
        ``BertModel`` can load may be passed as *modelname*.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = transformers.AutoTokenizer.from_pretrained(modelname)
        model = transformers.BertModel.from_pretrained(modelname)
        return model, tokenizer

    model, tokenizer = load_model()

    def _embed(word):
        """Return one vector for *word*: the mean of its token states."""
        # Inference only, so skip autograd bookkeeping.
        with torch.no_grad():
            output = model(**tokenizer(word, return_tensors="pt"))
        # last_hidden_state is (batch, tokens, hidden). Take the single
        # batch item, drop the first and last positions (assumes a
        # BERT-style tokenizer that wraps the input in [CLS] ... [SEP]),
        # then average over the remaining tokens.
        return output.last_hidden_state[0, 1:-1].mean(dim=0)

    def _similarity(vec1, vec2):
        """Return the cosine similarity of two 1-D tensors."""
        return (vec1 @ vec2) / (
            torch.sqrt(vec1 @ vec1) * torch.sqrt(vec2 @ vec2))

    # The reference embeddings never change, so compute them once here
    # instead of on every call to word_is_food.
    _FOOD_VECTORS = [_embed(foodword) for foodword in FOODLIST]

    def word_is_food(word):
        """Return True when *word* is, on average, close to the food words.

        The word's embedding is compared with every FOODLIST embedding by
        cosine similarity; the word is judged to be food when the mean
        similarity exceeds 0.5.
        """
        if not _FOOD_VECTORS:
            # Nothing to compare against; also avoids dividing by zero.
            return False
        word_vector = _embed(word)
        total_sim = sum(
            _similarity(word_vector, food_vector)
            for food_vector in _FOOD_VECTORS)
        # NOTE(review): the 0.5 threshold comes from the original snippet
        # and has not been validated -- tune it on real data.
        return bool(total_sim / len(_FOOD_VECTORS) > 0.5)
# Collect every line of file.txt that word_is_food judges to be food.
foods = []
with open('file.txt', 'r') as file:
    for line in file:
        # Remove the trailing newline and any surrounding whitespace.
        word = line.strip()
        if word_is_food(word):
            # Append to the same list that is printed below; the original
            # appended to an undefined name and raised NameError.
            foods.append(word)
print(foods)