如何使用 Python 將食物挑出?
要先進行分組:
# Load the text file and split it into lines (one candidate word per line).
with open('file.txt', 'r') as file:
    content = file.read().splitlines()

# Keep only the lines that exactly match one of the known food words.
foods = list(
    filter(lambda entry: entry in ['apple', 'orange', 'candy'], content)
)

# Show the words that were kept.
print(foods)
輸出:
['apple', 'orange', 'candy']
其實我有一個可能很愚蠢的問題:
怎麼判斷那些單字是不是食物呀?
我以為要用 BERT 還是什麼語言模型去判斷⋯⋯
# Seed vocabulary of food words; extend it as needed (ideally ~10,000 entries).
FOODLIST = ["apple", "orange", "candy", "noodle", "rice"]

# Strategy switch: False -> plain table lookup, True -> NLP model.
AUTO_JUDGE = False
if not AUTO_JUDGE:
    # Lookup mode: a word counts as food only if it is listed in FOODLIST.
    # Snapshot the list into a frozenset once so that each query is O(1)
    # even when the vocabulary grows to thousands of entries.
    _FOOD_WORDS = frozenset(FOODLIST)

    def word_is_food(word):
        """Return True if *word* is one of the words listed in FOODLIST."""
        return word in _FOOD_WORDS
else:
    # Model mode: compare the word's embedding with the embeddings of the
    # known food words and accept it when they are similar on average.
    import torch
    import transformers

    def load_model(modelname="bert-base-uncased"):
        """Load a pretrained encoder and its tokenizer.

        BERT is only an example; any checkpoint that ``BertModel`` can load
        may be passed as *modelname*.

        Returns:
            A ``(model, tokenizer)`` tuple.
        """
        tokenizer = transformers.AutoTokenizer.from_pretrained(modelname)
        model = transformers.BertModel.from_pretrained(modelname)
        return model, tokenizer

    model, tokenizer = load_model()

    def _embed(word):
        """Return a 1-D embedding of *word* (mean of its word-piece vectors)."""
        # Inference only: no gradients are needed.
        with torch.no_grad():
            output = model(**tokenizer(word, return_tensors="pt"))
        # last_hidden_state has shape (batch=1, tokens, hidden). Average over
        # the token axis, skipping the first and last positions.
        # Assumes a BERT-style tokenizer that wraps the input as
        # [CLS] ... [SEP] -- confirm if another model is substituted.
        return output.last_hidden_state[0, 1:-1].mean(dim=0)

    # Embed the reference vocabulary once instead of on every query.
    # None marks an empty vocabulary (torch.stack rejects an empty list).
    _FOOD_EMBEDDINGS = (
        torch.stack([_embed(foodword) for foodword in FOODLIST])
        if FOODLIST
        else None
    )

    def word_is_food(word):
        """Decide whether *word* is a food.

        The word is accepted when the mean cosine similarity between its
        embedding and the embeddings of all FOODLIST entries exceeds 0.5.
        An empty word, or an empty FOODLIST, yields False.
        """
        if not word or _FOOD_EMBEDDINGS is None:
            return False
        # (1, hidden) broadcast against (N, hidden) -> N cosine similarities.
        similarities = torch.nn.functional.cosine_similarity(
            _embed(word).unsqueeze(0), _FOOD_EMBEDDINGS, dim=1
        )
        # NOTE(review): the 0.5 threshold is kept from the original code;
        # it has not been tuned -- verify it on real data.
        return similarities.mean().item() > 0.5
# Collect every word in the file (one per line) that word_is_food accepts.
foods = []
with open('file.txt', 'r') as file:
    for line in file:
        word = line.strip()
        # Skip blank lines; append to the same list that is printed below
        # (the original appended to an undefined name and raised NameError).
        if word and word_is_food(word):
            foods.append(word)
print(foods)