今天我們要做的是到美國國家醫學圖書館 (NLM, National Library of Medicine) 的 PubMed 資料庫,分別抓取 E001 近 10 年、E002 近 1 年的論文文獻資料,當作基本資料 (basic data)。
pubmed_ingest.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
PubMed E-utilities 自動蒐集:E001(烏髮)/E002(抗皺)
依食物/營養素同義詞比對,抽取端點與粗分級,輸出 evidence.auto.csv
"""
import argparse
import csv
import math
import os
import re
import sys
import time
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from urllib.parse import urlencode

import requests
import yaml
# Base URL for NCBI Entrez E-utilities (esearch.fcgi / efetch.fcgi live under this path).
EUTILS_BASE = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# NCBI asks API clients to identify themselves; replace the contact address with a real one.
UA = {"User-Agent": "LongevityProject/1.0 (contact: you@example.com)"}
# Ordered (label, regex) pairs: detect() returns the label of the first matching regex.
ENDPOINT_PATTERNS = [
("wrinkle_depth", r"\bwrinkle(s)?\b|\bwrinkling\b"),
("skin_elasticity", r"\belasticit(y|ies)\b|\belastin\b"),
# NOTE(review): the uppercase "I" in the second alternative may never match if the
# caller lowercases the text before searching — confirm case handling in detect().
("collagen", r"\bcollagen\b|\btype\s?I collagen\b"),
("hair_pigmentation", r"\bhair (pigment|pigmentation)\b|\bmelanin\b|\bgrey hair\b|\bgray hair\b"),
("photoaging", r"\bphotoaging\b|\bphotodamage\b")
]
# Ordered (label, regex) pairs for the claimed direction of effect; first match wins.
DIRECTION_PATTERNS = [
("improves", r"\b(improv|increase|enhanc|reduce[s]? wrinkle|boost)\w*"),
("worsens", r"\b(worsen|decreas|impair|inhibit collagen)\w*"),
]
# PubMed publication type -> rough evidence grade. guess_grade() does a substring
# scan in insertion order, so stronger designs must be listed first (e.g.
# "Clinical Trial" is a substring of "Randomized Controlled Trial", which is why
# the RCT entry precedes it).
GRADE_BY_PUBTYPE = {
"Randomized Controlled Trial": "A",
"Systematic Review": "A",
"Meta-Analysis": "A",
"Clinical Trial": "B",
"Controlled Clinical Trial": "B",
"Comparative Study": "B",
"Observational Study": "C",
"Cohort Studies": "C",
"Case-Control Studies": "C",
"Animal": "C",
"In Vitro": "C",
"Review": "D"
}
def guess_grade(pubtypes):
    """Map a list of PubMed publication-type strings to a rough grade.

    Scans GRADE_BY_PUBTYPE in insertion order and returns the grade of the
    first key found (case-insensitively) as a substring of any entry in
    *pubtypes*; falls back to "D" when nothing matches.
    """
    lowered = [p.lower() for p in pubtypes]
    for label, grade in GRADE_BY_PUBTYPE.items():
        needle = label.lower()
        if any(needle in entry for entry in lowered):
            return grade
    return "D"
def detect(text, patterns, default=None):
    """Return the label of the first regex in *patterns* that matches *text*.

    Matching is case-insensitive via re.IGNORECASE. (The previous version
    lowercased the text instead, which silently broke any pattern containing
    an uppercase literal, such as the "type I collagen" alternative in
    ENDPOINT_PATTERNS.)

    Args:
        text: free text to scan (e.g. title + abstract + MeSH terms).
        patterns: iterable of (label, regex) pairs, checked in order.
        default: value returned when no pattern matches.
    """
    for label, pat in patterns:
        if re.search(pat, text, re.IGNORECASE):
            return label
    return default
def load_foods(path):
    """Load the food/synonym YAML config and normalize it for matching.

    Returns the list under the top-level "foods" key. Each entry gets its
    synonyms lowercased in place and a lowercased "name_l" field added.

    Args:
        path: path to a YAML file shaped like {"foods": [{"name": ..., "synonyms": [...]}]}.
    """
    # Explicit UTF-8: food names are Chinese, and the platform default
    # encoding (e.g. cp950 on Windows) would reject or corrupt them.
    with open(path, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}  # safe_load returns None for an empty file
    foods = data.get("foods") or []
    for fobj in foods:
        fobj["synonyms"] = [s.lower() for s in fobj.get("synonyms", [])]
        fobj["name_l"] = fobj["name"].lower()
    return foods
def build_query(effect, days):
    """Build a PubMed term string for the given effect and look-back window.

    Args:
        effect: "E002" selects skin/wrinkle terms; anything else (E001)
            selects hair-pigmentation terms.
        days: size of the publication-date window ending today (UTC).

    Returns:
        A query string combining the topic terms with a [Date - Publication] range.
    """
    # timezone-aware "now": datetime.utcnow() is deprecated since Python 3.12.
    today = datetime.now(timezone.utc).date()
    start = today - timedelta(days=days)
    if effect == "E002":
        terms = ["(wrinkle OR elasticity OR collagen OR photoaging)"]
    else:
        terms = ["(hair pigmentation OR melanin OR grey hair OR gray hair)"]
    # PubMed date ranges use YYYY/MM/DD, not ISO YYYY-MM-DD.
    fmt = "%Y/%m/%d"
    date_range = (f'("{start.strftime(fmt)}"[Date - Publication] : '
                  f'"{today.strftime(fmt)}"[Date - Publication])')
    return f'({" OR ".join(terms)}) AND {date_range}'
def esearch(query, retmax=200):
    """Run an ESearch query against PubMed and return the matching PMIDs.

    Args:
        query: PubMed term string (see build_query).
        retmax: maximum number of IDs to return.

    Returns:
        List of PMID strings, sorted by publication date (newest first).
    """
    # sort="pub_date" is the documented value. The old sort="most+recent"
    # was percent-encoded by urlencode() into "most%2Brecent" (a literal
    # "+"), which PubMed does not recognize as a sort option.
    params = dict(db="pubmed", term=query, retmax=str(retmax), sort="pub_date")
    url = f"{EUTILS_BASE}/esearch.fcgi?{urlencode(params)}"
    r = requests.get(url, headers=UA, timeout=20)
    r.raise_for_status()
    root = ET.fromstring(r.text)
    return [e.text for e in root.findall(".//IdList/Id")]
def efetch(pmids):
    """Fetch abstract XML for the given PMIDs and return the parsed root element.

    Args:
        pmids: iterable of PMID strings (joined with commas into one request).
    """
    query = urlencode(dict(db="pubmed", id=",".join(pmids), rettype="abstract", retmode="xml"))
    resp = requests.get(f"{EUTILS_BASE}/efetch.fcgi?{query}", headers=UA, timeout=30)
    resp.raise_for_status()
    return ET.fromstring(resp.text)
def extract_articles(root):
    """Parse an efetch XML tree into a list of article dicts.

    Returns one dict per PubmedArticle with keys:
    pmid, title, abstract, pubtypes (list), mesh (list of descriptor names).
    """
    def _full_text(el):
        # itertext() collects text inside nested markup too; PubMed titles and
        # abstracts often contain <i>, <sup>, <sub> — el.text alone drops
        # everything after the first nested tag.
        return "".join(el.itertext()) if el is not None else ""

    arts = []
    for art in root.findall(".//PubmedArticle"):
        pmid = (art.findtext(".//PMID") or "").strip()
        title = _full_text(art.find(".//ArticleTitle")).strip()
        abstract = " ".join(_full_text(t) for t in art.findall(".//AbstractText")).strip()
        pubtypes = [pt.text or "" for pt in art.findall(".//PublicationType")]
        mesh = [mh.findtext("DescriptorName") or "" for mh in art.findall(".//MeshHeading")]
        arts.append(dict(pmid=pmid, title=title, abstract=abstract, pubtypes=pubtypes, mesh=mesh))
    return arts
def match_food(foods, text):
    """Return the display name of the first food whose lowercased name or any
    synonym occurs as a substring of *text* (case-insensitive), else None.

    Assumes each food dict carries the precomputed "name_l" and lowercased
    "synonyms" fields produced by load_foods().
    """
    haystack = text.lower()
    for food in foods:
        needles = [food["name_l"], *food["synonyms"]]
        if any(needle in haystack for needle in needles):
            return food["name"]
    return None
def run(effect, days, foods_path, out_csv):
    """End-to-end pipeline: search PubMed, match foods, grade, write CSV.

    Args:
        effect: "E001" or "E002"; selects query terms and the default endpoint.
        days: look-back window in days for the publication-date filter.
        foods_path: YAML file with the food/synonym list (see load_foods).
        out_csv: output CSV path; the parent directory is created if needed.
    """
    foods = load_foods(foods_path)
    q = build_query(effect, days)
    print(f"[INFO] Query: {q}")
    ids = esearch(q)
    print(f"[INFO] Found {len(ids)} pmids")
    all_rows = []
    # Fetch in chunks of 50 PMIDs to keep request URLs short and responses bounded.
    for chunk_start in range(0, len(ids), 50):
        chunk = ids[chunk_start:chunk_start + 50]
        root = efetch(chunk)
        for a in extract_articles(root):
            blob = f"{a['title']} {a['abstract']} {' '.join(a['mesh'])}"
            food_name = match_food(foods, blob)
            if not food_name:
                continue  # article mentions none of our foods — skip it
            endpoint = detect(blob, ENDPOINT_PATTERNS,
                              default=("wrinkle_depth" if effect == "E002" else "hair_pigmentation"))
            direction = detect(blob, DIRECTION_PATTERNS, default="improves")
            grade = guess_grade(a["pubtypes"])
            all_rows.append({
                "effect_id": effect,
                "food_name": food_name,
                "endpoint": endpoint,
                "direction": direction,
                "grade": grade,
                "reference": f"PMID:{a['pmid']}",
                "title": a["title"],
            })
        time.sleep(0.4)  # courtesy delay: NCBI allows ~3 requests/sec without an API key
    # Deduplicate on (effect, food, PMID), keeping the first occurrence.
    uniq = {}
    for r in all_rows:
        k = (r["effect_id"], r["food_name"], r["reference"])
        if k not in uniq:
            uniq[k] = r
    rows = list(uniq.values())
    # Bug fix: when out_csv has no directory part, dirname() returns "" and
    # os.makedirs("") raises FileNotFoundError — only create a real directory.
    out_dir = os.path.dirname(out_csv)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(out_csv, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["effect_id", "food_name", "endpoint",
                                          "direction", "grade", "reference", "title"])
        w.writeheader()
        w.writerows(rows)
    print(f"[DONE] wrote {len(rows)} rows to {out_csv}")
if __name__ == "__main__":
    # CLI entry point: defaults resolve config/output paths relative to this script.
    base_dir = os.path.dirname(__file__)
    cli = argparse.ArgumentParser()
    cli.add_argument("--effect", choices=["E001", "E002"], required=True)
    cli.add_argument("--days", type=int, default=365)
    cli.add_argument("--foods", default=os.path.join(base_dir, "..", "config", "foods.yml"))
    cli.add_argument("--out", default=os.path.join(base_dir, "..", "outputs", "evidence.auto.csv"))
    ns = cli.parse_args()
    run(ns.effect, ns.days, ns.foods, ns.out)
requests
lxml
PyYAML
foods.yml
foods:
- name: "番茄"
synonyms: ["lycopene", "tomato", "tomatoes"]
- name: "甜椒(維生素C)"
synonyms: ["vitamin c", "ascorbic acid", "bell pepper", "capsicum"]
- name: "鮭魚"
synonyms: ["salmon", "omega-3", "epa", "dha", "fish oil"]
- name: "黑芝麻"
synonyms: ["black sesame", "sesame"]
- name: "綠茶"
synonyms: ["green tea", "egcg", "catechin", "epigallocatechin gallate"]