圖片描述 (Image-to-Text)

17th鐵人賽

frankfrank8785

2025-09-13 12:45:58

168 瀏覽

分享至

src/day7_image_to_text.js（新增）
// src/day7_image_to_text.js
import fs from "fs";
import path from "path";
import { openai } from "./aiClient.js";
import { PromptBuilder } from "./promptBuilder.js";
import { extractJson, validateBySchema } from "./jsonGuard.js";

/** Extension → MIME type lookup (only the common image formats are covered). */
const MIME_MAP = {
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".webp": "image/webp",
};

/**
 * Read a local file synchronously and encode it as a base64 `data:` URL,
 * suitable for use as an `image_url` value.
 *
 * The MIME type is chosen from the (case-insensitive) file extension; an
 * unrecognized extension falls back to `application/octet-stream`.
 *
 * @param {string} filePath - Path of the file to read.
 * @returns {string} A URL of the form `data:<mime>;base64,<payload>`.
 * @throws {Error} Whatever `fs.readFileSync` throws when the file cannot be read.
 */
function fileToDataUrl(filePath) {
  const ext = path.extname(filePath).toLowerCase();
  const mime = MIME_MAP[ext] ?? "application/octet-stream";
  const b64 = fs.readFileSync(filePath).toString("base64");
  // Must be a template literal: without the backticks this line is a syntax error.
  return `data:${mime};base64,${b64}`;
}

/** Schema describing the structured output expected from the model. */
const schema = {
  type: "object",
  required: ["title", "alt", "description", "tags"],
  properties: Object.fromEntries(
    [
      ["title", "string"],
      ["alt", "string"],
      ["description", "string"],
      // Declared as "object" here; tags is checked to be an array separately.
      ["tags", "object"],
      // Optional field (not listed in `required`).
      ["ocrText", "string"],
    ].map(([field, type]) => [field, { type }])
  ),
};

/**
 * Image → text: ask the vision model to describe an image as structured JSON
 * (title / alt / description / tags / optional ocrText).
 *
 * @param {Object} [opts]
 * @param {string} [opts.imagePath] - Path to a local image file.
 * @param {string} [opts.imageUrl] - Remote image URL. Takes precedence over
 *   `imagePath` when both are supplied.
 * @param {boolean} [opts.wantOCR=false] - Whether to ask the model to also
 *   attempt a rough read of text in the image.
 * @param {("short"|"medium"|"long")} [opts.length="medium"] - Desired
 *   description length; any other value is treated as "medium".
 * @returns {Promise<Object>} The parsed object returned by the model.
 * @throws {Error} If neither source is given, the local file does not exist,
 *   the parsed result fails schema validation, or `tags` is not an array.
 */
export async function imageToJson(opts = {}) {
  const { imagePath, imageUrl, wantOCR = false, length = "medium" } = opts;
  if (!imagePath && !imageUrl) {
    throw new Error("請提供 imagePath（本地檔）或 imageUrl（遠端連結）其一。");
  }

  // Resolve the image source: a remote URL is used as-is, a local file is
  // inlined as a data URL.
  let url = imageUrl;
  if (!url && imagePath) {
    if (!fs.existsSync(imagePath)) {
      throw new Error(`找不到檔案：${imagePath}`);
    }
    url = fileToDataUrl(imagePath);
  }

  let lengthHint = "150~250 字";
  if (length === "short") {
    lengthHint = "80~120 字";
  } else if (length === "long") {
    lengthHint = "300~500 字";
  }

  // Prompt engineering: declare the task and its constraints via PromptBuilder.
  const pb = new PromptBuilder()
    .setRole("你是嚴謹的圖片描述與可近用性（Accessibility）撰寫助手")
    .setGoal("針對輸入圖片，產生結構化 JSON：title/alt/description/tags/(optional)ocrText")
    .addConstraint("ALT 文本需精煉、具體、避免主觀情緒")
    .addConstraint("description 用完整句子，避免流水帳，描述場景/主體/動作/風格")
    .addConstraint("tags 為 3~7 個中文關鍵字陣列，從『可辨識客觀元素』出發")
    .addConstraint("若畫面有清楚中文字且 wantOCR=true，再嘗試 OCR；否則 ocrText 請留空或省略")
    .addConstraint("輸出必須是純 JSON，不要有多餘文字或 Markdown")
    .setFormatHint(`描述長度：${lengthHint}`)
    .setJsonSchema(schema);

  const messages = [
    { role: "system", content: pb.buildSystemPrompt() },
    // user: multimodal input (text instruction + image).
    {
      role: "user",
      content: [
        // Chat Completions content parts are "text" / "image_url";
        // "input_text" / "input_image" belong to the Responses API.
        { type: "text", text: `請產生結構化 JSON${wantOCR ? "（同時嘗試 OCR）" : ""}。` },
        { type: "image_url", image_url: { url } },
      ],
    },
  ];

  const res = await openai.chat.completions.create({
    model: "gpt-4o-mini", // supports image understanding
    temperature: 0.2,
    messages,
  });

  const raw = res.choices?.[0]?.message?.content ?? "";
  const obj = extractJson(raw);

  // Lightweight schema validation.
  const check = validateBySchema(obj, schema);
  if (!check.ok) {
    throw new Error("JSON 不符合 schema：" + check.errors.join("; "));
  }

  // The schema only declares tags as "object", so enforce array-ness here.
  if (!Array.isArray(obj.tags)) {
    throw new Error("欄位 tags 應為陣列");
  }

  // When OCR was not requested, drop an empty ocrText from the result.
  if (!wantOCR && "ocrText" in obj && !obj.ocrText) {
    delete obj.ocrText;
  }

  return obj;
}

index.js（修改）
// index.js
import { englishTeacher, codeReview, sentimentClassify } from "./src/day3_prompt_engineering.js";
import { newsToJson } from "./src/day4_text_to_json.js";
import { chatOnce, resetSession } from "./src/day5_chat_history.js";
import { textToImage } from "./src/day6_text_to_image.js";
import { imageToJson } from "./src/day7_image_to_text.js";

// Parse CLI arguments of the form `--key value` or bare `--flag` into an object.
// A `--key` that is last, or is followed by another `--option`, maps to `true`.
const args = (() => {
  const tokens = process.argv.slice(2);
  const pairs = [];
  for (const [index, token] of tokens.entries()) {
    if (!token.startsWith("--")) {
      continue;
    }
    const following = tokens[index + 1];
    const hasValue = following && !following.startsWith("--");
    pairs.push([token.replace(/^--/, ""), hasValue ? following : true]);
  }
  return Object.fromEntries(pairs);
})();

/**
 * CLI entry point: run the task selected by `--task` (default "chat") and
 * print its result.
 *
 * Supported tasks: vision, image, chat, teacher, review, sentiment,
 * json_summary. An unknown task prints a usage hint.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const task = args.task || "chat";

  if (task === "vision") {
    const imagePath = args.imagePath || null;
    const imageUrl = args.imageUrl || null;
    // `--ocr` alone parses to boolean true; `--ocr true` parses to the string "true".
    const wantOCR = args.ocr === "true" || args.ocr === true;
    const length = args.length || "medium";
    const out = await imageToJson({ imagePath, imageUrl, wantOCR, length });
    console.log("\n=== 圖片 → JSON 描述 ===\n");
    console.log(JSON.stringify(out, null, 2));

  } else if (task === "image") {
    const prompt = args.text || "一隻戴著太空頭盔的柴犬，漂浮在月球上，插著台灣國旗";
    const size = args.size || "512x512";
    const n = args.n ? Number(args.n) : 1;
    const urls = await textToImage(prompt, { size, n });
    console.log("\n=== 生成圖片 ===\n");
    urls.forEach((f) => console.log("已儲存：" + f));

  } else if (task === "chat") {
    const sessionId = args.session || "default";
    if (args.reset) {
      resetSession(sessionId);
      console.log(`已重設會話：${sessionId}`);
      return;
    }
    const input = args.text || "嗨，我想規劃 3 天 2 夜的台中旅遊行程。";
    const { reply } = await chatOnce(input, { sessionId });
    console.log(`\n[${sessionId}] AI：\n${reply}\n`);

  } else if (task === "teacher") {
    const out = await englishTeacher(args.text || "He go to school every day.");
    console.log("\n=== 英文老師 ===\n");
    console.log(out);

  } else if (task === "review") {
    const out = await codeReview("function sum(arr){ return arr.reduce((a,b)=>a+b,0) }");
    console.log("\n=== 程式碼審查 ===\n");
    console.log(out);

  } else if (task === "sentiment") {
    const out = await sentimentClassify(args.text || "今天心情糟透了，事情一團亂。");
    console.log("\n=== 情緒分類(JSON) ===\n");
    console.log(out);

  } else if (task === "json_summary") {
    const out = await newsToJson(args.text || "OpenAI 發布新模型，效能大幅提升。");
    console.log("\n=== 新聞 JSON 摘要 ===\n");
    console.log(out);

  } else {
    console.log("未知任務，請使用 --task chat | teacher | review | sentiment | json_summary | image | vision");
  }
}

// Run the CLI; on any rejection print the error message and exit with status 1.
function reportFailureAndExit(e) {
  console.error("發生錯誤：", e.message);
  process.exit(1);
}

main().catch(reportFailureAndExit);

package.json（新增 Script）
{
"scripts": {
"day7:vision:file": "node index.js --task vision --imagePath sample/cat.png --length short",
"day7:vision:url": "node index.js --task vision --imageUrl https://example.com/demo.jpg --ocr true --length medium"
}
}

▶️ CLI 操作範例

讀本地檔（不做 OCR）

npm run day7:vision:file --silent

讀遠端 URL 並嘗試 OCR（若圖中有中文字）

npm run day7:vision:url --silent

輸出（範例）：

{
"title": "黃昏街角的咖啡外帶杯",
"alt": "一個咖啡外帶杯放在濕潤路緣上，背景為散景的街燈與車流",
"description": "畫面中央是一個棕白相間的咖啡外帶杯，置於濕潤的路面...",
"tags": ["咖啡", "街頭", "夜景", "雨後", "散景", "城市"],
"ocrText": ""
}