iT邦幫忙

2025 iThome 鐵人賽

DAY 20
0
生成式 AI

練習AI系列 第 21

AI 數據分析助手(CSV/JSON → 統計摘要+洞察+可選圖表)

  • 分享至 

  • xImage
  •  

📦 安裝(新增相依)
npm i csv-parse simple-statistics

(可選)要讓 AI 產洞察才需要 openai:你專案已有 aiClient 就不用加

🆕 程式碼

  1. src/utils/csv.js(新增 / 或覆蓋)
    // src/utils/csv.js
    import fs from "fs";
    import { parse } from "csv-parse/sync";

/** 讀 JSON:支援 Array 或 {data: Array} 兩種結構 */
export function readJson(filePath) {
const raw = fs.readFileSync(filePath, "utf-8");
const obj = JSON.parse(raw);
if (Array.isArray(obj)) return obj;
if (Array.isArray(obj?.data)) return obj.data;
throw new Error("JSON 結構不支援:需要 Array 或 {data: Array}");
}

/** 讀 CSV(UTF-8),回傳 Array */
export function readCsv(filePath) {
const raw = fs.readFileSync(filePath, "utf-8");
const recs = parse(raw, {
columns: true,
skip_empty_lines: true,
trim: true,
});
return recs;
}

/** 取副檔名(小寫) */
export function extnameLower(p) {
return (p.split(".").pop() || "").toLowerCase();
}

  1. src/day20_data_analyst.js(新增 / 或覆蓋)
    // src/day20_data_analyst.js
    import fs from "fs";
    import path from "path";
    import ss from "simple-statistics";
    import { openai } from "./aiClient.js"; // 你已有的 client
    import { readCsv, readJson, extnameLower } from "./utils/csv.js";

/** 工具:安全轉數字 */
function toNum(x) {
if (x === null || x === undefined || x === "") return NaN;
const n = Number(String(x).replace(/,/g, ""));
return Number.isFinite(n) ? n : NaN;
}

/** 推斷欄位型別(簡化且實用) */
function inferType(values) {
let nNum = 0, nDate = 0, nBool = 0, nEmpty = 0;
const SAMPLE = values.slice(0, 1000);
for (const v of SAMPLE) {
if (v === null || v === undefined || v === "") { nEmpty++; continue; }
const s = String(v).trim();
if (/^(true|false)$/i.test(s)) { nBool++; continue; }
const nn = toNum(s);
if (Number.isFinite(nn)) { nNum++; continue; }
// Date(鬆判)
const d = new Date(s);
if (!isNaN(d.valueOf())) { nDate++; continue; }
}
const k = SAMPLE.length - nEmpty;
if (k === 0) return "text";
const pNum = nNum / k, pDate = nDate / k, pBool = nBool / k;
if (pNum > 0.7) return "number";
if (pDate > 0.7) return "date";
if (pBool > 0.7) return "boolean";
return "text";
}

/** 欄位統計:依型別回傳統計資料 */
function profileColumn(values, type) {
const n = values.length;
const empty = values.filter(v => v === null || v === undefined || v === "").length;
const distinct = new Set(values.filter(v => v !== "" && v !== null && v !== undefined).map(String)).size;
const common = { count: n, missing: empty, missingRate: n ? empty / n : 0, distinct };

if (type === "number") {
const nums = values.map(toNum).filter(Number.isFinite);
if (nums.length === 0) return { ...common, stats: null, outliers: [] };
const sorted = nums.slice().sort((a, b) => a - b);
const q1 = ss.quantileSorted(sorted, 0.25);
const p50 = ss.medianSorted(sorted);
const q3 = ss.quantileSorted(sorted, 0.75);
const iqr = q3 - q1;
const lower = q1 - 1.5 * iqr;
const upper = q3 + 1.5 * iqr;
const outliers = nums.filter(v => v < lower || v > upper);
return {
...common,
stats: {
mean: ss.mean(nums),
stdev: nums.length > 1 ? ss.standardDeviation(nums) : 0,
min: sorted[0],
p25: q1,
p50,
p75: q3,
max: sorted[sorted.length - 1],
iqr,
skew: (ss.mean(nums) - p50) / (ss.standardDeviation(nums) + 1e-9),
},
outliersSample: outliers.slice(0, 10),
outliersCount: outliers.length,
};
}

if (type === "boolean") {
const trues = values.filter(v => String(v).toLowerCase() === "true").length;
const falses = values.filter(v => String(v).toLowerCase() === "false").length;
return { ...common, bool: { true: trues, false: falses } };
}

if (type === "date") {
// 取日級別分佈
const days = new Map();
for (const v of values) {
const d = new Date(v);
if (isNaN(d.valueOf())) continue;
const key = d.toISOString().slice(0, 10);
days.set(key, (days.get(key) || 0) + 1);
}
const series = [...days.entries()].sort((a, b) => a[0].localeCompare(b[0])).map(([k, c]) => ({ day: k, count: c }));
return { ...common, timeline: series.slice(0, 200) };
}

// text / 其他 → 類別 Top-K
const counts = new Map();
for (const v of values) {
if (v === null || v === undefined || v === "") continue;
const key = String(v);
counts.set(key, (counts.get(key) || 0) + 1);
}
const topK = [...counts.entries()].sort((a, b) => b[1] - a[1]).slice(0, 10)
.map(([value, cnt]) => ({ value, count: cnt }));
return { ...common, topK };
}

/** 健康檢查:可快速抓問題欄 */
function healthChecks(rows, schema) {
const issues = [];
if (!rows.length) return { issues };

// 重複列
const keySeen = new Set();
let dup = 0;
for (const r of rows) {
const sig = JSON.stringify(r);
if (keySeen.has(sig)) dup++;
else keySeen.add(sig);
}
if (dup > 0) issues.push({ type: "duplicate_rows", count: dup, message: 偵測到 ${dup} 筆重複列 });

// 空值/常值欄位、疑似 ID 欄位
for (const col of Object.keys(schema)) {
const vals = rows.map(r => r[col]);
const nonEmpty = vals.filter(v => v !== "" && v !== null && v !== undefined);
const distinct = new Set(nonEmpty.map(String));
if (nonEmpty.length === 0) issues.push({ type: "all_missing", column: col, message: "此欄幾乎全為缺失" });
if (distinct.size === 1 && nonEmpty.length > 0) issues.push({ type: "constant", column: col, message: "此欄幾乎為常值,資訊量低" });
if (distinct.size === nonEmpty.length && nonEmpty.length === rows.length && rows.length >= 20)
issues.push({ type: "possible_id", column: col, message: "此欄每列唯一,疑似 ID 欄位" });
}

return { issues };
}

/** 主流程:資料分析 */
export async function analyzeData({ filePath, rows: inputRows, llmInsight = true }) {
let rows = inputRows;
if (!rows && !filePath) throw new Error("請提供 filePath 或 rows(陣列)");
if (!rows) {
const ext = extnameLower(filePath);
if (ext === "csv") rows = readCsv(filePath);
else if (ext === "json") rows = readJson(filePath);
else throw new Error("只支援 .csv / .json");
}
if (!Array.isArray(rows) || rows.length === 0) throw new Error("資料為空或格式不正確");

const columns = Object.keys(rows[0]);
const schema = {};
for (const c of columns) {
schema[c] = inferType(rows.map(r => r[c]));
}

const profiles = {};
for (const c of columns) {
profiles[c] = profileColumn(rows.map(r => r[c]), schema[c]);
}

const health = healthChecks(rows, schema);

// AI 洞察(可關閉)
let insights = null;
if (llmInsight) {
const brief = {
rows: rows.length,
cols: columns.length,
schema,
pick: Object.fromEntries(
columns.slice(0, 5).map(c => [c, {
type: schema[c],
...("stats" in (profiles[c] || {}) ? { stats: profiles[c].stats } : {}),
...("topK" in (profiles[c] || {}) ? { topK: profiles[c].topK } : {}),
}])
),
issues: health.issues.slice(0, 8),
};
const res = await openai.chat.completions.create({
model: "gpt-4o-mini",
temperature: 0.2,
messages: [
{ role: "system", content: "你是嚴謹的資料分析顧問,請用繁體中文回覆。" },
{ role: "user", content:
`資料概況(JSON):
${JSON.stringify(brief, null, 2)}

請產出:

  1. TL;DR(3~5 句)
  2. 重要指標與異常(條列)
  3. 值得追問的3~5題
    僅根據提供的概況,不要臆測資料內容。` }
    ],
    });
    insights = res.choices?.[0]?.message?.content?.trim() || "";
    }

const result = {
rows: rows.length,
columns,
schema, // 欄位型別
profiles, // 欄位統計
health, // 健康檢查
insights, // LLM 洞察(可為 null)
createdAt: new Date().toISOString(),
};

// 存檔
const outDir = path.join("outputs", "data");
if (!fs.existsSync(outDir)) fs.mkdirSync(outDir, { recursive: true });
const stamp = Date.now();
const jsonPath = path.join(outDir, analysis_${stamp}.json);
const mdPath = path.join(outDir, analysis_${stamp}.md);

fs.writeFileSync(jsonPath, JSON.stringify(result, null, 2), "utf-8");
// Markdown 摘要(精簡)
const md = [
# Data Analysis (${stamp}),
- 產出:${result.createdAt},
- 列數:${result.rows},欄位數:${result.columns.length},
## 欄位型別,
...Object.entries(schema).map(([k, v]) => - \${k}`: ${v}), \n## 健康檢查, ...(health.issues.length ? health.issues.map(i => - [${i.type}] ${i.column ? \${i.column}`:: ""}${i.message}${i.count ?(${i.count}) : ""}) : ["- (無明顯問題)"]),
\n## 洞察,
insights || "(如需洞察,請開啟 llmInsight)",
].join("\n");
fs.writeFileSync(mdPath, md, "utf-8");

return { jsonPath, mdPath, meta: result };
}

  1. index.js(修改:加入 task=data)
    // index.js(節錄新增 data 分支)
    import { analyzeData } from "./src/day20_data_analyst.js";
    import fs from "fs";

const args = Object.fromEntries(
process.argv.slice(2).reduce((acc, cur, i, arr) => {
if (cur.startsWith("--")) {
const key = cur.replace(/^--/, "");
const val = arr[i + 1] && !arr[i + 1].startsWith("--") ? arr[i + 1] : true;
acc.push([key, val]);
}
return acc;
}, [])
);

async function main() {
const task = args.task || "chat";

if (task === "data") {
const filePath = args.file || "";
const noLLM = String(args["no-llm"] || "false").toLowerCase() === "true";
if (!filePath || !fs.existsSync(filePath)) throw new Error("請提供存在的檔案路徑 --file <path.csv|json>");
const out = await analyzeData({ filePath, llmInsight: !noLLM });
console.log("\n=== 數據分析完成 ===");
console.log("- JSON:", out.jsonPath);
console.log("- Markdown:", out.mdPath);
console.log("\n重點:");
console.log(列數=${out.meta.rows} 欄位數=${out.meta.columns.length} 問題數=${out.meta.health.issues.length});

} else {
// ...保留你原本的其他 task 分支
}
}

main().catch(e => { console.error(e); process.exit(1); });

  1. package.json(新增 scripts)
    {
    "scripts": {
    "day20:csv": "node index.js --task data --file sample/data.csv",
    "day20:json": "node index.js --task data --file sample/data.json",
    "day20:csv:nollm": "node index.js --task data --file sample/data.csv --no-llm true"
    }
    }

  2. Next.js API:app/api/data/analyze/route.js(新增)
    // app/api/data/analyze/route.js
    import { NextResponse } from "next/server";
    import { analyzeData } from "../../../../src/day20_data_analyst.js";

export const runtime = "nodejs";

export async function POST(req) {
try {
const ctype = req.headers.get("content-type") || "";
if (ctype.includes("multipart/form-data")) {
const form = await req.formData();
const file = form.get("file");
if (!file) return NextResponse.json({ ok:false, error:"缺少檔案" }, { status:400 });
const buf = Buffer.from(await file.arrayBuffer());
// 寫入暫存再分析
const name = file.name || upload_${Date.now()};
const tmp = /tmp/${Date.now()}_${name};
await Bun.write?.(tmp, buf); // 若你不是 bun,改用 fs.writeFileSync
// fallback(Node)
try { require("fs").writeFileSync(tmp, buf); } catch {}
const out = await analyzeData({ filePath: tmp, llmInsight: true });
return NextResponse.json({ ok:true, ...out.meta });
} else {
const body = await req.json();
const rows = body?.rows;
if (!Array.isArray(rows)) return NextResponse.json({ ok:false, error:"JSON 請提供 rows:Array" }, { status:400 });
const out = await analyzeData({ rows, llmInsight: true });
return NextResponse.json({ ok:true, ...out.meta });
}
} catch (e) {
return NextResponse.json({ ok:false, error: e.message || "Server error" }, { status:500 });
}
}

你若不用 Bun,保留 fs.writeFileSync 即可(我已做 fallback )。

  1. 前端頁:app/data-analyst/page.tsx(新增)
    "use client";
    import { useState } from "react";

type Schema = Record<string, string>;
type Issue = { type:string; column?:string; message:string; count?:number };

export default function DataAnalystPage() {
const [file, setFile] = useState<File|null>(null);
const [loading, setLoading] = useState(false);
const [rows, setRows] = useState(0);
const [cols, setCols] = useState<string[]>([]);
const [schema, setSchema] = useState({});
const [issues, setIssues] = useState<Issue[]>([]);
const [insights, setInsights] = useState("");
const [err, setErr] = useState("");

async function analyze() {
if (!file) { setErr("請選擇檔案"); return; }
setLoading(true); setErr(""); setInsights("");
try {
const fd = new FormData();
fd.append("file", file);
const r = await fetch("/api/data/analyze", { method: "POST", body: fd });
const j = await r.json();
if (!j.ok) throw new Error(j.error);
setRows(j.rows || 0);
setCols(j.columns || []);
setSchema(j.schema || {});
setIssues(j.health?.issues || []);
setInsights(j.insights || "");
} catch (e:any) {
setErr(e.message || "分析失敗");
} finally {
setLoading(false);
}
}

return (


Day 20|AI 數據分析助手

    {err && <div className="alert alert-error"><span>{err}</span></div>}

    <div className="card bg-base-100 shadow">
      <div className="card-body space-y-3">
        <input type="file" accept=".csv,.json" className="file-input file-input-bordered"
          onChange={e=>setFile(e.target.files?.[0] || null)} />
        <button className={`btn btn-primary ${loading ? "btn-disabled" : ""}`} onClick={analyze}>
          {loading ? "分析中..." : "開始分析"}
        </button>
      </div>
    </div>

    {rows > 0 && (
      <div className="grid grid-cols-1 lg:grid-cols-2 gap-4">
        <div className="card bg-base-100 shadow">
          <div className="card-body">
            <h2 className="card-title">資料概況</h2>
            <p>列數:{rows}</p>
            <p>欄位數:{cols.length}</p>
            <div className="mt-3">
              <h3 className="font-semibold">欄位型別</h3>
              <ul className="list-disc pl-6">
                {Object.entries(schema).map(([k,v])=>(
                  <li key={k}><code>{k}</code>:{v}</li>
                ))}
              </ul>
            </div>
          </div>
        </div>

        <div className="card bg-base-100 shadow">
          <div className="card-body">
            <h2 className="card-title">健康檢查</h2>
            {issues.length === 0 ? (
              <div>(無明顯問題)</div>
            ) : (
              <ul className="list-disc pl-6">
                {issues.map((i, idx)=>(
                  <li key={idx}>
                    <span className="badge badge-outline mr-2">{i.type}</span>
                    {i.column && <code className="mr-1">{i.column}</code>}
                    {i.message} {i.count ? `(${i.count})` : ""}
                  </li>
                ))}
              </ul>
            )}
          </div>
        </div>

        {insights && (
          <div className="card bg-base-100 shadow lg:col-span-2">
            <div className="card-body">
              <h2 className="card-title">AI 洞察(摘要)</h2>
              <div className="whitespace-pre-wrap">{insights}</div>
            </div>
          </div>
        )}
      </div>
    )}
  </div>
</div>

);
}

▶️ 使用方式
CLI

分析 CSV(含 LLM 洞察)

npm run day20:csv --silent

分析 JSON

npm run day20:json --silent

僅數理統計,不用 LLM

npm run day20:csv:nollm --silent

前端

開發

npm run dev

前往 /data-analyst 上傳 .csv/.json 查看摘要


上一篇
引用高亮(Grounded Answers)
下一篇
RAG「增量索引」+ 併發控制 + 刪除檢測
系列文
練習AI24
圖片
  熱門推薦
圖片
{{ item.channelVendor }} | {{ item.webinarstarted }} |
{{ formatDate(item.duration) }}
直播中

尚未有邦友留言

立即登入留言