經過 Day 27 的 CloudWatch 監控建置,我們已經有了完整的可觀測性架構。今天我們要實作 AWS QuickSight 與資料湖架構,這是企業級數據分析的關鍵基礎設施。我們將使用 S3 作為資料湖、AWS Glue 進行 ETL、Athena 查詢分析、QuickSight 視覺化呈現,打造一個完整的 Modern Data Stack,讓 Kyo System 的數據發揮最大價值。
/**
* AWS Data Lake Architecture
*
* ┌──────────────────────────────────────────────┐
* │ Modern Data Stack on AWS │
* └──────────────────────────────────────────────┘
*
* 1. Data Ingestion (資料擷取)
* ┌─────────────────────────────────────┐
* │ Data Sources │
* │ ├─ RDS (Operational Data) │
* │ ├─ CloudWatch Logs │
* │ ├─ ALB Access Logs │
* │ ├─ S3 (Application Logs) │
* │ └─ Kinesis Firehose │
* │ ↓ │
* │ Lambda (Transformation) │
* │ ↓ │
* │ S3 Data Lake │
* └─────────────────────────────────────┘
*
* 2. Data Lake Layers (分層架構)
* ┌─────────────────────────────────────┐
* │ S3 Data Lake (3 Layers) │
* │ │
* │ ├─ Raw (原始數據) │
* │ │ └─ s3://kyo-datalake/raw/ │
* │ │ - 原始格式 (JSON, CSV) │
* │ │ - 不可變 │
* │ │ - 完整歷史 │
* │ │
* │ ├─ Processed (處理後) │
* │ │ └─ s3://kyo-datalake/processed/ │
* │ │ - Parquet 格式 │
* │ │ - 分區 (年/月/日) │
* │ │ - 壓縮 (Snappy) │
* │ │
* │ └─ Curated (精選) │
* │ └─ s3://kyo-datalake/curated/ │
* │ - 聚合數據 │
* │ - 業務模型 │
* │ - 視覺化用 │
* └─────────────────────────────────────┘
*
* 3. Data Processing (ETL)
* ┌─────────────────────────────────────┐
* │ AWS Glue │
* │ ├─ Glue Crawler │
* │ │ └─ 自動探索 Schema │
* │ ├─ Glue Data Catalog │
* │ │ └─ 集中式元資料 │
* │ ├─ Glue ETL Jobs │
* │ │ ├─ Python / Spark │
* │ │ ├─ 資料清洗 │
* │ │ ├─ 格式轉換 │
* │ │ └─ 分區管理 │
* │ └─ Glue Workflows │
* │ └─ 編排任務流程 │
* └─────────────────────────────────────┘
*
* 4. Data Query & Analysis
* ┌─────────────────────────────────────┐
* │ Amazon Athena │
* │ ├─ SQL 查詢 S3 │
* │ ├─ Serverless │
* │ ├─ 按掃描量計費 │
* │ └─ 分區查詢優化 │
* │ │
* │ Amazon Redshift Serverless │
* │ ├─ 複雜分析 │
* │ ├─ 高效能 │
* │ └─ BI 整合 │
* └─────────────────────────────────────┘
*
* 5. Data Visualization
* ┌─────────────────────────────────────┐
* │ Amazon QuickSight │
* │ ├─ SPICE (In-memory Engine) │
* │ ├─ Dashboards & Reports │
* │ ├─ Embedded Analytics │
* │ ├─ ML Insights │
* │ └─ Row-level Security │
* └─────────────────────────────────────┘
*
* 成本優化:
* - S3 Intelligent-Tiering
* - Athena 分區查詢 (減少掃描量)
* - Glue Job Bookmarks (增量處理)
* - QuickSight SPICE (快取)
* - 生命週期管理 (Glacier)
*/
// infrastructure/lib/data-lake-stack.ts
import * as cdk from 'aws-cdk-lib';
import * as s3 from 'aws-cdk-lib/aws-s3';
import * as glue from 'aws-cdk-lib/aws-glue';
import * as athena from 'aws-cdk-lib/aws-athena';
import * as iam from 'aws-cdk-lib/aws-iam';
import * as lambda from 'aws-cdk-lib/aws-lambda';
import * as events from 'aws-cdk-lib/aws-events';
import * as targets from 'aws-cdk-lib/aws-events-targets';
import { Construct } from 'constructs';
export class DataLakeStack extends cdk.Stack {
public readonly dataLakeBucket: s3.Bucket;
public readonly glueDatabase: glue.CfnDatabase;
constructor(scope: Construct, id: string, props?: cdk.StackProps) {
super(scope, id, props);
/**
* S3 Data Lake Bucket (3-tier structure)
*/
this.dataLakeBucket = new s3.Bucket(this, 'DataLakeBucket', {
bucketName: `kyo-datalake-${this.account}`,
encryption: s3.BucketEncryption.S3_MANAGED,
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
versioned: false, // 資料湖不需版本控制
lifecycleRules: [
// Raw layer: 保留 90 天後轉至 Intelligent-Tiering
{
id: 'raw-layer-lifecycle',
prefix: 'raw/',
transitions: [
{
storageClass: s3.StorageClass.INTELLIGENT_TIERING,
transitionAfter: cdk.Duration.days(90),
},
],
expiration: cdk.Duration.days(365), // 1 年後刪除
},
// Processed layer: 保留 30 天後轉至 Intelligent-Tiering
{
id: 'processed-layer-lifecycle',
prefix: 'processed/',
transitions: [
{
storageClass: s3.StorageClass.INTELLIGENT_TIERING,
transitionAfter: cdk.Duration.days(30),
},
],
expiration: cdk.Duration.days(730), // 2 年後刪除
},
// Curated layer: 永久保留,轉至 Glacier
{
id: 'curated-layer-lifecycle',
prefix: 'curated/',
transitions: [
{
storageClass: s3.StorageClass.GLACIER,
transitionAfter: cdk.Duration.days(180),
},
],
},
],
});
/**
* Glue Database
*/
this.glueDatabase = new glue.CfnDatabase(this, 'GlueDatabase', {
catalogId: this.account,
databaseInput: {
name: 'kyo_datalake',
description: 'Kyo System Data Lake Database',
},
});
/**
* Glue Crawler for Raw Data
*/
const crawlerRole = new iam.Role(this, 'GlueCrawlerRole', {
assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'),
managedPolicies: [
iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole'),
],
});
this.dataLakeBucket.grantRead(crawlerRole);
new glue.CfnCrawler(this, 'RawDataCrawler', {
name: 'kyo-raw-data-crawler',
role: crawlerRole.roleArn,
databaseName: this.glueDatabase.ref,
targets: {
s3Targets: [
{
path: `s3://${this.dataLakeBucket.bucketName}/raw/`,
},
],
},
schemaChangePolicy: {
updateBehavior: 'UPDATE_IN_DATABASE',
deleteBehavior: 'LOG',
},
schedule: {
// 每天凌晨 2 點執行
scheduleExpression: 'cron(0 2 * * ? *)',
},
});
/**
* Glue ETL Job - Raw to Processed
*/
const etlRole = new iam.Role(this, 'GlueETLRole', {
assumedBy: new iam.ServicePrincipal('glue.amazonaws.com'),
managedPolicies: [
iam.ManagedPolicy.fromAwsManagedPolicyName('service-role/AWSGlueServiceRole'),
],
});
this.dataLakeBucket.grantReadWrite(etlRole);
const etlScript = this.createETLScript();
new glue.CfnJob(this, 'RawToProcessedETL', {
name: 'kyo-raw-to-processed-etl',
role: etlRole.roleArn,
command: {
name: 'glueetl',
scriptLocation: `s3://${this.dataLakeBucket.bucketName}/scripts/etl.py`,
pythonVersion: '3',
},
defaultArguments: {
'--TempDir': `s3://${this.dataLakeBucket.bucketName}/temp/`,
'--job-bookmark-option': 'job-bookmark-enable',
'--enable-metrics': 'true',
'--enable-continuous-cloudwatch-log': 'true',
'--source_path': `s3://${this.dataLakeBucket.bucketName}/raw/`,
'--dest_path': `s3://${this.dataLakeBucket.bucketName}/processed/`,
},
glueVersion: '3.0',
maxRetries: 1,
timeout: 60, // 1 小時
workerType: 'G.1X',
numberOfWorkers: 10,
});
/**
* Athena Workgroup
*/
const athenaResultsBucket = new s3.Bucket(this, 'AthenaResultsBucket', {
bucketName: `kyo-athena-results-${this.account}`,
encryption: s3.BucketEncryption.S3_MANAGED,
blockPublicAccess: s3.BlockPublicAccess.BLOCK_ALL,
lifecycleRules: [
{
expiration: cdk.Duration.days(30), // 查詢結果 30 天後刪除
},
],
});
new athena.CfnWorkGroup(this, 'AthenaWorkGroup', {
name: 'kyo-analytics',
workGroupConfiguration: {
resultConfiguration: {
outputLocation: `s3://${athenaResultsBucket.bucketName}/`,
encryptionConfiguration: {
encryptionOption: 'SSE_S3',
},
},
enforceWorkGroupConfiguration: true,
publishCloudWatchMetricsEnabled: true,
engineVersion: {
selectedEngineVersion: 'AUTO',
},
},
});
/**
* Lambda for Data Ingestion
*/
const ingestionLambda = new lambda.Function(this, 'DataIngestionLambda', {
functionName: 'kyo-data-ingestion',
runtime: lambda.Runtime.PYTHON_3_9,
handler: 'index.handler',
code: lambda.Code.fromInline(`
import json
import boto3
import datetime
from urllib.parse import unquote_plus
s3 = boto3.client('s3')
def handler(event, context):
"""
從 RDS 導出數據到 S3 Data Lake
"""
bucket = '${this.dataLakeBucket.bucketName}'
# 模擬從 RDS 查詢數據
# 實際上應該連接 RDS 查詢
data = [
{'id': 1, 'name': 'test', 'created_at': '2024-01-01T00:00:00Z'},
]
# 按日期分區
date = datetime.datetime.now().strftime('%Y/%m/%d')
key = f'raw/otp_requests/year={date.split("/")[0]}/month={date.split("/")[1]}/day={date.split("/")[2]}/data.json'
# 寫入 S3
s3.put_object(
Bucket=bucket,
Key=key,
Body=json.dumps(data),
ContentType='application/json'
)
return {
'statusCode': 200,
'body': json.dumps(f'Data ingested to {key}')
}
`),
timeout: cdk.Duration.minutes(5),
environment: {
DATA_LAKE_BUCKET: this.dataLakeBucket.bucketName,
},
});
this.dataLakeBucket.grantWrite(ingestionLambda);
// EventBridge 定期觸發
const rule = new events.Rule(this, 'DataIngestionRule', {
schedule: events.Schedule.cron({ hour: '0', minute: '0' }), // 每天凌晨
});
rule.addTarget(new targets.LambdaFunction(ingestionLambda));
/**
* Outputs
*/
new cdk.CfnOutput(this, 'DataLakeBucketName', {
value: this.dataLakeBucket.bucketName,
description: 'Data Lake S3 Bucket',
});
new cdk.CfnOutput(this, 'GlueDatabaseName', {
value: this.glueDatabase.ref,
description: 'Glue Database Name',
});
new cdk.CfnOutput(this, 'AthenaWorkGroupName', {
value: 'kyo-analytics',
description: 'Athena WorkGroup',
});
}
/**
* 建立 Glue ETL Script
*/
private createETLScript(): string {
return `
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'source_path', 'dest_path'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# 讀取原始數據
datasource = glueContext.create_dynamic_frame.from_options(
connection_type="s3",
connection_options={"paths": [args['source_path']]},
format="json"
)
# 資料清洗與轉換
# 1. 移除 null 值
cleaned = DropNullFields.apply(frame=datasource)
# 2. 重新分區 (按日期)
repartitioned = cleaned.toDF().repartition("year", "month", "day")
# 3. 寫入 Parquet 格式 (壓縮、列式儲存)
glueContext.write_dynamic_frame.from_options(
frame=DynamicFrame.fromDF(repartitioned, glueContext, "result"),
connection_type="s3",
connection_options={
"path": args['dest_path'],
"partitionKeys": ["year", "month", "day"]
},
format="parquet",
format_options={"compression": "snappy"}
)
job.commit()
`;
}
}
// infrastructure/lib/quicksight-stack.ts
import * as cdk from 'aws-cdk-lib';
import * as quicksight from 'aws-cdk-lib/aws-quicksight';
import * as iam from 'aws-cdk-lib/aws-iam';
import { Construct } from 'constructs';
export interface QuickSightStackProps extends cdk.StackProps {
dataLakeBucketName: string;
glueDatabaseName: string;
athenaWorkGroupName: string;
}
export class QuickSightStack extends cdk.Stack {
constructor(scope: Construct, id: string, props: QuickSightStackProps) {
super(scope, id, props);
/**
* QuickSight Data Source (Athena)
*/
const dataSource = new quicksight.CfnDataSource(this, 'AthenaDataSource', {
dataSourceId: 'kyo-athena-datasource',
name: 'Kyo Data Lake (Athena)',
type: 'ATHENA',
awsAccountId: this.account,
dataSourceParameters: {
athenaParameters: {
workGroup: props.athenaWorkGroupName,
},
},
permissions: [
{
principal: `arn:aws:quicksight:${this.region}:${this.account}:user/default/admin`,
actions: [
'quicksight:DescribeDataSource',
'quicksight:DescribeDataSourcePermissions',
'quicksight:PassDataSource',
'quicksight:UpdateDataSource',
'quicksight:DeleteDataSource',
'quicksight:UpdateDataSourcePermissions',
],
},
],
});
/**
* QuickSight Dataset - OTP Daily Stats
*/
const otpDataset = new quicksight.CfnDataSet(this, 'OTPDataSet', {
dataSetId: 'kyo-otp-daily-stats',
name: 'OTP Daily Statistics',
awsAccountId: this.account,
importMode: 'SPICE', // 使用 SPICE 引擎 (In-memory)
physicalTableMap: {
'otp-stats-table': {
customSql: {
dataSourceArn: dataSource.attrArn,
name: 'OTP Stats Query',
sqlQuery: `
SELECT
date,
tenant_id,
provider,
total_requests,
sent_count,
verified_count,
failed_count,
CAST(verified_count AS DOUBLE) / NULLIF(sent_count, 0) * 100 AS success_rate,
avg_verification_time,
total_cost
FROM ${props.glueDatabaseName}.otp_daily_stats
WHERE date >= CURRENT_DATE - INTERVAL '90' DAY
`,
columns: [
{ name: 'date', type: 'DATETIME' },
{ name: 'tenant_id', type: 'STRING' },
{ name: 'provider', type: 'STRING' },
{ name: 'total_requests', type: 'INTEGER' },
{ name: 'sent_count', type: 'INTEGER' },
{ name: 'verified_count', type: 'INTEGER' },
{ name: 'failed_count', type: 'INTEGER' },
{ name: 'success_rate', type: 'DECIMAL' },
{ name: 'avg_verification_time', type: 'DECIMAL' },
{ name: 'total_cost', type: 'DECIMAL' },
],
},
},
},
logicalTableMap: {
'otp-stats-logical': {
alias: 'OTP Statistics',
source: {
physicalTableId: 'otp-stats-table',
},
},
},
permissions: [
{
principal: `arn:aws:quicksight:${this.region}:${this.account}:user/default/admin`,
actions: [
'quicksight:DescribeDataSet',
'quicksight:DescribeDataSetPermissions',
'quicksight:PassDataSet',
'quicksight:DescribeIngestion',
'quicksight:ListIngestions',
'quicksight:UpdateDataSet',
'quicksight:DeleteDataSet',
'quicksight:CreateIngestion',
'quicksight:CancelIngestion',
'quicksight:UpdateDataSetPermissions',
],
},
],
});
/**
* QuickSight Analysis
*/
const analysis = new quicksight.CfnAnalysis(this, 'OTPAnalysis', {
analysisId: 'kyo-otp-analysis',
name: 'OTP Usage Analysis',
awsAccountId: this.account,
sourceEntity: {
sourceTemplate: {
dataSetReferences: [
{
dataSetPlaceholder: 'OTPStats',
dataSetArn: otpDataset.attrArn,
},
],
arn: `arn:aws:quicksight:${this.region}:${this.account}:template/kyo-otp-template`,
},
},
permissions: [
{
principal: `arn:aws:quicksight:${this.region}:${this.account}:user/default/admin`,
actions: [
'quicksight:RestoreAnalysis',
'quicksight:UpdateAnalysisPermissions',
'quicksight:DeleteAnalysis',
'quicksight:QueryAnalysis',
'quicksight:DescribeAnalysisPermissions',
'quicksight:DescribeAnalysis',
'quicksight:UpdateAnalysis',
],
},
],
});
/**
* QuickSight Dashboard
*/
new quicksight.CfnDashboard(this, 'OTPDashboard', {
dashboardId: 'kyo-otp-dashboard',
name: 'Kyo OTP Dashboard',
awsAccountId: this.account,
sourceEntity: {
sourceTemplate: {
dataSetReferences: [
{
dataSetPlaceholder: 'OTPStats',
dataSetArn: otpDataset.attrArn,
},
],
arn: `arn:aws:quicksight:${this.region}:${this.account}:template/kyo-otp-template`,
},
},
permissions: [
{
principal: `arn:aws:quicksight:${this.region}:${this.account}:user/default/admin`,
actions: [
'quicksight:DescribeDashboard',
'quicksight:ListDashboardVersions',
'quicksight:UpdateDashboardPermissions',
'quicksight:QueryDashboard',
'quicksight:UpdateDashboard',
'quicksight:DeleteDashboard',
'quicksight:DescribeDashboardPermissions',
'quicksight:UpdateDashboardPublishedVersion',
],
},
],
dashboardPublishOptions: {
adHocFilteringOption: {
availabilityStatus: 'ENABLED',
},
exportToCsvOption: {
availabilityStatus: 'ENABLED',
},
sheetControlsOption: {
visibilityState: 'EXPANDED',
},
},
});
}
}
-- Athena 查詢範例
-- 1. 每日 OTP 發送趨勢
SELECT
date,
SUM(sent_count) as total_sent,
SUM(verified_count) as total_verified,
ROUND(SUM(verified_count)::DECIMAL / NULLIF(SUM(sent_count), 0) * 100, 2) as success_rate
FROM kyo_datalake.otp_daily_stats
WHERE date >= CURRENT_DATE - INTERVAL '30' DAY
GROUP BY date
ORDER BY date DESC;
-- 2. 按供應商統計
SELECT
provider,
SUM(sent_count) as total_sent,
SUM(total_cost) as total_cost,
ROUND(AVG(avg_verification_time), 2) as avg_time
FROM kyo_datalake.otp_daily_stats
WHERE date >= CURRENT_DATE - INTERVAL '30' DAY
GROUP BY provider
ORDER BY total_sent DESC;
-- 3. 按租戶統計
SELECT
tenant_id,
COUNT(DISTINCT date) as active_days,
SUM(sent_count) as total_sent,
SUM(verified_count) as total_verified,
SUM(total_cost) as total_cost
FROM kyo_datalake.otp_daily_stats
WHERE date >= CURRENT_DATE - INTERVAL '30' DAY
GROUP BY tenant_id
ORDER BY total_sent DESC;
-- 4. 同比分析 (Year-over-Year)
WITH current_month AS (
SELECT
SUM(sent_count) as sent,
SUM(verified_count) as verified
FROM kyo_datalake.otp_daily_stats
WHERE date >= DATE_TRUNC('month', CURRENT_DATE)
AND date < DATE_TRUNC('month', CURRENT_DATE) + INTERVAL '1' MONTH
),
previous_month AS (
SELECT
SUM(sent_count) as sent,
SUM(verified_count) as verified
FROM kyo_datalake.otp_daily_stats
WHERE date >= DATE_TRUNC('month', CURRENT_DATE - INTERVAL '1' MONTH)
AND date < DATE_TRUNC('month', CURRENT_DATE)
)
SELECT
c.sent as current_sent,
p.sent as previous_sent,
ROUND((c.sent - p.sent)::DECIMAL / NULLIF(p.sent, 0) * 100, 2) as growth_rate
FROM current_month c, previous_month p;
-- 5. 分區剪枝優化 (Partition Pruning)
-- 只掃描特定日期範圍的分區,大幅減少掃描量
SELECT *
FROM kyo_datalake.otp_requests
WHERE year = '2024'
AND month = '01'
AND day >= '01'
AND day <= '31';
/**
* AWS Data Lake 成本優化策略
*/
// 1. S3 儲存成本優化
interface S3CostOptimization {
// Intelligent-Tiering: 自動轉移不常存取的數據
lifecycle: {
rule: 'intelligent-tiering',
daysAfter: 90,
savings: '68%', // vs Standard
};
// Parquet 格式: 壓縮率高、查詢效率佳
format: {
format: 'Parquet',
compression: 'Snappy',
compressionRatio: '10:1', // vs JSON
querySavings: '99%', // Athena 掃描量減少
};
// 分區策略: 只掃描需要的分區
partitioning: {
keys: ['year', 'month', 'day'],
example: 's3://bucket/data/year=2024/month=01/day=15/',
benefit: '只掃描 1 天數據,而非整個數據集',
};
}
// 2. Athena 查詢成本優化
interface AthenaCostOptimization {
// 定價: $5 per TB scanned
pricing: {
perTB: 5.0,
minCharge: 0.01, // 10 MB minimum
};
// 優化技巧
optimizations: [
{
technique: '分區查詢',
example: "WHERE year='2024' AND month='01'",
savings: '95%+',
},
{
technique: '欄位選擇',
example: 'SELECT id, name (vs SELECT *)',
savings: '取決於欄位數',
},
{
technique: 'LIMIT',
example: 'LIMIT 100',
note: 'LIMIT 不會減少掃描量,需配合 WHERE',
},
{
technique: 'Parquet + Compression',
savings: '99%',
example: '1 GB JSON → 10 MB Parquet',
},
];
// 成本計算範例
exampleCost: {
scenario: '查詢 30 天 OTP 數據',
jsonSize: '1 TB',
parquetSize: '10 GB',
jsonCost: '$5.00',
parquetCost: '$0.05',
savings: '$4.95 (99%)',
};
}
// 3. Glue ETL 成本優化
interface GlueCostOptimization {
// 定價: $0.44 per DPU-Hour
pricing: {
perDPUHour: 0.44,
minBilling: 10, // 10 分鐘最小計費
};
// 優化技巧
optimizations: [
{
technique: 'Job Bookmarks',
benefit: '只處理新數據,避免重複處理',
savings: '90%+',
},
{
technique: 'Worker 數量',
recommendation: '依數據量動態調整',
example: '1 GB → 2 workers, 10 GB → 10 workers',
},
{
technique: '排程時間',
recommendation: '離峰時段執行',
benefit: '不影響生產系統',
},
];
}
// 4. QuickSight 成本優化
interface QuickSightCostOptimization {
// 定價
pricing: {
author: '$24/user/month',
reader: '$5/user/month',
spice: '$0.25/GB/month (前 10 GB 免費)',
};
// 優化技巧
optimizations: [
{
technique: 'SPICE vs Direct Query',
spice: '快速但需儲存成本',
directQuery: '慢但無儲存成本',
recommendation: '常用報表用 SPICE,臨時查詢用 Direct Query',
},
{
technique: '資料重新整理頻率',
recommendation: '依業務需求設定',
examples: {
realtime: '每小時 (成本高)',
daily: '每天凌晨 (成本低)',
},
},
{
technique: 'Row-level Security',
benefit: '一個 Dashboard 服務所有租戶',
savings: '不需為每個租戶建立 Dashboard',
},
];
}
// 成本估算工具
function estimateDataLakeCost(params: {
dailyDataGB: number;
retentionDays: number;
queryCountPerDay: number;
}) {
const { dailyDataGB, retentionDays, queryCountPerDay } = params;
// S3 儲存成本
const totalDataGB = dailyDataGB * retentionDays;
const s3CostPerMonth = (totalDataGB / 1024) * 23; // $0.023 per GB (Standard)
// Athena 查詢成本 (假設每次查詢掃描 10 GB Parquet)
const queryDataPerDayTB = (queryCountPerDay * 10) / 1024;
const athenaCostPerMonth = queryDataPerDayTB * 30 * 5; // $5 per TB
// Glue ETL 成本 (假設每天 1 小時,2 DPU)
const glueCostPerMonth = 0.44 * 2 * 30; // $0.44 per DPU-Hour
// QuickSight 成本 (假設 5 authors, 50 readers)
const quicksightCostPerMonth = 5 * 24 + 50 * 5;
const total = s3CostPerMonth + athenaCostPerMonth + glueCostPerMonth + quicksightCostPerMonth;
return {
s3: s3CostPerMonth.toFixed(2),
athena: athenaCostPerMonth.toFixed(2),
glue: glueCostPerMonth.toFixed(2),
quicksight: quicksightCostPerMonth.toFixed(2),
total: total.toFixed(2),
};
}
// 範例計算
console.log('=== Data Lake 成本估算 ===');
console.log('假設: 每日 10 GB 數據, 保留 90 天, 每日 100 次查詢');
console.log(estimateDataLakeCost({
dailyDataGB: 10,
retentionDays: 90,
queryCountPerDay: 100,
}));
// 預估每月成本: ~$500
我們今天完成了 Kyo System 的資料湖架構:
Data Lake vs Data Warehouse:
Parquet vs JSON:
SPICE vs Direct Query:
成本優化: