在 Day 21 我們建立了 CI/CD 自動化部署流程。今天我們要實作 AWS CloudWatch 監控與告警系統,建立完整的可觀測性 (Observability) 架構,確保我們能即時發現並解決生產環境的問題。
┌──────────────────────── Observability ─────────────────────────┐
│                                                                │
│  ┌─── Metrics ────┐   ┌───── Logs ─────┐   ┌──── Traces ────┐  │
│  │                │   │                │   │                │  │
│  │ • CPU/Memory   │   │ • App Logs     │   │ • Request      │  │
│  │ • Requests     │   │ • Error Logs   │   │   Flow         │  │
│  │ • Latency      │   │ • Audit Log    │   │ • Bottleneck   │  │
│  │ • Error Rate   │   │ • System Log   │   │ • Dependencies │  │
│  │                │   │                │   │                │  │
│  └────────────────┘   └────────────────┘   └────────────────┘  │
│                                                                │
│          ↓                    ↓                    ↓           │
│  CloudWatch Metrics    CloudWatch Logs       X-Ray Tracing     │
└────────────────────────────────────────────────────────────────┘
// packages/kyo-core/src/services/metrics/CloudWatchMetrics.ts
import { CloudWatchClient, PutMetricDataCommand } from '@aws-sdk/client-cloudwatch';
/**
 * Publishes custom CloudWatch metrics for the OTP service.
 *
 * Every datapoint is written to the configured namespace and carries the
 * default `Environment` and `Service` dimensions merged with any per-call
 * dimensions. Publishing is best-effort: a failed `PutMetricData` request is
 * logged with `console.error` and never rejects, so a metrics outage cannot
 * break the calling business logic.
 *
 * NOTE(review): CloudWatch does not aggregate custom metrics across
 * dimensions, so alarms and dashboards have to query these metrics with the
 * same dimension set that is published here. Per-tenant / per-endpoint
 * dimensions also create one metric per distinct value — confirm the expected
 * cardinality.
 */
export class CloudWatchMetrics {
  /** Upper bound on metric entries accepted by a single PutMetricData request. */
  private static readonly MAX_METRICS_PER_REQUEST = 1000;

  private readonly client: CloudWatchClient;
  private readonly namespace: string;
  private readonly defaultDimensions: Record<string, string>;

  /**
   * @param config.region      AWS region the CloudWatch client talks to.
   * @param config.namespace   CloudWatch namespace for every metric.
   * @param config.environment Value of the default `Environment` dimension.
   * @param config.service     Value of the default `Service` dimension.
   */
  constructor(config: {
    region: string;
    namespace: string;
    environment: string;
    service: string;
  }) {
    this.client = new CloudWatchClient({ region: config.region });
    this.namespace = config.namespace;
    this.defaultDimensions = {
      Environment: config.environment,
      Service: config.service,
    };
  }

  /** Counts one successfully sent OTP for the given tenant. */
  async recordOTPSent(tenantId: string): Promise<void> {
    await this.putMetric({
      metricName: 'OTPSent',
      value: 1,
      unit: 'Count',
      dimensions: {
        TenantId: tenantId,
      },
    });
  }

  /**
   * Counts one OTP verification attempt, published as `OTPVerifySuccess` or
   * `OTPVerifyFailure` depending on the outcome.
   */
  async recordOTPVerification(success: boolean, tenantId: string): Promise<void> {
    await this.putMetric({
      metricName: success ? 'OTPVerifySuccess' : 'OTPVerifyFailure',
      value: 1,
      unit: 'Count',
      dimensions: {
        TenantId: tenantId,
        Status: success ? 'Success' : 'Failure',
      },
    });
  }

  /** Records the response time of one API request, in milliseconds. */
  async recordAPILatency(
    endpoint: string,
    latencyMs: number,
    statusCode: number
  ): Promise<void> {
    await this.putMetric({
      metricName: 'APILatency',
      value: latencyMs,
      unit: 'Milliseconds',
      dimensions: {
        Endpoint: endpoint,
        StatusCode: statusCode.toString(),
      },
    });
  }

  /**
   * Counts one error. The `Endpoint` dimension is attached only when a
   * non-empty endpoint is supplied.
   */
  async recordError(
    errorType: string,
    endpoint?: string
  ): Promise<void> {
    await this.putMetric({
      metricName: 'ErrorCount',
      value: 1,
      unit: 'Count',
      dimensions: {
        ErrorType: errorType,
        ...(endpoint ? { Endpoint: endpoint } : {}),
      },
    });
  }

  /** Publishes the current number of active tenants. */
  async recordActiveTenants(count: number): Promise<void> {
    await this.putMetric({
      metricName: 'ActiveTenants',
      value: count,
      unit: 'Count',
    });
  }

  /** Counts one rate-limit rejection for a tenant/resource pair. */
  async recordRateLimitHit(
    tenantId: string,
    resource: string
  ): Promise<void> {
    await this.putMetric({
      metricName: 'RateLimitHit',
      value: 1,
      unit: 'Count',
      dimensions: {
        TenantId: tenantId,
        Resource: resource,
      },
    });
  }

  /**
   * Publishes Redis health (`1` healthy, `0` unhealthy) and, when provided,
   * the measured latency. Both datapoints travel in one PutMetricData request.
   */
  async recordRedisHealth(
    healthy: boolean,
    latencyMs?: number
  ): Promise<void> {
    const datapoints = [
      { metricName: 'RedisHealth', value: healthy ? 1 : 0, unit: 'Count' },
    ];
    if (latencyMs !== undefined) {
      datapoints.push({
        metricName: 'RedisLatency',
        value: latencyMs,
        unit: 'Milliseconds',
      });
    }
    await this.putMetricBatch(datapoints);
  }

  /**
   * Publishes several metrics with as few requests as possible.
   *
   * An empty list is a no-op. Lists longer than the per-request limit are
   * split into consecutive requests. Like the single-metric helpers this is
   * best-effort: a failed request is logged and the remaining chunks are still
   * attempted; the returned promise never rejects because of a send failure.
   *
   * @param metrics Datapoints to publish; `timestamp` defaults to the time of
   *                the call and `dimensions` are merged over the defaults.
   */
  async putMetricBatch(
    metrics: Array<{
      metricName: string;
      value: number;
      unit: string;
      dimensions?: Record<string, string>;
      timestamp?: Date;
    }>
  ): Promise<void> {
    if (metrics.length === 0) {
      // PutMetricData has nothing to publish; skip the round trip entirely.
      return;
    }

    const now = new Date();
    const metricData = metrics.map((metric) => ({
      MetricName: metric.metricName,
      Value: metric.value,
      Unit: metric.unit,
      Timestamp: metric.timestamp ?? now,
      Dimensions: this.buildDimensions(metric.dimensions ?? {}),
    }));

    const chunkSize = CloudWatchMetrics.MAX_METRICS_PER_REQUEST;
    for (let start = 0; start < metricData.length; start += chunkSize) {
      await this.sendMetricData(metricData.slice(start, start + chunkSize));
    }
  }

  /** Internal helper: publishes a single datapoint stamped with the current time. */
  private async putMetric(params: {
    metricName: string;
    value: number;
    unit: string;
    dimensions?: Record<string, string>;
  }): Promise<void> {
    await this.sendMetricData([
      {
        MetricName: params.metricName,
        Value: params.value,
        Unit: params.unit,
        Timestamp: new Date(),
        Dimensions: this.buildDimensions(params.dimensions ?? {}),
      },
    ]);
  }

  /**
   * Sends one PutMetricData request. Failures are logged and swallowed so
   * that metrics never affect the main business logic.
   */
  private async sendMetricData(
    metricData: Array<{
      MetricName: string;
      Value: number;
      Unit: string;
      Timestamp: Date;
      Dimensions: Array<{ Name: string; Value: string }>;
    }>
  ): Promise<void> {
    const command = new PutMetricDataCommand({
      Namespace: this.namespace,
      MetricData: metricData,
    });
    try {
      await this.client.send(command);
    } catch (error) {
      console.error('Failed to put metric:', error);
    }
  }

  /**
   * Builds the CloudWatch `Dimensions` array: the default dimensions first,
   * overridden/extended by the per-call ones.
   */
  private buildDimensions(
    dimensions: Record<string, string>
  ): Array<{ Name: string; Value: string }> {
    const merged = { ...this.defaultDimensions, ...dimensions };
    return Object.entries(merged).map(([name, value]) => ({
      Name: name,
      Value: value,
    }));
  }
}
/**
 * Shared metrics publisher for the OTP service.
 * Region and environment come from the process environment and fall back to
 * `ap-northeast-1` / `development` when unset or empty.
 */
export const metrics = new CloudWatchMetrics({
  service: 'kyo-otp-service',
  namespace: 'Kyo/OTP',
  environment: process.env.NODE_ENV || 'development',
  region: process.env.AWS_REGION || 'ap-northeast-1',
});
// apps/kyo-otp-service/src/plugins/metrics.ts
import fp from 'fastify-plugin';
import { metrics } from '@kyong/kyo-core';
/**
 * Fastify plugin that publishes per-request metrics and exposes `/health`.
 *
 * - `onResponse`: records latency for every request, an error count for
 *   4xx/5xx responses, and a rate-limit hit for 429 responses.
 * - `GET /health`: runs `checkSystemHealth()`, publishes Redis health, and
 *   answers 200 (`ok`) or 503 (`degraded`).
 */
export default fp(async (fastify) => {
  // Dimension value used when no route matched (e.g. 404s). Falling back to the
  // raw URL would create one metric per distinct path/query string.
  const unmatchedRoute = 'unmatched';

  fastify.addHook('onResponse', async (request, reply) => {
    // NOTE(review): newer Fastify releases expose `reply.elapsedTime` and
    // deprecate `getResponseTime()` — confirm against the installed version.
    const latency = reply.getResponseTime();
    const route = request.routeOptions.url || unmatchedRoute;
    const endpoint = `${request.method} ${route}`;
    const status = reply.statusCode;

    const pending: Array<Promise<void>> = [
      metrics.recordAPILatency(endpoint, latency, status),
    ];

    // Error responses are additionally counted by class.
    if (status >= 400) {
      pending.push(
        metrics.recordError(status >= 500 ? 'ServerError' : 'ClientError', endpoint)
      );
    }

    // Rate-limit rejections are recognised by their response status. Rate-limit
    // headers are response headers, so inspecting the incoming request for
    // `x-ratelimit-limit` never observes a rejection.
    // Assumes the limiter answers with 429 — verify against its configuration.
    if (status === 429) {
      const rawTenant = request.headers['x-tenant-id'];
      const tenantId = (Array.isArray(rawTenant) ? rawTenant[0] : rawTenant) || 'unknown';
      pending.push(metrics.recordRateLimitHit(tenantId, route));
    }

    // The metric calls are independent, so publish them concurrently.
    await Promise.all(pending);
  });

  // The health endpoint also publishes Redis health metrics.
  fastify.get('/health', async (request, reply) => {
    const health = await checkSystemHealth();

    await metrics.recordRedisHealth(
      health.redis.healthy,
      health.redis.latency
    );

    if (!health.isHealthy) {
      reply.code(503);
      return { status: 'degraded', ...health };
    }
    return { status: 'ok', ...health };
  });
});
/**
 * Probes Redis with a `PING` and reports overall health.
 *
 * `redis.latency` is the wall-clock duration of the probe in milliseconds,
 * measured whether the ping succeeded or failed. Reporting `0` for a failed
 * probe would be indistinguishable from an instant reply and would drag any
 * latency average down during an outage.
 *
 * @returns `isHealthy` (currently just the Redis result), the Redis probe
 *          details, and an ISO-8601 timestamp taken when the result is built.
 */
async function checkSystemHealth() {
  const probeStartedAt = Date.now();
  let redisHealthy = false;

  try {
    await redis.ping();
    redisHealthy = true;
  } catch (error) {
    console.error('Redis health check failed:', error);
  }

  // Measured after the try/catch so that failed probes report their real duration.
  const redisLatency = Date.now() - probeStartedAt;

  return {
    isHealthy: redisHealthy,
    redis: {
      healthy: redisHealthy,
      latency: redisLatency,
    },
    timestamp: new Date().toISOString(),
  };
}
// packages/kyo-core/src/services/logger/StructuredLogger.ts
import pino from 'pino';
import { CloudWatchLogsClient, PutLogEventsCommand } from '@aws-sdk/client-cloudwatch-logs';
/**
 * Structured fields attached to a log entry. The well-known correlation ids
 * are optional; any other key may be added with an arbitrary value.
 */
interface LogContext {
  /** Any additional structured field. */
  [key: string]: unknown;
  /** Correlation id of the request being handled. */
  requestId?: string;
  /** Tenant the entry belongs to. */
  tenantId?: string;
  /** User the entry belongs to. */
  userId?: string;
}
/**
 * JSON logger built on pino that can additionally forward every entry to
 * CloudWatch Logs.
 *
 * Forwarding is fire-and-forget: the public logging methods return
 * immediately and a failed forward is reported with `console.error` only.
 *
 * NOTE(review): nothing in this class creates the log stream. The stream name
 * is derived from `Date.now()` at construction time, so it cannot have been
 * provisioned in advance, and `PutLogEvents` requires an existing stream —
 * confirm how the stream gets created (e.g. add a `CreateLogStream` call).
 * NOTE(review): one `PutLogEvents` request is issued per log entry; consider
 * batching if log volume is high.
 */
export class StructuredLogger {
  private readonly logger: pino.Logger;
  private readonly cloudwatch?: CloudWatchLogsClient;
  private readonly logGroupName: string;
  private readonly logStreamName: string;

  /**
   * @param config.level            Minimum pino level to emit.
   * @param config.service          Added to every entry as `service`.
   * @param config.environment      Added to every entry as `environment`.
   * @param config.enableCloudWatch When true, entries are also sent to CloudWatch Logs.
   * @param config.logGroupName     Target log group; defaults to `/aws/kyo/<service>`.
   */
  constructor(config: {
    level: string;
    service: string;
    environment: string;
    enableCloudWatch?: boolean;
    logGroupName?: string;
  }) {
    this.logger = pino({
      level: config.level,
      formatters: {
        // Emit the level as its label (e.g. "info") instead of pino's numeric value.
        level: (label) => {
          return { level: label };
        },
      },
      base: {
        service: config.service,
        environment: config.environment,
      },
    });

    // Always initialised so the readonly fields are definitely assigned even
    // when CloudWatch forwarding is disabled.
    this.logGroupName = config.logGroupName || `/aws/kyo/${config.service}`;
    this.logStreamName = `${config.environment}-${Date.now()}`;

    if (config.enableCloudWatch) {
      this.cloudwatch = new CloudWatchLogsClient({
        region: process.env.AWS_REGION,
      });
    }
  }

  /** Logs an informational entry. */
  info(message: string, context?: LogContext): void {
    this.logger.info(context, message);
    void this.sendToCloudWatch('INFO', message, context);
  }

  /** Logs a warning entry. */
  warn(message: string, context?: LogContext): void {
    this.logger.warn(context, message);
    void this.sendToCloudWatch('WARN', message, context);
  }

  /**
   * Logs an error entry. The local log carries the error's message, stack and
   * name; only the error message is forwarded to CloudWatch.
   */
  error(message: string, error?: Error, context?: LogContext): void {
    this.logger.error(
      {
        ...context,
        error: {
          message: error?.message,
          stack: error?.stack,
          name: error?.name,
        },
      },
      message
    );
    void this.sendToCloudWatch('ERROR', message, {
      ...context,
      error: error?.message,
    });
  }

  /**
   * Logs an OTP event with the phone number masked.
   *
   * @param action  What happened to the OTP.
   * @param context Event details; `phone` is never logged in clear text.
   */
  logOTP(
    action: 'sent' | 'verified' | 'failed',
    context: {
      phone: string;
      tenantId: string;
      success?: boolean;
      reason?: string;
    }
  ): void {
    this.info(`OTP ${action}`, {
      ...context,
      phone: StructuredLogger.maskPhone(context.phone),
      action,
    });
  }

  /** Logs an audit entry (flagged with `auditLog: true`) for an action on a resource. */
  audit(
    action: string,
    resource: string,
    context: LogContext & {
      userId: string;
      tenantId: string;
      result: 'success' | 'failure';
    }
  ): void {
    this.info(`AUDIT: ${action} on ${resource}`, {
      ...context,
      auditLog: true,
      action,
      resource,
    });
  }

  /**
   * Masks a phone number for logging: the first four and last two characters
   * stay visible and everything between is replaced by `*`. Inputs too short
   * to hide at least four characters are masked completely, so a number in an
   * unexpected format is never logged in clear text.
   */
  private static maskPhone(phone: string): string {
    const visiblePrefix = 4;
    const visibleSuffix = 2;
    const hiddenCount = phone.length - visiblePrefix - visibleSuffix;
    if (hiddenCount < 4) {
      return '*'.repeat(phone.length);
    }
    return (
      phone.slice(0, visiblePrefix) +
      '*'.repeat(hiddenCount) +
      phone.slice(-visibleSuffix)
    );
  }

  /**
   * Forwards one entry to CloudWatch Logs. Callers do not await this, so
   * logging never blocks; every failure (serialization or transport) is
   * caught here and reported on the console.
   */
  private async sendToCloudWatch(
    level: string,
    message: string,
    context?: LogContext
  ): Promise<void> {
    if (!this.cloudwatch) return;

    try {
      const now = new Date();
      const command = new PutLogEventsCommand({
        logGroupName: this.logGroupName,
        logStreamName: this.logStreamName,
        logEvents: [
          {
            // Context is spread first so it cannot overwrite the reserved
            // `level`, `message` and `timestamp` fields used by log queries.
            message: JSON.stringify({
              ...context,
              level,
              message,
              timestamp: now.toISOString(),
            }),
            timestamp: now.getTime(),
          },
        ],
      });
      await this.cloudwatch.send(command);
    } catch (err) {
      // Best-effort: report and carry on, never affect the main business logic.
      console.error('Failed to send log to CloudWatch:', err);
    }
  }
}
/**
 * Shared logger for the OTP service. The level comes from `LOG_LEVEL`
 * (default `info`); CloudWatch forwarding is switched on only when
 * `NODE_ENV` is exactly `production`.
 */
export const logger = new StructuredLogger({
  service: 'kyo-otp-service',
  environment: process.env.NODE_ENV || 'development',
  level: process.env.LOG_LEVEL || 'info',
  logGroupName: '/aws/kyo/otp-service',
  enableCloudWatch: process.env.NODE_ENV === 'production',
});
// infrastructure/lib/alarms-stack.ts
import * as cdk from 'aws-cdk-lib';
import * as cloudwatch from 'aws-cdk-lib/aws-cloudwatch';
import * as cloudwatchActions from 'aws-cdk-lib/aws-cloudwatch-actions';
import * as sns from 'aws-cdk-lib/aws-sns';
import * as subscriptions from 'aws-cdk-lib/aws-sns-subscriptions';
import { Construct } from 'constructs';
/**
 * CDK stack that provisions the production CloudWatch alarms together with
 * the SNS topic (and its e-mail subscription) that every alarm notifies.
 *
 * NOTE(review): CloudWatch does not aggregate custom metrics across
 * dimensions. `CloudWatchMetrics.buildDimensions` attaches `Environment` and
 * `Service` to every datapoint, and most metrics add further dimensions
 * (`TenantId`, `Endpoint`, `StatusCode`, `ErrorType`, ...). The `Kyo/OTP`
 * alarms below that query without dimensions will therefore only see data if
 * a dimension-less copy of the metric is published as well — confirm, or
 * publish aggregate metrics for alarming.
 */
export class AlarmsStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    // SNS topic that receives every alarm notification.
    const alertTopic = new sns.Topic(this, 'AlertTopic', {
      topicName: 'kyo-production-alerts',
      displayName: 'Kyo Production Alerts',
    });

    // E-mail subscription for the operations team.
    alertTopic.addSubscription(
      new subscriptions.EmailSubscription('ops@kyong.com')
    );

    // High API error count: more than 50 errors per 5 minutes, twice in a row.
    const apiErrorAlarm = new cloudwatch.Alarm(this, 'HighAPIErrorRate', {
      alarmName: 'Kyo-High-API-Error-Rate',
      // The alarm evaluates an absolute count, not a percentage.
      alarmDescription: 'API error count exceeds 50 within 5 minutes',
      metric: new cloudwatch.Metric({
        namespace: 'Kyo/OTP',
        metricName: 'ErrorCount',
        statistic: 'Sum',
        period: cdk.Duration.minutes(5),
      }),
      threshold: 50,
      evaluationPeriods: 2,
      datapointsToAlarm: 2,
      treatMissingData: cloudwatch.TreatMissingData.NOT_BREACHING,
      comparisonOperator: cloudwatch.ComparisonOperator.GREATER_THAN_THRESHOLD,
    });
    apiErrorAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // High API latency: 5-minute average at or above 2 s in 2 of 3 periods.
    const apiLatencyAlarm = new cloudwatch.Alarm(this, 'HighAPILatency', {
      alarmName: 'Kyo-High-API-Latency',
      alarmDescription: 'API latency exceeds 2 seconds',
      metric: new cloudwatch.Metric({
        namespace: 'Kyo/OTP',
        metricName: 'APILatency',
        statistic: 'Average',
        period: cdk.Duration.minutes(5),
      }),
      threshold: 2000, // milliseconds
      evaluationPeriods: 3,
      datapointsToAlarm: 2,
    });
    apiLatencyAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // Redis health: any unhealthy report (value 0) in two consecutive minutes.
    const redisHealthAlarm = new cloudwatch.Alarm(this, 'RedisUnhealthy', {
      alarmName: 'Kyo-Redis-Unhealthy',
      alarmDescription: 'Redis health check failing',
      metric: new cloudwatch.Metric({
        namespace: 'Kyo/OTP',
        metricName: 'RedisHealth',
        // RedisHealth is published with exactly the publisher's default
        // dimensions, so the alarm has to name them to receive any data.
        // Assumes the production service runs with NODE_ENV=production — verify.
        dimensionsMap: {
          Environment: 'production',
          Service: 'kyo-otp-service',
        },
        statistic: 'Minimum',
        period: cdk.Duration.minutes(1),
      }),
      threshold: 1,
      evaluationPeriods: 2,
      comparisonOperator: cloudwatch.ComparisonOperator.LESS_THAN_THRESHOLD,
    });
    redisHealthAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // OTP verification failure rate at or above 20 % in two consecutive periods.
    const otpFailureAlarm = new cloudwatch.Alarm(this, 'HighOTPFailureRate', {
      alarmName: 'Kyo-High-OTP-Failure-Rate',
      alarmDescription: 'OTP verification failure rate exceeds 20%',
      metric: new cloudwatch.MathExpression({
        // FILL(…, 0) keeps the expression defined when only one of the two
        // series has data in a period. Without it a period in which every
        // verification fails (no success datapoints) produces no result and
        // the alarm cannot fire.
        expression:
          '(FILL(failures, 0) / (FILL(successes, 0) + FILL(failures, 0))) * 100',
        usingMetrics: {
          successes: new cloudwatch.Metric({
            namespace: 'Kyo/OTP',
            metricName: 'OTPVerifySuccess',
            statistic: 'Sum',
            period: cdk.Duration.minutes(5),
          }),
          failures: new cloudwatch.Metric({
            namespace: 'Kyo/OTP',
            metricName: 'OTPVerifyFailure',
            statistic: 'Sum',
            period: cdk.Duration.minutes(5),
          }),
        },
        period: cdk.Duration.minutes(5),
      }),
      threshold: 20, // percent
      evaluationPeriods: 2,
    });
    otpFailureAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // ECS service CPU: 5-minute average at or above 80 % in 2 of 3 periods.
    const ecsCpuAlarm = new cloudwatch.Alarm(this, 'HighECSCPU', {
      alarmName: 'Kyo-High-ECS-CPU',
      metric: new cloudwatch.Metric({
        namespace: 'AWS/ECS',
        metricName: 'CPUUtilization',
        dimensionsMap: {
          ServiceName: 'kyo-api-service',
          ClusterName: 'kyo-production-cluster',
        },
        statistic: 'Average',
        period: cdk.Duration.minutes(5),
      }),
      threshold: 80, // percent
      evaluationPeriods: 3,
      datapointsToAlarm: 2,
    });
    ecsCpuAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // RDS connection count.
    const rdsConnectionAlarm = new cloudwatch.Alarm(this, 'HighRDSConnections', {
      alarmName: 'Kyo-High-RDS-Connections',
      metric: new cloudwatch.Metric({
        namespace: 'AWS/RDS',
        metricName: 'DatabaseConnections',
        dimensionsMap: {
          DBInstanceIdentifier: 'kyo-master-db',
        },
        statistic: 'Maximum',
        period: cdk.Duration.minutes(5),
      }),
      threshold: 80, // assumes a connection limit of 100; alarm at 80
      evaluationPeriods: 2,
    });
    rdsConnectionAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));

    // Composite alarm: fires when any of the core component alarms is in ALARM.
    // A composite alarm is named through `compositeAlarmName`; its props have
    // no `alarmName` field.
    const systemHealthAlarm = new cloudwatch.CompositeAlarm(this, 'SystemUnhealthy', {
      compositeAlarmName: 'kyo-system-health',
      alarmDescription: 'Multiple system components are unhealthy',
      alarmRule: cloudwatch.AlarmRule.anyOf(
        cloudwatch.AlarmRule.fromAlarm(apiErrorAlarm, cloudwatch.AlarmState.ALARM),
        cloudwatch.AlarmRule.fromAlarm(redisHealthAlarm, cloudwatch.AlarmState.ALARM),
        cloudwatch.AlarmRule.fromAlarm(ecsCpuAlarm, cloudwatch.AlarmState.ALARM)
      ),
    });
    systemHealthAlarm.addAlarmAction(new cloudwatchActions.SnsAction(alertTopic));
  }
}
// infrastructure/lib/dashboard-stack.ts
/**
 * CDK stack that builds the `Kyo-Production-Overview` CloudWatch dashboard:
 * API traffic and latency, OTP funnel, ECS/Redis resources and recent error
 * logs.
 *
 * NOTE(review): the `Kyo/OTP` widgets query metrics without dimensions, while
 * `CloudWatchMetrics` publishes every datapoint with at least `Environment`
 * and `Service`. CloudWatch does not aggregate custom metrics across
 * dimensions — confirm these widgets actually receive data.
 */
export class DashboardStack extends cdk.Stack {
  constructor(scope: Construct, id: string, props?: cdk.StackProps) {
    super(scope, id, props);

    /** Builds a metric in the service's custom `Kyo/OTP` namespace. */
    const otpMetric = (
      metricName: string,
      statistic: string,
      extra: { label?: string; color?: string; period?: cdk.Duration } = {}
    ) =>
      new cloudwatch.Metric({
        namespace: 'Kyo/OTP',
        metricName,
        statistic,
        ...extra,
      });

    /** Builds an averaged `AWS/ECS` metric for the production API service. */
    const ecsMetric = (metricName: string, label: string) =>
      new cloudwatch.Metric({
        namespace: 'AWS/ECS',
        metricName,
        dimensionsMap: {
          ServiceName: 'kyo-api-service',
          ClusterName: 'kyo-production-cluster',
        },
        statistic: 'Average',
        label,
      });

    const dashboard = new cloudwatch.Dashboard(this, 'KyoDashboard', {
      dashboardName: 'Kyo-Production-Overview',
      defaultInterval: cdk.Duration.hours(1),
    });

    // Row 1: API metrics
    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'API Request Rate',
        left: [
          // One APILatency sample is recorded per request, so SampleCount is
          // the request count. The period is one minute so the value matches
          // the "Requests/min" label (the default period is five minutes).
          otpMetric('APILatency', 'SampleCount', {
            label: 'Requests/min',
            period: cdk.Duration.minutes(1),
          }),
        ],
        width: 12,
        height: 6,
      }),
      new cloudwatch.GraphWidget({
        title: 'API Latency (p50, p95, p99)',
        left: [
          otpMetric('APILatency', 'p50', { label: 'p50', color: '#1f77b4' }),
          otpMetric('APILatency', 'p95', { label: 'p95', color: '#ff7f0e' }),
          otpMetric('APILatency', 'p99', { label: 'p99', color: '#d62728' }),
        ],
        width: 12,
        height: 6,
      })
    );

    // Row 2: OTP metrics
    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'OTP Sent vs Verified',
        left: [
          otpMetric('OTPSent', 'Sum', { label: 'OTP Sent', color: '#2ca02c' }),
          otpMetric('OTPVerifySuccess', 'Sum', {
            label: 'Verified Successfully',
            color: '#1f77b4',
          }),
          otpMetric('OTPVerifyFailure', 'Sum', {
            label: 'Verification Failed',
            color: '#d62728',
          }),
        ],
        width: 12,
        height: 6,
      }),
      new cloudwatch.SingleValueWidget({
        title: 'Active Tenants',
        metrics: [otpMetric('ActiveTenants', 'Maximum')],
        width: 6,
        height: 6,
      }),
      new cloudwatch.SingleValueWidget({
        title: 'OTP Success Rate',
        metrics: [
          new cloudwatch.MathExpression({
            expression: '(success / (success + failure)) * 100',
            usingMetrics: {
              success: otpMetric('OTPVerifySuccess', 'Sum'),
              failure: otpMetric('OTPVerifyFailure', 'Sum'),
            },
            label: 'Success Rate (%)',
          }),
        ],
        width: 6,
        height: 6,
      })
    );

    // Row 3: system resources
    dashboard.addWidgets(
      new cloudwatch.GraphWidget({
        title: 'ECS CPU & Memory',
        left: [ecsMetric('CPUUtilization', 'CPU %')],
        right: [ecsMetric('MemoryUtilization', 'Memory %')],
        width: 12,
        height: 6,
      }),
      new cloudwatch.GraphWidget({
        title: 'Redis Latency',
        left: [
          otpMetric('RedisLatency', 'Average', { label: 'Average Latency (ms)' }),
        ],
        width: 12,
        height: 6,
      })
    );

    // Row 4: error tracking — the 20 most recent ERROR entries from the service log group
    dashboard.addWidgets(
      new cloudwatch.LogQueryWidget({
        title: 'Recent Errors',
        logGroupNames: ['/aws/kyo/otp-service'],
        queryLines: [
          'fields @timestamp, @message',
          'filter level = "ERROR"',
          'sort @timestamp desc',
          'limit 20',
        ],
        width: 24,
        height: 6,
      })
    );
  }
}
/**
 * Metric retention policy, expressed in seconds per data granularity:
 * 7 days of detailed data, 30 days of hourly data, 365 days of daily data.
 */
const metricRetentionPeriod = (() => {
  const secondsPerDay = 24 * 60 * 60;
  return {
    detailed: 7 * secondsPerDay,
    hourly: 30 * secondsPerDay,
    daily: 365 * secondsPerDay,
  };
})();
/**
 * Log retention policy per log category: application logs for one month,
 * audit logs for one year, access logs for one week.
 */
const logRetention = (() => {
  const { ONE_MONTH, ONE_YEAR, ONE_WEEK } = logs.RetentionDays;
  return {
    application: ONE_MONTH,
    audit: ONE_YEAR,
    access: ONE_WEEK,
  };
})();
我們今天建立了完整的 CloudWatch 監控系統: