今日目的:從模擬混沌到真實混沌,掌握 xk6-disruptor 的威力
理解 Kubernetes 原生混沌工程的精髓
在受控環境中釋放真正的「混沌之力」
傳統負載測試 vs 混沌工程
為什麼傳統壓測「太完美」是個問題?
你有沒有想過,為什麼生產環境總是出現測試時沒遇到的問題?
傳統壓測的盲點:
一般壓測結果可能如下圖
🤔 為什麼結果這麼「完美」?
昨天的測試結果顯示 100% 成功率,這其實暴露了一個問題:
xk6-disruptor 的革命性思維
為什麼需要多節點?
想像一下:
為什麼需要 LoadBalancer?
首先,要使用 LoadBalancer 就需要多節點環境,而且 KinD 能很方便地模擬這種多節點環境。
# kind-config.yaml - a small but realistic multi-node cluster: one control-plane node and two workers
# NOTE(review): the nesting indentation of this listing appears to have been lost; restore it before passing the file to kind.
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane # the "conductor"
kubeadmConfigPatches:
- |
kind: InitConfiguration
nodeRegistration:
kubeletExtraArgs:
node-labels: "ingress-ready=true"
extraPortMappings: # ports exposed to the host (80 and 443 over TCP)
- containerPort: 80
hostPort: 80
protocol: TCP
- containerPort: 443
hostPort: 443
protocol: TCP
- role: worker # "performer" 1
- role: worker # "performer" 2
# deployments/demo-app/service.yaml
# Three LoadBalancer Services, each exposing port 8080 and forwarding to targetPort 80.
# NOTE(review): the nesting indentation of this listing appears to have been lost; restore it before applying.
# demo-service: selects every pod labelled app=demo-app
apiVersion: v1
kind: Service
metadata:
name: demo-service
labels:
app: demo-app
spec:
selector:
app: demo-app
ports:
- name: http
port: 8080
targetPort: 80
type: LoadBalancer
---
# demo-service-v1: selects only pods labelled app=demo-app and version=v1
apiVersion: v1
kind: Service
metadata:
name: demo-service-v1
labels:
app: demo-app
version: v1
spec:
selector:
app: demo-app
version: v1
ports:
- name: http
port: 8080
targetPort: 80
type: LoadBalancer
---
# demo-service-v2: selects only pods labelled app=demo-app and version=v2
apiVersion: v1
kind: Service
metadata:
name: demo-service-v2
labels:
app: demo-app
version: v2
spec:
selector:
app: demo-app
version: v2
ports:
- name: http
port: 8080
targetPort: 80
type: LoadBalancer
但 KinD 的 Service 網路若要啟用 LoadBalancer 類型,則需要安裝並設定 MetalLB。
在雲端環境中,LoadBalancer 是理所當然的存在,但在本地環境中:
# Without MetalLB: the EXTERNAL-IP of the LoadBalancer service stays <pending>
kubectl get svc
NAME TYPE EXTERNAL-IP PORT(S)
demo-service LoadBalancer <pending> 8080:30123/TCP
# With MetalLB: the service is assigned an EXTERNAL-IP
kubectl get svc
NAME TYPE EXTERNAL-IP PORT(S)
demo-service LoadBalancer 172.18.255.1 8080:30123/TCP
# Install the xk6 command with the Go toolchain
go install go.k6.io/xk6/cmd/xk6@latest
# Build a k6 executable that includes the xk6-disruptor extension
xk6 build --with github.com/grafana/xk6-disruptor
執行後會在專案目錄產生一個名為 k6 的執行檔。
// pod-disruption.js - container-level fault injection (illustrative snippet)
// Not an ordinary load test: it deliberately breaks running pods.
// NOTE(review): `selector` is not defined and PodDisruptor is not imported in this snippet —
// presumably both come from the surrounding script; confirm.
// NOTE(review): the full script later in this file uses terminatePods() and injectHTTPFaults();
// confirm that injectPodKillFault() and injectStressFault() exist in the xk6-disruptor build in use.
export function disrupt() {
const podDisruptor = new PodDisruptor(selector);
// Kill pods to simulate a node failure (the request below asks for 2 pods)
podDisruptor.injectPodKillFault({
count: 2, // number of pods to hit
});
// CPU stress to simulate resource contention
podDisruptor.injectStressFault({
count: 1,
options: {
stressors: {
cpu: {
workers: 2,
load: 80, // 80% CPU load
},
},
duration: '30s',
},
});
}
這模擬了什麼真實場景?
// Staged chaos injection: the fault intensity ramps up and then eases off.
// Each tuple is [errorRate, delay]; every stage lasts the same 60 seconds.
const faultStages = [
  [0.1, '100ms'], // mild discomfort
  [0.3, '300ms'], // obvious symptoms
  [0.5, '500ms'], // severe illness
  [0.2, '200ms'], // recovery phase
].map(([errorRate, delay]) => ({ errorRate, delay, duration: '60s' }));
這就像餐廳的服務品質測試:
// Illustrative scenario timeline: steady load, a traffic spike, and two chaos phases
// that start at different offsets into the run.
// NOTE(review): pod_chaos and service_chaos only show startTime here; their executor/exec
// settings appear to be omitted for brevity — fill them in before running this.
export const options = {
scenarios: {
baseline_load: { // calm, everyday traffic
executor: 'constant-vus',
vus: 10,
duration: '10m',
},
spike_load: { // a sudden storm of traffic
executor: 'ramping-vus',
stages: [
{ duration: '30s', target: 50 }, // storm arrives: ramp to 50 VUs
{ duration: '1m', target: 50 }, // eye of the storm: hold at 50 VUs
{ duration: '30s', target: 0 }, // storm passes: ramp down to 0
],
startTime: '4m',
},
pod_chaos: { // pod-level disruption
startTime: '2m',
},
service_chaos: { // network/service-level disruption
startTime: '6m',
},
},
};
import http from 'k6/http';
import { check, sleep } from 'k6';
import { PodDisruptor } from 'k6/x/disruptor';
// Steady background traffic for the first minute of the run.
const baselineLoadScenario = {
  executor: 'constant-vus',
  vus: 3,
  duration: '1m',
  exec: 'baselineTest',
};

// A single iteration that injects the chaos, starting 30 seconds in.
const chaosInjectionScenario = {
  executor: 'shared-iterations',
  iterations: 1,
  vus: 1,
  exec: 'chaosTest',
  startTime: '30s',
};

// Post-fault recovery traffic; the start time is set so it begins after the baseline has ended.
const recoveryScenario = {
  executor: 'constant-vus',
  vus: 2,
  duration: '1m',
  exec: 'recoveryTest',
  startTime: '2m',
};

// k6 options: three scenarios plus pass/fail thresholds loosened for a chaos run.
export const options = {
  scenarios: {
    baseline_load: baselineLoadScenario,
    chaos_injection: chaosInjectionScenario,
    recovery_test: recoveryScenario,
  },
  thresholds: {
    http_req_duration: ['p(95)<5000'], // relaxed to 5 seconds
    http_req_failed: ['rate<0.2'], // tolerate up to 20% failed requests
    http_reqs: ['count>30'], // minimum number of requests
  },
};
// Runtime configuration. Each value can be overridden through a k6 environment
// variable of the same name; an unset or empty variable falls back to the default.
// The typeof guard only matters when this file is loaded outside k6 (linting, unit tests),
// where the __ENV global does not exist.
const ENV_VARS = typeof __ENV === 'undefined' ? {} : __ENV;
const TARGET_URL = ENV_VARS.TARGET_URL || 'http://172.18.255.202:8080';
const NAMESPACE = ENV_VARS.NAMESPACE || 'default';
const CHAOS_DURATION = ENV_VARS.CHAOS_DURATION || '30s';

// Fault profiles selectable through FAULT_LEVEL.
const DEFAULT_FAULT_LEVEL = 'light';
const FAULT_CONFIGS = {
  light: {
    averageDelay: '100ms',
    errorRate: 0.1,
    errorCode: 500,
  },
  medium: {
    averageDelay: '300ms',
    errorRate: 0.2,
    errorCode: 503,
  },
  heavy: {
    averageDelay: '1s',
    errorRate: 0.3,
    errorCode: 500,
  },
};

/**
 * Maps a requested fault level onto a key of FAULT_CONFIGS.
 *
 * An unknown level used to leave `currentFault` undefined, which only surfaced
 * later as "cannot read properties of undefined". It now warns and falls back
 * to the default level so the run stays usable and the mistake stays visible.
 *
 * @param {string|undefined} requested - Value of the FAULT_LEVEL variable.
 * @returns {string} A key that is guaranteed to exist in FAULT_CONFIGS.
 */
function resolveFaultLevel(requested) {
  if (!requested) {
    return DEFAULT_FAULT_LEVEL;
  }
  // Own-property check so inherited names such as "toString" are rejected too.
  if (Object.prototype.hasOwnProperty.call(FAULT_CONFIGS, requested)) {
    return requested;
  }
  const known = Object.keys(FAULT_CONFIGS).join(', ');
  console.warn(`Unknown FAULT_LEVEL "${requested}" (expected one of: ${known}); falling back to "${DEFAULT_FAULT_LEVEL}"`);
  return DEFAULT_FAULT_LEVEL;
}

const FAULT_LEVEL = resolveFaultLevel(ENV_VARS.FAULT_LEVEL);
const currentFault = FAULT_CONFIGS[FAULT_LEVEL];
// Per-phase request statistics: incremented by the scenario functions and printed by teardown().
// NOTE(review): the sample run shown later in this file has teardown() reporting
// "Total Requests: 0" although k6 itself counted 281 requests — presumably every k6 VU
// (and teardown) works on its own copy of this module-level object. Confirm, and consider
// k6 custom metrics for cross-VU totals.
let stats = {
baseline: { requests: 0, failures: 0, totalDuration: 0 },
chaos: { requests: 0, failures: 0, totalDuration: 0 },
recovery: { requests: 0, failures: 0, totalDuration: 0 },
};
/**
 * k6 setup hook: prints the effective configuration, then probes TARGET_URL
 * five times (half a second apart) and summarises how many probes returned 200.
 *
 * @returns {{healthy: boolean, healthPercentage: number, startTime: number}}
 *   `healthy` is true when at least 3 of the 5 probes succeeded; `startTime`
 *   is the wall-clock time (ms since epoch) at which setup finished.
 */
export function setup() {
  const banner = [
    '🎭 === OPTIMIZED K6 CHAOS ENGINEERING ===',
    `🎯 Target URL: ${TARGET_URL}`,
    `☸️ Kubernetes Namespace: ${NAMESPACE}`,
    `⏱️ Chaos Duration: ${CHAOS_DURATION}`,
    `💥 Fault Level: ${FAULT_LEVEL}`,
    // Health-check section header
    '\n🏥 === HEALTH CHECK ===',
  ];
  banner.forEach((line) => console.log(line));

  const totalChecks = 5;
  let healthyChecks = 0;
  let attempt = 1;
  while (attempt <= totalChecks) {
    const label = `Health check ${attempt}/${totalChecks}`;
    try {
      const probe = http.get(TARGET_URL, { timeout: '10s' });
      if (probe.status !== 200) {
        console.log(`⚠️ ${label}: Status ${probe.status}`);
      } else {
        healthyChecks += 1;
        console.log(`✅ ${label}: OK (${Math.round(probe.timings.duration)}ms)`);
      }
    } catch (error) {
      console.log(`❌ ${label}: Failed - ${error.message}`);
    }
    sleep(0.5);
    attempt += 1;
  }

  const healthPercentage = (healthyChecks / totalChecks) * 100;
  console.log(`📊 Overall Health: ${healthPercentage}% (${healthyChecks}/${totalChecks})`);

  const healthy = healthyChecks >= 3;
  return { healthy, healthPercentage, startTime: Date.now() };
}
/**
 * Baseline scenario iteration: sends one GET to TARGET_URL, records it in
 * `stats.baseline`, runs the baseline checks, then pauses for one second.
 * A request that throws is counted as a failure and logged.
 */
export function baselineTest() {
  const requestStartedAt = Date.now();
  try {
    const res = http.get(TARGET_URL, { timeout: '10s' });
    const elapsedMs = Date.now() - requestStartedAt;

    const counters = stats.baseline;
    counters.requests += 1;
    counters.totalDuration += elapsedMs;

    const baselineChecks = {
      'baseline: status is 200': (r) => r.status === 200,
      'baseline: response time < 3000ms': (r) => r.timings.duration < 3000,
      'baseline: has content': (r) => r.body && r.body.length > 0,
    };
    const passed = check(res, baselineChecks);
    if (!passed) {
      counters.failures += 1;
    }
  } catch (error) {
    stats.baseline.failures += 1;
    console.log(`❌ Baseline request failed: ${error.message}`);
  }
  sleep(1);
}
/**
 * Converts a duration string into the number of probe requests to send:
 * one request per `intervalSeconds` of duration, rounded up, never fewer than `minimum`.
 *
 * Accepts a bare number of seconds ("30") or one or more value+unit parts using
 * ms, s, m or h ("30s", "1m", "1m30s", "500ms"). Anything else yields `minimum`.
 * The previous `parseInt(d.replace('s', '')) / 5` treated "1m" as 1 second and
 * "500ms" as 500 seconds, produced fractional counts, and sent no requests at
 * all for unparsable input.
 *
 * @param {string} duration - Duration text such as CHAOS_DURATION.
 * @param {number} [intervalSeconds=5] - Seconds of chaos covered by one request.
 * @param {number} [minimum=3] - Lower bound on the returned count.
 * @returns {number} Whole number of requests to send.
 */
function chaosRequestCount(duration, intervalSeconds = 5, minimum = 3) {
  const text = String(duration).trim();
  let seconds = 0;
  if (/^\d+(\.\d+)?$/.test(text)) {
    // Bare number: keep the old behaviour of reading it as seconds.
    seconds = Number(text);
  } else if (/^(\d+(\.\d+)?(ms|s|m|h))+$/.test(text)) {
    const unitSeconds = { ms: 0.001, s: 1, m: 60, h: 3600 };
    const part = /(\d+(?:\.\d+)?)(ms|s|m|h)/g;
    let match = part.exec(text);
    while (match !== null) {
      seconds += Number(match[1]) * unitSeconds[match[2]];
      match = part.exec(text);
    }
  } else {
    return minimum;
  }
  return Math.max(minimum, Math.ceil(seconds / intervalSeconds));
}

/**
 * Helper used by the chaos strategies: sends a series of GET requests to
 * TARGET_URL (3 seconds apart), records them in `stats.chaos`, and logs each outcome.
 * The number of requests is derived from CHAOS_DURATION.
 *
 * @param {string} strategy - Label used in check names and log lines.
 */
function testDuringChaos(strategy) {
  const testCount = chaosRequestCount(CHAOS_DURATION);
  for (let i = 0; i < testCount; i++) {
    const startTime = Date.now();
    try {
      const response = http.get(TARGET_URL, { timeout: '15s' });
      const duration = Date.now() - startTime;
      stats.chaos.requests++;
      stats.chaos.totalDuration += duration;
      const success = check(response, {
        [`chaos-${strategy}: response received`]: (r) => r.status > 0,
        [`chaos-${strategy}: not timeout`]: (r) => r.timings.duration < 15000,
      });
      // A 5xx answer passes the checks above but still counts as a failure here.
      if (!success || response.status >= 500) {
        stats.chaos.failures++;
      }
      console.log(` [${strategy}] Request ${i + 1}/${testCount}: Status=${response.status}, Duration=${Math.round(response.timings.duration)}ms`);
    } catch (error) {
      stats.chaos.failures++;
      console.log(` [${strategy}] Request ${i + 1}/${testCount}: Failed - ${error.message}`);
    }
    sleep(3);
  }
}
/**
 * Manual chaos simulation (last-resort fallback when no fault could be injected).
 *
 * Sends 5 requests to TARGET_URL. Before each request it sleeps a random 50-350 ms
 * to imitate network latency, and it marks a random share of responses
 * (`currentFault.errorRate`) as simulated errors. Results go into `stats.chaos`.
 *
 * The log now distinguishes a genuinely failed request from a simulated error;
 * previously every non-OK outcome was reported as "Simulated error".
 */
function manualChaosTest() {
  console.log('🔧 Performing manual chaos simulation...');
  const testCount = 5;
  for (let i = 0; i < testCount; i++) {
    const startTime = Date.now();
    try {
      // Simulated network latency: 50-350 ms
      const randomDelay = Math.random() * 300 + 50;
      sleep(randomDelay / 1000);
      const response = http.get(TARGET_URL, { timeout: '10s' });
      const duration = Date.now() - startTime;
      stats.chaos.requests++;
      stats.chaos.totalDuration += duration;
      // Simulated random error
      const simulatedError = Math.random() < currentFault.errorRate;
      // The check already fails on a simulated error, so `success` alone decides the outcome.
      const success = check(response, {
        'manual-chaos: response ok': (r) => r.status === 200 && !simulatedError,
      });
      if (success) {
        console.log(` Manual test ${i + 1}/${testCount}: OK (${Math.round(duration)}ms)`);
      } else {
        stats.chaos.failures++;
        if (response.status !== 200) {
          console.log(` Manual test ${i + 1}/${testCount}: Request failed (Status ${response.status})`);
        } else {
          console.log(` Manual test ${i + 1}/${testCount}: Simulated error`);
        }
      }
    } catch (error) {
      stats.chaos.failures++;
      console.log(` Manual test ${i + 1}/${testCount}: Failed - ${error.message}`);
    }
    sleep(2);
  }
}
/**
 * Strategy 1: terminate two pods labelled app=demo-app, version=v2, then probe the target.
 *
 * @returns {boolean} true once terminatePods() has returned without throwing,
 *   even if the probing that follows fails.
 */
function attemptPodTermination() {
  console.log('\n🎯 Strategy 1: Pod Termination');
  let injected = false;
  try {
    const v2Pods = {
      namespace: NAMESPACE,
      select: {
        labels: {
          app: 'demo-app',
          version: 'v2',
        },
      },
    };
    const disruptor = new PodDisruptor(v2Pods);
    console.log('✅ PodDisruptor created for V2 pods');
    // Ask for 2 pods to be terminated
    disruptor.terminatePods({ count: 2 });
    console.log('💥 Pod termination injected (2 V2 pod)');
    injected = true;
    // Probe the service while it deals with the terminated pods
    testDuringChaos('pod-termination');
  } catch (error) {
    console.log(`⚠️ Pod termination failed: ${error.message}`);
  }
  return injected;
}

/**
 * Strategy 2: inject HTTP delay/error faults into every app=demo-app pod, then probe the target.
 * NOTE(review): if injectHTTPFaults() blocks for CHAOS_DURATION, the probes below run after
 * the fault has ended rather than during it — confirm against the xk6-disruptor docs.
 *
 * @returns {boolean} true once injectHTTPFaults() has returned without throwing.
 */
function attemptHttpFaultInjection() {
  console.log('\n🎯 Strategy 2: HTTP Fault Injection');
  let injected = false;
  try {
    const disruptor = new PodDisruptor({
      namespace: NAMESPACE,
      select: { labels: { app: 'demo-app' } },
    });
    const fault = {
      averageDelay: currentFault.averageDelay,
      errorRate: currentFault.errorRate,
      errorCode: currentFault.errorCode,
    };
    disruptor.injectHTTPFaults(fault, CHAOS_DURATION);
    console.log(`💉 HTTP faults injected: ${currentFault.averageDelay} delay, ${currentFault.errorRate*100}% error rate`);
    injected = true;
    testDuringChaos('http-faults');
  } catch (error) {
    console.log(`⚠️ HTTP fault injection failed: ${error.message}`);
  }
  return injected;
}

/**
 * Chaos scenario entry point. Tries pod termination first, falls back to HTTP
 * fault injection, and finally to the manual simulation if neither fault could
 * be injected. Always ends with a 10-second stabilisation pause.
 */
export function chaosTest() {
  console.log('\n🔥 === STARTING CHAOS INJECTION ===');

  // Short-circuit: strategy 2 is attempted only when strategy 1 did not inject anything.
  const injected = attemptPodTermination() || attemptHttpFaultInjection();

  if (!injected) {
    // Strategy 3: manual simulation as the last resort
    console.log('\n🎯 Strategy 3: Manual Chaos Simulation');
    manualChaosTest();
  }

  console.log('\n⏳ Waiting for system stabilization...');
  sleep(10);
}
/**
 * Recovery scenario iteration: one GET to TARGET_URL after the chaos phase,
 * recorded in `stats.recovery` and verified with the recovery checks,
 * followed by a one-second pause. A request that throws counts as a failure.
 */
export function recoveryTest() {
  const sentAt = Date.now();

  const isOk = (r) => r.status === 200;
  const isFastEnough = (r) => r.timings.duration < 5000;
  const hasBody = (r) => r.body && r.body.length > 0;

  try {
    const reply = http.get(TARGET_URL, { timeout: '10s' });
    const tookMs = Date.now() - sentAt;
    stats.recovery.requests += 1;
    stats.recovery.totalDuration += tookMs;

    const recovered = check(reply, {
      'recovery: status is 200': isOk,
      'recovery: response time < 5000ms': isFastEnough,
      'recovery: service recovered': hasBody,
    });
    if (recovered) {
      // Nothing to record for a fully passing request.
    } else {
      stats.recovery.failures += 1;
    }
  } catch (error) {
    stats.recovery.failures += 1;
    console.log(`❌ Recovery request failed: ${error.message}`);
  }

  sleep(1);
}
/**
 * Maps an overall success rate onto a letter grade.
 *
 * @param {number|null} successRate - Percentage in the range 0-100, or null
 *   when no requests were recorded.
 * @returns {string} 'A+' (>= 95), 'A' (>= 90), 'B+' (>= 85), 'B' (>= 80),
 *   'C' (>= 70), 'D' (>= 60), 'F' otherwise, or 'N/A' for null.
 */
function resilienceGrade(successRate) {
  if (successRate === null) {
    return 'N/A';
  }
  const bands = [
    [95, 'A+'],
    [90, 'A'],
    [85, 'B+'],
    [80, 'B'],
    [70, 'C'],
    [60, 'D'],
  ];
  for (const [minimum, grade] of bands) {
    if (successRate >= minimum) {
      return grade;
    }
  }
  return 'F';
}

/**
 * k6 teardown hook: prints a per-phase report and an overall resilience grade
 * based on the counters in `stats`.
 *
 * When no requests were recorded, the report now says so and prints "N/A"
 * instead of a misleading "0%" and grade "F". The sample run in this file
 * shows exactly that case: teardown saw 0 requests although k6 counted 281.
 * NOTE(review): that is presumably because teardown does not share module-level
 * state with the VUs; moving the counters to k6 custom metrics would give real totals.
 *
 * @param {object} data - Value returned by setup(); not used here.
 */
export function teardown(data) {
  console.log('\n📊 === FINAL CHAOS TEST REPORT ===');

  // Per-phase statistics (phases without any recorded request are skipped)
  const phases = ['baseline', 'chaos', 'recovery'];
  phases.forEach((phase) => {
    const stat = stats[phase];
    if (stat.requests > 0) {
      const successRate = ((stat.requests - stat.failures) / stat.requests * 100).toFixed(1);
      const avgDuration = (stat.totalDuration / stat.requests).toFixed(0);
      console.log(`\n${phase.toUpperCase()} PHASE:`);
      console.log(` 📈 Requests: ${stat.requests}`);
      console.log(` ✅ Success Rate: ${successRate}%`);
      console.log(` ⏱️ Avg Duration: ${avgDuration}ms`);
      console.log(` ❌ Failures: ${stat.failures}`);
    }
  });

  // Overall resilience assessment
  const totalRequests = stats.baseline.requests + stats.chaos.requests + stats.recovery.requests;
  const totalFailures = stats.baseline.failures + stats.chaos.failures + stats.recovery.failures;
  // Kept as a number rounded to one decimal, so the grade always matches the printed value.
  const overallSuccessRate = totalRequests > 0
    ? Number(((totalRequests - totalFailures) / totalRequests * 100).toFixed(1))
    : null;

  console.log(`\n🎯 OVERALL RESILIENCE SCORE:`);
  console.log(` 📊 Total Requests: ${totalRequests}`);
  if (overallSuccessRate === null) {
    console.log(' ⚠️ No request statistics were recorded here; see the k6 end-of-test summary for the actual results.');
    console.log(' ✅ Overall Success: N/A');
  } else {
    console.log(` ✅ Overall Success: ${overallSuccessRate.toFixed(1)}%`);
  }
  console.log(` 🏆 Resilience Grade: ${resilienceGrade(overallSuccessRate)}`);
  console.log('\n🎉 Chaos Engineering Test Completed!');
}
Result:
./k6 run scripts/advanced-scenarios.js
/\ Grafana /‾‾/
/\ / \ |\ __ / /
/ \/ \ | |/ / / ‾‾\
/ \ | ( | (‾) |
/ __________ \ |_|\_\ \_____/
WARN[0000] The configuration file has been found on the old default path ("/home/nathan/.config/loadimpact/k6/config.json"). Please, run again `k6 cloud login` or `k6 login` commands to migrate to the new default path.
execution: local
script: scripts/advanced-scenarios2.js
output: -
scenarios: (100.00%) 3 scenarios, 4 max VUs, 11m0s max duration (incl. graceful stop):
* baseline_load: 3 looping VUs for 1m0s (exec: baselineTest, gracefulStop: 30s)
* chaos_injection: 1 iterations shared among 1 VUs (maxDuration: 10m0s, exec: chaosTest, startTime: 30s, gracefulStop: 30s)
* recovery_test: 2 looping VUs for 1m0s (exec: recoveryTest, startTime: 2m0s, gracefulStop: 30s)
INFO[0000] 🎭 === OPTIMIZED K6 CHAOS ENGINEERING === source=console
INFO[0000] 🎯 Target URL: http://172.18.255.202:8080 source=console
INFO[0000] ☸️ Kubernetes Namespace: default source=console
INFO[0000] ⏱️ Chaos Duration: 30s source=console
INFO[0000] 💥 Fault Level: heavy source=console
INFO[0000]
🏥 === HEALTH CHECK === source=console
INFO[0000] ✅ Health check 1/5: OK (0ms) source=console
INFO[0000] ✅ Health check 2/5: OK (0ms) source=console
INFO[0001] ✅ Health check 3/5: OK (0ms) source=console
INFO[0001] ✅ Health check 4/5: OK (0ms) source=console
INFO[0002] ✅ Health check 5/5: OK (0ms) source=console
INFO[0002] 📊 Overall Health: 100% (5/5) source=console
INFO[0032]
🔥 === STARTING CHAOS INJECTION === source=console
INFO[0032]
🎯 Strategy 1: Pod Termination source=console
INFO[0032] ✅ PodDisruptor created for V2 pods source=console
WARN[0032] Request Failed error="Get \"http://172.18.255.202:8080\": dial tcp 172.18.255.202:8080: connect: connection refused"
WARN[0032] Request Failed error="Get \"http://172.18.255.202:8080\": dial tcp 172.18.255.202:8080: connect: connection refused"
WARN[0032] Request Failed error="Get \"http://172.18.255.202:8080\": dial tcp 172.18.255.202:8080: connect: connection refused"
INFO[0033] 💥 Pod termination injected (2 V2 pod) source=console
WARN[0043] Request Failed error="Get \"http://172.18.255.202:8080\": request timeout"
WARN[0043] Request Failed error="Get \"http://172.18.255.202:8080\": request timeout"
WARN[0043] Request Failed error="Get \"http://172.18.255.202:8080\": request timeout"
WARN[0048] Request Failed error="Get \"http://172.18.255.202:8080\": request timeout"
INFO[0048] [pod-termination] Request 1/6: Status=0, Duration=0ms source=console
INFO[0051] [pod-termination] Request 2/6: Status=200, Duration=0ms source=console
INFO[0054] [pod-termination] Request 3/6: Status=200, Duration=0ms source=console
INFO[0057] [pod-termination] Request 4/6: Status=200, Duration=0ms source=console
INFO[0060] [pod-termination] Request 5/6: Status=200, Duration=0ms source=console
INFO[0063] [pod-termination] Request 6/6: Status=200, Duration=0ms source=console
INFO[0066]
⏳ Waiting for system stabilization... source=console
INFO[0182]
📊 === FINAL CHAOS TEST REPORT === source=console
INFO[0182]
🎯 OVERALL RESILIENCE SCORE: source=console
INFO[0182] 📊 Total Requests: 0 source=console
INFO[0182] ✅ Overall Success: 0% source=console
INFO[0182] 🏆 Resilience Grade: F source=console
INFO[0182]
🎉 Chaos Engineering Test Completed! source=console
█ THRESHOLDS
http_req_duration
✓ 'p(95)<5000' p(95)=452.95µs
http_req_failed
✓ 'rate<0.2' rate=2.49%
http_reqs
✓ 'count>30' count=281
█ TOTAL RESULTS
checks_total.......: 822 4.50214/s
checks_succeeded...: 98.41% 809 out of 822
checks_failed......: 1.58% 13 out of 822
✗ baseline: status is 200
↳ 96% — ✓ 144 / ✗ 6
✓ baseline: response time < 3000ms
✗ baseline: has content
↳ 96% — ✓ 144 / ✗ 6
✗ chaos-pod-termination: response received
↳ 83% — ✓ 5 / ✗ 1
✓ chaos-pod-termination: not timeout
✓ recovery: status is 200
✓ recovery: response time < 5000ms
✓ recovery: service recovered
HTTP
http_req_duration..............: avg=330.92µs min=0s med=324.02µs max=756.21µs p(90)=390.2µs p(95)=452.95µs
{ expected_response:true }...: avg=339.37µs min=215.32µs med=325.08µs max=756.21µs p(90)=394.99µs p(95)=465.44µs
http_req_failed................: 2.49% 7 out of 281
http_reqs......................: 281 1.539053/s
EXECUTION
iteration_duration.............: avg=1.26s min=1s med=1s max=43.78s p(90)=1s p(95)=1s
iterations.....................: 271 1.484282/s
vus............................: 2 min=0 max=4
vus_max........................: 4 min=4 max=4
NETWORK
data_received..................: 234 kB 1.3 kB/s
data_sent......................: 21 kB 113 B/s
running (03m02.6s), 0/4 VUs, 271 complete and 0 interrupted iterations
baseline_load ✓ [====================================] 3 VUs 1m0s
chaos_injection ✓ [====================================] 1 VUs 00m43.8s/10m0s 1/1 shared iters
recovery_test ✓ [====================================] 2 VUs 1m0s
📊 測試結果的深度解讀
從「完美」到「真實」的轉變
之前的壓測結果:
✅ 100% 成功率
✅ 平均響應時間 < 100ms
✅ 沒有任何錯誤
今天的混沌測試結果:
✅ 檢查(checks)總體通過率:98.41% (809/822)
✅ HTTP 失敗率:僅 2.49% (7/281)
✅ 基線測試:96% 成功率 (144/150)
✅ 混沌測試:83% 成功率 (5/6)
✅ 恢復測試:100% 成功率
這些數字告訴我們什麼?
系統韌性評估:
真實用戶體驗:
xk6-disruptor 不只是一個測試工具,它更是一種哲學思維:
擁抱不確定性 🌊
承認故障是必然的
主動尋找系統弱點
在可控環境中學習
建立信心 💪
通過混沌測試驗證系統韌性
讓團隊對系統有真實的了解
在真正的故障來臨時不會驚慌
持續改進 🔄
每次測試都是學習機會
將發現的問題轉化為改進點
建立更強大的系統架構
記住:混沌工程的目標不是破壞系統,而是讓系統在混沌中依然能夠優雅地運行 ✨
就像武俠小說中的高手,不是因為從未遇到過強敵,而是因為在無數次的生死搏鬥中磨練出了絕世武功。你的微服務架構,也需要經歷這樣的「混沌試煉」才能真正成為可靠的系統!