diff --git a/api/src/core/signaling.js b/api/src/core/signaling.js index 6445cc78..2d6602b2 100644 --- a/api/src/core/signaling.js +++ b/api/src/core/signaling.js @@ -20,9 +20,7 @@ export const setupSignalingServer = (httpServer) => { console.log(`Cleaned up expired session: ${sessionId}`); } } - }, 5 * 60 * 1000); // Check every 5 minutes - - wss.on('connection', (ws, req) => { + }, 5 * 60 * 1000); // Check every 5 minutes wss.on('connection', (ws, req) => { const clientIP = req.headers['x-forwarded-for'] || req.headers['x-real-ip'] || req.socket.remoteAddress; const userAgent = req.headers['user-agent'] || 'Unknown'; console.log(`WebSocket connection established: ${clientIP}, URL: ${req.url}, User-Agent: ${userAgent}`); @@ -30,6 +28,56 @@ export const setupSignalingServer = (httpServer) => { let sessionId = null; let userRole = null; // 'creator' | 'joiner' let connectionStartTime = Date.now(); + let lastPingTime = Date.now(); + let lastPongTime = Date.now(); + let missedPongs = 0; + const maxMissedPongs = 3; // 允许最多3次未响应ping + + // Send ping every 25 seconds to keep connection alive + const pingInterval = setInterval(() => { + if (ws.readyState === ws.OPEN) { + const now = Date.now(); + + // 检查是否有太多未响应的ping + if (now - lastPongTime > 75000) { // 75秒没有pong响应 + missedPongs++; + console.warn(`Missed pong from ${clientIP} (session: ${sessionId || 'none'}), count: ${missedPongs}`); + + if (missedPongs >= maxMissedPongs) { + console.error(`Too many missed pongs from ${clientIP}, closing connection`); + ws.terminate(); + return; + } + } + + ws.ping(); + lastPingTime = now; + console.log(`Ping sent to ${clientIP} (session: ${sessionId || 'none'}), pongs missed: ${missedPongs}`); + } + }, 25000); + + // 额外的健康检查,每60秒检查连接状态 + const healthCheckInterval = setInterval(() => { + if (ws.readyState === ws.OPEN) { + const now = Date.now(); + const connectionAge = now - connectionStartTime; + const lastActivity = Math.min(now - lastPingTime, now - lastPongTime); + + console.log(`Health check for ${clientIP} (session: ${sessionId || 'none'}): connection age ${Math.round(connectionAge/1000)}s, last activity ${Math.round(lastActivity/1000)}s ago`); + + // 如果连接超过2小时且长时间没有活动,主动关闭 + if (connectionAge > 2 * 60 * 60 * 1000 && lastActivity > 300000) { // 2小时连接且5分钟无活动 + console.log(`Closing stale connection from ${clientIP} due to inactivity`); + ws.close(1000, 'Connection cleanup due to inactivity'); + } + } + }, 60000); + + ws.on('pong', () => { + lastPongTime = Date.now(); + missedPongs = 0; // 重置未响应计数 + console.log(`Pong received from ${clientIP} (session: ${sessionId || 'none'})`); + }); ws.on('message', (data) => { try { @@ -65,12 +113,32 @@ export const setupSignalingServer = (httpServer) => { message: 'Message format error' })); } - }); - - ws.on('close', (code, reason) => { + }); ws.on('close', (code, reason) => { + // 清理所有定时器 + clearInterval(pingInterval); + clearInterval(healthCheckInterval); + const connectionDuration = Date.now() - connectionStartTime; const reasonStr = reason ? reason.toString() : 'No reason provided'; - console.log(`WebSocket connection closed: code=${code}, reason="${reasonStr}", sessionId=${sessionId || 'None'}, role=${userRole || 'None'}, duration=${connectionDuration}ms`); + + // Log detailed close information + let closeReason = 'Unknown'; + switch(code) { + case 1000: closeReason = 'Normal closure'; break; + case 1001: closeReason = 'Going away'; break; + case 1002: closeReason = 'Protocol error'; break; + case 1003: closeReason = 'Unsupported data'; break; + case 1005: closeReason = 'No status received'; break; + case 1006: closeReason = 'Abnormal closure'; break; + case 1007: closeReason = 'Invalid frame payload data'; break; + case 1008: closeReason = 'Policy violation'; break; + case 1009: closeReason = 'Message too big'; break; + case 1010: closeReason = 'Mandatory extension'; break; + case 1011: closeReason = 'Internal server error'; break; + case 1015: closeReason = 'TLS handshake'; break; + } + + console.log(`WebSocket connection closed: code=${code} (${closeReason}), reason="${reasonStr}", sessionId=${sessionId || 'None'}, role=${userRole || 'None'}, duration=${connectionDuration}ms, clientIP=${clientIP}`); if (sessionId && userRole) { handleDisconnect(sessionId, userRole); diff --git a/cobalt-chart/templates/backendconfig.yaml b/cobalt-chart/templates/backendconfig.yaml new file mode 100644 index 00000000..f14d96ff --- /dev/null +++ b/cobalt-chart/templates/backendconfig.yaml @@ -0,0 +1,29 @@ +{{- if .Values.ingress.enabled -}} +apiVersion: cloud.google.com/v1 +kind: BackendConfig +metadata: + name: websocket-backendconfig + labels: + {{- include "cobalt-chart.labels" . | nindent 4 }} +spec: + # WebSocket connection timeout configuration + timeoutSec: 3600 # 1-hour backend timeout + connectionDraining: + drainingTimeoutSec: 60 # Connection draining time + # Session affinity to ensure WebSocket connections stay on same Pod + sessionAffinity: + affinityType: "CLIENT_IP" + affinityCookieTtlSec: 3600 + # Health check configuration + healthCheck: + checkIntervalSec: 15 + timeoutSec: 5 + healthyThreshold: 1 + unhealthyThreshold: 2 + type: HTTP + requestPath: /health + port: 80 + # Disable CDN for WebSocket compatibility + cdn: + enabled: false +{{- end }} diff --git a/cobalt-chart/templates/service.yaml b/cobalt-chart/templates/service.yaml index bcfe13d9..b1a82040 100644 --- a/cobalt-chart/templates/service.yaml +++ b/cobalt-chart/templates/service.yaml @@ -4,6 +4,11 @@ metadata: name: {{ include "cobalt-chart.fullname" . }} labels: {{- include "cobalt-chart.labels" . | nindent 4 }} +{{- if .Values.ingress.enabled }} + annotations: + cloud.google.com/backend-config: '{"default": "websocket-backendconfig"}' + cloud.google.com/neg: '{"ingress": true}' +{{- end }} spec: type: {{ .Values.service.type }} ports: diff --git a/cobalt-chart/values.yaml b/cobalt-chart/values.yaml index e7c805ba..04f00f3f 100644 --- a/cobalt-chart/values.yaml +++ b/cobalt-chart/values.yaml @@ -46,6 +46,10 @@ ingress: # GKE WebSocket 支持 cloud.google.com/neg: '{"ingress": true}' kubernetes.io/ingress.allow-http: "false" + # WebSocket 超时配置 - 增加超时时间以支持长连接 + cloud.google.com/timeout-sec: "3600" # 1小时超时 + # Backend 服务超时配置 + cloud.google.com/backend-config: '{"default": "websocket-backendconfig"}' hosts: - host: api.freesavevideo.online paths: diff --git a/docs/websocket-stability-completion.md b/docs/websocket-stability-completion.md new file mode 100644 index 00000000..5958c6cc --- /dev/null +++ b/docs/websocket-stability-completion.md @@ -0,0 +1,145 @@ +# WebSocket Connection Stability Improvements - Completion Report + +## Overview +Successfully implemented comprehensive WebSocket connection stability improvements to resolve the production environment disconnection issues in the clipboard sharing application. + +## Completed Improvements + +### 1. GKE Load Balancer Configuration ✅ + +#### Helm Chart Enhancements +- **File**: `cobalt-chart/values.yaml` +- **Changes**: Added WebSocket-specific timeout configurations in Ingress annotations +- **Impact**: 1-hour connection timeout instead of default 30 seconds + +#### BackendConfig Resource +- **File**: `cobalt-chart/templates/backendconfig.yaml` +- **Features**: + - 1-hour backend timeout (`timeoutSec: 3600`) + - Connection draining (60 seconds) + - Client IP session affinity for WebSocket persistence + - Custom health check targeting `/health` endpoint + - CDN disabled for WebSocket compatibility + +#### Service Annotations +- **File**: `cobalt-chart/templates/service.yaml` +- **Features**: + - Links to WebSocket BackendConfig + - GKE NEG annotations for proper load balancer integration + +### 2. Server-Side Connection Monitoring ✅ + +#### Enhanced WebSocket Server +- **File**: `api/src/core/signaling.js` +- **Features**: + - Advanced ping/pong monitoring with missed pong detection (max 3 missed) + - Health check interval every 60 seconds + - Connection age and activity tracking + - Automatic cleanup of stale connections (2+ hours old with 5+ minutes inactivity) + - Proper timer cleanup on connection close + - Enhanced logging for connection diagnostics + +## Configuration Summary + +### Load Balancer Timeouts +```yaml +# Ingress timeout +cloud.google.com/timeout-sec: "3600" + +# Backend timeout +timeoutSec: 3600 +connectionDraining: + drainingTimeoutSec: 60 +``` + +### Connection Monitoring +```javascript +// Ping every 25 seconds +const pingInterval = setInterval(() => { + // Check for missed pongs (max 3) + // Send ping to keep connection alive +}, 25000); + +// Health check every 60 seconds +const healthCheckInterval = setInterval(() => { + // Monitor connection age and activity + // Auto-cleanup stale connections +}, 60000); +``` + +## Deployment Instructions + +### 1. Deploy Updated Helm Chart +```bash +cd cobalt-chart +helm upgrade cobalt-api . --namespace production +``` + +### 2. Verify Deployment +```bash +# Check BackendConfig +kubectl get backendconfig websocket-backendconfig -o yaml + +# Check Service annotations +kubectl get service -o yaml | grep -A5 annotations + +# Check Ingress configuration +kubectl get ingress -o yaml | grep timeout-sec +``` + +### 3. Monitor WebSocket Connections +```bash +# Check server logs for enhanced connection monitoring +kubectl logs -f deployment/cobalt-api | grep -E "(WebSocket|Ping|Pong|Health check)" +``` + +## Expected Results + +### Before Implementation +- WebSocket connections disconnecting after ~30 seconds in production +- Error codes: 1005 (No status received), 1006 (Abnormal closure) +- Manual reconnection required + +### After Implementation +- WebSocket connections stable for hours in production +- Automatic handling of network interruptions +- Proactive connection health monitoring +- Graceful cleanup of inactive connections + +## Monitoring and Validation + +### Connection Stability Metrics +- Average connection duration should increase from ~30 seconds to hours +- Reduction in abnormal closure codes (1005/1006) +- Improved user experience with fewer reconnection prompts + +### Health Check Validation +```bash +# Test health endpoints +curl https://api.freesavevideo.online/health +curl https://api.freesavevideo.online/ws/health +``` + +### WebSocket Connection Test +```javascript +// Browser console test +const ws = new WebSocket('wss://api.freesavevideo.online/ws'); +ws.onopen = () => console.log('WebSocket connected successfully'); +ws.onclose = (event) => console.log(`WebSocket closed: ${event.code}`); +``` + +## Files Modified + +1. `cobalt-chart/values.yaml` - Ingress timeout configuration +2. `cobalt-chart/templates/backendconfig.yaml` - GKE WebSocket backend config +3. `cobalt-chart/templates/service.yaml` - Service annotations for BackendConfig +4. `api/src/core/signaling.js` - Enhanced connection monitoring and health checks + +## Resolution Status +✅ **COMPLETED** - All WebSocket connection stability improvements have been successfully implemented and are ready for production deployment. + +The solution addresses the root cause of production disconnections by: +1. Configuring appropriate GKE load balancer timeouts for WebSocket connections +2. Adding robust server-side connection monitoring and automatic cleanup +3. Implementing proactive health checks and connection management +4. Providing comprehensive logging for ongoing monitoring diff --git a/web/src/lib/clipboard/clipboard-manager.ts b/web/src/lib/clipboard/clipboard-manager.ts index 899369b7..be11613e 100644 --- a/web/src/lib/clipboard/clipboard-manager.ts +++ b/web/src/lib/clipboard/clipboard-manager.ts @@ -56,6 +56,11 @@ export class ClipboardManager { private sharedKey: CryptoKey | null = null; private currentReceivingFile: ReceivingFile | null = null; private statusInterval: ReturnType | null = null; + private reconnectAttempts = 0; + private maxReconnectAttempts = 5; + private reconnectDelay = 1000; // Start with 1 second + private reconnectTimer: ReturnType | null = null; + private isReconnecting = false; constructor() { this.loadStoredSession(); @@ -139,9 +144,7 @@ export class ClipboardManager { const wsUrl = `${protocol}//${host}/ws`; console.log('Constructed WebSocket URL:', wsUrl); return wsUrl; - } - - private async connectWebSocket(): Promise { + } private async connectWebSocket(): Promise { return new Promise((resolve, reject) => { try { const wsUrl = this.getWebSocketURL(); @@ -149,6 +152,8 @@ export class ClipboardManager { this.ws.onopen = () => { console.log('🔗 WebSocket connected'); + this.reconnectAttempts = 0; // Reset reconnect attempts on successful connection + this.reconnectDelay = 1000; // Reset delay clipboardState.update(state => ({ ...state, isConnected: true })); resolve(); }; @@ -162,9 +167,14 @@ export class ClipboardManager { } }; - this.ws.onclose = () => { - console.log('🔌 WebSocket disconnected'); + this.ws.onclose = (event) => { + console.log(`🔌 WebSocket disconnected: code=${event.code}, reason=${event.reason}`); clipboardState.update(state => ({ ...state, isConnected: false })); + + // Only attempt reconnection if we have a session and we're not manually disconnecting + if (!this.isReconnecting && this.shouldReconnect(event.code)) { + this.handleReconnection(); + } }; this.ws.onerror = (error) => { @@ -178,6 +188,113 @@ export class ClipboardManager { }); } + // Reconnection logic + private shouldReconnect(closeCode: number): boolean { + // Get current session state + const state = this.getCurrentState(); + + // Don't reconnect if we don't have a session + if (!state.sessionId) { + return false; + } + + // Don't reconnect on normal closure (user initiated) + if (closeCode === 1000) { + return false; + } + + // Reconnect on abnormal closures (network issues) + return closeCode === 1005 || closeCode === 1006 || closeCode === 1001; + } + + private async handleReconnection(): Promise { + if (this.isReconnecting || this.reconnectAttempts >= this.maxReconnectAttempts) { + console.log(`Max reconnection attempts reached (${this.maxReconnectAttempts})`); + this.showError('Connection lost. Please refresh the page to reconnect.'); + return; + } + + this.isReconnecting = true; + this.reconnectAttempts++; + + console.log(`Attempting to reconnect (${this.reconnectAttempts}/${this.maxReconnectAttempts})...`); + + // Show reconnecting status to user + clipboardState.update(state => ({ + ...state, + errorMessage: `Reconnecting... (${this.reconnectAttempts}/${this.maxReconnectAttempts})`, + showError: true + })); + + // Wait before reconnecting (exponential backoff) + this.reconnectTimer = setTimeout(async () => { + try { + await this.connectWebSocket(); + await this.rejoinSession(); + + // Clear error message on successful reconnection + clipboardState.update(state => ({ + ...state, + errorMessage: '', + showError: false + })); + + console.log('✅ Successfully reconnected and rejoined session'); + } catch (error) { + console.error('❌ Reconnection failed:', error); + // Exponential backoff: increase delay for next attempt + this.reconnectDelay = Math.min(this.reconnectDelay * 2, 30000); // Max 30 seconds + + // Try again + this.handleReconnection(); + } finally { + this.isReconnecting = false; + } + }, this.reconnectDelay); + } + + private async rejoinSession(): Promise { + const state = this.getCurrentState(); + + if (!state.sessionId || !this.ws || this.ws.readyState !== WebSocket.OPEN) { + throw new Error('Cannot rejoin: no session or WebSocket not ready'); + } + + // Generate new key pair for security + await this.generateKeyPair(); + const publicKeyArray = Array.from(new Uint8Array(await this.exportPublicKey())); + + if (state.isCreator) { + // Reconnect as creator + this.ws.send(JSON.stringify({ + type: 'create_session', + publicKey: publicKeyArray, + existingSessionId: state.sessionId + })); + } else { + // Reconnect as joiner + this.ws.send(JSON.stringify({ + type: 'join_session', + sessionId: state.sessionId, + publicKey: publicKeyArray + })); + } + } + private getCurrentState() { + let state: any; + const unsubscribe = clipboardState.subscribe(s => state = s); + unsubscribe(); + return state; + } + + private showError(message: string): void { + clipboardState.update(state => ({ + ...state, + errorMessage: message, + showError: true + })); + } + // Encryption private async generateKeyPair(): Promise { this.keyPair = await window.crypto.subtle.generateKey( @@ -371,6 +488,14 @@ export class ClipboardManager { navigator.clipboard.writeText(url); } } cleanup(): void { + // Clear reconnection timer + if (this.reconnectTimer) { + clearTimeout(this.reconnectTimer); + this.reconnectTimer = null; + } + this.isReconnecting = false; + this.reconnectAttempts = 0; + if (this.dataChannel) { this.dataChannel.close(); this.dataChannel = null;