feat: Add AI differentiators

🤖 OPHION Copilot - Interactive AI assistant for troubleshooting - Context-aware conversations - Actionable suggestions with commands 🔧 Auto-Healing - AI-powered incident analysis - Automatic remediation plans - Safe execution with dry-run mode 🚨 Smart Alerts - Noise reduction - Alert correlation - Root cause analysis - Impact assessment 📊 AI Insights - Daily insights generation - Security anomaly detection - Cost optimization suggestions - Capacity predictions - Executive reports 🛡️ Security - Behavioral anomaly detection - Intrusion attempt identification - Compliance monitoring
2026-02-05 22:48:10 -03:00
parent d58ac37e39
commit 369373b387
5 changed files with 1473 additions and 0 deletions
--- a/internal/ai/autohealing.go
+++ b/internal/ai/autohealing.go
@@ -0,0 +1,278 @@
+package ai
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"log"
+	"time"
+)
+
+// AutoHealer is the intelligent auto-remediation system: Analyze asks the AI
+// engine for a healing plan and Execute runs the actions of such a plan.
+//
+// Fields:
+//   - engine: AI backend whose chat method Analyze calls.
+//   - enabled: when false, Analyze refuses to run.
+//   - dryRun: when true, Execute returns without executing any action.
+//   - maxActions: copied from AutoHealConfig.MaxActionsPerHour.
+//     NOTE(review): not read anywhere in the code visible in this file.
+//   - cooldown: minimum time that must elapse between two actions on the
+//     same target.
+//   - lastActions: action target -> time the last action on it was run.
+//
+// NOTE(review): lastActions is a plain map with no locking visible here;
+// confirm whether an AutoHealer is ever shared between goroutines.
+type AutoHealer struct {
+	engine      *AIEngine
+	enabled     bool
+	dryRun      bool
+	maxActions  int
+	cooldown    time.Duration
+	lastActions map[string]time.Time
+}
+
+// HealingAction is a single remediation step of a HealingPlan, decoded from
+// the AI engine's JSON reply.
+//
+// Fields:
+//   - ID: identifier of the action inside its plan (e.g. "action_1").
+//   - Type: kind of action; executeAction dispatches on this value.
+//   - Target: service or resource the action applies to; also the key used
+//     for cooldown tracking.
+//   - Command: command to run for this action.
+//   - Description: human-readable explanation of what the action does.
+//   - Risk: risk level requested from the model ("low", "medium" or "high").
+//   - Executed, ExecutedAt, Result, Error: execution bookkeeping.
+//     NOTE(review): none of these four are written by the code visible in
+//     this file; outcomes are reported through ActionResult instead.
+//
+// NOTE(review): encoding/json's omitempty never omits a struct value, so
+// "executed_at" is still emitted when ExecutedAt is the zero time.
+type HealingAction struct {
+	ID          string    `json:"id"`
+	Type        string    `json:"type"`
+	Target      string    `json:"target"`
+	Command     string    `json:"command"`
+	Description string    `json:"description"`
+	Risk        string    `json:"risk"`
+	Executed    bool      `json:"executed"`
+	ExecutedAt  time.Time `json:"executed_at,omitempty"`
+	Result      string    `json:"result,omitempty"`
+	Error       string    `json:"error,omitempty"`
+}
+
+// HealingPlan is the remediation plan generated by the AI engine. Its JSON
+// tags match the reply format requested in Analyze's prompt.
+//
+// Fields:
+//   - Issue: concise description of the problem.
+//   - Severity: "low", "medium", "high" or "critical", as requested from the model.
+//   - Analysis: detailed root-cause analysis.
+//   - Actions: remediation steps, in the order Execute runs them.
+//   - Confidence: model-reported confidence, requested in the range 0.0-1.0.
+//   - RequiresApproval: whether the model asks for human approval.
+//     NOTE(review): Execute in this file does not check this flag.
+type HealingPlan struct {
+	Issue            string          `json:"issue"`
+	Severity         string          `json:"severity"`
+	Analysis         string          `json:"analysis"`
+	Actions          []HealingAction `json:"actions"`
+	Confidence       float64         `json:"confidence"`
+	RequiresApproval bool            `json:"requires_approval"`
+}
+
+// AutoHealConfig holds the settings used to build an AutoHealer.
+//
+// NOTE(review): NewAutoHealer in this file copies only Enabled, DryRun,
+// MaxActionsPerHour and Cooldown; AllowedActions and ApprovalRequired are
+// not read anywhere in the code visible here.
+type AutoHealConfig struct {
+	Enabled           bool          // turns the auto-healer on
+	DryRun            bool          // if true, nothing is executed; actions are only suggested
+	MaxActionsPerHour int           // intended cap on executed actions per hour
+	Cooldown          time.Duration // minimum time between actions on the same resource
+	AllowedActions    []string      // action types that are allowed
+	ApprovalRequired  []string      // action types that need human approval
+}
+
+// NewAutoHealer builds an AutoHealer bound to engine.
+//
+// Enabled, DryRun, MaxActionsPerHour and Cooldown are copied from config, and
+// the per-target cooldown history starts out empty.
+func NewAutoHealer(engine *AIEngine, config AutoHealConfig) *AutoHealer {
+	healer := new(AutoHealer)
+
+	healer.engine = engine
+	healer.enabled = config.Enabled
+	healer.dryRun = config.DryRun
+	healer.maxActions = config.MaxActionsPerHour
+	healer.cooldown = config.Cooldown
+	healer.lastActions = map[string]time.Time{}
+
+	return healer
+}
+
+// Analyze asks the AI engine to diagnose an incident and propose a healing
+// plan.
+//
+// The incident fields, plus the text from getRecentActionsForHost, are
+// interpolated into a prompt that requests a JSON reply; the reply is then
+// decoded into a HealingPlan.
+//
+// It returns an error when the auto-healer is disabled, when the engine call
+// fails (that error is returned unchanged), or when the reply cannot be
+// decoded. The plan is neither validated nor executed here.
+//
+// NOTE(review): Description, Metrics and RecentLogs go into the prompt
+// verbatim and the decoded plan carries commands chosen by the model, so
+// treat the plan as untrusted input before handing it to Execute.
+func (h *AutoHealer) Analyze(ctx context.Context, incident Incident) (*HealingPlan, error) {
+	if !h.enabled {
+		return nil, fmt.Errorf("auto-healer is disabled")
+	}
+
+	prompt := fmt.Sprintf(`Analise o seguinte incidente e gere um plano de correção automática:
+
+INCIDENTE:
+- Tipo: %s
+- Severidade: %s
+- Host: %s
+- Descrição: %s
+- Início: %s
+- Duração: %s
+
+MÉTRICAS DO HOST:
+%s
+
+LOGS RECENTES:
+%s
+
+AÇÕES ANTERIORES (últimas 24h):
+%s
+
+REGRAS:
+1. Priorize ações de baixo risco
+2. Reinício de serviço só como último recurso
+3. Escalonamento horizontal antes de vertical
+4. Sempre tenha rollback em mente
+
+Responda em JSON:
+{
+  "issue": "descrição concisa do problema",
+  "severity": "low|medium|high|critical",
+  "analysis": "análise detalhada da causa raiz",
+  "actions": [
+    {
+      "id": "action_1",
+      "type": "restart_service|scale_up|clear_cache|rotate_logs|kill_process|run_script|config_change",
+      "target": "nome do serviço ou recurso",
+      "command": "comando a executar",
+      "description": "o que essa ação faz",
+      "risk": "low|medium|high"
+    }
+  ],
+  "confidence": 0.0-1.0,
+  "requires_approval": boolean
+}`,
+		incident.Type,
+		incident.Severity,
+		incident.Host,
+		incident.Description,
+		incident.StartTime.Format(time.RFC3339),
+		time.Since(incident.StartTime).String(),
+		incident.Metrics,
+		incident.RecentLogs,
+		h.getRecentActionsForHost(incident.Host),
+	)
+
+	response, err := h.engine.chat(ctx, prompt)
+	if err != nil {
+		return nil, err
+	}
+
+	// Chat models frequently wrap the requested JSON in a Markdown code fence
+	// or a sentence of prose; decode only the outermost object so such replies
+	// do not fail with an "invalid character" error.
+	var plan HealingPlan
+	if err := json.Unmarshal([]byte(extractPlanJSON(response)), &plan); err != nil {
+		return nil, fmt.Errorf("failed to parse healing plan: %w", err)
+	}
+
+	return &plan, nil
+}
+
+// extractPlanJSON returns the part of reply that runs from its first '{' to
+// its last '}', or reply unchanged when no such span exists. A reply that is
+// already a bare JSON object is returned without its surrounding whitespace.
+//
+// It scans bytes directly, which is safe for UTF-8 text because '{' and '}'
+// are ASCII, and keeps this block free of imports the file does not have.
+func extractPlanJSON(reply string) string {
+	start, end := -1, -1
+	for i := 0; i < len(reply); i++ {
+		if reply[i] == '{' {
+			start = i
+			break
+		}
+	}
+	for i := len(reply) - 1; i > start; i-- {
+		if reply[i] == '}' {
+			end = i
+			break
+		}
+	}
+	if start < 0 || end < 0 {
+		return reply
+	}
+	return reply[start : end+1]
+}
+
+// Execute runs the actions of a healing plan in order and reports the outcome.
+//
+// Behaviour:
+//   - it returns an error when the auto-healer is disabled (mirroring
+//     Analyze) or when plan is nil;
+//   - in dry-run mode no action is executed and a result flagged DryRun is
+//     returned;
+//   - an action whose target is still inside its cooldown window is recorded
+//     as skipped and the loop moves on;
+//   - every attempted action, successful or not, refreshes the cooldown
+//     timestamp of its target;
+//   - execution stops at the first failed action.
+//
+// The returned error only covers the precondition failures above. Action
+// failures are reported through HealingResult.Success and HealingResult.Error.
+//
+// NOTE(review): plan.RequiresApproval and the configured hourly action limit
+// are not checked here.
+func (h *AutoHealer) Execute(ctx context.Context, plan *HealingPlan) (*HealingResult, error) {
+	if !h.enabled {
+		return nil, fmt.Errorf("auto-healer is disabled")
+	}
+	if plan == nil {
+		return nil, fmt.Errorf("healing plan is nil")
+	}
+
+	if h.dryRun {
+		return &HealingResult{
+			Plan:    plan,
+			DryRun:  true,
+			Message: "Dry run - ações não executadas",
+		}, nil
+	}
+
+	result := &HealingResult{
+		Plan:      plan,
+		StartedAt: time.Now(),
+		Actions:   make([]ActionResult, 0, len(plan.Actions)),
+	}
+
+	// Track failure explicitly rather than inferring it from result.Error, so
+	// a failed action with an empty error message cannot be reported as a
+	// success.
+	failed := false
+
+	for _, action := range plan.Actions {
+		// Respect the per-target cooldown.
+		if !h.canExecute(action.Target) {
+			result.Actions = append(result.Actions, ActionResult{
+				ActionID: action.ID,
+				Skipped:  true,
+				Reason:   "Cooldown period active",
+			})
+			continue
+		}
+
+		actionResult := h.executeAction(ctx, action)
+		result.Actions = append(result.Actions, actionResult)
+
+		// Record the attempt so the cooldown also applies after a failure.
+		h.lastActions[action.Target] = time.Now()
+
+		// Stop at the first failure; later actions may depend on this one.
+		if !actionResult.Success {
+			failed = true
+			result.Error = actionResult.Error
+			break
+		}
+	}
+
+	result.CompletedAt = time.Now()
+	result.Success = !failed
+
+	return result, nil
+}
+
+// executeAction runs a single healing action and returns its outcome.
+//
+// Only the action types "restart_service", "scale_up", "clear_cache" and
+// "rotate_logs" are accepted. Any other type, including the "kill_process",
+// "run_script" and "config_change" values offered to the model in Analyze's
+// prompt, is rejected with an error result.
+//
+// Execution is currently simulated: accepted actions record their command and
+// report success without running anything. Real execution (SSH, API calls,
+// ...) still has to be implemented, and must be done with care because the
+// command text comes from the AI engine. ctx is accepted for that future
+// implementation and is not used yet.
+func (h *AutoHealer) executeAction(ctx context.Context, action HealingAction) ActionResult {
+	result := ActionResult{
+		ActionID:  action.ID,
+		StartedAt: time.Now(),
+	}
+
+	log.Printf("[AutoHealer] Executing action: %s on %s", action.Type, action.Target)
+
+	switch action.Type {
+	case "restart_service", // e.g. systemctl restart <service>
+		"scale_up",    // e.g. kubectl scale deployment ...
+		"clear_cache", // e.g. redis-cli flushdb
+		"rotate_logs": // e.g. logrotate -f
+		result.Command = action.Command
+
+	default:
+		result.Error = fmt.Sprintf("Unknown action type: %s", action.Type)
+		// Stamp the completion time on the rejection path as well, so every
+		// result that has a StartedAt also has a CompletedAt.
+		result.CompletedAt = time.Now()
+		return result
+	}
+
+	// Simulated success until real execution is implemented.
+	result.Success = true
+	result.CompletedAt = time.Now()
+	result.Output = "Action executed successfully (simulated)"
+
+	return result
+}
+
+// canExecute reports whether an action may run against target right now:
+// true when no action has been recorded for target, or when strictly more
+// than the configured cooldown has elapsed since the last one.
+func (h *AutoHealer) canExecute(target string) bool {
+	if previous, seen := h.lastActions[target]; seen {
+		elapsed := time.Since(previous)
+		return elapsed > h.cooldown
+	}
+	return true
+}
+
+// getRecentActionsForHost returns the text that Analyze places in the
+// "previous actions" section of its prompt.
+//
+// Placeholder: the action-history lookup is not implemented yet, so host is
+// ignored and a fixed message meaning "no recent action" is returned.
+func (h *AutoHealer) getRecentActionsForHost(host string) string {
+	const noHistory = "Nenhuma ação recente"
+	return noHistory
+}
+
+// Incident describes an incident handed to AutoHealer.Analyze.
+//
+// Type, Severity, Host, Description, StartTime, Metrics and RecentLogs are
+// interpolated into the analysis prompt; StartTime is also used to compute
+// the incident's duration so far. ID is not used by the code visible in this
+// file. Metrics and RecentLogs are free-form text inserted as-is.
+type Incident struct {
+	ID          string
+	Type        string
+	Severity    string
+	Host        string
+	Description string
+	StartTime   time.Time
+	Metrics     string
+	RecentLogs  string
+}
+
+// HealingResult is the outcome of executing a healing plan.
+//
+// Fields:
+//   - Plan: the plan that was passed to Execute.
+//   - Success: whether the run completed without a failed action.
+//   - DryRun: true when nothing was executed because dry-run mode is on; in
+//     that case only Plan, DryRun and Message are populated.
+//   - Message: informational text (set for dry runs).
+//   - Error: error text of the action that stopped the run, if any.
+//   - StartedAt, CompletedAt: wall-clock bounds of the run.
+//   - Actions: one entry per action that was executed or skipped.
+type HealingResult struct {
+	Plan        *HealingPlan
+	Success     bool
+	DryRun      bool
+	Message     string
+	Error       string
+	StartedAt   time.Time
+	CompletedAt time.Time
+	Actions     []ActionResult
+}
+
+// ActionResult is the outcome of a single healing action.
+//
+// Fields:
+//   - ActionID: ID of the HealingAction this result belongs to.
+//   - Success: whether the action completed successfully.
+//   - Skipped, Reason: set when the action was not attempted (for example
+//     because its target was still in its cooldown period).
+//   - Command: the command recorded for the action.
+//   - Output: output text of the action.
+//   - Error: error text when the action failed or was rejected.
+//   - StartedAt, CompletedAt: wall-clock bounds of the action.
+type ActionResult struct {
+	ActionID    string
+	Success     bool
+	Skipped     bool
+	Reason      string
+	Command     string
+	Output      string
+	Error       string
+	StartedAt   time.Time
+	CompletedAt time.Time
+}