// ophion/internal/ai/autohealing.go

package ai

import (
	"context"
	"encoding/json"
	"fmt"
	"log"
	"time"
)

// AutoHealer is the intelligent self-correction component: Analyze asks the
// AI engine for a healing plan and Execute runs that plan, honouring a
// per-target cooldown.
//
// NOTE(review): lastActions is read and written without any locking in the
// code visible here, so sharing one AutoHealer between goroutines looks
// unsafe — confirm how callers use it.
type AutoHealer struct {
	engine      *AIEngine            // AI backend; Analyze sends its prompt through engine.chat
	enabled     bool                 // when false, Analyze returns an error without calling the engine
	dryRun      bool                 // when true, Execute echoes the plan back and runs nothing
	maxActions  int                  // copied from AutoHealConfig.MaxActionsPerHour; not read by the methods visible here
	cooldown    time.Duration        // minimum time between two executed actions on the same target
	lastActions map[string]time.Time // action target -> time Execute last ran an action on it
}

// HealingAction is a single corrective step inside a HealingPlan. It is
// decoded from the AI engine's JSON reply, so every value below is whatever
// the model produced.
type HealingAction struct {
	ID          string    `json:"id"`                    // identifier; copied into ActionResult.ActionID
	Type        string    `json:"type"`                  // action kind; executeAction switches on this value
	Target      string    `json:"target"`                // service or resource; also the cooldown key
	Command     string    `json:"command"`               // command text; copied into ActionResult.Command
	Description string    `json:"description"`           // what the action does, as described by the model
	Risk        string    `json:"risk"`                  // the prompt asks for low, medium or high
	Executed    bool      `json:"executed"`              // not set by the code visible in this file
	ExecutedAt  time.Time `json:"executed_at,omitempty"` // not set by the code visible in this file
	Result      string    `json:"result,omitempty"`      // not set by the code visible in this file
	Error       string    `json:"error,omitempty"`       // not set by the code visible in this file
}

// HealingPlan is the correction plan produced by the AI engine; Analyze
// decodes it directly from the engine's JSON reply.
type HealingPlan struct {
	Issue            string          `json:"issue"`             // concise description of the problem
	Severity         string          `json:"severity"`          // the prompt asks for low, medium, high or critical
	Analysis         string          `json:"analysis"`          // root-cause analysis written by the model
	Actions          []HealingAction `json:"actions"`           // steps, executed by Execute in slice order
	Confidence       float64         `json:"confidence"`        // the prompt asks for 0.0-1.0; not validated here
	RequiresApproval bool            `json:"requires_approval"` // model's approval hint; Execute does not check it
}

// AutoHealConfig holds the settings NewAutoHealer reads when it builds an
// AutoHealer.
//
// NOTE(review): NewAutoHealer, as visible in this file, copies Enabled,
// DryRun, MaxActionsPerHour and Cooldown only; AllowedActions and
// ApprovalRequired are not consumed there — confirm where they are enforced.
type AutoHealConfig struct {
	Enabled           bool          // turns the auto-healer on
	DryRun            bool          // if true, nothing is executed; actions are only suggested
	MaxActionsPerHour int           // intended hourly cap on executed actions
	Cooldown          time.Duration // minimum time between actions on the same resource
	AllowedActions    []string      // action types that are permitted
	ApprovalRequired  []string      // action types that need human approval
}

// NewAutoHealer returns an AutoHealer bound to engine and configured from
// config. Only Enabled, DryRun, MaxActionsPerHour and Cooldown are copied;
// the cooldown history starts out empty.
func NewAutoHealer(engine *AIEngine, config AutoHealConfig) *AutoHealer {
	healer := new(AutoHealer)
	healer.engine = engine
	healer.enabled = config.Enabled
	healer.dryRun = config.DryRun
	healer.maxActions = config.MaxActionsPerHour
	healer.cooldown = config.Cooldown
	healer.lastActions = map[string]time.Time{}
	return healer
}

// Analyze asks the AI engine for a correction plan for incident.
//
// The incident (type, severity, host, description, start time, elapsed time,
// metrics, recent logs and the recent-action summary) is rendered into a
// fixed prompt, sent through h.engine.chat, and the reply is decoded as JSON
// into a HealingPlan.
//
// An error is returned when the healer is disabled, when the engine call
// fails (the engine's error is passed through unchanged), or when the reply
// cannot be decoded as a HealingPlan.
func (h *AutoHealer) Analyze(ctx context.Context, incident Incident) (*HealingPlan, error) {
	// Prompt sent to the model. It is runtime text, kept verbatim; it carries
	// nine %s verbs that line up with the arguments passed to Sprintf below.
	const promptFormat = `Analise o seguinte incidente e gere um plano de correção automática:

INCIDENTE:
- Tipo: %s
- Severidade: %s
- Host: %s
- Descrição: %s
- Início: %s
- Duração: %s

MÉTRICAS DO HOST:
%s

LOGS RECENTES:
%s

AÇÕES ANTERIORES (últimas 24h):
%s

REGRAS:
1. Priorize ações de baixo risco
2. Reinício de serviço só como último recurso
3. Escalonamento horizontal antes de vertical
4. Sempre tenha rollback em mente

Responda em JSON:
{
  "issue": "descrição concisa do problema",
  "severity": "low|medium|high|critical",
  "analysis": "análise detalhada da causa raiz",
  "actions": [
    {
      "id": "action_1",
      "type": "restart_service|scale_up|clear_cache|rotate_logs|kill_process|run_script|config_change",
      "target": "nome do serviço ou recurso",
      "command": "comando a executar",
      "description": "o que essa ação faz",
      "risk": "low|medium|high"
    }
  ],
  "confidence": 0.0-1.0,
  "requires_approval": boolean
}`

	if !h.enabled {
		return nil, fmt.Errorf("auto-healer is disabled")
	}

	var (
		startedAt = incident.StartTime.Format(time.RFC3339)
		elapsed   = time.Since(incident.StartTime).String()
		history   = h.getRecentActionsForHost(incident.Host)
	)
	prompt := fmt.Sprintf(promptFormat,
		incident.Type,
		incident.Severity,
		incident.Host,
		incident.Description,
		startedAt,
		elapsed,
		incident.Metrics,
		incident.RecentLogs,
		history,
	)

	response, err := h.engine.chat(ctx, prompt)
	if err != nil {
		return nil, err
	}

	// The reply is expected to be the bare JSON object described in the prompt.
	plan := new(HealingPlan)
	if err := json.Unmarshal([]byte(response), plan); err != nil {
		return nil, fmt.Errorf("failed to parse healing plan: %w", err)
	}
	return plan, nil
}

// Execute runs the actions of plan in order and reports what happened.
//
// In dry-run mode nothing is executed: the plan is echoed back in a result
// flagged DryRun. Otherwise each action is either skipped (its target is
// still inside the cooldown window) or handed to executeAction; the first
// failing action stops the plan. Execution also stops, with the context's
// error recorded in the result, once ctx is cancelled or past its deadline.
//
// Failures of individual actions are reported through the returned
// HealingResult (Success false, Error set), not through the error return.
// The error return is non-nil only when plan is nil outside dry-run mode.
//
// NOTE(review): Execute does not consult h.enabled, h.maxActions or
// plan.RequiresApproval — confirm that callers gate on those.
func (h *AutoHealer) Execute(ctx context.Context, plan *HealingPlan) (*HealingResult, error) {
	if h.dryRun {
		return &HealingResult{
			Plan:    plan,
			DryRun:  true,
			Message: "Dry run - ações não executadas",
		}, nil
	}

	// Checked after the dry-run branch so dry runs keep echoing back whatever
	// they were given; a real run would otherwise panic on plan.Actions.
	if plan == nil {
		return nil, fmt.Errorf("healing plan is nil")
	}

	// An AutoHealer that was not built by NewAutoHealer has a nil map, and
	// writing to a nil map panics.
	if h.lastActions == nil {
		h.lastActions = make(map[string]time.Time)
	}

	result := &HealingResult{
		Plan:      plan,
		StartedAt: time.Now(),
		Actions:   make([]ActionResult, 0, len(plan.Actions)),
	}

	// Tracked explicitly: deriving success from an empty Error string would
	// report success for a failed action that carried no error text.
	failed := false

	for _, action := range plan.Actions {
		// Stop before starting further work once the caller has given up.
		if err := ctx.Err(); err != nil {
			failed = true
			result.Error = err.Error()
			break
		}

		// Respect the per-target cooldown.
		if !h.canExecute(action.Target) {
			result.Actions = append(result.Actions, ActionResult{
				ActionID: action.ID,
				Skipped:  true,
				Reason:   "Cooldown period active",
			})
			continue
		}

		actionResult := h.executeAction(ctx, action)
		result.Actions = append(result.Actions, actionResult)

		// Recorded for failed attempts too, so a failing action is not
		// retried immediately on the same target.
		h.lastActions[action.Target] = time.Now()

		// Stop at the first failure; later actions may depend on it.
		if !actionResult.Success {
			failed = true
			result.Error = actionResult.Error
			break
		}
	}

	result.CompletedAt = time.Now()
	result.Success = !failed

	return result, nil
}

// executeAction handles a single healing action and returns its outcome.
//
// Execution is currently simulated: for the supported action types the
// command is recorded in the result and the action is reported as successful
// without anything being run. Any other action type is rejected with an
// error in the result. StartedAt and CompletedAt are set on both paths.
func (h *AutoHealer) executeAction(ctx context.Context, action HealingAction) ActionResult {
	result := ActionResult{
		ActionID:  action.ID,
		StartedAt: time.Now(),
	}

	log.Printf("[AutoHealer] Executing action: %s on %s", action.Type, action.Target)

	// Real execution (SSH, API, ...) would go here; implement it with care,
	// since the command text comes from the AI engine.
	switch action.Type {
	case "restart_service", // e.g. systemctl restart <service>
		"scale_up",    // e.g. kubectl scale deployment ...
		"clear_cache", // e.g. redis-cli flushdb
		"rotate_logs": // e.g. logrotate -f
		result.Command = action.Command

	default:
		result.Error = fmt.Sprintf("Unknown action type: %s", action.Type)
		// Stamp the completion time on the rejection path as well, so a
		// result never carries a start time with a zero completion time.
		result.CompletedAt = time.Now()
		return result
	}

	// Simulated success until real execution is implemented.
	result.Success = true
	result.CompletedAt = time.Now()
	result.Output = "Action executed successfully (simulated)"

	return result
}

// canExecute reports whether an action may run on target now: true when no
// action has been recorded for target, or when more than h.cooldown has
// elapsed since the last recorded one.
func (h *AutoHealer) canExecute(target string) bool {
	if last, seen := h.lastActions[target]; seen {
		return time.Since(last) > h.cooldown
	}
	return true
}

// getRecentActionsForHost returns the text Analyze places in the prompt's
// previous-actions section. It is a placeholder: host is ignored and the same
// fixed sentence is always returned.
func (h *AutoHealer) getRecentActionsForHost(host string) string {
	// TODO: look up the real history of recent actions.
	const noRecentActions = "Nenhuma ação recente"
	return noRecentActions
}

// Incident describes an incident handed to Analyze. Apart from ID, every
// field is interpolated into the prompt sent to the AI engine.
type Incident struct {
	ID          string    // incident identifier; not used by the code visible in this file
	Type        string    // incident type, inserted into the prompt as-is
	Severity    string    // severity label, inserted into the prompt as-is
	Host        string    // affected host; also passed to getRecentActionsForHost
	Description string    // free-text description, inserted into the prompt as-is
	StartTime   time.Time // start of the incident; Analyze formats it as RFC 3339 and derives the elapsed time from it
	Metrics     string    // host metrics as preformatted text
	RecentLogs  string    // recent log lines as preformatted text
}

// HealingResult is the outcome of running a HealingPlan through Execute.
type HealingResult struct {
	Plan        *HealingPlan   // the plan that was passed to Execute
	Success     bool           // set by Execute after the action loop; left false on dry runs
	DryRun      bool           // true when Execute returned without running anything
	Message     string         // informational text; set on dry runs
	Error       string         // why the plan stopped early; empty when it ran to the end
	StartedAt   time.Time      // when Execute began a real (non-dry) run
	CompletedAt time.Time      // when Execute finished a real (non-dry) run
	Actions     []ActionResult // one entry per action that was attempted or skipped, in plan order
}

// ActionResult is the outcome of one action of a plan: either it was skipped
// by Execute or it was handled by executeAction.
type ActionResult struct {
	ActionID    string    // ID of the HealingAction this result belongs to
	Success     bool      // true when executeAction reported the action as successful
	Skipped     bool      // true when Execute did not attempt the action
	Reason      string    // why the action was skipped
	Command     string    // command recorded for the action
	Output      string    // output text recorded by executeAction
	Error       string    // error text when the action was rejected or failed
	StartedAt   time.Time // when executeAction started handling the action; zero for skipped actions
	CompletedAt time.Time // completion time recorded by executeAction; zero for skipped actions
}