diff --git a/README.md b/README.md index 973e8f8..121f5e1 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,9 @@ The Nuclei Operator watches for Ingress and VirtualService resources in your Kub - **Automatic Discovery**: Watches Kubernetes Ingress and Istio VirtualService resources for new endpoints - **Automated Scanning**: Automatically creates and runs Nuclei scans when new endpoints are discovered - **Scheduled Scans**: Support for cron-based scheduled rescanning +- **Automatic Rescans**: Automatically rescans when results become stale (configurable age threshold) +- **Target Availability Checking**: Waits for targets to be available before scanning +- **Stale Scan Recovery**: Automatically resets interrupted scans on operator restart - **Flexible Configuration**: Configurable templates, severity filters, and scan options - **Native Kubernetes Integration**: Results stored as Kubernetes custom resources - **Owner References**: Automatic cleanup when source resources are deleted @@ -167,6 +170,38 @@ The operator can be configured using the following environment variables: | `NUCLEI_BINARY_PATH` | Path to the Nuclei binary | `nuclei` | | `NUCLEI_TEMPLATES_PATH` | Path to Nuclei templates directory | (uses Nuclei default) | | `NUCLEI_TIMEOUT` | Default scan timeout | `30m` | +| `NUCLEI_RESCAN_AGE` | Maximum age of scan results before automatic rescan | `168h` (1 week) | +| `NUCLEI_BACKOFF_INITIAL` | Initial retry interval for target availability checks | `10s` | +| `NUCLEI_BACKOFF_MAX` | Maximum retry interval for target availability checks | `10m` | +| `NUCLEI_BACKOFF_MULTIPLIER` | Multiplier for exponential backoff | `2.0` | + +### Automatic Rescan Behavior + +The operator automatically rescans targets when: + +1. **Stale Results**: Scan results are older than `NUCLEI_RESCAN_AGE` (default: 1 week) +2. **Operator Restart**: Any scans that were in "Running" state when the operator restarted are automatically re-queued +3. 
**Spec Changes**: When the NucleiScan spec is modified + +### Target Availability with Exponential Backoff + +Before running a scan, the operator checks if targets are reachable: + +- Uses HTTP HEAD requests to verify target availability +- If no targets are available, the scan waits and retries with **exponential backoff** +- Backoff sequence with defaults: 10s → 20s → 40s → 80s → 160s → 320s → 600s (max) +- Scans proceed with available targets even if some are unreachable +- Any HTTP response (including 4xx/5xx) is considered "available" - the service is responding +- Retry count is tracked in `status.retryCount` and reset when targets become available + +**Backoff Configuration Example:** + +```bash +# Set initial retry to 5 seconds, max to 5 minutes, multiplier to 1.5 +export NUCLEI_BACKOFF_INITIAL=5s +export NUCLEI_BACKOFF_MAX=5m +export NUCLEI_BACKOFF_MULTIPLIER=1.5 +``` ### NucleiScan Spec Options diff --git a/api/v1alpha1/nucleiscan_types.go b/api/v1alpha1/nucleiscan_types.go index fb0461f..3e3ae46 100644 --- a/api/v1alpha1/nucleiscan_types.go +++ b/api/v1alpha1/nucleiscan_types.go @@ -191,6 +191,15 @@ type NucleiScanStatus struct { // ObservedGeneration is the generation observed by the controller // +optional ObservedGeneration int64 `json:"observedGeneration,omitempty"` + + // RetryCount tracks the number of consecutive availability check retries + // Used for exponential backoff when waiting for targets + // +optional + RetryCount int `json:"retryCount,omitempty"` + + // LastRetryTime is when the last availability check retry occurred + // +optional + LastRetryTime *metav1.Time `json:"lastRetryTime,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index 78d7010..b0a99bf 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -185,6 +185,10 @@ func (in *NucleiScanStatus) DeepCopyInto(out *NucleiScanStatus) { 
(*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.LastRetryTime != nil { + in, out := &in.LastRetryTime, &out.LastRetryTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NucleiScanStatus. diff --git a/cmd/main.go b/cmd/main.go index 4cf6532..db6e44e 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -183,11 +183,11 @@ func main() { os.Exit(1) } - if err := (&controller.NucleiScanReconciler{ - Client: mgr.GetClient(), - Scheme: mgr.GetScheme(), - Scanner: scanner.NewNucleiScannerWithDefaults(), - }).SetupWithManager(mgr); err != nil { + if err := controller.NewNucleiScanReconciler( + mgr.GetClient(), + mgr.GetScheme(), + scanner.NewNucleiScannerWithDefaults(), + ).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NucleiScan") os.Exit(1) } diff --git a/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml b/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml index e45429d..97a36e3 100644 --- a/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml +++ b/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml @@ -252,6 +252,11 @@ spec: lastError: description: LastError contains the error message if the scan failed type: string + lastRetryTime: + description: LastRetryTime is when the last availability check retry + occurred + format: date-time + type: string lastScanTime: description: LastScanTime is when the last scan was initiated format: date-time @@ -274,6 +279,11 @@ spec: - Completed - Failed type: string + retryCount: + description: |- + RetryCount tracks the number of consecutive availability check retries + Used for exponential backoff when waiting for targets + type: integer summary: description: Summary provides aggregated scan statistics properties: diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 36085f6..5fafbf9 100644 --- a/config/manager/manager.yaml +++ 
b/config/manager/manager.yaml @@ -73,6 +73,21 @@ spec: value: "/nuclei-templates" - name: NUCLEI_TIMEOUT value: "30m" + # NUCLEI_RESCAN_AGE controls how old scan results can be before + # triggering an automatic rescan. Default is 168h (1 week). + # NOTE: "0" does not disable age-based rescans; the controller rescans when the result age exceeds this value, so "0" makes completed results stale immediately. + - name: NUCLEI_RESCAN_AGE + value: "168h" + # Backoff configuration for target availability checks + # NUCLEI_BACKOFF_INITIAL: Initial retry interval (default: 10s) + - name: NUCLEI_BACKOFF_INITIAL + value: "10s" + # NUCLEI_BACKOFF_MAX: Maximum retry interval (default: 10m) + - name: NUCLEI_BACKOFF_MAX + value: "10m" + # NUCLEI_BACKOFF_MULTIPLIER: Multiplier for exponential backoff (default: 2.0) + - name: NUCLEI_BACKOFF_MULTIPLIER + value: "2.0" securityContext: readOnlyRootFilesystem: false # Nuclei needs to write temporary files allowPrivilegeEscalation: false diff --git a/internal/controller/nucleiscan_controller.go b/internal/controller/nucleiscan_controller.go index be2898a..5e7b938 100644 --- a/internal/controller/nucleiscan_controller.go +++ b/internal/controller/nucleiscan_controller.go @@ -19,6 +19,8 @@ package controller import ( "context" "fmt" + "net/http" + "os" "time" "k8s.io/apimachinery/pkg/api/meta" @@ -41,6 +43,20 @@ const ( defaultRequeueAfter = 30 * time.Second defaultScheduleRequeue = 1 * time.Minute defaultErrorRequeueAfter = 1 * time.Minute + + // Default rescan age (1 week) + defaultRescanAge = 7 * 24 * time.Hour + + // Default backoff settings for target availability checks + defaultBackoffInitial = 10 * time.Second // Initial retry interval + defaultBackoffMax = 10 * time.Minute // Maximum retry interval + defaultBackoffMultiplier = 2.0 // Multiplier for exponential backoff + + // Environment variables + envRescanAge = "NUCLEI_RESCAN_AGE" + envBackoffInitial = "NUCLEI_BACKOFF_INITIAL" + envBackoffMax = "NUCLEI_BACKOFF_MAX" + envBackoffMultiplier = "NUCLEI_BACKOFF_MULTIPLIER" ) // Condition types for NucleiScan @@ -58,11 +74,92 @@ const ( 
ReasonScanSuspended = "ScanSuspended" ) +// BackoffConfig holds configuration for exponential backoff +type BackoffConfig struct { + Initial time.Duration + Max time.Duration + Multiplier float64 +} + // NucleiScanReconciler reconciles a NucleiScan object type NucleiScanReconciler struct { client.Client - Scheme *runtime.Scheme - Scanner scanner.Scanner + Scheme *runtime.Scheme + Scanner scanner.Scanner + RescanAge time.Duration + HTTPClient *http.Client + Backoff BackoffConfig +} + +// NewNucleiScanReconciler creates a new NucleiScanReconciler with default settings +func NewNucleiScanReconciler(client client.Client, scheme *runtime.Scheme, scanner scanner.Scanner) *NucleiScanReconciler { + rescanAge := defaultRescanAge + if envVal := os.Getenv(envRescanAge); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + rescanAge = parsed + } + } + + backoffInitial := defaultBackoffInitial + if envVal := os.Getenv(envBackoffInitial); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + backoffInitial = parsed + } + } + + backoffMax := defaultBackoffMax + if envVal := os.Getenv(envBackoffMax); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + backoffMax = parsed + } + } + + backoffMultiplier := defaultBackoffMultiplier + if envVal := os.Getenv(envBackoffMultiplier); envVal != "" { + if parsed, err := parseFloat(envVal); err == nil && parsed > 0 { + backoffMultiplier = parsed + } + } + + return &NucleiScanReconciler{ + Client: client, + Scheme: scheme, + Scanner: scanner, + RescanAge: rescanAge, + HTTPClient: &http.Client{ + Timeout: 10 * time.Second, + }, + Backoff: BackoffConfig{ + Initial: backoffInitial, + Max: backoffMax, + Multiplier: backoffMultiplier, + }, + } +} + +// parseFloat parses a string to float64 +func parseFloat(s string) (float64, error) { + var f float64 + _, err := fmt.Sscanf(s, "%f", &f) + return f, err +} + +// calculateBackoff calculates the next backoff duration based on 
retry count +func (r *NucleiScanReconciler) calculateBackoff(retryCount int) time.Duration { + if retryCount <= 0 { + return r.Backoff.Initial + } + + // Calculate exponential backoff: initial * multiplier^retryCount + backoff := float64(r.Backoff.Initial) + for i := 0; i < retryCount; i++ { + backoff *= r.Backoff.Multiplier + if backoff > float64(r.Backoff.Max) { + return r.Backoff.Max + } + } + + return time.Duration(backoff) } // +kubebuilder:rbac:groups=nuclei.homelab.mortenolsen.pro,resources=nucleiscans,verbs=get;list;watch;create;update;patch;delete @@ -118,9 +215,15 @@ func (r *NucleiScanReconciler) Reconcile(ctx context.Context, req ctrl.Request) case nucleiv1alpha1.ScanPhasePending: return r.handlePendingPhase(ctx, nucleiScan) case nucleiv1alpha1.ScanPhaseRunning: - // This shouldn't happen in our synchronous implementation - // but handle it gracefully - return r.handlePendingPhase(ctx, nucleiScan) + // Running phase on startup means the scan was interrupted (operator restart) + // Reset to Pending to re-run the scan + log.Info("Found stale Running scan, resetting to Pending (operator likely restarted)") + nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.LastError = "Scan was interrupted due to operator restart, re-queuing" + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{Requeue: true}, nil case nucleiv1alpha1.ScanPhaseCompleted: return r.handleCompletedPhase(ctx, nucleiScan) case nucleiv1alpha1.ScanPhaseFailed: @@ -161,7 +264,49 @@ func (r *NucleiScanReconciler) handleDeletion(ctx context.Context, nucleiScan *n // handlePendingPhase handles the Pending phase - starts a new scan func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) { log := logf.FromContext(ctx) - log.Info("Starting scan", "targets", len(nucleiScan.Spec.Targets)) + log.Info("Preparing to scan", "targets", 
len(nucleiScan.Spec.Targets)) + + // Check if at least one target is available before scanning + availableTargets, unavailableTargets := r.checkTargetsAvailability(ctx, nucleiScan.Spec.Targets) + if len(availableTargets) == 0 { + // Calculate backoff based on retry count + retryCount := nucleiScan.Status.RetryCount + backoffDuration := r.calculateBackoff(retryCount) + + log.Info("No targets are available yet, waiting with backoff...", + "unavailable", len(unavailableTargets), + "retryCount", retryCount, + "backoffDuration", backoffDuration) + + // Update condition and retry count + now := metav1.Now() + meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ + Type: ConditionTypeReady, + Status: metav1.ConditionFalse, + Reason: "WaitingForTargets", + Message: fmt.Sprintf("Waiting for targets to become available (%d unavailable, retry #%d, next check in %v)", len(unavailableTargets), retryCount+1, backoffDuration), + LastTransitionTime: now, + }) + nucleiScan.Status.LastError = fmt.Sprintf("Targets not available: %v", unavailableTargets) + nucleiScan.Status.RetryCount = retryCount + 1 + nucleiScan.Status.LastRetryTime = &now + + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + + // Requeue with exponential backoff + return ctrl.Result{RequeueAfter: backoffDuration}, nil + } + + // Reset retry count since targets are now available + if nucleiScan.Status.RetryCount > 0 { + log.Info("Targets now available, resetting retry count", "previousRetries", nucleiScan.Status.RetryCount) + nucleiScan.Status.RetryCount = 0 + nucleiScan.Status.LastRetryTime = nil + } + + log.Info("Starting scan", "availableTargets", len(availableTargets), "unavailableTargets", len(unavailableTargets)) // Update status to Running now := metav1.Now() @@ -175,7 +320,7 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca Type: ConditionTypeScanActive, Status: metav1.ConditionTrue, Reason: ReasonScanRunning, - 
Message: "Scan is in progress", + Message: fmt.Sprintf("Scan is in progress (%d targets)", len(availableTargets)), LastTransitionTime: now, }) @@ -190,8 +335,8 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca Timeout: 30 * time.Minute, // Default timeout } - // Execute the scan - result, err := r.Scanner.Scan(ctx, nucleiScan.Spec.Targets, options) + // Execute the scan with available targets only + result, err := r.Scanner.Scan(ctx, availableTargets, options) if err != nil { log.Error(err, "Scan failed") return r.handleScanError(ctx, nucleiScan, err) @@ -201,6 +346,33 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca return r.handleScanSuccess(ctx, nucleiScan, result) } +// checkTargetsAvailability checks which targets are reachable +func (r *NucleiScanReconciler) checkTargetsAvailability(ctx context.Context, targets []string) (available []string, unavailable []string) { + log := logf.FromContext(ctx) + + for _, target := range targets { + req, err := http.NewRequestWithContext(ctx, http.MethodHead, target, nil) + if err != nil { + log.V(1).Info("Failed to create request for target", "target", target, "error", err) + unavailable = append(unavailable, target) + continue + } + + resp, err := r.HTTPClient.Do(req) + if err != nil { + log.V(1).Info("Target not available", "target", target, "error", err) + unavailable = append(unavailable, target) + continue + } + resp.Body.Close() + + // Consider any response (even 4xx/5xx) as "available" - the service is responding + available = append(available, target) + } + + return available, unavailable +} + // handleScanSuccess updates the status after a successful scan func (r *NucleiScanReconciler) handleScanSuccess(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan, result *scanner.ScanResult) (ctrl.Result, error) { log := logf.FromContext(ctx) @@ -294,6 +466,25 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS return 
r.checkScheduledScan(ctx, nucleiScan) } + // Check if scan results are stale (older than RescanAge) + if nucleiScan.Status.CompletionTime != nil { + age := time.Since(nucleiScan.Status.CompletionTime.Time) + if age > r.RescanAge { + log.Info("Scan results are stale, triggering rescan", "age", age, "maxAge", r.RescanAge) + nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.LastError = fmt.Sprintf("Automatic rescan triggered (results were %v old)", age.Round(time.Hour)) + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{Requeue: true}, nil + } + + // Schedule a requeue for when the results will become stale + timeUntilStale := r.RescanAge - age + log.V(1).Info("Scan results still fresh, will check again later", "timeUntilStale", timeUntilStale) + return ctrl.Result{RequeueAfter: timeUntilStale}, nil + } + return ctrl.Result{}, nil }