feat: support rescans and backoffs

This commit is contained in:
Morten Olsen
2025-12-12 12:07:34 +01:00
parent 8073d0044b
commit 67014b3d16
7 changed files with 278 additions and 14 deletions

View File

@@ -14,6 +14,9 @@ The Nuclei Operator watches for Ingress and VirtualService resources in your Kub
- **Automatic Discovery**: Watches Kubernetes Ingress and Istio VirtualService resources for new endpoints
- **Automated Scanning**: Automatically creates and runs Nuclei scans when new endpoints are discovered
- **Scheduled Scans**: Support for cron-based scheduled rescanning
- **Automatic Rescans**: Automatically rescans when results become stale (configurable age threshold)
- **Target Availability Checking**: Waits for targets to be available before scanning
- **Stale Scan Recovery**: Automatically resets interrupted scans on operator restart
- **Flexible Configuration**: Configurable templates, severity filters, and scan options
- **Native Kubernetes Integration**: Results stored as Kubernetes custom resources
- **Owner References**: Automatic cleanup when source resources are deleted
@@ -167,6 +170,38 @@ The operator can be configured using the following environment variables:
| `NUCLEI_BINARY_PATH` | Path to the Nuclei binary | `nuclei` |
| `NUCLEI_TEMPLATES_PATH` | Path to Nuclei templates directory | (uses Nuclei default) |
| `NUCLEI_TIMEOUT` | Default scan timeout | `30m` |
| `NUCLEI_RESCAN_AGE` | Maximum age of scan results before automatic rescan | `168h` (1 week) |
| `NUCLEI_BACKOFF_INITIAL` | Initial retry interval for target availability checks | `10s` |
| `NUCLEI_BACKOFF_MAX` | Maximum retry interval for target availability checks | `10m` |
| `NUCLEI_BACKOFF_MULTIPLIER` | Multiplier for exponential backoff | `2.0` |
### Automatic Rescan Behavior
The operator automatically rescans targets when:
1. **Stale Results**: Scan results are older than `NUCLEI_RESCAN_AGE` (default: 1 week)
2. **Operator Restart**: Any scans that were in "Running" state when the operator restarted are automatically re-queued
3. **Spec Changes**: When the NucleiScan spec is modified
### Target Availability with Exponential Backoff
Before running a scan, the operator checks if targets are reachable:
- Uses HTTP HEAD requests to verify target availability
- If no targets are available, the scan waits and retries with **exponential backoff**
- Backoff sequence with defaults: 10s → 20s → 40s → 80s → 160s → 320s → 600s (max)
- Scans proceed with available targets even if some are unreachable
- Any HTTP response (including 4xx/5xx) is considered "available" - the service is responding
- Retry count is tracked in `status.retryCount` and reset when targets become available
**Backoff Configuration Example:**
```bash
# Set initial retry to 5 seconds, max to 5 minutes, multiplier to 1.5
export NUCLEI_BACKOFF_INITIAL=5s
export NUCLEI_BACKOFF_MAX=5m
export NUCLEI_BACKOFF_MULTIPLIER=1.5
```
### NucleiScan Spec Options

View File

@@ -191,6 +191,15 @@ type NucleiScanStatus struct {
// ObservedGeneration is the generation observed by the controller
// +optional
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
// RetryCount tracks the number of consecutive availability check retries
// Used for exponential backoff when waiting for targets
// +optional
RetryCount int `json:"retryCount,omitempty"`
// LastRetryTime is when the last availability check retry occurred
// +optional
LastRetryTime *metav1.Time `json:"lastRetryTime,omitempty"`
}
// +kubebuilder:object:root=true

View File

@@ -185,6 +185,10 @@ func (in *NucleiScanStatus) DeepCopyInto(out *NucleiScanStatus) {
(*in)[i].DeepCopyInto(&(*out)[i])
}
}
if in.LastRetryTime != nil {
in, out := &in.LastRetryTime, &out.LastRetryTime
*out = (*in).DeepCopy()
}
}
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NucleiScanStatus.

View File

@@ -183,11 +183,11 @@ func main() {
os.Exit(1)
}
if err := (&controller.NucleiScanReconciler{
Client: mgr.GetClient(),
Scheme: mgr.GetScheme(),
Scanner: scanner.NewNucleiScannerWithDefaults(),
}).SetupWithManager(mgr); err != nil {
if err := controller.NewNucleiScanReconciler(
mgr.GetClient(),
mgr.GetScheme(),
scanner.NewNucleiScannerWithDefaults(),
).SetupWithManager(mgr); err != nil {
setupLog.Error(err, "unable to create controller", "controller", "NucleiScan")
os.Exit(1)
}

View File

@@ -252,6 +252,11 @@ spec:
lastError:
description: LastError contains the error message if the scan failed
type: string
lastRetryTime:
description: LastRetryTime is when the last availability check retry
occurred
format: date-time
type: string
lastScanTime:
description: LastScanTime is when the last scan was initiated
format: date-time
@@ -274,6 +279,11 @@ spec:
- Completed
- Failed
type: string
retryCount:
description: |-
RetryCount tracks the number of consecutive availability check retries
Used for exponential backoff when waiting for targets
type: integer
summary:
description: Summary provides aggregated scan statistics
properties:

View File

@@ -73,6 +73,21 @@ spec:
value: "/nuclei-templates"
- name: NUCLEI_TIMEOUT
value: "30m"
# NUCLEI_RESCAN_AGE controls how old scan results can be before
# triggering an automatic rescan. Default is 168h (1 week).
# Set to "0" to disable automatic rescans based on age.
- name: NUCLEI_RESCAN_AGE
value: "168h"
# Backoff configuration for target availability checks
# NUCLEI_BACKOFF_INITIAL: Initial retry interval (default: 10s)
- name: NUCLEI_BACKOFF_INITIAL
value: "10s"
# NUCLEI_BACKOFF_MAX: Maximum retry interval (default: 10m)
- name: NUCLEI_BACKOFF_MAX
value: "10m"
# NUCLEI_BACKOFF_MULTIPLIER: Multiplier for exponential backoff (default: 2.0)
- name: NUCLEI_BACKOFF_MULTIPLIER
value: "2.0"
securityContext:
readOnlyRootFilesystem: false # Nuclei needs to write temporary files
allowPrivilegeEscalation: false

View File

@@ -19,6 +19,8 @@ package controller
import (
"context"
"fmt"
"net/http"
"os"
"time"
"k8s.io/apimachinery/pkg/api/meta"
@@ -41,6 +43,20 @@ const (
defaultRequeueAfter = 30 * time.Second
defaultScheduleRequeue = 1 * time.Minute
defaultErrorRequeueAfter = 1 * time.Minute
// Default rescan age (1 week)
defaultRescanAge = 7 * 24 * time.Hour
// Default backoff settings for target availability checks
defaultBackoffInitial = 10 * time.Second // Initial retry interval
defaultBackoffMax = 10 * time.Minute // Maximum retry interval
defaultBackoffMultiplier = 2.0 // Multiplier for exponential backoff
// Environment variables
envRescanAge = "NUCLEI_RESCAN_AGE"
envBackoffInitial = "NUCLEI_BACKOFF_INITIAL"
envBackoffMax = "NUCLEI_BACKOFF_MAX"
envBackoffMultiplier = "NUCLEI_BACKOFF_MULTIPLIER"
)
// Condition types for NucleiScan
@@ -58,11 +74,92 @@ const (
ReasonScanSuspended = "ScanSuspended"
)
// BackoffConfig holds configuration for exponential backoff
type BackoffConfig struct {
Initial time.Duration
Max time.Duration
Multiplier float64
}
// NucleiScanReconciler reconciles a NucleiScan object
type NucleiScanReconciler struct {
client.Client
Scheme *runtime.Scheme
Scanner scanner.Scanner
RescanAge time.Duration
HTTPClient *http.Client
Backoff BackoffConfig
}
// NewNucleiScanReconciler creates a new NucleiScanReconciler with default settings
func NewNucleiScanReconciler(client client.Client, scheme *runtime.Scheme, scanner scanner.Scanner) *NucleiScanReconciler {
rescanAge := defaultRescanAge
if envVal := os.Getenv(envRescanAge); envVal != "" {
if parsed, err := time.ParseDuration(envVal); err == nil {
rescanAge = parsed
}
}
backoffInitial := defaultBackoffInitial
if envVal := os.Getenv(envBackoffInitial); envVal != "" {
if parsed, err := time.ParseDuration(envVal); err == nil {
backoffInitial = parsed
}
}
backoffMax := defaultBackoffMax
if envVal := os.Getenv(envBackoffMax); envVal != "" {
if parsed, err := time.ParseDuration(envVal); err == nil {
backoffMax = parsed
}
}
backoffMultiplier := defaultBackoffMultiplier
if envVal := os.Getenv(envBackoffMultiplier); envVal != "" {
if parsed, err := parseFloat(envVal); err == nil && parsed > 0 {
backoffMultiplier = parsed
}
}
return &NucleiScanReconciler{
Client: client,
Scheme: scheme,
Scanner: scanner,
RescanAge: rescanAge,
HTTPClient: &http.Client{
Timeout: 10 * time.Second,
},
Backoff: BackoffConfig{
Initial: backoffInitial,
Max: backoffMax,
Multiplier: backoffMultiplier,
},
}
}
// parseFloat parses a string to float64
func parseFloat(s string) (float64, error) {
var f float64
_, err := fmt.Sscanf(s, "%f", &f)
return f, err
}
// calculateBackoff calculates the next backoff duration based on retry count
func (r *NucleiScanReconciler) calculateBackoff(retryCount int) time.Duration {
if retryCount <= 0 {
return r.Backoff.Initial
}
// Calculate exponential backoff: initial * multiplier^retryCount
backoff := float64(r.Backoff.Initial)
for i := 0; i < retryCount; i++ {
backoff *= r.Backoff.Multiplier
if backoff > float64(r.Backoff.Max) {
return r.Backoff.Max
}
}
return time.Duration(backoff)
}
// +kubebuilder:rbac:groups=nuclei.homelab.mortenolsen.pro,resources=nucleiscans,verbs=get;list;watch;create;update;patch;delete
@@ -118,9 +215,15 @@ func (r *NucleiScanReconciler) Reconcile(ctx context.Context, req ctrl.Request)
case nucleiv1alpha1.ScanPhasePending:
return r.handlePendingPhase(ctx, nucleiScan)
case nucleiv1alpha1.ScanPhaseRunning:
// This shouldn't happen in our synchronous implementation
// but handle it gracefully
return r.handlePendingPhase(ctx, nucleiScan)
// Running phase on startup means the scan was interrupted (operator restart)
// Reset to Pending to re-run the scan
log.Info("Found stale Running scan, resetting to Pending (operator likely restarted)")
nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending
nucleiScan.Status.LastError = "Scan was interrupted due to operator restart, re-queuing"
if err := r.Status().Update(ctx, nucleiScan); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{Requeue: true}, nil
case nucleiv1alpha1.ScanPhaseCompleted:
return r.handleCompletedPhase(ctx, nucleiScan)
case nucleiv1alpha1.ScanPhaseFailed:
@@ -161,7 +264,49 @@ func (r *NucleiScanReconciler) handleDeletion(ctx context.Context, nucleiScan *n
// handlePendingPhase handles the Pending phase - starts a new scan
func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) {
log := logf.FromContext(ctx)
log.Info("Starting scan", "targets", len(nucleiScan.Spec.Targets))
log.Info("Preparing to scan", "targets", len(nucleiScan.Spec.Targets))
// Check if at least one target is available before scanning
availableTargets, unavailableTargets := r.checkTargetsAvailability(ctx, nucleiScan.Spec.Targets)
if len(availableTargets) == 0 {
// Calculate backoff based on retry count
retryCount := nucleiScan.Status.RetryCount
backoffDuration := r.calculateBackoff(retryCount)
log.Info("No targets are available yet, waiting with backoff...",
"unavailable", len(unavailableTargets),
"retryCount", retryCount,
"backoffDuration", backoffDuration)
// Update condition and retry count
now := metav1.Now()
meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{
Type: ConditionTypeReady,
Status: metav1.ConditionFalse,
Reason: "WaitingForTargets",
Message: fmt.Sprintf("Waiting for targets to become available (%d unavailable, retry #%d, next check in %v)", len(unavailableTargets), retryCount+1, backoffDuration),
LastTransitionTime: now,
})
nucleiScan.Status.LastError = fmt.Sprintf("Targets not available: %v", unavailableTargets)
nucleiScan.Status.RetryCount = retryCount + 1
nucleiScan.Status.LastRetryTime = &now
if err := r.Status().Update(ctx, nucleiScan); err != nil {
return ctrl.Result{}, err
}
// Requeue with exponential backoff
return ctrl.Result{RequeueAfter: backoffDuration}, nil
}
// Reset retry count since targets are now available
if nucleiScan.Status.RetryCount > 0 {
log.Info("Targets now available, resetting retry count", "previousRetries", nucleiScan.Status.RetryCount)
nucleiScan.Status.RetryCount = 0
nucleiScan.Status.LastRetryTime = nil
}
log.Info("Starting scan", "availableTargets", len(availableTargets), "unavailableTargets", len(unavailableTargets))
// Update status to Running
now := metav1.Now()
@@ -175,7 +320,7 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
Type: ConditionTypeScanActive,
Status: metav1.ConditionTrue,
Reason: ReasonScanRunning,
Message: "Scan is in progress",
Message: fmt.Sprintf("Scan is in progress (%d targets)", len(availableTargets)),
LastTransitionTime: now,
})
@@ -190,8 +335,8 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
Timeout: 30 * time.Minute, // Default timeout
}
// Execute the scan
result, err := r.Scanner.Scan(ctx, nucleiScan.Spec.Targets, options)
// Execute the scan with available targets only
result, err := r.Scanner.Scan(ctx, availableTargets, options)
if err != nil {
log.Error(err, "Scan failed")
return r.handleScanError(ctx, nucleiScan, err)
@@ -201,6 +346,33 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
return r.handleScanSuccess(ctx, nucleiScan, result)
}
// checkTargetsAvailability checks which targets are reachable
func (r *NucleiScanReconciler) checkTargetsAvailability(ctx context.Context, targets []string) (available []string, unavailable []string) {
log := logf.FromContext(ctx)
for _, target := range targets {
req, err := http.NewRequestWithContext(ctx, http.MethodHead, target, nil)
if err != nil {
log.V(1).Info("Failed to create request for target", "target", target, "error", err)
unavailable = append(unavailable, target)
continue
}
resp, err := r.HTTPClient.Do(req)
if err != nil {
log.V(1).Info("Target not available", "target", target, "error", err)
unavailable = append(unavailable, target)
continue
}
resp.Body.Close()
// Consider any response (even 4xx/5xx) as "available" - the service is responding
available = append(available, target)
}
return available, unavailable
}
// handleScanSuccess updates the status after a successful scan
func (r *NucleiScanReconciler) handleScanSuccess(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan, result *scanner.ScanResult) (ctrl.Result, error) {
log := logf.FromContext(ctx)
@@ -294,6 +466,25 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS
return r.checkScheduledScan(ctx, nucleiScan)
}
// Check if scan results are stale (older than RescanAge)
if nucleiScan.Status.CompletionTime != nil {
age := time.Since(nucleiScan.Status.CompletionTime.Time)
if age > r.RescanAge {
log.Info("Scan results are stale, triggering rescan", "age", age, "maxAge", r.RescanAge)
nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending
nucleiScan.Status.LastError = fmt.Sprintf("Automatic rescan triggered (results were %v old)", age.Round(time.Hour))
if err := r.Status().Update(ctx, nucleiScan); err != nil {
return ctrl.Result{}, err
}
return ctrl.Result{Requeue: true}, nil
}
// Schedule a requeue for when the results will become stale
timeUntilStale := r.RescanAge - age
log.V(1).Info("Scan results still fresh, will check again later", "timeUntilStale", timeUntilStale)
return ctrl.Result{RequeueAfter: timeUntilStale}, nil
}
return ctrl.Result{}, nil
}