mirror of
https://github.com/morten-olsen/homelab-nuclei-operator.git
synced 2026-02-08 02:16:23 +01:00
feat: support rescans and backoffs
This commit is contained in:
35
README.md
35
README.md
@@ -14,6 +14,9 @@ The Nuclei Operator watches for Ingress and VirtualService resources in your Kub
|
||||
- **Automatic Discovery**: Watches Kubernetes Ingress and Istio VirtualService resources for new endpoints
|
||||
- **Automated Scanning**: Automatically creates and runs Nuclei scans when new endpoints are discovered
|
||||
- **Scheduled Scans**: Support for cron-based scheduled rescanning
|
||||
- **Automatic Rescans**: Automatically rescans when results become stale (configurable age threshold)
|
||||
- **Target Availability Checking**: Waits for targets to be available before scanning
|
||||
- **Stale Scan Recovery**: Automatically resets interrupted scans on operator restart
|
||||
- **Flexible Configuration**: Configurable templates, severity filters, and scan options
|
||||
- **Native Kubernetes Integration**: Results stored as Kubernetes custom resources
|
||||
- **Owner References**: Automatic cleanup when source resources are deleted
|
||||
@@ -167,6 +170,38 @@ The operator can be configured using the following environment variables:
|
||||
| `NUCLEI_BINARY_PATH` | Path to the Nuclei binary | `nuclei` |
|
||||
| `NUCLEI_TEMPLATES_PATH` | Path to Nuclei templates directory | (uses Nuclei default) |
|
||||
| `NUCLEI_TIMEOUT` | Default scan timeout | `30m` |
|
||||
| `NUCLEI_RESCAN_AGE` | Maximum age of scan results before automatic rescan | `168h` (1 week) |
|
||||
| `NUCLEI_BACKOFF_INITIAL` | Initial retry interval for target availability checks | `10s` |
|
||||
| `NUCLEI_BACKOFF_MAX` | Maximum retry interval for target availability checks | `10m` |
|
||||
| `NUCLEI_BACKOFF_MULTIPLIER` | Multiplier for exponential backoff | `2.0` |
|
||||
|
||||
### Automatic Rescan Behavior
|
||||
|
||||
The operator automatically rescans targets when:
|
||||
|
||||
1. **Stale Results**: Scan results are older than `NUCLEI_RESCAN_AGE` (default: 1 week)
|
||||
2. **Operator Restart**: Any scans that were in "Running" state when the operator restarted are automatically re-queued
|
||||
3. **Spec Changes**: When the NucleiScan spec is modified
|
||||
|
||||
### Target Availability with Exponential Backoff
|
||||
|
||||
Before running a scan, the operator checks if targets are reachable:
|
||||
|
||||
- Uses HTTP HEAD requests to verify target availability
|
||||
- If no targets are available, the scan waits and retries with **exponential backoff**
|
||||
- Backoff sequence with defaults: 10s → 20s → 40s → 80s → 160s → 320s → 600s (max)
|
||||
- Scans proceed with available targets even if some are unreachable
|
||||
- Any HTTP response (including 4xx/5xx) is considered "available" - the service is responding
|
||||
- Retry count is tracked in `status.retryCount` and reset when targets become available
|
||||
|
||||
**Backoff Configuration Example:**
|
||||
|
||||
```bash
|
||||
# Set initial retry to 5 seconds, max to 5 minutes, multiplier to 1.5
|
||||
export NUCLEI_BACKOFF_INITIAL=5s
|
||||
export NUCLEI_BACKOFF_MAX=5m
|
||||
export NUCLEI_BACKOFF_MULTIPLIER=1.5
|
||||
```
|
||||
|
||||
### NucleiScan Spec Options
|
||||
|
||||
|
||||
@@ -191,6 +191,15 @@ type NucleiScanStatus struct {
|
||||
// ObservedGeneration is the generation observed by the controller
|
||||
// +optional
|
||||
ObservedGeneration int64 `json:"observedGeneration,omitempty"`
|
||||
|
||||
// RetryCount tracks the number of consecutive availability check retries
|
||||
// Used for exponential backoff when waiting for targets
|
||||
// +optional
|
||||
RetryCount int `json:"retryCount,omitempty"`
|
||||
|
||||
// LastRetryTime is when the last availability check retry occurred
|
||||
// +optional
|
||||
LastRetryTime *metav1.Time `json:"lastRetryTime,omitempty"`
|
||||
}
|
||||
|
||||
// +kubebuilder:object:root=true
|
||||
|
||||
@@ -185,6 +185,10 @@ func (in *NucleiScanStatus) DeepCopyInto(out *NucleiScanStatus) {
|
||||
(*in)[i].DeepCopyInto(&(*out)[i])
|
||||
}
|
||||
}
|
||||
if in.LastRetryTime != nil {
|
||||
in, out := &in.LastRetryTime, &out.LastRetryTime
|
||||
*out = (*in).DeepCopy()
|
||||
}
|
||||
}
|
||||
|
||||
// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NucleiScanStatus.
|
||||
|
||||
10
cmd/main.go
10
cmd/main.go
@@ -183,11 +183,11 @@ func main() {
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
if err := (&controller.NucleiScanReconciler{
|
||||
Client: mgr.GetClient(),
|
||||
Scheme: mgr.GetScheme(),
|
||||
Scanner: scanner.NewNucleiScannerWithDefaults(),
|
||||
}).SetupWithManager(mgr); err != nil {
|
||||
if err := controller.NewNucleiScanReconciler(
|
||||
mgr.GetClient(),
|
||||
mgr.GetScheme(),
|
||||
scanner.NewNucleiScannerWithDefaults(),
|
||||
).SetupWithManager(mgr); err != nil {
|
||||
setupLog.Error(err, "unable to create controller", "controller", "NucleiScan")
|
||||
os.Exit(1)
|
||||
}
|
||||
|
||||
@@ -252,6 +252,11 @@ spec:
|
||||
lastError:
|
||||
description: LastError contains the error message if the scan failed
|
||||
type: string
|
||||
lastRetryTime:
|
||||
description: LastRetryTime is when the last availability check retry
|
||||
occurred
|
||||
format: date-time
|
||||
type: string
|
||||
lastScanTime:
|
||||
description: LastScanTime is when the last scan was initiated
|
||||
format: date-time
|
||||
@@ -274,6 +279,11 @@ spec:
|
||||
- Completed
|
||||
- Failed
|
||||
type: string
|
||||
retryCount:
|
||||
description: |-
|
||||
RetryCount tracks the number of consecutive availability check retries
|
||||
Used for exponential backoff when waiting for targets
|
||||
type: integer
|
||||
summary:
|
||||
description: Summary provides aggregated scan statistics
|
||||
properties:
|
||||
|
||||
@@ -73,6 +73,21 @@ spec:
|
||||
value: "/nuclei-templates"
|
||||
- name: NUCLEI_TIMEOUT
|
||||
value: "30m"
|
||||
# NUCLEI_RESCAN_AGE controls how old scan results can be before
|
||||
# triggering an automatic rescan. Default is 168h (1 week).
|
||||
# Set to "0" to disable automatic rescans based on age.
|
||||
- name: NUCLEI_RESCAN_AGE
|
||||
value: "168h"
|
||||
# Backoff configuration for target availability checks
|
||||
# NUCLEI_BACKOFF_INITIAL: Initial retry interval (default: 10s)
|
||||
- name: NUCLEI_BACKOFF_INITIAL
|
||||
value: "10s"
|
||||
# NUCLEI_BACKOFF_MAX: Maximum retry interval (default: 10m)
|
||||
- name: NUCLEI_BACKOFF_MAX
|
||||
value: "10m"
|
||||
# NUCLEI_BACKOFF_MULTIPLIER: Multiplier for exponential backoff (default: 2.0)
|
||||
- name: NUCLEI_BACKOFF_MULTIPLIER
|
||||
value: "2.0"
|
||||
securityContext:
|
||||
readOnlyRootFilesystem: false # Nuclei needs to write temporary files
|
||||
allowPrivilegeEscalation: false
|
||||
|
||||
@@ -19,6 +19,8 @@ package controller
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"os"
"strconv"
"strings"
|
||||
"time"
|
||||
|
||||
"k8s.io/apimachinery/pkg/api/meta"
|
||||
@@ -41,6 +43,20 @@ const (
|
||||
defaultRequeueAfter = 30 * time.Second
|
||||
defaultScheduleRequeue = 1 * time.Minute
|
||||
defaultErrorRequeueAfter = 1 * time.Minute
|
||||
|
||||
// Default rescan age (1 week)
|
||||
defaultRescanAge = 7 * 24 * time.Hour
|
||||
|
||||
// Default backoff settings for target availability checks
|
||||
defaultBackoffInitial = 10 * time.Second // Initial retry interval
|
||||
defaultBackoffMax = 10 * time.Minute // Maximum retry interval
|
||||
defaultBackoffMultiplier = 2.0 // Multiplier for exponential backoff
|
||||
|
||||
// Environment variables
|
||||
envRescanAge = "NUCLEI_RESCAN_AGE"
|
||||
envBackoffInitial = "NUCLEI_BACKOFF_INITIAL"
|
||||
envBackoffMax = "NUCLEI_BACKOFF_MAX"
|
||||
envBackoffMultiplier = "NUCLEI_BACKOFF_MULTIPLIER"
|
||||
)
|
||||
|
||||
// Condition types for NucleiScan
|
||||
@@ -58,11 +74,92 @@ const (
|
||||
ReasonScanSuspended = "ScanSuspended"
|
||||
)
|
||||
|
||||
// BackoffConfig groups the tunables for the exponential backoff that is
// applied while waiting for scan targets to become reachable.
type BackoffConfig struct {
	// Initial is the delay used before the first retry.
	Initial time.Duration

	// Max is the upper bound for the delay between retries.
	Max time.Duration

	// Multiplier is the factor the delay is multiplied by after each retry.
	Multiplier float64
}
|
||||
|
||||
// NucleiScanReconciler reconciles a NucleiScan object
|
||||
type NucleiScanReconciler struct {
|
||||
client.Client
|
||||
Scheme *runtime.Scheme
|
||||
Scanner scanner.Scanner
|
||||
RescanAge time.Duration
|
||||
HTTPClient *http.Client
|
||||
Backoff BackoffConfig
|
||||
}
|
||||
|
||||
// NewNucleiScanReconciler creates a new NucleiScanReconciler with default settings
|
||||
func NewNucleiScanReconciler(client client.Client, scheme *runtime.Scheme, scanner scanner.Scanner) *NucleiScanReconciler {
|
||||
rescanAge := defaultRescanAge
|
||||
if envVal := os.Getenv(envRescanAge); envVal != "" {
|
||||
if parsed, err := time.ParseDuration(envVal); err == nil {
|
||||
rescanAge = parsed
|
||||
}
|
||||
}
|
||||
|
||||
backoffInitial := defaultBackoffInitial
|
||||
if envVal := os.Getenv(envBackoffInitial); envVal != "" {
|
||||
if parsed, err := time.ParseDuration(envVal); err == nil {
|
||||
backoffInitial = parsed
|
||||
}
|
||||
}
|
||||
|
||||
backoffMax := defaultBackoffMax
|
||||
if envVal := os.Getenv(envBackoffMax); envVal != "" {
|
||||
if parsed, err := time.ParseDuration(envVal); err == nil {
|
||||
backoffMax = parsed
|
||||
}
|
||||
}
|
||||
|
||||
backoffMultiplier := defaultBackoffMultiplier
|
||||
if envVal := os.Getenv(envBackoffMultiplier); envVal != "" {
|
||||
if parsed, err := parseFloat(envVal); err == nil && parsed > 0 {
|
||||
backoffMultiplier = parsed
|
||||
}
|
||||
}
|
||||
|
||||
return &NucleiScanReconciler{
|
||||
Client: client,
|
||||
Scheme: scheme,
|
||||
Scanner: scanner,
|
||||
RescanAge: rescanAge,
|
||||
HTTPClient: &http.Client{
|
||||
Timeout: 10 * time.Second,
|
||||
},
|
||||
Backoff: BackoffConfig{
|
||||
Initial: backoffInitial,
|
||||
Max: backoffMax,
|
||||
Multiplier: backoffMultiplier,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// parseFloat parses s as a 64-bit floating-point number.
//
// Leading and trailing whitespace is ignored, but the remainder must be a
// complete number: input with trailing garbage such as "2.0x" is rejected
// with an error instead of being silently read as 2.0. On error the returned
// value is 0.
func parseFloat(s string) (float64, error) {
	f, err := strconv.ParseFloat(strings.TrimSpace(s), 64)
	if err != nil {
		return 0, fmt.Errorf("parsing %q as float: %w", s, err)
	}
	return f, nil
}
|
||||
|
||||
// calculateBackoff calculates the next backoff duration based on retry count
|
||||
func (r *NucleiScanReconciler) calculateBackoff(retryCount int) time.Duration {
|
||||
if retryCount <= 0 {
|
||||
return r.Backoff.Initial
|
||||
}
|
||||
|
||||
// Calculate exponential backoff: initial * multiplier^retryCount
|
||||
backoff := float64(r.Backoff.Initial)
|
||||
for i := 0; i < retryCount; i++ {
|
||||
backoff *= r.Backoff.Multiplier
|
||||
if backoff > float64(r.Backoff.Max) {
|
||||
return r.Backoff.Max
|
||||
}
|
||||
}
|
||||
|
||||
return time.Duration(backoff)
|
||||
}
|
||||
|
||||
// +kubebuilder:rbac:groups=nuclei.homelab.mortenolsen.pro,resources=nucleiscans,verbs=get;list;watch;create;update;patch;delete
|
||||
@@ -118,9 +215,15 @@ func (r *NucleiScanReconciler) Reconcile(ctx context.Context, req ctrl.Request)
|
||||
case nucleiv1alpha1.ScanPhasePending:
|
||||
return r.handlePendingPhase(ctx, nucleiScan)
|
||||
case nucleiv1alpha1.ScanPhaseRunning:
|
||||
// This shouldn't happen in our synchronous implementation
|
||||
// but handle it gracefully
|
||||
return r.handlePendingPhase(ctx, nucleiScan)
|
||||
// Running phase on startup means the scan was interrupted (operator restart)
|
||||
// Reset to Pending to re-run the scan
|
||||
log.Info("Found stale Running scan, resetting to Pending (operator likely restarted)")
|
||||
nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending
|
||||
nucleiScan.Status.LastError = "Scan was interrupted due to operator restart, re-queuing"
|
||||
if err := r.Status().Update(ctx, nucleiScan); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
case nucleiv1alpha1.ScanPhaseCompleted:
|
||||
return r.handleCompletedPhase(ctx, nucleiScan)
|
||||
case nucleiv1alpha1.ScanPhaseFailed:
|
||||
@@ -161,7 +264,49 @@ func (r *NucleiScanReconciler) handleDeletion(ctx context.Context, nucleiScan *n
|
||||
// handlePendingPhase handles the Pending phase - starts a new scan
|
||||
func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) {
|
||||
log := logf.FromContext(ctx)
|
||||
log.Info("Starting scan", "targets", len(nucleiScan.Spec.Targets))
|
||||
log.Info("Preparing to scan", "targets", len(nucleiScan.Spec.Targets))
|
||||
|
||||
// Check if at least one target is available before scanning
|
||||
availableTargets, unavailableTargets := r.checkTargetsAvailability(ctx, nucleiScan.Spec.Targets)
|
||||
if len(availableTargets) == 0 {
|
||||
// Calculate backoff based on retry count
|
||||
retryCount := nucleiScan.Status.RetryCount
|
||||
backoffDuration := r.calculateBackoff(retryCount)
|
||||
|
||||
log.Info("No targets are available yet, waiting with backoff...",
|
||||
"unavailable", len(unavailableTargets),
|
||||
"retryCount", retryCount,
|
||||
"backoffDuration", backoffDuration)
|
||||
|
||||
// Update condition and retry count
|
||||
now := metav1.Now()
|
||||
meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{
|
||||
Type: ConditionTypeReady,
|
||||
Status: metav1.ConditionFalse,
|
||||
Reason: "WaitingForTargets",
|
||||
Message: fmt.Sprintf("Waiting for targets to become available (%d unavailable, retry #%d, next check in %v)", len(unavailableTargets), retryCount+1, backoffDuration),
|
||||
LastTransitionTime: now,
|
||||
})
|
||||
nucleiScan.Status.LastError = fmt.Sprintf("Targets not available: %v", unavailableTargets)
|
||||
nucleiScan.Status.RetryCount = retryCount + 1
|
||||
nucleiScan.Status.LastRetryTime = &now
|
||||
|
||||
if err := r.Status().Update(ctx, nucleiScan); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
|
||||
// Requeue with exponential backoff
|
||||
return ctrl.Result{RequeueAfter: backoffDuration}, nil
|
||||
}
|
||||
|
||||
// Reset retry count since targets are now available
|
||||
if nucleiScan.Status.RetryCount > 0 {
|
||||
log.Info("Targets now available, resetting retry count", "previousRetries", nucleiScan.Status.RetryCount)
|
||||
nucleiScan.Status.RetryCount = 0
|
||||
nucleiScan.Status.LastRetryTime = nil
|
||||
}
|
||||
|
||||
log.Info("Starting scan", "availableTargets", len(availableTargets), "unavailableTargets", len(unavailableTargets))
|
||||
|
||||
// Update status to Running
|
||||
now := metav1.Now()
|
||||
@@ -175,7 +320,7 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
|
||||
Type: ConditionTypeScanActive,
|
||||
Status: metav1.ConditionTrue,
|
||||
Reason: ReasonScanRunning,
|
||||
Message: "Scan is in progress",
|
||||
Message: fmt.Sprintf("Scan is in progress (%d targets)", len(availableTargets)),
|
||||
LastTransitionTime: now,
|
||||
})
|
||||
|
||||
@@ -190,8 +335,8 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
|
||||
Timeout: 30 * time.Minute, // Default timeout
|
||||
}
|
||||
|
||||
// Execute the scan
|
||||
result, err := r.Scanner.Scan(ctx, nucleiScan.Spec.Targets, options)
|
||||
// Execute the scan with available targets only
|
||||
result, err := r.Scanner.Scan(ctx, availableTargets, options)
|
||||
if err != nil {
|
||||
log.Error(err, "Scan failed")
|
||||
return r.handleScanError(ctx, nucleiScan, err)
|
||||
@@ -201,6 +346,33 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca
|
||||
return r.handleScanSuccess(ctx, nucleiScan, result)
|
||||
}
|
||||
|
||||
// checkTargetsAvailability checks which targets are reachable
|
||||
func (r *NucleiScanReconciler) checkTargetsAvailability(ctx context.Context, targets []string) (available []string, unavailable []string) {
|
||||
log := logf.FromContext(ctx)
|
||||
|
||||
for _, target := range targets {
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodHead, target, nil)
|
||||
if err != nil {
|
||||
log.V(1).Info("Failed to create request for target", "target", target, "error", err)
|
||||
unavailable = append(unavailable, target)
|
||||
continue
|
||||
}
|
||||
|
||||
resp, err := r.HTTPClient.Do(req)
|
||||
if err != nil {
|
||||
log.V(1).Info("Target not available", "target", target, "error", err)
|
||||
unavailable = append(unavailable, target)
|
||||
continue
|
||||
}
|
||||
resp.Body.Close()
|
||||
|
||||
// Consider any response (even 4xx/5xx) as "available" - the service is responding
|
||||
available = append(available, target)
|
||||
}
|
||||
|
||||
return available, unavailable
|
||||
}
|
||||
|
||||
// handleScanSuccess updates the status after a successful scan
|
||||
func (r *NucleiScanReconciler) handleScanSuccess(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan, result *scanner.ScanResult) (ctrl.Result, error) {
|
||||
log := logf.FromContext(ctx)
|
||||
@@ -294,6 +466,25 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS
|
||||
return r.checkScheduledScan(ctx, nucleiScan)
|
||||
}
|
||||
|
||||
// Check if scan results are stale (older than RescanAge)
|
||||
if nucleiScan.Status.CompletionTime != nil {
|
||||
age := time.Since(nucleiScan.Status.CompletionTime.Time)
|
||||
if age > r.RescanAge {
|
||||
log.Info("Scan results are stale, triggering rescan", "age", age, "maxAge", r.RescanAge)
|
||||
nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending
|
||||
nucleiScan.Status.LastError = fmt.Sprintf("Automatic rescan triggered (results were %v old)", age.Round(time.Hour))
|
||||
if err := r.Status().Update(ctx, nucleiScan); err != nil {
|
||||
return ctrl.Result{}, err
|
||||
}
|
||||
return ctrl.Result{Requeue: true}, nil
|
||||
}
|
||||
|
||||
// Schedule a requeue for when the results will become stale
|
||||
timeUntilStale := r.RescanAge - age
|
||||
log.V(1).Info("Scan results still fresh, will check again later", "timeUntilStale", timeUntilStale)
|
||||
return ctrl.Result{RequeueAfter: timeUntilStale}, nil
|
||||
}
|
||||
|
||||
return ctrl.Result{}, nil
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user