feat: implement pod-based scanning architecture

This major refactor moves from synchronous subprocess-based scanning to
asynchronous pod-based scanning using Kubernetes Jobs.

## Architecture Changes
- Scans now run as Kubernetes Jobs with ttlSecondsAfterFinished for automatic cleanup
- Jobs carry owner references so they are garbage-collected when their NucleiScan is deleted (see the sketch after this list)
- Configurable concurrency limits, timeouts, and resource requirements
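
A minimal sketch of how such a Job could be assembled. This is illustrative only: the real construction lives in internal/jobmanager, and the helper name, Job name pattern, and pod spec details are assumptions; the flags, service account default, and BackoffLimit value come from this commit.

```go
// Sketch only; not the actual internal/jobmanager implementation.
package jobmanager

import (
    batchv1 "k8s.io/api/batch/v1"
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
    "k8s.io/apimachinery/pkg/runtime"
    "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil"

    nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
)

// buildScannerJob assembles a scanner Job: the operator image is reused in
// scanner mode, ttlSecondsAfterFinished cleans the Job up after completion,
// and an owner reference ties it to the NucleiScan for garbage collection.
func buildScannerJob(scan *nucleiv1alpha1.NucleiScan, image string, ttl int32, scheme *runtime.Scheme) (*batchv1.Job, error) {
    backoff := int32(2) // matches the operator's default BackoffLimit
    job := &batchv1.Job{
        ObjectMeta: metav1.ObjectMeta{
            Name:      "nuclei-scan-" + scan.Name, // naming scheme assumed
            Namespace: scan.Namespace,
        },
        Spec: batchv1.JobSpec{
            TTLSecondsAfterFinished: &ttl,
            BackoffLimit:            &backoff,
            Template: corev1.PodTemplateSpec{
                Spec: corev1.PodSpec{
                    RestartPolicy:      corev1.RestartPolicyNever,
                    ServiceAccountName: "nuclei-scanner",
                    Containers: []corev1.Container{{
                        Name:  "scanner",
                        Image: image,
                        Args: []string{
                            "--mode=scanner",
                            "--scan-name=" + scan.Name,
                            "--scan-namespace=" + scan.Namespace,
                        },
                    }},
                },
            },
        },
    }
    // Deleting the NucleiScan garbage-collects the Job through this owner reference.
    if err := controllerutil.SetControllerReference(scan, job, scheme); err != nil {
        return nil, err
    }
    return job, nil
}
```

The scanner pod reports results back by updating the NucleiScan status itself, which is why it gets its own, narrower RBAC (see RBAC Updates below).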

## New Features
- Dual-mode binary: --mode=controller (default) or --mode=scanner
- Annotation-based configuration for Ingress/VirtualService resources
- Operator-level configuration via environment variables
- Startup recovery for orphaned scans after operator restart
- Periodic cleanup of stuck jobs

## New Files
- DESIGN.md: Comprehensive architecture design document
- internal/jobmanager/: Job Manager for creating/monitoring scanner jobs
- internal/scanner/runner.go: Scanner mode implementation
- internal/annotations/: Annotation parsing utilities
- charts/nuclei-operator/templates/scanner-rbac.yaml: Scanner RBAC

## API Changes
- Added ScannerConfig struct for per-scan scanner configuration (illustrative field shapes sketched after this list)
- Added JobReference struct for tracking scanner jobs
- Added ScannerConfig field to NucleiScanSpec
- Added JobRef and ScanStartTime fields to NucleiScanStatus
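
Illustrative shapes for the new API types. Only the type names, the new spec/status fields, and JobReference.Name (read during startup recovery) come from this commit; the remaining fields are assumptions and the real definitions in api/v1alpha1 carry validation markers and deepcopy code.

```go
// Sketch of the new API surface; field sets assumed.
package v1alpha1

import (
    corev1 "k8s.io/api/core/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ScannerConfig holds per-scan scanner settings.
type ScannerConfig struct {
    // Image overrides the operator-level scanner image for this scan.
    Image string `json:"image,omitempty"`
    // Timeout bounds a single scanner job run.
    Timeout *metav1.Duration `json:"timeout,omitempty"`
    // Resources sets requests/limits for the scanner pod.
    Resources corev1.ResourceRequirements `json:"resources,omitempty"`
}

// JobReference tracks the scanner Job created for a scan.
type JobReference struct {
    Name      string `json:"name"`
    Namespace string `json:"namespace,omitempty"`
}

// New fields on the existing types (shapes assumed):
//
//   NucleiScanSpec.ScannerConfig   *ScannerConfig // per-scan scanner configuration
//   NucleiScanStatus.JobRef        *JobReference  // reference to the active scanner job
//   NucleiScanStatus.ScanStartTime *metav1.Time   // when the scanner job was launched
```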

## Supported Annotations
These keys are read from annotations on Ingress/VirtualService resources; a parsing sketch follows the list.
- nuclei.homelab.mortenolsen.pro/enabled
- nuclei.homelab.mortenolsen.pro/templates
- nuclei.homelab.mortenolsen.pro/severity
- nuclei.homelab.mortenolsen.pro/schedule
- nuclei.homelab.mortenolsen.pro/timeout
- nuclei.homelab.mortenolsen.pro/scanner-image

## RBAC Updates
- Added Job and Pod permissions for the operator
- Created a separate scanner service account with minimal permissions (sketched after this list)
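
A sketch of the scanner's minimal RBAC, expressed in Go purely for illustration; the chart ships it as charts/nuclei-operator/templates/scanner-rbac.yaml. The API group and exact verbs are assumptions; the intent (read the scan, write results via the status subresource) follows from the recovery logic in the diff below.

```go
// Illustrative only; the shipped artifact is the Helm-templated Role/RoleBinding.
package scannerrbac

import (
    rbacv1 "k8s.io/api/rbac/v1"
    metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// scannerRole grants the scanner pod only what it needs: read its NucleiScan
// and report results back through the status subresource.
func scannerRole(namespace string) *rbacv1.Role {
    return &rbacv1.Role{
        ObjectMeta: metav1.ObjectMeta{Name: "nuclei-scanner", Namespace: namespace},
        Rules: []rbacv1.PolicyRule{
            {
                APIGroups: []string{"nuclei.homelab.mortenolsen.pro"}, // CRD group assumed
                Resources: []string{"nucleiscans"},
                Verbs:     []string{"get", "list", "watch"},
            },
            {
                APIGroups: []string{"nuclei.homelab.mortenolsen.pro"}, // CRD group assumed
                Resources: []string{"nucleiscans/status"},
                Verbs:     []string{"get", "update", "patch"},
            },
        },
    }
}
```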

## Documentation
- Updated README, user-guide, api.md, and Helm chart README
- Added example annotated Ingress resources
commit 12d681ada1 (parent 519ed32de3)
Author: Morten Olsen
Date:   2025-12-12 20:51:23 +01:00
22 changed files with 3060 additions and 245 deletions


@@ -17,9 +17,14 @@ limitations under the License.
package main

import (
    "context"
    "crypto/tls"
    "flag"
    "fmt"
    "os"
    "strconv"
    "strings"
    "time"

    // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.)
    // to ensure that exec-entrypoint and run can make use of them.
@@ -27,10 +32,12 @@ import (
    istionetworkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1"
    networkingv1 "k8s.io/api/networking/v1"
    apierrors "k8s.io/apimachinery/pkg/api/errors"
    "k8s.io/apimachinery/pkg/runtime"
    utilruntime "k8s.io/apimachinery/pkg/util/runtime"
    clientgoscheme "k8s.io/client-go/kubernetes/scheme"
    ctrl "sigs.k8s.io/controller-runtime"
    "sigs.k8s.io/controller-runtime/pkg/client"
    "sigs.k8s.io/controller-runtime/pkg/healthz"
    "sigs.k8s.io/controller-runtime/pkg/log/zap"
    "sigs.k8s.io/controller-runtime/pkg/metrics/filters"
@@ -39,6 +46,7 @@ import (
    nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
    "github.com/mortenolsen/nuclei-operator/internal/controller"
    "github.com/mortenolsen/nuclei-operator/internal/jobmanager"
    "github.com/mortenolsen/nuclei-operator/internal/scanner"
    // +kubebuilder:scaffold:imports
)
@@ -67,6 +75,15 @@ func main() {
    var secureMetrics bool
    var enableHTTP2 bool
    var tlsOpts []func(*tls.Config)

    // Scanner mode flags
    var mode string
    var scanName string
    var scanNamespace string
    flag.StringVar(&mode, "mode", "controller", "Run mode: 'controller' or 'scanner'")
    flag.StringVar(&scanName, "scan-name", "", "Name of the NucleiScan to execute (scanner mode only)")
    flag.StringVar(&scanNamespace, "scan-namespace", "", "Namespace of the NucleiScan (scanner mode only)")

    flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+
        "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.")
    flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.")
@@ -92,6 +109,15 @@ func main() {
    ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts)))

    // Check if running in scanner mode
    if mode == "scanner" {
        if err := scanner.RunScannerMode(scanName, scanNamespace); err != nil {
            setupLog.Error(err, "Scanner mode failed")
            os.Exit(1)
        }
        os.Exit(0)
    }

    // if the enable-http2 flag is false (the default), http/2 should be disabled
    // due to its vulnerabilities. More specifically, disabling http/2 will
    // prevent from being vulnerable to the HTTP/2 Stream Cancellation and
@@ -183,10 +209,103 @@ func main() {
        os.Exit(1)
    }

    // Parse environment variables for JobManager configuration
    scannerImage := os.Getenv("SCANNER_IMAGE")
    if scannerImage == "" {
        scannerImage = jobmanager.DefaultScannerImage
    }

    scannerTimeout := 30 * time.Minute
    if v := os.Getenv("SCANNER_TIMEOUT"); v != "" {
        if d, err := time.ParseDuration(v); err == nil {
            scannerTimeout = d
        }
    }

    maxConcurrentScans := 5
    if v := os.Getenv("MAX_CONCURRENT_SCANS"); v != "" {
        if n, err := strconv.Atoi(v); err == nil {
            maxConcurrentScans = n
        }
    }

    ttlAfterFinished := int32(3600)
    if v := os.Getenv("JOB_TTL_AFTER_FINISHED"); v != "" {
        if n, err := strconv.Atoi(v); err == nil {
            ttlAfterFinished = int32(n)
        }
    }

    scannerServiceAccount := os.Getenv("SCANNER_SERVICE_ACCOUNT")
    if scannerServiceAccount == "" {
        scannerServiceAccount = "nuclei-scanner"
    }

    defaultTemplates := []string{}
    if v := os.Getenv("DEFAULT_TEMPLATES"); v != "" {
        defaultTemplates = strings.Split(v, ",")
    }

    defaultSeverity := []string{}
    if v := os.Getenv("DEFAULT_SEVERITY"); v != "" {
        defaultSeverity = strings.Split(v, ",")
    }

    // Create the JobManager configuration
    jobManagerConfig := jobmanager.Config{
        ScannerImage:       scannerImage,
        DefaultTimeout:     scannerTimeout,
        TTLAfterFinished:   ttlAfterFinished,
        BackoffLimit:       2,
        MaxConcurrent:      maxConcurrentScans,
        ServiceAccountName: scannerServiceAccount,
        DefaultResources:   jobmanager.DefaultConfig().DefaultResources,
        DefaultTemplates:   defaultTemplates,
        DefaultSeverity:    defaultSeverity,
    }

    // Create the JobManager for scanner job management
    jobMgr := jobmanager.NewJobManager(
        mgr.GetClient(),
        mgr.GetScheme(),
        jobManagerConfig,
    )

    // Run startup recovery to handle orphaned scans from previous operator instance
    setupLog.Info("Running startup recovery")
    if err := runStartupRecovery(mgr.GetClient(), jobMgr); err != nil {
        setupLog.Error(err, "Startup recovery failed")
        // Don't exit - continue with normal operation
    }

    // Create a context that will be cancelled when the manager stops
    ctx, cancel := context.WithCancel(context.Background())
    defer cancel()

    // Start periodic cleanup goroutine
    go func() {
        ticker := time.NewTicker(5 * time.Minute)
        defer ticker.Stop()
        for {
            select {
            case <-ticker.C:
                if err := jobMgr.CleanupOrphanedJobs(ctx); err != nil {
                    setupLog.Error(err, "Periodic cleanup failed")
                }
            case <-ctx.Done():
                return
            }
        }
    }()

    // Create the NucleiScan reconciler with JobManager
    if err := controller.NewNucleiScanReconciler(
        mgr.GetClient(),
        mgr.GetScheme(),
        scanner.NewNucleiScannerWithDefaults(),
        mgr.GetEventRecorderFor("nucleiscan-controller"),
        jobMgr,
        controller.DefaultReconcilerConfig(),
    ).SetupWithManager(mgr); err != nil {
        setupLog.Error(err, "unable to create controller", "controller", "NucleiScan")
        os.Exit(1)
@@ -222,3 +341,72 @@ func main() {
        os.Exit(1)
    }
}

// runStartupRecovery handles orphaned scans and jobs from previous operator instance
func runStartupRecovery(c client.Client, jobMgr *jobmanager.JobManager) error {
    ctx := context.Background()

    // List all NucleiScans in Running phase
    scanList := &nucleiv1alpha1.NucleiScanList{}
    if err := c.List(ctx, scanList); err != nil {
        return fmt.Errorf("failed to list NucleiScans: %w", err)
    }

    for _, scan := range scanList.Items {
        if scan.Status.Phase != nucleiv1alpha1.ScanPhaseRunning {
            continue
        }

        // Check if the referenced job still exists
        if scan.Status.JobRef != nil {
            job, err := jobMgr.GetJob(ctx, scan.Status.JobRef.Name, scan.Namespace)
            if err != nil {
                if apierrors.IsNotFound(err) {
                    // Job is gone - reset scan to Pending
                    scan.Status.Phase = nucleiv1alpha1.ScanPhasePending
                    scan.Status.LastError = "Recovered from operator restart - job not found"
                    scan.Status.JobRef = nil
                    if updateErr := c.Status().Update(ctx, &scan); updateErr != nil {
                        return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr)
                    }
                    continue
                }
                return fmt.Errorf("failed to get job for scan %s: %w", scan.Name, err)
            }

            // Job exists - check if it's completed but status wasn't updated
            if jobMgr.IsJobComplete(job) {
                // The scanner pod should have updated the status
                // If it didn't, mark as failed
                if scan.Status.Phase == nucleiv1alpha1.ScanPhaseRunning {
                    if jobMgr.IsJobFailed(job) {
                        scan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed
                        scan.Status.LastError = "Job completed during operator downtime: " + jobMgr.GetJobFailureReason(job)
                    } else {
                        // Job succeeded but status wasn't updated - this shouldn't happen
                        // but handle it gracefully
                        scan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed
                        scan.Status.LastError = "Job completed during operator downtime but status was not updated"
                    }
                    if updateErr := c.Status().Update(ctx, &scan); updateErr != nil {
                        return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr)
                    }
                }
            }
        } else {
            // No job reference but Running - invalid state
            scan.Status.Phase = nucleiv1alpha1.ScanPhasePending
            scan.Status.LastError = "Recovered from invalid state - no job reference"
            if updateErr := c.Status().Update(ctx, &scan); updateErr != nil {
                return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr)
            }
        }
    }

    // Clean up orphaned jobs
    if err := jobMgr.CleanupOrphanedJobs(ctx); err != nil {
        return fmt.Errorf("failed to cleanup orphaned jobs: %w", err)
    }

    return nil
}