From 12d681ada18509b0b30322363f3d27a5d2095d0d Mon Sep 17 00:00:00 2001 From: Morten Olsen Date: Fri, 12 Dec 2025 20:51:23 +0100 Subject: [PATCH] feat: implement pod-based scanning architecture This major refactor moves from synchronous subprocess-based scanning to asynchronous pod-based scanning using Kubernetes Jobs. ## Architecture Changes - Scanner jobs are now Kubernetes Jobs with TTLAfterFinished for automatic cleanup - Jobs have owner references for garbage collection when NucleiScan is deleted - Configurable concurrency limits, timeouts, and resource requirements ## New Features - Dual-mode binary: --mode=controller (default) or --mode=scanner - Annotation-based configuration for Ingress/VirtualService resources - Operator-level configuration via environment variables - Startup recovery for orphaned scans after operator restart - Periodic cleanup of stuck jobs ## New Files - DESIGN.md: Comprehensive architecture design document - internal/jobmanager/: Job Manager for creating/monitoring scanner jobs - internal/scanner/runner.go: Scanner mode implementation - internal/annotations/: Annotation parsing utilities - charts/nuclei-operator/templates/scanner-rbac.yaml: Scanner RBAC ## API Changes - Added ScannerConfig struct for per-scan scanner configuration - Added JobReference struct for tracking scanner jobs - Added ScannerConfig field to NucleiScanSpec - Added JobRef and ScanStartTime fields to NucleiScanStatus ## Supported Annotations - nuclei.homelab.mortenolsen.pro/enabled - nuclei.homelab.mortenolsen.pro/templates - nuclei.homelab.mortenolsen.pro/severity - nuclei.homelab.mortenolsen.pro/schedule - nuclei.homelab.mortenolsen.pro/timeout - nuclei.homelab.mortenolsen.pro/scanner-image ## RBAC Updates - Added Job and Pod permissions for operator - Created separate scanner service account with minimal permissions ## Documentation - Updated README, user-guide, api.md, and Helm chart README - Added example annotated Ingress resources --- DESIGN.md | 587 ++++++++++++++++++ README.md | 18 +- api/v1alpha1/nucleiscan_types.go | 57 ++ api/v1alpha1/zz_generated.deepcopy.go | 82 ++- charts/nuclei-operator/README.md | 86 +++ .../nuclei-operator/templates/deployment.yaml | 18 + charts/nuclei-operator/templates/rbac.yaml | 26 + .../templates/scanner-rbac.yaml | 51 ++ charts/nuclei-operator/values.yaml | 34 +- cmd/main.go | 190 +++++- ...i.homelab.mortenolsen.pro_nucleiscans.yaml | 146 +++++ config/rbac/role.yaml | 20 + config/samples/example-ingress.yaml | 94 ++- docs/api.md | 100 +++ docs/user-guide.md | 224 ++++++- internal/annotations/annotations.go | 211 +++++++ internal/controller/ingress_controller.go | 77 ++- internal/controller/nucleiscan_controller.go | 446 +++++++------ .../controller/virtualservice_controller.go | 77 ++- internal/jobmanager/jobmanager.go | 427 +++++++++++++ internal/jobmanager/jobmanager_test.go | 117 ++++ internal/scanner/runner.go | 217 +++++++ 22 files changed, 3060 insertions(+), 245 deletions(-) create mode 100644 DESIGN.md create mode 100644 charts/nuclei-operator/templates/scanner-rbac.yaml create mode 100644 internal/annotations/annotations.go create mode 100644 internal/jobmanager/jobmanager.go create mode 100644 internal/jobmanager/jobmanager_test.go create mode 100644 internal/scanner/runner.go diff --git a/DESIGN.md b/DESIGN.md new file mode 100644 index 0000000..50f476d --- /dev/null +++ b/DESIGN.md @@ -0,0 +1,587 @@ +# Pod-Based Scanning Architecture Design + +## Executive Summary + +This document describes the new architecture for the nuclei-operator 
that moves from synchronous subprocess-based scanning to asynchronous pod-based scanning. This change improves scalability, reliability, and operational flexibility while maintaining backward compatibility. + +## 1. Architecture Overview + +### 1.1 Current State Problems + +The current implementation has several limitations: + +1. **Blocking Reconcile Loop**: Scans execute synchronously within the operator pod, blocking the reconcile loop for up to 30 minutes +2. **Single Point of Failure**: All scans run in the operator pod - if it restarts, running scans are lost +3. **Resource Contention**: Multiple concurrent scans compete for operator pod resources +4. **No Horizontal Scaling**: Cannot distribute scan workload across multiple pods +5. **Limited Configuration**: No annotation-based configuration for individual Ingress/VirtualService resources + +### 1.2 New Architecture + +``` +┌─────────────────────────────────────────────────────────────────────────────┐ +│ KUBERNETES CLUSTER │ +├─────────────────────────────────────────────────────────────────────────────┤ +│ │ +│ ┌──────────────┐ ┌──────────────────┐ ┌─────────────────────────┐ │ +│ │ Ingress │───▶│ IngressReconciler │───▶│ │ │ +│ └──────────────┘ └──────────────────┘ │ │ │ +│ │ │ NucleiScan CRD │ │ +│ ┌──────────────┐ ┌──────────────────┐ │ │ │ +│ │VirtualService│───▶│ VSReconciler │───▶│ spec: │ │ +│ └──────────────┘ └──────────────────┘ │ sourceRef │ │ +│ │ │ targets[] │ │ +│ │ │ templates[] │ │ +│ ▼ │ severity[] │ │ +│ ┌──────────────────┐ │ schedule │ │ +│ │ Owner Reference │ │ status: │ │ +│ │ (GC on delete) │ │ phase │ │ +│ └──────────────────┘ │ findings[] │ │ +│ │ summary │ │ +│ │ jobRef │ │ +│ └───────────┬─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────┐ │ +│ │ NucleiScanReconciler │ │ +│ │ │ │ +│ │ 1. Check phase │ │ +│ │ 2. Create/monitor Job │ │ +│ │ 3. Handle completion │ │ +│ └───────────┬─────────────┘ │ +│ │ │ +│ ▼ │ +│ ┌─────────────────────────┐ │ +│ │ Scanner Jobs │ │ +│ │ (Kubernetes Jobs) │ │ +│ │ │ │ +│ │ - Isolated execution │ │ +│ │ - Direct status update│ │ +│ │ - Auto cleanup (TTL) │ │ +│ └─────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────────────┘ +``` + +### 1.3 Key Design Decisions + +#### Decision 1: Kubernetes Jobs vs Bare Pods + +**Choice: Kubernetes Jobs with TTLAfterFinished** + +Rationale: +- Jobs provide built-in completion tracking and retry mechanisms +- TTLAfterFinished enables automatic cleanup of completed jobs +- Jobs maintain history for debugging and auditing +- Better integration with Kubernetes ecosystem tools + +```yaml +apiVersion: batch/v1 +kind: Job +metadata: + name: nucleiscan-myapp-abc123 + namespace: default +spec: + ttlSecondsAfterFinished: 3600 # Clean up 1 hour after completion + backoffLimit: 2 # Retry failed scans twice + activeDeadlineSeconds: 1800 # 30 minute timeout + template: + spec: + restartPolicy: Never + containers: + - name: scanner + image: ghcr.io/morten-olsen/homelab-nuclei-operator:latest + args: ["--mode=scanner", "--scan-id=myapp-abc123"] +``` + +#### Decision 2: Result Communication + +**Choice: Dual-mode operator image with direct API access** + +Rationale: +- Single image simplifies deployment and versioning +- Scanner mode has direct Kubernetes API access to update NucleiScan status +- No intermediate storage needed (ConfigMaps or logs) +- Results are immediately available in the CRD status +- Consistent error handling and status updates + +The operator binary supports two modes: +1. 
**Controller Mode** (default): Runs the operator controllers +2. **Scanner Mode** (`--mode=scanner`): Executes a single scan and updates the NucleiScan status + +#### Decision 3: Template Distribution + +**Choice: Hybrid approach with configurable options** + +1. **Default**: Use projectdiscovery/nuclei built-in templates (updated with each nuclei release) +2. **Custom Templates**: Mount via ConfigMap for small template sets +3. **Git Sync**: Init container that clones template repositories at runtime +4. **Custom Image**: For air-gapped environments, bake templates into a custom scanner image + +Configuration hierarchy: +``` +Operator Defaults < NucleiScan Spec < Ingress/VS Annotations +``` + +## 2. Component Design + +### 2.1 NucleiScan Controller Changes + +The controller transitions from executing scans to managing scan jobs: + +```go +// Simplified reconciliation flow +func (r *NucleiScanReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + nucleiScan := &nucleiv1alpha1.NucleiScan{} + if err := r.Get(ctx, req.NamespacedName, nucleiScan); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + switch nucleiScan.Status.Phase { + case ScanPhasePending: + return r.handlePending(ctx, nucleiScan) // Create Job + case ScanPhaseRunning: + return r.handleRunning(ctx, nucleiScan) // Monitor Job + case ScanPhaseCompleted, ScanPhaseFailed: + return r.handleCompleted(ctx, nucleiScan) // Schedule next or cleanup + } +} +``` + +### 2.2 Job Manager Component + +New component responsible for: +- Creating scanner jobs with proper configuration +- Monitoring job status and updating NucleiScan accordingly +- Cleaning up orphaned jobs on operator restart +- Enforcing concurrency limits + +```go +type JobManager struct { + client.Client + Scheme *runtime.Scheme + ScannerImage string + MaxConcurrent int + DefaultTimeout time.Duration +} + +func (m *JobManager) CreateScanJob(ctx context.Context, scan *nucleiv1alpha1.NucleiScan) (*batchv1.Job, error) { + job := m.buildJob(scan) + if err := controllerutil.SetControllerReference(scan, job, m.Scheme); err != nil { + return nil, err + } + return job, m.Create(ctx, job) +} +``` + +### 2.3 Scanner Mode Implementation + +The operator binary in scanner mode: + +```go +func runScannerMode(scanID string) error { + // 1. Initialize Kubernetes client + config, _ := rest.InClusterConfig() + client, _ := client.New(config, client.Options{}) + + // 2. Fetch the NucleiScan resource + scan := &nucleiv1alpha1.NucleiScan{} + client.Get(ctx, types.NamespacedName{...}, scan) + + // 3. Execute the scan + result, err := scanner.Scan(ctx, scan.Spec.Targets, options) + + // 4. Update NucleiScan status directly + scan.Status.Phase = ScanPhaseCompleted + scan.Status.Findings = result.Findings + scan.Status.Summary = result.Summary + client.Status().Update(ctx, scan) + + return nil +} +``` + +## 3. API Changes + +### 3.1 NucleiScan CRD Updates + +New fields added to the spec and status: + +```go +// NucleiScanSpec additions +type NucleiScanSpec struct { + // ... existing fields ... 
+ + // ScannerConfig allows overriding scanner settings for this scan + // +optional + ScannerConfig *ScannerConfig `json:"scannerConfig,omitempty"` +} + +// ScannerConfig defines scanner-specific configuration +type ScannerConfig struct { + // Image overrides the default scanner image + // +optional + Image string `json:"image,omitempty"` + + // Resources defines resource requirements for the scanner pod + // +optional + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + + // Timeout overrides the default scan timeout + // +optional + Timeout *metav1.Duration `json:"timeout,omitempty"` + + // TemplateURLs specifies additional template repositories to clone + // +optional + TemplateURLs []string `json:"templateURLs,omitempty"` + + // NodeSelector for scanner pod scheduling + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + + // Tolerations for scanner pod scheduling + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// NucleiScanStatus additions +type NucleiScanStatus struct { + // ... existing fields ... + + // JobRef references the current or last scanner job + // +optional + JobRef *JobReference `json:"jobRef,omitempty"` + + // ScanStartTime is when the scanner pod actually started scanning + // +optional + ScanStartTime *metav1.Time `json:"scanStartTime,omitempty"` +} + +// JobReference contains information about the scanner job +type JobReference struct { + // Name of the Job + Name string `json:"name"` + + // UID of the Job + UID string `json:"uid"` + + // PodName is the name of the scanner pod (for log retrieval) + // +optional + PodName string `json:"podName,omitempty"` + + // StartTime when the job was created + StartTime *metav1.Time `json:"startTime,omitempty"` +} +``` + +## 4. 
Annotation Schema + +### 4.1 Supported Annotations + +Annotations on Ingress/VirtualService resources to configure scanning: + +| Annotation | Type | Default | Description | +|------------|------|---------|-------------| +| `nuclei.homelab.mortenolsen.pro/enabled` | bool | `true` | Enable/disable scanning for this resource | +| `nuclei.homelab.mortenolsen.pro/templates` | string | - | Comma-separated list of template paths or tags | +| `nuclei.homelab.mortenolsen.pro/severity` | string | - | Comma-separated severity filter: info,low,medium,high,critical | +| `nuclei.homelab.mortenolsen.pro/schedule` | string | - | Cron schedule for periodic scans | +| `nuclei.homelab.mortenolsen.pro/timeout` | duration | `30m` | Scan timeout | +| `nuclei.homelab.mortenolsen.pro/scanner-image` | string | - | Override scanner image | +| `nuclei.homelab.mortenolsen.pro/exclude-templates` | string | - | Templates to exclude | +| `nuclei.homelab.mortenolsen.pro/rate-limit` | int | `150` | Requests per second limit | +| `nuclei.homelab.mortenolsen.pro/tags` | string | - | Template tags to include | +| `nuclei.homelab.mortenolsen.pro/exclude-tags` | string | - | Template tags to exclude | + +### 4.2 Example Annotated Ingress + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: myapp-ingress + namespace: production + annotations: + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" + nuclei.homelab.mortenolsen.pro/templates: "cves/,vulnerabilities/,exposures/" + nuclei.homelab.mortenolsen.pro/exclude-tags: "dos,fuzz" + nuclei.homelab.mortenolsen.pro/timeout: "45m" +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp + port: + number: 80 +``` + +## 5. State Machine + +### 5.1 Updated Scan Lifecycle + +``` + ┌─────────────────────────────────────┐ + │ │ + ▼ │ +┌─────────┐ ┌─────────┐ ┌───────────┐ ┌────────┴─┐ +│ Created │───▶│ Pending │───▶│ Running │───▶│ Completed│ +└─────────┘ └────┬────┘ └─────┬─────┘ └──────────┘ + │ │ │ + │ │ │ (schedule/rescanAge) + │ ▼ │ + │ ┌─────────┐ │ + │ │ Failed │◀──────────┘ + │ └────┬────┘ + │ │ + └───────────────┘ (spec change triggers retry) +``` + +### 5.2 Phase Definitions + +| Phase | Description | Job State | Actions | +|-------|-------------|-----------|---------| +| `Pending` | Waiting to start | None | Create scanner job | +| `Running` | Scan in progress | Active | Monitor job, check timeout | +| `Completed` | Scan finished successfully | Succeeded | Parse results, schedule next | +| `Failed` | Scan failed | Failed | Record error, retry logic | + +## 6. 
Error Handling + +### 6.1 Failure Scenarios + +| Scenario | Detection | Recovery | +|----------|-----------|----------| +| Job creation fails | API error | Retry with backoff, update status | +| Pod fails to schedule | Job pending timeout | Alert, manual intervention | +| Scan timeout | activeDeadlineSeconds | Mark failed, retry | +| Scanner crashes | Job failed status | Retry based on backoffLimit | +| Operator restarts | Running phase with no job | Reset to Pending | +| Target unavailable | HTTP check fails | Exponential backoff retry | +| Results too large | Status update fails | Truncate findings, log warning | + +### 6.2 Operator Restart Recovery + +On startup, the operator must handle orphaned state: + +```go +func (r *NucleiScanReconciler) RecoverOrphanedScans(ctx context.Context) error { + // List all NucleiScans in Running phase + scanList := &nucleiv1alpha1.NucleiScanList{} + if err := r.List(ctx, scanList); err != nil { + return err + } + + for _, scan := range scanList.Items { + if scan.Status.Phase != ScanPhaseRunning { + continue + } + + // Check if the referenced job still exists + if scan.Status.JobRef != nil { + job := &batchv1.Job{} + err := r.Get(ctx, types.NamespacedName{ + Name: scan.Status.JobRef.Name, + Namespace: scan.Namespace, + }, job) + + if apierrors.IsNotFound(err) { + // Job is gone - reset scan to Pending + scan.Status.Phase = ScanPhasePending + scan.Status.LastError = "Recovered from operator restart - job not found" + scan.Status.JobRef = nil + r.Status().Update(ctx, &scan) + } + // If job exists, normal reconciliation will handle it + } + } + + return nil +} +``` + +### 6.3 Job Cleanup + +Orphaned jobs are cleaned up via: + +1. **Owner References**: Jobs have NucleiScan as owner - deleted when scan is deleted +2. **TTLAfterFinished**: Kubernetes automatically cleans up completed jobs +3. **Periodic Cleanup**: Background goroutine removes stuck jobs + +## 7. Security Considerations + +### 7.1 RBAC Updates + +The operator needs additional permissions for Job management: + +```yaml +# Additional rules for config/rbac/role.yaml +rules: + # Job management + - apiGroups: ["batch"] + resources: ["jobs"] + verbs: ["get", "list", "watch", "create", "update", "patch", "delete"] + + # Pod logs for debugging + - apiGroups: [""] + resources: ["pods", "pods/log"] + verbs: ["get", "list", "watch"] +``` + +Scanner pods need minimal RBAC - only to update their specific NucleiScan: + +```yaml +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: nuclei-scanner-role +rules: + - apiGroups: ["nuclei.homelab.mortenolsen.pro"] + resources: ["nucleiscans"] + verbs: ["get"] + - apiGroups: ["nuclei.homelab.mortenolsen.pro"] + resources: ["nucleiscans/status"] + verbs: ["get", "update", "patch"] +``` + +### 7.2 Pod Security + +Scanner pods run with restricted security context: + +```yaml +securityContext: + runAsNonRoot: true + runAsUser: 65532 + runAsGroup: 65532 + fsGroup: 65532 + allowPrivilegeEscalation: false + readOnlyRootFilesystem: false # Nuclei needs temp files + seccompProfile: + type: RuntimeDefault + capabilities: + drop: + - ALL +``` + +## 8. Migration Path + +### 8.1 Version Strategy + +| Version | Changes | Compatibility | +|---------|---------|---------------| +| v0.x | Current synchronous scanning | - | +| v1.0 | Pod-based scanning, new status fields | Backward compatible | +| v1.1 | Annotation support | Additive | +| v2.0 | Remove synchronous mode | Breaking | + +### 8.2 Migration Steps + +1. 
**Phase 1**: Add new fields to CRD (non-breaking, all optional) +2. **Phase 2**: Dual-mode operation with feature flag +3. **Phase 3**: Add annotation support +4. **Phase 4**: Deprecate synchronous mode +5. **Phase 5**: Remove synchronous mode (v2.0) + +### 8.3 Rollback Plan + +If issues are discovered: +1. **Immediate**: Set `scanner.mode: sync` in Helm values +2. **Short-term**: Pin to previous operator version +3. **Long-term**: Fix issues in pod-based mode + +## 9. Configuration Reference + +### 9.1 Helm Values + +```yaml +# Scanner configuration +scanner: + # Scanning mode: "pod" or "sync" (legacy) + mode: "pod" + + # Default scanner image + image: ghcr.io/morten-olsen/homelab-nuclei-operator:latest + + # Default scan timeout + timeout: 30m + + # Maximum concurrent scan jobs + maxConcurrent: 5 + + # Job TTL after completion (seconds) + ttlAfterFinished: 3600 + + # Default resource requirements for scanner pods + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + +# Template configuration +templates: + # Built-in templates to use + defaults: + - cves/ + - vulnerabilities/ + + # Git repositories to clone (init container) + repositories: [] + # - url: https://github.com/projectdiscovery/nuclei-templates + # branch: main + # path: /templates/community + +# Operator configuration +operator: + # Rescan age - trigger rescan if results older than this + rescanAge: 168h + + # Backoff for target availability checks + backoff: + initial: 10s + max: 10m + multiplier: 2.0 +``` + +### 9.2 Environment Variables + +| Variable | Description | Default | +|----------|-------------|---------| +| `SCANNER_MODE` | pod or sync | pod | +| `SCANNER_IMAGE` | Default scanner image | operator image | +| `SCANNER_TIMEOUT` | Default scan timeout | 30m | +| `MAX_CONCURRENT_SCANS` | Max parallel jobs | 5 | +| `JOB_TTL_AFTER_FINISHED` | Job cleanup TTL | 3600 | +| `NUCLEI_TEMPLATES_PATH` | Template directory | /nuclei-templates | + +## 10. 
Observability + +### 10.1 Metrics + +New Prometheus metrics: +- `nuclei_scan_jobs_created_total` - Total scanner jobs created +- `nuclei_scan_job_duration_seconds` - Duration histogram of scan jobs +- `nuclei_active_scan_jobs` - Currently running scan jobs + +### 10.2 Events + +Kubernetes events for key state transitions: +- `ScanJobCreated` - Scanner job created +- `ScanCompleted` - Scan finished successfully +- `ScanFailed` - Scan failed + +### 10.3 Logging + +Structured logging with consistent fields: +- `scan` - NucleiScan name +- `namespace` - Namespace +- `targets` - Number of targets +- `timeout` - Scan timeout \ No newline at end of file diff --git a/README.md b/README.md index c54f29a..3a78847 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ The Nuclei Operator watches for Ingress and VirtualService resources in your Kub ### Key Features +- **Pod-based Scanning Architecture**: Each scan runs in an isolated Kubernetes Job for better scalability and reliability +- **Annotation-based Configuration**: Configure scanning behavior per-resource using annotations on Ingress/VirtualService - **Automatic Discovery**: Watches Kubernetes Ingress and Istio VirtualService resources for new endpoints - **Automated Scanning**: Automatically creates and runs Nuclei scans when new endpoints are discovered - **Scheduled Scans**: Support for cron-based scheduled rescanning @@ -31,15 +33,15 @@ The Nuclei Operator watches for Ingress and VirtualService resources in your Kub │ │ ▼ ▼ ┌─────────────────┐ ┌─────────────────┐ - │ Nuclei Engine │────▶│ Scan Results │ - │ (Scanner) │ │ (Findings) │ + │ Scanner Job │────▶│ Scan Results │ + │ (Isolated Pod) │ │ (Findings) │ └─────────────────┘ └─────────────────┘ ``` 1. **Watch**: The operator watches for Ingress and VirtualService resources -2. **Extract**: URLs are extracted from the resource specifications +2. **Extract**: URLs are extracted from the resource specifications (annotations configure behavior) 3. **Create**: A NucleiScan custom resource is created with the target URLs -4. **Scan**: The Nuclei scanner executes security scans against the targets +4. **Scan**: A Kubernetes Job is created to run the Nuclei scan in an isolated pod 5. **Store**: Results are stored in the NucleiScan status for easy access ## Prerequisites @@ -199,6 +201,11 @@ kind: Ingress metadata: name: my-app-ingress namespace: default + annotations: + # Optional: Configure scanning behavior via annotations + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" spec: tls: - hosts: @@ -484,8 +491,9 @@ make uninstall ## Documentation - [Architecture](ARCHITECTURE.md) - Detailed architecture documentation +- [Design Document](DESIGN.md) - Pod-based scanning architecture design - [API Reference](docs/api.md) - Complete CRD API reference -- [User Guide](docs/user-guide.md) - Detailed usage instructions +- [User Guide](docs/user-guide.md) - Detailed usage instructions (includes annotation reference) - [Contributing](CONTRIBUTING.md) - Contribution guidelines ## Contributing diff --git a/api/v1alpha1/nucleiscan_types.go b/api/v1alpha1/nucleiscan_types.go index 3e3ae46..0f9521b 100644 --- a/api/v1alpha1/nucleiscan_types.go +++ b/api/v1alpha1/nucleiscan_types.go @@ -17,6 +17,7 @@ limitations under the License. 
package v1alpha1 import ( + corev1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -44,6 +45,50 @@ type SourceReference struct { UID string `json:"uid"` } +// ScannerConfig defines scanner-specific configuration +type ScannerConfig struct { + // Image overrides the default scanner image + // +optional + Image string `json:"image,omitempty"` + + // Resources defines resource requirements for the scanner pod + // +optional + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + + // Timeout overrides the default scan timeout + // +optional + Timeout *metav1.Duration `json:"timeout,omitempty"` + + // TemplateURLs specifies additional template repositories to clone + // +optional + TemplateURLs []string `json:"templateURLs,omitempty"` + + // NodeSelector for scanner pod scheduling + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + + // Tolerations for scanner pod scheduling + // +optional + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} + +// JobReference contains information about the scanner job +type JobReference struct { + // Name of the Job + Name string `json:"name"` + + // UID of the Job + UID string `json:"uid"` + + // PodName is the name of the scanner pod (for log retrieval) + // +optional + PodName string `json:"podName,omitempty"` + + // StartTime when the job was created + // +optional + StartTime *metav1.Time `json:"startTime,omitempty"` +} + // NucleiScanSpec defines the desired state of NucleiScan type NucleiScanSpec struct { // SourceRef references the Ingress or VirtualService being scanned @@ -73,6 +118,10 @@ type NucleiScanSpec struct { // Suspend prevents scheduled scans from running // +optional Suspend bool `json:"suspend,omitempty"` + + // ScannerConfig allows overriding scanner settings for this scan + // +optional + ScannerConfig *ScannerConfig `json:"scannerConfig,omitempty"` } // ScanPhase represents the current phase of the scan @@ -200,6 +249,14 @@ type NucleiScanStatus struct { // LastRetryTime is when the last availability check retry occurred // +optional LastRetryTime *metav1.Time `json:"lastRetryTime,omitempty"` + + // JobRef references the current or last scanner job + // +optional + JobRef *JobReference `json:"jobRef,omitempty"` + + // ScanStartTime is when the scanner pod actually started scanning + // +optional + ScanStartTime *metav1.Time `json:"scanStartTime,omitempty"` } // +kubebuilder:object:root=true diff --git a/api/v1alpha1/zz_generated.deepcopy.go b/api/v1alpha1/zz_generated.deepcopy.go index b0a99bf..5ada8d4 100644 --- a/api/v1alpha1/zz_generated.deepcopy.go +++ b/api/v1alpha1/zz_generated.deepcopy.go @@ -21,7 +21,8 @@ limitations under the License. package v1alpha1 import ( - "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" ) @@ -61,6 +62,25 @@ func (in *Finding) DeepCopy() *Finding { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *JobReference) DeepCopyInto(out *JobReference) { + *out = *in + if in.StartTime != nil { + in, out := &in.StartTime, &out.StartTime + *out = (*in).DeepCopy() + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new JobReference. 
+func (in *JobReference) DeepCopy() *JobReference { + if in == nil { + return nil + } + out := new(JobReference) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *NucleiScan) DeepCopyInto(out *NucleiScan) { *out = *in @@ -139,6 +159,11 @@ func (in *NucleiScanSpec) DeepCopyInto(out *NucleiScanSpec) { *out = make([]string, len(*in)) copy(*out, *in) } + if in.ScannerConfig != nil { + in, out := &in.ScannerConfig, &out.ScannerConfig + *out = new(ScannerConfig) + (*in).DeepCopyInto(*out) + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NucleiScanSpec. @@ -156,7 +181,7 @@ func (in *NucleiScanStatus) DeepCopyInto(out *NucleiScanStatus) { *out = *in if in.Conditions != nil { in, out := &in.Conditions, &out.Conditions - *out = make([]v1.Condition, len(*in)) + *out = make([]metav1.Condition, len(*in)) for i := range *in { (*in)[i].DeepCopyInto(&(*out)[i]) } @@ -189,6 +214,15 @@ func (in *NucleiScanStatus) DeepCopyInto(out *NucleiScanStatus) { in, out := &in.LastRetryTime, &out.LastRetryTime *out = (*in).DeepCopy() } + if in.JobRef != nil { + in, out := &in.JobRef, &out.JobRef + *out = new(JobReference) + (*in).DeepCopyInto(*out) + } + if in.ScanStartTime != nil { + in, out := &in.ScanStartTime, &out.ScanStartTime + *out = (*in).DeepCopy() + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new NucleiScanStatus. @@ -223,6 +257,50 @@ func (in *ScanSummary) DeepCopy() *ScanSummary { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *ScannerConfig) DeepCopyInto(out *ScannerConfig) { + *out = *in + if in.Resources != nil { + in, out := &in.Resources, &out.Resources + *out = new(v1.ResourceRequirements) + (*in).DeepCopyInto(*out) + } + if in.Timeout != nil { + in, out := &in.Timeout, &out.Timeout + *out = new(metav1.Duration) + **out = **in + } + if in.TemplateURLs != nil { + in, out := &in.TemplateURLs, &out.TemplateURLs + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } + if in.Tolerations != nil { + in, out := &in.Tolerations, &out.Tolerations + *out = make([]v1.Toleration, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new ScannerConfig. +func (in *ScannerConfig) DeepCopy() *ScannerConfig { + if in == nil { + return nil + } + out := new(ScannerConfig) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SourceReference) DeepCopyInto(out *SourceReference) { *out = *in diff --git a/charts/nuclei-operator/README.md b/charts/nuclei-operator/README.md index 0a66621..dd7697e 100644 --- a/charts/nuclei-operator/README.md +++ b/charts/nuclei-operator/README.md @@ -2,6 +2,14 @@ A Helm chart for deploying the Nuclei Operator - a Kubernetes operator that automatically scans Ingress and VirtualService resources using Nuclei security scanner. 
+## Features + +- **Pod-based Scanning Architecture**: Each scan runs in an isolated Kubernetes Job for better scalability and reliability +- **Annotation-based Configuration**: Configure scanning behavior per-resource using annotations +- **Automatic Discovery**: Watches Kubernetes Ingress and Istio VirtualService resources +- **Scheduled Scans**: Support for cron-based scheduled rescanning +- **Flexible Configuration**: Configurable templates, severity filters, and scan options + ## Prerequisites - Kubernetes 1.26+ @@ -137,6 +145,24 @@ The following table lists the configurable parameters of the Nuclei Operator cha | `nuclei.backoff.max` | Maximum backoff interval | `10m` | | `nuclei.backoff.multiplier` | Backoff multiplier | `2.0` | +### Scanner Pod Configuration + +The operator uses a pod-based scanning architecture where each scan runs in its own Kubernetes Job. Configure scanner pod behavior with these parameters: + +| Parameter | Description | Default | +|-----------|-------------|---------| +| `scanner.enabled` | Enable scanner RBAC resources | `true` | +| `scanner.image` | Scanner image (defaults to operator image) | `""` | +| `scanner.timeout` | Default scan timeout | `30m` | +| `scanner.maxConcurrent` | Maximum concurrent scan jobs | `5` | +| `scanner.ttlAfterFinished` | Job TTL after completion (seconds) | `3600` | +| `scanner.resources.requests.cpu` | Scanner pod CPU request | `100m` | +| `scanner.resources.requests.memory` | Scanner pod memory request | `256Mi` | +| `scanner.resources.limits.cpu` | Scanner pod CPU limit | `1` | +| `scanner.resources.limits.memory` | Scanner pod memory limit | `1Gi` | +| `scanner.defaultTemplates` | Default templates to use | `[]` | +| `scanner.defaultSeverity` | Default severity filter | `[]` | + ### ServiceMonitor (Prometheus Operator) | Parameter | Description | Default | @@ -199,6 +225,28 @@ nuclei: rescanAge: "24h" ``` +### With Custom Scanner Configuration + +```yaml +# values.yaml +scanner: + enabled: true + timeout: "1h" + maxConcurrent: 10 + ttlAfterFinished: 7200 + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "2" + memory: 2Gi + defaultSeverity: + - medium + - high + - critical +``` + ### With Node Affinity ```yaml @@ -215,6 +263,44 @@ affinity: - arm64 ``` +## Annotation-Based Configuration + +You can configure scanning behavior for individual Ingress or VirtualService resources using annotations: + +| Annotation | Description | +|------------|-------------| +| `nuclei.homelab.mortenolsen.pro/enabled` | Enable/disable scanning (`true`/`false`) | +| `nuclei.homelab.mortenolsen.pro/templates` | Comma-separated list of template paths | +| `nuclei.homelab.mortenolsen.pro/severity` | Comma-separated severity filter | +| `nuclei.homelab.mortenolsen.pro/schedule` | Cron schedule for periodic scans | +| `nuclei.homelab.mortenolsen.pro/timeout` | Scan timeout duration | +| `nuclei.homelab.mortenolsen.pro/scanner-image` | Override scanner image | + +### Example Annotated Ingress + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: myapp-ingress + annotations: + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp + port: + number: 80 +``` + ## Uninstallation ```bash diff --git a/charts/nuclei-operator/templates/deployment.yaml 
b/charts/nuclei-operator/templates/deployment.yaml index 72e7044..d2eacd4 100644 --- a/charts/nuclei-operator/templates/deployment.yaml +++ b/charts/nuclei-operator/templates/deployment.yaml @@ -60,6 +60,24 @@ spec: value: {{ .Values.nuclei.backoff.max | quote }} - name: NUCLEI_BACKOFF_MULTIPLIER value: {{ .Values.nuclei.backoff.multiplier | quote }} + - name: SCANNER_IMAGE + value: {{ .Values.scanner.image | default (printf "%s:%s" .Values.image.repository (.Values.image.tag | default .Chart.AppVersion)) | quote }} + - name: SCANNER_TIMEOUT + value: {{ .Values.scanner.timeout | quote }} + - name: MAX_CONCURRENT_SCANS + value: {{ .Values.scanner.maxConcurrent | quote }} + - name: JOB_TTL_AFTER_FINISHED + value: {{ .Values.scanner.ttlAfterFinished | quote }} + - name: SCANNER_SERVICE_ACCOUNT + value: {{ include "nuclei-operator.fullname" . }}-scanner + {{- if .Values.scanner.defaultTemplates }} + - name: DEFAULT_TEMPLATES + value: {{ join "," .Values.scanner.defaultTemplates | quote }} + {{- end }} + {{- if .Values.scanner.defaultSeverity }} + - name: DEFAULT_SEVERITY + value: {{ join "," .Values.scanner.defaultSeverity | quote }} + {{- end }} ports: [] securityContext: {{- toYaml .Values.securityContext | nindent 10 }} diff --git a/charts/nuclei-operator/templates/rbac.yaml b/charts/nuclei-operator/templates/rbac.yaml index 341ef3c..e2506b7 100644 --- a/charts/nuclei-operator/templates/rbac.yaml +++ b/charts/nuclei-operator/templates/rbac.yaml @@ -6,6 +6,18 @@ metadata: labels: {{- include "nuclei-operator.labels" . | nindent 4 }} rules: +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - "" resources: @@ -13,6 +25,20 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - "" + resources: + - pods/log + verbs: + - get - apiGroups: - networking.istio.io resources: diff --git a/charts/nuclei-operator/templates/scanner-rbac.yaml b/charts/nuclei-operator/templates/scanner-rbac.yaml new file mode 100644 index 0000000..1557c6d --- /dev/null +++ b/charts/nuclei-operator/templates/scanner-rbac.yaml @@ -0,0 +1,51 @@ +{{- if .Values.scanner.enabled }} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "nuclei-operator.fullname" . }}-scanner + namespace: {{ .Release.Namespace }} + labels: + {{- include "nuclei-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: scanner +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: {{ include "nuclei-operator.fullname" . }}-scanner + labels: + {{- include "nuclei-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: scanner +rules: + # Scanner needs to read NucleiScan resources + - apiGroups: + - nuclei.homelab.mortenolsen.pro + resources: + - nucleiscans + verbs: + - get + # Scanner needs to update NucleiScan status + - apiGroups: + - nuclei.homelab.mortenolsen.pro + resources: + - nucleiscans/status + verbs: + - get + - patch + - update +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: {{ include "nuclei-operator.fullname" . }}-scanner + labels: + {{- include "nuclei-operator.labels" . | nindent 4 }} + app.kubernetes.io/component: scanner +roleRef: + apiGroup: rbac.authorization.k8s.io + kind: ClusterRole + name: {{ include "nuclei-operator.fullname" . }}-scanner +subjects: + - kind: ServiceAccount + name: {{ include "nuclei-operator.fullname" . 
}}-scanner + namespace: {{ .Release.Namespace }} +{{- end }} \ No newline at end of file diff --git a/charts/nuclei-operator/values.yaml b/charts/nuclei-operator/values.yaml index 3bc1e17..3cc68ad 100644 --- a/charts/nuclei-operator/values.yaml +++ b/charts/nuclei-operator/values.yaml @@ -130,4 +130,36 @@ serviceMonitor: # Network policies networkPolicy: # Enable network policy - enabled: false \ No newline at end of file + enabled: false + +# Scanner configuration +scanner: + # Enable scanner RBAC resources + enabled: true + + # Scanner image (defaults to operator image) + image: "" + + # Default scan timeout + timeout: "30m" + + # Maximum concurrent scan jobs + maxConcurrent: 5 + + # Job TTL after completion (seconds) + ttlAfterFinished: 3600 + + # Default resource requirements for scanner pods + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + + # Default templates to use + defaultTemplates: [] + + # Default severity filter + defaultSeverity: [] \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index db6e44e..cb9ebd7 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,9 +17,14 @@ limitations under the License. package main import ( + "context" "crypto/tls" "flag" + "fmt" "os" + "strconv" + "strings" + "time" // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) // to ensure that exec-entrypoint and run can make use of them. @@ -27,10 +32,12 @@ import ( istionetworkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" networkingv1 "k8s.io/api/networking/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/runtime" utilruntime "k8s.io/apimachinery/pkg/util/runtime" clientgoscheme "k8s.io/client-go/kubernetes/scheme" ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/healthz" "sigs.k8s.io/controller-runtime/pkg/log/zap" "sigs.k8s.io/controller-runtime/pkg/metrics/filters" @@ -39,6 +46,7 @@ import ( nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" "github.com/mortenolsen/nuclei-operator/internal/controller" + "github.com/mortenolsen/nuclei-operator/internal/jobmanager" "github.com/mortenolsen/nuclei-operator/internal/scanner" // +kubebuilder:scaffold:imports ) @@ -67,6 +75,15 @@ func main() { var secureMetrics bool var enableHTTP2 bool var tlsOpts []func(*tls.Config) + + // Scanner mode flags + var mode string + var scanName string + var scanNamespace string + + flag.StringVar(&mode, "mode", "controller", "Run mode: 'controller' or 'scanner'") + flag.StringVar(&scanName, "scan-name", "", "Name of the NucleiScan to execute (scanner mode only)") + flag.StringVar(&scanNamespace, "scan-namespace", "", "Namespace of the NucleiScan (scanner mode only)") flag.StringVar(&metricsAddr, "metrics-bind-address", "0", "The address the metrics endpoint binds to. "+ "Use :8443 for HTTPS or :8080 for HTTP, or leave as 0 to disable the metrics service.") flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") @@ -92,6 +109,15 @@ func main() { ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + // Check if running in scanner mode + if mode == "scanner" { + if err := scanner.RunScannerMode(scanName, scanNamespace); err != nil { + setupLog.Error(err, "Scanner mode failed") + os.Exit(1) + } + os.Exit(0) + } + // if the enable-http2 flag is false (the default), http/2 should be disabled // due to its vulnerabilities. 
More specifically, disabling http/2 will // prevent from being vulnerable to the HTTP/2 Stream Cancellation and @@ -183,10 +209,103 @@ func main() { os.Exit(1) } + // Parse environment variables for JobManager configuration + scannerImage := os.Getenv("SCANNER_IMAGE") + if scannerImage == "" { + scannerImage = jobmanager.DefaultScannerImage + } + + scannerTimeout := 30 * time.Minute + if v := os.Getenv("SCANNER_TIMEOUT"); v != "" { + if d, err := time.ParseDuration(v); err == nil { + scannerTimeout = d + } + } + + maxConcurrentScans := 5 + if v := os.Getenv("MAX_CONCURRENT_SCANS"); v != "" { + if n, err := strconv.Atoi(v); err == nil { + maxConcurrentScans = n + } + } + + ttlAfterFinished := int32(3600) + if v := os.Getenv("JOB_TTL_AFTER_FINISHED"); v != "" { + if n, err := strconv.Atoi(v); err == nil { + ttlAfterFinished = int32(n) + } + } + + scannerServiceAccount := os.Getenv("SCANNER_SERVICE_ACCOUNT") + if scannerServiceAccount == "" { + scannerServiceAccount = "nuclei-scanner" + } + + defaultTemplates := []string{} + if v := os.Getenv("DEFAULT_TEMPLATES"); v != "" { + defaultTemplates = strings.Split(v, ",") + } + + defaultSeverity := []string{} + if v := os.Getenv("DEFAULT_SEVERITY"); v != "" { + defaultSeverity = strings.Split(v, ",") + } + + // Create the JobManager configuration + jobManagerConfig := jobmanager.Config{ + ScannerImage: scannerImage, + DefaultTimeout: scannerTimeout, + TTLAfterFinished: ttlAfterFinished, + BackoffLimit: 2, + MaxConcurrent: maxConcurrentScans, + ServiceAccountName: scannerServiceAccount, + DefaultResources: jobmanager.DefaultConfig().DefaultResources, + DefaultTemplates: defaultTemplates, + DefaultSeverity: defaultSeverity, + } + + // Create the JobManager for scanner job management + jobMgr := jobmanager.NewJobManager( + mgr.GetClient(), + mgr.GetScheme(), + jobManagerConfig, + ) + + // Run startup recovery to handle orphaned scans from previous operator instance + setupLog.Info("Running startup recovery") + if err := runStartupRecovery(mgr.GetClient(), jobMgr); err != nil { + setupLog.Error(err, "Startup recovery failed") + // Don't exit - continue with normal operation + } + + // Create a context that will be cancelled when the manager stops + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + // Start periodic cleanup goroutine + go func() { + ticker := time.NewTicker(5 * time.Minute) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + if err := jobMgr.CleanupOrphanedJobs(ctx); err != nil { + setupLog.Error(err, "Periodic cleanup failed") + } + case <-ctx.Done(): + return + } + } + }() + + // Create the NucleiScan reconciler with JobManager if err := controller.NewNucleiScanReconciler( mgr.GetClient(), mgr.GetScheme(), - scanner.NewNucleiScannerWithDefaults(), + mgr.GetEventRecorderFor("nucleiscan-controller"), + jobMgr, + controller.DefaultReconcilerConfig(), ).SetupWithManager(mgr); err != nil { setupLog.Error(err, "unable to create controller", "controller", "NucleiScan") os.Exit(1) @@ -222,3 +341,72 @@ func main() { os.Exit(1) } } + +// runStartupRecovery handles orphaned scans and jobs from previous operator instance +func runStartupRecovery(c client.Client, jobMgr *jobmanager.JobManager) error { + ctx := context.Background() + + // List all NucleiScans in Running phase + scanList := &nucleiv1alpha1.NucleiScanList{} + if err := c.List(ctx, scanList); err != nil { + return fmt.Errorf("failed to list NucleiScans: %w", err) + } + + for _, scan := range scanList.Items { + if scan.Status.Phase != 
nucleiv1alpha1.ScanPhaseRunning { + continue + } + + // Check if the referenced job still exists + if scan.Status.JobRef != nil { + job, err := jobMgr.GetJob(ctx, scan.Status.JobRef.Name, scan.Namespace) + if err != nil { + if apierrors.IsNotFound(err) { + // Job is gone - reset scan to Pending + scan.Status.Phase = nucleiv1alpha1.ScanPhasePending + scan.Status.LastError = "Recovered from operator restart - job not found" + scan.Status.JobRef = nil + if updateErr := c.Status().Update(ctx, &scan); updateErr != nil { + return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr) + } + continue + } + return fmt.Errorf("failed to get job for scan %s: %w", scan.Name, err) + } + + // Job exists - check if it's completed but status wasn't updated + if jobMgr.IsJobComplete(job) { + // The scanner pod should have updated the status + // If it didn't, mark as failed + if scan.Status.Phase == nucleiv1alpha1.ScanPhaseRunning { + if jobMgr.IsJobFailed(job) { + scan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed + scan.Status.LastError = "Job completed during operator downtime: " + jobMgr.GetJobFailureReason(job) + } else { + // Job succeeded but status wasn't updated - this shouldn't happen + // but handle it gracefully + scan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed + scan.Status.LastError = "Job completed during operator downtime but status was not updated" + } + if updateErr := c.Status().Update(ctx, &scan); updateErr != nil { + return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr) + } + } + } + } else { + // No job reference but Running - invalid state + scan.Status.Phase = nucleiv1alpha1.ScanPhasePending + scan.Status.LastError = "Recovered from invalid state - no job reference" + if updateErr := c.Status().Update(ctx, &scan); updateErr != nil { + return fmt.Errorf("failed to update scan %s: %w", scan.Name, updateErr) + } + } + } + + // Clean up orphaned jobs + if err := jobMgr.CleanupOrphanedJobs(ctx); err != nil { + return fmt.Errorf("failed to cleanup orphaned jobs: %w", err) + } + + return nil +} diff --git a/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml b/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml index 97a36e3..74b7a5e 100644 --- a/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml +++ b/config/crd/bases/nuclei.homelab.mortenolsen.pro_nucleiscans.yaml @@ -55,6 +55,127 @@ spec: spec: description: NucleiScanSpec defines the desired state of NucleiScan properties: + scannerConfig: + description: ScannerConfig allows overriding scanner settings for + this scan + properties: + image: + description: Image overrides the default scanner image + type: string + nodeSelector: + additionalProperties: + type: string + description: NodeSelector for scanner pod scheduling + type: object + resources: + description: Resources defines resource requirements for the scanner + pod + properties: + claims: + description: |- + Claims lists the names of resources, defined in spec.resourceClaims, + that are used by this container. + + This field depends on the + DynamicResourceAllocation feature gate. + + This field is immutable. It can only be set for containers. + items: + description: ResourceClaim references one entry in PodSpec.ResourceClaims. + properties: + name: + description: |- + Name must match the name of one entry in pod.spec.resourceClaims of + the Pod where this field is used. It makes that resource available + inside a container. 
+ type: string + request: + description: |- + Request is the name chosen for a request in the referenced claim. + If empty, everything from the claim is made available, otherwise + only the result of this request. + type: string + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Limits describes the maximum amount of compute resources allowed. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: |- + Requests describes the minimum amount of compute resources required. + If Requests is omitted for a container, it defaults to Limits if that is explicitly specified, + otherwise to an implementation-defined value. Requests cannot exceed Limits. + More info: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ + type: object + type: object + templateURLs: + description: TemplateURLs specifies additional template repositories + to clone + items: + type: string + type: array + timeout: + description: Timeout overrides the default scan timeout + type: string + tolerations: + description: Tolerations for scanner pod scheduling + items: + description: |- + The pod this Toleration is attached to tolerates any taint that matches + the triple using the matching operator . + properties: + effect: + description: |- + Effect indicates the taint effect to match. Empty means match all taint effects. + When specified, allowed values are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: |- + Key is the taint key that the toleration applies to. Empty means match all taint keys. + If the key is empty, operator must be Exists; this combination means to match all values and all keys. + type: string + operator: + description: |- + Operator represents a key's relationship to the value. + Valid operators are Exists and Equal. Defaults to Equal. + Exists is equivalent to wildcard for value, so that a pod can + tolerate all taints of a particular category. + type: string + tolerationSeconds: + description: |- + TolerationSeconds represents the period of time the toleration (which must be + of effect NoExecute, otherwise this field is ignored) tolerates the taint. By default, + it is not set, which means tolerate the taint forever (do not evict). Zero and + negative values will be treated as 0 (evict immediately) by the system. + format: int64 + type: integer + value: + description: |- + Value is the taint value the toleration matches to. + If the operator is Exists, the value should be empty, otherwise just a regular string. 
+ type: string + type: object + type: array + type: object schedule: description: |- Schedule for periodic rescanning in cron format @@ -249,6 +370,26 @@ spec: - timestamp type: object type: array + jobRef: + description: JobRef references the current or last scanner job + properties: + name: + description: Name of the Job + type: string + podName: + description: PodName is the name of the scanner pod (for log retrieval) + type: string + startTime: + description: StartTime when the job was created + format: date-time + type: string + uid: + description: UID of the Job + type: string + required: + - name + - uid + type: object lastError: description: LastError contains the error message if the scan failed type: string @@ -284,6 +425,11 @@ spec: RetryCount tracks the number of consecutive availability check retries Used for exponential backoff when waiting for targets type: integer + scanStartTime: + description: ScanStartTime is when the scanner pod actually started + scanning + format: date-time + type: string summary: description: Summary provides aggregated scan statistics properties: diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 936e687..4e345c6 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -11,6 +11,26 @@ rules: verbs: - create - patch +- apiGroups: + - "" + resources: + - pods + verbs: + - get + - list + - watch +- apiGroups: + - batch + resources: + - jobs + verbs: + - create + - delete + - get + - list + - patch + - update + - watch - apiGroups: - networking.istio.io resources: diff --git a/config/samples/example-ingress.yaml b/config/samples/example-ingress.yaml index 3cef198..ef3c95f 100644 --- a/config/samples/example-ingress.yaml +++ b/config/samples/example-ingress.yaml @@ -1,6 +1,43 @@ # Example Ingress resource that would trigger NucleiScan creation # When this Ingress is created, the nuclei-operator will automatically # create a corresponding NucleiScan resource to scan the exposed endpoints. +# +# The operator uses a pod-based scanning architecture where each scan +# runs in an isolated Kubernetes Job for better scalability and reliability. 
+apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: example-ingress + namespace: default + labels: + app.kubernetes.io/name: example-app + app.kubernetes.io/managed-by: kustomize + annotations: + # Nuclei scanning configuration + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" + # Optional: Additional scanning configuration + # nuclei.homelab.mortenolsen.pro/templates: "cves/,vulnerabilities/" + # nuclei.homelab.mortenolsen.pro/timeout: "1h" + # nuclei.homelab.mortenolsen.pro/scanner-image: "custom-scanner:latest" + # nuclei.homelab.mortenolsen.pro/tags: "cve,oast" + # nuclei.homelab.mortenolsen.pro/exclude-tags: "dos" + kubernetes.io/ingress.class: nginx +spec: + rules: + - host: example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: example-service + port: + number: 80 +--- +# Example Ingress with TLS - endpoints will be scanned with HTTPS apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -10,9 +47,10 @@ metadata: app.kubernetes.io/name: example-app app.kubernetes.io/managed-by: kustomize annotations: - # Optional: Add annotations to customize scan behavior - # nuclei.homelab.mortenolsen.pro/scan-enabled: "true" - # nuclei.homelab.mortenolsen.pro/severity: "high,critical" + # Nuclei scanning configuration + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "high,critical" + nuclei.homelab.mortenolsen.pro/templates: "cves/,vulnerabilities/,exposures/" kubernetes.io/ingress.class: nginx spec: # TLS configuration - endpoints will be scanned with HTTPS @@ -52,8 +90,8 @@ spec: port: number: 8080 --- -# Example Ingress without TLS (HTTP only) -# This will be scanned with HTTP scheme +# Example Ingress with scanning disabled +# This will NOT trigger a NucleiScan creation apiVersion: networking.k8s.io/v1 kind: Ingress metadata: @@ -61,6 +99,9 @@ metadata: namespace: default labels: app.kubernetes.io/name: internal-app + annotations: + # Disable scanning for this internal resource + nuclei.homelab.mortenolsen.pro/enabled: "false" spec: rules: - host: internal.example.local @@ -72,4 +113,45 @@ spec: service: name: internal-app port: - number: 80 \ No newline at end of file + number: 80 +--- +# Example Ingress with full annotation configuration +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: fully-configured-ingress + namespace: default + labels: + app.kubernetes.io/name: configured-app + annotations: + # Enable scanning + nuclei.homelab.mortenolsen.pro/enabled: "true" + # Severity filter - only report medium and above + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + # Schedule daily scans at 2 AM + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" + # Use specific template directories + nuclei.homelab.mortenolsen.pro/templates: "cves/,vulnerabilities/,misconfiguration/" + # Set scan timeout to 1 hour + nuclei.homelab.mortenolsen.pro/timeout: "1h" + # Include specific tags + nuclei.homelab.mortenolsen.pro/tags: "cve,oast,sqli,xss" + # Exclude certain tags + nuclei.homelab.mortenolsen.pro/exclude-tags: "dos,fuzz" + kubernetes.io/ingress.class: nginx +spec: + tls: + - hosts: + - secure.example.com + secretName: secure-tls-secret + rules: + - host: secure.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: secure-app + port: + number: 443 \ No newline at end of file diff --git a/docs/api.md b/docs/api.md 
index c031a46..ad3d131 100644 --- a/docs/api.md +++ b/docs/api.md @@ -10,6 +10,8 @@ This document provides a complete reference for the Nuclei Operator Custom Resou - [Status](#status) - [Type Definitions](#type-definitions) - [SourceReference](#sourcereference) + - [ScannerConfig](#scannerconfig) + - [JobReference](#jobreference) - [Finding](#finding) - [ScanSummary](#scansummary) - [ScanPhase](#scanphase) @@ -62,6 +64,16 @@ spec: - critical schedule: "@every 24h" suspend: false + scannerConfig: + image: "custom-scanner:latest" + timeout: "1h" + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "1" + memory: 1Gi ``` #### Spec Fields @@ -74,6 +86,7 @@ spec: | `severity` | []string | No | Severity filter. Valid values: `info`, `low`, `medium`, `high`, `critical` | | `schedule` | string | No | Cron schedule for periodic rescanning | | `suspend` | bool | No | When true, suspends scheduled scans | +| `scannerConfig` | [ScannerConfig](#scannerconfig) | No | Scanner-specific configuration overrides | #### Schedule Format @@ -110,6 +123,12 @@ status: lastScanTime: "2024-01-15T10:30:00Z" completionTime: "2024-01-15T10:35:00Z" nextScheduledTime: "2024-01-16T10:30:00Z" + scanStartTime: "2024-01-15T10:30:05Z" + jobRef: + name: my-app-scan-abc123 + uid: "job-uid-12345" + podName: my-app-scan-abc123-xyz + startTime: "2024-01-15T10:30:00Z" summary: totalFindings: 3 findingsBySeverity: @@ -127,6 +146,7 @@ status: timestamp: "2024-01-15T10:32:00Z" lastError: "" observedGeneration: 1 + retryCount: 0 ``` #### Status Fields @@ -138,10 +158,14 @@ status: | `lastScanTime` | *Time | When the last scan was initiated | | `completionTime` | *Time | When the last scan completed | | `nextScheduledTime` | *Time | When the next scheduled scan will run | +| `scanStartTime` | *Time | When the scanner pod actually started scanning | +| `jobRef` | *[JobReference](#jobreference) | Reference to the current or last scanner job | | `summary` | *[ScanSummary](#scansummary) | Aggregated scan statistics | | `findings` | [][Finding](#finding) | Array of scan results | | `lastError` | string | Error message if the scan failed | | `observedGeneration` | int64 | Generation observed by the controller | +| `retryCount` | int | Number of consecutive availability check retries | +| `lastRetryTime` | *Time | When the last availability check retry occurred | #### Conditions @@ -188,6 +212,82 @@ type SourceReference struct { | `namespace` | string | Yes | Namespace of the source resource | | `uid` | string | Yes | UID of the source resource | +### ScannerConfig + +`ScannerConfig` defines scanner-specific configuration that can override default settings. 
+ +```go +type ScannerConfig struct { + Image string `json:"image,omitempty"` + Resources *corev1.ResourceRequirements `json:"resources,omitempty"` + Timeout *metav1.Duration `json:"timeout,omitempty"` + TemplateURLs []string `json:"templateURLs,omitempty"` + NodeSelector map[string]string `json:"nodeSelector,omitempty"` + Tolerations []corev1.Toleration `json:"tolerations,omitempty"` +} +``` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `image` | string | No | Override the default scanner image | +| `resources` | ResourceRequirements | No | Resource requirements for the scanner pod | +| `timeout` | Duration | No | Override the default scan timeout | +| `templateURLs` | []string | No | Additional template repositories to clone | +| `nodeSelector` | map[string]string | No | Node selector for scanner pod scheduling | +| `tolerations` | []Toleration | No | Tolerations for scanner pod scheduling | + +**Example:** + +```yaml +scannerConfig: + image: "ghcr.io/custom/scanner:v1.0.0" + timeout: "1h" + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "2" + memory: 2Gi + nodeSelector: + node-type: scanner + tolerations: + - key: "dedicated" + operator: "Equal" + value: "scanner" + effect: "NoSchedule" +``` + +### JobReference + +`JobReference` contains information about the scanner job for tracking and debugging. + +```go +type JobReference struct { + Name string `json:"name"` + UID string `json:"uid"` + PodName string `json:"podName,omitempty"` + StartTime *metav1.Time `json:"startTime,omitempty"` +} +``` + +| Field | Type | Required | Description | +|-------|------|----------|-------------| +| `name` | string | Yes | Name of the Kubernetes Job | +| `uid` | string | Yes | UID of the Job | +| `podName` | string | No | Name of the scanner pod (for log retrieval) | +| `startTime` | *Time | No | When the job was created | + +**Example:** + +```yaml +jobRef: + name: my-scan-abc123 + uid: "12345678-1234-1234-1234-123456789012" + podName: my-scan-abc123-xyz + startTime: "2024-01-15T10:30:00Z" +``` + ### Finding `Finding` represents a single vulnerability or issue discovered during a scan. diff --git a/docs/user-guide.md b/docs/user-guide.md index 84edf2f..f5e2a17 100644 --- a/docs/user-guide.md +++ b/docs/user-guide.md @@ -7,6 +7,8 @@ This guide provides detailed instructions for using the Nuclei Operator to autom - [Introduction](#introduction) - [Installation](#installation) - [Basic Usage](#basic-usage) +- [Scanner Architecture](#scanner-architecture) +- [Annotation-Based Configuration](#annotation-based-configuration) - [Configuration Options](#configuration-options) - [Working with Ingress Resources](#working-with-ingress-resources) - [Working with VirtualService Resources](#working-with-virtualservice-resources) @@ -24,11 +26,13 @@ The Nuclei Operator automates security scanning by watching for Kubernetes Ingre 1. Extracts target URLs from the resource 2. Creates a NucleiScan custom resource -3. Executes a Nuclei security scan +3. Creates a Kubernetes Job to execute the Nuclei security scan in an isolated pod 4. Stores the results in the NucleiScan status This enables continuous security monitoring of your web applications without manual intervention. +The operator uses a **pod-based scanning architecture** where each scan runs in its own isolated Kubernetes Job, providing better scalability, reliability, and resource control. 
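+
+As a quick illustration of this architecture, you can list the scanner Jobs the operator creates and follow their logs. This is a minimal sketch: the label selector matches the labels the operator applies to scanner jobs, and the job name is a placeholder:
+
+```bash
+# List scanner jobs created by the operator
+kubectl get jobs -l app.kubernetes.io/managed-by=nuclei-operator,app.kubernetes.io/component=scanner
+
+# Follow the logs of a scanner job (the pod name is also recorded in the NucleiScan status under jobRef.podName)
+kubectl logs job/<scanner-job-name> -f
+```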
+ --- ## Installation @@ -151,6 +155,224 @@ kubectl apply -f manual-scan.yaml --- +## Scanner Architecture + +The nuclei-operator uses a pod-based scanning architecture for improved scalability and reliability: + +1. **Operator Pod**: Manages NucleiScan resources and creates scanner jobs +2. **Scanner Jobs**: Kubernetes Jobs that execute nuclei scans in isolated pods +3. **Direct Status Updates**: Scanner pods update NucleiScan status directly via the Kubernetes API + +### Architecture Diagram + +``` +┌─────────────────────────────────────────────────────────────────────┐ +│ Kubernetes Cluster │ +│ │ +│ ┌──────────────────┐ ┌──────────────────────────────────────┐ │ +│ │ Operator Pod │ │ Scanner Jobs │ │ +│ │ │ │ │ │ +│ │ ┌────────────┐ │ │ ┌─────────┐ ┌─────────┐ │ │ +│ │ │ Controller │──┼─────┼─▶│ Job 1 │ │ Job 2 │ ... │ │ +│ │ │ Manager │ │ │ │(Scanner)│ │(Scanner)│ │ │ +│ │ └────────────┘ │ │ └────┬────┘ └────┬────┘ │ │ +│ │ │ │ │ │ │ │ │ +│ └────────┼─────────┘ └───────┼────────────┼─────────────────┘ │ +│ │ │ │ │ +│ ▼ ▼ ▼ │ +│ ┌──────────────────────────────────────────────────────────────┐ │ +│ │ Kubernetes API Server │ │ +│ │ │ │ +│ │ ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ │ │ +│ │ │ NucleiScan │ │ NucleiScan │ │ NucleiScan │ ... │ │ +│ │ │ Resource │ │ Resource │ │ Resource │ │ │ +│ │ └─────────────┘ └─────────────┘ └─────────────┘ │ │ +│ └──────────────────────────────────────────────────────────────┘ │ +│ │ +└─────────────────────────────────────────────────────────────────────┘ +``` + +### Benefits + +- **Scalability**: Multiple scans can run concurrently across the cluster +- **Isolation**: Each scan runs in its own pod with dedicated resources +- **Reliability**: Scans survive operator restarts +- **Resource Control**: Per-scan resource limits and quotas +- **Observability**: Individual pod logs for each scan + +### Scanner Configuration + +Configure scanner behavior via Helm values: + +```yaml +scanner: + # Enable scanner RBAC resources + enabled: true + + # Scanner image (defaults to operator image) + image: "ghcr.io/morten-olsen/nuclei-operator:latest" + + # Default scan timeout + timeout: "30m" + + # Maximum concurrent scan jobs + maxConcurrent: 5 + + # Job TTL after completion (seconds) + ttlAfterFinished: 3600 + + # Default resource requirements for scanner pods + resources: + requests: + cpu: 100m + memory: 256Mi + limits: + cpu: "1" + memory: 1Gi + + # Default templates to use + defaultTemplates: [] + + # Default severity filter + defaultSeverity: [] +``` + +### Per-Scan Scanner Configuration + +You can override scanner settings for individual scans using the `scannerConfig` field in the NucleiScan spec: + +```yaml +apiVersion: nuclei.homelab.mortenolsen.pro/v1alpha1 +kind: NucleiScan +metadata: + name: custom-scan +spec: + sourceRef: + apiVersion: networking.k8s.io/v1 + kind: Ingress + name: my-ingress + namespace: default + uid: "abc123" + targets: + - https://example.com + scannerConfig: + # Override scanner image + image: "custom-scanner:latest" + # Override timeout + timeout: "1h" + # Custom resource requirements + resources: + requests: + cpu: 200m + memory: 512Mi + limits: + cpu: "2" + memory: 2Gi + # Node selector for scanner pod + nodeSelector: + node-type: scanner + # Tolerations for scanner pod + tolerations: + - key: "scanner" + operator: "Equal" + value: "true" + effect: "NoSchedule" +``` + +--- + +## Annotation-Based Configuration + +You can configure scanning behavior for individual Ingress or VirtualService resources using annotations. 
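+
+For example, an Ingress annotated with a severity filter and a schedule results in a NucleiScan whose spec carries those values. The sketch below is illustrative only; resource names, the UID, and the target URL are placeholders, and the actual spec is generated by the operator from the source resource:
+
+```yaml
+apiVersion: nuclei.homelab.mortenolsen.pro/v1alpha1
+kind: NucleiScan
+metadata:
+  name: myapp-ingress-scan   # generated as "<ingress-name>-scan"
+  namespace: default
+spec:
+  sourceRef:
+    apiVersion: networking.k8s.io/v1
+    kind: Ingress
+    name: myapp-ingress
+    namespace: default
+    uid: "..."
+  targets:
+    - https://myapp.example.com
+  severity:
+    - high
+    - critical
+  schedule: "0 2 * * *"
+```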
+ +### Supported Annotations + +| Annotation | Type | Default | Description | +|------------|------|---------|-------------| +| `nuclei.homelab.mortenolsen.pro/enabled` | bool | `true` | Enable/disable scanning for this resource | +| `nuclei.homelab.mortenolsen.pro/templates` | string | - | Comma-separated list of template paths or tags | +| `nuclei.homelab.mortenolsen.pro/severity` | string | - | Comma-separated severity filter: info,low,medium,high,critical | +| `nuclei.homelab.mortenolsen.pro/schedule` | string | - | Cron schedule for periodic scans | +| `nuclei.homelab.mortenolsen.pro/timeout` | duration | `30m` | Scan timeout | +| `nuclei.homelab.mortenolsen.pro/scanner-image` | string | - | Override scanner image | +| `nuclei.homelab.mortenolsen.pro/exclude-templates` | string | - | Templates to exclude | +| `nuclei.homelab.mortenolsen.pro/tags` | string | - | Template tags to include | +| `nuclei.homelab.mortenolsen.pro/exclude-tags` | string | - | Template tags to exclude | + +### Example Annotated Ingress + +```yaml +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: myapp-ingress + annotations: + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "medium,high,critical" + nuclei.homelab.mortenolsen.pro/schedule: "0 2 * * *" + nuclei.homelab.mortenolsen.pro/templates: "cves/,vulnerabilities/" +spec: + rules: + - host: myapp.example.com + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: myapp + port: + number: 80 +``` + +### Example Annotated VirtualService + +```yaml +apiVersion: networking.istio.io/v1beta1 +kind: VirtualService +metadata: + name: myapp-vs + annotations: + nuclei.homelab.mortenolsen.pro/enabled: "true" + nuclei.homelab.mortenolsen.pro/severity: "high,critical" + nuclei.homelab.mortenolsen.pro/timeout: "1h" + nuclei.homelab.mortenolsen.pro/tags: "cve,oast" +spec: + hosts: + - myapp.example.com + gateways: + - my-gateway + http: + - route: + - destination: + host: myapp + port: + number: 80 +``` + +### Disabling Scanning + +To disable scanning for a specific resource: + +```yaml +metadata: + annotations: + nuclei.homelab.mortenolsen.pro/enabled: "false" +``` + +This is useful when you want to temporarily exclude certain resources from scanning without removing them from the cluster. + +### Annotation Precedence + +When both annotations and NucleiScan spec fields are present, the following precedence applies: + +1. **NucleiScan spec fields** (highest priority) - Direct configuration in the NucleiScan resource +2. **Annotations** - Configuration from the source Ingress/VirtualService +3. **Helm values** - Default configuration from the operator deployment +4. **Built-in defaults** (lowest priority) - Hardcoded defaults in the operator + +--- + ## Configuration Options ### Severity Filtering diff --git a/internal/annotations/annotations.go b/internal/annotations/annotations.go new file mode 100644 index 0000000..4cbdb49 --- /dev/null +++ b/internal/annotations/annotations.go @@ -0,0 +1,211 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package annotations + +import ( + "strconv" + "strings" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" +) + +const ( + // AnnotationPrefix is the prefix for all nuclei annotations + AnnotationPrefix = "nuclei.homelab.mortenolsen.pro/" + + // AnnotationEnabled controls whether scanning is enabled for a resource + AnnotationEnabled = AnnotationPrefix + "enabled" + + // AnnotationTemplates specifies comma-separated template paths or tags + AnnotationTemplates = AnnotationPrefix + "templates" + + // AnnotationSeverity specifies comma-separated severity filter + AnnotationSeverity = AnnotationPrefix + "severity" + + // AnnotationSchedule specifies the cron schedule for periodic scans + AnnotationSchedule = AnnotationPrefix + "schedule" + + // AnnotationTimeout specifies the scan timeout + AnnotationTimeout = AnnotationPrefix + "timeout" + + // AnnotationScannerImage overrides the scanner image + AnnotationScannerImage = AnnotationPrefix + "scanner-image" + + // AnnotationExcludeTemplates specifies templates to exclude + AnnotationExcludeTemplates = AnnotationPrefix + "exclude-templates" + + // AnnotationRateLimit specifies requests per second limit + AnnotationRateLimit = AnnotationPrefix + "rate-limit" + + // AnnotationTags specifies template tags to include + AnnotationTags = AnnotationPrefix + "tags" + + // AnnotationExcludeTags specifies template tags to exclude + AnnotationExcludeTags = AnnotationPrefix + "exclude-tags" +) + +// ScanConfig holds parsed annotation configuration +type ScanConfig struct { + // Enabled indicates if scanning is enabled + Enabled bool + + // Templates to use for scanning + Templates []string + + // Severity filter + Severity []string + + // Schedule for periodic scans (cron format) + Schedule string + + // Timeout for the scan + Timeout *metav1.Duration + + // ScannerImage overrides the default scanner image + ScannerImage string + + // ExcludeTemplates to exclude from scanning + ExcludeTemplates []string + + // RateLimit for requests per second + RateLimit int + + // Tags to include + Tags []string + + // ExcludeTags to exclude + ExcludeTags []string +} + +// ParseAnnotations extracts scan configuration from resource annotations +func ParseAnnotations(annotations map[string]string) *ScanConfig { + config := &ScanConfig{ + Enabled: true, // Default to enabled + } + + if annotations == nil { + return config + } + + // Parse enabled + if v, ok := annotations[AnnotationEnabled]; ok { + config.Enabled = strings.ToLower(v) == "true" + } + + // Parse templates + if v, ok := annotations[AnnotationTemplates]; ok && v != "" { + config.Templates = splitAndTrim(v) + } + + // Parse severity + if v, ok := annotations[AnnotationSeverity]; ok && v != "" { + config.Severity = splitAndTrim(v) + } + + // Parse schedule + if v, ok := annotations[AnnotationSchedule]; ok { + config.Schedule = v + } + + // Parse timeout + if v, ok := annotations[AnnotationTimeout]; ok { + if d, err := time.ParseDuration(v); err == nil { + config.Timeout = &metav1.Duration{Duration: d} + } + } + + // Parse scanner image + if v, ok := annotations[AnnotationScannerImage]; ok { + config.ScannerImage = v + } + + // Parse exclude templates + if v, ok := annotations[AnnotationExcludeTemplates]; ok && v != "" { + config.ExcludeTemplates = splitAndTrim(v) + } + + // Parse rate limit + if v, ok := 
annotations[AnnotationRateLimit]; ok { + if n, err := strconv.Atoi(v); err == nil { + config.RateLimit = n + } + } + + // Parse tags + if v, ok := annotations[AnnotationTags]; ok && v != "" { + config.Tags = splitAndTrim(v) + } + + // Parse exclude tags + if v, ok := annotations[AnnotationExcludeTags]; ok && v != "" { + config.ExcludeTags = splitAndTrim(v) + } + + return config +} + +// ApplyToNucleiScanSpec applies the annotation config to a NucleiScan spec +func (c *ScanConfig) ApplyToNucleiScanSpec(spec *nucleiv1alpha1.NucleiScanSpec) { + // Apply templates if specified + if len(c.Templates) > 0 { + spec.Templates = c.Templates + } + + // Apply severity if specified + if len(c.Severity) > 0 { + spec.Severity = c.Severity + } + + // Apply schedule if specified + if c.Schedule != "" { + spec.Schedule = c.Schedule + } + + // Apply scanner config if any scanner-specific settings are specified + if c.ScannerImage != "" || c.Timeout != nil { + if spec.ScannerConfig == nil { + spec.ScannerConfig = &nucleiv1alpha1.ScannerConfig{} + } + if c.ScannerImage != "" { + spec.ScannerConfig.Image = c.ScannerImage + } + if c.Timeout != nil { + spec.ScannerConfig.Timeout = c.Timeout + } + } +} + +// IsEnabled returns true if scanning is enabled +func (c *ScanConfig) IsEnabled() bool { + return c.Enabled +} + +// splitAndTrim splits a string by comma and trims whitespace from each element +func splitAndTrim(s string) []string { + parts := strings.Split(s, ",") + result := make([]string, 0, len(parts)) + for _, p := range parts { + p = strings.TrimSpace(p) + if p != "" { + result = append(result, p) + } + } + return result +} diff --git a/internal/controller/ingress_controller.go b/internal/controller/ingress_controller.go index 149833d..1e695da 100644 --- a/internal/controller/ingress_controller.go +++ b/internal/controller/ingress_controller.go @@ -31,6 +31,7 @@ import ( logf "sigs.k8s.io/controller-runtime/pkg/log" nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" + "github.com/mortenolsen/nuclei-operator/internal/annotations" ) // IngressReconciler reconciles Ingress objects and creates NucleiScan resources @@ -59,12 +60,8 @@ func (r *IngressReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, err } - // Extract target URLs from the Ingress - targets := extractURLsFromIngress(ingress) - if len(targets) == 0 { - log.Info("No targets extracted from Ingress, skipping NucleiScan creation") - return ctrl.Result{}, nil - } + // Parse annotations to get scan configuration + scanConfig := annotations.ParseAnnotations(ingress.Annotations) // Define the NucleiScan name based on the Ingress name nucleiScanName := fmt.Sprintf("%s-scan", ingress.Name) @@ -81,23 +78,48 @@ func (r *IngressReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, err } + // Check if scanning is disabled via annotations + if !scanConfig.IsEnabled() { + // Scanning disabled - delete existing NucleiScan if it exists + if err == nil { + log.Info("Scanning disabled via annotation, deleting existing NucleiScan", "nucleiScan", nucleiScanName) + if err := r.Delete(ctx, existingScan); err != nil && !apierrors.IsNotFound(err) { + log.Error(err, "Failed to delete NucleiScan") + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil + } + + // Extract target URLs from the Ingress + targets := extractURLsFromIngress(ingress) + if len(targets) == 0 { + log.Info("No targets extracted from Ingress, skipping NucleiScan creation") + return ctrl.Result{}, nil + } + 
if apierrors.IsNotFound(err) { // Create a new NucleiScan + spec := nucleiv1alpha1.NucleiScanSpec{ + SourceRef: nucleiv1alpha1.SourceReference{ + APIVersion: "networking.k8s.io/v1", + Kind: "Ingress", + Name: ingress.Name, + Namespace: ingress.Namespace, + UID: string(ingress.UID), + }, + Targets: targets, + } + + // Apply annotation configuration to the spec + scanConfig.ApplyToNucleiScanSpec(&spec) + nucleiScan := &nucleiv1alpha1.NucleiScan{ ObjectMeta: metav1.ObjectMeta{ Name: nucleiScanName, Namespace: ingress.Namespace, }, - Spec: nucleiv1alpha1.NucleiScanSpec{ - SourceRef: nucleiv1alpha1.SourceReference{ - APIVersion: "networking.k8s.io/v1", - Kind: "Ingress", - Name: ingress.Name, - Namespace: ingress.Namespace, - UID: string(ingress.UID), - }, - Targets: targets, - }, + Spec: spec, } // Set owner reference for garbage collection @@ -115,18 +137,31 @@ func (r *IngressReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ct return ctrl.Result{}, nil } - // NucleiScan exists - check if targets need to be updated + // NucleiScan exists - check if it needs to be updated + needsUpdate := false + + // Check if targets changed if !reflect.DeepEqual(existingScan.Spec.Targets, targets) { existingScan.Spec.Targets = targets - // Also update the SourceRef UID in case it changed (e.g., Ingress was recreated) - existingScan.Spec.SourceRef.UID = string(ingress.UID) + needsUpdate = true + } + // Also update the SourceRef UID in case it changed (e.g., Ingress was recreated) + if existingScan.Spec.SourceRef.UID != string(ingress.UID) { + existingScan.Spec.SourceRef.UID = string(ingress.UID) + needsUpdate = true + } + + // Apply annotation configuration + scanConfig.ApplyToNucleiScanSpec(&existingScan.Spec) + + if needsUpdate { if err := r.Update(ctx, existingScan); err != nil { - log.Error(err, "Failed to update NucleiScan targets") + log.Error(err, "Failed to update NucleiScan") return ctrl.Result{}, err } - log.Info("Updated NucleiScan targets for Ingress", "nucleiScan", nucleiScanName, "targets", targets) + log.Info("Updated NucleiScan for Ingress", "nucleiScan", nucleiScanName, "targets", targets) } return ctrl.Result{}, nil diff --git a/internal/controller/nucleiscan_controller.go b/internal/controller/nucleiscan_controller.go index e0e81c7..285982e 100644 --- a/internal/controller/nucleiscan_controller.go +++ b/internal/controller/nucleiscan_controller.go @@ -23,16 +23,20 @@ import ( "os" "time" + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/client-go/tools/record" ctrl "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" logf "sigs.k8s.io/controller-runtime/pkg/log" nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" - "github.com/mortenolsen/nuclei-operator/internal/scanner" + "github.com/mortenolsen/nuclei-operator/internal/jobmanager" ) const ( @@ -74,66 +78,78 @@ const ( ReasonScanSuspended = "ScanSuspended" ) -// BackoffConfig holds configuration for exponential backoff -type BackoffConfig struct { - Initial time.Duration - Max time.Duration - Multiplier float64 +// ReconcilerConfig holds configuration for the NucleiScanReconciler +type ReconcilerConfig struct { + RescanAge time.Duration + BackoffInitial time.Duration + BackoffMax time.Duration + BackoffMultiplier float64 +} + +// 
DefaultReconcilerConfig returns a ReconcilerConfig with default values +func DefaultReconcilerConfig() ReconcilerConfig { + config := ReconcilerConfig{ + RescanAge: defaultRescanAge, + BackoffInitial: defaultBackoffInitial, + BackoffMax: defaultBackoffMax, + BackoffMultiplier: defaultBackoffMultiplier, + } + + // Override from environment variables + if envVal := os.Getenv(envRescanAge); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + config.RescanAge = parsed + } + } + + if envVal := os.Getenv(envBackoffInitial); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + config.BackoffInitial = parsed + } + } + + if envVal := os.Getenv(envBackoffMax); envVal != "" { + if parsed, err := time.ParseDuration(envVal); err == nil { + config.BackoffMax = parsed + } + } + + if envVal := os.Getenv(envBackoffMultiplier); envVal != "" { + if parsed, err := parseFloat(envVal); err == nil && parsed > 0 { + config.BackoffMultiplier = parsed + } + } + + return config } // NucleiScanReconciler reconciles a NucleiScan object type NucleiScanReconciler struct { client.Client Scheme *runtime.Scheme - Scanner scanner.Scanner - RescanAge time.Duration + Recorder record.EventRecorder + JobManager *jobmanager.JobManager + Config ReconcilerConfig HTTPClient *http.Client - Backoff BackoffConfig } -// NewNucleiScanReconciler creates a new NucleiScanReconciler with default settings -func NewNucleiScanReconciler(c client.Client, scheme *runtime.Scheme, s scanner.Scanner) *NucleiScanReconciler { - rescanAge := defaultRescanAge - if envVal := os.Getenv(envRescanAge); envVal != "" { - if parsed, err := time.ParseDuration(envVal); err == nil { - rescanAge = parsed - } - } - - backoffInitial := defaultBackoffInitial - if envVal := os.Getenv(envBackoffInitial); envVal != "" { - if parsed, err := time.ParseDuration(envVal); err == nil { - backoffInitial = parsed - } - } - - backoffMax := defaultBackoffMax - if envVal := os.Getenv(envBackoffMax); envVal != "" { - if parsed, err := time.ParseDuration(envVal); err == nil { - backoffMax = parsed - } - } - - backoffMultiplier := defaultBackoffMultiplier - if envVal := os.Getenv(envBackoffMultiplier); envVal != "" { - if parsed, err := parseFloat(envVal); err == nil && parsed > 0 { - backoffMultiplier = parsed - } - } - +// NewNucleiScanReconciler creates a new NucleiScanReconciler with the given configuration +func NewNucleiScanReconciler( + c client.Client, + scheme *runtime.Scheme, + recorder record.EventRecorder, + jobManager *jobmanager.JobManager, + config ReconcilerConfig, +) *NucleiScanReconciler { return &NucleiScanReconciler{ - Client: c, - Scheme: scheme, - Scanner: s, - RescanAge: rescanAge, + Client: c, + Scheme: scheme, + Recorder: recorder, + JobManager: jobManager, + Config: config, HTTPClient: &http.Client{ Timeout: 10 * time.Second, }, - Backoff: BackoffConfig{ - Initial: backoffInitial, - Max: backoffMax, - Multiplier: backoffMultiplier, - }, } } @@ -147,15 +163,15 @@ func parseFloat(s string) (float64, error) { // calculateBackoff calculates the next backoff duration based on retry count func (r *NucleiScanReconciler) calculateBackoff(retryCount int) time.Duration { if retryCount <= 0 { - return r.Backoff.Initial + return r.Config.BackoffInitial } // Calculate exponential backoff: initial * multiplier^retryCount - backoff := float64(r.Backoff.Initial) + backoff := float64(r.Config.BackoffInitial) for i := 0; i < retryCount; i++ { - backoff *= r.Backoff.Multiplier - if backoff > float64(r.Backoff.Max) { - return 
r.Backoff.Max + backoff *= r.Config.BackoffMultiplier + if backoff > float64(r.Config.BackoffMax) { + return r.Config.BackoffMax } } @@ -166,6 +182,8 @@ func (r *NucleiScanReconciler) calculateBackoff(retryCount int) time.Duration { // +kubebuilder:rbac:groups=nuclei.homelab.mortenolsen.pro,resources=nucleiscans/status,verbs=get;update;patch // +kubebuilder:rbac:groups=nuclei.homelab.mortenolsen.pro,resources=nucleiscans/finalizers,verbs=update // +kubebuilder:rbac:groups="",resources=events,verbs=create;patch +// +kubebuilder:rbac:groups=batch,resources=jobs,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch // Reconcile is part of the main kubernetes reconciliation loop which aims to // move the current state of the cluster closer to the desired state. @@ -215,15 +233,7 @@ func (r *NucleiScanReconciler) Reconcile(ctx context.Context, req ctrl.Request) case nucleiv1alpha1.ScanPhasePending: return r.handlePendingPhase(ctx, nucleiScan) case nucleiv1alpha1.ScanPhaseRunning: - // Running phase on startup means the scan was interrupted (operator restart) - // Reset to Pending to re-run the scan - log.Info("Found stale Running scan, resetting to Pending (operator likely restarted)") - nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending - nucleiScan.Status.LastError = "Scan was interrupted due to operator restart, re-queuing" - if err := r.Status().Update(ctx, nucleiScan); err != nil { - return ctrl.Result{}, err - } - return ctrl.Result{Requeue: true}, nil + return r.handleRunningPhase(ctx, nucleiScan) case nucleiv1alpha1.ScanPhaseCompleted: return r.handleCompletedPhase(ctx, nucleiScan) case nucleiv1alpha1.ScanPhaseFailed: @@ -245,8 +255,15 @@ func (r *NucleiScanReconciler) handleDeletion(ctx context.Context, nucleiScan *n if controllerutil.ContainsFinalizer(nucleiScan, finalizerName) { log.Info("Handling deletion, performing cleanup", "name", nucleiScan.Name, "namespace", nucleiScan.Namespace) - // Perform any cleanup here (e.g., cancel running scans) - // In our synchronous implementation, there's nothing to clean up + // Clean up any running scanner job + if nucleiScan.Status.JobRef != nil { + log.Info("Deleting scanner job", "job", nucleiScan.Status.JobRef.Name) + if err := r.JobManager.DeleteJob(ctx, nucleiScan.Status.JobRef.Name, nucleiScan.Namespace); err != nil { + if !apierrors.IsNotFound(err) { + log.Error(err, "Failed to delete scanner job", "job", nucleiScan.Status.JobRef.Name) + } + } + } // Remove finalizer log.Info("Removing finalizer", "finalizer", finalizerName) @@ -261,56 +278,55 @@ func (r *NucleiScanReconciler) handleDeletion(ctx context.Context, nucleiScan *n return ctrl.Result{}, nil } -// handlePendingPhase handles the Pending phase - starts a new scan +// handlePendingPhase handles the Pending phase - creates a scanner job func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) { - log := logf.FromContext(ctx) - log.Info("Preparing to scan", "targets", len(nucleiScan.Spec.Targets)) + logger := logf.FromContext(ctx) + logger.Info("Preparing to scan", "targets", len(nucleiScan.Spec.Targets)) + + // Check if we're at capacity + atCapacity, err := r.JobManager.AtCapacity(ctx) + if err != nil { + logger.Error(err, "Failed to check job capacity") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + if atCapacity { + logger.Info("At maximum concurrent scans, requeuing") + r.Recorder.Event(nucleiScan, 
corev1.EventTypeNormal, "AtCapacity", + "Maximum concurrent scans reached, waiting for capacity") + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } // Check if at least one target is available before scanning availableTargets, unavailableTargets := r.checkTargetsAvailability(ctx, nucleiScan.Spec.Targets) if len(availableTargets) == 0 { - // Calculate backoff based on retry count - retryCount := nucleiScan.Status.RetryCount - backoffDuration := r.calculateBackoff(retryCount) - - log.Info("No targets are available yet, waiting with backoff...", - "unavailable", len(unavailableTargets), - "retryCount", retryCount, - "backoffDuration", backoffDuration) - - // Update condition and retry count - now := metav1.Now() - meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ - Type: ConditionTypeReady, - Status: metav1.ConditionFalse, - Reason: "WaitingForTargets", - Message: fmt.Sprintf("Waiting for targets to become available (%d unavailable, retry #%d, next check in %v)", len(unavailableTargets), retryCount+1, backoffDuration), - LastTransitionTime: now, - }) - nucleiScan.Status.LastError = fmt.Sprintf("Targets not available: %v", unavailableTargets) - nucleiScan.Status.RetryCount = retryCount + 1 - nucleiScan.Status.LastRetryTime = &now - - if err := r.Status().Update(ctx, nucleiScan); err != nil { - return ctrl.Result{}, err - } - - // Requeue with exponential backoff - return ctrl.Result{RequeueAfter: backoffDuration}, nil + return r.handleTargetsUnavailable(ctx, nucleiScan, unavailableTargets) } // Reset retry count since targets are now available if nucleiScan.Status.RetryCount > 0 { - log.Info("Targets now available, resetting retry count", "previousRetries", nucleiScan.Status.RetryCount) + logger.Info("Targets now available, resetting retry count", "previousRetries", nucleiScan.Status.RetryCount) nucleiScan.Status.RetryCount = 0 nucleiScan.Status.LastRetryTime = nil } - log.Info("Starting scan", "availableTargets", len(availableTargets), "unavailableTargets", len(unavailableTargets)) + logger.Info("Creating scanner job", "availableTargets", len(availableTargets), "unavailableTargets", len(unavailableTargets)) - // Update status to Running + // Create the scanner job + job, err := r.JobManager.CreateScanJob(ctx, nucleiScan) + if err != nil { + logger.Error(err, "Failed to create scanner job") + r.Recorder.Event(nucleiScan, corev1.EventTypeWarning, "JobCreationFailed", err.Error()) + return ctrl.Result{RequeueAfter: 30 * time.Second}, nil + } + + // Update status to Running with job reference now := metav1.Now() nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhaseRunning + nucleiScan.Status.JobRef = &nucleiv1alpha1.JobReference{ + Name: job.Name, + UID: string(job.UID), + StartTime: &now, + } nucleiScan.Status.LastScanTime = &now nucleiScan.Status.LastError = "" nucleiScan.Status.ObservedGeneration = nucleiScan.Generation @@ -320,30 +336,150 @@ func (r *NucleiScanReconciler) handlePendingPhase(ctx context.Context, nucleiSca Type: ConditionTypeScanActive, Status: metav1.ConditionTrue, Reason: ReasonScanRunning, - Message: fmt.Sprintf("Scan is in progress (%d targets)", len(availableTargets)), + Message: fmt.Sprintf("Scanner job %s created for %d targets", job.Name, len(nucleiScan.Spec.Targets)), LastTransitionTime: now, }) + if err := r.Status().Update(ctx, nucleiScan); err != nil { + logger.Error(err, "Failed to update status") + return ctrl.Result{}, err + } + + r.Recorder.Event(nucleiScan, corev1.EventTypeNormal, "ScanJobCreated", + fmt.Sprintf("Created scanner 
job %s", job.Name)) + + // Requeue to monitor job status + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil +} + +// handleTargetsUnavailable handles the case when no targets are available +func (r *NucleiScanReconciler) handleTargetsUnavailable(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan, unavailableTargets []string) (ctrl.Result, error) { + logger := logf.FromContext(ctx) + + // Calculate backoff based on retry count + retryCount := nucleiScan.Status.RetryCount + backoffDuration := r.calculateBackoff(retryCount) + + logger.Info("No targets are available yet, waiting with backoff...", + "unavailable", len(unavailableTargets), + "retryCount", retryCount, + "backoffDuration", backoffDuration) + + // Update condition and retry count + now := metav1.Now() + meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ + Type: ConditionTypeReady, + Status: metav1.ConditionFalse, + Reason: "WaitingForTargets", + Message: fmt.Sprintf("Waiting for targets to become available (%d unavailable, retry #%d, next check in %v)", len(unavailableTargets), retryCount+1, backoffDuration), + LastTransitionTime: now, + }) + nucleiScan.Status.LastError = fmt.Sprintf("Targets not available: %v", unavailableTargets) + nucleiScan.Status.RetryCount = retryCount + 1 + nucleiScan.Status.LastRetryTime = &now + if err := r.Status().Update(ctx, nucleiScan); err != nil { return ctrl.Result{}, err } - // Build scan options - options := scanner.ScanOptions{ - Templates: nucleiScan.Spec.Templates, - Severity: nucleiScan.Spec.Severity, - Timeout: 30 * time.Minute, // Default timeout + // Requeue with exponential backoff + return ctrl.Result{RequeueAfter: backoffDuration}, nil +} + +// handleRunningPhase handles the Running phase - monitors the scanner job +func (r *NucleiScanReconciler) handleRunningPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) { + logger := logf.FromContext(ctx) + + // Check if we have a job reference + if nucleiScan.Status.JobRef == nil { + logger.Info("No job reference found, resetting to Pending") + nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.LastError = "No job reference found, re-queuing scan" + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{Requeue: true}, nil } - // Execute the scan with available targets only - result, err := r.Scanner.Scan(ctx, availableTargets, options) + // Get the job + job, err := r.JobManager.GetJob(ctx, nucleiScan.Status.JobRef.Name, nucleiScan.Namespace) if err != nil { - log.Error(err, "Scan failed") - return r.handleScanError(ctx, nucleiScan, err) + if apierrors.IsNotFound(err) { + logger.Info("Scanner job not found, resetting to Pending") + nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.LastError = "Scanner job not found, re-queuing scan" + nucleiScan.Status.JobRef = nil + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + return ctrl.Result{Requeue: true}, nil + } + return ctrl.Result{}, err } - // Update status with results - return r.handleScanSuccess(ctx, nucleiScan, result) + // Update pod name if available + if nucleiScan.Status.JobRef.PodName == "" { + podName, _ := r.JobManager.GetJobPodName(ctx, job) + if podName != "" { + nucleiScan.Status.JobRef.PodName = podName + if err := r.Status().Update(ctx, nucleiScan); err != nil { + logger.Error(err, "Failed to update pod name") + } + } + } + + // Check job status - the 
scanner pod updates the NucleiScan status directly + // We just need to detect completion/failure for events + if r.JobManager.IsJobSuccessful(job) { + logger.Info("Scanner job completed successfully") + r.Recorder.Event(nucleiScan, corev1.EventTypeNormal, "ScanCompleted", + fmt.Sprintf("Scan completed with %d findings", len(nucleiScan.Status.Findings))) + + // Status is already updated by the scanner pod + // Just schedule next scan if needed + return r.scheduleNextScan(ctx, nucleiScan) + } + + if r.JobManager.IsJobFailed(job) { + reason := r.JobManager.GetJobFailureReason(job) + logger.Info("Scanner job failed", "reason", reason) + + // Update status if not already updated by scanner + if nucleiScan.Status.Phase != nucleiv1alpha1.ScanPhaseFailed { + now := metav1.Now() + nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed + nucleiScan.Status.LastError = reason + nucleiScan.Status.CompletionTime = &now + + // Set conditions + meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ + Type: ConditionTypeScanActive, + Status: metav1.ConditionFalse, + Reason: ReasonScanFailed, + Message: reason, + LastTransitionTime: now, + }) + + meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ + Type: ConditionTypeReady, + Status: metav1.ConditionFalse, + Reason: ReasonScanFailed, + Message: reason, + LastTransitionTime: now, + }) + + if err := r.Status().Update(ctx, nucleiScan); err != nil { + return ctrl.Result{}, err + } + } + + r.Recorder.Event(nucleiScan, corev1.EventTypeWarning, "ScanFailed", reason) + return ctrl.Result{}, nil + } + + // Job still running, requeue to check again + logger.V(1).Info("Scanner job still running", "job", job.Name) + return ctrl.Result{RequeueAfter: 15 * time.Second}, nil } // checkTargetsAvailability checks which targets are reachable @@ -373,80 +509,6 @@ func (r *NucleiScanReconciler) checkTargetsAvailability(ctx context.Context, tar return available, unavailable } -// handleScanSuccess updates the status after a successful scan -func (r *NucleiScanReconciler) handleScanSuccess(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan, result *scanner.ScanResult) (ctrl.Result, error) { - log := logf.FromContext(ctx) - log.Info("Scan completed successfully", "findings", len(result.Findings), "duration", result.Duration) - - now := metav1.Now() - nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhaseCompleted - nucleiScan.Status.CompletionTime = &now - nucleiScan.Status.Findings = result.Findings - nucleiScan.Status.Summary = &result.Summary - nucleiScan.Status.LastError = "" - - // Set conditions - meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ - Type: ConditionTypeScanActive, - Status: metav1.ConditionFalse, - Reason: ReasonScanCompleted, - Message: "Scan completed successfully", - LastTransitionTime: now, - }) - - message := fmt.Sprintf("Scan completed with %d findings", len(result.Findings)) - meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ - Type: ConditionTypeReady, - Status: metav1.ConditionTrue, - Reason: ReasonScanCompleted, - Message: message, - LastTransitionTime: now, - }) - - if err := r.Status().Update(ctx, nucleiScan); err != nil { - return ctrl.Result{}, err - } - - // If there's a schedule, calculate next scan time - if nucleiScan.Spec.Schedule != "" { - return r.scheduleNextScan(ctx, nucleiScan) - } - - return ctrl.Result{}, nil -} - -// handleScanError updates the status after a failed scan -func (r *NucleiScanReconciler) handleScanError(ctx context.Context, 
nucleiScan *nucleiv1alpha1.NucleiScan, scanErr error) (ctrl.Result, error) { - now := metav1.Now() - nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed - nucleiScan.Status.CompletionTime = &now - nucleiScan.Status.LastError = scanErr.Error() - - // Set conditions - meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ - Type: ConditionTypeScanActive, - Status: metav1.ConditionFalse, - Reason: ReasonScanFailed, - Message: scanErr.Error(), - LastTransitionTime: now, - }) - - meta.SetStatusCondition(&nucleiScan.Status.Conditions, metav1.Condition{ - Type: ConditionTypeReady, - Status: metav1.ConditionFalse, - Reason: ReasonScanFailed, - Message: scanErr.Error(), - LastTransitionTime: now, - }) - - if err := r.Status().Update(ctx, nucleiScan); err != nil { - return ctrl.Result{}, err - } - - // Requeue with backoff for retry - return ctrl.Result{RequeueAfter: defaultErrorRequeueAfter}, nil -} - // handleCompletedPhase handles the Completed phase - checks for scheduled rescans func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) { log := logf.FromContext(ctx) @@ -455,6 +517,7 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS if nucleiScan.Generation != nucleiScan.Status.ObservedGeneration { log.Info("Spec changed, triggering new scan") nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.JobRef = nil // Clear old job reference if err := r.Status().Update(ctx, nucleiScan); err != nil { return ctrl.Result{}, err } @@ -469,9 +532,10 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS // Check if scan results are stale (older than RescanAge) if nucleiScan.Status.CompletionTime != nil { age := time.Since(nucleiScan.Status.CompletionTime.Time) - if age > r.RescanAge { - log.Info("Scan results are stale, triggering rescan", "age", age, "maxAge", r.RescanAge) + if age > r.Config.RescanAge { + log.Info("Scan results are stale, triggering rescan", "age", age, "maxAge", r.Config.RescanAge) nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.JobRef = nil // Clear old job reference nucleiScan.Status.LastError = fmt.Sprintf("Automatic rescan triggered (results were %v old)", age.Round(time.Hour)) if err := r.Status().Update(ctx, nucleiScan); err != nil { return ctrl.Result{}, err @@ -480,7 +544,7 @@ func (r *NucleiScanReconciler) handleCompletedPhase(ctx context.Context, nucleiS } // Schedule a requeue for when the results will become stale - timeUntilStale := r.RescanAge - age + timeUntilStale := r.Config.RescanAge - age log.V(1).Info("Scan results still fresh, will check again later", "timeUntilStale", timeUntilStale) return ctrl.Result{RequeueAfter: timeUntilStale}, nil } @@ -496,6 +560,7 @@ func (r *NucleiScanReconciler) handleFailedPhase(ctx context.Context, nucleiScan if nucleiScan.Generation != nucleiScan.Status.ObservedGeneration { log.Info("Spec changed, triggering new scan") nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending + nucleiScan.Status.JobRef = nil // Clear old job reference if err := r.Status().Update(ctx, nucleiScan); err != nil { return ctrl.Result{}, err } @@ -512,6 +577,11 @@ func (r *NucleiScanReconciler) handleFailedPhase(ctx context.Context, nucleiScan func (r *NucleiScanReconciler) scheduleNextScan(ctx context.Context, nucleiScan *nucleiv1alpha1.NucleiScan) (ctrl.Result, error) { log := logf.FromContext(ctx) + // If there's no schedule, nothing to do + if 
nucleiScan.Spec.Schedule == "" { + return ctrl.Result{}, nil + } + // Parse cron schedule nextTime, err := getNextScheduleTime(nucleiScan.Spec.Schedule, time.Now()) if err != nil { @@ -550,6 +620,7 @@ func (r *NucleiScanReconciler) checkScheduledScan(ctx context.Context, nucleiSca log.Info("Scheduled scan time reached, triggering scan") nucleiScan.Status.Phase = nucleiv1alpha1.ScanPhasePending nucleiScan.Status.NextScheduledTime = nil + nucleiScan.Status.JobRef = nil // Clear old job reference if err := r.Status().Update(ctx, nucleiScan); err != nil { return ctrl.Result{}, err } @@ -602,6 +673,7 @@ func getNextScheduleTime(schedule string, from time.Time) (time.Time, error) { func (r *NucleiScanReconciler) SetupWithManager(mgr ctrl.Manager) error { return ctrl.NewControllerManagedBy(mgr). For(&nucleiv1alpha1.NucleiScan{}). + Owns(&batchv1.Job{}). // Watch Jobs owned by NucleiScan Named("nucleiscan"). Complete(r) } diff --git a/internal/controller/virtualservice_controller.go b/internal/controller/virtualservice_controller.go index 16b8f47..ec0c066 100644 --- a/internal/controller/virtualservice_controller.go +++ b/internal/controller/virtualservice_controller.go @@ -33,6 +33,7 @@ import ( istionetworkingv1beta1 "istio.io/client-go/pkg/apis/networking/v1beta1" nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" + "github.com/mortenolsen/nuclei-operator/internal/annotations" ) // VirtualServiceReconciler reconciles VirtualService objects and creates NucleiScan resources @@ -61,12 +62,8 @@ func (r *VirtualServiceReconciler) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, err } - // Extract target URLs from the VirtualService - targets := extractURLsFromVirtualService(virtualService) - if len(targets) == 0 { - log.Info("No targets extracted from VirtualService, skipping NucleiScan creation") - return ctrl.Result{}, nil - } + // Parse annotations to get scan configuration + scanConfig := annotations.ParseAnnotations(virtualService.Annotations) // Define the NucleiScan name based on the VirtualService name nucleiScanName := fmt.Sprintf("%s-scan", virtualService.Name) @@ -83,23 +80,48 @@ func (r *VirtualServiceReconciler) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, err } + // Check if scanning is disabled via annotations + if !scanConfig.IsEnabled() { + // Scanning disabled - delete existing NucleiScan if it exists + if err == nil { + log.Info("Scanning disabled via annotation, deleting existing NucleiScan", "nucleiScan", nucleiScanName) + if err := r.Delete(ctx, existingScan); err != nil && !apierrors.IsNotFound(err) { + log.Error(err, "Failed to delete NucleiScan") + return ctrl.Result{}, err + } + } + return ctrl.Result{}, nil + } + + // Extract target URLs from the VirtualService + targets := extractURLsFromVirtualService(virtualService) + if len(targets) == 0 { + log.Info("No targets extracted from VirtualService, skipping NucleiScan creation") + return ctrl.Result{}, nil + } + if apierrors.IsNotFound(err) { // Create a new NucleiScan + spec := nucleiv1alpha1.NucleiScanSpec{ + SourceRef: nucleiv1alpha1.SourceReference{ + APIVersion: "networking.istio.io/v1beta1", + Kind: "VirtualService", + Name: virtualService.Name, + Namespace: virtualService.Namespace, + UID: string(virtualService.UID), + }, + Targets: targets, + } + + // Apply annotation configuration to the spec + scanConfig.ApplyToNucleiScanSpec(&spec) + nucleiScan := &nucleiv1alpha1.NucleiScan{ ObjectMeta: metav1.ObjectMeta{ Name: nucleiScanName, Namespace: 
virtualService.Namespace, }, - Spec: nucleiv1alpha1.NucleiScanSpec{ - SourceRef: nucleiv1alpha1.SourceReference{ - APIVersion: "networking.istio.io/v1beta1", - Kind: "VirtualService", - Name: virtualService.Name, - Namespace: virtualService.Namespace, - UID: string(virtualService.UID), - }, - Targets: targets, - }, + Spec: spec, } // Set owner reference for garbage collection @@ -117,18 +139,31 @@ func (r *VirtualServiceReconciler) Reconcile(ctx context.Context, req ctrl.Reque return ctrl.Result{}, nil } - // NucleiScan exists - check if targets need to be updated + // NucleiScan exists - check if it needs to be updated + needsUpdate := false + + // Check if targets changed if !reflect.DeepEqual(existingScan.Spec.Targets, targets) { existingScan.Spec.Targets = targets - // Also update the SourceRef UID in case it changed (e.g., VirtualService was recreated) - existingScan.Spec.SourceRef.UID = string(virtualService.UID) + needsUpdate = true + } + // Also update the SourceRef UID in case it changed (e.g., VirtualService was recreated) + if existingScan.Spec.SourceRef.UID != string(virtualService.UID) { + existingScan.Spec.SourceRef.UID = string(virtualService.UID) + needsUpdate = true + } + + // Apply annotation configuration + scanConfig.ApplyToNucleiScanSpec(&existingScan.Spec) + + if needsUpdate { if err := r.Update(ctx, existingScan); err != nil { - log.Error(err, "Failed to update NucleiScan targets") + log.Error(err, "Failed to update NucleiScan") return ctrl.Result{}, err } - log.Info("Updated NucleiScan targets for VirtualService", "nucleiScan", nucleiScanName, "targets", targets) + log.Info("Updated NucleiScan for VirtualService", "nucleiScan", nucleiScanName, "targets", targets) } return ctrl.Result{}, nil diff --git a/internal/jobmanager/jobmanager.go b/internal/jobmanager/jobmanager.go new file mode 100644 index 0000000..69e0162 --- /dev/null +++ b/internal/jobmanager/jobmanager.go @@ -0,0 +1,427 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package jobmanager + +import ( + "context" + "fmt" + "time" + + batchv1 "k8s.io/api/batch/v1" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" + "sigs.k8s.io/controller-runtime/pkg/log" + + nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" +) + +const ( + // DefaultScannerImage is the default image used for scanner pods + DefaultScannerImage = "ghcr.io/morten-olsen/nuclei-operator:latest" + + // DefaultTimeout is the default scan timeout + DefaultTimeout = 30 * time.Minute + + // DefaultTTLAfterFinished is the default TTL for completed jobs + DefaultTTLAfterFinished = 3600 // 1 hour + + // DefaultBackoffLimit is the default number of retries for failed jobs + DefaultBackoffLimit = 2 + + // LabelManagedBy is the label key for identifying managed resources + LabelManagedBy = "app.kubernetes.io/managed-by" + + // LabelComponent is the label key for component identification + LabelComponent = "app.kubernetes.io/component" + + // LabelScanName is the label key for the scan name + LabelScanName = "nuclei.homelab.mortenolsen.pro/scan-name" + + // LabelScanNamespace is the label key for the scan namespace + LabelScanNamespace = "nuclei.homelab.mortenolsen.pro/scan-namespace" +) + +// Config holds the configuration for the JobManager +type Config struct { + // ScannerImage is the default image to use for scanner pods + ScannerImage string + + // DefaultTimeout is the default scan timeout + DefaultTimeout time.Duration + + // TTLAfterFinished is the TTL for completed jobs in seconds + TTLAfterFinished int32 + + // BackoffLimit is the number of retries for failed jobs + BackoffLimit int32 + + // MaxConcurrent is the maximum number of concurrent scan jobs + MaxConcurrent int + + // ServiceAccountName is the service account to use for scanner pods + ServiceAccountName string + + // DefaultResources are the default resource requirements for scanner pods + DefaultResources corev1.ResourceRequirements + + // DefaultTemplates are the default templates to use for scans + DefaultTemplates []string + + // DefaultSeverity is the default severity filter + DefaultSeverity []string +} + +// DefaultConfig returns a Config with default values +func DefaultConfig() Config { + return Config{ + ScannerImage: DefaultScannerImage, + DefaultTimeout: DefaultTimeout, + TTLAfterFinished: DefaultTTLAfterFinished, + BackoffLimit: DefaultBackoffLimit, + MaxConcurrent: 5, + ServiceAccountName: "nuclei-scanner", + DefaultResources: corev1.ResourceRequirements{ + Requests: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("100m"), + corev1.ResourceMemory: resource.MustParse("256Mi"), + }, + Limits: corev1.ResourceList{ + corev1.ResourceCPU: resource.MustParse("1"), + corev1.ResourceMemory: resource.MustParse("1Gi"), + }, + }, + } +} + +// JobManager manages scanner jobs for NucleiScan resources +type JobManager struct { + client.Client + Scheme *runtime.Scheme + Config Config +} + +// NewJobManager creates a new JobManager with the given configuration +func NewJobManager(c client.Client, scheme *runtime.Scheme, config Config) *JobManager { + return &JobManager{ + Client: c, + Scheme: scheme, + Config: config, + } +} + +// CreateScanJob creates a new scanner job for the given 
NucleiScan +func (m *JobManager) CreateScanJob(ctx context.Context, scan *nucleiv1alpha1.NucleiScan) (*batchv1.Job, error) { + logger := log.FromContext(ctx) + + job := m.buildJob(scan) + + // Set owner reference so the job is garbage collected when the scan is deleted + if err := controllerutil.SetControllerReference(scan, job, m.Scheme); err != nil { + return nil, fmt.Errorf("failed to set controller reference: %w", err) + } + + logger.Info("Creating scanner job", + "job", job.Name, + "namespace", job.Namespace, + "image", job.Spec.Template.Spec.Containers[0].Image, + "targets", len(scan.Spec.Targets)) + + if err := m.Create(ctx, job); err != nil { + return nil, fmt.Errorf("failed to create job: %w", err) + } + + return job, nil +} + +// GetJob retrieves a job by name and namespace +func (m *JobManager) GetJob(ctx context.Context, name, namespace string) (*batchv1.Job, error) { + job := &batchv1.Job{} + err := m.Get(ctx, types.NamespacedName{Name: name, Namespace: namespace}, job) + if err != nil { + return nil, err + } + return job, nil +} + +// DeleteJob deletes a job by name and namespace +func (m *JobManager) DeleteJob(ctx context.Context, name, namespace string) error { + job := &batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + } + return m.Delete(ctx, job, client.PropagationPolicy(metav1.DeletePropagationBackground)) +} + +// GetJobPodName returns the name of the pod created by the job +func (m *JobManager) GetJobPodName(ctx context.Context, job *batchv1.Job) (string, error) { + podList := &corev1.PodList{} + err := m.List(ctx, podList, + client.InNamespace(job.Namespace), + client.MatchingLabels{"job-name": job.Name}) + if err != nil { + return "", err + } + + if len(podList.Items) == 0 { + return "", nil + } + + // Return the first pod (there should only be one for our jobs) + return podList.Items[0].Name, nil +} + +// IsJobComplete returns true if the job has completed (successfully or failed) +func (m *JobManager) IsJobComplete(job *batchv1.Job) bool { + for _, condition := range job.Status.Conditions { + if (condition.Type == batchv1.JobComplete || condition.Type == batchv1.JobFailed) && + condition.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// IsJobSuccessful returns true if the job completed successfully +func (m *JobManager) IsJobSuccessful(job *batchv1.Job) bool { + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobComplete && condition.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// IsJobFailed returns true if the job failed +func (m *JobManager) IsJobFailed(job *batchv1.Job) bool { + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { + return true + } + } + return false +} + +// GetJobFailureReason returns the reason for job failure +func (m *JobManager) GetJobFailureReason(job *batchv1.Job) string { + for _, condition := range job.Status.Conditions { + if condition.Type == batchv1.JobFailed && condition.Status == corev1.ConditionTrue { + return condition.Message + } + } + return "Unknown failure reason" +} + +// CountActiveJobs returns the number of currently active scan jobs +func (m *JobManager) CountActiveJobs(ctx context.Context) (int, error) { + jobList := &batchv1.JobList{} + err := m.List(ctx, jobList, client.MatchingLabels{ + LabelManagedBy: "nuclei-operator", + LabelComponent: "scanner", + }) + if err != nil { + return 0, err + } + + count 
:= 0 + for _, job := range jobList.Items { + if job.Status.Active > 0 { + count++ + } + } + return count, nil +} + +// AtCapacity returns true if the maximum number of concurrent jobs has been reached +func (m *JobManager) AtCapacity(ctx context.Context) (bool, error) { + count, err := m.CountActiveJobs(ctx) + if err != nil { + return false, err + } + return count >= m.Config.MaxConcurrent, nil +} + +// CleanupOrphanedJobs removes jobs that no longer have an associated NucleiScan +func (m *JobManager) CleanupOrphanedJobs(ctx context.Context) error { + logger := log.FromContext(ctx) + + jobList := &batchv1.JobList{} + err := m.List(ctx, jobList, client.MatchingLabels{ + LabelManagedBy: "nuclei-operator", + LabelComponent: "scanner", + }) + if err != nil { + return err + } + + for _, job := range jobList.Items { + // Check if owner reference exists and the owner still exists + ownerRef := metav1.GetControllerOf(&job) + if ownerRef == nil { + logger.Info("Deleting orphaned job without owner", "job", job.Name, "namespace", job.Namespace) + if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) { + logger.Error(err, "Failed to delete orphaned job", "job", job.Name) + } + continue + } + + // Check if the job is stuck (running longer than 2x the timeout) + if job.Status.StartTime != nil { + maxDuration := 2 * m.Config.DefaultTimeout + if time.Since(job.Status.StartTime.Time) > maxDuration && job.Status.Active > 0 { + logger.Info("Deleting stuck job", "job", job.Name, "namespace", job.Namespace, + "age", time.Since(job.Status.StartTime.Time)) + if err := m.DeleteJob(ctx, job.Name, job.Namespace); err != nil && !apierrors.IsNotFound(err) { + logger.Error(err, "Failed to delete stuck job", "job", job.Name) + } + } + } + } + + return nil +} + +// buildJob creates a Job specification for the given NucleiScan +func (m *JobManager) buildJob(scan *nucleiv1alpha1.NucleiScan) *batchv1.Job { + // Generate a unique job name + jobName := fmt.Sprintf("nucleiscan-%s-%d", scan.Name, time.Now().Unix()) + if len(jobName) > 63 { + jobName = jobName[:63] + } + + // Determine the scanner image + image := m.Config.ScannerImage + if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Image != "" { + image = scan.Spec.ScannerConfig.Image + } + + // Determine timeout + timeout := m.Config.DefaultTimeout + if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Timeout != nil { + timeout = scan.Spec.ScannerConfig.Timeout.Duration + } + activeDeadlineSeconds := int64(timeout.Seconds()) + + // Determine resources + resources := m.Config.DefaultResources + if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Resources != nil { + resources = *scan.Spec.ScannerConfig.Resources + } + + // Build command arguments for scanner mode + args := []string{ + "--mode=scanner", + fmt.Sprintf("--scan-name=%s", scan.Name), + fmt.Sprintf("--scan-namespace=%s", scan.Namespace), + } + + // Build labels + labels := map[string]string{ + LabelManagedBy: "nuclei-operator", + LabelComponent: "scanner", + LabelScanName: scan.Name, + LabelScanNamespace: scan.Namespace, + } + + // Build node selector + var nodeSelector map[string]string + if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.NodeSelector != nil { + nodeSelector = scan.Spec.ScannerConfig.NodeSelector + } + + // Build tolerations + var tolerations []corev1.Toleration + if scan.Spec.ScannerConfig != nil && scan.Spec.ScannerConfig.Tolerations != nil { + tolerations = scan.Spec.ScannerConfig.Tolerations + } + + job := 
&batchv1.Job{ + ObjectMeta: metav1.ObjectMeta{ + Name: jobName, + Namespace: scan.Namespace, + Labels: labels, + }, + Spec: batchv1.JobSpec{ + TTLSecondsAfterFinished: ptr.To(m.Config.TTLAfterFinished), + BackoffLimit: ptr.To(m.Config.BackoffLimit), + ActiveDeadlineSeconds: &activeDeadlineSeconds, + Template: corev1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: labels, + }, + Spec: corev1.PodSpec{ + RestartPolicy: corev1.RestartPolicyNever, + ServiceAccountName: m.Config.ServiceAccountName, + NodeSelector: nodeSelector, + Tolerations: tolerations, + SecurityContext: &corev1.PodSecurityContext{ + RunAsNonRoot: ptr.To(true), + RunAsUser: ptr.To(int64(65532)), + RunAsGroup: ptr.To(int64(65532)), + FSGroup: ptr.To(int64(65532)), + SeccompProfile: &corev1.SeccompProfile{ + Type: corev1.SeccompProfileTypeRuntimeDefault, + }, + }, + Containers: []corev1.Container{ + { + Name: "scanner", + Image: image, + Args: args, + Resources: resources, + SecurityContext: &corev1.SecurityContext{ + AllowPrivilegeEscalation: ptr.To(false), + ReadOnlyRootFilesystem: ptr.To(false), // Nuclei needs temp files + Capabilities: &corev1.Capabilities{ + Drop: []corev1.Capability{"ALL"}, + }, + }, + Env: []corev1.EnvVar{ + { + Name: "POD_NAME", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.name", + }, + }, + }, + { + Name: "POD_NAMESPACE", + ValueFrom: &corev1.EnvVarSource{ + FieldRef: &corev1.ObjectFieldSelector{ + FieldPath: "metadata.namespace", + }, + }, + }, + }, + }, + }, + }, + }, + }, + } + + return job +} diff --git a/internal/jobmanager/jobmanager_test.go b/internal/jobmanager/jobmanager_test.go new file mode 100644 index 0000000..471f07f --- /dev/null +++ b/internal/jobmanager/jobmanager_test.go @@ -0,0 +1,117 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package jobmanager + +import ( + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" +) + +func TestBuildJob(t *testing.T) { + config := DefaultConfig() + manager := &JobManager{ + Config: config, + } + + scan := &nucleiv1alpha1.NucleiScan{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-scan", + Namespace: "default", + }, + Spec: nucleiv1alpha1.NucleiScanSpec{ + Targets: []string{"https://example.com"}, + }, + } + + job := manager.buildJob(scan) + + // Verify job name prefix + if len(job.Name) == 0 { + t.Error("Job name should not be empty") + } + + // Verify namespace + if job.Namespace != "default" { + t.Errorf("Expected namespace 'default', got '%s'", job.Namespace) + } + + // Verify labels + if job.Labels[LabelManagedBy] != "nuclei-operator" { + t.Error("Job should have managed-by label") + } + + if job.Labels[LabelComponent] != "scanner" { + t.Error("Job should have component label") + } + + // Verify container + if len(job.Spec.Template.Spec.Containers) != 1 { + t.Error("Job should have exactly one container") + } + + container := job.Spec.Template.Spec.Containers[0] + if container.Image != config.ScannerImage { + t.Errorf("Expected image '%s', got '%s'", config.ScannerImage, container.Image) + } + + // Verify security context + if job.Spec.Template.Spec.SecurityContext.RunAsNonRoot == nil || !*job.Spec.Template.Spec.SecurityContext.RunAsNonRoot { + t.Error("Pod should run as non-root") + } +} + +func TestBuildJobWithCustomConfig(t *testing.T) { + config := DefaultConfig() + manager := &JobManager{ + Config: config, + } + + customImage := "custom/scanner:v1" + customTimeout := metav1.Duration{Duration: 45 * time.Minute} + + scan := &nucleiv1alpha1.NucleiScan{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-scan", + Namespace: "default", + }, + Spec: nucleiv1alpha1.NucleiScanSpec{ + Targets: []string{"https://example.com"}, + ScannerConfig: &nucleiv1alpha1.ScannerConfig{ + Image: customImage, + Timeout: &customTimeout, + }, + }, + } + + job := manager.buildJob(scan) + + // Verify custom image + container := job.Spec.Template.Spec.Containers[0] + if container.Image != customImage { + t.Errorf("Expected custom image '%s', got '%s'", customImage, container.Image) + } + + // Verify custom timeout + expectedDeadline := int64(45 * 60) // 45 minutes in seconds + if *job.Spec.ActiveDeadlineSeconds != expectedDeadline { + t.Errorf("Expected deadline %d, got %d", expectedDeadline, *job.Spec.ActiveDeadlineSeconds) + } +} diff --git a/internal/scanner/runner.go b/internal/scanner/runner.go new file mode 100644 index 0000000..aeef9d0 --- /dev/null +++ b/internal/scanner/runner.go @@ -0,0 +1,217 @@ +/* +Copyright 2024. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package scanner + +import ( + "context" + "fmt" + "os" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1" +) + +// RunnerConfig holds configuration for the scanner runner +type RunnerConfig struct { + // ScanName is the name of the NucleiScan to execute + ScanName string + + // ScanNamespace is the namespace of the NucleiScan + ScanNamespace string + + // NucleiBinaryPath is the path to the nuclei binary + NucleiBinaryPath string + + // TemplatesPath is the path to nuclei templates + TemplatesPath string +} + +// Runner executes a single scan and updates the NucleiScan status +type Runner struct { + config RunnerConfig + client client.Client + scanner Scanner +} + +// NewRunner creates a new scanner runner +func NewRunner(config RunnerConfig) (*Runner, error) { + // Set up logging + log.SetLogger(zap.New(zap.UseDevMode(false))) + logger := log.Log.WithName("scanner-runner") + + // Create scheme + scheme := runtime.NewScheme() + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(nucleiv1alpha1.AddToScheme(scheme)) + + // Get in-cluster config + restConfig, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("failed to get in-cluster config: %w", err) + } + + // Create client + k8sClient, err := client.New(restConfig, client.Options{Scheme: scheme}) + if err != nil { + return nil, fmt.Errorf("failed to create client: %w", err) + } + + // Create scanner with configuration + scannerConfig := Config{ + NucleiBinaryPath: config.NucleiBinaryPath, + TemplatesPath: config.TemplatesPath, + } + // Use defaults if not specified + if scannerConfig.NucleiBinaryPath == "" { + scannerConfig.NucleiBinaryPath = "nuclei" + } + nucleiScanner := NewNucleiScanner(scannerConfig) + + logger.Info("Scanner runner initialized", + "scanName", config.ScanName, + "scanNamespace", config.ScanNamespace) + + return &Runner{ + config: config, + client: k8sClient, + scanner: nucleiScanner, + }, nil +} + +// Run executes the scan and updates the NucleiScan status +func (r *Runner) Run(ctx context.Context) error { + logger := log.FromContext(ctx).WithName("scanner-runner") + + // Fetch the NucleiScan + scan := &nucleiv1alpha1.NucleiScan{} + err := r.client.Get(ctx, types.NamespacedName{ + Name: r.config.ScanName, + Namespace: r.config.ScanNamespace, + }, scan) + if err != nil { + return fmt.Errorf("failed to get NucleiScan: %w", err) + } + + logger.Info("Starting scan", + "targets", len(scan.Spec.Targets), + "templates", scan.Spec.Templates, + "severity", scan.Spec.Severity) + + // Update status to indicate scan has started + startTime := metav1.Now() + scan.Status.ScanStartTime = &startTime + if err := r.client.Status().Update(ctx, scan); err != nil { + logger.Error(err, "Failed to update scan start time") + // Continue anyway - this is not critical + } + + // Build scan options + options := ScanOptions{ + Templates: scan.Spec.Templates, + Severity: scan.Spec.Severity, + } + + // Execute the scan + scanStartTime := time.Now() + result, err := r.scanner.Scan(ctx, scan.Spec.Targets, options) + scanDuration := time.Since(scanStartTime) + + // Re-fetch the scan to 
avoid conflicts + if fetchErr := r.client.Get(ctx, types.NamespacedName{ + Name: r.config.ScanName, + Namespace: r.config.ScanNamespace, + }, scan); fetchErr != nil { + return fmt.Errorf("failed to re-fetch NucleiScan: %w", fetchErr) + } + + // Update status based on result + completionTime := metav1.Now() + scan.Status.CompletionTime = &completionTime + + if err != nil { + logger.Error(err, "Scan failed") + scan.Status.Phase = nucleiv1alpha1.ScanPhaseFailed + scan.Status.LastError = err.Error() + } else { + logger.Info("Scan completed successfully", + "findings", len(result.Findings), + "duration", scanDuration) + + scan.Status.Phase = nucleiv1alpha1.ScanPhaseCompleted + scan.Status.Findings = result.Findings + scan.Status.Summary = &nucleiv1alpha1.ScanSummary{ + TotalFindings: len(result.Findings), + FindingsBySeverity: countFindingsBySeverity(result.Findings), + TargetsScanned: len(scan.Spec.Targets), + DurationSeconds: int64(scanDuration.Seconds()), + } + scan.Status.LastError = "" + } + + // Update the status + if err := r.client.Status().Update(ctx, scan); err != nil { + return fmt.Errorf("failed to update NucleiScan status: %w", err) + } + + logger.Info("Scan status updated", + "phase", scan.Status.Phase, + "findings", len(scan.Status.Findings)) + + return nil +} + +// countFindingsBySeverity counts findings by severity level +func countFindingsBySeverity(findings []nucleiv1alpha1.Finding) map[string]int { + counts := make(map[string]int) + for _, f := range findings { + counts[f.Severity]++ + } + return counts +} + +// RunScannerMode is the entry point for scanner mode +func RunScannerMode(scanName, scanNamespace string) error { + // Get configuration from environment + config := RunnerConfig{ + ScanName: scanName, + ScanNamespace: scanNamespace, + NucleiBinaryPath: os.Getenv("NUCLEI_BINARY_PATH"), + TemplatesPath: os.Getenv("NUCLEI_TEMPLATES_PATH"), + } + + if config.ScanName == "" || config.ScanNamespace == "" { + return fmt.Errorf("scan name and namespace are required") + } + + runner, err := NewRunner(config) + if err != nil { + return fmt.Errorf("failed to create runner: %w", err) + } + + ctx := context.Background() + return runner.Run(ctx) +}
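
---

Reviewer aid (not part of the patch): a minimal sketch of how reconcile-side code could drive the exported JobManager methods added above, paired with the scanner-side entry point. It uses only methods defined in internal/jobmanager/jobmanager.go; the package name, helper name `startOrCheckScan`, the explicit `jobName` parameter, and the requeue intervals are illustrative assumptions — the actual wiring in internal/controller/nucleiscan_controller.go may differ.

```go
// Hypothetical reconcile-side usage of the JobManager; package and helper
// names are illustrative and not taken from this patch.
package example

import (
	"context"
	"fmt"
	"time"

	ctrl "sigs.k8s.io/controller-runtime"

	nucleiv1alpha1 "github.com/mortenolsen/nuclei-operator/api/v1alpha1"
	"github.com/mortenolsen/nuclei-operator/internal/jobmanager"
)

// startOrCheckScan creates a scanner Job for the scan when none is tracked
// yet, otherwise polls the tracked Job until its Complete or Failed condition
// is set. In a real reconciler the job name would be persisted in the
// NucleiScan status between reconciles rather than passed in explicitly.
func startOrCheckScan(ctx context.Context, jm *jobmanager.JobManager, scan *nucleiv1alpha1.NucleiScan, jobName string) (ctrl.Result, error) {
	if jobName == "" {
		// Respect the operator-wide concurrency limit before creating a Job.
		full, err := jm.AtCapacity(ctx)
		if err != nil {
			return ctrl.Result{}, err
		}
		if full {
			return ctrl.Result{RequeueAfter: 30 * time.Second}, nil
		}

		job, err := jm.CreateScanJob(ctx, scan)
		if err != nil {
			return ctrl.Result{}, err
		}
		fmt.Println("created scanner job", job.Name)
		return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
	}

	// A Job is already tracked for this scan: check whether it has finished.
	job, err := jm.GetJob(ctx, jobName, scan.Namespace)
	if err != nil {
		return ctrl.Result{}, err
	}
	if !jm.IsJobComplete(job) {
		return ctrl.Result{RequeueAfter: 15 * time.Second}, nil
	}
	if jm.IsJobFailed(job) {
		// The scanner pod writes findings and phase directly to the
		// NucleiScan status; the controller only needs to surface the
		// Job-level failure reason.
		fmt.Println("scan job failed:", jm.GetJobFailureReason(job))
	}
	return ctrl.Result{}, nil
}
```

On the pod side, the counterpart is `scanner.RunScannerMode(scanName, scanNamespace)`, the scanner-mode entry point defined above; the `--mode=scanner`, `--scan-name`, and `--scan-namespace` arguments that buildJob sets on the Job container are presumably parsed in cmd/main.go before calling it.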